Source code for cassiopeia.tools.autocorrelation
"""
Utility file for computing autocorrelation statistics on trees.
"""
from typing import Callable, List, Optional, Union
import numpy as np
import pandas as pd
from cassiopeia.data import CassiopeiaTree
from cassiopeia.mixins import AutocorrelationError
from cassiopeia.data import utilities
[docs]
def compute_morans_i(
tree: CassiopeiaTree,
meta_columns: Optional[List] = None,
X: Optional[pd.DataFrame] = None,
W: Optional[pd.DataFrame] = None,
inverse_weight_fn: Callable[[Union[int, float]], float] = lambda x: 1.0 / x,
) -> Union[float, pd.DataFrame]:
"""Computes Moran's I statistic.
Using the cross-correlation between leaves as specified on the tree, compute
the Moran's I statistic for each of the data items specified. This will
only work for numerical data, and will thrown an error otherwise.
Generally, this statistic takes in a weight matrix (which can be computed
directly from a phylogenetic tree) and a set of numerical observations that
are centered and standardized (i.e., mean 0 and population standard deviation
of 1). Then, the Moran's I statistic is:
I = X' * Wn * X
where X' denotes a tranpose, * denotes the matrix multiplier, and Wn is the
normalized weight matrix such that sum([w_i,j for all i,j]) = 1.
Inspired from the tools and code used in Chaligne et al, Nature Genetics
2021.
The mathematical details of the statistic can be found in:
Wartenberg, "Multivariate Spatial Correlation: A Method for Exploratory
Geographical Analysis", Geographical Analysis (1985)
Args:
tree: CassiopeiaTree
meta_columns: Columns in the Cassiopeia Tree :attr:cell_meta object
for which to compute autocorrelations
X: Extra data matrix for computing autocorrelations.
W: Phylogenetic weight matrix. If this is not specified, then the
weight matrix will be computed within the function.
inverse_weight_fn: Inverse function to apply to the weights, if the
weight matrix must be computed.
Returns:
Moran's I statistic
"""
if X is None and meta_columns is None:
raise AutocorrelationError(
"Specify data for computing autocorrelations."
)
_X = None
if meta_columns is not None:
_X = tree.cell_meta[meta_columns]
if X is not None:
if len(np.intersect1d(tree.leaves, X.index)) != tree.n_cell:
raise AutocorrelationError(
"Specified argument X must be a dataframe with identical"
" indices to the leaves of the CassiopeiaTree."
)
_X = pd.concat([_X, X], axis=0)
# check to make sure all values are numerical
if not np.all(
_X.apply(lambda s: pd.to_numeric(s, errors="coerce").notnull().all())
):
raise AutocorrelationError(
"There are some columns that are not numeric in the specified data."
)
# cast to numeric
_X = _X.apply(lambda s: pd.to_numeric(s, errors="coerce"))
# instantiate the weight matrix if None is specified
if W is None:
W = utilities.compute_phylogenetic_weight_matrix(
tree, inverse=True, inverse_fn=inverse_weight_fn
)
# make sure that W has the correct indices
if len(np.intersect1d(tree.leaves, W.index)) != tree.n_cell:
raise AutocorrelationError(
"Weight matrix does not have the same leaves as the tree."
)
N = tree.n_cell
# normalize W to 1
_W = W / W.sum().sum()
# center and standardize _X
_X = (_X - _X.mean()) / _X.std(axis=0, ddof=0)
I = _X.T.dot(_W).dot(_X)
# if we're only testing one variable, return a float
if _X.shape[1] == 1:
I = I.iloc[0, 0]
return I