# Source code for cassiopeia.simulator.UniformLeafSubsampler

"""
A subclass of LeafSubsampler, the UniformLeafSubsampler. 

Takes a uniform random sample of the leaves of a CassiopeiaTree and produces a
new CassiopeiaTree that keeps only the lineages pertaining to the sample.
"""

import copy
import numpy as np
from typing import Optional

from cassiopeia.data import CassiopeiaTree
from cassiopeia.simulator.LeafSubsampler import (
    LeafSubsampler,
    LeafSubsamplerError,
)


class UniformLeafSubsampler(LeafSubsampler):
    def __init__(
        self,
        ratio: Optional[float] = None,
        number_of_leaves: Optional[int] = None,
    ):
        """
        Uniformly subsample leaf samples of a CassiopeiaTree.

        If 'ratio' is provided, samples 'ratio' of the leaves, rounded down,
        uniformly at random. If instead 'number_of_leaves' is provided,
        'number_of_leaves' of the leaves are sampled uniformly at random.
        Exactly one of the two criteria must be provided.

        Args:
            ratio: Specifies the number of leaves to be sampled as a ratio of
                the total number of leaves
            number_of_leaves: Explicitly specifies the number of leaves to be
                sampled

        Raises:
            LeafSubsamplerError if neither or both of 'ratio' and
                'number_of_leaves' are specified
        """
        if ratio is None and number_of_leaves is None:
            raise LeafSubsamplerError(
                "At least one of 'ratio' and 'number_of_leaves' "
                "must be specified."
            )
        if ratio is not None and number_of_leaves is not None:
            # The trailing space matters: adjacent string literals are
            # concatenated verbatim.
            raise LeafSubsamplerError(
                "Exactly one of 'ratio' and 'number_of_leaves' "
                "must be specified."
            )
        self.__ratio = ratio
        self.__number_of_leaves = number_of_leaves

    def subsample_leaves(
        self, tree: CassiopeiaTree, keep_singular_root_edge: bool = True
    ) -> CassiopeiaTree:
        """Uniformly subsample leaf samples of a given tree.

        Generates a uniform random sample on the leaves of the given
        CassiopeiaTree and returns a tree pruned to contain lineages relevant
        to only leaves in the sample (the "induced subtree" on the sample).
        All fields on the original character matrix persist, but maintains
        character states, meta data, and the dissimilarity map for the sampled
        cells only.

        Has the option to keep the single edge leading from the root in the
        induced subtree, if it exists. This edge is often used to represent the
        time that the root lives before any divisions occur in the phylogeny,
        and is useful in instances where the branch lengths are critical, like
        simulating ground truth phylogenies or estimating branch lengths.

        The input tree is not modified; all pruning happens on a deep copy.

        Args:
            tree: The CassiopeiaTree for which to subsample leaves
            keep_singular_root_edge: Whether or not to keep the single edge
                leading from the root in the subsample, if it exists. If
                False, unifurcations are collapsed without exempting that
                edge.

        Returns:
            A new CassiopeiaTree that is the induced subtree on a sample of the
                leaves in the given tree

        Raises:
            LeafSubsamplerError if the sample size is <= 0, or larger than the
                number of leaves in the tree
        """
        ratio = self.__ratio
        number_of_leaves = self.__number_of_leaves
        # An explicit leaf count takes precedence; otherwise take the floor of
        # the requested fraction of cells.
        n_subsample = (
            number_of_leaves
            if number_of_leaves is not None
            else int(tree.n_cell * ratio)
        )
        if n_subsample <= 0:
            raise LeafSubsamplerError(
                "Specified number of leaves sampled is <= 0."
            )
        if n_subsample > tree.n_cell:
            raise LeafSubsamplerError(
                "Specified number of leaves sampled is greater than the number"
                " of leaves in the given tree."
            )

        # NOTE(review): the bounds above use tree.n_cell while the removal
        # count uses len(tree.leaves); this presumes the two agree -- confirm.
        n_remove = len(tree.leaves) - n_subsample
        subsampled_tree = copy.deepcopy(tree)
        # Sampling is phrased as choosing which leaves to drop, without
        # replacement, so exactly n_subsample leaves remain.
        leaf_remove = np.random.choice(
            subsampled_tree.leaves, n_remove, replace=False
        )
        subsampled_tree.remove_leaves_and_prune_lineages(leaf_remove)

        # Keep the singular root edge if it exists and is indicated to be kept:
        # starting the collapse at the root's only child leaves the root edge
        # itself untouched.
        root_children = subsampled_tree.children(subsampled_tree.root)
        if len(root_children) == 1 and keep_singular_root_edge:
            collapse_source = root_children[0]
        else:
            collapse_source = None
        subsampled_tree.collapse_unifurcations(source=collapse_source)

        # Carry over the node times of the original tree for every node that
        # survives in the subsampled tree.
        subsampled_tree.set_times(
            {node: tree.get_time(node) for node in subsampled_tree.nodes}
        )
        return subsampled_tree