Source code for cassiopeia.critique.compare

"""
A library that stores functions for comparing two trees to one another.
Currently, we'll support a triplets correct function and a Robinson-Foulds
function.
"""
from collections import defaultdict
import copy
import ete3
import networkx as nx
import numpy as np
from typing import Dict, Tuple

from cassiopeia.critique import critique_utilities
from cassiopeia.data import CassiopeiaTree


[docs] def triplets_correct( tree1: CassiopeiaTree, tree2: CassiopeiaTree, number_of_trials: int = 1000, min_triplets_at_depth: int = 1, ) -> Tuple[ Dict[int, float], Dict[int, float], Dict[int, float], Dict[int, float] ]: """Calculate the triplets correct accuracy between two trees. Takes in two newick strings and computes the proportion of triplets in the tree (defined as a set of three leaves) that are the same across the two trees. This procedure samples the same number of triplets at every depth such as to reduce the amount of bias of sampling triplets randomly. Args: tree1: Input CassiopeiaTree tree2: CassiopeiaTree to be compared to the first tree. number_of_trials: Number of triplets to sample at each depth min_triplets_at_depth: The minimum number of triplets needed with LCA at a depth for that depth to be included Returns: Four dictionaries storing triplet information at each depth: all_triplets_correct: the total triplets correct resolvable_triplets_correct: the triplets correct for resolvable triplets unresolved_triplets_correct: the triplets correct for unresolvable triplets proportion_resolvable: the proportion of unresolvable triplets per depth """ # keep dictionary of triplets correct all_triplets_correct = defaultdict(int) unresolved_triplets_correct = defaultdict(int) resolvable_triplets_correct = defaultdict(int) proportion_unresolvable = defaultdict(int) # create copies of the trees and collapse process T1 = copy.deepcopy(tree1) T2 = copy.deepcopy(tree2) T1.collapse_unifurcations() T2.collapse_unifurcations() # set depths in T1 and compute number of triplets that are rooted at # ancestors at each depth depth_to_nodes = critique_utilities.annotate_tree_depths(T1) max_depth = np.max([T1.get_attribute(n, "depth") for n in T1.nodes]) for depth in range(max_depth): score = 0 number_unresolvable_triplets = 0 # check that there are enough triplets at this depth candidate_nodes = depth_to_nodes[depth] total_triplets = sum( [T1.get_attribute(v, "number_of_triplets") for v in candidate_nodes] ) if total_triplets < min_triplets_at_depth: continue for _ in range(number_of_trials): (i, j, k), out_group = critique_utilities.sample_triplet_at_depth( T1, depth, depth_to_nodes ) reconstructed_outgroup = critique_utilities.get_outgroup( T2, (i, j, k) ) is_resolvable = True if out_group == "None": number_unresolvable_triplets += 1 is_resolvable = False # increment score if the reconstructed outgroup is the same as the # ground truth score = int(reconstructed_outgroup == out_group) all_triplets_correct[depth] += score if is_resolvable: resolvable_triplets_correct[depth] += score else: unresolved_triplets_correct[depth] += score all_triplets_correct[depth] /= number_of_trials if number_unresolvable_triplets == 0: unresolved_triplets_correct[depth] = 1.0 else: unresolved_triplets_correct[depth] /= number_unresolvable_triplets proportion_unresolvable[depth] = ( number_unresolvable_triplets / number_of_trials ) if proportion_unresolvable[depth] < 1: resolvable_triplets_correct[depth] /= ( number_of_trials - number_unresolvable_triplets ) else: resolvable_triplets_correct[depth] = 1.0 return ( all_triplets_correct, resolvable_triplets_correct, unresolved_triplets_correct, proportion_unresolvable, )
[docs] def robinson_foulds( tree1: CassiopeiaTree, tree2: CassiopeiaTree ) -> Tuple[float, float]: """Compares two trees with Robinson-Foulds distance. Computes the Robinsons-Foulds distance between two trees. Currently, this is the unweighted variant as most of the algorithms we use are maximum- parsimony based and do not use edge weights. This is mostly just a wrapper around the `robinson_foulds` method from Ete3. Args: tree1: A CassiopeiaTree representing the first tree tree2: A CassiopeiaTree representing the second tree Returns: The Robinson-Foulds distance between the two trees and the maximum Robinson-Foulds distance for downstream normalization """ # create copies of the trees and collapse process T1 = copy.deepcopy(tree1) T2 = copy.deepcopy(tree2) # convert to Ete3 trees and collapse unifurcations T1.collapse_unifurcations() T2.collapse_unifurcations() T1 = ete3.Tree(T1.get_newick(), format=1) T2 = ete3.Tree(T2.get_newick(), format=1) ( rf, rf_max, names, edges_t1, edges_t2, discarded_edges_t1, discarded_edges_t2, ) = T1.robinson_foulds(T2, unrooted_trees=True) return rf, rf_max