Source code for cassiopeia.plotting.itol_utilities

"""
Basic utilities that can be used to generate visualizations of tree using the
iTOL tree plotting interface. See: https://itol.embl.de/ for more information
on the iTOL software and how to create an account.
"""

import os

import configparser
from pathlib import Path
import shutil
import tempfile
from typing import Dict, List, Tuple, Optional

from bokeh import palettes
from itolapi import Itol, ItolExport
from matplotlib.colors import hsv_to_rgb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from cassiopeia.data import CassiopeiaTree
from cassiopeia.mixins import iTOLError
from cassiopeia.plotting import utilities


[docs] def upload_and_export_itol( cassiopeia_tree: CassiopeiaTree, tree_name: str, export_filepath: str, itol_config: str = "~/.itolconfig", api_key: Optional[str] = None, project_name: Optional[str] = None, meta_data: List[str] = [], allele_table: Optional[pd.DataFrame] = None, indel_colors: Optional[pd.DataFrame] = None, indel_priors: Optional[pd.DataFrame] = None, rect: bool = False, include_legend: bool = False, use_branch_lengths: bool = False, palette: List[str] = palettes.Category20[20], random_state: Optional[np.random.RandomState] = None, user_dataset_files: Optional[List[str]] = None, verbose: bool = True, **kwargs, ): """Uploads a tree to iTOL and exports it. This function takes in a tree, plots it with iTOL, and exports it locally. A user can also specify meta data and allele tables to visualize alongside their tree. The function requires a user to have an account with iTOL and can pass these credentials to the function in one of two ways: first, the user can specify an `api_key` and a `project_name`, which corresponds to one in the user's iTOL account' second, the user can have a hidden `itolconfig` file that can store these credentials (the default location we will check is in ~/.itolconfig, though this can be overridden by the user). We preferentially take values passed in explicitly to the function in the `api_key` and `project_name` arguments. TODO(mgjones): Add the ability to pass in min/max colors for specific numeric meta data to use when creating the gradient files. Args: cassiopeia_tree: A CassiopeiaTree instance, populated with a tree. tree_name: Name of the tree. This is what the tree will be called within your project directory on iTOL export_filepath: Output file path to save your tree. Must end with one of the following suffixes: ['png', 'svg', 'eps', 'ps', 'pdf']. itol_config: A configuration file that a user can maintain for storing iTOL account details (specifically, an API key and a project name for uploading trees). We assume that the information is stored under the [DEFAULT] header. We also will be default check the path `~/itolconfig` but the user can pass in another path should they wish. If an `api_key` and `project_name` are also passed in, we will preferentially take those values. api_key: API key linking to your iTOL account project_name: Project name to upload to. meta_data: Meta data to plot alongside the tree, which must be columns in the CassiopeiaTree.cell_meta variable. allele_table: Alleletable to plot alongside the tree. indel_colors: Color mapping to use for plotting the alleles for each cell. Only necessary if `allele_table` is specified. indel_priors: Prior probabilities for each indel. Only useful if an allele table is to be plotted and `indel_colors` is None. rect: Boolean indicating whether or not to save your tree as a circle or rectangle. use_branch_lengths: Whether or not to use branch lengths when exporting the tree. include_legend: Plot legend along with meta data. palette: A palette of colors in hex format. random_state: A random state for reproducibility user_dataset_files: List of paths to additional dataset files to upload. See https://itol.embl.de/help.cgi#dsType for available dataset types and their definitions. verbose: Include extra print statements. **kwargs: additional keyword arguments are passed as iTOL export parameters. See https://itol.embl.de/help.cgi#batch for a full list. Raises: iTOLError if iTOL credentials cannot be found, if the output format is not supported, if meta data to be plotted cannot be found, or if an error with iTOL is encountered. """ # create temporary directory for storing files we'll upload to iTOL temporary_directory = tempfile.mkdtemp() if api_key is None or project_name is None: if os.path.exists(os.path.expanduser(itol_config)): config = configparser.ConfigParser() with open(os.path.expanduser(itol_config), "r") as f: config_string = f.read() config.read_string(config_string) try: api_key = config["DEFAULT"]["api_key"] project_name = config["DEFAULT"]["project_name"] except KeyError: raise iTOLError("Error reading the itol config file passed in.") else: raise iTOLError( "Specify an api_key and project_name, or a valid iTOL " "config file." ) with open(os.path.join(temporary_directory, "tree_to_plot.tree"), "w") as f: f.write(cassiopeia_tree.get_newick(record_branch_lengths=True)) file_format = export_filepath.split("/")[-1].split(".")[-1] if file_format not in ["png", "svg", "eps", "ps", "pdf"]: raise iTOLError( "File format must be one of " "'png', 'svg', 'eps', 'ps', 'pdf']" ) itol_uploader = Itol() itol_uploader.add_file( Path(os.path.join(temporary_directory, "tree_to_plot.tree")) ) files = user_dataset_files.copy() if user_dataset_files else [] if allele_table is not None: files += create_indel_heatmap( allele_table, cassiopeia_tree, f"{tree_name}.allele", temporary_directory, indel_colors, indel_priors, random_state, ) for meta_item in meta_data: if meta_item not in cassiopeia_tree.cell_meta.columns: raise iTOLError("Meta data item not in CassiopeiaTree cell meta.") values = cassiopeia_tree.cell_meta[meta_item] if pd.api.types.is_numeric_dtype(values): files.append( create_gradient_from_df( values, cassiopeia_tree, f"{tree_name}.{meta_item}", temporary_directory, ) ) if pd.api.types.is_string_dtype(values): colors = palette[: len(values.unique())] colors = [utilities.hex_to_rgb(color) for color in colors] colormap = dict(zip(np.unique(values), colors)) files.append( create_colorbar( values, cassiopeia_tree, colormap, f"{tree_name}.{meta_item}", temporary_directory, create_legend=include_legend, ) ) for _file in files: itol_uploader.add_file(Path(_file)) itol_uploader.params["treeName"] = tree_name itol_uploader.params["APIkey"] = api_key itol_uploader.params["projectName"] = project_name good_upload = itol_uploader.upload() if not good_upload: raise iTOLError(itol_uploader.comm.upload_output) if verbose: print("iTOL output: " + str(itol_uploader.comm.upload_output)) print("Tree Web Page URL: " + itol_uploader.get_webpage()) print("Warnings: " + str(itol_uploader.comm.warnings)) tree_id = itol_uploader.comm.tree_id itol_exporter = ItolExport() # set parameters: itol_exporter.set_export_param_value("tree", tree_id) itol_exporter.set_export_param_value("format", file_format) if rect: # rectangular tree settings itol_exporter.set_export_param_value("display_mode", 1) else: # circular tree settings itol_exporter.set_export_param_value("display_mode", 2) itol_exporter.set_export_param_value("arc", 359) itol_exporter.set_export_param_value("rotation", 270) itol_exporter.set_export_param_value("leaf_sorting", 1) itol_exporter.set_export_param_value("label_display", 0) itol_exporter.set_export_param_value("internal_marks", 0) itol_exporter.set_export_param_value( "ignore_branch_length", 1 - int(use_branch_lengths) ) itol_exporter.set_export_param_value( "datasets_visible", ",".join([str(i) for i in range(len(files))]) ) itol_exporter.set_export_param_value("horizontal_scale_factor", 1) for key, value in kwargs.items(): itol_exporter.set_export_param_value(key, value) # export! itol_exporter.export(export_filepath) # remove intermediate files shutil.rmtree(temporary_directory)
def create_gradient_from_df( df: pd.Series, tree: CassiopeiaTree, dataset_name: str, output_directory: str = "./tmp/", color_min: str = "#ffffff", color_max: str = "#000000", ) -> str: """Creates a gradient file for the iTOL batch uploader Creates a gradient file for iTOL from numerical data. This will write out the file to the specified location, which can then be uploaded to iTOL. Args: df: A pandas series with numerical data tree: CassiopeiaTree dataset_name: Name for the dataset output_directory: Where to write the output file color_min: Minimum color for gradient color_max: Maximum color for gradient Returns: The filepath to new gradient file. """ _leaves = tree.leaves df = df.loc[_leaves].copy() outdf = pd.DataFrame() outdf["cellBC"] = _leaves outdf["gradient"] = df.values header = [ "DATASET_GRADIENT", "SEPARATOR TAB", "COLOR\t#00000", f"COLOR_MIN\t{color_min}", f"COLOR_MAX\t{color_max}", "MARGIN\t100", f"DATASET_LABEL\t{df.name}", "STRIP_WIDTH\t100", "SHOW_INTERNAL\t0", "DATA", "", ] outfp = os.path.join(output_directory, f"{dataset_name}.{df.name}.txt") with open(outfp, "w") as fOut: for line in header: fOut.write(line + "\n") df_writeout = outdf.to_csv(None, sep="\t", header=False, index=False) fOut.write(df_writeout) return outfp def create_colorbar( labels: pd.DataFrame, tree: CassiopeiaTree, colormap: Dict[str, Tuple[int, int, int]], dataset_name: str, output_directory: str = ".tmp/", create_legend: bool = False, ) -> str: """Creates a colorbar file for the iTOL batch uploader Creates a colorbar file for iTOL from categorical data. This will write out the file to the specified location, which can then be uploaded to iTOL. Args: labels: A pandas series with categorical data (can be represented as strings or categories) tree: CassiopeiaTree colormap: A mapping from category to RGB colors dataset_name: Name for the dataset output_directory: Where to write the output file create_legend: Include legend for this colorbar. Returns: The filepath to new colorbar file. """ _leaves = tree.leaves labelcolors_iTOL = [] for i in labels.loc[_leaves].values: colors_i = colormap[i] color_i = ( "rgb(" + str(colors_i[0]) + "," + str(colors_i[1]) + "," + str(colors_i[2]) + ")" ) labelcolors_iTOL.append(color_i) dfCellColor = pd.DataFrame() dfCellColor["cellBC"] = _leaves dfCellColor["color"] = labelcolors_iTOL # save file with header header = [ "DATASET_COLORSTRIP", "SEPARATOR TAB", "COLOR\t#FF0000", "MARGIN\t100", f"DATASET_LABEL\t{dataset_name}", "STRIP_WIDTH\t100", "SHOW_INTERNAL\t0", "", ] outfp = os.path.join(output_directory, f"{dataset_name}.txt") with open(outfp, "w") as SIDout: for line in header: SIDout.write(line + "\n") if create_legend: number_of_items = len(colormap) SIDout.write(f"LEGEND_TITLE\t{dataset_name} legend\n") SIDout.write("LEGEND_SHAPES") for _ in range(number_of_items): SIDout.write("\t1") SIDout.write("\nLEGEND_COLORS") for col in colormap.values(): SIDout.write(f"\t{utilities.rgb_to_hex(col)}") SIDout.write("\nLEGEND_LABELS") for key in colormap.keys(): SIDout.write(f"\t{key}") SIDout.write("\n") SIDout.write("\nDATA\n") df_writeout = dfCellColor.to_csv( None, sep="\t", header=False, index=False ) SIDout.write(df_writeout) return outfp def create_indel_heatmap( alleletable: pd.DataFrame, tree: CassiopeiaTree, dataset_name: str, output_directory: str, indel_colors: Optional[pd.DataFrame] = None, indel_priors: Optional[pd.DataFrame] = None, random_state: Optional[np.random.RandomState] = None, ) -> List[str]: """Creates a set of files for displaying an indel heatmap with iTOL Creates a set of files, each one corresponding to a character, that can be used to display an allele heatmap alongside a tree in iTOL. If neither indel colors nor indel priors are provided, a random color mapping is created for each unique indel. Args: alleletable: An AlleleTable for the tree tree: CassiopeiaTree dataset_name: Name for the dataset output_directory: Where to write the output files indel_colors: Mapping of indels to colors. indel_priors: Prior probabilities for each indel. Only `indel_colors` are not provided, in which case a new indel color mapping is created by displaying low-probability indels with bright colors and high-probability ones with dull colors. random_state: Random state for reproducibility Returns: A set of filepaths to each cut-site's color bar file for iTOL batch uploading. """ _leaves = tree.leaves clustered_linprof, _indel_colors = utilities.prepare_alleletable( alleletable, _leaves, indel_priors, random_state ) if indel_colors is None: indel_colors = _indel_colors # Convert colors and make colored alleleTable (rgb_heatmap) r, g, b = ( np.zeros(clustered_linprof.shape), np.zeros(clustered_linprof.shape), np.zeros(clustered_linprof.shape), ) for i in tqdm(range(clustered_linprof.shape[0])): for j in range(clustered_linprof.shape[1]): ind = str(clustered_linprof.iloc[i, j]) if ind == "nan": r[i, j], g[i, j], b[i, j] = 1, 1, 1 elif "none" in ind.lower(): r[i, j], g[i, j], b[i, j] = 192 / 255, 192 / 255, 192 / 255 else: col = hsv_to_rgb(tuple(indel_colors.loc[ind, "color"])) r[i, j], g[i, j], b[i, j] = col[0], col[1], col[2] rgb_heatmap = np.stack((r, g, b), axis=2) allele_files = [] for j in range(0, rgb_heatmap.shape[1]): item_list = [] for i in rgb_heatmap[:, j]: item = ( "rgb(" + str(int(round(255 * i[0]))) + "," + str(int(round(255 * i[1]))) + "," + str(int(round(255 * i[2]))) + ")" ) item_list.append(item) dfAlleleColor = pd.DataFrame() dfAlleleColor["cellBC"] = clustered_linprof.index.values dfAlleleColor["color"] = item_list if j == 0: header = [ "DATASET_COLORSTRIP", "SEPARATOR TAB", "COLOR\t#000000", "MARGIN\t100", "DATASET_LABEL\tallele" + str(j), "STRIP_WIDTH\t50", "SHOW_INTERNAL\t0", "DATA", "", ] else: header = [ "DATASET_COLORSTRIP", "SEPARATOR TAB", "COLOR\t#000000", "DATASET_LABEL\tallele" + str(j), "STRIP_WIDTH\t50", "SHOW_INTERNAL\t0", "DATA", "", ] alleleLabel_fileout = os.path.join( output_directory, f"indelColors_0{str(j).zfill(4)}.txt") with open(alleleLabel_fileout, "w") as ALout: for line in header: ALout.write(line + "\n") df_writeout = dfAlleleColor.to_csv( None, sep="\t", header=False, index=False ) ALout.write(df_writeout) allele_files.append(alleleLabel_fileout) return allele_files