Source code for cytomulate.emulation.cytof_data

# Progress bar
from tqdm import tqdm

# List manipulation
from copy import deepcopy
import numpy as np
from collections import Counter

# Classes to include
from cytomulate.emulation.cell_type import EmulationCellType
from cytomulate.emulation.cell_graph import EmulationCellGraph

# Superclass
from cytomulate.cytof_data_general import GeneralCytofData

# Typing
from typing import Union, Optional, Callable, Tuple, List


class EmulationCytofData(GeneralCytofData):
    def __init__(self,
                 n_batches: int = 1,
                 background_noise_model: Optional[Union[Callable, dict]] = None,
                 bead_label: Optional[Union[str, int]] = None) -> None:
        """The Emulation Mode object for Cytomulate.

        This class serves as a starting point for the Emulation Mode of Cytomulate.
        The constructor defines the key parameters of the simulation, including the
        number of batches. Unlike the Creation mode, other parameters such as the
        number of protein markers are fixed from the dataset rather than
        user-specified. The number of cells is defined later at a sampling step.

        Parameters
        ----------
        n_batches: int
            The number of batches to be simulated
        background_noise_model: Callable or dict
            The model used to generate random values. It should have only one input: size
        bead_label: str or int
            The label for beads
        """
        super().__init__(n_batches, background_noise_model)

        self.bead_label = bead_label
        # Filled by `initialize_cell_types`: cell type label -> observed proportion
        self.observed_cell_abundances = {}
        self.cell_graph = EmulationCellGraph()

    def initialize_cell_types(self,
                              expression_matrix: np.ndarray,
                              labels: np.ndarray,
                              max_components: int = 9,
                              min_components: int = 1,
                              covariance_types: Union[List[str], Tuple[str]] = ("full",
                                                                                "tied",
                                                                                "diag",
                                                                                "spherical")) -> None:
        """Initialize cell type models by fitting Gaussian mixtures

        This method fits the GMM models for each cell type. Namely, a Gaussian Mixture
        Model is generated for each cell type at this stage according to the parameters
        specified. An extensive model selection procedure based on the Bayesian
        Information Criterion (BIC) is performed when multiple possibilities of
        components and covariance types are specified. See details in `max_components`
        and `covariance_types`.

        Besides fitting, this method records the number of markers (columns of
        `expression_matrix`), the observed proportion of every cell type, and the
        two-way mapping between cell type labels and integer ids.

        Parameters
        ----------
        expression_matrix: np.ndarray
            A matrix containing the expression levels of cell events
            (one row per cell event, one column per marker)
        labels: np.ndarray
            A vector of cell type labels, one per row of `expression_matrix`.
            Any array-like is accepted and converted with ``np.asarray``.
        max_components: int
            The maximal number of components for a Gaussian mixture. Used for Gaussian
            mixture model selection. This must be greater than or equal to
            `min_components`. If `max_components` equals `min_components`, the exact
            number will be used for fitting. Otherwise, a model selection procedure
            will ensue using Bayesian Information Criterion.
        min_components: int
            The minimal number of components for a Gaussian mixture. Used for Gaussian
            mixture model selection. This must be smaller than or equal to
            `max_components`. See `max_components` for details on model selection.
        covariance_types: list or tuple
            The candidate types of covariances used for Gaussian mixture model
            selection. If only one is specified, no model selection will be performed
            based on the covariance structure.

        Raises
        ------
        ValueError
            If the number of labels differs from the number of rows of
            `expression_matrix`.
        """
        # A plain list would make `labels == c_type` a scalar comparison and
        # silently select no rows, so normalise to an array first.
        labels = np.asarray(labels)
        n_cells = len(labels)
        n_rows = np.shape(expression_matrix)[0]
        if n_rows != n_cells:
            raise ValueError(
                "expression_matrix has {} rows but {} labels were given".format(n_rows, n_cells)
            )

        self.n_markers = np.shape(expression_matrix)[1]

        # One vectorised pass yields both the sorted labels and their counts
        unique_labels, label_counts = np.unique(labels, return_counts=True)

        for cell_id, c_type in enumerate(tqdm(unique_labels)):
            # int(...) keeps the stored abundance a plain Python float
            self.observed_cell_abundances[c_type] = int(label_counts[cell_id]) / n_cells
            self.cell_type_labels_to_ids[c_type] = cell_id
            self.cell_type_ids_to_labels[cell_id] = c_type

            self.cell_types[c_type] = EmulationCellType(label=c_type,
                                                        cell_id=cell_id,
                                                        n_markers=self.n_markers)

            # Fit the mixture on the rows belonging to this cell type only
            ind = np.where(labels == c_type)[0]
            D = expression_matrix[ind, :]
            self.cell_types[c_type].fit(data=D,
                                        max_components=max_components,
                                        min_components=min_components,
                                        covariance_types=covariance_types)

    def generate_cell_graph(self,
                            graph_topology: str = "forest",
                            **kwargs) -> None:
        """Generate a cell graph as well as differentiation paths

        This method is part of complex simulation's cellular trajectory simulation.
        It generates differentiation paths, which will be used at the sampling stage.
        The graph is initialized from the fitted cell types (with the bead label
        passed along), pruned to the requested topology, and then used to generate
        trajectories.

        Parameters
        ----------
        graph_topology: str
            Type of graph to be generated
        kwargs:
            Other parameters used for trajectory generation
        """
        self.cell_graph.initialize_graph(self.cell_types, bead_label=self.bead_label)
        self.cell_graph.prune_graph(graph_topology)
        self.cell_graph.generate_trajectories(self.cell_types, **kwargs)

    def generate_cell_abundances(self,
                                 use_observed: bool = True,
                                 is_random: bool = True) -> None:
        """Generate cell abundances

        Generate the cell abundances for all cell types: namely, the amount of cells
        in each cell type. This method supports either data-based cell abundance or
        randomly-generated cell abundance. In the latter case, each cell type's
        probability can be further randomized.

        Parameters
        ----------
        use_observed: bool
            Whether the cell abundances should use the observed ones
        is_random: bool
            In the case that `use_observed` is `False`, whether the cell abundances'
            probability should be randomly generated. If `True`, the abundance of each
            cell type is sampled from a Dirichlet distribution. If `False`, then all
            cell types have an equal probability.

        Note
        -----
        If you wish to use the default observed cell abundance from the data, it is
        not necessary to call this method. Otherwise, you should always set
        ``use_observed`` to ``False``.
        """
        if use_observed:
            for b in range(self.n_batches):
                # Each batch gets its own copy so later edits do not leak across batches
                self.cell_abundances[b] = deepcopy(self.observed_cell_abundances)
        else:
            super().generate_cell_abundances(is_random)