Source code for troppo.omics.gene_level_thresholding

import numpy as np
import pandas
import pandas as pd
from numpy import log


[docs]class GeneLevelThresholding: """ This class is used to transform the dataframe containing the omics data and perform gene-level thresholding on omics data. It currently supports Global and Local thresholding approaches described by Richelle, Joshi and Lewis (2019) (https://doi.org/10.1371/journal.pcbi.1007185). These include: - global: genes with a value lower than the upper global threshold (GTU) are considered inactive; genes with a value greater than the lower global threshold (GTL) are considered active. - local t1: genes with a value lower than the upper global threshold (GTU) are considered inactive; for genes with a value greater than the GTU, if the value is lower than the local threshold (LT), the gene is considered inactive, otherwise it is considered active. - local t2: genes with a value lower than the upper global threshold (GTU) are considered inactive; genes with a value greater than the lower global threshold (GTU) are considered active; for genes with a value between the GTU and the lower global threshold (GTL), they are only considered active if their value is greater than the local threshold (LT). Thresholds are selected in accordance with the distribution of the data. The numbers in the thresholding options represent the position of the value to use. Currently, the options are: [0.1, 0.25, 0.5, 0.75, 0.9]; the threshold value will then be the value on the dataset that corresponds to that quantile. Parameters ---------- omics_dataframe: pandas.DataFrame Omics data to be thresholded. thresholding_strat: str Thresholding strategy to be used. Must be one of: global, local t1, local t2. global_threshold_lower: int or None, default = None Position of the Global Lower threshold value on the quantile list. global_threshold_upper: int or None, default = None Position of the Global Upper threshold value on the quantile list. local_threshold: int or None, default = None Position of the Local threshold value on the quantile list. """ def __init__(self, omics_dataframe: pandas.DataFrame, thresholding_strat: str = 'global', global_threshold_lower: int = None, global_threshold_upper: int = None, local_threshold: int = None): self.omics_dataframe = omics_dataframe self.filter_name = '_'.join(map(str, [thresholding_strat, global_threshold_lower, global_threshold_upper, local_threshold])) self.filter_name = self.filter_name.replace(' ', '_') if thresholding_strat not in ['global', 'local t1', 'local t2']: raise ValueError('Invalid thresholding strategy. Must be one of: global, local t1, local t2') self.thresholding_strat = thresholding_strat self.max_expression = np.log(omics_dataframe.max().max()) qvalues = [0.1, 0.25, 0.5, 0.75, 0.9] self.quantiles = omics_dataframe.quantile(qvalues) self.global_quantiles = self.quantiles.T.apply(lambda x: x.mean()) if thresholding_strat == 'global': self.global_threshold_lower = self.global_quantiles.iloc[global_threshold_lower] self.global_threshold_upper = None self.local_threshold = None elif thresholding_strat == 'local t1': self.global_threshold_lower = self.global_quantiles.iloc[global_threshold_lower] self.global_threshold_upper = None self.local_threshold = self.quantiles.iloc[local_threshold,] elif thresholding_strat == 'local t2': self.global_threshold_lower = self.global_quantiles.iloc[global_threshold_lower] self.global_threshold_upper = self.global_quantiles.iloc[global_threshold_upper] self.local_threshold = self.quantiles.iloc[local_threshold,]
[docs] @staticmethod def global_thresholding(sample_series: pd.Series, gtlow: float, maxexp: float) -> dict: """ Global thresholding strategy for the omics data. Processes a single sample at the time. Parameters ---------- sample_series: pandas.Series Omics data from a specific sample. gtlow: float Global threshold lower value. maxexp: float Maximum expression value of the dataset. Returns ------- filtered_sample: dict """ return (sample_series / gtlow).apply(log).clip(-maxexp, maxexp).to_dict()
[docs] @staticmethod def local_t1_thresholding(sample_series: pd.Series, gtlow: float, lt: pd.Series, maxexp: float) -> dict: """ Local T1 thresholding strategy for the omics data. Processes a single sample at the time. Parameters ---------- sample_series: pandas.Series Omics data from a specific sample. gtlow: float Global threshold lower value. lt: pd.Series Local threshold value for each sample. maxexp: float Maximum expression value of the dataset. Returns ------- filtered_sample: dict """ activity = (sample_series / gtlow).apply(log).clip(-maxexp, maxexp) gt_active = activity >= 0 activity[gt_active] = (log(sample_series[gt_active] / lt[gt_active]) * maxexp).clip(-maxexp, maxexp) return activity.to_dict()
[docs] @staticmethod def local_t2_thresholding(sample_series: pd.Series, gtlow: float, gtup: float, lt: pd.Series, maxexp: float) -> dict: """ Local T2 thresholding strategy for the omics data. Processes a single sample at the time. Parameters ---------- sample_series: pandas.Series Omics data from a specific sample. gtlow: float Global threshold lower value. gtup: float Global threshold upper value. lt: pandas.Series Local threshold value for each gene. maxexp: float Maximum expression value of the dataset. Returns ------- filtered_sample: dict """ upp_activity = (1 + (sample_series / gtlow).apply(log)).clip(-maxexp, 1 + maxexp) gtu_inactive = upp_activity < 1 low_activity = (sample_series / gtup).apply(log).clip(-maxexp, maxexp) gtl_maybes, gtl_lows = (low_activity >= 0) & gtu_inactive, low_activity < 0 upp_activity[gtl_lows] = low_activity[gtl_lows] activity_maybe = (sample_series[gtl_maybes] / lt[gtl_maybes]). \ apply(log).clip(-maxexp, maxexp) upp_activity[gtl_maybes] = activity_maybe.clip(-1, 1) return upp_activity.to_dict()
[docs] def threshold_strategy(self, sample_series) -> dict: """ Thresholding strategy for the omics data. Processes a single sample at the time. Parameters ---------- sample_series: pandas.Series Omics data from a specific sample. Returns ------- filtered_sample: dict Filtered omics data from a specific sample. """ if self.thresholding_strat == 'global': return self.global_thresholding(sample_series, self.global_threshold_lower, self.max_expression) elif self.thresholding_strat == 'local t1': return self.local_t1_thresholding(sample_series, self.global_threshold_lower, self.local_threshold, self.max_expression) elif self.thresholding_strat == 'local t2': return self.local_t2_thresholding(sample_series, self.global_threshold_lower, self.global_threshold_lower, self.local_threshold, self.max_expression)
[docs] def apply_thresholding_filter(self) -> pandas.DataFrame: """ Thresholding filter for the omics data. Returns ------- filtered_dataset: pandas.DataFrame Filtered omics dataframe. """ filter_results = {} for sample_id in self.omics_dataframe.index: sample = self.omics_dataframe.loc[sample_id, :] filter_results[sample.name + '_' + self.filter_name] = self.threshold_strategy(sample) filtered_dataset = pd.DataFrame(filter_results).T return filtered_dataset