# Source code for troppo.omics.readers.generic

"""
 Created by Jorge Gomes on 06/09/2018
 source
 generic_reader
 
"""
from typing import Callable, Optional, Union

import pandas as pd
from pandas import read_csv, DataFrame

from troppo import omics


class TabularReader(object):
    """
    A generic reader for tabular files.

    It can be used to read any tabular file, but it is recommended to use specialized readers for specific
    file types, such as ProbeReader for microarray files, or HpaReader for HPA files.

    Arguments
    ---------
    path_or_df: str or pandas.DataFrame
        The path to the file to be read, or a pandas DataFrame
    index_col: int, optional
        The index column of the file, by default 0
    sample_in_rows: bool, optional
        Whether the samples are in rows (True) or in columns (False), by default True.
        When False the table is transposed before iterating.
    header_offset: int, optional
        The number of lines to skip before the header, by default 0. A non-zero value is forwarded to
        pandas.read_csv as ``skiprows`` unless ``skiprows`` is explicitly present in ``kwargs``.
    cache_df: bool, optional
        Whether to keep the loaded DataFrame so later iterations reuse it, by default False
    ignore_samples: list, optional
        A list of samples to ignore, by default None
    omics_type: str, optional
        The type of omics, by default 'transcriptomics'
    nomenclature: str, optional
        The nomenclature of the omics, by default None
    dsapply: function, optional
        A function to apply to the DataFrame (it receives the DataFrame and must return one), by default None
    **kwargs: dict, optional
        Additional arguments to pass to pandas.read_csv

    Methods
    -------
    __iter__:
        Iterates over the file, yielding a tuple of (sample, data)
    to_containers:
        Converts the file to a list of OmicsContainers
    """

    def __init__(self, path_or_df: Union[str, pd.DataFrame], index_col: int = 0, sample_in_rows: bool = True,
                 header_offset: int = 0, cache_df: bool = False, ignore_samples: Optional[list] = None,
                 omics_type: str = 'transcriptomics', nomenclature: Optional[str] = None,
                 dsapply: Optional[Callable] = None, **kwargs):
        self.path = path_or_df
        self.index_col = index_col
        self.sample_axis = sample_in_rows
        self.header_offset = header_offset
        self.pandas_args = kwargs
        self.dsapply = dsapply
        self.dfcache = None
        self.cache_df = cache_df
        self.ignore_samples = ignore_samples
        self.omics_type = omics_type
        self.nomenclature = nomenclature

    def _load_frame(self):
        """Return the raw table: the cached frame, the supplied DataFrame, or a fresh read of the file."""
        if self.dfcache is not None:
            return self.dfcache

        if isinstance(self.path, DataFrame):
            frame = self.path
        else:
            # Copy so the user-supplied keyword arguments are never mutated.
            read_args = dict(self.pandas_args)
            # header_offset used to be stored but never applied; honour it without
            # overriding an explicit ``skiprows`` given by the caller.
            if self.header_offset and 'skiprows' not in read_args:
                read_args['skiprows'] = self.header_offset
            frame = read_csv(self.path, index_col=self.index_col, **read_args)

        # Only retain the frame when caching was requested.
        if self.cache_df:
            self.dfcache = frame
        return frame

    def __iter__(self):
        """
        Iterate over the samples of the table.

        Yields
        ------
        tuple : (sample name, dict mapping each column label to its value for that sample)
        """
        df = self._load_frame()

        # Samples must be on the rows for iterrows() below.
        if not self.sample_axis:
            df = df.T

        if self.ignore_samples is not None and len(self.ignore_samples) > 0:
            df = df.drop(labels=self.ignore_samples, axis=0)

        if self.dsapply is not None:
            df = self.dsapply(df)

        for name, series in df.iterrows():
            yield name, series.to_dict()

    def to_containers(self):
        """
        Converts the file to a list of OmicsContainers

        Returns
        -------
        list : A list of OmicsContainers, one per sample
        """
        ocs = [omics.OmicsContainer(data=data, condition=sample, nomenclature=self.nomenclature,
                                    omicstype=self.omics_type)
               for sample, data in self]
        # NOTE(review): dsapply is cleared after the first conversion, so any later iteration
        # yields untransformed data. Kept as-is because callers may rely on it - confirm intent.
        self.dsapply = None
        return ocs
class GenericReader:
    """
    A generic reader to be used with omics files that are unable to be loaded by ProbeReader, or HpaReader,
    such as RNA-seq files from the gdc. Capable of handling files with additional info before the file header
    when supplied header_start by the user.

    Arguments
    ---------
    path: str
        complete path to the file from which expression data is read.
    idCol: int or str
        either the name of the identifier column or its index in the file header
    expCol: int or str
        either the name of the expression values column or its index in the file header
    header_start: int
        line of the file header. Default = 0
    sep: str
        field separator used in the omics file. Default = ","
    """

    def __init__(self, path: str, idCol: Union[int, str], expCol: Union[int, str], header_start: int = 0,
                 sep: str = ','):
        self._path = path
        self._idCol = idCol
        self._expCol = expCol
        self._headerStart = header_start
        self._sep = sep

    @staticmethod
    def _is_valid_column(col) -> bool:
        """Tell whether *col* is a column label (str) or a positional index (integer, bool excluded)."""
        return isinstance(col, str) or pd.api.types.is_integer(col)

    @staticmethod
    def _select_column(frame, col):
        """Return one column of *frame*: by label when *col* is a str, by position otherwise."""
        if isinstance(col, str):
            return frame[col]
        return frame.iloc[:, col]

    def load(self, **kwargs):
        """
        Executes the loading of supplied omics file.

        The file is read with pandas.read_csv using the configured header line and separator.
        ``kwargs`` is accepted but not used by this method.

        Returns
        -------
        dict: dictionary with the identifiers as keys and the expression values as values.
            An empty dict is returned (after printing a message) when a column name is unknown or a column
            index is out of range. None is returned (after printing a message) when idCol or expCol is
            neither a string nor an integer.
        """
        omics_ds = read_csv(self._path, header=self._headerStart, sep=self._sep)

        # Integer-like scalars (including NumPy integers) are positions, strings are labels.
        if not (self._is_valid_column(self._idCol) and self._is_valid_column(self._expCol)):
            print('Invalid idCol or expCol. Please input one of the following:\n', omics_ds.columns.values)
            return None

        try:
            identifiers = self._select_column(omics_ds, self._idCol)
            values = self._select_column(omics_ds, self._expCol)
            return dict(zip(identifiers, values))
        except (KeyError, IndexError):
            print('One or both of the supplied columns do not match any column of the file or supplied column index is'
                  ' out of range\n Header Size:{0}\n Columns:{1}'
                  .format(len(omics_ds.columns.values) - 1, omics_ds.columns.values))
            return {}
if __name__ == "__main__":
    # Earlier GenericReader example, kept for reference only (not executed):
    # path1 = "C:/Users/Tese_Avoid_Namespaces/Tese/TsmRec/files/abc-gpl571-formatted_v3.csv"
    # gr = GenericReader(path1,'probe_id', 22)
    # gr.load()

    # Ad-hoc smoke run: load a local GTEx table and print the resulting containers.
    sample_file = 'C:/Users/biosy/Documents/troppo/tests/data/Desai-GTEx_ensembl.csv'
    tabular = TabularReader(sample_file, nomenclature='ensemble_gene_id', omics_type='transcriptomics')
    containers = tabular.to_containers()
    print(containers)