# Source code for troppo.omics.readers.microarray

"""
 Created by Jorge Gomes on 19/03/2018
 source
 probe_reader
 
"""
from typing import Optional

import numpy as np
from pandas import read_csv


class ProbeReader:
    """
    Reads expression files sourced from microarray DBs such as Gene Expression Barcode or Gene Expression Omnibus.
    Considers each value is identified by a probeID on the first column of the file. An annotation file supplied by
    the microarray chip vendor must be supplied for appropriate probe to gene Id conversion.
    Cases where a probe has no match with convTarget nomenclature will be ignored.
    Handles cases where more than one probe translates to the same gene (values are averaged), and where a probe
    translates to more than one gene (the value is assigned to every gene).

    Parameters
    ----------
    fPath: str
        complete path to the file from which expression data is read.
    expCol: int
        index of the column where expression values are retrieved from. Column 0 holds the probe ids and is
        therefore not accepted.
    annotFile: str
        complete path to the annotation file.
    convTarget: str
        exact match to the column name of the nomenclature used for probeID to geneID conversion
        recommended: Either Gene Symbol or Entrez Gene or equivalent.
    convSep: str
        field separator used in the annotation file. Default is ","
    expSep: str
        field separator used in the probe intensity/expression file. Default is ","
    """

    # Annotation-file lines with fewer fields than this are treated as preamble that precedes the header row.
    _MIN_HEADER_FIELDS = 10

    def __init__(self, fPath: str, expCol: int, annotFile: str, convTarget: str, convSep: str = ',', expSep: str = ','):
        self._fpath = fPath
        self._expCol = expCol
        self._cPath = annotFile
        self._convTarget = convTarget
        self._convSep = convSep
        self._expSep = expSep
        # The mapping is built eagerly; it is None when convTarget is not a column of the annotation file.
        self._IdMapping = self.__createMapping()

    def load(self) -> Optional[dict]:
        """
        Executes the loading of supplied omics file.

        Returns
        -------
        dict: a dictionary of geneID: expressionValue, where the value is the mean over every probe that maps
        to that gene. None is returned when no probe to gene mapping could be built.

        Raises
        ------
        ValueError: if expCol is 0 (the probe id column) or is not smaller than the number of header columns.
        """
        # avoid loading when mapping does not exist
        if self._IdMapping is None:
            return None

        values = {}  # {geneID: [expression values of every probe mapped to it]}

        with open(self._fpath, 'r') as f:
            header = f.readline().split(self._expSep)

            # valid positive indexes are 1 .. len(header) - 1
            if self._expCol == 0 or self._expCol >= len(header):
                raise ValueError('Column \'{0}\' exceeds number of columns in file, or is the probe Id column \n '
                                 'Please input a valid column. File header: {1}'.format(self._expCol, header))

            for line in f:
                # data rows use the same separator as the header row
                fields = line.replace('\"', '').split(self._expSep)
                # probes absent from the annotation file (and blank lines) yield None; probes with an empty
                # annotation cell yield NaN. Neither is a str, so both are skipped.
                target = self._IdMapping.get(fields[0])
                if not isinstance(target, str):
                    continue
                for geneID in target.replace(' ', '').split('///'):  # 1 probe : many genes
                    if geneID not in ('---', ''):  # filters cases where one probe does not match an id
                        # several probes may translate to the same gene -> collect now, average below
                        values.setdefault(geneID, []).append(float(fields[self._expCol]))

        return {gene: np.mean(vals) for gene, vals in values.items()}

    # handles annotation file
    def __createMapping(self) -> Optional[dict]:
        """
        Creates a dictionary of probeID: geneID from the annotation file supplied.

        The header row is taken to be the first line with at least ``_MIN_HEADER_FIELDS`` fields; when no line
        is that wide the first line of the file is used as header.

        Returns
        -------
        dict: a dictionary of probeID: geneID, or None (after printing the available column names) when
        convTarget is not a column of the annotation file.
        """
        field_sep = self._convSep

        # find header of annot file
        header_start = 0
        header_found = False
        with open(self._cPath, 'r') as f:
            for line in f:
                # read_csv does not count blank lines when resolving `header`, so they must not be counted here
                if not line.strip() and field_sep not in line:
                    continue
                if len(line.split(field_sep)) >= self._MIN_HEADER_FIELDS:
                    header_found = True
                    break
                header_start += 1
        if not header_found:
            # narrow annotation file without a recognisable preamble: fall back to the first line
            header_start = 0

        # actually read the file
        annot = read_csv(self._cPath, header=header_start, sep=field_sep)
        columns = list(annot)
        if self._convTarget not in columns:
            print('convTarget is not present in the annotation file please input one of the following:', '\n',
                  columns)
            return None

        # first column holds the probe ids
        return dict(zip(annot.iloc[:, 0], annot[self._convTarget]))


if __name__ == '__main__':
    # Ad-hoc usage example: the paths are hard-coded local files, so this only runs where they exist.
    path = "C:/Users/Tese_Avoid_Namespaces/Tese/TsmRec/files/abc-tis-gpl570-formatted_v3.csv"
    convFile = "C:/Users/Tese_Avoid_Namespaces/Tese/TsmRec/files/rembrandt_study/HG-U133_Plus_2.na35.annot.csv"
    help(ProbeReader)
    # expression values are read from column 3 and probes are converted to gene symbols
    gr = ProbeReader(path, 3, convFile, convTarget="Gene Symbol")
    gr.load()