# Source code for troppo.omics.readers.hpa

"""
 Created by Jorge Gomes on 09/03/2018
 source
 HPA_Reader
 
"""
import numpy as np


class HpaReader:
    """
    Reads the HPA pathology.tsv file from a fpath in the system.

    For every row whose third column matches the requested tissue, the four level columns
    (file columns 3 to 6) are handed to ``_handle_exp_val`` and the header label of the column
    it selects becomes the expression value of that gene.

    Parameters
    ----------
    fpath: str
        complete path to the file from which omics data is read
    tissue: str
        Exactly as in the file; only rows whose third column equals this value are read
    id_col: int,
        either 0 (="ensembl") or 1(="gene_symbol") regarding which column shall be used for gene id.
        Any other value is replaced by 0 when ``load`` is called.
    includeNA: bool
        flag if genes without any level value should be included (mapped to NaN) or not
    """

    def __init__(self, fpath: str, tissue: str, id_col: int = 0, includeNA: bool = False):
        self._tissue = tissue
        self._id_col = id_col
        self._path = fpath
        self._includeNA = includeNA

    def load(self) -> dict:
        """
        Executes the loading of supplied omics file.

        Rows with fewer than seven tab-separated columns (e.g. blank lines) are skipped, since
        they cannot hold both the tissue column and the four level columns.

        Returns
        -------
        dict: a dictionary of geneID: expressionValue, where the value is the header label of
            the selected level column, or NaN when no level is available and includeNA is set
        """
        if self._id_col not in (0, 1):
            print('Invalid id_col. Using column 0 for gene ids')
            self._id_col = 0

        record = {}  # {gene id: level label}

        with open(self._path, 'r') as f:
            # Strip the line terminator so it never ends up inside a label or a value.
            header = f.readline().rstrip('\r\n').split('\t')
            levels = header[3:7]

            for line in f:
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < 7 or fields[2] != self._tissue:
                    continue

                # Resolve the winning column once and reuse the result for both branches.
                level_idx = _handle_exp_val(fields[3:7])
                if not np.isnan(level_idx):
                    record[fields[self._id_col]] = levels[level_idx]
                elif self._includeNA:
                    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
                    record[fields[self._id_col]] = np.nan

        return record


# Auxiliary functions


def _handle_exp_val(exp_values: list) -> "int | float":
    """
    Retrieves the index of the expression value with the most patients.

    The entries are compared as numbers, not as text, so that e.g. '10' ranks above '9'.
    Entries that are empty or cannot be parsed as a number are treated as missing and never win.

    Parameters
    ----------
    exp_values: list
        list of expression values (patient counts as strings) for a given gene

    Returns
    -------
    int: index of the expression value with the most patients (the first one on ties).
        NaN (a float) is returned instead when no entry holds a usable number.
    """
    counts = []
    for raw in exp_values:
        try:
            value = float(raw)  # float() tolerates surrounding whitespace / line endings
        except (TypeError, ValueError):
            value = None
        else:
            if value != value:  # a literal 'nan' cannot be ordered, treat it as missing
                value = None
        counts.append(value)

    candidates = [i for i, value in enumerate(counts) if value is not None]
    if not candidates:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    # max() keeps the first maximal element, i.e. the lowest index among tied columns.
    return max(candidates, key=lambda i: counts[i])


# as of now not being used
def _handle_prog(prog: list) -> "int | str":
    """
    Retrieves the position of the prognostic column that holds a score in an HPA file row.

    The column is chosen by text length: the entry with the longest content wins (the first
    one on ties). Surrounding whitespace, including the line terminator carried by the last
    field of a row, is ignored so that '\\n', '\\r\\n' or a missing final newline all behave alike.

    Parameters
    ----------
    prog: list
        list of prognostic scores (as strings) for a given gene

    Returns
    -------
    int: index of the longest non-empty prognostic entry. If all are empty, returns 'None'
        (the string, not the None object)

    """
    # Former call site, kept for reference:
    # record['Prognostic'].append(progs[handle_prog(fields[7:])].strip('\n') if handle_prog(fields[7:])
    # is not 'None' else 'None')
    lengths = [len(score.strip()) for score in prog]
    if not any(lengths):
        return 'None'
    return lengths.index(max(lengths))


if __name__ == '__main__':
    # Manual smoke run: read one tissue from a local copy of the file and print the result.
    PATH = "C:/Users/Tese_Avoid_Namespaces/Tese/TsmRec/files/pathology.tsv"

    # Numeric score per level label; defined here but not consumed by this script.
    d2num = dict(zip(
        ('High', 'Medium', 'Low', 'Not detected'),
        (20.0, 15.0, 10.0, -8.0),
    ))

    # NOTE(review): id_col=2 is outside the (0, 1) range load() accepts, so load() prints its
    # fallback message and uses column 0 - confirm whether 0 or 1 was intended.
    reader = HpaReader(fpath=PATH, tissue='breast cancer', id_col=2, includeNA=False)
    print(reader.load())