"""
Created by Jorge Gomes on 09/03/2018
source
HPA_Reader
"""
import numpy as np
[docs]class HpaReader:
"""
Reads the HPA pathology.tsv file from a fpath in the system.
Discrete values are converted to numerical and expression values account for the level with the most patients.
Parameters
----------
fpath: str
complete path to the file from which omics data is read
tissue: str
Exactly as in the file, regarding the column where expression values should be retrieved
id_col: int,
either 0 (="ensembl") or 1(="gene_symbol") regarding which column shall be used for gene id
includeNA: bool
flag if NA values should be included or not
"""
def __init__(self, fpath: str, tissue: str, id_col: int = 0, includeNA: bool = False):
self._tissue = tissue
self._id_col = id_col
self._path = fpath
self._includeNA = includeNA
[docs] def load(self):
"""
Executes the loading of supplied omics file.
Returns
-------
dict: a dictionary of geneID: expressionValue
"""
if self._id_col not in (0, 1):
print('Invalid id_col. Using column 0 for gene ids')
self._id_col = 0
with open(self._path, 'r') as f:
header = f.readline().split('\t')
levels = header[3:7]
record = {} # {Gene symbol: Expression Value}
for line in f:
fields = line.split('\t')
if fields[2] == self._tissue:
# record
if not np.isnan(_handle_exp_val(fields[3:7])):
record[fields[self._id_col]] = levels[_handle_exp_val(fields[3:7])]
elif self._includeNA:
record[fields[self._id_col]] = np.NaN
return record
# Auxiliary functions
def _handle_exp_val(exp_values: list) -> int or np.NaN:
"""
Retrieves the index of the expression value with the most patients.
Parameters
----------
exp_values: list
list of expression values for a given gene
Returns
-------
int: index of the expression value with the most patients
"""
if exp_values == ['', '', '', '']:
return np.NaN
else:
max_idx = [i for i, x in enumerate(exp_values) if x == max(exp_values)]
return max_idx[0]
# as of now not being used
def _handle_prog(prog: list) -> int or str:
"""
Retrieves the output prognostic based on the score placement in HPA file.
Parameters
----------
prog: list
list of prognostic scores for a given gene
Returns
-------
int: index of the prognostic score with the most patients. If all are empty, returns 'None'
"""
# record['Prognostic'].append(progs[handle_prog(fields[7:])].strip('\n') if handle_prog(fields[7:])
# is not 'None' else 'None')
if prog == ['', '', '', '\n']:
return 'None'
else:
idx = [len(i) for i in prog]
return idx.index(max(idx))
if __name__ == '__main__':
PATH = "C:/Users/Tese_Avoid_Namespaces/Tese/TsmRec/files/pathology.tsv"
d2num = {'High': 20.0,
'Medium': 15.0,
'Low': 10.0,
'Not detected': -8.0}
hpa = HpaReader(PATH, 'breast cancer', id_col=2, includeNA=False)
a = hpa.load()
print(a)