"""
Created by Jorge Gomes on 19/03/2018
source
probe_reader
"""
from typing import Optional

import numpy as np
from pandas import read_csv
class ProbeReader:
    """
    Read probe-level expression files and convert them to gene-level values.

    Reads expression files sourced from microarray DBs such as Gene Expression
    Barcode or Gene Expression Omnibus. Each value is expected to be identified
    by a probe ID in the first column of the file. An annotation file (as
    supplied by the microarray chip vendor) is required for the probe ID to
    gene ID conversion.

    Probes with no match in the ``convTarget`` nomenclature are ignored.
    When several probes translate to the same gene, the gene receives the mean
    of their values; when one probe translates to several genes (``///``
    separated), its value is assigned to each of them.

    Parameters
    ----------
    fPath: str
        complete path to the file from which expression data is read.
    expCol: int
        index of the column where expression values are retrieved from.
        Column 0 is the probe ID column and is therefore not accepted.
    annotFile: str
        complete path to the annotation file.
    convTarget: str
        exact match to the column name of the nomenclature used for probeID to
        geneID conversion. Recommended: either Gene Symbol or Entrez Gene or
        equivalent.
    convSep: str
        field separator used in the annotation file. Default is ","
    expSep: str
        field separator used in the probe intensity/expression file.
        Default is ","
    """

    def __init__(self, fPath: str, expCol: int, annotFile: str, convTarget: str, convSep: str = ',',
                 expSep: str = ','):
        self._fpath = fPath
        self._expCol = expCol
        self._cPath = annotFile
        self._convTarget = convTarget
        self._convSep = convSep
        self._expSep = expSep
        # Built eagerly; stays None when convTarget is not a column of the annotation file.
        self._IdMapping = self.__createMapping()

    def load(self) -> Optional[dict]:
        """
        Executes the loading of the supplied omics file.

        Returns
        -------
        dict or None: a dictionary of geneID: expressionValue (mean over all
            probes mapping to that gene), or None when no probe to gene
            mapping could be created.

        Raises
        ------
        ValueError
            If ``expCol`` is the probe ID column or lies outside the columns
            found in the file header.
        """
        # avoid loading when mapping does not exist
        if self._IdMapping is None:
            return None

        values = {}  # {geneID: [expression values of every probe mapped to it]}
        with open(self._fpath, 'r') as f:
            header = f.readline().split(self._expSep)
            n_cols = len(header)
            # Valid indices are 1..n_cols-1, or their negative equivalents
            # (-n_cols would again be the probe ID column).
            if self._expCol == 0 or not -n_cols < self._expCol < n_cols:
                raise ValueError('Column \'{0}\' exceeds number of columns in file, or is the probe Id column \n '
                                 'Please input a valid column. File header: {1}'.format(self._expCol, header))

            for line in f:
                # Rows are split with the same separator as the header.
                fields = line.replace('"', '').rstrip('\r\n').split(self._expSep)
                target = self._IdMapping.get(fields[0])
                # Probes missing from the annotation (None) or without a gene
                # assigned (NaN) are skipped.
                if not isinstance(target, str):
                    continue
                for geneID in target.replace(' ', '').split('///'):  # 1 probe : many genes
                    if geneID not in ('---', ''):  # filters cases where one probe does not match an id
                        values.setdefault(geneID, []).append(float(fields[self._expCol]))

        # many probes : 1 gene -> the gene gets the mean of its probes
        return {gene: np.mean(vals) for gene, vals in values.items()}

    # handles annotation file
    def __createMapping(self) -> Optional[dict]:
        """
        Creates a dictionary of probeID: geneID from the annotation file supplied.

        The header row is taken to be the first line of the file that splits
        into at least 10 fields; every line before it is treated as preamble.

        Returns
        -------
        dict or None: a dictionary of probeID: geneID, or None (after printing
            the available column names) when ``convTarget`` is not a column of
            the annotation file.
        """
        field_sep = self._convSep

        # find header of annot file: count the preamble lines preceding it
        header_start = 0
        with open(self._cPath, 'r') as f:
            for line in f:
                if len(line.split(field_sep)) >= 10:
                    break
                header_start += 1

        # actually read the file.
        # skiprows counts physical lines (like the loop above), whereas `header`
        # ignores blank lines and would pick the wrong row if the preamble has any.
        # dtype=str keeps purely numeric IDs as text so they match the probe IDs
        # read from the expression file and pass the str check in load().
        annot = read_csv(self._cPath, skiprows=header_start, header=0, sep=field_sep, dtype=str)

        if self._convTarget not in annot.columns:
            print('convTarget is not present in the annotation file please input one of the following:', '\n',
                  list(annot))
            return None
        return dict(zip(annot.iloc[:, 0], annot[self._convTarget]))
if __name__ == '__main__':
    # Ad-hoc manual check: prints the class help and loads one expression file.
    # NOTE(review): both paths are machine-specific and must be adapted before running.
    path = "C:/Users/Tese_Avoid_Namespaces/Tese/TsmRec/files/abc-tis-gpl570-formatted_v3.csv"
    convFile = "C:/Users/Tese_Avoid_Namespaces/Tese/TsmRec/files/rembrandt_study/HG-U133_Plus_2.na35.annot.csv"

    help(ProbeReader)
    # Column 3 of the expression file, converted through the "Gene Symbol" annotation column.
    gr = ProbeReader(path, 3, convFile, convTarget="Gene Symbol")
    gr.load()