"""
Created by Jorge Gomes on 06/06/2018
id_converter
"""
import pandas as pd
import urllib.request
import urllib.error
import shutil
from pathlib import Path
from datetime import date
"""
This python file contains two functions that rely on the HGNC complete set file.
idConverter:
This function converts the ids from a given omics dataset into the desired ones to better match a metabolic model.
Conversion is done based on the HGNC database.
searchNomenclature:
This function searches which gene identification nomenclature is used on the provided ids.
"""
def _get_HGNC() -> str:
"""
This function downloads the HGNC complete set file from the HGNC ftp server.
The file is downloaded only once per day, if the file has already been downloaded today, the function will
return the file name.
Returns
-------
string: the file name of the HGNC complete set file
"""
# Download the file from `url` and save it locally under `hgnc_complete_set_[Date]`:
now = date.today()
url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt"
path = "hgnc_complete_set_" + str(now) + ".tsv"
file = Path(path)
if file.is_file(): # if file has already been downloaded today skip this step
return path
else:
try:
file = "hgnc_complete_set_" + str(now) + ".tsv"
with urllib.request.urlopen(url) as response, open(file, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
except urllib.error.URLError as e:
file = "hgnc_complete_set_2018-09-13.tsv"
print('Please Check Internet Connection, using locally available HGNC file')
return file
def idConverter(ids: list or set, old: str, new: str) -> dict or None:
    """
    This function converts the ids from a given omics dataset into the desired ones to better match a metabolic
    model. Conversion is done based on the HGNC database.

    NOMENCLATURES:
    ["hgnc_id","symbol","name","entrez_id","ensembl_gene_id","vega_id","ucsc_id","ccds_id", "uniprot_ids",
    "pubmed_id","omim_id","locus_group","locus_type","alias_symbol","alias_name", "prev_symbol","prev_name",
    "ena","refseq_accession","rna_central_ids"]

    Parameters
    ----------
    ids: list or set
        containing the ids to be converted
    old: string
        exact match (case-insensitive), the nomenclature designation of the input IDs. Must be different from
        new and contained in NOMENCLATURES
    new: string
        exact match (case-insensitive), the nomenclature designation of the output IDs. Must be different from
        old and contained in NOMENCLATURES

    Returns
    -------
    dict: dictionary with the original ids as keys and the converted ids (as strings) as values. Ids that do
        not occur in the ``old`` column are left out. ``None`` is returned, after printing the valid
        designations, when ``old`` or ``new`` is not a column of the HGNC table.
    """
    file = _get_HGNC()
    ds = pd.read_csv(file, sep='\t', low_memory=False)

    old_col, new_col = old.lower(), new.lower()
    unknown = [name for name in (old_col, new_col) if name not in ds.columns]
    if unknown:
        # Report exactly which designation(s) could not be found, not just "new".
        print('The ID designation(s) {} are incorrect, must be one of the following:\n'.format(unknown),
              ds.columns.values.tolist())
        return None

    # When an old id occurs on several rows, the last row wins.
    d = dict(zip(ds[old_col], ds[new_col]))

    # Everything after the first '.' is dropped, presumably to turn values pandas parsed as floats
    # (e.g. 1.0) back into '1'; note that this also truncates any id that itself contains a dot.
    return {x: str(d[x]).split('.')[0] for x in ids if x in d}
def searchNomenclature(ids: list) -> str or None:
    """
    This function searches which gene identification nomenclature is used on the provided ids.

    Each id is looked up as an exact, whole-field match in the HGNC complete set file; the column of the first
    line containing it counts as one match for that column. Ids are examined from the end of the list until
    one column has enough matches: 10 while at least 10 ids are still unexamined, otherwise 1.
    When ids from different nomenclatures are input, the result is the column with the most matches at that
    point. Ids that do not occur in the file are simply skipped.

    The provided ``ids`` collection is not modified.

    Parameters
    -----------
    ids: list
        List of ids (ideally all using the same nomenclature)

    Returns
    --------
    string
        the nomenclature designation according to HGNC complete set table, or ``None`` (after printing a
        message) when ``ids`` is empty or none of the ids occurs in the file.
    """
    file = _get_HGNC()
    queue = list(ids)  # private copy: the caller's collection must not be emptied

    # One pass over the file: record, for every requested id, the column of its first occurrence.
    # Only strings can equal a field of the file, so other values are ignored here.
    pending = {x for x in queue if isinstance(x, str)}
    first_col = {}
    if pending:
        with open(file, 'r', encoding='utf8') as f:
            for line in f:
                # Strip the line terminator so that the last column can be matched as well.
                fields = line.rstrip('\n').split('\t')
                hits = pending.intersection(fields)
                for hit in hits:
                    first_col[hit] = fields.index(hit)
                pending -= hits
                if not pending:  # every id has been located, no need to read further
                    break

    matches = {}  # column index -> number of ids found in that column (handles mixed ids)
    while queue:
        test = queue.pop()
        col = first_col.get(test) if isinstance(test, str) else None
        if col is not None:
            matches[col] = matches.get(col, 0) + 1
        if matches:
            threshold = 1 if len(queue) < 10 else 10
            best = max(matches, key=matches.get)  # first column reaching the highest count
            if matches[best] >= threshold:
                # Only the header is needed to translate the column index into its name.
                header = pd.read_csv(file, sep='\t', nrows=0)
                return header.columns.values[best]

    print('No match was found for the provided ids')
    return None
if __name__ == "__main__":
    # Small demonstration: convert two example ids from gene symbols to Entrez ids and show the result.
    example_ids = ['A1BG', 'HGNC:1']
    converted = idConverter(example_ids, 'symbol', 'entrez_id')
    print(converted)