# Source code for troppo.omics.id_converter

"""
 Created by Jorge Gomes on 06/06/2018
 id_converter

"""
import pandas as pd
import urllib.request
import urllib.error
import shutil
from pathlib import Path
from datetime import date


"""
This Python module provides two public functions, both of which rely on the HGNC complete set file.

idConverter:
    This function converts the ids from a given omics dataset into the desired ones to better match a metabolic model.
    Conversion is done based on the HGNC database.
    
searchNomenclature:
    This function searches which gene identification nomenclature is used on the provided ids.
"""


def _get_HGNC() -> str:
    """
    Return the file name of a local copy of the HGNC complete set file, downloading it when needed.

    The file is downloaded at most once per day: when a file named `hgnc_complete_set_[Date].tsv` for today's
    date already exists in the current working directory, its name is returned without any network access.

    The download is first written to a temporary `.part` file and only renamed to its final name once the
    transfer has completed, so an interrupted download can never be mistaken for a valid cached file.

    Returns
    -------
    string: the file name of the HGNC complete set file. When the download fails with a URLError, the name of
        the fallback file `hgnc_complete_set_2018-09-13.tsv` is returned instead (its existence is not checked).

    """
    # Download the file from `url` and save it locally under `hgnc_complete_set_[Date]`:
    now = date.today()
    url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt"
    path = "hgnc_complete_set_" + str(now) + ".tsv"

    if Path(path).is_file():  # if file has already been downloaded today skip this step
        return path

    partial = Path(path + ".part")
    try:
        with urllib.request.urlopen(url) as response, open(partial, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        # Publish the file under its final name only after the whole transfer succeeded.
        partial.replace(path)
        return path
    except urllib.error.URLError:
        print('Please Check Internet Connection, using locally available HGNC file')
        return "hgnc_complete_set_2018-09-13.tsv"
    finally:
        # Remove any incomplete download, whatever the reason for the failure.
        if partial.exists():
            partial.unlink()


def idConverter(ids: list or set, old: str, new: str) -> dict or None:
    """
    This function converts the ids from a given omics dataset into the desired ones to better match a metabolic
    model. Conversion is done based on the HGNC database.

    NOMENCLATURES:
        ["hgnc_id","symbol","name","entrez_id","ensembl_gene_id","vega_id","ucsc_id","ccds_id",
        "uniprot_ids", "pubmed_id","omim_id","locus_group","locus_type","alias_symbol","alias_name",
        "prev_symbol","prev_name", "ena","refseq_accession","rna_central_ids"]

    Parameters
    ----------
    ids: list or set
        containing the ids to be converted
    old: string
        exact match, the nomenclature designation of the input IDS. Must be different from new and
        contained in NOMENCLATURES
    new: string
        exact match, the nomenclature designation of the output IDs. Must be different from old and
        contained in NOMENCLATURES

    Returns
    -------
    dict or None: dictionary with the original ids as keys and the converted ids (as strings) as values.
        Ids that do not occur in the `old` column are left out. None is returned, after printing the valid
        designations, when `old` or `new` is not a column of the HGNC table.

    """
    file = _get_HGNC()
    ds = pd.read_csv(file, sep='\t', low_memory=False)

    old_col, new_col = old.lower(), new.lower()
    # Report exactly which designation is wrong instead of always blaming `new`.
    unknown = [name for name in (old_col, new_col) if name not in ds.columns]
    if unknown:
        print('Incorrect ID designation(s): ' + ', '.join(unknown) + '; must be one of the following:\n',
              ds.columns.values.tolist())
        return None

    d = dict(zip(ds[old_col], ds[new_col]))

    # Only the text before the first '.' is kept -- presumably to drop the '.0' suffix of numeric ids that
    # pandas parsed as floats. NOTE: any value containing a '.' is truncated the same way, and a missing
    # value in the `new` column is returned as the string 'nan'.
    return {x: str(d[x]).split('.')[0] for x in ids if x in d}
def _first_matching_column(lines: list, identifier):
    """Return the index of the first tab-separated field equal to `identifier`, or None when no line has it."""
    for line in lines:
        # Strip the line terminator so that values in the last column can match exactly too.
        fields = line.rstrip('\r\n').split('\t')
        if identifier in fields:  # membership on the split fields ensures an exact match
            return fields.index(identifier)
    return None


def searchNomenclature(ids: list) -> str or None:
    """
    This function searches which gene identification nomenclature is used on the provided ids.

    Ids are examined one at a time, starting from the end of the list. Each id is looked up in the HGNC
    complete set file and the column in which it first occurs gets one vote. The search stops as soon as the
    most voted column has reached the threshold: 10 votes while 10 or more ids are still unexamined, 1 vote
    otherwise. When ids from different nomenclatures are input, the result is therefore the nomenclature with
    the most matches at that point. Also handles cases where some ids do not match but others do.

    The provided list is not modified.

    Parameters
    -----------
    ids: list
        List of ids (all using the same nomenclature)

    Returns
    --------
    string or None
        the nomenclature designation according to HGNC complete set table, or None (after printing a message)
        when none of the ids could be found or `ids` is empty.

    """
    file = _get_HGNC()

    # Read the table once instead of re-reading it from disk for every id.
    with open(file, 'r', encoding='utf8') as f:
        lines = f.readlines()

    pending = list(ids)  # work on a copy so the caller's list is left untouched
    matches = {}  # column index -> number of ids found in it (workaround for mixed ids)
    best_col = None

    while pending:
        candidate = pending.pop()
        col = _first_matching_column(lines, candidate)
        if col is not None:  # some ids may not be contained in HGNC
            matches[col] = matches.get(col, 0) + 1

        if matches:
            # The threshold depends on how many ids are still left to examine.
            threshold = 1 if len(pending) < 10 else 10
            top = max(matches, key=matches.get)  # ties go to the column that was matched first
            if matches[top] >= threshold:
                best_col = top
                break

    if best_col is None:
        print('No match was found for the provided ids')
        return None

    # Only the header row is needed to translate the column index into its name.
    header = pd.read_csv(file, sep='\t', nrows=0)
    return header.columns.values[best_col]
if __name__ == "__main__":
    # Manual smoke check: convert a couple of sample identifiers from HGNC symbols to Entrez ids.
    sample_ids = ['A1BG', 'HGNC:1']
    print(idConverter(sample_ids, 'symbol', 'entrez_id'))