Mercurial > repos > rsajulga > uniprot_id_mapper
changeset 0:4961ddd2f782 draft
Uploaded
author | rsajulga |
---|---|
date | Thu, 16 Jul 2015 16:28:42 -0400 |
parents | |
children | a9a1b182bc6d |
files | uniprot_id_mapping.py |
diffstat | 1 files changed, 697 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/uniprot_id_mapping.py Thu Jul 16 16:28:42 2015 -0400 @@ -0,0 +1,697 @@ +#!/usr/bin/env python +""" +# +#------------------------------------------------------------------------------ +# University of Minnesota +# Copyright 2015, Regents of the University of Minnesota +#------------------------------------------------------------------------------ +# Author: +# +# James E Johnson +# +#------------------------------------------------------------------------------ +""" +import json +import logging +import optparse +from optparse import OptionParser +import os +import sys +import re +import urllib +import urllib2 +import timeit +start = timeit.default_timer() +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + +"""print +http://www.uniprot.org/help/programmatic_access#id_mapping_examples +http://www.uniprot.org/help/uploadlists + +From input files or stdin get accessions/IDs to map. +An input option can specify what id_type the input accessions/IDs are, +otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type + +input_dict = dict() +for id_string in input_ids: + (id,id_type) = getAccession(id_string) + if not id_type in input_dict: + input_dict[id_type] = [] + input_dict[id_type].append(id) + + +# We need to retrieve by from_id_type and to_id_type +for id_type in input_dict: + for + + +tabular output: +#input_id to_id to_id + + +""" +def warn_err(msg,exit_code=1): + sys.stderr.write(msg) + if exit_code: + sys.exit(exit_code) + +def __main__(): + version = '1.0' + idDict = { + #Category:UniProt + 'ACC+ID':'UniProtKB AC/ID', + 'ACC':'UniProtKB AC', + 'ID':'UniProtKB ID', + 'UPARC':'UniParc', + 'NF50':'UniRef50', + 'NF90':'UniRef90', + 'NF100':'UniRef100', + 'GENENAME':'Gene name', + #Category:Other sequence databases + 'EMBL_ID':'EMBL/GenBank/DDBJ', + 'EMBL':'EMBL/GenBank/DDBJ CDS', + 'PIR':'PIR', + 'UNIGENE_ID':'UniGene', + 'P_ENTREZGENEID':'Entrez Gene (GeneID)', + 'P_GI':'GI number*', + 'P_REFSEQ_AC':'RefSeq Protein', + 'REFSEQ_NT_ID':'RefSeq Nucleotide', + #Category:3D structure databases + 'PDB_ID':'PDB', + 'DISPROT_ID':'DisProt', + #Category:Protein-protein interaction databases + 'BIOGRID_ID':'BioGrid', + 'DIP_ID':'DIP', + 'MINT_ID':'MINT', + 'STRING_ID':'STRING', + #Category:Chemistry + 'CHEMBL_ID':'ChEMBL', + 'DRUGBANK_ID':'DrugBank', + 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY', + #Category:Protein family/group databases + 'ALLERGOME_ID':'Allergome', + 'MEROPS_ID':'MEROPS', + 'MYCOCLAP_ID':'mycoCLAP', + 'PEROXIBASE_ID':'PeroxiBase', + 'REBASE_ID':'REBASE', + 'TCDB_ID':'TCDB', + #Category:Polymorphism databases + 'DMDM_ID':'DMDM', + #Category:2D gel databases + 'WORLD_2DPAGE_ID':'World-2DPAGE', + #Category:Protocols and materials databases + 'DNASU_ID':'DNASU', + #Category:Genome annotation databases + 'ENSEMBL_ID':'Ensembl', + 'ENSEMBL_PRO_ID':'Ensembl Protein', + 'ENSEMBL_TRS_ID':'Ensembl Transcript', + 'ENSEMBLGENOME_ID':'Ensembl Genomes', + 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein', + 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript', + 'P_ENTREZGENEID':'GeneID', + 'KEGG_ID':'KEGG', + 'PATRIC_ID':'PATRIC', + 'UCSC_ID':'UCSC', + 'VECTORBASE_ID':'VectorBase', + #Category:Organism-specific gene databases + 'ARACHNOSERVER_ID':'ArachnoServer', + 'CGD':'CGD', + 'CONOSERVER_ID':'ConoServer', + 'CYGD_ID':'CYGD', + 'DICTYBASE_ID':'dictyBase', + 'ECHOBASE_ID':'EchoBASE', + 'ECOGENE_ID':'EcoGene', + 'EUHCVDB_ID':'euHCVdb', + 'EUPATHDB_ID':'EuPathDB', + 'FLYBASE_ID':'FlyBase', + 'GENECARDS_ID':'GeneCards', + 'GENEFARM_ID':'GeneFarm', + 'GENOLIST_ID':'GenoList', + 'H_INVDB_ID':'H-InvDB', + 'HGNC_ID':'HGNC', + 'HPA_ID':'HPA', + 'LEGIOLIST_ID':'LegioList', + 'LEPROMA_ID':'Leproma', + 'MAIZEGDB_ID':'MaizeGDB', + 'MIM_ID':'MIM', + 'MGI_ID':'MGI', + 'NEXTPROT_ID':'neXtProt', + 'ORPHANET_ID':'Orphanet', + 'PHARMGKB_ID':'PharmGKB', + 'POMBASE_ID':'PomBase', + 'PSEUDOCAP_ID':'PseudoCAP', + 'RGD_ID':'RGD', + 'SGD_ID':'SGD', + 'TAIR_ID':'TAIR', + 'TUBERCULIST_ID':'TubercuList', + 'WORMBASE_ID':'WormBase', + 'WORMBASE_TRS_ID':'WormBase Transcript', + 'WORMBASE_PRO_ID':'WormBase Protein', + 'XENBASE_ID':'Xenbase', + 'ZFIN_ID':'ZFIN', + #Category:Phylogenomic databases + 'EGGNOG_ID':'eggNOG', + 'GENETREE_ID':'GeneTree', + 'HOGENOM_ID':'HOGENOM', + 'HOVERGEN_ID':'HOVERGEN', + 'KO_ID':'KO', + 'OMA_ID':'OMA', + 'ORTHODB_ID':'OrthoDB', + 'PROTCLUSTDB_ID':'ProtClustDB', + 'TREEFAM_ID':'TreeFam', + #Category:Enzyme and pathway databases + 'BIOCYC_ID':'BioCyc', + 'REACTOME_ID':'Reactome', + 'UNIPATHWAY_ID':'UniPathWay', + #Category:Gene expression databases + 'CLEANEX_ID':'CleanEx', + #Category:Other + 'CHITARS_ID':'ChiTaRS', + 'GENOMERNAI_ID':'GenomeRNAi', + 'GENEWIKI_ID':'GeneWiki', + 'NEXTBIO_ID':'NextBio' + } + + """ + TODO: + Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file + - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession. + + # rather than an array of input accessions, we need to put them in a dict() by type + def addAccession(id_dict,id_type,id): + if id_type not in id_dict: + id_dict[id_type] = []-++ + id_dict[id_type] = [].append(id) + + # returns(accession, id_type) + def getAccession(header, matchType): + # (format regex pattern, FROM_ID) + # TODO try to determine which type of accession ID we have by matching by regular expressions + # each regex match should have a groups[0] that given the accession + # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples + fmts = [ + ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), + ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), + ('>/UniRef\d+_(\w+).*','ACC+ID'), + ('>/(UPI\d+).*','ACC+ID'), + ('NP_\d+\.\d+','ACC+ID'), + ('ENSP\d+','ENSEMBL_PRO_ID'), + ('ENST\d+','ENSEMBL_TRS_ID'), + ] + for pat,cat in fmts: + m = re.match(pat,header) + if m: + matchType.append(cat) + return (m.groups()[0],cat) + matchType.append('ACC+ID') + return (header,'ACC+ID') + + # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types + idMaps = dict() # {to_id,idMap} + for to_id in options.to_id: + idMap = dict() + idMaps[to_id] = idMap + for (from_id,ids) id_dict.items(): + # limit the requests to 500 at a time + idx = range(0,len(ids),500) + idx.append(len(ids)) + for i in range(len(idx)-1): + getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap) + """ + + """ + Some examples of fasta ID lines From: + https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java +>sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ... +>gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ... +>IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr +>HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein. +>OExyz (OExyz) xxx xxx xxx +>hflu_lsi_xxxx xxx xxx xxx +>C_tr_Lx_x [xxx - xxx] | xxx xxx +>M. tub.xxx|Rvxxx| xxx xxx +// Drosophile DB. +// We need to find two elements: +// - the accession String (retrieved as the trimmed version of everything +// up to (and NOT including) " pep:" +// - the description (everything (trimmed) starting from (and including) the " pep:". +>CGxxxxxx pep:xxxxxx +>xxxx xxx SGDID:xxxx xxx +>generic_some_tag|proten_accession|a description for this protein +// Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as +// present in the Expasy FTP FASTA file. +// Is formatted something like this: +>XXX_YYYY (acc) rest +>sp|accession|ID descr rest (including taxonomy, if available +>tr|accession|ID descr rest (including taxonomy, if available) +>nxp|NX_P02768-1|ALB|Serum albumin|Iso +// New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as +// present in the Expasy FTP FASTA file. +// Is formatted something like this: +>accession|ID descr rest (including taxonomy, if available) +// Flybase FASTA format. +>FBxxx type=xxx +// A header translating a genome sequence into a protein sequences. +// We need to find two elements, separated by a space: +// - the accession string (retrieved as the first part of a space delimited String). +// - the nucleic acid start and stop site (between brackets, separated by a '-'). +>dm345_3L-sense [234353534-234353938] +>dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database +>synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF +// The Arabidopsis thaliana database; TAIR format +>AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166 +... +// Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all. +// Testing for this is somewhat more complicated. +// Often used simple header; looks like: +// >NP0465 (NP0465) A description for this protein. +// We need to find two elements: +// - the accession String (easily retrieved as the next String until a space is encountered). +// - the description +>NP0465 (NP0465) A description for this protein. + GenBank gb|accession|locus + EMBL Data Library emb|accession|locus + DDBJ, DNA Database of Japan dbj|accession|locus + NBRF PIR pir||entry + Protein Research Foundation prf||name + SWISS-PROT sp|accession|entry name + Brookhaven Protein Data Bank pdb|entry|chain + Patents pat|country|number + GenInfo Backbone Id bbs|number + General database identifier gnl|database|identifier + NCBI Reference Sequence ref|accession|locus + Local Sequence identifier lcl|identifier + """ + + + def getFastaAccession(header): + # TODO parse the ID and return the accession + # (format regex pattern, FROM_ID) + # TODO try to determine which type of accession ID we have by matching by reg|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})ular expressions + # each regex match should have a groups[0] that given the accession + # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples + fmts = [ + ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), + ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), + ('NP_\d+\.\d+','ACC+ID'), + + ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), + ('(...*)_....*','ID'), + ('>/(UPI\d+).*','UPARC'), + ('>/UniRef50_(\w+).*','NF50'), + ('UniRef90_(\w+).*','NF90'), + ('UniRef100_(\w+).*','NF100'), + ('([A-C][A-Z]?\d{6})|(DM-Z)\d{5})','EMBL_ID'), + ('ENSP\d+','ENSEMBL_PRO_ID'), + ('ENST\d+','ENSEMBL_TRS_ID') + ] + for pat,cat in fmts: + m = re.match(pat,header) + if m: + return m.groups()[0] + return header + + # rather than an array of input accessions, we need to put them in a dict() by type + def addAccession(id_dict,id_type,id): + for idtype in id_type: + if idtype not in id_dict: + id_dict[idtype] = [] + id_dict[idtype].append(id) + + # returns(accession, id_type) + def getAccession(header): + # (format regex pattern, FROM_ID) + # TODO try to determine which type of accession ID we have by matching by regular expressions + # each regex match should have a groups[0] that given the accession + # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples + fmts = [ + ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), + ('.*_[A-Z]*','ID'), + ('UPI(\d+).*','UPARC'), + ('(UniRef50_\w+.*)','NF50'), + ('UniRef90_(\w+).*','NF90'), + ('(UniRef100_\w+.*)','NF100'), + ('a[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}','GENENAME'), + + ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'), + ('K\d*','KO_ID'), + + ('([A-Z]*\d*\.\d$)','EMBL'), + ('([IBC]\d{5})','PIR'), + ('(Hs\.\d*)','UNIGENE_ID'), + ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'), + ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'), + ('(\d[A-Z0-9]{3})','PDB_ID'), + ('(DP\d{5})','DISPROT_ID'), + ('(DIP-\d*N$)','DIP_ID'), + ('(MINT-\d*)','MINT_ID'), + ('(9606\.ENSP\d*)','STRING_ID'), + ('(CHEMBL\d*)','CHEMBL_ID'), + ('(DB\d*)','DRUGBANK_ID'), + ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'), + + ('[A-Z]*_[A-Z]*','MYCOCLAP_ID'), + + ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'), + ('\d{4}:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','WORLD_2DPAGE_ID'), + ('ENS.?*G\d*','ENSEMBL_ID'), + ('ENS.?*P\d*','ENSEMBL_PRO_ID'), + ('ENS.?*T\d*','ENSEMBL_TRS_ID'), + (' ','ENSEMBLGENOME_ID'), + (' ','ENSEMBLGENOME_PRO_ID'), + (' ','ENSEMBLGENOME_TRS_ID'), + ('(hsa:\d*)','KEGG_ID'), + ('(uc\d*[a-z]*\.\d$)','UCSC_ID'), + ('VEL:J:LDKJFS','VECTORBASE_ID'), + ('AS\d*','ARACHNOSERVER_ID'), + ('CAL\d*','CGD'), + ('Y[A-Z]{2}[0-9]{3}[cw].*|Q\d{4}|R\d{4}[wc]','CYGD_ID'), + ('DDB_G\d*','DICTYBASE_ID'), + ('EB\d{4}','ECHOBASE_ID'), + ('EG\d{5}','ECOGENE_ID'), + ('(?:HM|G[QU]|F[JMN]|E[FU]|DQ|A[A-Z])\d{6}|[DLMSUXYZ]\d{5}','EUHCVDB_ID'), + ('[A-Z][a-z]*DB:.*','EUPATHDB_ID'), + ('FBgn\d*','FLYBASE_ID'), + ('GC[A-Z0-9]{9}','GENECARDS_ID'), + ('(?:BSU\d|gbs|LIN|LMO|MUL_|MYPU_|pl[iu])\d{4}|MUP\d{3}c?','GENOLIST_ID'), + ('HIX\d*','H_INVDB_ID'), + ('HGNC:\d*','HGNC_ID'), + ('(?:CAB|HPA)\d{6}','HPA_ID'), + ('lp[lp]\d{4}','LEGIOLIST_ID'), + ('ML\d{4}','LEPROMA_ID'), + ('MGI:\d*','MGI_ID'), + ('NX_(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','NEXTPROT_ID'), + ('PA\d{3,5,9}','PHARMGKB_ID'), + ('SP.*\.\d\dc?,'POMBASE_ID'), + ('PA\d{4}','PSEUDOCAP_ID'), + ('S\d{9}','SGD_ID'), + ('AT.G\d{5}','TAIR_ID'), + ('Rv\d{4}c?','TUBERCULIST_ID'), + ('WBGene\d{8}','WORMBASE_ID'), + ('(\dR(SSE|\d\d?)|A[CH]\d\d?|[BC]\d\d[A-Z]\d)\.\d.[0-9a-z]?|CBG\d{5}|C[CD][48]\.\d.[a-z]?|CE7X_3\.1|cTel5\dX\.1[ab]?|D\d{4}\.\d.[a-z]?,'WORMBASE_ID'), + ('C(BP|E)\d{5}','WORMBASE_PRO_ID'), + ('XB-GENE-\d*','XENBASE_ID'), + ('ZDB-GENE-\d{6}-\d{4}','ZFIN_ID'), + + + ('(.*[CKN]OG\d*)','EGGNOG_ID'), + ('(ENSGT\d*)','GENETREE_ID'), + ('HOG\d*','HOGENOM_ID'), + ('HBG\d*','HOVERGEN_ID'), + + ('[A-Z]{7}','OMA_ID'), + + ('EOG\d*','ORTHODB_ID'), + ('asdfsadfasdf','PROTCLUSTDB_ID'), + ('TF\d*','TREEFAM_ID'), + ('REACT_\d*','REACTOME_ID'), + ('asdfasl;DF:LJk','UNIPATHWAY_ID'), + ('HS_\d*','CLEANEX_ID'), + ('SAMEAS GENE NAME','CHITARS_ID'), + ('GENENAME_\(gene\)','GENEWIKI_ID'), + ] + # remove the need for .groups() (i.e. parantheses) + if re.match('\d*$', header): # For ambiguous number only ID types + numIDtypes = [ + 'P_ENTREZGENEID', + 'P_GI', + 'DMDM_ID', + 'BIOGRID_ID', + 'GUIDETOPHARMACOLOGY_ID', + 'ALLERGOME_ID', + 'PEROXIBASE_ID', + 'REBASE_ID', + 'DNASU_ID', + 'GENOMERNAI_ID', + 'NEXTBIO_ID', + 'CONOSERVER_ID', + 'GENEFARM_ID', + 'MAIZEGDB_ID', + 'MIM_ID', + 'MGI_ID', + 'ORPHANET_ID', + 'RGD_ID'] + ambiguous = [] + for numIDs in numIDtypes: + nm = getIdMapping([header],numIDs,'ACC',ambiguity=True) + if nm != None: + ambiguous.append(nm) + if ambiguous == []: + ambiguous.append('No Match') + return (header, ambiguous) + for pat,cat in fmts: + m = re.match(pat,header) + if m: + #return (m.groups()[0],[cat]) + return (header,[cat]) + return (header,['ACC+ID']) + + + def read_tabular(filepath,col): + accessions = [] + with open(filepath) as fp: + for i,line in enumerate(fp): + if line.strip() == '' or line.startswith('#'): + continue + fields = line.rstrip('\n').split('\t') + accession = fields[col] + accessions.append(accession) + return accessions + + def get_fasta_entries(fp): + name, seq = None, [] + for line in fp: + line = line.rstrip() + if line.startswith(">"): + if name: yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) + if name: yield (name, ''.join(seq)) + + def read_fasta(filepath): + accessions = [] + with open(filepath) as fp: + for id, peptide in get_fasta_entries(fp): + accessions.append(getFastaAccession(id)) + return accessions + + def read_mzid(fp): + accessions = [] + for event, elem in ET.iterparse(fp): + if event == 'end': + if re.search('DBSequence',elem.tag): + accessions.append(elem.attrib['accession']) + return accessions + + def read_pepxml(fp): + accessions = [] + for event, elem in ET.iterparse(fp): + if event == 'end': + if re.search('search_hit',elem.tag): + accessions.append(elem.get('protein')) + return accessions + + def getUniprotSequence(uniprot_id): + url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id + print url + fp = urllib2.urlopen(url) + for id, seq in get_fasta_entries(fp): + if seq: + return seq + return '' + + + + + def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,ambiguity=None,crossReference=None,idMaps=None): + # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)])) + if not accessions: + return + url = 'http://www.uniprot.org/mapping/' + # Cross Referencing: Mapping to non-UniprotKB ('ACC') IDs to other non-UniprotKB ('ACC') IDs + if to_id != 'ACC' and from_id != 'ACC': + crMap = {} + getIdMapping(accessions,from_id,'ACC',idMap=crMap) + crMap2 = crMap.copy() + for x in crMap.keys(): + for y in crMap[x].keys(): + for z in crMap[x][y]: + crMap2[x][y] = [] + getIdMapping([z],'ACC',to_id,idMap=crMap2,crossReference=[x,from_id]) + idMap.update(crMap2) + return + params = { + 'from': from_id, + 'to': to_id, + 'format':'tab', + 'query': '\n'.join(accessions) + } + data = urllib.urlencode(params) + request = urllib2.Request(url, data) + contact = "" # Please set your email address here to help us debug in case of problems. + request.add_header('User-Agent', 'Python %s' % contact) + response = None + for i in range(3): + try: + response = urllib2.urlopen(request) + break + except Exception, e: + warn_err("%s",exit_code=None) + if response: + response.next() + print params + + for line in response: + # print >> sys.stderr, "idMap: %s" % line + if fh: + fh.write(line) + if ambiguity: # if there was a response, then an ambiguous match can be made + return from_id + if idMap != None: + id1,id2 = line.strip().split('\t') + print id2 + if crossReference != None: + id1, from_id = crossReference + # print >> sys.stderr, "idMap: %s:%s" % (id1,id2) + try: + idMap[id1][from_id].append(id2) + except: + idMap[id1] = {from_id:[id2]} + if ambiguity == None: + for line in response: + for acc in accessions: + idMap[acc] = {from_id:['N/A']} + return + + + #Parse Command Line + parser = optparse.OptionParser() + # input files + parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) + parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) + parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) + parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) + parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) + # Decoy pattern + parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' ) + # filter patterns + parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' ) + parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' ) + # Unipept Flags + parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' ) + parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' ) + # output files + parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values') + # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' ) + parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' ) + (options, args) = parser.parse_args() + if options.version: + print >> sys.stdout,"%s" % version + sys.exit(0) + + accessions = [] + ## Get accessions + if options.mzid: + accessions += read_mzid(options.mzid) + if options.pepxml: + accessions += read_pepxml(options.pepxml) + if options.tabular: + accessions += read_tabular(options.tabular,options.column) + if options.fasta: + accessions += read_fasta(options.fasta) + if args and len(args) > 0: + for i,accession in enumerate(args): + accessions.append(accession) + # filter accessions + if options.decoy: + filtered_accs = [re.sub(options.decoy,'',x) for x in accessions] + accessions = filtered_accs + if options.include_pat: + filtered_accs = [] + for acc in accessions: + if re.match(options.include_pat,acc): + filtered_accs.append(acc) + accessions = filtered_accs + if options.exclude_pat: + filtered_accs = [] + for acc in accessions: + if not re.match(options.exclude_pat,acc): + filtered_accs.append(acc) + accessions = filtered_accs + if len(accessions) < 1: + warn_err("No accessions input!",exit_code=1) + if options.output != None: + try: + outputPath = os.path.abspath(options.output) + outputFile = open(outputPath, 'w') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(3) + else: + outputFile = sys.stdout + + + # Removes duplicates in accessions + seen = set() + seen_add = seen.add + accessions = [x for x in accessions if not (x in seen or seen_add(x))] + # Sorts accessions to inferred ID types i+n a dictionary + id_dict = {} + for header in accessions: + id , id_types = getAccession(header) + addAccession(id_dict,id_types,id) + idMaps = dict() # {to_id,idMap} + for to_id in options.to_id: + idMap = dict() + idMaps[to_id] = idMap + for (from_id,ids) in id_dict.items(): + # limit the requests to 500 at a time + idx = range(0,len(ids),500) + idx.append(len(ids)) + for i in range(len(idx)-1): + getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps) + print ids +#Write output + #Output Table Header + outputFile.write("\n#%-17s" % (options.from_id)) + for t in options.to_id: + outputFile.write("%-18s" % t) + outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n") + + # Create an output-friendly matrix + idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))] + for a, acc in enumerate(accessions): + idLength = 0 + for i, to_id in enumerate(options.to_id): # [[ids],[ids]] + idArrayColumn = [] + idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3] + for from_id in idPairs: + ids = idPairs[from_id] + for id_ in ids: + idArrayColumn.append("%s[%s]" % (id_,from_id)) + if idLength < len(idArrayColumn): + idLength = len(idArrayColumn) + idArray[a][i].extend(idArrayColumn) + for y in range(idLength): + outputFile.write("%-18s" % acc) + for x in range(len(options.to_id)): + try: + outputFile.write("%-18s" % idArray[a][x][y]) + except: + outputFile.write(" " * 18) + outputFile.write("\n") + # Output Matrix + + + + #print idMaps + # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv + stop = timeit.default_timer() + outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds")) + +if __name__ == "__main__" : __main__() +