Mercurial > repos > rsajulga > uniprot_id_mapper
view uniprot_id_mapping.py @ 0:4961ddd2f782 draft
Uploaded
author | rsajulga |
---|---|
date | Thu, 16 Jul 2015 16:28:42 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python """ # #------------------------------------------------------------------------------ # University of Minnesota # Copyright 2015, Regents of the University of Minnesota #------------------------------------------------------------------------------ # Author: # # James E Johnson # #------------------------------------------------------------------------------ """ import json import logging import optparse from optparse import OptionParser import os import sys import re import urllib import urllib2 import timeit start = timeit.default_timer() try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET """print http://www.uniprot.org/help/programmatic_access#id_mapping_examples http://www.uniprot.org/help/uploadlists From input files or stdin get accessions/IDs to map. An input option can specify what id_type the input accessions/IDs are, otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type input_dict = dict() for id_string in input_ids: (id,id_type) = getAccession(id_string) if not id_type in input_dict: input_dict[id_type] = [] input_dict[id_type].append(id) # We need to retrieve by from_id_type and to_id_type for id_type in input_dict: for tabular output: #input_id to_id to_id """ def warn_err(msg,exit_code=1): sys.stderr.write(msg) if exit_code: sys.exit(exit_code) def __main__(): version = '1.0' idDict = { #Category:UniProt 'ACC+ID':'UniProtKB AC/ID', 'ACC':'UniProtKB AC', 'ID':'UniProtKB ID', 'UPARC':'UniParc', 'NF50':'UniRef50', 'NF90':'UniRef90', 'NF100':'UniRef100', 'GENENAME':'Gene name', #Category:Other sequence databases 'EMBL_ID':'EMBL/GenBank/DDBJ', 'EMBL':'EMBL/GenBank/DDBJ CDS', 'PIR':'PIR', 'UNIGENE_ID':'UniGene', 'P_ENTREZGENEID':'Entrez Gene (GeneID)', 'P_GI':'GI number*', 'P_REFSEQ_AC':'RefSeq Protein', 'REFSEQ_NT_ID':'RefSeq Nucleotide', #Category:3D structure databases 'PDB_ID':'PDB', 'DISPROT_ID':'DisProt', #Category:Protein-protein interaction databases 'BIOGRID_ID':'BioGrid', 'DIP_ID':'DIP', 'MINT_ID':'MINT', 'STRING_ID':'STRING', #Category:Chemistry 'CHEMBL_ID':'ChEMBL', 'DRUGBANK_ID':'DrugBank', 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY', #Category:Protein family/group databases 'ALLERGOME_ID':'Allergome', 'MEROPS_ID':'MEROPS', 'MYCOCLAP_ID':'mycoCLAP', 'PEROXIBASE_ID':'PeroxiBase', 'REBASE_ID':'REBASE', 'TCDB_ID':'TCDB', #Category:Polymorphism databases 'DMDM_ID':'DMDM', #Category:2D gel databases 'WORLD_2DPAGE_ID':'World-2DPAGE', #Category:Protocols and materials databases 'DNASU_ID':'DNASU', #Category:Genome annotation databases 'ENSEMBL_ID':'Ensembl', 'ENSEMBL_PRO_ID':'Ensembl Protein', 'ENSEMBL_TRS_ID':'Ensembl Transcript', 'ENSEMBLGENOME_ID':'Ensembl Genomes', 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein', 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript', 'P_ENTREZGENEID':'GeneID', 'KEGG_ID':'KEGG', 'PATRIC_ID':'PATRIC', 'UCSC_ID':'UCSC', 'VECTORBASE_ID':'VectorBase', #Category:Organism-specific gene databases 'ARACHNOSERVER_ID':'ArachnoServer', 'CGD':'CGD', 'CONOSERVER_ID':'ConoServer', 'CYGD_ID':'CYGD', 'DICTYBASE_ID':'dictyBase', 'ECHOBASE_ID':'EchoBASE', 'ECOGENE_ID':'EcoGene', 'EUHCVDB_ID':'euHCVdb', 'EUPATHDB_ID':'EuPathDB', 'FLYBASE_ID':'FlyBase', 'GENECARDS_ID':'GeneCards', 'GENEFARM_ID':'GeneFarm', 'GENOLIST_ID':'GenoList', 'H_INVDB_ID':'H-InvDB', 'HGNC_ID':'HGNC', 'HPA_ID':'HPA', 'LEGIOLIST_ID':'LegioList', 'LEPROMA_ID':'Leproma', 'MAIZEGDB_ID':'MaizeGDB', 'MIM_ID':'MIM', 'MGI_ID':'MGI', 'NEXTPROT_ID':'neXtProt', 'ORPHANET_ID':'Orphanet', 'PHARMGKB_ID':'PharmGKB', 'POMBASE_ID':'PomBase', 'PSEUDOCAP_ID':'PseudoCAP', 'RGD_ID':'RGD', 'SGD_ID':'SGD', 'TAIR_ID':'TAIR', 'TUBERCULIST_ID':'TubercuList', 'WORMBASE_ID':'WormBase', 'WORMBASE_TRS_ID':'WormBase Transcript', 'WORMBASE_PRO_ID':'WormBase Protein', 'XENBASE_ID':'Xenbase', 'ZFIN_ID':'ZFIN', #Category:Phylogenomic databases 'EGGNOG_ID':'eggNOG', 'GENETREE_ID':'GeneTree', 'HOGENOM_ID':'HOGENOM', 'HOVERGEN_ID':'HOVERGEN', 'KO_ID':'KO', 'OMA_ID':'OMA', 'ORTHODB_ID':'OrthoDB', 'PROTCLUSTDB_ID':'ProtClustDB', 'TREEFAM_ID':'TreeFam', #Category:Enzyme and pathway databases 'BIOCYC_ID':'BioCyc', 'REACTOME_ID':'Reactome', 'UNIPATHWAY_ID':'UniPathWay', #Category:Gene expression databases 'CLEANEX_ID':'CleanEx', #Category:Other 'CHITARS_ID':'ChiTaRS', 'GENOMERNAI_ID':'GenomeRNAi', 'GENEWIKI_ID':'GeneWiki', 'NEXTBIO_ID':'NextBio' } """ TODO: Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession. # rather than an array of input accessions, we need to put them in a dict() by type def addAccession(id_dict,id_type,id): if id_type not in id_dict: id_dict[id_type] = []-++ id_dict[id_type] = [].append(id) # returns(accession, id_type) def getAccession(header, matchType): # (format regex pattern, FROM_ID) # TODO try to determine which type of accession ID we have by matching by regular expressions # each regex match should have a groups[0] that given the accession # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples fmts = [ ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), ('>/UniRef\d+_(\w+).*','ACC+ID'), ('>/(UPI\d+).*','ACC+ID'), ('NP_\d+\.\d+','ACC+ID'), ('ENSP\d+','ENSEMBL_PRO_ID'), ('ENST\d+','ENSEMBL_TRS_ID'), ] for pat,cat in fmts: m = re.match(pat,header) if m: matchType.append(cat) return (m.groups()[0],cat) matchType.append('ACC+ID') return (header,'ACC+ID') # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types idMaps = dict() # {to_id,idMap} for to_id in options.to_id: idMap = dict() idMaps[to_id] = idMap for (from_id,ids) id_dict.items(): # limit the requests to 500 at a time idx = range(0,len(ids),500) idx.append(len(ids)) for i in range(len(idx)-1): getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap) """ """ Some examples of fasta ID lines From: https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ... >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ... >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein. >OExyz (OExyz) xxx xxx xxx >hflu_lsi_xxxx xxx xxx xxx >C_tr_Lx_x [xxx - xxx] | xxx xxx >M. tub.xxx|Rvxxx| xxx xxx // Drosophile DB. // We need to find two elements: // - the accession String (retrieved as the trimmed version of everything // up to (and NOT including) " pep:" // - the description (everything (trimmed) starting from (and including) the " pep:". >CGxxxxxx pep:xxxxxx >xxxx xxx SGDID:xxxx xxx >generic_some_tag|proten_accession|a description for this protein // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as // present in the Expasy FTP FASTA file. // Is formatted something like this: >XXX_YYYY (acc) rest >sp|accession|ID descr rest (including taxonomy, if available >tr|accession|ID descr rest (including taxonomy, if available) >nxp|NX_P02768-1|ALB|Serum albumin|Iso // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as // present in the Expasy FTP FASTA file. // Is formatted something like this: >accession|ID descr rest (including taxonomy, if available) // Flybase FASTA format. >FBxxx type=xxx // A header translating a genome sequence into a protein sequences. // We need to find two elements, separated by a space: // - the accession string (retrieved as the first part of a space delimited String). // - the nucleic acid start and stop site (between brackets, separated by a '-'). >dm345_3L-sense [234353534-234353938] >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF // The Arabidopsis thaliana database; TAIR format >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166 ... // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all. // Testing for this is somewhat more complicated. // Often used simple header; looks like: // >NP0465 (NP0465) A description for this protein. // We need to find two elements: // - the accession String (easily retrieved as the next String until a space is encountered). // - the description >NP0465 (NP0465) A description for this protein. GenBank gb|accession|locus EMBL Data Library emb|accession|locus DDBJ, DNA Database of Japan dbj|accession|locus NBRF PIR pir||entry Protein Research Foundation prf||name SWISS-PROT sp|accession|entry name Brookhaven Protein Data Bank pdb|entry|chain Patents pat|country|number GenInfo Backbone Id bbs|number General database identifier gnl|database|identifier NCBI Reference Sequence ref|accession|locus Local Sequence identifier lcl|identifier """ def getFastaAccession(header): # TODO parse the ID and return the accession # (format regex pattern, FROM_ID) # TODO try to determine which type of accession ID we have by matching by reg|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})ular expressions # each regex match should have a groups[0] that given the accession # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples fmts = [ ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), ('NP_\d+\.\d+','ACC+ID'), ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), ('(...*)_....*','ID'), ('>/(UPI\d+).*','UPARC'), ('>/UniRef50_(\w+).*','NF50'), ('UniRef90_(\w+).*','NF90'), ('UniRef100_(\w+).*','NF100'), ('([A-C][A-Z]?\d{6})|(DM-Z)\d{5})','EMBL_ID'), ('ENSP\d+','ENSEMBL_PRO_ID'), ('ENST\d+','ENSEMBL_TRS_ID') ] for pat,cat in fmts: m = re.match(pat,header) if m: return m.groups()[0] return header # rather than an array of input accessions, we need to put them in a dict() by type def addAccession(id_dict,id_type,id): for idtype in id_type: if idtype not in id_dict: id_dict[idtype] = [] id_dict[idtype].append(id) # returns(accession, id_type) def getAccession(header): # (format regex pattern, FROM_ID) # TODO try to determine which type of accession ID we have by matching by regular expressions # each regex match should have a groups[0] that given the accession # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples fmts = [ ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), ('.*_[A-Z]*','ID'), ('UPI(\d+).*','UPARC'), ('(UniRef50_\w+.*)','NF50'), ('UniRef90_(\w+).*','NF90'), ('(UniRef100_\w+.*)','NF100'), ('a[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}','GENENAME'), ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'), ('K\d*','KO_ID'), ('([A-Z]*\d*\.\d$)','EMBL'), ('([IBC]\d{5})','PIR'), ('(Hs\.\d*)','UNIGENE_ID'), ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'), ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'), ('(\d[A-Z0-9]{3})','PDB_ID'), ('(DP\d{5})','DISPROT_ID'), ('(DIP-\d*N$)','DIP_ID'), ('(MINT-\d*)','MINT_ID'), ('(9606\.ENSP\d*)','STRING_ID'), ('(CHEMBL\d*)','CHEMBL_ID'), ('(DB\d*)','DRUGBANK_ID'), ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'), ('[A-Z]*_[A-Z]*','MYCOCLAP_ID'), ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'), ('\d{4}:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','WORLD_2DPAGE_ID'), ('ENS.?*G\d*','ENSEMBL_ID'), ('ENS.?*P\d*','ENSEMBL_PRO_ID'), ('ENS.?*T\d*','ENSEMBL_TRS_ID'), (' ','ENSEMBLGENOME_ID'), (' ','ENSEMBLGENOME_PRO_ID'), (' ','ENSEMBLGENOME_TRS_ID'), ('(hsa:\d*)','KEGG_ID'), ('(uc\d*[a-z]*\.\d$)','UCSC_ID'), ('VEL:J:LDKJFS','VECTORBASE_ID'), ('AS\d*','ARACHNOSERVER_ID'), ('CAL\d*','CGD'), ('Y[A-Z]{2}[0-9]{3}[cw].*|Q\d{4}|R\d{4}[wc]','CYGD_ID'), ('DDB_G\d*','DICTYBASE_ID'), ('EB\d{4}','ECHOBASE_ID'), ('EG\d{5}','ECOGENE_ID'), ('(?:HM|G[QU]|F[JMN]|E[FU]|DQ|A[A-Z])\d{6}|[DLMSUXYZ]\d{5}','EUHCVDB_ID'), ('[A-Z][a-z]*DB:.*','EUPATHDB_ID'), ('FBgn\d*','FLYBASE_ID'), ('GC[A-Z0-9]{9}','GENECARDS_ID'), ('(?:BSU\d|gbs|LIN|LMO|MUL_|MYPU_|pl[iu])\d{4}|MUP\d{3}c?','GENOLIST_ID'), ('HIX\d*','H_INVDB_ID'), ('HGNC:\d*','HGNC_ID'), ('(?:CAB|HPA)\d{6}','HPA_ID'), ('lp[lp]\d{4}','LEGIOLIST_ID'), ('ML\d{4}','LEPROMA_ID'), ('MGI:\d*','MGI_ID'), ('NX_(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','NEXTPROT_ID'), ('PA\d{3,5,9}','PHARMGKB_ID'), ('SP.*\.\d\dc?,'POMBASE_ID'), ('PA\d{4}','PSEUDOCAP_ID'), ('S\d{9}','SGD_ID'), ('AT.G\d{5}','TAIR_ID'), ('Rv\d{4}c?','TUBERCULIST_ID'), ('WBGene\d{8}','WORMBASE_ID'), ('(\dR(SSE|\d\d?)|A[CH]\d\d?|[BC]\d\d[A-Z]\d)\.\d.[0-9a-z]?|CBG\d{5}|C[CD][48]\.\d.[a-z]?|CE7X_3\.1|cTel5\dX\.1[ab]?|D\d{4}\.\d.[a-z]?,'WORMBASE_ID'), ('C(BP|E)\d{5}','WORMBASE_PRO_ID'), ('XB-GENE-\d*','XENBASE_ID'), ('ZDB-GENE-\d{6}-\d{4}','ZFIN_ID'), ('(.*[CKN]OG\d*)','EGGNOG_ID'), ('(ENSGT\d*)','GENETREE_ID'), ('HOG\d*','HOGENOM_ID'), ('HBG\d*','HOVERGEN_ID'), ('[A-Z]{7}','OMA_ID'), ('EOG\d*','ORTHODB_ID'), ('asdfsadfasdf','PROTCLUSTDB_ID'), ('TF\d*','TREEFAM_ID'), ('REACT_\d*','REACTOME_ID'), ('asdfasl;DF:LJk','UNIPATHWAY_ID'), ('HS_\d*','CLEANEX_ID'), ('SAMEAS GENE NAME','CHITARS_ID'), ('GENENAME_\(gene\)','GENEWIKI_ID'), ] # remove the need for .groups() (i.e. parantheses) if re.match('\d*$', header): # For ambiguous number only ID types numIDtypes = [ 'P_ENTREZGENEID', 'P_GI', 'DMDM_ID', 'BIOGRID_ID', 'GUIDETOPHARMACOLOGY_ID', 'ALLERGOME_ID', 'PEROXIBASE_ID', 'REBASE_ID', 'DNASU_ID', 'GENOMERNAI_ID', 'NEXTBIO_ID', 'CONOSERVER_ID', 'GENEFARM_ID', 'MAIZEGDB_ID', 'MIM_ID', 'MGI_ID', 'ORPHANET_ID', 'RGD_ID'] ambiguous = [] for numIDs in numIDtypes: nm = getIdMapping([header],numIDs,'ACC',ambiguity=True) if nm != None: ambiguous.append(nm) if ambiguous == []: ambiguous.append('No Match') return (header, ambiguous) for pat,cat in fmts: m = re.match(pat,header) if m: #return (m.groups()[0],[cat]) return (header,[cat]) return (header,['ACC+ID']) def read_tabular(filepath,col): accessions = [] with open(filepath) as fp: for i,line in enumerate(fp): if line.strip() == '' or line.startswith('#'): continue fields = line.rstrip('\n').split('\t') accession = fields[col] accessions.append(accession) return accessions def get_fasta_entries(fp): name, seq = None, [] for line in fp: line = line.rstrip() if line.startswith(">"): if name: yield (name, ''.join(seq)) name, seq = line, [] else: seq.append(line) if name: yield (name, ''.join(seq)) def read_fasta(filepath): accessions = [] with open(filepath) as fp: for id, peptide in get_fasta_entries(fp): accessions.append(getFastaAccession(id)) return accessions def read_mzid(fp): accessions = [] for event, elem in ET.iterparse(fp): if event == 'end': if re.search('DBSequence',elem.tag): accessions.append(elem.attrib['accession']) return accessions def read_pepxml(fp): accessions = [] for event, elem in ET.iterparse(fp): if event == 'end': if re.search('search_hit',elem.tag): accessions.append(elem.get('protein')) return accessions def getUniprotSequence(uniprot_id): url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id print url fp = urllib2.urlopen(url) for id, seq in get_fasta_entries(fp): if seq: return seq return '' def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,ambiguity=None,crossReference=None,idMaps=None): # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)])) if not accessions: return url = 'http://www.uniprot.org/mapping/' # Cross Referencing: Mapping to non-UniprotKB ('ACC') IDs to other non-UniprotKB ('ACC') IDs if to_id != 'ACC' and from_id != 'ACC': crMap = {} getIdMapping(accessions,from_id,'ACC',idMap=crMap) crMap2 = crMap.copy() for x in crMap.keys(): for y in crMap[x].keys(): for z in crMap[x][y]: crMap2[x][y] = [] getIdMapping([z],'ACC',to_id,idMap=crMap2,crossReference=[x,from_id]) idMap.update(crMap2) return params = { 'from': from_id, 'to': to_id, 'format':'tab', 'query': '\n'.join(accessions) } data = urllib.urlencode(params) request = urllib2.Request(url, data) contact = "" # Please set your email address here to help us debug in case of problems. request.add_header('User-Agent', 'Python %s' % contact) response = None for i in range(3): try: response = urllib2.urlopen(request) break except Exception, e: warn_err("%s",exit_code=None) if response: response.next() print params for line in response: # print >> sys.stderr, "idMap: %s" % line if fh: fh.write(line) if ambiguity: # if there was a response, then an ambiguous match can be made return from_id if idMap != None: id1,id2 = line.strip().split('\t') print id2 if crossReference != None: id1, from_id = crossReference # print >> sys.stderr, "idMap: %s:%s" % (id1,id2) try: idMap[id1][from_id].append(id2) except: idMap[id1] = {from_id:[id2]} if ambiguity == None: for line in response: for acc in accessions: idMap[acc] = {from_id:['N/A']} return #Parse Command Line parser = optparse.OptionParser() # input files parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) # Decoy pattern parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' ) # filter patterns parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' ) parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' ) # Unipept Flags parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' ) parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' ) # output files parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values') # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' ) parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' ) (options, args) = parser.parse_args() if options.version: print >> sys.stdout,"%s" % version sys.exit(0) accessions = [] ## Get accessions if options.mzid: accessions += read_mzid(options.mzid) if options.pepxml: accessions += read_pepxml(options.pepxml) if options.tabular: accessions += read_tabular(options.tabular,options.column) if options.fasta: accessions += read_fasta(options.fasta) if args and len(args) > 0: for i,accession in enumerate(args): accessions.append(accession) # filter accessions if options.decoy: filtered_accs = [re.sub(options.decoy,'',x) for x in accessions] accessions = filtered_accs if options.include_pat: filtered_accs = [] for acc in accessions: if re.match(options.include_pat,acc): filtered_accs.append(acc) accessions = filtered_accs if options.exclude_pat: filtered_accs = [] for acc in accessions: if not re.match(options.exclude_pat,acc): filtered_accs.append(acc) accessions = filtered_accs if len(accessions) < 1: warn_err("No accessions input!",exit_code=1) if options.output != None: try: outputPath = os.path.abspath(options.output) outputFile = open(outputPath, 'w') except Exception, e: print >> sys.stderr, "failed: %s" % e exit(3) else: outputFile = sys.stdout # Removes duplicates in accessions seen = set() seen_add = seen.add accessions = [x for x in accessions if not (x in seen or seen_add(x))] # Sorts accessions to inferred ID types i+n a dictionary id_dict = {} for header in accessions: id , id_types = getAccession(header) addAccession(id_dict,id_types,id) idMaps = dict() # {to_id,idMap} for to_id in options.to_id: idMap = dict() idMaps[to_id] = idMap for (from_id,ids) in id_dict.items(): # limit the requests to 500 at a time idx = range(0,len(ids),500) idx.append(len(ids)) for i in range(len(idx)-1): getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps) print ids #Write output #Output Table Header outputFile.write("\n#%-17s" % (options.from_id)) for t in options.to_id: outputFile.write("%-18s" % t) outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n") # Create an output-friendly matrix idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))] for a, acc in enumerate(accessions): idLength = 0 for i, to_id in enumerate(options.to_id): # [[ids],[ids]] idArrayColumn = [] idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3] for from_id in idPairs: ids = idPairs[from_id] for id_ in ids: idArrayColumn.append("%s[%s]" % (id_,from_id)) if idLength < len(idArrayColumn): idLength = len(idArrayColumn) idArray[a][i].extend(idArrayColumn) for y in range(idLength): outputFile.write("%-18s" % acc) for x in range(len(options.to_id)): try: outputFile.write("%-18s" % idArray[a][x][y]) except: outputFile.write(" " * 18) outputFile.write("\n") # Output Matrix #print idMaps # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv stop = timeit.default_timer() outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds")) if __name__ == "__main__" : __main__()