comparison uniprot_ID_mapper/uniprot_id_mapping.py @ 3:6651ac4651f0 draft default tip

Uploaded
author rsajulga
date Thu, 16 Jul 2015 16:41:35 -0400
2:d0311668d442 3:6651ac4651f0
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2015, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14 import json
15 import logging
16 import optparse
17 from optparse import OptionParser
18 import os
19 import sys
20 import re
21 import urllib
22 import urllib2
23 import timeit
24 start = timeit.default_timer()
25 try:
26 import xml.etree.cElementTree as ET
27 except ImportError:
28 import xml.etree.ElementTree as ET
29
30 """print
31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples
32 http://www.uniprot.org/help/uploadlists
33
34 From input files or stdin get accessions/IDs to map.
35 An input option can specify what id_type the input accessions/IDs are,
36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type
37
38 input_dict = dict()
39 for id_string in input_ids:
40 (id,id_type) = getAccession(id_string)
41 if not id_type in input_dict:
42 input_dict[id_type] = []
43 input_dict[id_type].append(id)
44
45
46 # We need to retrieve by from_id_type and to_id_type
47 for id_type in input_dict:
48 for to_id in to_id_types:
49 getIdMapping(input_dict[id_type], id_type, to_id)
50
51 tabular output:
52 #input_id to_id to_id
53
54
55 """
56 def warn_err(msg,exit_code=1):
57 sys.stderr.write(msg)
58 if exit_code:
59 sys.exit(exit_code)
60
61 def __main__():
62 version = '1.0'
63 idDict = {
64 #Category:UniProt
65 'ACC+ID':'UniProtKB AC/ID',
66 'ACC':'UniProtKB AC',
67 'ID':'UniProtKB ID',
68 'UPARC':'UniParc',
69 'NF50':'UniRef50',
70 'NF90':'UniRef90',
71 'NF100':'UniRef100',
72 'GENENAME':'Gene name',
73 #Category:Other sequence databases
74 'EMBL_ID':'EMBL/GenBank/DDBJ',
75 'EMBL':'EMBL/GenBank/DDBJ CDS',
76 'PIR':'PIR',
77 'UNIGENE_ID':'UniGene',
78 'P_ENTREZGENEID':'Entrez Gene (GeneID)',
79 'P_GI':'GI number*',
80 'P_REFSEQ_AC':'RefSeq Protein',
81 'REFSEQ_NT_ID':'RefSeq Nucleotide',
82 #Category:3D structure databases
83 'PDB_ID':'PDB',
84 'DISPROT_ID':'DisProt',
85 #Category:Protein-protein interaction databases
86 'BIOGRID_ID':'BioGrid',
87 'DIP_ID':'DIP',
88 'MINT_ID':'MINT',
89 'STRING_ID':'STRING',
90 #Category:Chemistry
91 'CHEMBL_ID':'ChEMBL',
92 'DRUGBANK_ID':'DrugBank',
93 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
94 #Category:Protein family/group databases
95 'ALLERGOME_ID':'Allergome',
96 'MEROPS_ID':'MEROPS',
97 'MYCOCLAP_ID':'mycoCLAP',
98 'PEROXIBASE_ID':'PeroxiBase',
99 'REBASE_ID':'REBASE',
100 'TCDB_ID':'TCDB',
101 #Category:Polymorphism databases
102 'DMDM_ID':'DMDM',
103 #Category:2D gel databases
104 'WORLD_2DPAGE_ID':'World-2DPAGE',
105 #Category:Protocols and materials databases
106 'DNASU_ID':'DNASU',
107 #Category:Genome annotation databases
108 'ENSEMBL_ID':'Ensembl',
109 'ENSEMBL_PRO_ID':'Ensembl Protein',
110 'ENSEMBL_TRS_ID':'Ensembl Transcript',
111 'ENSEMBLGENOME_ID':'Ensembl Genomes',
112 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
113 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
114 'P_ENTREZGENEID':'GeneID', # note: duplicate key, also defined above under "Other sequence databases"; this entry wins
115 'KEGG_ID':'KEGG',
116 'PATRIC_ID':'PATRIC',
117 'UCSC_ID':'UCSC',
118 'VECTORBASE_ID':'VectorBase',
119 #Category:Organism-specific gene databases
120 'ARACHNOSERVER_ID':'ArachnoServer',
121 'CGD':'CGD',
122 'CONOSERVER_ID':'ConoServer',
123 'CYGD_ID':'CYGD',
124 'DICTYBASE_ID':'dictyBase',
125 'ECHOBASE_ID':'EchoBASE',
126 'ECOGENE_ID':'EcoGene',
127 'EUHCVDB_ID':'euHCVdb',
128 'EUPATHDB_ID':'EuPathDB',
129 'FLYBASE_ID':'FlyBase',
130 'GENECARDS_ID':'GeneCards',
131 'GENEFARM_ID':'GeneFarm',
132 'GENOLIST_ID':'GenoList',
133 'H_INVDB_ID':'H-InvDB',
134 'HGNC_ID':'HGNC',
135 'HPA_ID':'HPA',
136 'LEGIOLIST_ID':'LegioList',
137 'LEPROMA_ID':'Leproma',
138 'MAIZEGDB_ID':'MaizeGDB',
139 'MIM_ID':'MIM',
140 'MGI_ID':'MGI',
141 'NEXTPROT_ID':'neXtProt',
142 'ORPHANET_ID':'Orphanet',
143 'PHARMGKB_ID':'PharmGKB',
144 'POMBASE_ID':'PomBase',
145 'PSEUDOCAP_ID':'PseudoCAP',
146 'RGD_ID':'RGD',
147 'SGD_ID':'SGD',
148 'TAIR_ID':'TAIR',
149 'TUBERCULIST_ID':'TubercuList',
150 'WORMBASE_ID':'WormBase',
151 'WORMBASE_TRS_ID':'WormBase Transcript',
152 'WORMBASE_PRO_ID':'WormBase Protein',
153 'XENBASE_ID':'Xenbase',
154 'ZFIN_ID':'ZFIN',
155 #Category:Phylogenomic databases
156 'EGGNOG_ID':'eggNOG',
157 'GENETREE_ID':'GeneTree',
158 'HOGENOM_ID':'HOGENOM',
159 'HOVERGEN_ID':'HOVERGEN',
160 'KO_ID':'KO',
161 'OMA_ID':'OMA',
162 'ORTHODB_ID':'OrthoDB',
163 'PROTCLUSTDB_ID':'ProtClustDB',
164 'TREEFAM_ID':'TreeFam',
165 #Category:Enzyme and pathway databases
166 'BIOCYC_ID':'BioCyc',
167 'REACTOME_ID':'Reactome',
168 'UNIPATHWAY_ID':'UniPathWay',
169 #Category:Gene expression databases
170 'CLEANEX_ID':'CleanEx',
171 #Category:Other
172 'CHITARS_ID':'ChiTaRS',
173 'GENOMERNAI_ID':'GenomeRNAi',
174 'GENEWIKI_ID':'GeneWiki',
175 'NEXTBIO_ID':'NextBio'
176 }
177
178 """
179 TODO:
180 Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file
181 - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession.
182
183 # rather than an array of input accessions, we need to put them in a dict() by type
184 def addAccession(id_dict,id_type,id):
185 if id_type not in id_dict:
186 id_dict[id_type] = []
187 id_dict[id_type].append(id)
188
189 # returns(accession, id_type)
190 def getAccession(header, matchType):
191 # (format regex pattern, FROM_ID)
192 # TODO try to determine which type of accession ID we have by matching by regular expressions
193 # each regex match should have a groups()[0] that gives the accession
194 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
195 fmts = [
196 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
197 ('>?gi\|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
198 ('>?UniRef\d+_(\w+).*','ACC+ID'),
199 ('>?(UPI\d+).*','ACC+ID'),
200 ('NP_\d+\.\d+','ACC+ID'),
201 ('ENSP\d+','ENSEMBL_PRO_ID'),
202 ('ENST\d+','ENSEMBL_TRS_ID'),
203 ]
204 for pat,cat in fmts:
205 m = re.match(pat,header)
206 if m:
207 matchType.append(cat)
208 return (m.groups()[0],cat)
209 matchType.append('ACC+ID')
210 return (header,'ACC+ID')
211
212 # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
213 idMaps = dict() # {to_id,idMap}
214 for to_id in options.to_id:
215 idMap = dict()
216 idMaps[to_id] = idMap
217 for (from_id,ids) in id_dict.items():
218 # limit the requests to 500 at a time
219 idx = range(0,len(ids),500)
220 idx.append(len(ids))
221 for i in range(len(idx)-1):
222 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
223 """
224
225 """
226 Some examples of fasta ID lines From:
227 https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
228 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
229 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
230 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
231 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
232 >OExyz (OExyz) xxx xxx xxx
233 >hflu_lsi_xxxx xxx xxx xxx
234 >C_tr_Lx_x [xxx - xxx] | xxx xxx
235 >M. tub.xxx|Rvxxx| xxx xxx
236 // Drosophile DB.
237 // We need to find two elements:
238 // - the accession String (retrieved as the trimmed version of everything
239 // up to (and NOT including) " pep:"
240 // - the description (everything (trimmed) starting from (and including) the " pep:".
241 >CGxxxxxx pep:xxxxxx
242 >xxxx xxx SGDID:xxxx xxx
243 >generic_some_tag|proten_accession|a description for this protein
244 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as
245 // present in the Expasy FTP FASTA file.
246 // Is formatted something like this:
247 >XXX_YYYY (acc) rest
248 >sp|accession|ID descr rest (including taxonomy, if available
249 >tr|accession|ID descr rest (including taxonomy, if available)
250 >nxp|NX_P02768-1|ALB|Serum albumin|Iso
251 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
252 // present in the Expasy FTP FASTA file.
253 // Is formatted something like this:
254 >accession|ID descr rest (including taxonomy, if available)
255 // Flybase FASTA format.
256 >FBxxx type=xxx
257 // A header translating a genome sequence into a protein sequences.
258 // We need to find two elements, separated by a space:
259 // - the accession string (retrieved as the first part of a space delimited String).
260 // - the nucleic acid start and stop site (between brackets, separated by a '-').
261 >dm345_3L-sense [234353534-234353938]
262 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
263 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF
264 // The Arabidopsis thaliana database; TAIR format
265 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
266 ...
267 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all.
268 // Testing for this is somewhat more complicated.
269 // Often used simple header; looks like:
270 // >NP0465 (NP0465) A description for this protein.
271 // We need to find two elements:
272 // - the accession String (easily retrieved as the next String until a space is encountered).
273 // - the description
274 >NP0465 (NP0465) A description for this protein.
275 GenBank gb|accession|locus
276 EMBL Data Library emb|accession|locus
277 DDBJ, DNA Database of Japan dbj|accession|locus
278 NBRF PIR pir||entry
279 Protein Research Foundation prf||name
280 SWISS-PROT sp|accession|entry name
281 Brookhaven Protein Data Bank pdb|entry|chain
282 Patents pat|country|number
283 GenInfo Backbone Id bbs|number
284 General database identifier gnl|database|identifier
285 NCBI Reference Sequence ref|accession|locus
286 Local Sequence identifier lcl|identifier
287 """
288
289
290 def getFastaAccession(header):
291 # TODO parse the ID and return the accession
292 # (format regex pattern, FROM_ID)
293 # TODO try to determine which type of accession ID we have by matching by regular expressions
294 # each regex match should have a groups()[0] that gives the accession
295 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
296 fmts = [
297 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
298 ('>?gi\|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
299 ('NP_\d+\.\d+','ACC+ID'),
300
301 ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
302 ('(...*)_....*','ID'),
303 ('>?(UPI\d+).*','UPARC'),
304 ('>?UniRef50_(\w+).*','NF50'),
305 ('UniRef90_(\w+).*','NF90'),
306 ('UniRef100_(\w+).*','NF100'),
307 ('([A-C][A-Z]?\d{6})|([DM-Z]\d{5})','EMBL_ID'),
308 ('ENSP\d+','ENSEMBL_PRO_ID'),
309 ('ENST\d+','ENSEMBL_TRS_ID')
310 ]
311 for pat,cat in fmts:
312 m = re.match(pat,header)
313 if m:
314 return m.groups()[0]
315 return header
316
317 # rather than an array of input accessions, we need to put them in a dict() by type
318 def addAccession(id_dict,id_type,id):
319 for idtype in id_type:
320 if idtype not in id_dict:
321 id_dict[idtype] = []
322 id_dict[idtype].append(id)
323
324 # returns(accession, id_type)
325 def getAccession(header):
326 # (format regex pattern, FROM_ID)
327 # TODO try to determine which type of accession ID we have by matching by regular expressions
328 # each regex match should have a groups()[0] that gives the accession
329 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
330 fmts = [
331 ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
332 ('.*_[A-Z]*','ID'),
333 ('UPI(\d+).*','UPARC'),
334 ('(UniRef50_\w+.*)','NF50'),
335 ('UniRef90_(\w+).*','NF90'),
336 ('(UniRef100_\w+.*)','NF100'),
337 ('a[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}','GENENAME'),
338
339 ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
340 ('K\d*','KO_ID'),
341
342 ('([A-Z]*\d*\.\d$)','EMBL'),
343 ('([IBC]\d{5})','PIR'),
344 ('(Hs\.\d*)','UNIGENE_ID'),
345 ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
346 ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
347 ('(\d[A-Z0-9]{3})','PDB_ID'),
348 ('(DP\d{5})','DISPROT_ID'),
349 ('(DIP-\d*N$)','DIP_ID'),
350 ('(MINT-\d*)','MINT_ID'),
351 ('(9606\.ENSP\d*)','STRING_ID'),
352 ('(CHEMBL\d*)','CHEMBL_ID'),
353 ('(DB\d*)','DRUGBANK_ID'),
354 ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
355
356 ('[A-Z]*_[A-Z]*','MYCOCLAP_ID'),
357
358 ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
359 ('\d{4}:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','WORLD_2DPAGE_ID'),
360 ('ENS.*?G\d*','ENSEMBL_ID'),
361 ('ENS.*?P\d*','ENSEMBL_PRO_ID'),
362 ('ENS.*?T\d*','ENSEMBL_TRS_ID'),
363 (' ','ENSEMBLGENOME_ID'), # placeholder pattern
364 (' ','ENSEMBLGENOME_PRO_ID'), # placeholder pattern
365 (' ','ENSEMBLGENOME_TRS_ID'), # placeholder pattern
366 ('(hsa:\d*)','KEGG_ID'),
367 ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
368 ('VEL:J:LDKJFS','VECTORBASE_ID'), # placeholder pattern
369 ('AS\d*','ARACHNOSERVER_ID'),
370 ('CAL\d*','CGD'),
371 ('Y[A-Z]{2}[0-9]{3}[cw].*|Q\d{4}|R\d{4}[wc]','CYGD_ID'),
372 ('DDB_G\d*','DICTYBASE_ID'),
373 ('EB\d{4}','ECHOBASE_ID'),
374 ('EG\d{5}','ECOGENE_ID'),
375 ('(?:HM|G[QU]|F[JMN]|E[FU]|DQ|A[A-Z])\d{6}|[DLMSUXYZ]\d{5}','EUHCVDB_ID'),
376 ('[A-Z][a-z]*DB:.*','EUPATHDB_ID'),
377 ('FBgn\d*','FLYBASE_ID'),
378 ('GC[A-Z0-9]{9}','GENECARDS_ID'),
379 ('(?:BSU\d|gbs|LIN|LMO|MUL_|MYPU_|pl[iu])\d{4}|MUP\d{3}c?','GENOLIST_ID'),
380 ('HIX\d*','H_INVDB_ID'),
381 ('HGNC:\d*','HGNC_ID'),
382 ('(?:CAB|HPA)\d{6}','HPA_ID'),
383 ('lp[lp]\d{4}','LEGIOLIST_ID'),
384 ('ML\d{4}','LEPROMA_ID'),
385 ('MGI:\d*','MGI_ID'),
386 ('NX_(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','NEXTPROT_ID'),
387 ('PA(?:\d{9}|\d{5}|\d{3})','PHARMGKB_ID'),
388 ('SP.*\.\d\dc?','POMBASE_ID'),
389 ('PA\d{4}','PSEUDOCAP_ID'),
390 ('S\d{9}','SGD_ID'),
391 ('AT.G\d{5}','TAIR_ID'),
392 ('Rv\d{4}c?','TUBERCULIST_ID'),
393 ('WBGene\d{8}','WORMBASE_ID'),
394 ('(\dR(SSE|\d\d?)|A[CH]\d\d?|[BC]\d\d[A-Z]\d)\.\d.[0-9a-z]?|CBG\d{5}|C[CD][48]\.\d.[a-z]?|CE7X_3\.1|cTel5\dX\.1[ab]?|D\d{4}\.\d.[a-z]?','WORMBASE_ID'),
395 ('C(BP|E)\d{5}','WORMBASE_PRO_ID'),
396 ('XB-GENE-\d*','XENBASE_ID'),
397 ('ZDB-GENE-\d{6}-\d{4}','ZFIN_ID'),
398
399
400 ('(.*[CKN]OG\d*)','EGGNOG_ID'),
401 ('(ENSGT\d*)','GENETREE_ID'),
402 ('HOG\d*','HOGENOM_ID'),
403 ('HBG\d*','HOVERGEN_ID'),
404
405 ('[A-Z]{7}','OMA_ID'),
406
407 ('EOG\d*','ORTHODB_ID'),
408 ('asdfsadfasdf','PROTCLUSTDB_ID'), # placeholder pattern
409 ('TF\d*','TREEFAM_ID'),
410 ('REACT_\d*','REACTOME_ID'),
411 ('asdfasl;DF:LJk','UNIPATHWAY_ID'), # placeholder pattern
412 ('HS_\d*','CLEANEX_ID'),
413 ('SAMEAS GENE NAME','CHITARS_ID'), # placeholder: ChiTaRS IDs are gene names
414 ('GENENAME_\(gene\)','GENEWIKI_ID'),
415 ]
416 # remove the need for .groups() (i.e. parentheses)
417 if re.match('\d+$', header): # For ambiguous, number-only ID types
418 numIDtypes = [
419 'P_ENTREZGENEID',
420 'P_GI',
421 'DMDM_ID',
422 'BIOGRID_ID',
423 'GUIDETOPHARMACOLOGY_ID',
424 'ALLERGOME_ID',
425 'PEROXIBASE_ID',
426 'REBASE_ID',
427 'DNASU_ID',
428 'GENOMERNAI_ID',
429 'NEXTBIO_ID',
430 'CONOSERVER_ID',
431 'GENEFARM_ID',
432 'MAIZEGDB_ID',
433 'MIM_ID',
434 'MGI_ID',
435 'ORPHANET_ID',
436 'RGD_ID']
437 ambiguous = []
438 for numIDs in numIDtypes:
439 nm = getIdMapping([header],numIDs,'ACC',ambiguity=True)
440 if nm != None:
441 ambiguous.append(nm)
442 if ambiguous == []:
443 ambiguous.append('No Match')
444 return (header, ambiguous)
445 for pat,cat in fmts:
446 m = re.match(pat,header)
447 if m:
448 #return (m.groups()[0],[cat])
449 return (header,[cat])
450 return (header,['ACC+ID'])
451
452
453 def read_tabular(filepath,col):
454 accessions = []
455 with open(filepath) as fp:
456 for i,line in enumerate(fp):
457 if line.strip() == '' or line.startswith('#'):
458 continue
459 fields = line.rstrip('\n').split('\t')
460 accession = fields[col]
461 accessions.append(accession)
462 return accessions
463
464 def get_fasta_entries(fp):
465 name, seq = None, []
466 for line in fp:
467 line = line.rstrip()
468 if line.startswith(">"):
469 if name: yield (name, ''.join(seq))
470 name, seq = line, []
471 else:
472 seq.append(line)
473 if name: yield (name, ''.join(seq))
474
475 def read_fasta(filepath):
476 accessions = []
477 with open(filepath) as fp:
478 for id, peptide in get_fasta_entries(fp):
479 accessions.append(getFastaAccession(id))
480 return accessions
481
482 def read_mzid(fp):
483 accessions = []
484 for event, elem in ET.iterparse(fp):
485 if event == 'end':
486 if re.search('DBSequence',elem.tag):
487 accessions.append(elem.attrib['accession'])
488 return accessions
489
490 def read_pepxml(fp):
491 accessions = []
492 for event, elem in ET.iterparse(fp):
493 if event == 'end':
494 if re.search('search_hit',elem.tag):
495 accessions.append(elem.get('protein'))
496 return accessions
497
498 def getUniprotSequence(uniprot_id):
499 url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
500 print url
501 fp = urllib2.urlopen(url)
502 for id, seq in get_fasta_entries(fp):
503 if seq:
504 return seq
505 return ''
506
507
508
509
510 def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,ambiguity=None,crossReference=None,idMaps=None):
511 # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
512 if not accessions:
513 return
514 url = 'http://www.uniprot.org/mapping/'
515 # Cross-referencing: mapping a non-UniProtKB ID to another non-UniProtKB ID is done in two hops through UniProtKB accessions ('ACC')
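# For example (IDs here are only illustrative): asked for PDB_ID -> ENSEMBL_PRO_ID,
# the block below first maps the PDB entry to a UniProtKB accession (PDB_ID -> ACC)
# and then maps that accession on to the requested type (ACC -> ENSEMBL_PRO_ID),
# passing the original PDB ID in via the crossReference argument so the final
# idMap stays keyed by the user's input ID rather than by the intermediate accession.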
516 if to_id != 'ACC' and from_id != 'ACC':
517 crMap = {}
518 getIdMapping(accessions,from_id,'ACC',idMap=crMap)
519 crMap2 = crMap.copy()
520 for x in crMap.keys():
521 for y in crMap[x].keys():
522 for z in crMap[x][y]:
523 crMap2[x][y] = []
524 getIdMapping([z],'ACC',to_id,idMap=crMap2,crossReference=[x,from_id])
525 idMap.update(crMap2)
526 return
527 params = {
528 'from': from_id,
529 'to': to_id,
530 'format':'tab',
531 'query': '\n'.join(accessions)
532 }
533 data = urllib.urlencode(params)
534 request = urllib2.Request(url, data)
535 contact = "" # Please set your email address here to help us debug in case of problems.
536 request.add_header('User-Agent', 'Python %s' % contact)
537 response = None
538 for i in range(3):
539 try:
540 response = urllib2.urlopen(request)
541 break
542 except Exception, e:
543 warn_err("%s\n" % e, exit_code=None)
544 if response:
545 response.next() # skip the header line of the tab-separated reply
546 print params
547
548 for line in response:
549 # print >> sys.stderr, "idMap: %s" % line
550 if fh:
551 fh.write(line)
552 if ambiguity: # if there was a response, then an ambiguous match can be made
553 return from_id
554 if idMap != None:
555 id1,id2 = line.strip().split('\t')
556 print id2
557 if crossReference != None:
558 id1, from_id = crossReference
559 # print >> sys.stderr, "idMap: %s:%s" % (id1,id2)
560 try:
561 idMap[id1][from_id].append(id2)
562 except:
563 idMap[id1] = {from_id:[id2]}
564 if ambiguity == None:
565 for line in response:
566 for acc in accessions:
567 idMap[acc] = {from_id:['N/A']}
568 return
569
570
571 #Parse Command Line
572 parser = optparse.OptionParser()
573 # input files
574 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a column of protein IDs/accessions' )
575 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) of the tabular file that contains the IDs/accessions' )
576 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file; IDs/accessions are parsed from the sequence headers' )
577 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='An mzIdentML file; protein accessions are read from its DBSequence elements' )
578 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepXML file; protein accessions are read from its search_hit elements' )
579 # Decoy pattern
580 parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs, e.g. _REVERSED' )
581 # filter patterns
582 parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
583 parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
584 # UniProt ID mapping options
585 parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
586 parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
587 # output files
588 parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
589 # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
590 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
591 (options, args) = parser.parse_args()
592 if options.version:
593 print >> sys.stdout,"%s" % version
594 sys.exit(0)
595
596 accessions = []
597 ## Get accessions
598 if options.mzid:
599 accessions += read_mzid(options.mzid)
600 if options.pepxml:
601 accessions += read_pepxml(options.pepxml)
602 if options.tabular:
603 accessions += read_tabular(options.tabular,options.column)
604 if options.fasta:
605 accessions += read_fasta(options.fasta)
606 if args and len(args) > 0:
607 for i,accession in enumerate(args):
608 accessions.append(accession)
609 # filter accessions
610 if options.decoy:
611 filtered_accs = [re.sub(options.decoy,'',x) for x in accessions]
612 accessions = filtered_accs
613 if options.include_pat:
614 filtered_accs = []
615 for acc in accessions:
616 if re.match(options.include_pat,acc):
617 filtered_accs.append(acc)
618 accessions = filtered_accs
619 if options.exclude_pat:
620 filtered_accs = []
621 for acc in accessions:
622 if not re.match(options.exclude_pat,acc):
623 filtered_accs.append(acc)
624 accessions = filtered_accs
625 if len(accessions) < 1:
626 warn_err("No accessions input!",exit_code=1)
627 if options.output != None:
628 try:
629 outputPath = os.path.abspath(options.output)
630 outputFile = open(outputPath, 'w')
631 except Exception, e:
632 print >> sys.stderr, "failed: %s" % e
633 exit(3)
634 else:
635 outputFile = sys.stdout
636
637
638 # Removes duplicates in accessions
639 seen = set()
640 seen_add = seen.add
641 accessions = [x for x in accessions if not (x in seen or seen_add(x))]
642 # Sort accessions into a dictionary keyed by inferred ID type
643 id_dict = {}
644 for header in accessions:
645 id , id_types = getAccession(header)
646 addAccession(id_dict,id_types,id)
647 idMaps = dict() # {to_id,idMap}
648 for to_id in options.to_id:
649 idMap = dict()
650 idMaps[to_id] = idMap
651 for (from_id,ids) in id_dict.items():
652 # limit the requests to 500 at a time
653 idx = range(0,len(ids),500)
654 idx.append(len(ids))
655 for i in range(len(idx)-1):
656 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps)
657 print ids
658 #Write output
659 #Output Table Header
660 outputFile.write("\n#%-17s" % (options.from_id))
661 for t in options.to_id:
662 outputFile.write("%-18s" % t)
663 outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")
664
665 # Create an output-friendly matrix
666 idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))]
667 for a, acc in enumerate(accessions):
668 idLength = 0
669 for i, to_id in enumerate(options.to_id): # [[ids],[ids]]
670 idArrayColumn = []
671 idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3]
672 for from_id in idPairs:
673 ids = idPairs[from_id]
674 for id_ in ids:
675 idArrayColumn.append("%s[%s]" % (id_,from_id))
676 if idLength < len(idArrayColumn):
677 idLength = len(idArrayColumn)
678 idArray[a][i].extend(idArrayColumn)
679 for y in range(idLength):
680 outputFile.write("%-18s" % acc)
681 for x in range(len(options.to_id)):
682 try:
683 outputFile.write("%-18s" % idArray[a][x][y])
684 except:
685 outputFile.write(" " * 18)
686 outputFile.write("\n")
687 # Output Matrix
688
689
690
691 #print idMaps
692 # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
693 stop = timeit.default_timer()
694 outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))
695
696 if __name__ == "__main__" : __main__()
697