comparison uniprot_ID_mapper/uniprot_id_mapping _071315.py @ 3:6651ac4651f0 draft default tip

Uploaded
author rsajulga
date Thu, 16 Jul 2015 16:41:35 -0400
parents
children
comparison
equal deleted inserted replaced
2:d0311668d442 3:6651ac4651f0
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2015, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14 import json
15 import logging
16 import optparse
17 from optparse import OptionParser
18 import os
19 import sys
20 import re
21 import urllib
22 import urllib2
23 import timeit
24 start = timeit.default_timer()
25 try:
26 import xml.etree.cElementTree as ET
27 except ImportError:
28 import xml.etree.ElementTree as ET
29
30 """print
31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples
32 http://www.uniprot.org/help/uploadlists
33
34 From input files or stdin get accessions/IDs to map.
35 An input option can specify what id_type the input accessions/IDs are,
36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type
37
38 input_dict = dict()
39 for id_string in input_ids:
40 (id,id_type) = getAccession(id_string)
41 if not id_type in input_dict:
42 input_dict[id_type] = []
43 input_dict[id_type].append(id)
44
45
46 # We need to retrieve by from_id_type and to_id_type
47 for id_type in input_dict:
48 for
49
50
51 tabular output:
52 #input_id to_id to_id
53
54
55 """
56 def warn_err(msg,exit_code=1):
57 sys.stderr.write(msg)
58 if exit_code:
59 sys.exit(exit_code)
60
61 def __main__():
62 version = '1.0'
63 idDict = {
64 #Category:UniProtprint
65 'ACC+ID':'UniProtKB AC/ID',
66 'ACC':'UniProtKB AC',
67 'ID':'UniProtKB ID',
68 'UPARC':'UniParc',
69 'NF50':'UniRef50',
70 'NF90':'UniRef90',
71 'NF100':'UniRef100',
72 #Category:Other sequence databases
73 'EMBL_ID':'EMBL/GenBank/DDBJ',
74 'EMBL':'EMBL/GenBank/DDBJ CDS',
75 'PIR':'PIR',
76 'UNIGENE_ID':'UniGene',
77 'P_ENTREZGENEID':'Entrez Gene (GeneID)',
78 'P_GI':'GI number*',
79 'P_REFSEQ_AC':'RefSeq Protein',
80 'REFSEQ_NT_ID':'RefSeq Nucleotide',
81 #Category:3D structure databases
82 'PDB_ID':'PDB',
83 'DISPROT_ID':'DisProt',
84 #Category:Protein-protein interaction databases
85 'BIOGRID_ID':'BioGrid',
86 'DIP_ID':'DIP',
87 'MINT_ID':'MINT',
88 'STRING_ID':'STRING',
89 #Category:Chemistry
90 'CHEMBL_ID':'ChEMBL',
91 'DRUGBANK_ID':'DrugBank',
92 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
93 #Category:Protein family/group databases
94 'ALLERGOME_ID':'Allergome',
95 'MEROPS_ID':'MEROPS',
96 'MYCOCLAP_ID':'mycoCLAP',
97 'PEROXIBASE_ID':'PeroxiBase',
98 'REBASE_ID':'REBASE',
99 'TCDB_ID':'TCDB',
100 #Category:Polymorphism databases
101 'DMDM_ID':'DMDM',
102 #Category:2D gel databases
103 'WORLD_2DPAGE_ID':'World-2DPAGE',
104 #Category:Protocols and materials databases
105 'DNASU_ID':'DNASU',
106 #Category:Genome annotation databases
107 'ENSEMBL_ID':'Ensembl',
108 'ENSEMBL_PRO_ID':'Ensembl Protein',
109 'ENSEMBL_TRS_ID':'Ensembl Transcript',
110 'ENSEMBLGENOME_ID':'Ensembl Genomes',
111 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
112 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
113 'P_ENTREZGENEID':'GeneID',
114 'KEGG_ID':'KEGG',
115 'PATRIC_ID':'PATRIC',
116 'UCSC_ID':'UCSC',
117 'VECTORBASE_ID':'VectorBase',
118 #Category:Organism-specific gene databases
119 'ARACHNOSERVER_ID':'ArachnoServer',
120 'CGD':'CGD',
121 'CONOSERVER_ID':'ConoServer',
122 'CYGD_ID':'CYGD',
123 'DICTYBASE_ID':'dictyBase',
124 'ECHOBASE_ID':'EchoBASE',
125 'ECOGENE_ID':'EcoGene',
126 'EUHCVDB_ID':'euHCVdb',
127 'EUPATHDB_ID':'EuPathDB',
128 'FLYBASE_ID':'FlyBase',
129 'GENECARDS_ID':'GeneCards',
130 'GENEFARM_ID':'GeneFarm',
131 'GENOLIST_ID':'GenoList',
132 'H_INVDB_ID':'H-InvDB',
133 'HGNC_ID':'HGNC',
134 'HPA_ID':'HPA',
135 'LEGIOLIST_ID':'LegioList',
136 'LEPROMA_ID':'Leproma',
137 'MAIZEGDB_ID':'MaizeGDB',
138 'MIM_ID':'MIM',
139 'MGI_ID':'MGI',
140 'NEXTPROT_ID':'neXtProt',
141 'ORPHANET_ID':'Orphanet',
142 'PHARMGKB_ID':'PharmGKB',
143 'POMBASE_ID':'PomBase',
144 'PSEUDOCAP_ID':'PseudoCAP',
145 'RGD_ID':'RGD',
146 'SGD_ID':'SGD',
147 'TAIR_ID':'TAIR',
148 'TUBERCULIST_ID':'TubercuList',
149 'WORMBASE_ID':'WormBase',
150 'WORMBASE_TRS_ID':'WormBase Transcript',
151 'WORMBASE_PRO_ID':'WormBase Protein',
152 'XENBASE_ID':'Xenbase',
153 'ZFIN_ID':'ZFIN',
154 #Category:Phylogenomic databases
155 'EGGNOG_ID':'eggNOG',
156 'GENETREE_ID':'GeneTree',
157 'HOGENOM_ID':'HOGENOM',
158 'HOVERGEN_ID':'HOVERGEN',
159 'KO_ID':'KO',
160 'OMA_ID':'OMA',
161 'ORTHODB_ID':'OrthoDB',
162 'PROTCLUSTDB_ID':'ProtClustDB',
163 'TREEFAM_ID':'TreeFam',
164 #Category:Enzyme and pathway databases
165 'BIOCYC_ID':'BioCyc',
166 'REACTOME_ID':'Reactome',
167 'UNIPATHWAY_ID':'UniPathWay',
168 #Category:Gene expression databases
169 'CLEANEX_ID':'CleanEx',
170 #Category:Other
171 'CHITARS_ID':'ChiTaRS',
172 'GENOMERNAI_ID':'GenomeRNAi',
173 'GENEWIKI_ID':'GeneWiki',
174 'NEXTBIO_ID':'NextBio'
175 }
176
177 """
178 TODO:
179 Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file
180 - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession.
181
182 # rather than an array of input accessions, we need to put them in a dict() by type
183 def addAccession(id_dict,id_type,id):
184 if id_type not in id_dict:
185 id_dict[id_type] = []-++
186 id_dict[id_type] = [].append(id)
187
188 # returns(accession, id_type)
189 def getAccession(header, matchType):
190 # (format regex pattern, FROM_ID)
191 # TODO try to determine which type of accession ID we have by matching by regular expressions
192 # each regex match should have a groups[0] that given the accession
193 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
194 fmts = [
195 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
196 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
197 ('>/UniRef\d+_(\w+).*','ACC+ID'),
198 ('>/(UPI\d+).*','ACC+ID'),
199 ('NP_\d+\.\d+','ACC+ID'),
200 ('ENSP\d+','ENSEMBL_PRO_ID'),
201 ('ENST\d+','ENSEMBL_TRS_ID'),
202 ]
203 for pat,cat in fmts:
204 m = re.match(pat,header)
205 if m:
206 matchType.append(cat)
207 return (m.groups()[0],cat)
208 matchType.append('ACC+ID')
209 return (header,'ACC+ID')
210
211 # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
212 idMaps = dict() # {to_id,idMap}
213 for to_id in options.to_id:
214 idMap = dict()
215 idMaps[to_id] = idMap
216 for (from_id,ids) id_dict.items():
217 # limit the requests to 500 at a time
218 idx = range(0,len(ids),500)
219 idx.append(len(ids))
220 for i in range(len(idx)-1):
221 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
222 """
223
224 """
225 Some examples of fasta ID lines From:
226 https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
227 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
228 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
229 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
230 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
231 >OExyz (OExyz) xxx xxx xxx
232 >hflu_lsi_xxxx xxx xxx xxx
233 >C_tr_Lx_x [xxx - xxx] | xxx xxx
234 >M. tub.xxx|Rvxxx| xxx xxx
235 // Drosophile DB.
236 // We need to find two elements:
237 // - the accession String (retrieved as the trimmed version of everything
238 // up to (and NOT including) " pep:"
239 // - the description (everything (trimmed) starting from (and including) the " pep:".
240 >CGxxxxxx pep:xxxxxx
241 >xxxx xxx SGDID:xxxx xxx
242 >generic_some_tag|proten_accession|a description for this protein
243 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as
244 // present in the Expasy FTP FASTA file.
245 // Is formatted something like this:
246 >XXX_YYYY (acc) rest
247 >sp|accession|ID descr rest (including taxonomy, if available
248 >tr|accession|ID descr rest (including taxonomy, if available)
249 >nxp|NX_P02768-1|ALB|Serum albumin|Iso
250 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
251 // present in the Expasy FTP FASTA file.
252 // Is formatted something like this:
253 >accession|ID descr rest (including taxonomy, if available)
254 // Flybase FASTA format.
255 >FBxxx type=xxx
256 // A header translating a genome sequence into a protein sequences.
257 // We need to find two elements, separated by a space:
258 // - the accession string (retrieved as the first part of a space delimited String).
259 // - the nucleic acid start and stop site (between brackets, separated by a '-').
260 >dm345_3L-sense [234353534-234353938]
261 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
262 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF
263 // The Arabidopsis thaliana database; TAIR format
264 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
265 ...
266 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all.
267 // Testing for this is somewhat more complicated.
268 // Often used simple header; looks like:
269 // >NP0465 (NP0465) A description for this protein.
270 // We need to find two elements:
271 // - the accession String (easily retrieved as the next String until a space is encountered).
272 // - the description
273 >NP0465 (NP0465) A description for this protein.
274 GenBank gb|accession|locus
275 EMBL Data Library emb|accession|locus
276 DDBJ, DNA Database of Japan dbj|accession|locus
277 NBRF PIR pir||entry
278 Protein Research Foundation prf||name
279 SWISS-PROT sp|accession|entry name
280 Brookhaven Protein Data Bank pdb|entry|chain
281 Patents pat|country|number
282 GenInfo Backbone Id bbs|number
283 General database identifier gnl|database|identifier
284 NCBI Reference Sequence ref|accession|locus
285 Local Sequence identifier lcl|identifier
286 """
287
288
289 def getFastaAccession(header):
290 # TODO parse the ID and return the accession
291 # (format regex pattern, FROM_ID)
292 # TODO try to determine which type of accession ID we have by matching by reg|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})ular expressions
293 # each regex match should have a groups[0] that given the accession
294 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
295 fmts = [
296 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
297 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
298 ('NP_\d+\.\d+','ACC+ID'),
299
300 ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
301 ('(...*)_....*','ID'),
302 ('>/(UPI\d+).*','UPARC'),
303 ('>/UniRef50_(\w+).*','NF50'),
304 ('UniRef90_(\w+).*','NF90'),
305 ('UniRef100_(\w+).*','NF100'),
306 ('([A-C][A-Z]?\d{6})|(DM-Z)\d{5})','EMBL_ID'),
307 ('ENSP\d+','ENSEMBL_PRO_ID'),
308 ('ENST\d+','ENSEMBL_TRS_ID')
309 ]
310 for pat,cat in fmts:
311 m = re.match(pat,header)
312 if m:
313 return m.groups()[0]
314 return header
315
316 # rather than an array of input accessions, we need to put them in a dict() by type
317 def addAccession(id_dict,id_type,id):
318 print id_type
319 for idtype in id_type:
320 if idtype not in id_dict:
321 id_dict[idtype] = []
322 id_dict[idtype].append(id)
323
324 # returns(accession, id_type)
325 def getAccession(header, matchType):
326 # (format regex pattern, FROM_ID)
327 # TODO try to determine which type of accession ID we have by matching by regular expressions
328 # each regex match should have a groups[0] that given the accession
329 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
330 fmts = [
331 ('([OPQ][0-9][A-Z0-9]{3}[0-9])','ACC'),
332 ('([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
333 ('(THISDOESNTWORK...*)_....*','ID'),
334 ('UPI(\d+).*','UPARC'),
335 ('(UniRef50_\w+.*)','NF50'),
336 ('UniRef90_(\w+).*','NF90'),
337 ('(UniRef100_\w+.*)','NF100'),
338 ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
339 ('([A-Z]*\d*\.\d$)','EMBL'),
340 ('([IBC]\d{5})','PIR'),
341 ('(Hs\.\d*)','UNIGENE_ID'),
342 ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
343 ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
344 ('(\d[A-Z0-9]{3})','PDB_ID'),
345 ('(DP\d{5})','DISPROT_ID'),
346
347 ('(\d*$)','P_ENTREZGENEID'),
348 ('(\d*$)','P_GI'),
349 ('(\d*$)','BIOGRID_ID'),
350 ('(\d*$)','GUIDETOPHARMACOLOGY_ID'),
351 ('(\d*$)','ALLERGOME_ID'),
352 ('(\d*$)','PEROXIBASE_ID'),
353 ('(\d*$)','REBASE_ID'),
354 ('(\d*$)','DNASU_ID'),
355 ('(\d*$)','DMDM_ID'),
356
357 ('(DIP-\d*N$)','DIP_ID'),
358 ('(MINT-\d*)','MINT_ID'),
359 ('(9606\.ENSP\d*)','STRING_ID'),
360 ('(CHEMBL\d*)','CHEMBL_ID'),
361 ('(DB\d*)','DRUGBANK_ID'),
362 ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
363 ('NOTHING POPS UP','MYCOCLAP_ID'),
364 ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
365 ('NOTHING POPS UP','WORLD_2DPAGE_ID'),
366 ('(ENSG\d*)','ENSEMBAF85406.1BL_ID'),
367 ('(ENSP\d+)','ENSEMBL_PRO_ID'),
368 ('(ENST\d+)','ENSEMBL_TRS_ID'),
369 (' ','ENSEMBLGENOME_ID'),
370 (' ','ENSEMBLGENOME_PRO_ID'),
371 (' ','ENSEMBLGENOME_TRS_ID'),
372 ('(hsa:\d*)','KEGG_ID'),
373 ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
374 ('(.*[CKN]OG\d*)','EGGNOG_ID')
375 ]
376 n = re.match('\d*$', header)
377 if n: # For ambiguous number only ID types
378 numIDtypes = [
379 'P_ENTREZGENEID',
380 'P_GI',
381 'DMDM_ID',
382 'BIOGRID_ID',
383 'GUIDETOPHARMACOLOGY_ID',
384 'ALLERGOME_ID',
385 'PEROXIBASE_ID',
386 'REBASE_ID',
387 'DNASU_ID']
388 ambiguous = []
389 for numIDs in numIDtypes:
390 nm = getIdMapping([header],numIDs,'ACC',numReturn=True)
391 if nm != None:
392 ambiguous.append(nm)
393 matchType.append(ambiguous)
394 return (header, ambiguous)
395 for pat,cat in fmts:
396 m = re.match(pat,header)
397 if m:
398 matchType.append([cat])
399 return (m.groups()[0],[cat])
400 matchType.append(['ACC+ID'])
401 return (header,['ACC+ID'])
402
403
404 def read_tabular(filepath,col):
405 accessions = []
406 with open(filepath) as fp:
407 for i,line in enumerate(fp):
408 if line.strip() == '' or line.startswith('#'):
409 continue
410 fields = line.rstrip('\n').split('\t')
411 accession = fields[col]
412 accessions.append(accession)
413 return accessions
414
415 def get_fasta_entries(fp):
416 name, seq = None, []
417 for line in fp:
418 line = line.rstrip()
419 if line.startswith(">"):
420 if name: yield (name, ''.join(seq))
421 name, seq = line, []
422 else:
423 seq.append(line)
424 if name: yield (name, ''.join(seq))
425
426 def read_fasta(filepath):
427 accessions = []
428 with open(filepath) as fp:
429 for id, peptide in get_fasta_entries(fp):
430 accessions.append(getFastaAccession(id))
431 return accessions
432
433 def read_mzid(fp):
434 accessions = []
435 for event, elem in ET.iterparse(fp):
436 if event == 'end':
437 if re.search('DBSequence',elem.tag):
438 accessions.append(elem.attrib['accession'])
439 return accessions
440
441 def read_pepxml(fp):
442 accessions = []
443 for event, elem in ET.iterparse(fp):
444 if event == 'end':
445 if re.search('search_hit',elem.tag):
446 accessions.append(elem.get('protein'))
447 return accessions
448
449 def getUniprotSequence(uniprot_id):
450 url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
451 print url
452 fp = urllib2.urlopen(url)
453 for id, seq in get_fasta_entries(fp):
454 if seq:
455 return seq
456 return ''
457
458 def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,numReturn=None):
459 # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
460 if not accessions:
461 return
462 url = 'http://www.uniprot.org/mapping/'
463 params = {
464 'from': from_id,
465 'to': to_id,
466 'format':'tab',
467 'query': '\n'.join(accessions)
468 }
469 data = urllib.urlencode(params)
470 request = urllib2.Request(url, data)
471 contact = "" # Please set your email address here to help us debug in case of problems.
472 request.add_header('User-Agent', 'Python %s' % contact)
473 response = None
474 for i in range(3):
475 try:
476 response = urllib2.urlopen(request)
477 break
478 except Exception, e:
479 warn_err("%s",exit_code=None)
480 if response:
481 response.next()
482 for line in response:
483 # print >> sys.stderr, "idMap: %s" % line
484 if fh:
485 fh.write(line)
486 if numReturn:
487 id1,id2 = line.strip().split('\t')
488 return from_id
489 if idMap != None:
490 id1,id2 = line.strip().split('\t')
491 # print >> sys.stderr, "idMap: %s:%s" % (id1,id2)
492 if idMap.get(id1,'nothing') == 'nothing':
493 idMap[id1] = [id2]
494 else:
495 id3 = idMap[id1]
496 if id2 not in id3:
497 id3.append(id2)
498 idMap[id1] = id3
499 # python uniprot_id_mapping.py -T 'NF100' -test-data/old-inputs/var.tsv
500
501 #Parse Command Line
502 parser = optparse.OptionParser()
503 # input files
504 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
505 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
506 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
507 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
508 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
509 # Decoy pattern
510 parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' )
511 # filter patterns
512 parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
513 parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
514 # Unipept Flags
515 parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
516 parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
517 # output files
518 parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
519 # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
520 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
521 (options, args) = parser.parse_args()
522 if options.version:
523 print >> sys.stdout,"%s" % version
524 sys.exit(0)
525
526 accessions = []
527 ## Get accessions
528 if options.mzid:
529 accessions += read_mzid(options.mzid)
530 if options.pepxml:
531 accessions += read_pepxml(options.pepxml)
532 if options.tabular:
533 accessions += read_tabular(options.tabular,options.column)
534 if options.fasta:
535 accessions += read_fasta(options.fasta)
536 if args and len(args) > 0:
537 for i,accession in enumerate(args):
538 accessions.append(accession)
539 # filter accessions
540 if options.decoy:
541 filtered_accs = [re.sub(options.decoy,'',x) for x in accessions]
542 accessions = filtered_accs
543 if options.include_pat:
544 filtered_accs = []
545 for acc in accessions:
546 if re.match(options.include_pat,acc):
547 filtered_accs.append(acc)
548 accessions = filtered_accs
549 if options.exclude_pat:
550 filtered_accs = []
551 for acc in accessions:
552 if not re.match(options.exclude_pat,acc):
553 filtered_accs.append(acc)
554 accessions = filtered_accs
555 if len(accessions) < 1:
556 warn_err("No accessions input!",exit_code=1)
557 if options.output != None:
558 try:
559 outputPath = os.path.abspath(options.output)
560 outputFile = open(outputPath, 'w')
561 except Exception, e:
562 print >> sys.stderr, "failed: %s" % e
563 exit(3)
564 else:
565 outputFile = sys.stdout
566
567
568 # Sorts accessions to inferred ID types i+n a dictionary
569 id_dict = {}
570 matchType = []
571 for header in accessions:
572 id , id_types = getAccession(header,matchType)
573 addAccession(id_dict,id_types,id)
574 #print id_dict
575 idMaps = dict() # {to_id,idMap}
576 #print matchType
577 #print id_dict.items()
578 for to_id in options.to_id:
579 idMap = dict()
580 idMaps[to_id] = idMap
581 for (from_id,ids) in id_dict.items():
582 # limit the requests to 500 at a time
583 idx = range(0,len(ids),500)
584 idx.append(len(ids))
585 for i in range(len(idx)-1):
586 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
587 #print idMap
588 #Write output
589 #Output Table Header
590 outputFile.write("\n#%-16s %-16s" % (options.from_id,'Matched Type'))
591 for t in options.to_id:
592 outputFile.write("%-18s" % t)
593 outputFile.write("\n" + ("=" * 34) + ("=" * 18 * len(options.to_id)) + "\n")
594
595 #Output a row with values for each input ID
596 for i, acc in enumerate(accessions):
597 #multi = 1
598 #for types in options.to_id:
599 # IDs = idMaps[types].get(acc,['none'])
600 # if len(IDs) > multi:
601 # multi = len(IDs)
602 outputFile.write("%-17s %-16s" % (acc, matchType[i][0]))
603 for x in range(len(matchType[i])):
604 if x != 0:
605 outputFile.write(("''" + " " * 16 + "%-16s") % matchType[i][x])
606 for types in options.to_id:
607 IDs = idMaps[types].get(acc,['none'])
608 print types
609 print IDs
610 if len(IDs) > x:
611 outputFile.write("%-18s" % IDs[x])
612 else:
613 if len(matchType[i]) > 1:
614 outputFile.write("%-18s" % IDs[0])
615 else:
616 outputFile.write("'' ")
617 outputFile.write("\n")
618
619 # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
620 stop = timeit.default_timer()
621 outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))
622
623 if __name__ == "__main__" : __main__()
624