comparison uniprot_ID_mapper/uniprot_id_mapping_071615.py @ 3:6651ac4651f0 draft default tip

Uploaded
author rsajulga
date Thu, 16 Jul 2015 16:41:35 -0400
parents
children
comparison
equal deleted inserted replaced
2:d0311668d442 3:6651ac4651f0
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2015, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14 import json
15 import logging
16 import optparse
17 from optparse import OptionParser
18 import os
19 import sys
20 import re
21 import urllib
22 import urllib2
23 import timeit
24 start = timeit.default_timer()
25 try:
26 import xml.etree.cElementTree as ET
27 except ImportError:
28 import xml.etree.ElementTree as ET
29
30 """print
31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples
32 http://www.uniprot.org/help/uploadlists
33
34 From input files or stdin get accessions/IDs to map.
35 An input option can specify what id_type the input accessions/IDs are,
36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type
37
38 input_dict = dict()
39 for id_string in input_ids:
40 (id,id_type) = getAccession(id_string)
41 if not id_type in input_dict:
42 input_dict[id_type] = []
43 input_dict[id_type].append(id)
44
45
46 # We need to retrieve by from_id_type and to_id_type
47 for id_type in input_dict:
48 for
49
50
51 tabular output:
52 #input_id to_id to_id
53
54
55 """
56 def warn_err(msg,exit_code=1):
57 sys.stderr.write(msg)
58 if exit_code:
59 sys.exit(exit_code)
60
61 def __main__():
62 version = '1.0'
63 idDict = {
64 #Category:UniProt
65 'ACC+ID':'UniProtKB AC/ID',
66 'ACC':'UniProtKB AC',
67 'ID':'UniProtKB ID',
68 'UPARC':'UniParc',
69 'NF50':'UniRef50',
70 'NF90':'UniRef90',
71 'NF100':'UniRef100',
72 'GENENAME':'Gene name',
73 #Category:Other sequence databases
74 'EMBL_ID':'EMBL/GenBank/DDBJ',
75 'EMBL':'EMBL/GenBank/DDBJ CDS',
76 'PIR':'PIR',
77 'UNIGENE_ID':'UniGene',
78 'P_ENTREZGENEID':'Entrez Gene (GeneID)',
79 'P_GI':'GI number*',
80 'P_REFSEQ_AC':'RefSeq Protein',
81 'REFSEQ_NT_ID':'RefSeq Nucleotide',
82 #Category:3D structure databases
83 'PDB_ID':'PDB',
84 'DISPROT_ID':'DisProt',
85 #Category:Protein-protein interaction databases
86 'BIOGRID_ID':'BioGrid',
87 'DIP_ID':'DIP',
88 'MINT_ID':'MINT',
89 'STRING_ID':'STRING',
90 #Category:Chemistry
91 'CHEMBL_ID':'ChEMBL',
92 'DRUGBANK_ID':'DrugBank',
93 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
94 #Category:Protein family/group databases
95 'ALLERGOME_ID':'Allergome',
96 'MEROPS_ID':'MEROPS',
97 'MYCOCLAP_ID':'mycoCLAP',
98 'PEROXIBASE_ID':'PeroxiBase',
99 'REBASE_ID':'REBASE',
100 'TCDB_ID':'TCDB',
101 #Category:Polymorphism databases
102 'DMDM_ID':'DMDM',
103 #Category:2D gel databases
104 'WORLD_2DPAGE_ID':'World-2DPAGE',
105 #Category:Protocols and materials databases
106 'DNASU_ID':'DNASU',
107 #Category:Genome annotation databases
108 'ENSEMBL_ID':'Ensembl',
109 'ENSEMBL_PRO_ID':'Ensembl Protein',
110 'ENSEMBL_TRS_ID':'Ensembl Transcript',
111 'ENSEMBLGENOME_ID':'Ensembl Genomes',
112 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
113 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
114 'P_ENTREZGENEID':'GeneID',
115 'KEGG_ID':'KEGG',
116 'PATRIC_ID':'PATRIC',
117 'UCSC_ID':'UCSC',
118 'VECTORBASE_ID':'VectorBase',
119 #Category:Organism-specific gene databases
120 'ARACHNOSERVER_ID':'ArachnoServer',
121 'CGD':'CGD',
122 'CONOSERVER_ID':'ConoServer',
123 'CYGD_ID':'CYGD',
124 'DICTYBASE_ID':'dictyBase',
125 'ECHOBASE_ID':'EchoBASE',
126 'ECOGENE_ID':'EcoGene',
127 'EUHCVDB_ID':'euHCVdb',
128 'EUPATHDB_ID':'EuPathDB',
129 'FLYBASE_ID':'FlyBase',
130 'GENECARDS_ID':'GeneCards',
131 'GENEFARM_ID':'GeneFarm',
132 'GENOLIST_ID':'GenoList',
133 'H_INVDB_ID':'H-InvDB',
134 'HGNC_ID':'HGNC',
135 'HPA_ID':'HPA',
136 'LEGIOLIST_ID':'LegioList',
137 'LEPROMA_ID':'Leproma',
138 'MAIZEGDB_ID':'MaizeGDB',
139 'MIM_ID':'MIM',
140 'MGI_ID':'MGI',
141 'NEXTPROT_ID':'neXtProt',
142 'ORPHANET_ID':'Orphanet',
143 'PHARMGKB_ID':'PharmGKB',
144 'POMBASE_ID':'PomBase',
145 'PSEUDOCAP_ID':'PseudoCAP',
146 'RGD_ID':'RGD',
147 'SGD_ID':'SGD',
148 'TAIR_ID':'TAIR',
149 'TUBERCULIST_ID':'TubercuList',
150 'WORMBASE_ID':'WormBase',
151 'WORMBASE_TRS_ID':'WormBase Transcript',
152 'WORMBASE_PRO_ID':'WormBase Protein',
153 'XENBASE_ID':'Xenbase',
154 'ZFIN_ID':'ZFIN',
155 #Category:Phylogenomic databases
156 'EGGNOG_ID':'eggNOG',
157 'GENETREE_ID':'GeneTree',
158 'HOGENOM_ID':'HOGENOM',
159 'HOVERGEN_ID':'HOVERGEN',
160 'KO_ID':'KO',
161 'OMA_ID':'OMA',
162 'ORTHODB_ID':'OrthoDB',
163 'PROTCLUSTDB_ID':'ProtClustDB',
164 'TREEFAM_ID':'TreeFam',
165 #Category:Enzyme and pathway databases
166 'BIOCYC_ID':'BioCyc',
167 'REACTOME_ID':'Reactome',
168 'UNIPATHWAY_ID':'UniPathWay',
169 #Category:Gene expression databases
170 'CLEANEX_ID':'CleanEx',
171 #Category:Other
172 'CHITARS_ID':'ChiTaRS',
173 'GENOMERNAI_ID':'GenomeRNAi',
174 'GENEWIKI_ID':'GeneWiki',
175 'NEXTBIO_ID':'NextBio'
176 }
177
178 """
179 TODO:
180 Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file
181 - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession.
182
183 # rather than an array of input accessions, we need to put them in a dict() by type
184 def addAccession(id_dict,id_type,id):
185 if id_type not in id_dict:
186 id_dict[id_type] = []-++
187 id_dict[id_type] = [].append(id)
188
189 # returns(accession, id_type)
190 def getAccession(header, matchType):
191 # (format regex pattern, FROM_ID)
192 # TODO try to determine which type of accession ID we have by matching by regular expressions
193 # each regex match should have a groups[0] that given the accession
194 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
195 fmts = [
196 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
197 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
198 ('>/UniRef\d+_(\w+).*','ACC+ID'),
199 ('>/(UPI\d+).*','ACC+ID'),
200 ('NP_\d+\.\d+','ACC+ID'),
201 ('ENSP\d+','ENSEMBL_PRO_ID'),
202 ('ENST\d+','ENSEMBL_TRS_ID'),
203 ]
204 for pat,cat in fmts:
205 m = re.match(pat,header)
206 if m:
207 matchType.append(cat)
208 return (m.groups()[0],cat)
209 matchType.append('ACC+ID')
210 return (header,'ACC+ID')
211
212 # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
213 idMaps = dict() # {to_id,idMap}
214 for to_id in options.to_id:
215 idMap = dict()
216 idMaps[to_id] = idMap
217 for (from_id,ids) id_dict.items():
218 # limit the requests to 500 at a time
219 idx = range(0,len(ids),500)
220 idx.append(len(ids))
221 for i in range(len(idx)-1):
222 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
223 """
224
225 """
226 Some examples of fasta ID lines From:
227 https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
228 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
229 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
230 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
231 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
232 >OExyz (OExyz) xxx xxx xxx
233 >hflu_lsi_xxxx xxx xxx xxx
234 >C_tr_Lx_x [xxx - xxx] | xxx xxx
235 >M. tub.xxx|Rvxxx| xxx xxx
236 // Drosophile DB.
237 // We need to find two elements:
238 // - the accession String (retrieved as the trimmed version of everything
239 // up to (and NOT including) " pep:"
240 // - the description (everything (trimmed) starting from (and including) the " pep:".
241 >CGxxxxxx pep:xxxxxx
242 >xxxx xxx SGDID:xxxx xxx
243 >generic_some_tag|proten_accession|a description for this protein
244 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as
245 // present in the Expasy FTP FASTA file.
246 // Is formatted something like this:
247 >XXX_YYYY (acc) rest
248 >sp|accession|ID descr rest (including taxonomy, if available
249 >tr|accession|ID descr rest (including taxonomy, if available)
250 >nxp|NX_P02768-1|ALB|Serum albumin|Iso
251 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
252 // present in the Expasy FTP FASTA file.
253 // Is formatted something like this:
254 >accession|ID descr rest (including taxonomy, if available)
255 // Flybase FASTA format.
256 >FBxxx type=xxx
257 // A header translating a genome sequence into a protein sequences.
258 // We need to find two elements, separated by a space:
259 // - the accession string (retrieved as the first part of a space delimited String).
260 // - the nucleic acid start and stop site (between brackets, separated by a '-').
261 >dm345_3L-sense [234353534-234353938]
262 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
263 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF
264 // The Arabidopsis thaliana database; TAIR format
265 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
266 ...
267 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all.
268 // Testing for this is somewhat more complicated.
269 // Often used simple header; looks like:
270 // >NP0465 (NP0465) A description for this protein.
271 // We need to find two elements:
272 // - the accession String (easily retrieved as the next String until a space is encountered).
273 // - the description
274 >NP0465 (NP0465) A description for this protein.
275 GenBank gb|accession|locus
276 EMBL Data Library emb|accession|locus
277 DDBJ, DNA Database of Japan dbj|accession|locus
278 NBRF PIR pir||entry
279 Protein Research Foundation prf||name
280 SWISS-PROT sp|accession|entry name
281 Brookhaven Protein Data Bank pdb|entry|chain
282 Patents pat|country|number
283 GenInfo Backbone Id bbs|number
284 General database identifier gnl|database|identifier
285 NCBI Reference Sequence ref|accession|locus
286 Local Sequence identifier lcl|identifier
287 """
288
289
290 def getFastaAccession(header):
291 # TODO parse the ID and return the accession
292 # (format regex pattern, FROM_ID)
293 # TODO try to determine which type of accession ID we have by matching by reg|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})ular expressions
294 # each regex match should have a groups[0] that given the accession
295 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
296 fmts = [
297 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
298 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
299 ('NP_\d+\.\d+','ACC+ID'),
300
301 ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
302 ('(...*)_....*','ID'),
303 ('>/(UPI\d+).*','UPARC'),
304 ('>/UniRef50_(\w+).*','NF50'),
305 ('UniRef90_(\w+).*','NF90'),
306 ('UniRef100_(\w+).*','NF100'),
307 ('([A-C][A-Z]?\d{6})|(DM-Z)\d{5})','EMBL_ID'),
308 ('ENSP\d+','ENSEMBL_PRO_ID'),
309 ('ENST\d+','ENSEMBL_TRS_ID')
310 ]
311 for pat,cat in fmts:
312 m = re.match(pat,header)
313 if m:
314 return m.groups()[0]
315 return header
316
317 # rather than an array of input accessions, we need to put them in a dict() by type
318 def addAccession(id_dict,id_type,id):
319 for idtype in id_type:
320 if idtype not in id_dict:
321 id_dict[idtype] = []
322 id_dict[idtype].append(id)
323
324 # returns(accession, id_type)
325 def getAccession(header):
326 # (format regex pattern, FROM_ID)
327 # TODO try to determine which type of accession ID we have by matching by regular expressions
328 # each regex match should have a groups[0] that given the accession
329 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
330 fmts = [
331 ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
332 ('.*_[A-Z]*','ID'),
333 ('UPI(\d+).*','UPARC'),
334 ('(UniRef50_\w+.*)','NF50'),
335 ('UniRef90_(\w+).*','NF90'),
336 ('(UniRef100_\w+.*)','NF100'),
337 ('a[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}','GENENAME'),
338
339 ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
340 ('K\d*','KO_ID'),
341
342 ('([A-Z]*\d*\.\d$)','EMBL'),
343 ('([IBC]\d{5})','PIR'),
344 ('(Hs\.\d*)','UNIGENE_ID'),
345 ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
346 ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
347 ('(\d[A-Z0-9]{3})','PDB_ID'),
348 ('(DP\d{5})','DISPROT_ID'),
349 ('(DIP-\d*N$)','DIP_ID'),
350 ('(MINT-\d*)','MINT_ID'),
351 ('(9606\.ENSP\d*)','STRING_ID'),
352 ('(CHEMBL\d*)','CHEMBL_ID'),
353 ('(DB\d*)','DRUGBANK_ID'),
354 ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
355 ('NOTHING POPS UP','MYCOCLAP_ID'),
356 ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
357 ('NOTHING POPS UP','WORLD_2DPAGE_ID'),
358 ('(ENSG\d*)','ENSEMBAF85406.1BL_ID'),
359 ('(ENSP\d+)','ENSEMBL_PRO_ID'),
360 ('(ENST\d+)','ENSEMBL_TRS_ID'),
361 (' ','ENSEMBLGENOME_ID'),
362 (' ','ENSEMBLGENOME_PRO_ID'),
363 (' ','ENSEMBLGENOME_TRS_ID'),
364 ('(hsa:\d*)','KEGG_ID'),
365 ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
366 ('VEL:J:LDKJFS','VECTORBASE_ID'),
367 ('AS\d*','ARACHNOSERVER_ID'),
368
369
370
371 ('(.*[CKN]OG\d*)','EGGNOG_ID'),
372 ('(ENSGT\d*)','GENETREE_ID'),
373 ('HOG\d*','HOGENOM_ID'),
374 ('HBG\d*','HOVERGEN_ID'),
375
376 ('[A-Z]{7}','OMA_ID'),
377
378 ('EOG\d*','ORTHODB_ID'),
379 ('asdfsadfasdf','PROTCLUSTDB_ID'),
380 ('TF\d*','TREEFAM_ID'),
381 ('REACT_\d*','REACTOME_ID'),
382 ('asdfasl;DF:LJk','UNIPATHWAY_ID'),
383 ('HS_\d*','CLEANEX_ID'),
384 ('SAMEAS GENE NAME','CHITARS_ID'),
385 ('GENENAME_\(gene\)','GENEWIKI_ID'),
386 ]
387 if re.match('\d*$', header): # For ambiguous number only ID types
388 numIDtypes = [
389 'P_ENTREZGENEID',
390 'P_GI',
391 'DMDM_ID',
392 'BIOGRID_ID',
393 'GUIDETOPHARMACOLOGY_ID',
394 'ALLERGOME_ID',
395 'PEROXIBASE_ID',
396 'REBASE_ID',
397 'DNASU_ID',
398 'GENOMERNAI_ID',
399 'NEXTBIO_ID']
400 ambiguous = []
401 for numIDs in numIDtypes:
402 nm = getIdMapping([header],numIDs,'ACC',ambiguity=True)
403 if nm != None:
404 ambiguous.append(nm)
405 if ambiguous == []:
406 ambiguous.append('No Match')
407 return (header, ambiguous)
408 for pat,cat in fmts:
409 m = re.match(pat,header)
410 if m:
411 return (m.groups()[0],[cat])
412 return (header,['ACC+ID'])
413
414
415 def read_tabular(filepath,col):
416 accessions = []
417 with open(filepath) as fp:
418 for i,line in enumerate(fp):
419 if line.strip() == '' or line.startswith('#'):
420 continue
421 fields = line.rstrip('\n').split('\t')
422 accession = fields[col]
423 accessions.append(accession)
424 return accessions
425
426 def get_fasta_entries(fp):
427 name, seq = None, []
428 for line in fp:
429 line = line.rstrip()
430 if line.startswith(">"):
431 if name: yield (name, ''.join(seq))
432 name, seq = line, []
433 else:
434 seq.append(line)
435 if name: yield (name, ''.join(seq))
436
437 def read_fasta(filepath):
438 accessions = []
439 with open(filepath) as fp:
440 for id, peptide in get_fasta_entries(fp):
441 accessions.append(getFastaAccession(id))
442 return accessions
443
444 def read_mzid(fp):
445 accessions = []
446 for event, elem in ET.iterparse(fp):
447 if event == 'end':
448 if re.search('DBSequence',elem.tag):
449 accessions.append(elem.attrib['accession'])
450 return accessions
451
452 def read_pepxml(fp):
453 accessions = []
454 for event, elem in ET.iterparse(fp):
455 if event == 'end':
456 if re.search('search_hit',elem.tag):
457 accessions.append(elem.get('protein'))
458 return accessions
459
460 def getUniprotSequence(uniprot_id):
461 url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
462 print url
463 fp = urllib2.urlopen(url)
464 for id, seq in get_fasta_entries(fp):
465 if seq:
466 return seq
467 return ''
468
469
470
471
472 def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,ambiguity=None,crossReference=None,idMaps=None):
473 # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
474 if not accessions:
475 return
476 url = 'http://www.uniprot.org/mapping/'
477 # Cross Referencing: Mapping to non-UniprotKB ('ACC') IDs to other non-UniprotKB ('ACC') IDs
478 if to_id != 'ACC' and from_id != 'ACC':
479 crMap = {}
480 getIdMapping(accessions,from_id,'ACC',idMap=crMap)
481 crMap2 = crMap.copy()
482 for x in crMap.keys():
483 for y in crMap[x].keys():
484 for z in crMap[x][y]:
485 crMap2[x][y] = []
486 getIdMapping([z],'ACC',to_id,idMap=crMap2,crossReference=[x,from_id])
487 idMap.update(crMap2)
488 return
489 params = {
490 'from': from_id,
491 'to': to_id,
492 'format':'tab',
493 'query': '\n'.join(accessions)
494 }
495 data = urllib.urlencode(params)
496 request = urllib2.Request(url, data)
497 contact = "" # Please set your email address here to help us debug in case of problems.
498 request.add_header('User-Agent', 'Python %s' % contact)
499 response = None
500 for i in range(3):
501 try:
502 response = urllib2.urlopen(request)
503 break
504 except Exception, e:
505 warn_err("%s",exit_code=None)
506 if response:
507 response.next()
508 print params
509
510 for line in response:
511 # print >> sys.stderr, "idMap: %s" % line
512 if fh:
513 fh.write(line)
514 if ambiguity: # if there was a response, then an ambiguous match can be made
515 return from_id
516 if idMap != None:
517 id1,id2 = line.strip().split('\t')
518 print id2
519 if crossReference != None:
520 id1, from_id = crossReference
521 # print >> sys.stderr, "idMap: %s:%s" % (id1,id2)
522 try:
523 idMap[id1][from_id].append(id2)
524 except:
525 idMap[id1] = {from_id:[id2]}
526 if ambiguity == None:
527 for line in response:
528 for acc in accessions:
529 idMap[acc] = {from_id:['N/A']}
530 return
531
532
533 #Parse Command Line
534 parser = optparse.OptionParser()
535 # input files
536 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
537 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
538 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
539 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
540 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
541 # Decoy pattern
542 parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' )
543 # filter patterns
544 parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
545 parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
546 # Unipept Flags
547 parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
548 parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
549 # output files
550 parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
551 # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
552 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
553 (options, args) = parser.parse_args()
554 if options.version:
555 print >> sys.stdout,"%s" % version
556 sys.exit(0)
557
558 accessions = []
559 ## Get accessions
560 if options.mzid:
561 accessions += read_mzid(options.mzid)
562 if options.pepxml:
563 accessions += read_pepxml(options.pepxml)
564 if options.tabular:
565 accessions += read_tabular(options.tabular,options.column)
566 if options.fasta:
567 accessions += read_fasta(options.fasta)
568 if args and len(args) > 0:
569 for i,accession in enumerate(args):
570 accessions.append(accession)
571 # filter accessions
572 if options.decoy:
573 filtered_accs = [re.sub(options.decoy,'',x) for x in accessions]
574 accessions = filtered_accs
575 if options.include_pat:
576 filtered_accs = []
577 for acc in accessions:
578 if re.match(options.include_pat,acc):
579 filtered_accs.append(acc)
580 accessions = filtered_accs
581 if options.exclude_pat:
582 filtered_accs = []
583 for acc in accessions:
584 if not re.match(options.exclude_pat,acc):
585 filtered_accs.append(acc)
586 accessions = filtered_accs
587 if len(accessions) < 1:
588 warn_err("No accessions input!",exit_code=1)
589 if options.output != None:
590 try:
591 outputPath = os.path.abspath(options.output)
592 outputFile = open(outputPath, 'w')
593 except Exception, e:
594 print >> sys.stderr, "failed: %s" % e
595 exit(3)
596 else:
597 outputFile = sys.stdout
598
599
600 # Removes duplicates in accessions
601 seen = set()
602 seen_add = seen.add
603 accessions = [x for x in accessions if not (x in seen or seen_add(x))]
604 # Sorts accessions to inferred ID types i+n a dictionary
605 id_dict = {}
606 for header in accessions:
607 id , id_types = getAccession(header)
608 addAccession(id_dict,id_types,id)
609 idMaps = dict() # {to_id,idMap}
610 for to_id in options.to_id:
611 idMap = dict()
612 idMaps[to_id] = idMap
613 for (from_id,ids) in id_dict.items():
614 # limit the requests to 500 at a time
615 idx = range(0,len(ids),500)
616 idx.append(len(ids))
617 for i in range(len(idx)-1):
618 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps)
619 print ids
620 #Write output
621 #Output Table Header
622 outputFile.write("\n#%-17s" % (options.from_id))
623 for t in options.to_id:
624 outputFile.write("%-18s" % t)
625 outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")
626
627 # Create an output-friendly matrix
628 idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))]
629 for a, acc in enumerate(accessions):
630 idLength = 0
631 for i, to_id in enumerate(options.to_id): # [[ids],[ids]]
632 idArrayColumn = []
633 idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3]
634 for from_id in idPairs:
635 ids = idPairs[from_id]
636 for id_ in ids:
637 idArrayColumn.append("%s[%s]" % (id_,from_id))
638 if idLength < len(idArrayColumn):
639 idLength = len(idArrayColumn)
640 idArray[a][i].extend(idArrayColumn)
641 for y in range(idLength):
642 outputFile.write("%-18s" % acc)
643 for x in range(len(options.to_id)):
644 try:
645 outputFile.write("%-18s" % idArray[a][x][y])
646 except:
647 outputFile.write(" " * 18)
648 outputFile.write("\n")
649 # Output Matrix
650
651
652
653 #print idMaps
654 # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
655 stop = timeit.default_timer()
656 outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))
657
658 if __name__ == "__main__" : __main__()
659