comparison uniprot_ID_mapper/uniprot_id_mapping _071415.py @ 3:6651ac4651f0 draft default tip

Uploaded
author rsajulga
date Thu, 16 Jul 2015 16:41:35 -0400
parents
children
comparison
equal deleted inserted replaced
2:d0311668d442 3:6651ac4651f0
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2015, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14 import json
15 import logging
16 import optparse
17 from optparse import OptionParser
18 import os
19 import sys
20 import re
21 import urllib
22 import urllib2
23 import timeit
24 start = timeit.default_timer()
25 try:
26 import xml.etree.cElementTree as ET
27 except ImportError:
28 import xml.etree.ElementTree as ET
29
30 """print
31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples
32 http://www.uniprot.org/help/uploadlists
33
34 From input files or stdin get accessions/IDs to map.
35 An input option can specify what id_type the input accessions/IDs are,
36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type
37
38 input_dict = dict()
39 for id_string in input_ids:
40 (id,id_type) = getAccession(id_string)
41 if not id_type in input_dict:
42 input_dict[id_type] = []
43 input_dict[id_type].append(id)
44
45
46 # We need to retrieve by from_id_type and to_id_type
47 for id_type in input_dict:
48 for
49
50
51 tabular output:
52 #input_id to_id to_id
53
54
55 """
56 def warn_err(msg,exit_code=1):
57 sys.stderr.write(msg)
58 if exit_code:
59 sys.exit(exit_code)
60
61 def __main__():
62 version = '1.0'
63 idDict = {
64 #Category:UniProt
65 'ACC+ID':'UniProtKB AC/ID',
66 'ACC':'UniProtKB AC',
67 'ID':'UniProtKB ID',
68 'UPARC':'UniParc',
69 'NF50':'UniRef50',
70 'NF90':'UniRef90',
71 'NF100':'UniRef100',
72 #Category:Other sequence databases
73 'EMBL_ID':'EMBL/GenBank/DDBJ',
74 'EMBL':'EMBL/GenBank/DDBJ CDS',
75 'PIR':'PIR',
76 'UNIGENE_ID':'UniGene',
77 'P_ENTREZGENEID':'Entrez Gene (GeneID)',
78 'P_GI':'GI number*',
79 'P_REFSEQ_AC':'RefSeq Protein',
80 'REFSEQ_NT_ID':'RefSeq Nucleotide',
81 #Category:3D structure databases
82 'PDB_ID':'PDB',
83 'DISPROT_ID':'DisProt',
84 #Category:Protein-protein interaction databases
85 'BIOGRID_ID':'BioGrid',
86 'DIP_ID':'DIP',
87 'MINT_ID':'MINT',
88 'STRING_ID':'STRING',
89 #Category:Chemistry
90 'CHEMBL_ID':'ChEMBL',
91 'DRUGBANK_ID':'DrugBank',
92 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
93 #Category:Protein family/group databases
94 'ALLERGOME_ID':'Allergome',
95 'MEROPS_ID':'MEROPS',
96 'MYCOCLAP_ID':'mycoCLAP',
97 'PEROXIBASE_ID':'PeroxiBase',
98 'REBASE_ID':'REBASE',
99 'TCDB_ID':'TCDB',
100 #Category:Polymorphism databases
101 'DMDM_ID':'DMDM',
102 #Category:2D gel databases
103 'WORLD_2DPAGE_ID':'World-2DPAGE',
104 #Category:Protocols and materials databases
105 'DNASU_ID':'DNASU',
106 #Category:Genome annotation databases
107 'ENSEMBL_ID':'Ensembl',
108 'ENSEMBL_PRO_ID':'Ensembl Protein',
109 'ENSEMBL_TRS_ID':'Ensembl Transcript',
110 'ENSEMBLGENOME_ID':'Ensembl Genomes',
111 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
112 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
113 'P_ENTREZGENEID':'GeneID',
114 'KEGG_ID':'KEGG',
115 'PATRIC_ID':'PATRIC',
116 'UCSC_ID':'UCSC',
117 'VECTORBASE_ID':'VectorBase',
118 #Category:Organism-specific gene databases
119 'ARACHNOSERVER_ID':'ArachnoServer',
120 'CGD':'CGD',
121 'CONOSERVER_ID':'ConoServer',
122 'CYGD_ID':'CYGD',
123 'DICTYBASE_ID':'dictyBase',
124 'ECHOBASE_ID':'EchoBASE',
125 'ECOGENE_ID':'EcoGene',
126 'EUHCVDB_ID':'euHCVdb',
127 'EUPATHDB_ID':'EuPathDB',
128 'FLYBASE_ID':'FlyBase',
129 'GENECARDS_ID':'GeneCards',
130 'GENEFARM_ID':'GeneFarm',
131 'GENOLIST_ID':'GenoList',
132 'H_INVDB_ID':'H-InvDB',
133 'HGNC_ID':'HGNC',
134 'HPA_ID':'HPA',
135 'LEGIOLIST_ID':'LegioList',
136 'LEPROMA_ID':'Leproma',
137 'MAIZEGDB_ID':'MaizeGDB',
138 'MIM_ID':'MIM',
139 'MGI_ID':'MGI',
140 'NEXTPROT_ID':'neXtProt',
141 'ORPHANET_ID':'Orphanet',
142 'PHARMGKB_ID':'PharmGKB',
143 'POMBASE_ID':'PomBase',
144 'PSEUDOCAP_ID':'PseudoCAP',
145 'RGD_ID':'RGD',
146 'SGD_ID':'SGD',
147 'TAIR_ID':'TAIR',
148 'TUBERCULIST_ID':'TubercuList',
149 'WORMBASE_ID':'WormBase',
150 'WORMBASE_TRS_ID':'WormBase Transcript',
151 'WORMBASE_PRO_ID':'WormBase Protein',
152 'XENBASE_ID':'Xenbase',
153 'ZFIN_ID':'ZFIN',
154 #Category:Phylogenomic databases
155 'EGGNOG_ID':'eggNOG',
156 'GENETREE_ID':'GeneTree',
157 'HOGENOM_ID':'HOGENOM',
158 'HOVERGEN_ID':'HOVERGEN',
159 'KO_ID':'KO',
160 'OMA_ID':'OMA',
161 'ORTHODB_ID':'OrthoDB',
162 'PROTCLUSTDB_ID':'ProtClustDB',
163 'TREEFAM_ID':'TreeFam',
164 #Category:Enzyme and pathway databases
165 'BIOCYC_ID':'BioCyc',
166 'REACTOME_ID':'Reactome',
167 'UNIPATHWAY_ID':'UniPathWay',
168 #Category:Gene expression databases
169 'CLEANEX_ID':'CleanEx',
170 #Category:Other
171 'CHITARS_ID':'ChiTaRS',
172 'GENOMERNAI_ID':'GenomeRNAi',
173 'GENEWIKI_ID':'GeneWiki',
174 'NEXTBIO_ID':'NextBio'
175 }
176
177 """
178 TODO:
179 Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file
180 - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession.
181
182 # rather than an array of input accessions, we need to put them in a dict() by type
183 def addAccession(id_dict,id_type,id):
184 if id_type not in id_dict:
185 id_dict[id_type] = []-++
186 id_dict[id_type] = [].append(id)
187
188 # returns(accession, id_type)
189 def getAccession(header, matchType):
190 # (format regex pattern, FROM_ID)
191 # TODO try to determine which type of accession ID we have by matching by regular expressions
192 # each regex match should have a groups[0] that given the accession
193 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
194 fmts = [
195 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
196 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
197 ('>/UniRef\d+_(\w+).*','ACC+ID'),
198 ('>/(UPI\d+).*','ACC+ID'),
199 ('NP_\d+\.\d+','ACC+ID'),
200 ('ENSP\d+','ENSEMBL_PRO_ID'),
201 ('ENST\d+','ENSEMBL_TRS_ID'),
202 ]
203 for pat,cat in fmts:
204 m = re.match(pat,header)
205 if m:
206 matchType.append(cat)
207 return (m.groups()[0],cat)
208 matchType.append('ACC+ID')
209 return (header,'ACC+ID')
210
211 # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
212 idMaps = dict() # {to_id,idMap}
213 for to_id in options.to_id:
214 idMap = dict()
215 idMaps[to_id] = idMap
216 for (from_id,ids) id_dict.items():
217 # limit the requests to 500 at a time
218 idx = range(0,len(ids),500)
219 idx.append(len(ids))
220 for i in range(len(idx)-1):
221 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
222 """
223
224 """
225 Some examples of fasta ID lines From:
226 https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
227 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
228 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
229 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
230 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
231 >OExyz (OExyz) xxx xxx xxx
232 >hflu_lsi_xxxx xxx xxx xxx
233 >C_tr_Lx_x [xxx - xxx] | xxx xxx
234 >M. tub.xxx|Rvxxx| xxx xxx
235 // Drosophile DB.
236 // We need to find two elements:
237 // - the accession String (retrieved as the trimmed version of everything
238 // up to (and NOT including) " pep:"
239 // - the description (everything (trimmed) starting from (and including) the " pep:".
240 >CGxxxxxx pep:xxxxxx
241 >xxxx xxx SGDID:xxxx xxx
242 >generic_some_tag|proten_accession|a description for this protein
243 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as
244 // present in the Expasy FTP FASTA file.
245 // Is formatted something like this:
246 >XXX_YYYY (acc) rest
247 >sp|accession|ID descr rest (including taxonomy, if available
248 >tr|accession|ID descr rest (including taxonomy, if available)
249 >nxp|NX_P02768-1|ALB|Serum albumin|Iso
250 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
251 // present in the Expasy FTP FASTA file.
252 // Is formatted something like this:
253 >accession|ID descr rest (including taxonomy, if available)
254 // Flybase FASTA format.
255 >FBxxx type=xxx
256 // A header translating a genome sequence into a protein sequences.
257 // We need to find two elements, separated by a space:
258 // - the accession string (retrieved as the first part of a space delimited String).
259 // - the nucleic acid start and stop site (between brackets, separated by a '-').
260 >dm345_3L-sense [234353534-234353938]
261 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
262 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF
263 // The Arabidopsis thaliana database; TAIR format
264 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
265 ...
266 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all.
267 // Testing for this is somewhat more complicated.
268 // Often used simple header; looks like:
269 // >NP0465 (NP0465) A description for this protein.
270 // We need to find two elements:
271 // - the accession String (easily retrieved as the next String until a space is encountered).
272 // - the description
273 >NP0465 (NP0465) A description for this protein.
274 GenBank gb|accession|locus
275 EMBL Data Library emb|accession|locus
276 DDBJ, DNA Database of Japan dbj|accession|locus
277 NBRF PIR pir||entry
278 Protein Research Foundation prf||name
279 SWISS-PROT sp|accession|entry name
280 Brookhaven Protein Data Bank pdb|entry|chain
281 Patents pat|country|number
282 GenInfo Backbone Id bbs|number
283 General database identifier gnl|database|identifier
284 NCBI Reference Sequence ref|accession|locus
285 Local Sequence identifier lcl|identifier
286 """
287
288
289 def getFastaAccession(header):
290 # TODO parse the ID and return the accession
291 # (format regex pattern, FROM_ID)
292 # TODO try to determine which type of accession ID we have by matching by reg|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})ular expressions
293 # each regex match should have a groups[0] that given the accession
294 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
295 fmts = [
296 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
297 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
298 ('NP_\d+\.\d+','ACC+ID'),
299
300 ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
301 ('(...*)_....*','ID'),
302 ('>/(UPI\d+).*','UPARC'),
303 ('>/UniRef50_(\w+).*','NF50'),
304 ('UniRef90_(\w+).*','NF90'),
305 ('UniRef100_(\w+).*','NF100'),
306 ('([A-C][A-Z]?\d{6})|(DM-Z)\d{5})','EMBL_ID'),
307 ('ENSP\d+','ENSEMBL_PRO_ID'),
308 ('ENST\d+','ENSEMBL_TRS_ID')
309 ]
310 for pat,cat in fmts:
311 m = re.match(pat,header)
312 if m:
313 return m.groups()[0]
314 return header
315
316 # rather than an array of input accessions, we need to put them in a dict() by type
317 def addAccession(id_dict,id_type,id):
318 for idtype in id_type:
319 if idtype not in id_dict:
320 id_dict[idtype] = []
321 id_dict[idtype].append(id)
322
323 # returns(accession, id_type)
324 def getAccession(header, matchType):
325 # (format regex pattern, FROM_ID)
326 # TODO try to determine which type of accession ID we have by matching by regular expressions
327 # each regex match should have a groups[0] that given the accession
328 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
329 fmts = [
330 ('([OPQ][0-9][A-Z0-9]{3}[0-9])','ACC'),
331 ('([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
332 ('(THISDOESNTWORK...*)_....*','ID'),
333 ('UPI(\d+).*','UPARC'),
334 ('(UniRef50_\w+.*)','NF50'),
335 ('UniRef90_(\w+).*','NF90'),
336 ('(UniRef100_\w+.*)','NF100'),
337 ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
338 ('([A-Z]*\d*\.\d$)','EMBL'),
339 ('([IBC]\d{5})','PIR'),
340 ('(Hs\.\d*)','UNIGENE_ID'),
341 ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
342 ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
343 ('(\d[A-Z0-9]{3})','PDB_ID'),
344 ('(DP\d{5})','DISPROT_ID'),
345
346 ('(\d*$)','P_ENTREZGENEID'),
347 ('(\d*$)','P_GI'),
348 ('(\d*$)','BIOGRID_ID'),
349 ('(\d*$)','GUIDETOPHARMACOLOGY_ID'),
350 ('(\d*$)','ALLERGOME_ID'),
351 ('(\d*$)','PEROXIBASE_ID'),
352 ('(\d*$)','REBASE_ID'),
353 ('(\d*$)','DNASU_ID'),
354 ('(\d*$)','DMDM_ID'),
355
356 ('(DIP-\d*N$)','DIP_ID'),
357 ('(MINT-\d*)','MINT_ID'),
358 ('(9606\.ENSP\d*)','STRING_ID'),
359 ('(CHEMBL\d*)','CHEMBL_ID'),
360 ('(DB\d*)','DRUGBANK_ID'),
361 ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
362 ('NOTHING POPS UP','MYCOCLAP_ID'),
363 ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
364 ('NOTHING POPS UP','WORLD_2DPAGE_ID'),
365 ('(ENSG\d*)','ENSEMBAF85406.1BL_ID'),
366 ('(ENSP\d+)','ENSEMBL_PRO_ID'),
367 ('(ENST\d+)','ENSEMBL_TRS_ID'),
368 (' ','ENSEMBLGENOME_ID'),
369 (' ','ENSEMBLGENOME_PRO_ID'),
370 (' ','ENSEMBLGENOME_TRS_ID'),
371 ('(hsa:\d*)','KEGG_ID'),
372 ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
373 ('(.*[CKN]OG\d*)','EGGNOG_ID')
374 ]
375 n = re.match('\d*$', header)
376 if n: # For ambiguous number only ID types
377 numIDtypes = [
378 'P_ENTREZGENEID',
379 'P_GI',
380 'DMDM_ID',
381 'BIOGRID_ID',
382 'GUIDETOPHARMACOLOGY_ID',
383 'ALLERGOME_ID',
384 'PEROXIBASE_ID',
385 'REBASE_ID',
386 'DNASU_ID']
387 ambiguous = []
388 for numIDs in numIDtypes:
389 nm = getIdMapping([header],numIDs,'ACC',numReturn=True)
390 if nm != None:
391 ambiguous.append(nm)
392 matchType.append(ambiguous)
393 return (header, ambiguous)
394 for pat,cat in fmts:
395 m = re.match(pat,header)
396 if m:
397 matchType.append([cat])
398 return (m.groups()[0],[cat])
399 matchType.append(['ACC+ID'])
400 return (header,['ACC+ID'])
401
402
403 def read_tabular(filepath,col):
404 accessions = []
405 with open(filepath) as fp:
406 for i,line in enumerate(fp):
407 if line.strip() == '' or line.startswith('#'):
408 continue
409 fields = line.rstrip('\n').split('\t')
410 accession = fields[col]
411 accessions.append(accession)
412 return accessions
413
414 def get_fasta_entries(fp):
415 name, seq = None, []
416 for line in fp:
417 line = line.rstrip()
418 if line.startswith(">"):
419 if name: yield (name, ''.join(seq))
420 name, seq = line, []
421 else:
422 seq.append(line)
423 if name: yield (name, ''.join(seq))
424
425 def read_fasta(filepath):
426 accessions = []
427 with open(filepath) as fp:
428 for id, peptide in get_fasta_entries(fp):
429 accessions.append(getFastaAccession(id))
430 return accessions
431
432 def read_mzid(fp):
433 accessions = []
434 for event, elem in ET.iterparse(fp):
435 if event == 'end':
436 if re.search('DBSequence',elem.tag):
437 accessions.append(elem.attrib['accession'])
438 return accessions
439
440 def read_pepxml(fp):
441 accessions = []
442 for event, elem in ET.iterparse(fp):
443 if event == 'end':
444 if re.search('search_hit',elem.tag):
445 accessions.append(elem.get('protein'))
446 return accessions
447
448 def getUniprotSequence(uniprot_id):
449 url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
450 print url
451 fp = urllib2.urlopen(url)
452 for id, seq in get_fasta_entries(fp):
453 if seq:
454 return seq
455 return ''
456
457 def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,numReturn=None,crossReference=None,idMaps=None):
458 # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
459 if not accessions:
460 return
461 url = 'http://www.uniprot.org/mapping/'
462 if to_id != 'ACC' and crossReference: # cross-referencing from nonUniprot IDs
463 if 'ACC' in idMaps:
464 print idMaps
465 idMaps[to_id] = idMaps['ACC']
466 idMap = idMaps[to_id]
467 for acc in idMaps[to_id].keys():
468 print idMaps
469 for from_ID_ in idMaps[to_id][acc].keys():
470 cr = []
471 for ids in idMaps[to_id][acc][from_ID_]:
472 print ids
473 print to_id
474 accessions = getIdMapping(ids,'ACC',to_id,idMap=idMap,crossReference=False)
475 cr.append(accessions)
476 print cr
477 idMaps[to_id][acc][from_ID_] = cr
478 #accessions= getIdMapping(accessions,from_id,'ACC',idMap=idMap,crossReference=False)
479 if accessions != None:
480 if idMap != None:
481 idMap[accessions[0]] = {from_id:['N/A']}
482 return
483 params = {
484 'from': from_id,
485 'to': to_id,
486 'format':'tab',
487 'query': '\n'.join(accessions)
488 }
489 data = urllib.urlencode(params)
490 request = urllib2.Request(url, data)
491 contact = "" # Please set your email address here to help us debug in case of problems.
492 request.add_header('User-Agent', 'Python %s' % contact)
493 response = None
494 for i in range(3):
495 try:
496 response = urllib2.urlopen(request)
497 break
498 except Exception, e:
499 warn_err("%s",exit_code=None)
500 if response:
501 response.next()
502 for line in response:
503 # print >> sys.stderr, "idMap: %s" % line
504 if fh:
505 fh.write(line)
506 if numReturn:
507 id1,id2 = line.strip().split('\t')
508 return from_id
509 if idMap != None:
510 id1,id2 = line.strip().split('\t')
511 #print >> sys.stderr, "idMap: %s:%s" % (id1,id2)
512 if idMap.get(id1,'nothing') == 'nothing':
513 #idMap[id1] = [id2]
514 idMap[id1] = {from_id:[id2]}
515 else:
516 id3 = idMap[id1]
517 #if id2 not in id3:
518 #print id3
519 if id3.get(from_id,'nothing') == 'nothing':
520 id3[from_id] = [id2]
521 else:
522 id4 = id3[from_id]
523 id4.append(id2)
524 else:
525 print 'recursive'
526 id3 = []
527 id1,id2 = line.strip().split('\t')
528 id3.append(id2)
529 return id3
530 #idMap[id1] = id3
531 # python uniprot_id_mapping.py -T 'NF100' -test-data/old-inputs/var.tsv
532
533 #Parse Command Line
534 parser = optparse.OptionParser()
535 # input files
536 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
537 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
538 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
539 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
540 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
541 # Decoy pattern
542 parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' )
543 # filter patterns
544 parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
545 parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
546 # Unipept Flags
547 parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
548 parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
549 # output files
550 parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
551 # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
552 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
553 (options, args) = parser.parse_args()
554 if options.version:
555 print >> sys.stdout,"%s" % version
556 sys.exit(0)
557
558 accessions = []
559 ## Get accessions
560 if options.mzid:
561 accessions += read_mzid(options.mzid)
562 if options.pepxml:
563 accessions += read_pepxml(options.pepxml)
564 if options.tabular:
565 accessions += read_tabular(options.tabular,options.column)
566 if options.fasta:
567 accessions += read_fasta(options.fasta)
568 if args and len(args) > 0:
569 for i,accession in enumerate(args):
570 accessions.append(accession)
571 # filter accessions
572 if options.decoy:
573 filtered_accs = [re.sub(options.decoy,'',x) for x in accessions]
574 accessions = filtered_accs
575 if options.include_pat:
576 filtered_accs = []
577 for acc in accessions:
578 if re.match(options.include_pat,acc):
579 filtered_accs.append(acc)
580 accessions = filtered_accs
581 if options.exclude_pat:
582 filtered_accs = []
583 for acc in accessions:
584 if not re.match(options.exclude_pat,acc):
585 filtered_accs.append(acc)
586 accessions = filtered_accs
587 if len(accessions) < 1:
588 warn_err("No accessions input!",exit_code=1)
589 if options.output != None:
590 try:
591 outputPath = os.path.abspath(options.output)
592 outputFile = open(outputPath, 'w')
593 except Exception, e:
594 print >> sys.stderr, "failed: %s" % e
595 exit(3)
596 else:
597 outputFile = sys.stdout
598
599
600 # Sorts accessions to inferred ID types i+n a dictionary
601 id_dict = {}
602 matchType = []
603 for header in accessions:
604 id , id_types = getAccession(header,matchType)
605 addAccession(id_dict,id_types,id)
606 #print id_dict
607 #print id_dict
608 idMaps = dict() # {to_id,idMap}
609 #print matchType
610 #print id_dict.items()
611 for to_id in options.to_id:
612 idMap = dict()
613 idMaps[to_id] = idMap
614 for (from_id,ids) in id_dict.items():
615 # limit the requests to 500 at a time
616 idx = range(0,len(ids),500)
617 idx.append(len(ids))
618 for i in range(len(idx)-1):
619 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,crossReference=True,idMaps=idMaps)
620 #print idMaps
621 #Write output
622 #Output Table Header
623 outputFile.write("\n#%-17s" % (options.from_id))
624 for t in options.to_id:
625 outputFile.write("%-18s" % t)
626 outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")
627 #Output a row with values for each input ID
628 #for i, acc in enumerate(accessions):
629 #multi = 1
630 #for types in options.to_id:
631 # IDs = idMaps[types].get(acc,['none'])
632 # if len(IDs) > multi:
633 # multi = len(IDs)
634 """
635 outputFile.write("%-17s %-16s" % (acc, matchType[i][0]))
636 for x in range(len(matchType[i])):
637 if x != 0:
638 outputFile.write(("''" + " " * 16 + "%-16s") % matchType[i][x])
639 for types in options.to_id:
640 IDs = idMaps[types].get(acc,['none'])
641 if len(IDs) > x:
642 outputFile.write("%-18s" % IDs[x])
643 else:
644 if len(matchType[i]) > 1:
645 outputFile.write("%-18s" % IDs[0])
646 else:
647 outputFile.write("'' ")
648 outputFile.write("\n")
649 """
650 # Create an output-friendly matrix
651 idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))]
652 for a, acc in enumerate(accessions):
653 idLength = 0
654 for i, to_id in enumerate(options.to_id): # [[ids],[ids]]
655 idArrayColumn = []
656 idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3]
657 for from_id in idPairs:
658 ids = idPairs[from_id]
659 for id_ in ids:
660 idArrayColumn.append("%s[%s]" % (id_,from_id))
661 if idLength < len(idArrayColumn):
662 idLength = len(idArrayColumn)
663 idArray[a][i].extend(idArrayColumn)
664 for y in range(idLength):
665 outputFile.write("%-18s" % acc)
666 for x in range(len(options.to_id)):
667 try:
668 outputFile.write("%-18s" % idArray[a][x][y])
669 except:
670 outputFile.write(" " * 18)
671 outputFile.write("\n")
672 # Output Matrix
673
674
675
676 #print idMaps
677 # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
678 stop = timeit.default_timer()
679 outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))
680
681 if __name__ == "__main__" : __main__()
682