comparison uniprot_ID_mapper/uniprot_id_mapping_3.py @ 3:6651ac4651f0 draft default tip

Uploaded
author rsajulga
date Thu, 16 Jul 2015 16:41:35 -0400
#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
# University of Minnesota
# Copyright 2015, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
# James E Johnson
#
#------------------------------------------------------------------------------
"""
import json
import logging
import optparse
from optparse import OptionParser
import os
import sys
import re
import urllib
import urllib2
import timeit
start = timeit.default_timer()
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

"""
http://www.uniprot.org/help/programmatic_access#id_mapping_examples
http://www.uniprot.org/help/uploadlists

From input files or stdin, get the accessions/IDs to map.
An input option can specify what id_type the input accessions/IDs are;
otherwise we attempt to categorize the input accessions/IDs according to their inferred id_type:

input_dict = dict()
for id_string in input_ids:
    (id, id_type) = getAccession(id_string)
    if not id_type in input_dict:
        input_dict[id_type] = []
    input_dict[id_type].append(id)

# We then need to retrieve the mappings by from_id_type and to_id_type
for id_type in input_dict:
    for to_id_type in the requested to_id types:
        fetch the mapping id_type -> to_id_type

tabular output:
#input_id  to_id  to_id
"""
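# Illustrative sketch only (never called by this tool): the bare request that the
# uploadlists/mapping service linked above expects, and that getIdMapping() below
# builds in batches. The helper name and the example accessions are placeholders of
# mine; the 'from'/'to'/'format'/'query' parameters mirror getIdMapping() further down.
def _example_mapping_request(accessions, from_id='ACC+ID', to_id='PDB_ID'):
    """Return the raw tab-separated mapping text from uniprot.org/mapping."""
    params = {
        'from': from_id,
        'to': to_id,
        'format': 'tab',
        'query': '\n'.join(accessions),
    }
    request = urllib2.Request('http://www.uniprot.org/mapping/',
                              urllib.urlencode(params))
    return urllib2.urlopen(request).read()
# e.g. _example_mapping_request(['P69905', 'P68871']) would return one
# "<input_id>\t<mapped_id>" line per mapping, after a header row
# (which is why getIdMapping() skips the first response line).
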
def warn_err(msg, exit_code=1):
    sys.stderr.write(msg)
    if exit_code:
        sys.exit(exit_code)

def __main__():
    version = '1.0'
    idDict = {
        #Category:UniProt
        'ACC+ID':'UniProtKB AC/ID',
        'ACC':'UniProtKB AC',
        'ID':'UniProtKB ID',
        'UPARC':'UniParc',
        'NF50':'UniRef50',
        'NF90':'UniRef90',
        'NF100':'UniRef100',
        #Category:Other sequence databases
        'EMBL_ID':'EMBL/GenBank/DDBJ',
        'EMBL':'EMBL/GenBank/DDBJ CDS',
        'PIR':'PIR',
        'UNIGENE_ID':'UniGene',
        'P_ENTREZGENEID':'Entrez Gene (GeneID)',
        'P_GI':'GI number*',
        'P_REFSEQ_AC':'RefSeq Protein',
        'REFSEQ_NT_ID':'RefSeq Nucleotide',
        #Category:3D structure databases
        'PDB_ID':'PDB',
        'DISPROT_ID':'DisProt',
        #Category:Protein-protein interaction databases
        'BIOGRID_ID':'BioGrid',
        'DIP_ID':'DIP',
        'MINT_ID':'MINT',
        'STRING_ID':'STRING',
        #Category:Chemistry
        'CHEMBL_ID':'ChEMBL',
        'DRUGBANK_ID':'DrugBank',
        'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
        #Category:Protein family/group databases
        'ALLERGOME_ID':'Allergome',
        'MEROPS_ID':'MEROPS',
        'MYCOCLAP_ID':'mycoCLAP',
        'PEROXIBASE_ID':'PeroxiBase',
        'REBASE_ID':'REBASE',
        'TCDB_ID':'TCDB',
        #Category:Polymorphism databases
        'DMDM_ID':'DMDM',
        #Category:2D gel databases
        'WORLD_2DPAGE_ID':'World-2DPAGE',
        #Category:Protocols and materials databases
        'DNASU_ID':'DNASU',
        #Category:Genome annotation databases
        'ENSEMBL_ID':'Ensembl',
        'ENSEMBL_PRO_ID':'Ensembl Protein',
        'ENSEMBL_TRS_ID':'Ensembl Transcript',
        'ENSEMBLGENOME_ID':'Ensembl Genomes',
        'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
        'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
        'P_ENTREZGENEID':'GeneID',
        'KEGG_ID':'KEGG',
        'PATRIC_ID':'PATRIC',
        'UCSC_ID':'UCSC',
        'VECTORBASE_ID':'VectorBase',
        #Category:Organism-specific gene databases
        'ARACHNOSERVER_ID':'ArachnoServer',
        'CGD':'CGD',
        'CONOSERVER_ID':'ConoServer',
        'CYGD_ID':'CYGD',
        'DICTYBASE_ID':'dictyBase',
        'ECHOBASE_ID':'EchoBASE',
        'ECOGENE_ID':'EcoGene',
        'EUHCVDB_ID':'euHCVdb',
        'EUPATHDB_ID':'EuPathDB',
        'FLYBASE_ID':'FlyBase',
        'GENECARDS_ID':'GeneCards',
        'GENEFARM_ID':'GeneFarm',
        'GENOLIST_ID':'GenoList',
        'H_INVDB_ID':'H-InvDB',
        'HGNC_ID':'HGNC',
        'HPA_ID':'HPA',
        'LEGIOLIST_ID':'LegioList',
        'LEPROMA_ID':'Leproma',
        'MAIZEGDB_ID':'MaizeGDB',
        'MIM_ID':'MIM',
        'MGI_ID':'MGI',
        'NEXTPROT_ID':'neXtProt',
        'ORPHANET_ID':'Orphanet',
        'PHARMGKB_ID':'PharmGKB',
        'POMBASE_ID':'PomBase',
        'PSEUDOCAP_ID':'PseudoCAP',
        'RGD_ID':'RGD',
        'SGD_ID':'SGD',
        'TAIR_ID':'TAIR',
        'TUBERCULIST_ID':'TubercuList',
        'WORMBASE_ID':'WormBase',
        'WORMBASE_TRS_ID':'WormBase Transcript',
        'WORMBASE_PRO_ID':'WormBase Protein',
        'XENBASE_ID':'Xenbase',
        'ZFIN_ID':'ZFIN',
        #Category:Phylogenomic databases
        'EGGNOG_ID':'eggNOG',
        'GENETREE_ID':'GeneTree',
        'HOGENOM_ID':'HOGENOM',
        'HOVERGEN_ID':'HOVERGEN',
        'KO_ID':'KO',
        'OMA_ID':'OMA',
        'ORTHODB_ID':'OrthoDB',
        'PROTCLUSTDB_ID':'ProtClustDB',
        'TREEFAM_ID':'TreeFam',
        #Category:Enzyme and pathway databases
        'BIOCYC_ID':'BioCyc',
        'REACTOME_ID':'Reactome',
        'UNIPATHWAY_ID':'UniPathWay',
        #Category:Gene expression databases
        'CLEANEX_ID':'CleanEx',
        #Category:Other
        'CHITARS_ID':'ChiTaRS',
        'GENOMERNAI_ID':'GenomeRNAi',
        'GENEWIKI_ID':'GeneWiki',
        'NEXTBIO_ID':'NextBio'
    }

    """
    TODO:
    It would be better to be able to infer the from_id type for the input accession/ID, especially for a fasta file.
    - Allow options.from_id to be unspecified and empty, and try to determine the from_id from the input ID/Accession.

    # rather than an array of input accessions, we need to put them in a dict() by type
    def addAccession(id_dict, id_type, id):
        if id_type not in id_dict:
            id_dict[id_type] = []
        id_dict[id_type].append(id)

    # returns (accession, id_type)
    def getAccession(header, matchType):
        # (format regex pattern, FROM_ID)
        # TODO try to determine which type of accession ID we have by matching regular expressions
        # each regex match should have a groups()[0] that gives the accession
        # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
        fmts = [
            ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
            ('>?gi\|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
            ('>?UniRef\d+_(\w+).*','ACC+ID'),
            ('>?(UPI\d+).*','ACC+ID'),
            ('(NP_\d+\.\d+)','ACC+ID'),
            ('(ENSP\d+)','ENSEMBL_PRO_ID'),
            ('(ENST\d+)','ENSEMBL_TRS_ID'),
        ]
        for pat, cat in fmts:
            m = re.match(pat, header)
            if m:
                matchType.append(cat)
                return (m.groups()[0], cat)
        matchType.append('ACC+ID')
        return (header, 'ACC+ID')

    # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
    idMaps = dict()  # {to_id: idMap}
    for to_id in options.to_id:
        idMap = dict()
        idMaps[to_id] = idMap
        for (from_id, ids) in id_dict.items():
            # limit the requests to 500 at a time
            idx = range(0, len(ids), 500)
            idx.append(len(ids))
            for i in range(len(idx)-1):
                getIdMapping(ids[idx[i]:idx[i+1]], from_id, to_id, idMap=idMap)
    """
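    # Sketch only (not used by the tool): the 500-at-a-time batching described in the
    # TODO above, written as a plain chunking helper. The name chunked() is mine; the
    # live code further down builds index ranges with range(0, len(ids), 500) instead,
    # which yields the same batches.
    def chunked(ids, size=500):
        """Yield successive slices of at most `size` ids."""
        for start_i in range(0, len(ids), size):
            yield ids[start_i:start_i + size]
    # e.g. for batch in chunked(accessions): getIdMapping(batch, from_id, to_id, idMap=idMap)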

    """
    Some examples of fasta ID lines. From:
    https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
    >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
    >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
    >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
    >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
    >OExyz (OExyz) xxx xxx xxx
    >hflu_lsi_xxxx xxx xxx xxx
    >C_tr_Lx_x [xxx - xxx] | xxx xxx
    >M. tub.xxx|Rvxxx| xxx xxx
    // Drosophila DB.
    // We need to find two elements:
    // - the accession String (retrieved as the trimmed version of everything
    //   up to (and NOT including) " pep:")
    // - the description (everything (trimmed) starting from (and including) the " pep:").
    >CGxxxxxx pep:xxxxxx
    >xxxx xxx SGDID:xxxx xxx
    >generic_some_tag|protein_accession|a description for this protein
    // Old (everything before the 9.0 release (31 Oct 2006)) standard SwissProt header as
    // present in the Expasy FTP FASTA file.
    // Is formatted something like this:
    >XXX_YYYY (acc) rest
    >sp|accession|ID descr rest (including taxonomy, if available)
    >tr|accession|ID descr rest (including taxonomy, if available)
    >nxp|NX_P02768-1|ALB|Serum albumin|Iso
    // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
    // present in the Expasy FTP FASTA file.
    // Is formatted something like this:
    >accession|ID descr rest (including taxonomy, if available)
    // Flybase FASTA format.
    >FBxxx type=xxx
    // A header translating a genome sequence into protein sequences.
    // We need to find two elements, separated by a space:
    // - the accession string (retrieved as the first part of a space delimited String).
    // - the nucleic acid start and stop site (between brackets, separated by a '-').
    >dm345_3L-sense [234353534-234353938]
    >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
    >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF
    // The Arabidopsis thaliana database; TAIR format
    >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
    ...
    // Okay, try the often-used 'generic' approach. If this fails, we go to the worst-case scenario, i.e. do not process at all.
    // Testing for this is somewhat more complicated.
    // Often used simple header; looks like:
    // >NP0465 (NP0465) A description for this protein.
    // We need to find two elements:
    // - the accession String (easily retrieved as the next String until a space is encountered).
    // - the description
    >NP0465 (NP0465) A description for this protein.
    GenBank                        gb|accession|locus
    EMBL Data Library              emb|accession|locus
    DDBJ, DNA Database of Japan    dbj|accession|locus
    NBRF PIR                       pir||entry
    Protein Research Foundation    prf||name
    SWISS-PROT                     sp|accession|entry name
    Brookhaven Protein Data Bank   pdb|entry|chain
    Patents                        pat|country|number
    GenInfo Backbone Id            bbs|number
    General database identifier    gnl|database|identifier
    NCBI Reference Sequence        ref|accession|locus
    Local Sequence identifier      lcl|identifier
    """
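    # Illustrative only: the "often-used 'generic' approach" described above, i.e.
    # accession = everything up to the first space, description = the rest. This
    # sketch is not called anywhere; getFastaAccession() below is what the tool uses.
    def _example_generic_split(header):
        parts = header.lstrip('>').split(' ', 1)
        accession = parts[0]
        description = parts[1] if len(parts) > 1 else ''
        return (accession, description)
    # _example_generic_split('>NP0465 (NP0465) A description for this protein.')
    # -> ('NP0465', '(NP0465) A description for this protein.')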


    def getFastaAccession(header):
        # TODO parse the ID and return the accession
        # (format regex pattern, FROM_ID)
        # TODO try to determine which type of accession ID we have by matching regular expressions
        # each regex match should have a groups()[0] that gives the accession
        # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
        fmts = [
            ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
            ('>?gi\|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
            ('(NP_\d+\.\d+)','ACC+ID'),

            ('([OPQ][0-9][A-Z0-9]{3}[0-9])','ACC'),
            ('([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
            ('(...*)_....*','ID'),
            ('>?(UPI\d+).*','UPARC'),
            ('>?UniRef50_(\w+).*','NF50'),
            ('UniRef90_(\w+).*','NF90'),
            ('UniRef100_(\w+).*','NF100'),
            ('([A-C][A-Z]?\d{6})','EMBL_ID'),
            ('([DM-Z]\d{5})','EMBL_ID'),
            ('(ENSP\d+)','ENSEMBL_PRO_ID'),
            ('(ENST\d+)','ENSEMBL_TRS_ID')
        ]
        for pat, cat in fmts:
            m = re.match(pat, header)
            if m:
                return m.groups()[0]
        return header

    # rather than an array of input accessions, we need to put them in a dict() by type
    def addAccession(id_dict, id_type, id):
        for idtype in id_type:
            if idtype not in id_dict:
                id_dict[idtype] = []
            id_dict[idtype].append(id)

    # returns (accession, id_type)
    def getAccession(header, matchType):
        # (format regex pattern, FROM_ID)
        # TODO try to determine which type of accession ID we have by matching regular expressions
        # each regex match should have a groups()[0] that gives the accession
        # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
        fmts = [
            ('([OPQ][0-9][A-Z0-9]{3}[0-9])','ACC'),
            ('([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
            ('(THISDOESNTWORK...*)_....*','ID'),  # UniProtKB ID pattern effectively disabled (placeholder never matches)
            ('UPI(\d+).*','UPARC'),
            ('(UniRef50_\w+.*)','NF50'),
            ('UniRef90_(\w+).*','NF90'),
            ('(UniRef100_\w+.*)','NF100'),
            ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
            ('([A-Z]*\d*\.\d$)','EMBL'),
            ('([IBC]\d{5})','PIR'),
            ('(Hs\.\d*)','UNIGENE_ID'),
            ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
            ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
            ('(\d[A-Z0-9]{3})','PDB_ID'),
            ('(DP\d{5})','DISPROT_ID'),

            ('(\d*$)','P_ENTREZGENEID'),
            ('(\d*$)','P_GI'),
            ('(\d*$)','BIOGRID_ID'),
            ('(\d*$)','GUIDETOPHARMACOLOGY_ID'),
            ('(\d*$)','ALLERGOME_ID'),
            ('(\d*$)','PEROXIBASE_ID'),
            ('(\d*$)','REBASE_ID'),
            ('(\d*$)','DNASU_ID'),
            ('(\d*$)','DMDM_ID'),

            ('(DIP-\d*N$)','DIP_ID'),
            ('(MINT-\d*)','MINT_ID'),
            ('(9606\.ENSP\d*)','STRING_ID'),
            ('(CHEMBL\d*)','CHEMBL_ID'),
            ('(DB\d*)','DRUGBANK_ID'),
            ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
            ('NOTHING POPS UP','MYCOCLAP_ID'),
            ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
            ('NOTHING POPS UP','WORLD_2DPAGE_ID'),
            ('(ENSG\d*)','ENSEMBL_ID'),
            ('(ENSP\d+)','ENSEMBL_PRO_ID'),
            ('(ENST\d+)','ENSEMBL_TRS_ID'),
            (' ','ENSEMBLGENOME_ID'),
            (' ','ENSEMBLGENOME_PRO_ID'),
            (' ','ENSEMBLGENOME_TRS_ID'),
            ('(hsa:\d*)','KEGG_ID'),
            ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
            ('(.*[CKN]OG\d*)','EGGNOG_ID')
        ]
        n = re.match('\d*$', header)
        if n:  # For ambiguous number-only ID types
            numIDtypes = [
                'P_ENTREZGENEID',
                'P_GI',
                'DMDM_ID',
                'BIOGRID_ID',
                'GUIDETOPHARMACOLOGY_ID',
                'ALLERGOME_ID',
                'PEROXIBASE_ID',
                'REBASE_ID',
                'DNASU_ID']
            ambiguous = []
            for numIDs in numIDtypes:
                # ask the mapping service which numeric ID types actually map for this value
                nm = getIdMapping([header], numIDs, 'ACC', numReturn=True)
                if nm != None:
                    ambiguous.append(nm)
            matchType.append(ambiguous)
            return (header, ambiguous)
        for pat, cat in fmts:
            m = re.match(pat, header)
            if m:
                matchType.append([cat])
                return (m.groups()[0], [cat])
        matchType.append(['ACC+ID'])
        return (header, ['ACC+ID'])


    def read_tabular(filepath, col):
        accessions = []
        with open(filepath) as fp:
            for i, line in enumerate(fp):
                if line.strip() == '' or line.startswith('#'):
                    continue
                fields = line.rstrip('\n').split('\t')
                accession = fields[col]
                accessions.append(accession)
        return accessions

    def get_fasta_entries(fp):
        name, seq = None, []
        for line in fp:
            line = line.rstrip()
            if line.startswith(">"):
                if name: yield (name, ''.join(seq))
                name, seq = line, []
            else:
                seq.append(line)
        if name: yield (name, ''.join(seq))

    def read_fasta(filepath):
        accessions = []
        with open(filepath) as fp:
            for id, peptide in get_fasta_entries(fp):
                accessions.append(getFastaAccession(id))
        return accessions

    def read_mzid(fp):
        accessions = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('DBSequence', elem.tag):
                    accessions.append(elem.attrib['accession'])
        return accessions

    def read_pepxml(fp):
        accessions = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('search_hit', elem.tag):
                    accessions.append(elem.get('protein'))
        return accessions

    def getUniprotSequence(uniprot_id):
        url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
        print url
        fp = urllib2.urlopen(url)
        for id, seq in get_fasta_entries(fp):
            if seq:
                return seq
        return ''


    def getIdMapping(accessions, from_id, to_id, fh=None, idMap=None, numReturn=None, crossReference=None, idMaps=None):
        # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
        if not accessions:
            return
        url = 'http://www.uniprot.org/mapping/'
        if to_id != 'ACC' and from_id != 'ACC':
            # the mapping service requires one side to be UniProtKB AC,
            # so map to ACC first and cross-reference to the requested to_id
            crossReference = True
            oldto_id = to_id
            to_id = 'ACC'
        #accessions= getIdMapping(accessions,from_id,'ACC',idMap=idMap,crossReference=False)
        #if accessions != None:
        #    if idMap != None:
        #        idMap[accessions[0]] = {from_id:['N/A']}
        params = {
            'from': from_id,
            'to': to_id,
            'format': 'tab',
            'query': '\n'.join(accessions)
        }
        data = urllib.urlencode(params)
        request = urllib2.Request(url, data)
        contact = ""  # Please set your email address here to help us debug in case of problems.
        request.add_header('User-Agent', 'Python %s' % contact)
        response = None
        for i in range(3):
            try:
                response = urllib2.urlopen(request)
                break
            except Exception, e:
                warn_err("%s\n" % e, exit_code=None)
        if response:
            response.next()  # skip the header line of the tab-separated response
            id3 = []
            for line in response:
                # print >> sys.stderr, "idMap: %s" % line
                if fh:
                    fh.write(line)
                if numReturn:
                    id1, id2 = line.strip().split('\t')
                    return from_id
                if idMap != None:
                    id1, id2 = line.strip().split('\t')
                    # print >> sys.stderr, "idMap: %s:%s" % (id1,id2)
                    if crossReference == True:
                        id2 = getIdMapping([id2], 'ACC', oldto_id, crossReference='return', idMap={})
                        print id2
                    if crossReference == 'return':
                        id3.append(id2)
                        print id3
                    else:
                        # break
                        if idMap.get(id1, 'nothing') == 'nothing':
                            #idMap[id1] = [id2]
                            idMap[id1] = {from_id: [id2]}
                        else:
                            id3 = idMap[id1]
                            #if id2 not in id3:
                            #print id3
                            if id3.get(from_id, 'nothing') == 'nothing':
                                if crossReference:
                                    id3[from_id] = id2
                                else:
                                    id3[from_id] = [id2]
                            else:
                                id4 = id3[from_id]
                                if crossReference:
                                    id4.extend(id2)
                                else:
                                    id4.append(id2)
                else:
                    id1, id2 = line.strip().split('\t')
                    id3[id1] = id2
            try:
                return id3
            except:
                return
        #idMap[id1] = id3

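    # Illustrative use of getIdMapping() (hypothetical accessions; kept as a comment
    # because running it would issue a live HTTP request):
    #
    #   exampleMap = {}
    #   getIdMapping(['P69905', 'P68871'], 'ACC', 'PDB_ID', idMap=exampleMap)
    #   # exampleMap then has the shape  input_id -> {from_id: [mapped ids]},
    #   # e.g. {'P69905': {'ACC': ['<pdb id>', ...]}, 'P68871': {'ACC': [...]}},
    #   # which the output section below renders as one "id[from_id]" cell per mapping.
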
    #Parse Command Line
    parser = optparse.OptionParser()
    # input files
    parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a column of accessions/IDs' )
    parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains the accessions/IDs' )
    parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file; accessions/IDs are taken from the sequence headers' )
    parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mzIdentML file; accessions are taken from its DBSequence elements' )
    parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepXML file; accessions are taken from its search_hit elements' )
    # Decoy pattern
    parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs, e.g. _REVERSED' )
    # filter patterns
    parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
    parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
    # UniProt ID mapping options
    parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
    parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type (may be given multiple times)' )
    # output files
    parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values' )
    # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
    (options, args) = parser.parse_args()
    if options.version:
        print >> sys.stdout, "%s" % version
        sys.exit(0)

    accessions = []
    ## Get accessions
    if options.mzid:
        accessions += read_mzid(options.mzid)
    if options.pepxml:
        accessions += read_pepxml(options.pepxml)
    if options.tabular:
        accessions += read_tabular(options.tabular, options.column)
    if options.fasta:
        accessions += read_fasta(options.fasta)
    if args and len(args) > 0:
        for i, accession in enumerate(args):
            accessions.append(accession)
    # filter accessions
    if options.decoy:
        filtered_accs = [re.sub(options.decoy, '', x) for x in accessions]
        accessions = filtered_accs
    if options.include_pat:
        filtered_accs = []
        for acc in accessions:
            if re.match(options.include_pat, acc):
                filtered_accs.append(acc)
        accessions = filtered_accs
    if options.exclude_pat:
        filtered_accs = []
        for acc in accessions:
            if not re.match(options.exclude_pat, acc):
                filtered_accs.append(acc)
        accessions = filtered_accs
    if len(accessions) < 1:
        warn_err("No accessions input!", exit_code=1)
    if options.output != None:
        try:
            outputPath = os.path.abspath(options.output)
            outputFile = open(outputPath, 'w')
        except Exception, e:
            print >> sys.stderr, "failed: %s" % e
            exit(3)
    else:
        outputFile = sys.stdout


    # Removes duplicates in accessions
    seen = set()
    seen_add = seen.add
    accessions = [x for x in accessions if not (x in seen or seen_add(x))]
    # Sorts accessions into inferred ID types in a dictionary
    id_dict = {}
    matchType = []
    for header in accessions:
        id, id_types = getAccession(header, matchType)
        addAccession(id_dict, id_types, id)
    #print id_dict
    idMaps = dict()  # {to_id: idMap}
    #print matchType
    #print id_dict.items()
    for to_id in options.to_id:
        idMap = dict()
        idMaps[to_id] = idMap
        for (from_id, ids) in id_dict.items():
            # limit the requests to 500 at a time
            idx = range(0, len(ids), 500)
            idx.append(len(ids))
            for i in range(len(idx)-1):
                getIdMapping(ids[idx[i]:idx[i+1]], from_id, to_id, idMap=idMap, idMaps=idMaps)
    # Write output
    # Output Table Header
    outputFile.write("\n#%-17s" % (options.from_id))
    for t in options.to_id:
        outputFile.write("%-18s" % t)
    outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")

    # Create an output-friendly matrix
    idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))]
    for a, acc in enumerate(accessions):
        idLength = 0
        for i, to_id in enumerate(options.to_id):  # [[ids],[ids]]
            idArrayColumn = []
            idPairs = idMaps[to_id][acc]  # {from_id:[IDs]} -> [from_id1,from_id2,from_id3]
            for from_id in idPairs:
                ids = idPairs[from_id]
                for id_ in ids:
                    idArrayColumn.append("%s[%s]" % (id_, from_id))
            if idLength < len(idArrayColumn):
                idLength = len(idArrayColumn)
            idArray[a][i].extend(idArrayColumn)
        for y in range(idLength):
            outputFile.write("%-18s" % acc)
            for x in range(len(options.to_id)):
                try:
                    outputFile.write("%-18s" % idArray[a][x][y])
                except:
                    outputFile.write(" " * 18)
            outputFile.write("\n")
    # Output Matrix


    #print idMaps
    # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
    stop = timeit.default_timer()
    outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))

if __name__ == "__main__": __main__()
