|
3
|
1 #!/usr/bin/env python
|
|
|
2 """
|
|
|
3 #
|
|
|
4 #------------------------------------------------------------------------------
|
|
|
5 # University of Minnesota
|
|
|
6 # Copyright 2015, Regents of the University of Minnesota
|
|
|
7 #------------------------------------------------------------------------------
|
|
|
8 # Author:
|
|
|
9 #
|
|
|
10 # James E Johnson
|
|
|
11 #
|
|
|
12 #------------------------------------------------------------------------------
|
|
|
13 """
|
|
|
14 import json
|
|
|
15 import logging
|
|
|
16 import optparse
|
|
|
17 from optparse import OptionParser
|
|
|
18 import os
|
|
|
19 import sys
|
|
|
20 import re
|
|
|
21 import urllib
|
|
|
22 import urllib2
|
|
|
23 import timeit
|
|
|
# Wall-clock reference point; the elapsed run time is appended to the report
# at the very end of __main__().
start = timeit.default_timer()
# Prefer the C-accelerated ElementTree parser; fall back to the pure-Python
# implementation when cElementTree is unavailable.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
|
|
|
29
|
|
|
30 """print
|
|
|
31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples
|
|
|
32 http://www.uniprot.org/help/uploadlists
|
|
|
33
|
|
|
34 From input files or stdin get accessions/IDs to map.
|
|
|
35 An input option can specify what id_type the input accessions/IDs are,
|
|
|
36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type
|
|
|
37
|
|
|
38 input_dict = dict()
|
|
|
39 for id_string in input_ids:
|
|
|
40 (id,id_type) = getAccession(id_string)
|
|
|
41 if not id_type in input_dict:
|
|
|
42 input_dict[id_type] = []
|
|
|
43 input_dict[id_type].append(id)
|
|
|
44
|
|
|
45
|
|
|
46 # We need to retrieve by from_id_type and to_id_type
|
|
|
47 for id_type in input_dict:
|
|
|
48 for
|
|
|
49
|
|
|
50
|
|
|
51 tabular output:
|
|
|
52 #input_id to_id to_id
|
|
|
53
|
|
|
54
|
|
|
55 """
|
|
|
def warn_err(msg, exit_code=1):
    """Write *msg* to stderr and terminate with *exit_code* unless it is falsy.

    Passing exit_code=None (or 0) turns this into a non-fatal warning.
    """
    sys.stderr.write(msg)
    if not exit_code:
        return
    sys.exit(exit_code)
|
|
|
60
|
|
|
def __main__():
    """Map protein/gene accessions between databases using the uniprot.org
    ID-mapping service, writing a fixed-width tabular report."""
    version = '1.0'
    # Mapping-service option code -> human-readable database name, per
    # http://www.uniprot.org/help/programmatic_access#id_mapping_examples
    # Only the KEYS are used by the code below (as the legal choices of the
    # -F/--from and -T/--to options); the values are documentation.
    idDict = {
        #Category:UniProt
        'ACC+ID':'UniProtKB AC/ID',
        'ACC':'UniProtKB AC',
        'ID':'UniProtKB ID',
        'UPARC':'UniParc',
        'NF50':'UniRef50',
        'NF90':'UniRef90',
        'NF100':'UniRef100',
        #Category:Other sequence databases
        'EMBL_ID':'EMBL/GenBank/DDBJ',
        'EMBL':'EMBL/GenBank/DDBJ CDS',
        'PIR':'PIR',
        'UNIGENE_ID':'UniGene',
        # NOTE(review): 'P_ENTREZGENEID' appears again below under
        # "Genome annotation databases"; the later value ('GeneID') silently
        # overrides this one.  Harmless because only keys are used.
        'P_ENTREZGENEID':'Entrez Gene (GeneID)',
        'P_GI':'GI number*',
        'P_REFSEQ_AC':'RefSeq Protein',
        'REFSEQ_NT_ID':'RefSeq Nucleotide',
        #Category:3D structure databases
        'PDB_ID':'PDB',
        'DISPROT_ID':'DisProt',
        #Category:Protein-protein interaction databases
        'BIOGRID_ID':'BioGrid',
        'DIP_ID':'DIP',
        'MINT_ID':'MINT',
        'STRING_ID':'STRING',
        #Category:Chemistry
        'CHEMBL_ID':'ChEMBL',
        'DRUGBANK_ID':'DrugBank',
        'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
        #Category:Protein family/group databases
        'ALLERGOME_ID':'Allergome',
        'MEROPS_ID':'MEROPS',
        'MYCOCLAP_ID':'mycoCLAP',
        'PEROXIBASE_ID':'PeroxiBase',
        'REBASE_ID':'REBASE',
        'TCDB_ID':'TCDB',
        #Category:Polymorphism databases
        'DMDM_ID':'DMDM',
        #Category:2D gel databases
        'WORLD_2DPAGE_ID':'World-2DPAGE',
        #Category:Protocols and materials databases
        'DNASU_ID':'DNASU',
        #Category:Genome annotation databases
        'ENSEMBL_ID':'Ensembl',
        'ENSEMBL_PRO_ID':'Ensembl Protein',
        'ENSEMBL_TRS_ID':'Ensembl Transcript',
        'ENSEMBLGENOME_ID':'Ensembl Genomes',
        'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
        'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
        # NOTE(review): duplicate key (see the UniProt category above);
        # this entry wins.
        'P_ENTREZGENEID':'GeneID',
        'KEGG_ID':'KEGG',
        'PATRIC_ID':'PATRIC',
        'UCSC_ID':'UCSC',
        'VECTORBASE_ID':'VectorBase',
        #Category:Organism-specific gene databases
        'ARACHNOSERVER_ID':'ArachnoServer',
        'CGD':'CGD',
        'CONOSERVER_ID':'ConoServer',
        'CYGD_ID':'CYGD',
        'DICTYBASE_ID':'dictyBase',
        'ECHOBASE_ID':'EchoBASE',
        'ECOGENE_ID':'EcoGene',
        'EUHCVDB_ID':'euHCVdb',
        'EUPATHDB_ID':'EuPathDB',
        'FLYBASE_ID':'FlyBase',
        'GENECARDS_ID':'GeneCards',
        'GENEFARM_ID':'GeneFarm',
        'GENOLIST_ID':'GenoList',
        'H_INVDB_ID':'H-InvDB',
        'HGNC_ID':'HGNC',
        'HPA_ID':'HPA',
        'LEGIOLIST_ID':'LegioList',
        'LEPROMA_ID':'Leproma',
        'MAIZEGDB_ID':'MaizeGDB',
        'MIM_ID':'MIM',
        'MGI_ID':'MGI',
        'NEXTPROT_ID':'neXtProt',
        'ORPHANET_ID':'Orphanet',
        'PHARMGKB_ID':'PharmGKB',
        'POMBASE_ID':'PomBase',
        'PSEUDOCAP_ID':'PseudoCAP',
        'RGD_ID':'RGD',
        'SGD_ID':'SGD',
        'TAIR_ID':'TAIR',
        'TUBERCULIST_ID':'TubercuList',
        'WORMBASE_ID':'WormBase',
        'WORMBASE_TRS_ID':'WormBase Transcript',
        'WORMBASE_PRO_ID':'WormBase Protein',
        'XENBASE_ID':'Xenbase',
        'ZFIN_ID':'ZFIN',
        #Category:Phylogenomic databases
        'EGGNOG_ID':'eggNOG',
        'GENETREE_ID':'GeneTree',
        'HOGENOM_ID':'HOGENOM',
        'HOVERGEN_ID':'HOVERGEN',
        'KO_ID':'KO',
        'OMA_ID':'OMA',
        'ORTHODB_ID':'OrthoDB',
        'PROTCLUSTDB_ID':'ProtClustDB',
        'TREEFAM_ID':'TreeFam',
        #Category:Enzyme and pathway databases
        'BIOCYC_ID':'BioCyc',
        'REACTOME_ID':'Reactome',
        'UNIPATHWAY_ID':'UniPathWay',
        #Category:Gene expression databases
        'CLEANEX_ID':'CleanEx',
        #Category:Other
        'CHITARS_ID':'ChiTaRS',
        'GENOMERNAI_ID':'GenomeRNAi',
        'GENEWIKI_ID':'GeneWiki',
        'NEXTBIO_ID':'NextBio'
    }
|
|
|
176
|
|
|
177 """
|
|
|
178 TODO:
|
|
|
179 Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file
|
|
|
180 - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession.
|
|
|
181
|
|
|
182 # rather than an array of input accessions, we need to put them in a dict() by type
|
|
|
183 def addAccession(id_dict,id_type,id):
|
|
|
184 if id_type not in id_dict:
|
|
|
185         id_dict[id_type] = []
|
|
|
186     id_dict[id_type].append(id)
|
|
|
187
|
|
|
188 # returns(accession, id_type)
|
|
|
189 def getAccession(header, matchType):
|
|
|
190 # (format regex pattern, FROM_ID)
|
|
|
191 # TODO try to determine which type of accession ID we have by matching by regular expressions
|
|
|
192 # each regex match should have a groups[0] that given the accession
|
|
|
193 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
|
|
|
194 fmts = [
|
|
|
195 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
|
|
|
196 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
|
|
|
197 ('>/UniRef\d+_(\w+).*','ACC+ID'),
|
|
|
198 ('>/(UPI\d+).*','ACC+ID'),
|
|
|
199 ('NP_\d+\.\d+','ACC+ID'),
|
|
|
200 ('ENSP\d+','ENSEMBL_PRO_ID'),
|
|
|
201 ('ENST\d+','ENSEMBL_TRS_ID'),
|
|
|
202 ]
|
|
|
203 for pat,cat in fmts:
|
|
|
204 m = re.match(pat,header)
|
|
|
205 if m:
|
|
|
206 matchType.append(cat)
|
|
|
207 return (m.groups()[0],cat)
|
|
|
208 matchType.append('ACC+ID')
|
|
|
209 return (header,'ACC+ID')
|
|
|
210
|
|
|
211 # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
|
|
|
212 idMaps = dict() # {to_id,idMap}
|
|
|
213 for to_id in options.to_id:
|
|
|
214 idMap = dict()
|
|
|
215 idMaps[to_id] = idMap
|
|
|
216 for (from_id,ids) in id_dict.items():
|
|
|
217 # limit the requests to 500 at a time
|
|
|
218 idx = range(0,len(ids),500)
|
|
|
219 idx.append(len(ids))
|
|
|
220 for i in range(len(idx)-1):
|
|
|
221 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
|
|
|
222 """
|
|
|
223
|
|
|
224 """
|
|
|
225 Some examples of fasta ID lines From:
|
|
|
226 https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
|
|
|
227 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
|
|
|
228 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
|
|
|
229 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
|
|
|
230 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
|
|
|
231 >OExyz (OExyz) xxx xxx xxx
|
|
|
232 >hflu_lsi_xxxx xxx xxx xxx
|
|
|
233 >C_tr_Lx_x [xxx - xxx] | xxx xxx
|
|
|
234 >M. tub.xxx|Rvxxx| xxx xxx
|
|
|
235 // Drosophile DB.
|
|
|
236 // We need to find two elements:
|
|
|
237 // - the accession String (retrieved as the trimmed version of everything
|
|
|
238 // up to (and NOT including) " pep:"
|
|
|
239 // - the description (everything (trimmed) starting from (and including) the " pep:".
|
|
|
240 >CGxxxxxx pep:xxxxxx
|
|
|
241 >xxxx xxx SGDID:xxxx xxx
|
|
|
242 >generic_some_tag|protein_accession|a description for this protein
|
|
|
243 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as
|
|
|
244 // present in the Expasy FTP FASTA file.
|
|
|
245 // Is formatted something like this:
|
|
|
246 >XXX_YYYY (acc) rest
|
|
|
247 >sp|accession|ID descr rest (including taxonomy, if available
|
|
|
248 >tr|accession|ID descr rest (including taxonomy, if available)
|
|
|
249 >nxp|NX_P02768-1|ALB|Serum albumin|Iso
|
|
|
250 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
|
|
|
251 // present in the Expasy FTP FASTA file.
|
|
|
252 // Is formatted something like this:
|
|
|
253 >accession|ID descr rest (including taxonomy, if available)
|
|
|
254 // Flybase FASTA format.
|
|
|
255 >FBxxx type=xxx
|
|
|
256 // A header translating a genome sequence into a protein sequences.
|
|
|
257 // We need to find two elements, separated by a space:
|
|
|
258 // - the accession string (retrieved as the first part of a space delimited String).
|
|
|
259 // - the nucleic acid start and stop site (between brackets, separated by a '-').
|
|
|
260 >dm345_3L-sense [234353534-234353938]
|
|
|
261 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
|
|
|
262 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF
|
|
|
263 // The Arabidopsis thaliana database; TAIR format
|
|
|
264 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
|
|
|
265 ...
|
|
|
266 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all.
|
|
|
267 // Testing for this is somewhat more complicated.
|
|
|
268 // Often used simple header; looks like:
|
|
|
269 // >NP0465 (NP0465) A description for this protein.
|
|
|
270 // We need to find two elements:
|
|
|
271 // - the accession String (easily retrieved as the next String until a space is encountered).
|
|
|
272 // - the description
|
|
|
273 >NP0465 (NP0465) A description for this protein.
|
|
|
274 GenBank gb|accession|locus
|
|
|
275 EMBL Data Library emb|accession|locus
|
|
|
276 DDBJ, DNA Database of Japan dbj|accession|locus
|
|
|
277 NBRF PIR pir||entry
|
|
|
278 Protein Research Foundation prf||name
|
|
|
279 SWISS-PROT sp|accession|entry name
|
|
|
280 Brookhaven Protein Data Bank pdb|entry|chain
|
|
|
281 Patents pat|country|number
|
|
|
282 GenInfo Backbone Id bbs|number
|
|
|
283 General database identifier gnl|database|identifier
|
|
|
284 NCBI Reference Sequence ref|accession|locus
|
|
|
285 Local Sequence identifier lcl|identifier
|
|
|
286 """
|
|
|
287
|
|
|
288
|
|
|
def getFastaAccession(header):
    """Parse a FASTA header line (or bare ID) and return its accession.

    Patterns are tried in order and the first match wins; when nothing
    matches, *header* is returned unchanged.
    """
    # (format regex pattern, FROM_ID)
    # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
    fmts = [
        ('>?(?:sp|tr|sw)\|(\w+).*', 'ACC+ID'),
        # Fixed: was '>?gi|...' with an unescaped '|', which re parsed as an
        # alternation ('>?gi' OR '\d+\|ref\|...') and matched without a group.
        ('>?gi\|\d+\|ref\|(NP_\d+\.\d+).*', 'ACC+ID'),
        ('NP_\d+\.\d+', 'ACC+ID'),

        ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})', 'ACC'),
        ('(...*)_....*', 'ID'),
        # Fixed: these three used a '>/' prefix that no FASTA header carries,
        # so they could never match.
        ('>?(UPI\d+).*', 'UPARC'),
        ('>?UniRef50_(\w+).*', 'NF50'),
        ('UniRef90_(\w+).*', 'NF90'),
        ('UniRef100_(\w+).*', 'NF100'),
        # Fixed: was '([A-C][A-Z]?\d{6})|(DM-Z)\d{5})' -- the unbalanced ')'
        # raised re.error for any header that reached this pattern.
        ('([A-C][A-Z]?\d{6})|([D-Z]\d{5})', 'EMBL_ID'),
        ('ENSP\d+', 'ENSEMBL_PRO_ID'),
        ('ENST\d+', 'ENSEMBL_TRS_ID'),
    ]
    for pat, cat in fmts:
        m = re.match(pat, header)
        if m:
            # Return the first non-None group.  The old m.groups()[0] returned
            # None whenever a later alternative matched, and raised IndexError
            # on the group-less patterns (bare NP_/ENSP/ENST); fall back to the
            # whole match for those.
            return next((g for g in m.groups() if g is not None), m.group(0))
    return header
|
|
|
315
|
|
|
# Accessions are grouped into a dict() keyed by inferred ID type rather than
# kept as one flat list.
def addAccession(id_dict, id_type, id):
    """Register *id* under every candidate type in the *id_type* list,
    creating the per-type bucket on first use."""
    for candidate in id_type:
        id_dict.setdefault(candidate, []).append(id)
|
|
|
322
|
|
|
# returns (accession, [id_type, ...])
def getAccession(header, matchType):
    """Infer which mapping-service ID type(s) *header* belongs to.

    Appends the inferred type list to *matchType* (the caller's running log)
    and returns (accession, [id_type, ...]).  A digits-only header is
    ambiguous, so each candidate numeric type is probed against the mapping
    service via getIdMapping(); everything else is classified by regex, first
    match wins, with 'ACC+ID' as the fallback.
    """
    # (format regex pattern, FROM_ID) -- order matters.
    # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
    fmts = [
        ('([OPQ][0-9][A-Z0-9]{3}[0-9])','ACC'),
        ('([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
        ('(THISDOESNTWORK...*)_....*','ID'),   # deliberately disabled pattern
        ('UPI(\d+).*','UPARC'),
        ('(UniRef50_\w+.*)','NF50'),
        ('UniRef90_(\w+).*','NF90'),
        ('(UniRef100_\w+.*)','NF100'),
        ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
        ('([A-Z]*\d*\.\d$)','EMBL'),
        ('([IBC]\d{5})','PIR'),
        ('(Hs\.\d*)','UNIGENE_ID'),
        ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
        ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
        ('(\d[A-Z0-9]{3})','PDB_ID'),
        ('(DP\d{5})','DISPROT_ID'),

        # Digits-only types (normally resolved by the probe above; these only
        # fire if the probe branch was skipped).
        ('(\d*$)','P_ENTREZGENEID'),
        ('(\d*$)','P_GI'),
        ('(\d*$)','BIOGRID_ID'),
        ('(\d*$)','GUIDETOPHARMACOLOGY_ID'),
        ('(\d*$)','ALLERGOME_ID'),
        ('(\d*$)','PEROXIBASE_ID'),
        ('(\d*$)','REBASE_ID'),
        ('(\d*$)','DNASU_ID'),
        ('(\d*$)','DMDM_ID'),

        ('(DIP-\d*N$)','DIP_ID'),
        ('(MINT-\d*)','MINT_ID'),
        ('(9606\.ENSP\d*)','STRING_ID'),
        ('(CHEMBL\d*)','CHEMBL_ID'),
        ('(DB\d*)','DRUGBANK_ID'),
        ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
        ('NOTHING POPS UP','MYCOCLAP_ID'),      # placeholder: never matches real IDs
        ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
        ('NOTHING POPS UP','WORLD_2DPAGE_ID'),  # placeholder: never matches real IDs
        # Fixed: category was the corrupted string 'ENSEMBAF85406.1BL_ID'
        # (an accession pasted into the identifier), which is not a valid
        # mapping-service code.
        ('(ENSG\d*)','ENSEMBL_ID'),
        ('(ENSP\d+)','ENSEMBL_PRO_ID'),
        ('(ENST\d+)','ENSEMBL_TRS_ID'),
        (' ','ENSEMBLGENOME_ID'),       # placeholder: no reliable pattern known
        (' ','ENSEMBLGENOME_PRO_ID'),   # placeholder
        (' ','ENSEMBLGENOME_TRS_ID'),   # placeholder
        ('(hsa:\d*)','KEGG_ID'),
        ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
        ('(.*[CKN]OG\d*)','EGGNOG_ID')
    ]
    n = re.match('\d*$', header)
    if n:  # For ambiguous number-only ID types: ask the service which ones map
        numIDtypes = [
            'P_ENTREZGENEID',
            'P_GI',
            'DMDM_ID',
            'BIOGRID_ID',
            'GUIDETOPHARMACOLOGY_ID',
            'ALLERGOME_ID',
            'PEROXIBASE_ID',
            'REBASE_ID',
            'DNASU_ID']
        ambiguous = []
        for numIDs in numIDtypes:
            nm = getIdMapping([header], numIDs, 'ACC', numReturn=True)
            if nm is not None:
                ambiguous.append(nm)
        matchType.append(ambiguous)
        return (header, ambiguous)
    for pat, cat in fmts:
        m = re.match(pat, header)
        if m:
            matchType.append([cat])
            # First non-None group is the accession.  The old m.groups()[0]
            # returned None whenever a later alternative of an '(a)|(b)'
            # pattern matched (e.g. the 2nd/3rd EMBL_ID alternatives).
            acc = next((g for g in m.groups() if g is not None), m.group(0))
            return (acc, [cat])
    matchType.append(['ACC+ID'])
    return (header, ['ACC+ID'])
|
|
|
401
|
|
|
402
|
|
|
def read_tabular(filepath, col):
    """Return the values of 0-based column *col* from a TAB-separated file.

    Blank lines and lines whose first character is '#' are skipped.
    """
    accessions = []
    with open(filepath) as tab_file:
        for row in tab_file:
            if not row.strip() or row.startswith('#'):
                continue
            accessions.append(row.rstrip('\n').split('\t')[col])
    return accessions
|
|
|
413
|
|
|
def get_fasta_entries(fp):
    """Yield (header, sequence) tuples from a FASTA stream.

    *header* is the full '>' line (right-stripped) and *sequence* is the
    concatenation of all following sequence lines.
    """
    header = None
    parts = []
    for raw in fp:
        raw = raw.rstrip()
        if not raw.startswith(">"):
            parts.append(raw)
            continue
        if header:
            yield (header, ''.join(parts))
        header, parts = raw, []
    if header:
        yield (header, ''.join(parts))
|
|
|
424
|
|
|
def read_fasta(filepath):
    """Return the accession inferred from each FASTA header in *filepath*."""
    with open(filepath) as fasta_file:
        return [getFastaAccession(entry_id)
                for entry_id, _seq in get_fasta_entries(fasta_file)]
|
|
|
431
|
|
|
def read_mzid(fp):
    """Collect the 'accession' attribute of every DBSequence element in an
    mzIdentML document (namespace-agnostic tag match)."""
    accessions = []
    for event, node in ET.iterparse(fp):
        if event != 'end':
            continue
        if re.search('DBSequence', node.tag):
            accessions.append(node.attrib['accession'])
    return accessions
|
|
|
439
|
|
|
def read_pepxml(fp):
    """Collect the 'protein' attribute of every search_hit element in a
    pepXML document (namespace-agnostic tag match)."""
    proteins = []
    for event, node in ET.iterparse(fp):
        if event != 'end':
            continue
        if re.search('search_hit', node.tag):
            proteins.append(node.get('protein'))
    return proteins
|
|
|
447
|
|
|
def getUniprotSequence(uniprot_id):
    """Fetch the FASTA sequence for *uniprot_id* from uniprot.org.

    Returns the first non-empty sequence found, or '' when the entry has
    no sequence data.
    """
    url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
    # Debug trace goes to stderr: the old 'print url' wrote to stdout, which
    # may be the tabular report itself (outputFile defaults to sys.stdout).
    sys.stderr.write("%s\n" % url)
    fp = urllib2.urlopen(url)
    for id, seq in get_fasta_entries(fp):
        if seq:
            return seq
    return ''
|
|
|
456
|
|
|
457
|
|
|
458
|
|
|
459
|
|
|
def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,numReturn=None,crossReference=None,idMaps=None):
    """POST one batch of *accessions* to the uniprot.org mapping service.

    accessions     IDs to map (caller chunks these to <= 500 per request)
    from_id/to_id  mapping-service type codes (keys of idDict)
    fh             optional file handle; every raw response line is echoed to it
    idMap          optional accumulator: {input_id: {from_id: [mapped ids]}}
    numReturn      when true, return *from_id* as soon as one mapping line is
                   seen (used to probe which numeric type an input belongs to)
    crossReference True -> pivot the mapping through ACC in two hops;
                   'return' -> inner hop: collect mapped IDs and return them
    idMaps         accepted but unused here (passed by the caller for symmetry)
    """
    # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
    if not accessions:
        return
    url = 'http://www.uniprot.org/mapping/'
    # The service maps X->ACC or ACC->X directly; anything else must pivot
    # through ACC, so remember the real target and fetch ACC first.
    if to_id != 'ACC' and from_id != 'ACC':
        crossReference = True
        # NOTE(review): oldto_id is only bound on this branch.  If a caller
        # passed crossReference=True explicitly without triggering it, the
        # reference below would raise NameError -- verify callers.
        oldto_id = to_id
        to_id = 'ACC'
    #accessions= getIdMapping(accessions,from_id,'ACC',idMap=idMap,crossReference=False)
    #if accessions != None:
    #    if idMap != None:
    #        idMap[accessions[0]] = {from_id:['N/A']}
    params = {
        'from': from_id,
        'to': to_id,
        'format':'tab',
        'query': '\n'.join(accessions)
    }
    data = urllib.urlencode(params)
    request = urllib2.Request(url, data)
    contact = "" # Please set your email address here to help us debug in case of problems.
    request.add_header('User-Agent', 'Python %s' % contact)
    response = None
    # Up to three attempts before giving up on this batch.
    for i in range(3):
        try:
            response = urllib2.urlopen(request)
            break
        except Exception, e:
            # NOTE(review): the literal "%s" is never formatted with e, so
            # the actual error text is lost; exit_code=None keeps this
            # non-fatal (see warn_err).
            warn_err("%s",exit_code=None)
    if response:
        # Skip the "From<TAB>To" header line of the tab-format response
        # (Python 2 file-iterator protocol).
        response.next()
        id3 = []
        for line in response:
            # print >> sys.stderr, "idMap: %s" % line
            if fh:
                fh.write(line)
            if numReturn:
                # At least one mapping line exists -> this numeric type is
                # valid for the probed accession.
                id1,id2 = line.strip().split('\t')
                return from_id
            if idMap != None:
                id1,id2 = line.strip().split('\t')
                if crossReference==True:
                    # Second hop of the pivot: map the intermediate ACC to the
                    # originally requested type.
                    id2 = getIdMapping([id2],'ACC',oldto_id,crossReference='return',idMap={})
                    print id2
                if crossReference=='return':
                    # Inner hop: just collect the mapped IDs for the caller.
                    id3.append(id2)
                    print id3
                else:
                    # break
                    # NOTE(review): id3 is re-bound from the list above to the
                    # per-accession dict here; the final 'return id3' therefore
                    # returns whichever object was touched last.
                    if idMap.get(id1,'nothing') == 'nothing':
                        #idMap[id1] = [id2]
                        idMap[id1] = {from_id:[id2]}
                    else:
                        id3 = idMap[id1]
                        #if id2 not in id3:
                        #print id3
                        if id3.get(from_id,'nothing') == 'nothing':
                            if crossReference:
                                # id2 is already a list from the inner hop.
                                id3[from_id] = id2
                            else:
                                id3[from_id] = [id2]
                        else:
                            id4 = id3[from_id]
                            if crossReference:
                                id4.extend(id2)
                            else:
                                id4.append(id2)
            else:
                # No accumulator supplied: record pairs directly on id3.
                # NOTE(review): id3 is a list here, so id3[id1] = id2 would
                # raise TypeError -- this branch appears unreachable in
                # practice because every caller passes idMap.
                id1,id2 = line.strip().split('\t')
                id3[id1] = id2
        try:
            return id3
        except:
            return
        #idMap[id1] = id3
|
|
|
537
|
|
|
#Parse Command Line
parser = optparse.OptionParser()
# input files -- any combination may be given; their accessions are merged.
parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
# NOTE(review): 'mxIdentML' in the help text is a typo for 'mzIdentML'
# (left as-is here; help strings are runtime output).
parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
# Decoy pattern
parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' )
# filter patterns
parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
# Mapping-direction flags; legal values are the keys of idDict.
# -T may be repeated to request several target ID types (action="append").
parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
# output files
parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
# parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
(options, args) = parser.parse_args()
if options.version:
    print >> sys.stdout,"%s" % version
    sys.exit(0)
|
|
|
562
|
|
|
## Gather accessions from every input source that was supplied, in a fixed
## order; positional command-line arguments are taken as literal accessions.
accessions = []
if options.mzid:
    accessions.extend(read_mzid(options.mzid))
if options.pepxml:
    accessions.extend(read_pepxml(options.pepxml))
if options.tabular:
    accessions.extend(read_tabular(options.tabular, options.column))
if options.fasta:
    accessions.extend(read_fasta(options.fasta))
if args:
    accessions.extend(args)
|
|
|
# filter accessions
if options.decoy:
    # Strip the decoy marker (e.g. _REVERSED) from every ID.
    accessions = [re.sub(options.decoy, '', acc) for acc in accessions]
if options.include_pat:
    # Keep only IDs matching the include pattern.
    accessions = [acc for acc in accessions if re.match(options.include_pat, acc)]
if options.exclude_pat:
    # Drop IDs matching the exclude pattern.
    accessions = [acc for acc in accessions if not re.match(options.exclude_pat, acc)]
|
|
|
# Nothing left after filtering (or no inputs at all) is a hard error.
if len(accessions) < 1:
    warn_err("No accessions input!",exit_code=1)
# Open the report destination: the given path, or stdout when -o is omitted.
if options.output != None:
    try:
        outputPath = os.path.abspath(options.output)
        outputFile = open(outputPath, 'w')
    except Exception, e:
        print >> sys.stderr, "failed: %s" % e
        exit(3)
else:
    outputFile = sys.stdout
|
|
|
603
|
|
|
604
|
|
|
# Removes duplicates in accessions while preserving first-seen order
# (seen_add returns None, so the 'or' only records membership).
seen = set()
seen_add = seen.add
accessions = [x for x in accessions if not (x in seen or seen_add(x))]
# Sorts accessions into inferred ID types in a dictionary:
# id_dict maps {from_id_type: [accessions]}; matchType logs the inferred
# type list per accession, in order.
id_dict = {}
matchType = []
for header in accessions:
    id , id_types = getAccession(header,matchType)
    addAccession(id_dict,id_types,id)
#print id_dict
#print id_dict
idMaps = dict() # {to_id: idMap}
#print matchType
#print id_dict.items()
# Fetch mappings per requested target type, per inferred source type.
for to_id in options.to_id:
    idMap = dict()
    idMaps[to_id] = idMap
    for (from_id,ids) in id_dict.items():
        # limit the requests to 500 at a time
        # (Python 2: range() returns a list, so append works here)
        idx = range(0,len(ids),500)
        idx.append(len(ids))
        for i in range(len(idx)-1):
            getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps)
#Write output
#Output Table Header: the input-type column, then one 18-char column per -T.
outputFile.write("\n#%-17s" % (options.from_id))
for t in options.to_id:
    outputFile.write("%-18s" % t)
outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")
|
|
|
635
|
|
|
# Create an output-friendly matrix: one row per accession, one cell-list per
# requested to_id column.
idArray = [[[] for _ in range(len(options.to_id))] for _ in range(len(accessions))]
for a, acc in enumerate(accessions):
    idLength = 0  # tallest column for this accession -> number of report lines
    for i, to_id in enumerate(options.to_id):
        idArrayColumn = []
        # .get() instead of [acc]: an accession the service returned no
        # mapping for used to raise KeyError and abort the whole report;
        # now it simply produces an empty row.
        idPairs = idMaps[to_id].get(acc, {})  # {from_id: [mapped ids]}
        for from_id in idPairs:
            for id_ in idPairs[from_id]:
                idArrayColumn.append("%s[%s]" % (id_, from_id))
        if idLength < len(idArrayColumn):
            idLength = len(idArrayColumn)
        idArray[a][i].extend(idArrayColumn)
    # Emit idLength lines, repeating the accession in the first column;
    # columns shorter than idLength are padded with blanks.
    for y in range(idLength):
        outputFile.write("%-18s" % acc)
        for x in range(len(options.to_id)):
            try:
                outputFile.write("%-18s" % idArray[a][x][y])
            except IndexError:  # this column has fewer than idLength entries
                outputFile.write(" " * 18)
        outputFile.write("\n")
# Output Matrix


#print idMaps
# python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
stop = timeit.default_timer()
outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))
|
|
|
666
|
|
|
# Script entry point.
if __name__ == "__main__":
    __main__()
|
|
|
668
|