annotate uniprot_ID_mapper/uniprot_id_mapping.py @ 3:6651ac4651f0 draft default tip

Uploaded
author rsajulga
date Thu, 16 Jul 2015 16:41:35 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
1 #!/usr/bin/env python
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
2 """
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
3 #
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
4 #------------------------------------------------------------------------------
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
5 # University of Minnesota
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
6 # Copyright 2015, Regents of the University of Minnesota
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
7 #------------------------------------------------------------------------------
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
8 # Author:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
9 #
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
10 # James E Johnson
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
11 #
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
12 #------------------------------------------------------------------------------
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
13 """
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
14 import json
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
15 import logging
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
16 import optparse
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
17 from optparse import OptionParser
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
18 import os
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
19 import sys
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
20 import re
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
21 import urllib
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
22 import urllib2
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
23 import timeit
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
24 start = timeit.default_timer()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
25 try:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
26 import xml.etree.cElementTree as ET
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
27 except ImportError:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
28 import xml.etree.ElementTree as ET
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
29
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
30 """print
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
32 http://www.uniprot.org/help/uploadlists
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
33
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
34 From input files or stdin get accessions/IDs to map.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
35 An input option can specify what id_type the input accessions/IDs are,
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
37
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
38 input_dict = dict()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
39 for id_string in input_ids:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
40 (id,id_type) = getAccession(id_string)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
41 if not id_type in input_dict:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
42 input_dict[id_type] = []
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
43 input_dict[id_type].append(id)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
44
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
45
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
46 # We need to retrieve by from_id_type and to_id_type
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
47 for id_type in input_dict:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
48 for
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
49
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
50
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
51 tabular output:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
52 #input_id to_id to_id
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
53
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
54
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
55 """
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def warn_err(msg, exit_code=1):
    """Write a message to stderr and optionally terminate the program.

    msg is written verbatim; no newline is appended, so callers supply
    their own line ending.

    exit_code is passed to sys.exit() when it is truthy.  Passing 0 (or
    None) reports the message and returns normally.
    """
    sys.stderr.write(msg)
    if exit_code:
        sys.exit(exit_code)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
60
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
61 def __main__():
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
62 version = '1.0'
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
63 idDict = {
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
64 #Category:UniProt
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
65 'ACC+ID':'UniProtKB AC/ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
66 'ACC':'UniProtKB AC',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
67 'ID':'UniProtKB ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
68 'UPARC':'UniParc',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
69 'NF50':'UniRef50',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
70 'NF90':'UniRef90',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
71 'NF100':'UniRef100',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
72 'GENENAME':'Gene name',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
73 #Category:Other sequence databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
74 'EMBL_ID':'EMBL/GenBank/DDBJ',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
75 'EMBL':'EMBL/GenBank/DDBJ CDS',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
76 'PIR':'PIR',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
77 'UNIGENE_ID':'UniGene',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
78 'P_ENTREZGENEID':'Entrez Gene (GeneID)',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
79 'P_GI':'GI number*',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
80 'P_REFSEQ_AC':'RefSeq Protein',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
81 'REFSEQ_NT_ID':'RefSeq Nucleotide',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
82 #Category:3D structure databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
83 'PDB_ID':'PDB',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
84 'DISPROT_ID':'DisProt',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
85 #Category:Protein-protein interaction databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
86 'BIOGRID_ID':'BioGrid',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
87 'DIP_ID':'DIP',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
88 'MINT_ID':'MINT',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
89 'STRING_ID':'STRING',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
90 #Category:Chemistry
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
91 'CHEMBL_ID':'ChEMBL',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
92 'DRUGBANK_ID':'DrugBank',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
93 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
94 #Category:Protein family/group databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
95 'ALLERGOME_ID':'Allergome',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
96 'MEROPS_ID':'MEROPS',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
97 'MYCOCLAP_ID':'mycoCLAP',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
98 'PEROXIBASE_ID':'PeroxiBase',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
99 'REBASE_ID':'REBASE',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
100 'TCDB_ID':'TCDB',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
101 #Category:Polymorphism databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
102 'DMDM_ID':'DMDM',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
103 #Category:2D gel databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
104 'WORLD_2DPAGE_ID':'World-2DPAGE',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
105 #Category:Protocols and materials databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
106 'DNASU_ID':'DNASU',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
107 #Category:Genome annotation databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
108 'ENSEMBL_ID':'Ensembl',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
109 'ENSEMBL_PRO_ID':'Ensembl Protein',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
110 'ENSEMBL_TRS_ID':'Ensembl Transcript',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
111 'ENSEMBLGENOME_ID':'Ensembl Genomes',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
112 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
113 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
114 'P_ENTREZGENEID':'GeneID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
115 'KEGG_ID':'KEGG',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
116 'PATRIC_ID':'PATRIC',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
117 'UCSC_ID':'UCSC',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
118 'VECTORBASE_ID':'VectorBase',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
119 #Category:Organism-specific gene databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
120 'ARACHNOSERVER_ID':'ArachnoServer',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
121 'CGD':'CGD',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
122 'CONOSERVER_ID':'ConoServer',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
123 'CYGD_ID':'CYGD',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
124 'DICTYBASE_ID':'dictyBase',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
125 'ECHOBASE_ID':'EchoBASE',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
126 'ECOGENE_ID':'EcoGene',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
127 'EUHCVDB_ID':'euHCVdb',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
128 'EUPATHDB_ID':'EuPathDB',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
129 'FLYBASE_ID':'FlyBase',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
130 'GENECARDS_ID':'GeneCards',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
131 'GENEFARM_ID':'GeneFarm',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
132 'GENOLIST_ID':'GenoList',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
133 'H_INVDB_ID':'H-InvDB',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
134 'HGNC_ID':'HGNC',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
135 'HPA_ID':'HPA',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
136 'LEGIOLIST_ID':'LegioList',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
137 'LEPROMA_ID':'Leproma',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
138 'MAIZEGDB_ID':'MaizeGDB',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
139 'MIM_ID':'MIM',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
140 'MGI_ID':'MGI',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
141 'NEXTPROT_ID':'neXtProt',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
142 'ORPHANET_ID':'Orphanet',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
143 'PHARMGKB_ID':'PharmGKB',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
144 'POMBASE_ID':'PomBase',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
145 'PSEUDOCAP_ID':'PseudoCAP',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
146 'RGD_ID':'RGD',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
147 'SGD_ID':'SGD',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
148 'TAIR_ID':'TAIR',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
149 'TUBERCULIST_ID':'TubercuList',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
150 'WORMBASE_ID':'WormBase',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
151 'WORMBASE_TRS_ID':'WormBase Transcript',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
152 'WORMBASE_PRO_ID':'WormBase Protein',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
153 'XENBASE_ID':'Xenbase',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
154 'ZFIN_ID':'ZFIN',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
155 #Category:Phylogenomic databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
156 'EGGNOG_ID':'eggNOG',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
157 'GENETREE_ID':'GeneTree',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
158 'HOGENOM_ID':'HOGENOM',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
159 'HOVERGEN_ID':'HOVERGEN',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
160 'KO_ID':'KO',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
161 'OMA_ID':'OMA',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
162 'ORTHODB_ID':'OrthoDB',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
163 'PROTCLUSTDB_ID':'ProtClustDB',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
164 'TREEFAM_ID':'TreeFam',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
165 #Category:Enzyme and pathway databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
166 'BIOCYC_ID':'BioCyc',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
167 'REACTOME_ID':'Reactome',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
168 'UNIPATHWAY_ID':'UniPathWay',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
169 #Category:Gene expression databases
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
170 'CLEANEX_ID':'CleanEx',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
171 #Category:Other
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
172 'CHITARS_ID':'ChiTaRS',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
173 'GENOMERNAI_ID':'GenomeRNAi',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
174 'GENEWIKI_ID':'GeneWiki',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
175 'NEXTBIO_ID':'NextBio'
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
176 }
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
177
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
178 """
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
179 TODO:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
180 Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
181 - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
182
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
183 # rather than an array of input accessions, we need to put them in a dict() by type
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
184 def addAccession(id_dict,id_type,id):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
185 if id_type not in id_dict:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
186 id_dict[id_type] = []-++
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
187 id_dict[id_type] = [].append(id)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
188
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
189 # returns(accession, id_type)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
190 def getAccession(header, matchType):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
191 # (format regex pattern, FROM_ID)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
192 # TODO try to determine which type of accession ID we have by matching by regular expressions
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
193 # each regex match should have a groups[0] that given the accession
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
194 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
195 fmts = [
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
196 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
197 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
198 ('>/UniRef\d+_(\w+).*','ACC+ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
199 ('>/(UPI\d+).*','ACC+ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
200 ('NP_\d+\.\d+','ACC+ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
201 ('ENSP\d+','ENSEMBL_PRO_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
202 ('ENST\d+','ENSEMBL_TRS_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
203 ]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
204 for pat,cat in fmts:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
205 m = re.match(pat,header)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
206 if m:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
207 matchType.append(cat)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
208 return (m.groups()[0],cat)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
209 matchType.append('ACC+ID')
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
210 return (header,'ACC+ID')
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
211
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
212 # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
213 idMaps = dict() # {to_id,idMap}
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
214 for to_id in options.to_id:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
215 idMap = dict()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
216 idMaps[to_id] = idMap
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
217 for (from_id,ids) id_dict.items():
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
218 # limit the requests to 500 at a time
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
219 idx = range(0,len(ids),500)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
220 idx.append(len(ids))
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
221 for i in range(len(idx)-1):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
222 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
223 """
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
224
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
225 """
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
226 Some examples of fasta ID lines From:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
227 https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
228 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
229 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
230 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
231 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
232 >OExyz (OExyz) xxx xxx xxx
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
233 >hflu_lsi_xxxx xxx xxx xxx
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
234 >C_tr_Lx_x [xxx - xxx] | xxx xxx
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
235 >M. tub.xxx|Rvxxx| xxx xxx
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
236 // Drosophile DB.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
237 // We need to find two elements:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
238 // - the accession String (retrieved as the trimmed version of everything
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
239 // up to (and NOT including) " pep:"
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
240 // - the description (everything (trimmed) starting from (and including) the " pep:".
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
241 >CGxxxxxx pep:xxxxxx
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
242 >xxxx xxx SGDID:xxxx xxx
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
243 >generic_some_tag|proten_accession|a description for this protein
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
244 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
245 // present in the Expasy FTP FASTA file.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
246 // Is formatted something like this:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
247 >XXX_YYYY (acc) rest
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
248 >sp|accession|ID descr rest (including taxonomy, if available
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
249 >tr|accession|ID descr rest (including taxonomy, if available)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
250 >nxp|NX_P02768-1|ALB|Serum albumin|Iso
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
251 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
252 // present in the Expasy FTP FASTA file.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
253 // Is formatted something like this:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
254 >accession|ID descr rest (including taxonomy, if available)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
255 // Flybase FASTA format.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
256 >FBxxx type=xxx
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
257 // A header translating a genome sequence into a protein sequences.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
258 // We need to find two elements, separated by a space:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
259 // - the accession string (retrieved as the first part of a space delimited String).
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
260 // - the nucleic acid start and stop site (between brackets, separated by a '-').
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
261 >dm345_3L-sense [234353534-234353938]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
262 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
263 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
264 // The Arabidopsis thaliana database; TAIR format
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
265 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
266 ...
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
267 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
268 // Testing for this is somewhat more complicated.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
269 // Often used simple header; looks like:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
270 // >NP0465 (NP0465) A description for this protein.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
271 // We need to find two elements:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
272 // - the accession String (easily retrieved as the next String until a space is encountered).
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
273 // - the description
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
274 >NP0465 (NP0465) A description for this protein.
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
275 GenBank gb|accession|locus
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
276 EMBL Data Library emb|accession|locus
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
277 DDBJ, DNA Database of Japan dbj|accession|locus
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
278 NBRF PIR pir||entry
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
279 Protein Research Foundation prf||name
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
280 SWISS-PROT sp|accession|entry name
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
281 Brookhaven Protein Data Bank pdb|entry|chain
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
282 Patents pat|country|number
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
283 GenInfo Backbone Id bbs|number
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
284 General database identifier gnl|database|identifier
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
285 NCBI Reference Sequence ref|accession|locus
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
286 Local Sequence identifier lcl|identifier
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
287 """
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
288
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
289
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def getFastaAccession(header):
    """Extract the accession from a FASTA header line or a bare identifier.

    The patterns below are tried in order with re.match, which anchors at
    the start of header only.  For the first pattern that matches:

    - the first capture group that took part in the match is returned;
    - if the pattern has no capture groups, the whole matched text is
      returned.

    If no pattern matches, header is returned unchanged.
    """
    # (regex pattern, id_type) pairs.  The id_type is informational only in
    # this function; only the pattern is used.
    # TODO try to determine which type of accession ID we have by matching
    # regular expressions.
    # For id_types, see:
    # http://www.uniprot.org/help/programmatic_access#id_mapping_examples
    # NOTE(review): the generic 'ID' pattern also matches "UniRefNN_xxx"
    # identifiers, so the UniRef patterns after it look unreachable for
    # such input.  Order is kept as found; confirm the intended precedence.
    fmts = [
        (r'>?(?:sp|tr|sw)\|(\w+).*', 'ACC+ID'),
        # The '|' after "gi" must be escaped, otherwise the pattern becomes
        # an alternation whose first branch captures nothing.
        (r'>?gi\|\d+\|ref\|(NP_\d+\.\d+).*', 'ACC+ID'),
        (r'NP_\d+\.\d+', 'ACC+ID'),
        (r'([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})', 'ACC'),
        (r'(...*)_....*', 'ID'),
        (r'>?(UPI\d+).*', 'UPARC'),
        (r'>?UniRef50_(\w+).*', 'NF50'),
        (r'UniRef90_(\w+).*', 'NF90'),
        (r'UniRef100_(\w+).*', 'NF100'),
        (r'([A-C][A-Z]?\d{6})|([D-Z]\d{5})', 'EMBL_ID'),
        (r'ENSP\d+', 'ENSEMBL_PRO_ID'),
        (r'ENST\d+', 'ENSEMBL_TRS_ID'),
    ]
    for pat, _id_type in fmts:
        m = re.match(pat, header)
        if m:
            # With alternations only one group participates; skip the
            # groups that are None instead of returning None.
            captured = [g for g in m.groups() if g is not None]
            if captured:
                return captured[0]
            # Pattern without capture groups: the match itself is the ID.
            return m.group(0)
    return header
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
316
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
317 # rather than an array of input accessions, we need to put them in a dict() by type
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def addAccession(id_dict, id_type, id):
    """Record id in id_dict under each given id_type.

    id_dict maps an id_type to the list of IDs collected for it, and is
    modified in place.

    id_type is an iterable of id_type names.  A single name given as a
    plain string is treated as one type rather than being iterated
    character by character.

    id is appended to the list of every id_type named; missing lists are
    created on first use.
    """
    try:
        text_types = basestring  # Python 2: covers str and unicode
    except NameError:
        text_types = str  # Python 3
    if isinstance(id_type, text_types):
        id_types = [id_type]
    else:
        id_types = id_type
    for idtype in id_types:
        id_dict.setdefault(idtype, []).append(id)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
323
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
324 # returns(accession, id_type)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
325 def getAccession(header):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
326 # (format regex pattern, FROM_ID)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
327 # TODO try to determine which type of accession ID we have by matching by regular expressions
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
328 # each regex match should have a groups[0] that given the accession
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
329 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
330 fmts = [
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
331 ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
332 ('.*_[A-Z]*','ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
333 ('UPI(\d+).*','UPARC'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
334 ('(UniRef50_\w+.*)','NF50'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
335 ('UniRef90_(\w+).*','NF90'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
336 ('(UniRef100_\w+.*)','NF100'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
337 ('a[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}','GENENAME'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
338
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
339 ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
340 ('K\d*','KO_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
341
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
342 ('([A-Z]*\d*\.\d$)','EMBL'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
343 ('([IBC]\d{5})','PIR'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
344 ('(Hs\.\d*)','UNIGENE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
345 ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
346 ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
347 ('(\d[A-Z0-9]{3})','PDB_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
348 ('(DP\d{5})','DISPROT_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
349 ('(DIP-\d*N$)','DIP_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
350 ('(MINT-\d*)','MINT_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
351 ('(9606\.ENSP\d*)','STRING_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
352 ('(CHEMBL\d*)','CHEMBL_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
353 ('(DB\d*)','DRUGBANK_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
354 ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
355
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
356 ('[A-Z]*_[A-Z]*','MYCOCLAP_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
357
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
358 ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
359 ('\d{4}:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','WORLD_2DPAGE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
360 ('ENS.?*G\d*','ENSEMBL_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
361 ('ENS.?*P\d*','ENSEMBL_PRO_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
362 ('ENS.?*T\d*','ENSEMBL_TRS_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
363 (' ','ENSEMBLGENOME_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
364 (' ','ENSEMBLGENOME_PRO_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
365 (' ','ENSEMBLGENOME_TRS_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
366 ('(hsa:\d*)','KEGG_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
367 ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
368 ('VEL:J:LDKJFS','VECTORBASE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
369 ('AS\d*','ARACHNOSERVER_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
370 ('CAL\d*','CGD'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
371 ('Y[A-Z]{2}[0-9]{3}[cw].*|Q\d{4}|R\d{4}[wc]','CYGD_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
372 ('DDB_G\d*','DICTYBASE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
373 ('EB\d{4}','ECHOBASE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
374 ('EG\d{5}','ECOGENE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
375 ('(?:HM|G[QU]|F[JMN]|E[FU]|DQ|A[A-Z])\d{6}|[DLMSUXYZ]\d{5}','EUHCVDB_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
376 ('[A-Z][a-z]*DB:.*','EUPATHDB_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
377 ('FBgn\d*','FLYBASE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
378 ('GC[A-Z0-9]{9}','GENECARDS_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
379 ('(?:BSU\d|gbs|LIN|LMO|MUL_|MYPU_|pl[iu])\d{4}|MUP\d{3}c?','GENOLIST_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
380 ('HIX\d*','H_INVDB_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
381 ('HGNC:\d*','HGNC_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
382 ('(?:CAB|HPA)\d{6}','HPA_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
383 ('lp[lp]\d{4}','LEGIOLIST_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
384 ('ML\d{4}','LEPROMA_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
385 ('MGI:\d*','MGI_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
386 ('NX_(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','NEXTPROT_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
387 ('PA\d{3,5,9}','PHARMGKB_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
388 ('SP.*\.\d\dc?,'POMBASE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
389 ('PA\d{4}','PSEUDOCAP_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
390 ('S\d{9}','SGD_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
391 ('AT.G\d{5}','TAIR_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
392 ('Rv\d{4}c?','TUBERCULIST_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
393 ('WBGene\d{8}','WORMBASE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
394 ('(\dR(SSE|\d\d?)|A[CH]\d\d?|[BC]\d\d[A-Z]\d)\.\d.[0-9a-z]?|CBG\d{5}|C[CD][48]\.\d.[a-z]?|CE7X_3\.1|cTel5\dX\.1[ab]?|D\d{4}\.\d.[a-z]?,'WORMBASE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
395 ('C(BP|E)\d{5}','WORMBASE_PRO_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
396 ('XB-GENE-\d*','XENBASE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
397 ('ZDB-GENE-\d{6}-\d{4}','ZFIN_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
398
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
399
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
400 ('(.*[CKN]OG\d*)','EGGNOG_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
401 ('(ENSGT\d*)','GENETREE_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
402 ('HOG\d*','HOGENOM_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
403 ('HBG\d*','HOVERGEN_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
404
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
405 ('[A-Z]{7}','OMA_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
406
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
407 ('EOG\d*','ORTHODB_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
408 ('asdfsadfasdf','PROTCLUSTDB_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
409 ('TF\d*','TREEFAM_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
410 ('REACT_\d*','REACTOME_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
411 ('asdfasl;DF:LJk','UNIPATHWAY_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
412 ('HS_\d*','CLEANEX_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
413 ('SAMEAS GENE NAME','CHITARS_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
414 ('GENENAME_\(gene\)','GENEWIKI_ID'),
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
415 ]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
416 # remove the need for .groups() (i.e. parentheses)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
417 if re.match('\d*$', header): # For ambiguous number only ID types
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
418 numIDtypes = [
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
419 'P_ENTREZGENEID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
420 'P_GI',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
421 'DMDM_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
422 'BIOGRID_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
423 'GUIDETOPHARMACOLOGY_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
424 'ALLERGOME_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
425 'PEROXIBASE_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
426 'REBASE_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
427 'DNASU_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
428 'GENOMERNAI_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
429 'NEXTBIO_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
430 'CONOSERVER_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
431 'GENEFARM_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
432 'MAIZEGDB_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
433 'MIM_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
434 'MGI_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
435 'ORPHANET_ID',
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
436 'RGD_ID']
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
437 ambiguous = []
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
438 for numIDs in numIDtypes:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
439 nm = getIdMapping([header],numIDs,'ACC',ambiguity=True)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
440 if nm != None:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
441 ambiguous.append(nm)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
442 if ambiguous == []:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
443 ambiguous.append('No Match')
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
444 return (header, ambiguous)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
445 for pat,cat in fmts:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
446 m = re.match(pat,header)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
447 if m:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
448 #return (m.groups()[0],[cat])
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
449 return (header,[cat])
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
450 return (header,['ACC+ID'])
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
451
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
452
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def read_tabular(filepath, col):
    """Collect the values of one column of a tab-separated file.

    Blank lines and lines starting with '#' are ignored.  Rows that do not
    have a field at index ``col`` are skipped instead of aborting the whole
    run with an IndexError.

    filepath -- path of the tabular file to read
    col      -- index of the column to extract (zero-based)

    Returns the extracted field values as a list of strings, in file order.
    """
    accessions = []
    with open(filepath) as fp:
        for line in fp:
            if line.strip() == '' or line.startswith('#'):
                continue
            # Strip '\r' as well as '\n': with Windows line endings the last
            # column would otherwise keep a trailing carriage return.
            fields = line.rstrip('\r\n').split('\t')
            try:
                accessions.append(fields[col])
            except IndexError:
                # Row is too short to contain the requested column.
                continue
    return accessions
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
463
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def get_fasta_entries(fp):
    """Yield (header, sequence) tuples from an iterable of FASTA lines.

    fp -- any iterable of text lines (an open file, a list, ...)

    The header is the complete '>' line with trailing whitespace removed.
    The sequence is the concatenation of the right-stripped lines that follow
    it.  Lines that appear before the first header are discarded.
    """
    header = None
    chunks = []
    for raw in fp:
        text = raw.rstrip()
        if not text.startswith(">"):
            chunks.append(text)
            continue
        # A new header closes the entry collected so far, if there is one.
        if header:
            yield (header, ''.join(chunks))
        header = text
        chunks = []
    # Emit the final entry, which has no following header to close it.
    if header:
        yield (header, ''.join(chunks))
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
474
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def read_fasta(filepath):
    """Return one accession per entry of the FASTA file at ``filepath``.

    Each entry's header line is passed to getFastaAccession() and the results
    are returned as a list in file order; the sequences themselves are not
    used.
    """
    with open(filepath) as handle:
        return [getFastaAccession(entry[0])
                for entry in get_fasta_entries(handle)]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
481
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def read_mzid(fp):
    """Return the 'accession' attribute of every DBSequence element.

    fp -- file name or file object handed to ET.iterparse()

    An element is selected when its tag contains 'DBSequence' (this also
    matches namespace-qualified tags).  A selected element without an
    'accession' attribute raises KeyError.
    """
    return [node.attrib['accession']
            for event, node in ET.iterparse(fp)
            if event == 'end' and re.search('DBSequence', node.tag)]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
489
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def read_pepxml(fp):
    """Return the 'protein' attribute of every search_hit element.

    fp -- file name or file object handed to ET.iterparse()

    An element is selected when its tag contains 'search_hit'.  Selected
    elements that carry no 'protein' attribute are skipped, so the returned
    list contains strings only.
    """
    accessions = []
    for event, elem in ET.iterparse(fp):
        if event == 'end' and re.search('search_hit', elem.tag):
            protein = elem.get('protein')
            # elem.get() returns None for a missing attribute; a None entry
            # would break the regular-expression handling applied to the
            # accession list later on.
            if protein is not None:
                accessions.append(protein)
    return accessions
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
497
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def getUniprotSequence(uniprot_id):
    """Download the FASTA record of a UniProt entry and return its sequence.

    uniprot_id -- identifier substituted into
                  http://www.uniprot.org/uniprot/<uniprot_id>.fasta

    Returns the first non-empty sequence found in the response, or '' when
    the response contains none.  Errors raised by urllib2.urlopen() propagate
    to the caller.
    """
    url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
    # Trace of the requested URL on stdout, kept from the original.
    print(url)
    fp = urllib2.urlopen(url)
    try:
        for header, seq in get_fasta_entries(fp):
            if seq:
                return seq
    finally:
        # Close the HTTP response on every path, including the early return.
        fp.close()
    return ''
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
506
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
507
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
508
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
509
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
def getIdMapping(accessions, from_id, to_id, fh=None, idMap=None,
                 ambiguity=None, crossReference=None, idMaps=None):
    """Map identifiers through the UniProt ID-mapping service.

    accessions     -- list of identifiers sent in a single request
    from_id        -- service abbreviation of the type of ``accessions``
    to_id          -- service abbreviation of the requested target type
    fh             -- optional file-like object; every data line of the
                      response is written to it unchanged
    idMap          -- optional dict filled in place with
                      {accession: {id_type: [mapped ids]}}
    ambiguity      -- when true the call is only a probe: it returns
                      ``from_id`` as soon as the service answers with at
                      least one data line
    crossReference -- optional [accession, id_type] pair; when given, mapped
                      ids are stored under that pair instead of under the
                      identifier echoed by the service and ``from_id``
    idMaps         -- accepted for call compatibility; not used

    When neither ``from_id`` nor ``to_id`` is 'ACC' the mapping is done in
    two requests rounds, from_id -> ACC -> to_id.

    Returns ``from_id`` for a successful probe, otherwise None.
    """
    if not accessions:
        return
    url = 'http://www.uniprot.org/mapping/'

    # Cross referencing: neither side is a UniProtKB accession ('ACC'), so map
    # to ACC first and then map every ACC found to the requested type.
    if to_id != 'ACC' and from_id != 'ACC':
        first_hop = {}
        getIdMapping(accessions, from_id, 'ACC', idMap=first_hop)
        second_hop = {}
        for acc, by_type in first_hop.items():
            for id_type, uniprot_accs in by_type.items():
                # Start the result list once per accession.  The original
                # reset it for every intermediate ACC, which threw away the
                # results of all but the last one.
                results = second_hop.setdefault(acc, {})
                results[id_type] = []
                for uniprot_acc in uniprot_accs:
                    if uniprot_acc == 'N/A':
                        # Placeholder from the first hop, not a real ACC.
                        continue
                    getIdMapping([uniprot_acc], 'ACC', to_id, idMap=second_hop,
                                 crossReference=[acc, from_id])
                if not results[id_type]:
                    results[id_type] = ['N/A']
        if idMap is not None:
            for acc, by_type in second_hop.items():
                # Merge per id type so entries already stored for this
                # accession under another type are kept.
                idMap.setdefault(acc, {}).update(by_type)
        return

    params = {
        'from': from_id,
        'to': to_id,
        'format': 'tab',
        'query': '\n'.join(accessions)
    }
    data = urllib.urlencode(params)
    request = urllib2.Request(url, data)
    contact = ""  # Please set your email address here to help us debug in case of problems.
    request.add_header('User-Agent', 'Python %s' % contact)
    response = None
    # Up to three attempts; each failure is reported without exiting.
    for attempt in range(3):
        try:
            response = urllib2.urlopen(request)
            break
        except Exception as e:
            # Report the actual error (the original passed a literal "%s").
            warn_err("%s" % e, exit_code=None)

    if response is not None:
        try:
            # The first line of a 'tab' response is the column header.
            next(response, None)
            # Debug trace on stdout, kept from the original.
            print(params)
            for line in response:
                if fh:
                    fh.write(line)
                if ambiguity:
                    # Probe mode: one data line is enough to accept from_id.
                    return from_id
                if idMap is not None:
                    fields = line.strip().split('\t')
                    if len(fields) != 2:
                        # Not a "from<TAB>to" pair (e.g. a blank line).
                        continue
                    id1, id2 = fields
                    # Debug trace on stdout, kept from the original.
                    print(id2)
                    key_type = from_id
                    if crossReference is not None:
                        id1, key_type = crossReference
                    # setdefault keeps lists already stored for this id under
                    # other id types; the original bare except replaced the
                    # whole per-id dict whenever only the type was missing.
                    idMap.setdefault(id1, {}).setdefault(key_type, []).append(id2)
        finally:
            response.close()

    # NOTE(review): the original ended by iterating the already consumed
    # response to store 'N/A', which could never record a placeholder for an
    # unmapped accession.  Marking the accessions of this request that
    # received no entry is the presumed intent -- confirm.
    if ambiguity is None and idMap is not None and crossReference is None:
        for acc in accessions:
            if acc not in idMap:
                idMap[acc] = {from_id: ['N/A']}
    return
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
569
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
570
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
571 #Parse Command Line
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
572 parser = optparse.OptionParser()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
573 # input files
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
574 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
575 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
576 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
577 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
578 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
579 # Decoy pattern
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
580 parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
581 # filter patterns
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
582 parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
583 parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
584 # Unipept Flags
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
585 parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
586 parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
587 # output files
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
588 parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
589 # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
590 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
591 (options, args) = parser.parse_args()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
592 if options.version:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
593 print >> sys.stdout,"%s" % version
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
594 sys.exit(0)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
595
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
596 accessions = []
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
597 ## Get accessions
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
598 if options.mzid:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
599 accessions += read_mzid(options.mzid)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
600 if options.pepxml:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
601 accessions += read_pepxml(options.pepxml)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
602 if options.tabular:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
603 accessions += read_tabular(options.tabular,options.column)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
604 if options.fasta:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
605 accessions += read_fasta(options.fasta)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
606 if args and len(args) > 0:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
607 for i,accession in enumerate(args):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
608 accessions.append(accession)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
609 # filter accessions
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
610 if options.decoy:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
611 filtered_accs = [re.sub(options.decoy,'',x) for x in accessions]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
612 accessions = filtered_accs
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
613 if options.include_pat:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
614 filtered_accs = []
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
615 for acc in accessions:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
616 if re.match(options.include_pat,acc):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
617 filtered_accs.append(acc)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
618 accessions = filtered_accs
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
619 if options.exclude_pat:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
620 filtered_accs = []
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
621 for acc in accessions:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
622 if not re.match(options.exclude_pat,acc):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
623 filtered_accs.append(acc)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
624 accessions = filtered_accs
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
625 if len(accessions) < 1:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
626 warn_err("No accessions input!",exit_code=1)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
627 if options.output != None:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
628 try:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
629 outputPath = os.path.abspath(options.output)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
630 outputFile = open(outputPath, 'w')
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
631 except Exception, e:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
632 print >> sys.stderr, "failed: %s" % e
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
633 exit(3)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
634 else:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
635 outputFile = sys.stdout
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
636
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
637
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
638 # Removes duplicates in accessions
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
639 seen = set()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
640 seen_add = seen.add
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
641 accessions = [x for x in accessions if not (x in seen or seen_add(x))]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
642 # Sorts accessions by inferred ID type in a dictionary
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
643 id_dict = {}
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
644 for header in accessions:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
645 id , id_types = getAccession(header)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
646 addAccession(id_dict,id_types,id)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
647 idMaps = dict() # {to_id,idMap}
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
648 for to_id in options.to_id:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
649 idMap = dict()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
650 idMaps[to_id] = idMap
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
651 for (from_id,ids) in id_dict.items():
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
652 # limit the requests to 500 at a time
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
653 idx = range(0,len(ids),500)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
654 idx.append(len(ids))
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
655 for i in range(len(idx)-1):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
656 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
657 print ids
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
658 #Write output
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
659 #Output Table Header
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
660 outputFile.write("\n#%-17s" % (options.from_id))
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
661 for t in options.to_id:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
662 outputFile.write("%-18s" % t)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
663 outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
664
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
665 # Create an output-friendly matrix
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
666 idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
667 for a, acc in enumerate(accessions):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
668 idLength = 0
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
669 for i, to_id in enumerate(options.to_id): # [[ids],[ids]]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
670 idArrayColumn = []
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
671 idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
672 for from_id in idPairs:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
673 ids = idPairs[from_id]
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
674 for id_ in ids:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
675 idArrayColumn.append("%s[%s]" % (id_,from_id))
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
676 if idLength < len(idArrayColumn):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
677 idLength = len(idArrayColumn)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
678 idArray[a][i].extend(idArrayColumn)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
679 for y in range(idLength):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
680 outputFile.write("%-18s" % acc)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
681 for x in range(len(options.to_id)):
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
682 try:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
683 outputFile.write("%-18s" % idArray[a][x][y])
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
684 except:
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
685 outputFile.write(" " * 18)
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
686 outputFile.write("\n")
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
687 # Output Matrix
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
688
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
689
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
690
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
691 #print idMaps
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
692 # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
693 stop = timeit.default_timer()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
694 outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
695
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
696 if __name__ == "__main__" : __main__()
6651ac4651f0 Uploaded
rsajulga
parents:
diff changeset
697