Mercurial > repos > rsajulga > uniprot_id_mapper
comparison uniprot_ID_mapper/uniprot_id_mapping_3.py @ 3:6651ac4651f0 draft default tip
Uploaded
| author | rsajulga |
|---|---|
| date | Thu, 16 Jul 2015 16:41:35 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:d0311668d442 | 3:6651ac4651f0 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 # | |
| 4 #------------------------------------------------------------------------------ | |
| 5 # University of Minnesota | |
| 6 # Copyright 2015, Regents of the University of Minnesota | |
| 7 #------------------------------------------------------------------------------ | |
| 8 # Author: | |
| 9 # | |
| 10 # James E Johnson | |
| 11 # | |
| 12 #------------------------------------------------------------------------------ | |
| 13 """ | |
| 14 import json | |
| 15 import logging | |
| 16 import optparse | |
| 17 from optparse import OptionParser | |
| 18 import os | |
| 19 import sys | |
| 20 import re | |
| 21 import urllib | |
| 22 import urllib2 | |
| 23 import timeit | |
| 24 start = timeit.default_timer() | |
| 25 try: | |
| 26 import xml.etree.cElementTree as ET | |
| 27 except ImportError: | |
| 28 import xml.etree.ElementTree as ET | |
| 29 | |
| 30 """print | |
| 31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples | |
| 32 http://www.uniprot.org/help/uploadlists | |
| 33 | |
| 34 From input files or stdin get accessions/IDs to map. | |
| 35 An input option can specify what id_type the input accessions/IDs are, | |
| 36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type | |
| 37 | |
| 38 input_dict = dict() | |
| 39 for id_string in input_ids: | |
| 40 (id,id_type) = getAccession(id_string) | |
| 41 if not id_type in input_dict: | |
| 42 input_dict[id_type] = [] | |
| 43 input_dict[id_type].append(id) | |
| 44 | |
| 45 | |
| 46 # We need to retrieve by from_id_type and to_id_type | |
| 47 for id_type in input_dict: | |
| 48 for | |
| 49 | |
| 50 | |
| 51 tabular output: | |
| 52 #input_id to_id to_id | |
| 53 | |
| 54 | |
| 55 """ | |
def warn_err(msg, exit_code=1):
    """Write *msg* to stderr; terminate the process when exit_code is truthy."""
    sys.stderr.write(msg)
    if not exit_code:
        return
    sys.exit(exit_code)
| 61 def __main__(): | |
| 62 version = '1.0' | |
| 63 idDict = { | |
| 64 #Category:UniProt | |
| 65 'ACC+ID':'UniProtKB AC/ID', | |
| 66 'ACC':'UniProtKB AC', | |
| 67 'ID':'UniProtKB ID', | |
| 68 'UPARC':'UniParc', | |
| 69 'NF50':'UniRef50', | |
| 70 'NF90':'UniRef90', | |
| 71 'NF100':'UniRef100', | |
| 72 #Category:Other sequence databases | |
| 73 'EMBL_ID':'EMBL/GenBank/DDBJ', | |
| 74 'EMBL':'EMBL/GenBank/DDBJ CDS', | |
| 75 'PIR':'PIR', | |
| 76 'UNIGENE_ID':'UniGene', | |
| 77 'P_ENTREZGENEID':'Entrez Gene (GeneID)', | |
| 78 'P_GI':'GI number*', | |
| 79 'P_REFSEQ_AC':'RefSeq Protein', | |
| 80 'REFSEQ_NT_ID':'RefSeq Nucleotide', | |
| 81 #Category:3D structure databases | |
| 82 'PDB_ID':'PDB', | |
| 83 'DISPROT_ID':'DisProt', | |
| 84 #Category:Protein-protein interaction databases | |
| 85 'BIOGRID_ID':'BioGrid', | |
| 86 'DIP_ID':'DIP', | |
| 87 'MINT_ID':'MINT', | |
| 88 'STRING_ID':'STRING', | |
| 89 #Category:Chemistry | |
| 90 'CHEMBL_ID':'ChEMBL', | |
| 91 'DRUGBANK_ID':'DrugBank', | |
| 92 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY', | |
| 93 #Category:Protein family/group databases | |
| 94 'ALLERGOME_ID':'Allergome', | |
| 95 'MEROPS_ID':'MEROPS', | |
| 96 'MYCOCLAP_ID':'mycoCLAP', | |
| 97 'PEROXIBASE_ID':'PeroxiBase', | |
| 98 'REBASE_ID':'REBASE', | |
| 99 'TCDB_ID':'TCDB', | |
| 100 #Category:Polymorphism databases | |
| 101 'DMDM_ID':'DMDM', | |
| 102 #Category:2D gel databases | |
| 103 'WORLD_2DPAGE_ID':'World-2DPAGE', | |
| 104 #Category:Protocols and materials databases | |
| 105 'DNASU_ID':'DNASU', | |
| 106 #Category:Genome annotation databases | |
| 107 'ENSEMBL_ID':'Ensembl', | |
| 108 'ENSEMBL_PRO_ID':'Ensembl Protein', | |
| 109 'ENSEMBL_TRS_ID':'Ensembl Transcript', | |
| 110 'ENSEMBLGENOME_ID':'Ensembl Genomes', | |
| 111 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein', | |
| 112 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript', | |
| 113 'P_ENTREZGENEID':'GeneID', | |
| 114 'KEGG_ID':'KEGG', | |
| 115 'PATRIC_ID':'PATRIC', | |
| 116 'UCSC_ID':'UCSC', | |
| 117 'VECTORBASE_ID':'VectorBase', | |
| 118 #Category:Organism-specific gene databases | |
| 119 'ARACHNOSERVER_ID':'ArachnoServer', | |
| 120 'CGD':'CGD', | |
| 121 'CONOSERVER_ID':'ConoServer', | |
| 122 'CYGD_ID':'CYGD', | |
| 123 'DICTYBASE_ID':'dictyBase', | |
| 124 'ECHOBASE_ID':'EchoBASE', | |
| 125 'ECOGENE_ID':'EcoGene', | |
| 126 'EUHCVDB_ID':'euHCVdb', | |
| 127 'EUPATHDB_ID':'EuPathDB', | |
| 128 'FLYBASE_ID':'FlyBase', | |
| 129 'GENECARDS_ID':'GeneCards', | |
| 130 'GENEFARM_ID':'GeneFarm', | |
| 131 'GENOLIST_ID':'GenoList', | |
| 132 'H_INVDB_ID':'H-InvDB', | |
| 133 'HGNC_ID':'HGNC', | |
| 134 'HPA_ID':'HPA', | |
| 135 'LEGIOLIST_ID':'LegioList', | |
| 136 'LEPROMA_ID':'Leproma', | |
| 137 'MAIZEGDB_ID':'MaizeGDB', | |
| 138 'MIM_ID':'MIM', | |
| 139 'MGI_ID':'MGI', | |
| 140 'NEXTPROT_ID':'neXtProt', | |
| 141 'ORPHANET_ID':'Orphanet', | |
| 142 'PHARMGKB_ID':'PharmGKB', | |
| 143 'POMBASE_ID':'PomBase', | |
| 144 'PSEUDOCAP_ID':'PseudoCAP', | |
| 145 'RGD_ID':'RGD', | |
| 146 'SGD_ID':'SGD', | |
| 147 'TAIR_ID':'TAIR', | |
| 148 'TUBERCULIST_ID':'TubercuList', | |
| 149 'WORMBASE_ID':'WormBase', | |
| 150 'WORMBASE_TRS_ID':'WormBase Transcript', | |
| 151 'WORMBASE_PRO_ID':'WormBase Protein', | |
| 152 'XENBASE_ID':'Xenbase', | |
| 153 'ZFIN_ID':'ZFIN', | |
| 154 #Category:Phylogenomic databases | |
| 155 'EGGNOG_ID':'eggNOG', | |
| 156 'GENETREE_ID':'GeneTree', | |
| 157 'HOGENOM_ID':'HOGENOM', | |
| 158 'HOVERGEN_ID':'HOVERGEN', | |
| 159 'KO_ID':'KO', | |
| 160 'OMA_ID':'OMA', | |
| 161 'ORTHODB_ID':'OrthoDB', | |
| 162 'PROTCLUSTDB_ID':'ProtClustDB', | |
| 163 'TREEFAM_ID':'TreeFam', | |
| 164 #Category:Enzyme and pathway databases | |
| 165 'BIOCYC_ID':'BioCyc', | |
| 166 'REACTOME_ID':'Reactome', | |
| 167 'UNIPATHWAY_ID':'UniPathWay', | |
| 168 #Category:Gene expression databases | |
| 169 'CLEANEX_ID':'CleanEx', | |
| 170 #Category:Other | |
| 171 'CHITARS_ID':'ChiTaRS', | |
| 172 'GENOMERNAI_ID':'GenomeRNAi', | |
| 173 'GENEWIKI_ID':'GeneWiki', | |
| 174 'NEXTBIO_ID':'NextBio' | |
| 175 } | |
| 176 | |
| 177 """ | |
| 178 TODO: | |
| 179 Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file | |
| 180 - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession. | |
| 181 | |
| 182 # rather than an array of input accessions, we need to put them in a dict() by type | |
| 183 def addAccession(id_dict,id_type,id): | |
| 184 if id_type not in id_dict: | |
| 185 id_dict[id_type] = [] | |
| 186 id_dict[id_type].append(id) | |
| 187 | |
| 188 # returns(accession, id_type) | |
| 189 def getAccession(header, matchType): | |
| 190 # (format regex pattern, FROM_ID) | |
| 191 # TODO try to determine which type of accession ID we have by matching by regular expressions | |
| 192 # each regex match should have a groups[0] that given the accession | |
| 193 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples | |
| 194 fmts = [ | |
| 195 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), | |
| 196 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), | |
| 197 ('>/UniRef\d+_(\w+).*','ACC+ID'), | |
| 198 ('>/(UPI\d+).*','ACC+ID'), | |
| 199 ('NP_\d+\.\d+','ACC+ID'), | |
| 200 ('ENSP\d+','ENSEMBL_PRO_ID'), | |
| 201 ('ENST\d+','ENSEMBL_TRS_ID'), | |
| 202 ] | |
| 203 for pat,cat in fmts: | |
| 204 m = re.match(pat,header) | |
| 205 if m: | |
| 206 matchType.append(cat) | |
| 207 return (m.groups()[0],cat) | |
| 208 matchType.append('ACC+ID') | |
| 209 return (header,'ACC+ID') | |
| 210 | |
| 211 # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types | |
| 212 idMaps = dict() # {to_id,idMap} | |
| 213 for to_id in options.to_id: | |
| 214 idMap = dict() | |
| 215 idMaps[to_id] = idMap | |
| 216 for (from_id,ids) in id_dict.items(): | |
| 217 # limit the requests to 500 at a time | |
| 218 idx = range(0,len(ids),500) | |
| 219 idx.append(len(ids)) | |
| 220 for i in range(len(idx)-1): | |
| 221 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap) | |
| 222 """ | |
| 223 | |
| 224 """ | |
| 225 Some examples of fasta ID lines From: | |
| 226 https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java | |
| 227 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ... | |
| 228 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ... | |
| 229 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr | |
| 230 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein. | |
| 231 >OExyz (OExyz) xxx xxx xxx | |
| 232 >hflu_lsi_xxxx xxx xxx xxx | |
| 233 >C_tr_Lx_x [xxx - xxx] | xxx xxx | |
| 234 >M. tub.xxx|Rvxxx| xxx xxx | |
| 235 // Drosophile DB. | |
| 236 // We need to find two elements: | |
| 237 // - the accession String (retrieved as the trimmed version of everything | |
| 238 // up to (and NOT including) " pep:" | |
| 239 // - the description (everything (trimmed) starting from (and including) the " pep:". | |
| 240 >CGxxxxxx pep:xxxxxx | |
| 241 >xxxx xxx SGDID:xxxx xxx | |
| 242 >generic_some_tag|proten_accession|a description for this protein | |
| 243 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as | |
| 244 // present in the Expasy FTP FASTA file. | |
| 245 // Is formatted something like this: | |
| 246 >XXX_YYYY (acc) rest | |
| 247 >sp|accession|ID descr rest (including taxonomy, if available | |
| 248 >tr|accession|ID descr rest (including taxonomy, if available) | |
| 249 >nxp|NX_P02768-1|ALB|Serum albumin|Iso | |
| 250 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as | |
| 251 // present in the Expasy FTP FASTA file. | |
| 252 // Is formatted something like this: | |
| 253 >accession|ID descr rest (including taxonomy, if available) | |
| 254 // Flybase FASTA format. | |
| 255 >FBxxx type=xxx | |
| 256 // A header translating a genome sequence into a protein sequences. | |
| 257 // We need to find two elements, separated by a space: | |
| 258 // - the accession string (retrieved as the first part of a space delimited String). | |
| 259 // - the nucleic acid start and stop site (between brackets, separated by a '-'). | |
| 260 >dm345_3L-sense [234353534-234353938] | |
| 261 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database | |
| 262 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF | |
| 263 // The Arabidopsis thaliana database; TAIR format | |
| 264 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166 | |
| 265 ... | |
| 266 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all. | |
| 267 // Testing for this is somewhat more complicated. | |
| 268 // Often used simple header; looks like: | |
| 269 // >NP0465 (NP0465) A description for this protein. | |
| 270 // We need to find two elements: | |
| 271 // - the accession String (easily retrieved as the next String until a space is encountered). | |
| 272 // - the description | |
| 273 >NP0465 (NP0465) A description for this protein. | |
| 274 GenBank gb|accession|locus | |
| 275 EMBL Data Library emb|accession|locus | |
| 276 DDBJ, DNA Database of Japan dbj|accession|locus | |
| 277 NBRF PIR pir||entry | |
| 278 Protein Research Foundation prf||name | |
| 279 SWISS-PROT sp|accession|entry name | |
| 280 Brookhaven Protein Data Bank pdb|entry|chain | |
| 281 Patents pat|country|number | |
| 282 GenInfo Backbone Id bbs|number | |
| 283 General database identifier gnl|database|identifier | |
| 284 NCBI Reference Sequence ref|accession|locus | |
| 285 Local Sequence identifier lcl|identifier | |
| 286 """ | |
| 287 | |
| 288 | |
def getFastaAccession(header):
    """Extract a database accession from a FASTA header line.

    Tries a series of (regex, id_type) patterns in priority order and
    returns the first captured accession; falls back to returning the
    header unchanged when nothing matches.
    """
    # (format regex pattern, FROM_ID); order matters - most specific first.
    # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
    fmts = [
        (r'>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
        # FIX: '|' must be escaped or it acts as alternation; the original
        # '>?gi|...' pattern matched the bare prefix 'gi' and captured nothing.
        (r'>?gi\|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
        # FIX: pattern had no capture group, so a match crashed on groups()[0].
        (r'(NP_\d+\.\d+)','ACC+ID'),
        (r'([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
        (r'(...*)_....*','ID'),
        (r'>/(UPI\d+).*','UPARC'),
        (r'>/UniRef50_(\w+).*','NF50'),
        (r'UniRef90_(\w+).*','NF90'),
        (r'UniRef100_(\w+).*','NF100'),
        # FIX: original '(DM-Z)\d{5})' had an unbalanced parenthesis and
        # raised re.error at match time; presumably a [D-Z]\d{5} class was
        # intended -- TODO confirm against EMBL accession format.
        (r'([A-C][A-Z]?\d{6})|([D-Z]\d{5})','EMBL_ID'),
        (r'(ENSP\d+)','ENSEMBL_PRO_ID'),
        (r'(ENST\d+)','ENSEMBL_TRS_ID')
    ]
    for pat,cat in fmts:
        m = re.match(pat,header)
        if m:
            # FIX: with alternation patterns the first group may be unmatched
            # (None); return the first group that actually captured text
            # instead of blindly taking groups()[0].
            for g in m.groups():
                if g is not None:
                    return g
            return m.group(0)
    return header
| 315 | |
# Accessions are grouped by inferred type rather than kept in one flat list.
def addAccession(id_dict, id_type, id):
    """Register *id* under every type listed in *id_type*, growing
    *id_dict* in place as {id_type: [ids...]}."""
    for key in id_type:
        id_dict.setdefault(key, []).append(id)
| 323 # returns(accession, id_type) | |
| 324 def getAccession(header, matchType): | |
| 325 # (format regex pattern, FROM_ID) | |
| 326 # TODO try to determine which type of accession ID we have by matching by regular expressions | |
| 327 # each regex match should have a groups[0] that given the accession | |
| 328 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples | |
| 329 fmts = [ | |
| 330 ('([OPQ][0-9][A-Z0-9]{3}[0-9])','ACC'), | |
| 331 ('([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), | |
| 332 ('(THISDOESNTWORK...*)_....*','ID'), | |
| 333 ('UPI(\d+).*','UPARC'), | |
| 334 ('(UniRef50_\w+.*)','NF50'), | |
| 335 ('UniRef90_(\w+).*','NF90'), | |
| 336 ('(UniRef100_\w+.*)','NF100'), | |
| 337 ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'), | |
| 338 ('([A-Z]*\d*\.\d$)','EMBL'), | |
| 339 ('([IBC]\d{5})','PIR'), | |
| 340 ('(Hs\.\d*)','UNIGENE_ID'), | |
| 341 ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'), | |
| 342 ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'), | |
| 343 ('(\d[A-Z0-9]{3})','PDB_ID'), | |
| 344 ('(DP\d{5})','DISPROT_ID'), | |
| 345 | |
| 346 ('(\d*$)','P_ENTREZGENEID'), | |
| 347 ('(\d*$)','P_GI'), | |
| 348 ('(\d*$)','BIOGRID_ID'), | |
| 349 ('(\d*$)','GUIDETOPHARMACOLOGY_ID'), | |
| 350 ('(\d*$)','ALLERGOME_ID'), | |
| 351 ('(\d*$)','PEROXIBASE_ID'), | |
| 352 ('(\d*$)','REBASE_ID'), | |
| 353 ('(\d*$)','DNASU_ID'), | |
| 354 ('(\d*$)','DMDM_ID'), | |
| 355 | |
| 356 ('(DIP-\d*N$)','DIP_ID'), | |
| 357 ('(MINT-\d*)','MINT_ID'), | |
| 358 ('(9606\.ENSP\d*)','STRING_ID'), | |
| 359 ('(CHEMBL\d*)','CHEMBL_ID'), | |
| 360 ('(DB\d*)','DRUGBANK_ID'), | |
| 361 ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'), | |
| 362 ('NOTHING POPS UP','MYCOCLAP_ID'), | |
| 363 ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'), | |
| 364 ('NOTHING POPS UP','WORLD_2DPAGE_ID'), | |
| 365 ('(ENSG\d*)','ENSEMBAF85406.1BL_ID'), | |
| 366 ('(ENSP\d+)','ENSEMBL_PRO_ID'), | |
| 367 ('(ENST\d+)','ENSEMBL_TRS_ID'), | |
| 368 (' ','ENSEMBLGENOME_ID'), | |
| 369 (' ','ENSEMBLGENOME_PRO_ID'), | |
| 370 (' ','ENSEMBLGENOME_TRS_ID'), | |
| 371 ('(hsa:\d*)','KEGG_ID'), | |
| 372 ('(uc\d*[a-z]*\.\d$)','UCSC_ID'), | |
| 373 ('(.*[CKN]OG\d*)','EGGNOG_ID') | |
| 374 ] | |
| 375 n = re.match('\d*$', header) | |
| 376 if n: # For ambiguous number only ID types | |
| 377 numIDtypes = [ | |
| 378 'P_ENTREZGENEID', | |
| 379 'P_GI', | |
| 380 'DMDM_ID', | |
| 381 'BIOGRID_ID', | |
| 382 'GUIDETOPHARMACOLOGY_ID', | |
| 383 'ALLERGOME_ID', | |
| 384 'PEROXIBASE_ID', | |
| 385 'REBASE_ID', | |
| 386 'DNASU_ID'] | |
| 387 ambiguous = [] | |
| 388 for numIDs in numIDtypes: | |
| 389 nm = getIdMapping([header],numIDs,'ACC',numReturn=True) | |
| 390 if nm != None: | |
| 391 ambiguous.append(nm) | |
| 392 matchType.append(ambiguous) | |
| 393 return (header, ambiguous) | |
| 394 for pat,cat in fmts: | |
| 395 m = re.match(pat,header) | |
| 396 if m: | |
| 397 matchType.append([cat]) | |
| 398 return (m.groups()[0],[cat]) | |
| 399 matchType.append(['ACC+ID']) | |
| 400 return (header,['ACC+ID']) | |
| 401 | |
| 402 | |
def read_tabular(filepath, col):
    """Collect column *col* (zero-based) from a TAB-separated file,
    skipping blank lines and '#' comment lines."""
    accessions = []
    with open(filepath) as fp:
        for raw in fp:
            if not raw.strip() or raw.startswith('#'):
                continue
            accessions.append(raw.rstrip('\n').split('\t')[col])
    return accessions
| 413 | |
def get_fasta_entries(fp):
    """Yield (header, sequence) pairs from an iterable of FASTA lines.

    *header* is the full '>' line; *sequence* is the concatenation of the
    following lines up to the next header. Lines before the first header
    are discarded.
    """
    header, chunks = None, []
    for raw in fp:
        stripped = raw.rstrip()
        if stripped.startswith(">"):
            if header:
                yield (header, ''.join(chunks))
            header, chunks = stripped, []
        else:
            chunks.append(stripped)
    if header:
        yield (header, ''.join(chunks))
| 424 | |
def read_fasta(filepath):
    """Parse a FASTA file and return the accession inferred from each
    entry's header line (sequences themselves are ignored)."""
    with open(filepath) as fp:
        return [getFastaAccession(hdr) for hdr, _seq in get_fasta_entries(fp)]
| 431 | |
def read_mzid(fp):
    """Return every DBSequence 'accession' attribute found in an
    mzIdentML document (*fp*: path or file-like object)."""
    found = []
    for event, elem in ET.iterparse(fp):
        if event != 'end':
            continue
        if re.search('DBSequence', elem.tag):
            found.append(elem.attrib['accession'])
    return found
| 439 | |
def read_pepxml(fp):
    """Return the 'protein' attribute of every search_hit element in a
    pepXML document (*fp*: path or file-like object)."""
    found = []
    for event, elem in ET.iterparse(fp):
        if event == 'end' and re.search('search_hit', elem.tag):
            found.append(elem.get('protein'))
    return found
| 447 | |
def getUniprotSequence(uniprot_id):
    # Fetch the FASTA record for uniprot_id from uniprot.org and return the
    # first non-empty sequence found; returns '' when nothing comes back.
    # NOTE(review): Python 2 only (print statement, urllib2); no timeout and
    # no HTTP error handling -- confirm callers tolerate a raised URLError.
    url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
    print url
    fp = urllib2.urlopen(url)
    for id, seq in get_fasta_entries(fp):
        if seq:
            return seq
    return ''
| 456 | |
| 457 | |
| 458 | |
| 459 | |
def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,numReturn=None,crossReference=None,idMaps=None):
    """POST one batch of accessions to the UniProt mapping service.

    Parameters (as exercised by the visible code):
      accessions     list of IDs to translate in a single request
      from_id/to_id  UniProt mapping-service type codes
      fh             optional file handle; raw response lines are echoed to it
      idMap          optional dict accumulating {input_id: {from_id: [mapped]}}
      numReturn      when truthy, return from_id as soon as any mapping line
                     appears (used to probe ambiguous all-numeric IDs)
      crossReference internal flag: True when neither side is 'ACC' (mapping
                     is routed through 'ACC'); the string 'return' marks the
                     recursive second hop
      idMaps         accepted but never read in this function

    NOTE(review): Python 2 only (urllib/urllib2, print statements,
    response.next()). Network failures are retried three times, reported
    via warn_err, and otherwise swallowed.
    """
    # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
    if not accessions:
        return
    url = 'http://www.uniprot.org/mapping/'
    # The service cannot map X -> Y directly unless one side is ACC, so go
    # through ACC first; the second hop happens recursively per line below.
    if to_id != 'ACC' and from_id != 'ACC':
        crossReference = True
        oldto_id = to_id
        to_id = 'ACC'
        #accessions= getIdMapping(accessions,from_id,'ACC',idMap=idMap,crossReference=False)
        #if accessions != None:
        #    if idMap != None:
        #        idMap[accessions[0]] = {from_id:['N/A']}
    params = {
        'from': from_id,
        'to': to_id,
        'format':'tab',
        'query': '\n'.join(accessions)
    }
    data = urllib.urlencode(params)
    request = urllib2.Request(url, data)
    contact = "" # Please set your email address here to help us debug in case of problems.
    request.add_header('User-Agent', 'Python %s' % contact)
    response = None
    # Up to three attempts; each failure is reported but not fatal.
    for i in range(3):
        try:
            response = urllib2.urlopen(request)
            break
        except Exception, e:
            warn_err("%s",exit_code=None)
    if response:
        response.next()  # skip the "From\tTo" header row
        id3 = []
        for line in response:
            # print >> sys.stderr, "idMap: %s" % line
            if fh:
                fh.write(line)
            if numReturn:
                # probe mode: any mapping line proves from_id is plausible
                id1,id2 = line.strip().split('\t')
                return from_id
            if idMap != None:
                id1,id2 = line.strip().split('\t')
                # print >> sys.stderr, "idMap: %s:%s" % (id1,id2)
                if crossReference==True:
                    # second hop: translate the intermediate ACC to the
                    # originally requested target type
                    id2 = getIdMapping([id2],'ACC',oldto_id,crossReference='return',idMap={})
                    print id2
                if crossReference=='return':
                    id3.append(id2)
                    print id3
                else:
                    # break
                    if idMap.get(id1,'nothing') == 'nothing':
                        #idMap[id1] = [id2]
                        idMap[id1] = {from_id:[id2]}
                    else:
                        id3 = idMap[id1]
                        #if id2 not in id3:
                        #print id3
                        if id3.get(from_id,'nothing') == 'nothing':
                            # cross-referenced id2 is already a list; plain
                            # id2 is a single string and gets wrapped
                            if crossReference:
                                id3[from_id] = id2
                            else:
                                id3[from_id] = [id2]
                        else:
                            id4 = id3[from_id]
                            if crossReference:
                                id4.extend(id2)
                            else:
                                id4.append(id2)
            else:
                # NOTE(review): id3 is a list here, so this string-keyed
                # assignment would raise TypeError -- presumably this branch
                # expected id3 to be a dict; confirm before relying on it.
                id1,id2 = line.strip().split('\t')
                id3[id1] = id2
    # NOTE(review): when no response was obtained id3 is unbound; the bare
    # except converts that NameError into an implicit None return.
    try:
        return id3
    except:
        return
    #idMap[id1] = id3
| 537 | |
    #Parse Command Line
    parser = optparse.OptionParser()
    # input files (any combination may be given; accessions are pooled)
    parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
    parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
    parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
    parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
    parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
    # Decoy pattern
    parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' )
    # filter patterns
    parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
    parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
    # Unipept Flags
    parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
    # -T may be repeated; to_id accumulates one target type per flag
    parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
    # output files
    parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
    # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
    (options, args) = parser.parse_args()
    if options.version:
        print >> sys.stdout,"%s" % version
        sys.exit(0)

    accessions = []
    ## Get accessions from every input source that was supplied
    if options.mzid:
        accessions += read_mzid(options.mzid)
    if options.pepxml:
        accessions += read_pepxml(options.pepxml)
    if options.tabular:
        accessions += read_tabular(options.tabular,options.column)
    if options.fasta:
        accessions += read_fasta(options.fasta)
    if args and len(args) > 0:
        # positional command-line arguments are treated as accessions too
        for i,accession in enumerate(args):
            accessions.append(accession)
    # filter accessions
    if options.decoy:
        # strip the decoy marker (e.g. _REVERSED) from every ID
        filtered_accs = [re.sub(options.decoy,'',x) for x in accessions]
        accessions = filtered_accs
    if options.include_pat:
        # keep only IDs matching the include pattern
        filtered_accs = []
        for acc in accessions:
            if re.match(options.include_pat,acc):
                filtered_accs.append(acc)
        accessions = filtered_accs
    if options.exclude_pat:
        # drop IDs matching the exclude pattern
        filtered_accs = []
        for acc in accessions:
            if not re.match(options.exclude_pat,acc):
                filtered_accs.append(acc)
        accessions = filtered_accs
    if len(accessions) < 1:
        warn_err("No accessions input!",exit_code=1)
    # open the requested output file, or fall back to stdout
    if options.output != None:
        try:
            outputPath = os.path.abspath(options.output)
            outputFile = open(outputPath, 'w')
        except Exception, e:
            print >> sys.stderr, "failed: %s" % e
            exit(3)
    else:
        outputFile = sys.stdout


    # Removes duplicates in accessions (order-preserving)
    seen = set()
    seen_add = seen.add
    accessions = [x for x in accessions if not (x in seen or seen_add(x))]
    # Sorts accessions into inferred ID types in a dictionary
    id_dict = {}
    matchType = []
    for header in accessions:
        id , id_types = getAccession(header,matchType)
        addAccession(id_dict,id_types,id)
    # Retrieve mappings per target type: idMaps is {to_id: idMap}
    idMaps = dict()
    for to_id in options.to_id:
        idMap = dict()
        idMaps[to_id] = idMap
        for (from_id,ids) in id_dict.items():
            # limit the requests to 500 at a time
            idx = range(0,len(ids),500)
            idx.append(len(ids))
            for i in range(len(idx)-1):
                getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps)
    #Write output
    #Output Table Header: input type, then one fixed-width column per target
    outputFile.write("\n#%-17s" % (options.from_id))
    for t in options.to_id:
        outputFile.write("%-18s" % t)
    outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")

    # Create an output-friendly matrix: rows = accessions, cols = to_id types
    idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))]
    for a, acc in enumerate(accessions):
        idLength = 0  # deepest column for this accession -> number of rows to print
        for i, to_id in enumerate(options.to_id): # [[ids],[ids]]
            idArrayColumn = []
            idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3]
            for from_id in idPairs:
                ids = idPairs[from_id]
                for id_ in ids:
                    # each cell shows the mapped ID tagged with its source type
                    idArrayColumn.append("%s[%s]" % (id_,from_id))
            if idLength < len(idArrayColumn):
                idLength = len(idArrayColumn)
            idArray[a][i].extend(idArrayColumn)
        # print one padded row per mapped ID; shorter columns are blank-filled
        for y in range(idLength):
            outputFile.write("%-18s" % acc)
            for x in range(len(options.to_id)):
                try:
                    outputFile.write("%-18s" % idArray[a][x][y])
                except:
                    outputFile.write(" " * 18)
            outputFile.write("\n")
    # Output Matrix


    #print idMaps
    # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
    stop = timeit.default_timer()
    outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))

if __name__ == "__main__" : __main__()
| 668 |
