| 3 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3 # | 
|  | 4 #------------------------------------------------------------------------------ | 
|  | 5 #                         University of Minnesota | 
|  | 6 #         Copyright 2015, Regents of the University of Minnesota | 
|  | 7 #------------------------------------------------------------------------------ | 
|  | 8 # Author: | 
|  | 9 # | 
|  | 10 #  James E Johnson | 
|  | 11 # | 
|  | 12 #------------------------------------------------------------------------------ | 
|  | 13 """ | 
|  | 14 import json | 
|  | 15 import logging | 
|  | 16 import optparse | 
|  | 17 from optparse import OptionParser | 
|  | 18 import os | 
|  | 19 import sys | 
|  | 20 import re | 
|  | 21 import urllib | 
|  | 22 import urllib2 | 
|  | 23 import timeit | 
|  | 24 start = timeit.default_timer() | 
|  | 25 try: | 
|  | 26     import xml.etree.cElementTree as ET | 
|  | 27 except ImportError: | 
|  | 28     import xml.etree.ElementTree as ET | 
|  | 29 | 
|  | 30 """print | 
|  | 31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples | 
|  | 32 http://www.uniprot.org/help/uploadlists | 
|  | 33 | 
|  | 34 From input files or stdin get accessions/IDs to map. | 
|  | 35 An input option can specify what id_type the input accessions/IDs are, | 
|  | 36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type | 
|  | 37 | 
|  | 38 input_dict = dict() | 
|  | 39 for id_string in input_ids: | 
|  | 40   (id,id_type) = getAccession(id_string) | 
|  | 41   if not id_type in input_dict: | 
|  | 42     input_dict[id_type] = [] | 
|  | 43   input_dict[id_type].append(id) | 
|  | 44 | 
|  | 45 | 
|  | 46 # We need to retrieve by from_id_type and to_id_type | 
|  | 47 for id_type in input_dict: | 
|  | 48   for | 
|  | 49 | 
|  | 50 | 
|  | 51 tabular output: | 
|  | 52 #input_id	to_id	to_id | 
|  | 53 | 
|  | 54 | 
|  | 55 """ | 
|  | 56 def warn_err(msg,exit_code=1): | 
|  | 57     sys.stderr.write(msg) | 
|  | 58     if exit_code: | 
|  | 59       sys.exit(exit_code) | 
|  | 60 | 
|  | 61 def __main__(): | 
|  | 62   version = '1.0' | 
|  | 63   idDict = { | 
|  | 64     #Category:UniProt | 
|  | 65     'ACC+ID':'UniProtKB AC/ID', | 
|  | 66     'ACC':'UniProtKB AC', | 
|  | 67     'ID':'UniProtKB ID', | 
|  | 68     'UPARC':'UniParc', | 
|  | 69     'NF50':'UniRef50', | 
|  | 70     'NF90':'UniRef90', | 
|  | 71     'NF100':'UniRef100', | 
|  | 72     #Category:Other sequence databases | 
|  | 73     'EMBL_ID':'EMBL/GenBank/DDBJ', | 
|  | 74     'EMBL':'EMBL/GenBank/DDBJ CDS', | 
|  | 75     'PIR':'PIR', | 
|  | 76     'UNIGENE_ID':'UniGene', | 
|  | 77     'P_ENTREZGENEID':'Entrez Gene (GeneID)', | 
|  | 78     'P_GI':'GI number*', | 
|  | 79     'P_REFSEQ_AC':'RefSeq Protein', | 
|  | 80     'REFSEQ_NT_ID':'RefSeq Nucleotide', | 
|  | 81     #Category:3D structure databases | 
|  | 82     'PDB_ID':'PDB', | 
|  | 83     'DISPROT_ID':'DisProt', | 
|  | 84     #Category:Protein-protein interaction databases | 
|  | 85     'BIOGRID_ID':'BioGrid', | 
|  | 86     'DIP_ID':'DIP', | 
|  | 87     'MINT_ID':'MINT', | 
|  | 88     'STRING_ID':'STRING', | 
|  | 89     #Category:Chemistry | 
|  | 90     'CHEMBL_ID':'ChEMBL', | 
|  | 91     'DRUGBANK_ID':'DrugBank', | 
|  | 92     'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY', | 
|  | 93     #Category:Protein family/group databases | 
|  | 94     'ALLERGOME_ID':'Allergome', | 
|  | 95     'MEROPS_ID':'MEROPS', | 
|  | 96     'MYCOCLAP_ID':'mycoCLAP', | 
|  | 97     'PEROXIBASE_ID':'PeroxiBase', | 
|  | 98     'REBASE_ID':'REBASE', | 
|  | 99     'TCDB_ID':'TCDB', | 
|  | 100     #Category:Polymorphism databases | 
|  | 101     'DMDM_ID':'DMDM', | 
|  | 102     #Category:2D gel databases | 
|  | 103     'WORLD_2DPAGE_ID':'World-2DPAGE', | 
|  | 104     #Category:Protocols and materials databases | 
|  | 105     'DNASU_ID':'DNASU', | 
|  | 106     #Category:Genome annotation databases | 
|  | 107     'ENSEMBL_ID':'Ensembl', | 
|  | 108     'ENSEMBL_PRO_ID':'Ensembl Protein', | 
|  | 109     'ENSEMBL_TRS_ID':'Ensembl Transcript', | 
|  | 110     'ENSEMBLGENOME_ID':'Ensembl Genomes', | 
|  | 111     'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein', | 
|  | 112     'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript', | 
|  | 113     'P_ENTREZGENEID':'GeneID', | 
|  | 114     'KEGG_ID':'KEGG', | 
|  | 115     'PATRIC_ID':'PATRIC', | 
|  | 116     'UCSC_ID':'UCSC', | 
|  | 117     'VECTORBASE_ID':'VectorBase', | 
|  | 118     #Category:Organism-specific gene databases | 
|  | 119     'ARACHNOSERVER_ID':'ArachnoServer', | 
|  | 120     'CGD':'CGD', | 
|  | 121     'CONOSERVER_ID':'ConoServer', | 
|  | 122     'CYGD_ID':'CYGD', | 
|  | 123     'DICTYBASE_ID':'dictyBase', | 
|  | 124     'ECHOBASE_ID':'EchoBASE', | 
|  | 125     'ECOGENE_ID':'EcoGene', | 
|  | 126     'EUHCVDB_ID':'euHCVdb', | 
|  | 127     'EUPATHDB_ID':'EuPathDB', | 
|  | 128     'FLYBASE_ID':'FlyBase', | 
|  | 129     'GENECARDS_ID':'GeneCards', | 
|  | 130     'GENEFARM_ID':'GeneFarm', | 
|  | 131     'GENOLIST_ID':'GenoList', | 
|  | 132     'H_INVDB_ID':'H-InvDB', | 
|  | 133     'HGNC_ID':'HGNC', | 
|  | 134     'HPA_ID':'HPA', | 
|  | 135     'LEGIOLIST_ID':'LegioList', | 
|  | 136     'LEPROMA_ID':'Leproma', | 
|  | 137     'MAIZEGDB_ID':'MaizeGDB', | 
|  | 138     'MIM_ID':'MIM', | 
|  | 139     'MGI_ID':'MGI', | 
|  | 140     'NEXTPROT_ID':'neXtProt', | 
|  | 141     'ORPHANET_ID':'Orphanet', | 
|  | 142     'PHARMGKB_ID':'PharmGKB', | 
|  | 143     'POMBASE_ID':'PomBase', | 
|  | 144     'PSEUDOCAP_ID':'PseudoCAP', | 
|  | 145     'RGD_ID':'RGD', | 
|  | 146     'SGD_ID':'SGD', | 
|  | 147     'TAIR_ID':'TAIR', | 
|  | 148     'TUBERCULIST_ID':'TubercuList', | 
|  | 149     'WORMBASE_ID':'WormBase', | 
|  | 150     'WORMBASE_TRS_ID':'WormBase Transcript', | 
|  | 151     'WORMBASE_PRO_ID':'WormBase Protein', | 
|  | 152     'XENBASE_ID':'Xenbase', | 
|  | 153     'ZFIN_ID':'ZFIN', | 
|  | 154     #Category:Phylogenomic databases | 
|  | 155     'EGGNOG_ID':'eggNOG', | 
|  | 156     'GENETREE_ID':'GeneTree', | 
|  | 157     'HOGENOM_ID':'HOGENOM', | 
|  | 158     'HOVERGEN_ID':'HOVERGEN', | 
|  | 159     'KO_ID':'KO', | 
|  | 160     'OMA_ID':'OMA', | 
|  | 161     'ORTHODB_ID':'OrthoDB', | 
|  | 162     'PROTCLUSTDB_ID':'ProtClustDB', | 
|  | 163     'TREEFAM_ID':'TreeFam', | 
|  | 164     #Category:Enzyme and pathway databases | 
|  | 165     'BIOCYC_ID':'BioCyc', | 
|  | 166     'REACTOME_ID':'Reactome', | 
|  | 167     'UNIPATHWAY_ID':'UniPathWay', | 
|  | 168     #Category:Gene expression databases | 
|  | 169     'CLEANEX_ID':'CleanEx', | 
|  | 170     #Category:Other | 
|  | 171     'CHITARS_ID':'ChiTaRS', | 
|  | 172     'GENOMERNAI_ID':'GenomeRNAi', | 
|  | 173     'GENEWIKI_ID':'GeneWiki', | 
|  | 174     'NEXTBIO_ID':'NextBio' | 
|  | 175   } | 
|  | 176 | 
|  | 177   """ | 
|  | 178   TODO: | 
|  | 179     Would be better to be able to infer the from_id type for the input accession/ID  especially for a fasta file | 
|  | 180     - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession. | 
|  | 181 | 
|  | 182   # rather than an array of input accessions, we need to put them in a dict() by type | 
|  | 183   def addAccession(id_dict,id_type,id): | 
|  | 184     if id_type not in id_dict: | 
|  | 185       id_dict[id_type] = []-++ | 
|  | 186     id_dict[id_type] = [].append(id) | 
|  | 187 | 
|  | 188   # returns(accession, id_type) | 
|  | 189   def getAccession(header, matchType): | 
|  | 190     # (format regex pattern, FROM_ID) | 
|  | 191     # TODO  try to determine which type of accession ID we have by matching by regular expressions | 
|  | 192     # each regex match should have a groups[0] that given the accession | 
|  | 193     # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples | 
|  | 194     fmts = [ | 
|  | 195       ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), | 
|  | 196       ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), | 
|  | 197       ('>/UniRef\d+_(\w+).*','ACC+ID'), | 
|  | 198       ('>/(UPI\d+).*','ACC+ID'), | 
|  | 199       ('NP_\d+\.\d+','ACC+ID'), | 
|  | 200       ('ENSP\d+','ENSEMBL_PRO_ID'), | 
|  | 201       ('ENST\d+','ENSEMBL_TRS_ID'), | 
|  | 202     ] | 
|  | 203     for pat,cat in fmts: | 
|  | 204       m = re.match(pat,header) | 
|  | 205       if m: | 
|  | 206         matchType.append(cat) | 
|  | 207         return (m.groups()[0],cat) | 
|  | 208     matchType.append('ACC+ID') | 
|  | 209     return (header,'ACC+ID') | 
|  | 210 | 
|  | 211     # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types | 
|  | 212   idMaps = dict() # {to_id,idMap} | 
|  | 213   for to_id in options.to_id: | 
|  | 214     idMap = dict() | 
|  | 215     idMaps[to_id] = idMap | 
|  | 216     for (from_id,ids) id_dict.items(): | 
|  | 217       # limit the requests to 500 at a time | 
|  | 218       idx = range(0,len(ids),500) | 
|  | 219       idx.append(len(ids)) | 
|  | 220       for i in range(len(idx)-1): | 
|  | 221         getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap) | 
|  | 222   """ | 
|  | 223 | 
|  | 224   """ | 
|  | 225   Some examples of fasta ID lines From: | 
|  | 226     https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java | 
|  | 227 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ... | 
|  | 228 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ... | 
|  | 229 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr | 
|  | 230 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein. | 
|  | 231 >OExyz (OExyz) xxx xxx xxx | 
|  | 232 >hflu_lsi_xxxx xxx xxx xxx | 
|  | 233 >C_tr_Lx_x [xxx - xxx] | xxx xxx | 
|  | 234 >M. tub.xxx|Rvxxx| xxx xxx | 
|  | 235 // Drosophile DB. | 
|  | 236 // We need to find two elements: | 
|  | 237 //   - the accession String (retrieved as the trimmed version of everything | 
|  | 238 //     up to (and NOT including) " pep:" | 
|  | 239 //   - the description (everything (trimmed) starting from (and including) the " pep:". | 
|  | 240 >CGxxxxxx pep:xxxxxx | 
|  | 241 >xxxx xxx SGDID:xxxx xxx | 
|  | 242 >generic_some_tag|proten_accession|a description for this protein | 
|  | 243 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as | 
|  | 244 // present in the Expasy FTP FASTA file. | 
|  | 245 // Is formatted something like this: | 
|  | 246 >XXX_YYYY (acc) rest | 
|  | 247 >sp|accession|ID descr rest (including taxonomy, if available | 
|  | 248 >tr|accession|ID descr rest (including taxonomy, if available) | 
|  | 249 >nxp|NX_P02768-1|ALB|Serum albumin|Iso | 
|  | 250 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as | 
|  | 251 // present in the Expasy FTP FASTA file. | 
|  | 252 // Is formatted something like this: | 
|  | 253 >accession|ID descr rest (including taxonomy, if available) | 
|  | 254 // Flybase FASTA format. | 
|  | 255 >FBxxx type=xxx | 
|  | 256 // A header translating a genome sequence into a protein sequences. | 
|  | 257 // We need to find two elements, separated by a space: | 
|  | 258 //   - the accession string (retrieved as the first part of a space delimited String). | 
|  | 259 //   - the nucleic acid start and stop site (between brackets, separated by a '-'). | 
|  | 260 >dm345_3L-sense [234353534-234353938] | 
|  | 261 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database | 
|  | 262 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089]  ORF | 
|  | 263 // The Arabidopsis thaliana database; TAIR format | 
|  | 264 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166 | 
|  | 265 ... | 
|  | 266 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all. | 
|  | 267 // Testing for this is somewhat more complicated. | 
|  | 268 // Often used simple header; looks like: | 
|  | 269 // >NP0465 (NP0465) A description for this protein. | 
|  | 270 // We need to find two elements: | 
|  | 271 //   - the accession String (easily retrieved as the next String until a space is encountered). | 
|  | 272 //   - the description | 
|  | 273 >NP0465 (NP0465) A description for this protein. | 
|  | 274  GenBank                           gb|accession|locus | 
|  | 275  EMBL Data Library                 emb|accession|locus | 
|  | 276  DDBJ, DNA Database of Japan       dbj|accession|locus | 
|  | 277  NBRF PIR                          pir||entry | 
|  | 278  Protein Research Foundation       prf||name | 
|  | 279  SWISS-PROT                        sp|accession|entry name | 
|  | 280  Brookhaven Protein Data Bank      pdb|entry|chain | 
|  | 281  Patents                           pat|country|number | 
|  | 282  GenInfo Backbone Id               bbs|number | 
|  | 283  General database identifier       gnl|database|identifier | 
|  | 284  NCBI Reference Sequence           ref|accession|locus | 
|  | 285  Local Sequence identifier         lcl|identifier | 
|  | 286   """ | 
|  | 287 | 
|  | 288 | 
|  | 289   def getFastaAccession(header): | 
|  | 290     # TODO parse the ID and return the accession | 
|  | 291     # (format regex pattern, FROM_ID) | 
|  | 292     # TODO  try to determine which type of accession ID we have by matching by reg|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})ular expressions | 
|  | 293     # each regex match should have a groups[0] that given the accession | 
|  | 294     # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples | 
|  | 295     fmts = [ | 
|  | 296       ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), | 
|  | 297       ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), | 
|  | 298       ('NP_\d+\.\d+','ACC+ID'), | 
|  | 299 | 
|  | 300       ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), | 
|  | 301       ('(...*)_....*','ID'), | 
|  | 302       ('>/(UPI\d+).*','UPARC'), | 
|  | 303       ('>/UniRef50_(\w+).*','NF50'), | 
|  | 304       ('UniRef90_(\w+).*','NF90'), | 
|  | 305       ('UniRef100_(\w+).*','NF100'), | 
|  | 306       ('([A-C][A-Z]?\d{6})|(DM-Z)\d{5})','EMBL_ID'), | 
|  | 307       ('ENSP\d+','ENSEMBL_PRO_ID'), | 
|  | 308       ('ENST\d+','ENSEMBL_TRS_ID') | 
|  | 309     ] | 
|  | 310     for pat,cat in fmts: | 
|  | 311       m = re.match(pat,header) | 
|  | 312       if m: | 
|  | 313         return m.groups()[0] | 
|  | 314     return header | 
|  | 315 | 
|  | 316   # rather than an array of input accessions, we need to put them in a dict() by type | 
|  | 317   def addAccession(id_dict,id_type,id): | 
|  | 318     for idtype in id_type: | 
|  | 319       if idtype not in id_dict: | 
|  | 320         id_dict[idtype] = [] | 
|  | 321       id_dict[idtype].append(id) | 
|  | 322 | 
|  | 323   # returns(accession, id_type) | 
|  | 324   def getAccession(header, matchType): | 
|  | 325     # (format regex pattern, FROM_ID) | 
|  | 326     # TODO  try to determine which type of accession ID we have by matching by regular expressions | 
|  | 327     # each regex match should have a groups[0] that given the accession | 
|  | 328     # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples | 
|  | 329     fmts = [ | 
|  | 330       ('([OPQ][0-9][A-Z0-9]{3}[0-9])','ACC'), | 
|  | 331       ('([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), | 
|  | 332       ('(THISDOESNTWORK...*)_....*','ID'), | 
|  | 333       ('UPI(\d+).*','UPARC'), | 
|  | 334       ('(UniRef50_\w+.*)','NF50'), | 
|  | 335       ('UniRef90_(\w+).*','NF90'), | 
|  | 336       ('(UniRef100_\w+.*)','NF100'), | 
|  | 337       ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'), | 
|  | 338       ('([A-Z]*\d*\.\d$)','EMBL'), | 
|  | 339       ('([IBC]\d{5})','PIR'), | 
|  | 340       ('(Hs\.\d*)','UNIGENE_ID'), | 
|  | 341       ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'), | 
|  | 342       ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'), | 
|  | 343       ('(\d[A-Z0-9]{3})','PDB_ID'), | 
|  | 344       ('(DP\d{5})','DISPROT_ID'), | 
|  | 345 | 
|  | 346       ('(\d*$)','P_ENTREZGENEID'), | 
|  | 347       ('(\d*$)','P_GI'), | 
|  | 348       ('(\d*$)','BIOGRID_ID'), | 
|  | 349       ('(\d*$)','GUIDETOPHARMACOLOGY_ID'), | 
|  | 350       ('(\d*$)','ALLERGOME_ID'), | 
|  | 351       ('(\d*$)','PEROXIBASE_ID'), | 
|  | 352       ('(\d*$)','REBASE_ID'), | 
|  | 353       ('(\d*$)','DNASU_ID'), | 
|  | 354       ('(\d*$)','DMDM_ID'), | 
|  | 355 | 
|  | 356       ('(DIP-\d*N$)','DIP_ID'), | 
|  | 357       ('(MINT-\d*)','MINT_ID'), | 
|  | 358       ('(9606\.ENSP\d*)','STRING_ID'), | 
|  | 359       ('(CHEMBL\d*)','CHEMBL_ID'), | 
|  | 360       ('(DB\d*)','DRUGBANK_ID'), | 
|  | 361       ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'), | 
|  | 362       ('NOTHING POPS UP','MYCOCLAP_ID'), | 
|  | 363       ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'), | 
|  | 364       ('NOTHING POPS UP','WORLD_2DPAGE_ID'), | 
|  | 365       ('(ENSG\d*)','ENSEMBAF85406.1BL_ID'), | 
|  | 366       ('(ENSP\d+)','ENSEMBL_PRO_ID'), | 
|  | 367       ('(ENST\d+)','ENSEMBL_TRS_ID'), | 
|  | 368       (' ','ENSEMBLGENOME_ID'), | 
|  | 369       (' ','ENSEMBLGENOME_PRO_ID'), | 
|  | 370       (' ','ENSEMBLGENOME_TRS_ID'), | 
|  | 371       ('(hsa:\d*)','KEGG_ID'), | 
|  | 372       ('(uc\d*[a-z]*\.\d$)','UCSC_ID'), | 
|  | 373       ('(.*[CKN]OG\d*)','EGGNOG_ID') | 
|  | 374     ] | 
|  | 375     n = re.match('\d*$', header) | 
|  | 376     if n: # For ambiguous number only ID types | 
|  | 377          numIDtypes = [ | 
|  | 378            'P_ENTREZGENEID', | 
|  | 379            'P_GI', | 
|  | 380            'DMDM_ID', | 
|  | 381            'BIOGRID_ID', | 
|  | 382            'GUIDETOPHARMACOLOGY_ID', | 
|  | 383            'ALLERGOME_ID', | 
|  | 384            'PEROXIBASE_ID', | 
|  | 385            'REBASE_ID', | 
|  | 386            'DNASU_ID'] | 
|  | 387          ambiguous = [] | 
|  | 388          for numIDs in numIDtypes: | 
|  | 389            nm = getIdMapping([header],numIDs,'ACC',numReturn=True) | 
|  | 390            if nm != None: | 
|  | 391               ambiguous.append(nm) | 
|  | 392          matchType.append(ambiguous) | 
|  | 393          return (header, ambiguous) | 
|  | 394     for pat,cat in fmts: | 
|  | 395       m = re.match(pat,header) | 
|  | 396       if m: | 
|  | 397         matchType.append([cat]) | 
|  | 398         return (m.groups()[0],[cat]) | 
|  | 399     matchType.append(['ACC+ID']) | 
|  | 400     return (header,['ACC+ID']) | 
|  | 401 | 
|  | 402 | 
|  | 403   def read_tabular(filepath,col): | 
|  | 404     accessions = [] | 
|  | 405     with open(filepath) as fp: | 
|  | 406       for i,line in enumerate(fp): | 
|  | 407         if line.strip() == '' or line.startswith('#'): | 
|  | 408           continue | 
|  | 409         fields = line.rstrip('\n').split('\t') | 
|  | 410         accession = fields[col] | 
|  | 411         accessions.append(accession) | 
|  | 412     return accessions | 
|  | 413 | 
|  | 414   def get_fasta_entries(fp): | 
|  | 415     name, seq = None, [] | 
|  | 416     for line in fp: | 
|  | 417       line = line.rstrip() | 
|  | 418       if line.startswith(">"): | 
|  | 419         if name: yield (name, ''.join(seq)) | 
|  | 420         name, seq = line, [] | 
|  | 421       else: | 
|  | 422         seq.append(line) | 
|  | 423     if name: yield (name, ''.join(seq)) | 
|  | 424 | 
|  | 425   def read_fasta(filepath): | 
|  | 426     accessions = [] | 
|  | 427     with open(filepath) as fp: | 
|  | 428       for id, peptide in get_fasta_entries(fp): | 
|  | 429         accessions.append(getFastaAccession(id)) | 
|  | 430     return accessions | 
|  | 431 | 
|  | 432   def read_mzid(fp): | 
|  | 433     accessions = [] | 
|  | 434     for event, elem in ET.iterparse(fp): | 
|  | 435       if event == 'end': | 
|  | 436         if re.search('DBSequence',elem.tag): | 
|  | 437           accessions.append(elem.attrib['accession']) | 
|  | 438     return accessions | 
|  | 439 | 
|  | 440   def read_pepxml(fp): | 
|  | 441     accessions = [] | 
|  | 442     for event, elem in ET.iterparse(fp): | 
|  | 443       if event == 'end': | 
|  | 444         if re.search('search_hit',elem.tag): | 
|  | 445           accessions.append(elem.get('protein')) | 
|  | 446     return accessions | 
|  | 447 | 
|  | 448   def getUniprotSequence(uniprot_id): | 
|  | 449     url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id | 
|  | 450     print url | 
|  | 451     fp = urllib2.urlopen(url) | 
|  | 452     for id, seq in get_fasta_entries(fp): | 
|  | 453       if seq: | 
|  | 454         return seq | 
|  | 455     return '' | 
|  | 456 | 
|  | 457   def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,numReturn=None,crossReference=None,idMaps=None): | 
|  | 458     # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)])) | 
|  | 459     if not accessions: | 
|  | 460       return | 
|  | 461     url = 'http://www.uniprot.org/mapping/' | 
|  | 462     if to_id != 'ACC' and crossReference: # cross-referencing from nonUniprot IDs | 
|  | 463       if 'ACC' in idMaps: | 
|  | 464         print idMaps | 
|  | 465         idMaps[to_id] = idMaps['ACC'] | 
|  | 466         idMap = idMaps[to_id] | 
|  | 467         for acc in idMaps[to_id].keys(): | 
|  | 468            print idMaps | 
|  | 469            for from_ID_ in idMaps[to_id][acc].keys(): | 
|  | 470               cr = [] | 
|  | 471               for ids in idMaps[to_id][acc][from_ID_]: | 
|  | 472                  print ids | 
|  | 473                  print to_id | 
|  | 474                  accessions = getIdMapping(ids,'ACC',to_id,idMap=idMap,crossReference=False) | 
|  | 475                  cr.append(accessions) | 
|  | 476               print cr | 
|  | 477               idMaps[to_id][acc][from_ID_] = cr | 
|  | 478       #accessions= getIdMapping(accessions,from_id,'ACC',idMap=idMap,crossReference=False) | 
|  | 479       if accessions != None: | 
|  | 480         if idMap != None: | 
|  | 481            idMap[accessions[0]] = {from_id:['N/A']} | 
|  | 482       return | 
|  | 483     params = { | 
|  | 484     'from': from_id, | 
|  | 485     'to': to_id, | 
|  | 486     'format':'tab', | 
|  | 487     'query': '\n'.join(accessions) | 
|  | 488     } | 
|  | 489     data = urllib.urlencode(params) | 
|  | 490     request = urllib2.Request(url, data) | 
|  | 491     contact = "" # Please set your email address here to help us debug in case of problems. | 
|  | 492     request.add_header('User-Agent', 'Python %s' % contact) | 
|  | 493     response = None | 
|  | 494     for i in range(3): | 
|  | 495       try: | 
|  | 496         response = urllib2.urlopen(request) | 
|  | 497         break | 
|  | 498       except Exception, e: | 
|  | 499         warn_err("%s",exit_code=None) | 
|  | 500     if response: | 
|  | 501       response.next() | 
|  | 502     for line in response: | 
|  | 503       # print >> sys.stderr, "idMap: %s" % line | 
|  | 504       if fh: | 
|  | 505         fh.write(line) | 
|  | 506       if numReturn: | 
|  | 507         id1,id2 = line.strip().split('\t') | 
|  | 508         return from_id | 
|  | 509       if idMap != None: | 
|  | 510         id1,id2 = line.strip().split('\t') | 
|  | 511         #print >> sys.stderr, "idMap: %s:%s" % (id1,id2) | 
|  | 512         if idMap.get(id1,'nothing') == 'nothing': | 
|  | 513            #idMap[id1] = [id2] | 
|  | 514            idMap[id1] = {from_id:[id2]} | 
|  | 515         else: | 
|  | 516            id3 = idMap[id1] | 
|  | 517            #if id2 not in id3: | 
|  | 518            #print id3 | 
|  | 519            if id3.get(from_id,'nothing') == 'nothing': | 
|  | 520                id3[from_id] = [id2] | 
|  | 521            else: | 
|  | 522                id4 = id3[from_id] | 
|  | 523                id4.append(id2) | 
|  | 524       else: | 
|  | 525         print 'recursive' | 
|  | 526         id3 = [] | 
|  | 527         id1,id2 = line.strip().split('\t') | 
|  | 528         id3.append(id2) | 
|  | 529         return id3 | 
|  | 530            #idMap[id1] = id3 | 
|  | 531   # python uniprot_id_mapping.py -T 'NF100' -test-data/old-inputs/var.tsv | 
|  | 532 | 
|  | 533   #Parse Command Line | 
|  | 534   parser = optparse.OptionParser() | 
|  | 535   # input files | 
|  | 536   parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) | 
|  | 537   parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) | 
|  | 538   parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) | 
|  | 539   parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) | 
|  | 540   parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) | 
|  | 541   # Decoy pattern | 
|  | 542   parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' ) | 
|  | 543   # filter patterns | 
|  | 544   parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' ) | 
|  | 545   parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' ) | 
|  | 546   # Unipept Flags | 
|  | 547   parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' ) | 
|  | 548   parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' ) | 
|  | 549   # output files | 
|  | 550   parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values') | 
|  | 551   # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' ) | 
|  | 552   parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' ) | 
|  | 553   (options, args) = parser.parse_args() | 
|  | 554   if options.version: | 
|  | 555     print >> sys.stdout,"%s" % version | 
|  | 556     sys.exit(0) | 
|  | 557 | 
|  | 558   accessions = [] | 
|  | 559   ## Get accessions | 
|  | 560   if options.mzid: | 
|  | 561     accessions += read_mzid(options.mzid) | 
|  | 562   if options.pepxml: | 
|  | 563     accessions += read_pepxml(options.pepxml) | 
|  | 564   if options.tabular: | 
|  | 565     accessions += read_tabular(options.tabular,options.column) | 
|  | 566   if options.fasta: | 
|  | 567     accessions += read_fasta(options.fasta) | 
|  | 568   if args and len(args) > 0: | 
|  | 569     for i,accession in enumerate(args): | 
|  | 570       accessions.append(accession) | 
|  | 571   # filter accessions | 
|  | 572   if options.decoy: | 
|  | 573     filtered_accs = [re.sub(options.decoy,'',x) for x in accessions] | 
|  | 574     accessions = filtered_accs | 
|  | 575   if options.include_pat: | 
|  | 576     filtered_accs = [] | 
|  | 577     for acc in accessions: | 
|  | 578       if re.match(options.include_pat,acc): | 
|  | 579         filtered_accs.append(acc) | 
|  | 580     accessions = filtered_accs | 
|  | 581   if options.exclude_pat: | 
|  | 582     filtered_accs = [] | 
|  | 583     for acc in accessions: | 
|  | 584       if not re.match(options.exclude_pat,acc): | 
|  | 585         filtered_accs.append(acc) | 
|  | 586     accessions = filtered_accs | 
|  | 587   if len(accessions) < 1: | 
|  | 588     warn_err("No accessions input!",exit_code=1) | 
|  | 589   if options.output != None: | 
|  | 590     try: | 
|  | 591       outputPath = os.path.abspath(options.output) | 
|  | 592       outputFile = open(outputPath, 'w') | 
|  | 593     except Exception, e: | 
|  | 594       print >> sys.stderr, "failed: %s" % e | 
|  | 595       exit(3) | 
|  | 596   else: | 
|  | 597     outputFile = sys.stdout | 
|  | 598 | 
|  | 599 | 
|  | 600   # Sorts accessions to inferred ID types i+n a dictionary | 
|  | 601   id_dict = {} | 
|  | 602   matchType = [] | 
|  | 603   for header in accessions: | 
|  | 604       id , id_types = getAccession(header,matchType) | 
|  | 605       addAccession(id_dict,id_types,id) | 
|  | 606       #print id_dict | 
|  | 607   #print id_dict | 
|  | 608   idMaps = dict() # {to_id,idMap} | 
|  | 609   #print matchType | 
|  | 610   #print id_dict.items() | 
|  | 611   for to_id in options.to_id: | 
|  | 612     idMap = dict() | 
|  | 613     idMaps[to_id] = idMap | 
|  | 614     for (from_id,ids) in id_dict.items(): | 
|  | 615       # limit the requests to 500 at a time | 
|  | 616       idx = range(0,len(ids),500) | 
|  | 617       idx.append(len(ids)) | 
|  | 618       for i in range(len(idx)-1): | 
|  | 619         getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,crossReference=True,idMaps=idMaps) | 
|  | 620   #print idMaps | 
|  | 621   #Write output | 
|  | 622   #Output Table Header | 
|  | 623   outputFile.write("\n#%-17s" % (options.from_id)) | 
|  | 624   for t in options.to_id: | 
|  | 625     outputFile.write("%-18s" % t) | 
|  | 626   outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n") | 
|  | 627   #Output a row with values for each input ID | 
|  | 628   #for i, acc in enumerate(accessions): | 
|  | 629      #multi = 1 | 
|  | 630      #for types in options.to_id: | 
|  | 631      #    IDs = idMaps[types].get(acc,['none']) | 
|  | 632      #    if len(IDs) > multi: | 
|  | 633      #        multi = len(IDs) | 
|  | 634   """ | 
|  | 635      outputFile.write("%-17s %-16s" % (acc, matchType[i][0])) | 
|  | 636      for x in range(len(matchType[i])): | 
|  | 637          if x != 0: | 
|  | 638              outputFile.write(("''" + " " * 16 + "%-16s") % matchType[i][x]) | 
|  | 639          for types in options.to_id: | 
|  | 640              IDs = idMaps[types].get(acc,['none']) | 
|  | 641              if len(IDs) > x: | 
|  | 642                  outputFile.write("%-18s" % IDs[x]) | 
|  | 643              else: | 
|  | 644                  if len(matchType[i]) > 1: | 
|  | 645                     outputFile.write("%-18s" % IDs[0]) | 
|  | 646                  else: | 
|  | 647                     outputFile.write("''                ") | 
|  | 648          outputFile.write("\n") | 
|  | 649   """ | 
|  | 650   # Create an output-friendly matrix | 
|  | 651   idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))] | 
|  | 652   for a, acc in enumerate(accessions): | 
|  | 653      idLength = 0 | 
|  | 654      for i, to_id in enumerate(options.to_id): # [[ids],[ids]] | 
|  | 655         idArrayColumn = [] | 
|  | 656         idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3] | 
|  | 657         for from_id in idPairs: | 
|  | 658             ids = idPairs[from_id] | 
|  | 659             for id_ in ids: | 
|  | 660                 idArrayColumn.append("%s[%s]" % (id_,from_id)) | 
|  | 661         if idLength < len(idArrayColumn): | 
|  | 662             idLength = len(idArrayColumn) | 
|  | 663         idArray[a][i].extend(idArrayColumn) | 
|  | 664      for y in range(idLength): | 
|  | 665         outputFile.write("%-18s" % acc) | 
|  | 666         for x in range(len(options.to_id)): | 
|  | 667            try: | 
|  | 668              outputFile.write("%-18s" % idArray[a][x][y]) | 
|  | 669            except: | 
|  | 670              outputFile.write(" " * 18) | 
|  | 671         outputFile.write("\n") | 
|  | 672   # Output Matrix | 
|  | 673 | 
|  | 674 | 
|  | 675 | 
|  | 676   #print idMaps | 
|  | 677  # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv | 
|  | 678   stop = timeit.default_timer() | 
|  | 679   outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds")) | 
|  | 680 | 
|  | 681 if __name__ == "__main__" : __main__() | 
|  | 682 |