comparison uniprot_ID_mapper/uniprot_id_mapping.py @ 3:6651ac4651f0 draft default tip

Uploaded
author rsajulga
date Thu, 16 Jul 2015 16:41:35 -0400
2:d0311668d442 3:6651ac4651f0
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2015, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14 import json
15 import logging
16 import optparse
17 from optparse import OptionParser
18 import os
19 import sys
20 import re
21 import urllib
22 import urllib2
23 import timeit
24 start = timeit.default_timer()
25 try:
26 import xml.etree.cElementTree as ET
27 except ImportError:
28 import xml.etree.ElementTree as ET
29
30 """print
31 http://www.uniprot.org/help/programmatic_access#id_mapping_examples
32 http://www.uniprot.org/help/uploadlists
33
34 From input files or stdin get accessions/IDs to map.
35 An input option can specify what id_type the input accessions/IDs are,
36 otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type
37
38 input_dict = dict()
39 for id_string in input_ids:
40 (id,id_type) = getAccession(id_string)
41 if not id_type in input_dict:
42 input_dict[id_type] = []
43 input_dict[id_type].append(id)
44
45
46 # We need to retrieve by from_id_type and to_id_type
47 for id_type in input_dict:
48 for to_id in to_id_types:
49 getIdMapping(input_dict[id_type], id_type, to_id)
50
51 tabular output:
52 #input_id to_id to_id
53
54
55 """
56 def warn_err(msg,exit_code=1):
57 sys.stderr.write(msg)
58 if exit_code:
59 sys.exit(exit_code)
60
61 def __main__():
62 version = '1.0'
63 idDict = {
64 #Category:UniProt
65 'ACC+ID':'UniProtKB AC/ID',
66 'ACC':'UniProtKB AC',
67 'ID':'UniProtKB ID',
68 'UPARC':'UniParc',
69 'NF50':'UniRef50',
70 'NF90':'UniRef90',
71 'NF100':'UniRef100',
72 'GENENAME':'Gene name',
73 #Category:Other sequence databases
74 'EMBL_ID':'EMBL/GenBank/DDBJ',
75 'EMBL':'EMBL/GenBank/DDBJ CDS',
76 'PIR':'PIR',
77 'UNIGENE_ID':'UniGene',
78 'P_ENTREZGENEID':'Entrez Gene (GeneID)',
79 'P_GI':'GI number*',
80 'P_REFSEQ_AC':'RefSeq Protein',
81 'REFSEQ_NT_ID':'RefSeq Nucleotide',
82 #Category:3D structure databases
83 'PDB_ID':'PDB',
84 'DISPROT_ID':'DisProt',
85 #Category:Protein-protein interaction databases
86 'BIOGRID_ID':'BioGrid',
87 'DIP_ID':'DIP',
88 'MINT_ID':'MINT',
89 'STRING_ID':'STRING',
90 #Category:Chemistry
91 'CHEMBL_ID':'ChEMBL',
92 'DRUGBANK_ID':'DrugBank',
93 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
94 #Category:Protein family/group databases
95 'ALLERGOME_ID':'Allergome',
96 'MEROPS_ID':'MEROPS',
97 'MYCOCLAP_ID':'mycoCLAP',
98 'PEROXIBASE_ID':'PeroxiBase',
99 'REBASE_ID':'REBASE',
100 'TCDB_ID':'TCDB',
101 #Category:Polymorphism databases
102 'DMDM_ID':'DMDM',
103 #Category:2D gel databases
104 'WORLD_2DPAGE_ID':'World-2DPAGE',
105 #Category:Protocols and materials databases
106 'DNASU_ID':'DNASU',
107 #Category:Genome annotation databases
108 'ENSEMBL_ID':'Ensembl',
109 'ENSEMBL_PRO_ID':'Ensembl Protein',
110 'ENSEMBL_TRS_ID':'Ensembl Transcript',
111 'ENSEMBLGENOME_ID':'Ensembl Genomes',
112 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
113 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
114 'P_ENTREZGENEID':'GeneID', # note: duplicate key, also defined above under "Other sequence databases"; this entry wins
115 'KEGG_ID':'KEGG',
116 'PATRIC_ID':'PATRIC',
117 'UCSC_ID':'UCSC',
118 'VECTORBASE_ID':'VectorBase',
119 #Category:Organism-specific gene databases
120 'ARACHNOSERVER_ID':'ArachnoServer',
121 'CGD':'CGD',
122 'CONOSERVER_ID':'ConoServer',
123 'CYGD_ID':'CYGD',
124 'DICTYBASE_ID':'dictyBase',
125 'ECHOBASE_ID':'EchoBASE',
126 'ECOGENE_ID':'EcoGene',
127 'EUHCVDB_ID':'euHCVdb',
128 'EUPATHDB_ID':'EuPathDB',
129 'FLYBASE_ID':'FlyBase',
130 'GENECARDS_ID':'GeneCards',
131 'GENEFARM_ID':'GeneFarm',
132 'GENOLIST_ID':'GenoList',
133 'H_INVDB_ID':'H-InvDB',
134 'HGNC_ID':'HGNC',
135 'HPA_ID':'HPA',
136 'LEGIOLIST_ID':'LegioList',
137 'LEPROMA_ID':'Leproma',
138 'MAIZEGDB_ID':'MaizeGDB',
139 'MIM_ID':'MIM',
140 'MGI_ID':'MGI',
141 'NEXTPROT_ID':'neXtProt',
142 'ORPHANET_ID':'Orphanet',
143 'PHARMGKB_ID':'PharmGKB',
144 'POMBASE_ID':'PomBase',
145 'PSEUDOCAP_ID':'PseudoCAP',
146 'RGD_ID':'RGD',
147 'SGD_ID':'SGD',
148 'TAIR_ID':'TAIR',
149 'TUBERCULIST_ID':'TubercuList',
150 'WORMBASE_ID':'WormBase',
151 'WORMBASE_TRS_ID':'WormBase Transcript',
152 'WORMBASE_PRO_ID':'WormBase Protein',
153 'XENBASE_ID':'Xenbase',
154 'ZFIN_ID':'ZFIN',
155 #Category:Phylogenomic databases
156 'EGGNOG_ID':'eggNOG',
157 'GENETREE_ID':'GeneTree',
158 'HOGENOM_ID':'HOGENOM',
159 'HOVERGEN_ID':'HOVERGEN',
160 'KO_ID':'KO',
161 'OMA_ID':'OMA',
162 'ORTHODB_ID':'OrthoDB',
163 'PROTCLUSTDB_ID':'ProtClustDB',
164 'TREEFAM_ID':'TreeFam',
165 #Category:Enzyme and pathway databases
166 'BIOCYC_ID':'BioCyc',
167 'REACTOME_ID':'Reactome',
168 'UNIPATHWAY_ID':'UniPathWay',
169 #Category:Gene expression databases
170 'CLEANEX_ID':'CleanEx',
171 #Category:Other
172 'CHITARS_ID':'ChiTaRS',
173 'GENOMERNAI_ID':'GenomeRNAi',
174 'GENEWIKI_ID':'GeneWiki',
175 'NEXTBIO_ID':'NextBio'
176 }
177
178 """
179 TODO:
180 Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file
181 - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession.
182
183 # rather than an array of input accessions, we need to put them in a dict() by type
184 def addAccession(id_dict,id_type,id):
185 if id_type not in id_dict:
186 id_dict[id_type] = []
187 id_dict[id_type].append(id)
188
189 # returns(accession, id_type)
190 def getAccession(header, matchType):
191 # (format regex pattern, FROM_ID)
192 # TODO try to determine which type of accession ID we have by matching by regular expressions
193 # each regex match should have a groups()[0] that gives the accession
194 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
195 fmts = [
196 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
197 ('>?gi\|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
198 ('>?UniRef\d+_(\w+).*','ACC+ID'),
199 ('>?(UPI\d+).*','ACC+ID'),
200 ('NP_\d+\.\d+','ACC+ID'),
201 ('ENSP\d+','ENSEMBL_PRO_ID'),
202 ('ENST\d+','ENSEMBL_TRS_ID'),
203 ]
204 for pat,cat in fmts:
205 m = re.match(pat,header)
206 if m:
207 matchType.append(cat)
208 return (m.groups()[0],cat)
209 matchType.append('ACC+ID')
210 return (header,'ACC+ID')
211
212 # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
213 idMaps = dict() # {to_id,idMap}
214 for to_id in options.to_id:
215 idMap = dict()
216 idMaps[to_id] = idMap
217 for (from_id,ids) in id_dict.items():
218 # limit the requests to 500 at a time
219 idx = range(0,len(ids),500)
220 idx.append(len(ids))
221 for i in range(len(idx)-1):
222 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
223 """
224
225 """
226 Some examples of fasta ID lines From:
227 https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
228 >sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
229 >gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
230 >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
231 >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
232 >OExyz (OExyz) xxx xxx xxx
233 >hflu_lsi_xxxx xxx xxx xxx
234 >C_tr_Lx_x [xxx - xxx] | xxx xxx
235 >M. tub.xxx|Rvxxx| xxx xxx
236 // Drosophile DB.
237 // We need to find two elements:
238 // - the accession String (retrieved as the trimmed version of everything
239 // up to (and NOT including) " pep:"
240 // - the description (everything (trimmed) starting from (and including) the " pep:".
241 >CGxxxxxx pep:xxxxxx
242 >xxxx xxx SGDID:xxxx xxx
243 >generic_some_tag|proten_accession|a description for this protein
244 // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as
245 // present in the Expasy FTP FASTA file.
246 // Is formatted something like this:
247 >XXX_YYYY (acc) rest
248 >sp|accession|ID descr rest (including taxonomy, if available
249 >tr|accession|ID descr rest (including taxonomy, if available)
250 >nxp|NX_P02768-1|ALB|Serum albumin|Iso
251 // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
252 // present in the Expasy FTP FASTA file.
253 // Is formatted something like this:
254 >accession|ID descr rest (including taxonomy, if available)
255 // Flybase FASTA format.
256 >FBxxx type=xxx
257 // A header translating a genome sequence into a protein sequences.
258 // We need to find two elements, separated by a space:
259 // - the accession string (retrieved as the first part of a space delimited String).
260 // - the nucleic acid start and stop site (between brackets, separated by a '-').
261 >dm345_3L-sense [234353534-234353938]
262 >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
263 >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF
264 // The Arabidopsis thaliana database; TAIR format
265 >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
266 ...
267 // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all.
268 // Testing for this is somewhat more complicated.
269 // Often used simple header; looks like:
270 // >NP0465 (NP0465) A description for this protein.
271 // We need to find two elements:
272 // - the accession String (easily retrieved as the next String until a space is encountered).
273 // - the description
274 >NP0465 (NP0465) A description for this protein.
275 GenBank gb|accession|locus
276 EMBL Data Library emb|accession|locus
277 DDBJ, DNA Database of Japan dbj|accession|locus
278 NBRF PIR pir||entry
279 Protein Research Foundation prf||name
280 SWISS-PROT sp|accession|entry name
281 Brookhaven Protein Data Bank pdb|entry|chain
282 Patents pat|country|number
283 GenInfo Backbone Id bbs|number
284 General database identifier gnl|database|identifier
285 NCBI Reference Sequence ref|accession|locus
286 Local Sequence identifier lcl|identifier
287 """
288
289
290 def getFastaAccession(header):
291 # TODO parse the ID and return the accession
292 # (format regex pattern, FROM_ID)
293 # TODO try to determine which type of accession ID we have by matching by regular expressions
294 # each regex match should have a groups()[0] that gives the accession
295 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
296 fmts = [
297 ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
298 ('>?gi\|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
299 ('NP_\d+\.\d+','ACC+ID'),
300
301 ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
302 ('(...*)_....*','ID'),
303 ('>?(UPI\d+).*','UPARC'),
304 ('>?UniRef50_(\w+).*','NF50'),
305 ('UniRef90_(\w+).*','NF90'),
306 ('UniRef100_(\w+).*','NF100'),
307 ('([A-C][A-Z]?\d{6})|([DM-Z]\d{5})','EMBL_ID'),
308 ('ENSP\d+','ENSEMBL_PRO_ID'),
309 ('ENST\d+','ENSEMBL_TRS_ID')
310 ]
311 for pat,cat in fmts:
312 m = re.match(pat,header)
313 if m:
314 return m.groups()[0]
315 return header
316
317 # rather than an array of input accessions, we need to put them in a dict() by type
318 def addAccession(id_dict,id_type,id):
319 for idtype in id_type:
320 if idtype not in id_dict:
321 id_dict[idtype] = []
322 id_dict[idtype].append(id)
323
324 # returns(accession, id_type)
325 def getAccession(header):
326 # (format regex pattern, FROM_ID)
327 # TODO try to determine which type of accession ID we have by matching by regular expressions
328 # each regex match should have a groups()[0] that gives the accession
329 # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
330 fmts = [
331 ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
332 ('.*_[A-Z]*','ID'),
333 ('UPI(\d+).*','UPARC'),
334 ('(UniRef50_\w+.*)','NF50'),
335 ('UniRef90_(\w+).*','NF90'),
336 ('(UniRef100_\w+.*)','NF100'),
337 ('a[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}','GENENAME'),
338
339 ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
340 ('K\d*','KO_ID'),
341
342 ('([A-Z]*\d*\.\d$)','EMBL'),
343 ('([IBC]\d{5})','PIR'),
344 ('(Hs\.\d*)','UNIGENE_ID'),
345 ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
346 ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
347 ('(\d[A-Z0-9]{3})','PDB_ID'),
348 ('(DP\d{5})','DISPROT_ID'),
349 ('(DIP-\d*N$)','DIP_ID'),
350 ('(MINT-\d*)','MINT_ID'),
351 ('(9606\.ENSP\d*)','STRING_ID'),
352 ('(CHEMBL\d*)','CHEMBL_ID'),
353 ('(DB\d*)','DRUGBANK_ID'),
354 ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
355
356 ('[A-Z]*_[A-Z]*','MYCOCLAP_ID'),
357
358 ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
359 ('\d{4}:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','WORLD_2DPAGE_ID'),
360 ('ENS.*?G\d*','ENSEMBL_ID'),
361 ('ENS.*?P\d*','ENSEMBL_PRO_ID'),
362 ('ENS.*?T\d*','ENSEMBL_TRS_ID'),
363 (' ','ENSEMBLGENOME_ID'), # placeholder pattern
364 (' ','ENSEMBLGENOME_PRO_ID'), # placeholder pattern
365 (' ','ENSEMBLGENOME_TRS_ID'), # placeholder pattern
366 ('(hsa:\d*)','KEGG_ID'),
367 ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
368 ('VEL:J:LDKJFS','VECTORBASE_ID'), # placeholder pattern
369 ('AS\d*','ARACHNOSERVER_ID'),
370 ('CAL\d*','CGD'),
371 ('Y[A-Z]{2}[0-9]{3}[cw].*|Q\d{4}|R\d{4}[wc]','CYGD_ID'),
372 ('DDB_G\d*','DICTYBASE_ID'),
373 ('EB\d{4}','ECHOBASE_ID'),
374 ('EG\d{5}','ECOGENE_ID'),
375 ('(?:HM|G[QU]|F[JMN]|E[FU]|DQ|A[A-Z])\d{6}|[DLMSUXYZ]\d{5}','EUHCVDB_ID'),
376 ('[A-Z][a-z]*DB:.*','EUPATHDB_ID'),
377 ('FBgn\d*','FLYBASE_ID'),
378 ('GC[A-Z0-9]{9}','GENECARDS_ID'),
379 ('(?:BSU\d|gbs|LIN|LMO|MUL_|MYPU_|pl[iu])\d{4}|MUP\d{3}c?','GENOLIST_ID'),
380 ('HIX\d*','H_INVDB_ID'),
381 ('HGNC:\d*','HGNC_ID'),
382 ('(?:CAB|HPA)\d{6}','HPA_ID'),
383 ('lp[lp]\d{4}','LEGIOLIST_ID'),
384 ('ML\d{4}','LEPROMA_ID'),
385 ('MGI:\d*','MGI_ID'),
386 ('NX_(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','NEXTPROT_ID'),
387 ('PA(?:\d{9}|\d{5}|\d{3})','PHARMGKB_ID'),
388 ('SP.*\.\d\dc?','POMBASE_ID'),
389 ('PA\d{4}','PSEUDOCAP_ID'),
390 ('S\d{9}','SGD_ID'),
391 ('AT.G\d{5}','TAIR_ID'),
392 ('Rv\d{4}c?','TUBERCULIST_ID'),
393 ('WBGene\d{8}','WORMBASE_ID'),
394 ('(\dR(SSE|\d\d?)|A[CH]\d\d?|[BC]\d\d[A-Z]\d)\.\d.[0-9a-z]?|CBG\d{5}|C[CD][48]\.\d.[a-z]?|CE7X_3\.1|cTel5\dX\.1[ab]?|D\d{4}\.\d.[a-z]?','WORMBASE_ID'),
395 ('C(BP|E)\d{5}','WORMBASE_PRO_ID'),
396 ('XB-GENE-\d*','XENBASE_ID'),
397 ('ZDB-GENE-\d{6}-\d{4}','ZFIN_ID'),
398
399
400 ('(.*[CKN]OG\d*)','EGGNOG_ID'),
401 ('(ENSGT\d*)','GENETREE_ID'),
402 ('HOG\d*','HOGENOM_ID'),
403 ('HBG\d*','HOVERGEN_ID'),
404
405 ('[A-Z]{7}','OMA_ID'),
406
407 ('EOG\d*','ORTHODB_ID'),
408 ('asdfsadfasdf','PROTCLUSTDB_ID'), # placeholder pattern
409 ('TF\d*','TREEFAM_ID'),
410 ('REACT_\d*','REACTOME_ID'),
411 ('asdfasl;DF:LJk','UNIPATHWAY_ID'), # placeholder pattern
412 ('HS_\d*','CLEANEX_ID'),
413 ('SAMEAS GENE NAME','CHITARS_ID'), # placeholder: ChiTaRS IDs are gene names
414 ('GENENAME_\(gene\)','GENEWIKI_ID'),
415 ]
416 # remove the need for .groups() (i.e. parentheses)
417 if re.match('\d+$', header): # For ambiguous, number-only ID types
418 numIDtypes = [
419 'P_ENTREZGENEID',
420 'P_GI',
421 'DMDM_ID',
422 'BIOGRID_ID',
423 'GUIDETOPHARMACOLOGY_ID',
424 'ALLERGOME_ID',
425 'PEROXIBASE_ID',
426 'REBASE_ID',
427 'DNASU_ID',
428 'GENOMERNAI_ID',
429 'NEXTBIO_ID',
430 'CONOSERVER_ID',
431 'GENEFARM_ID',
432 'MAIZEGDB_ID',
433 'MIM_ID',
434 'MGI_ID',
435 'ORPHANET_ID',
436 'RGD_ID']
437 ambiguous = []
438 for numIDs in numIDtypes:
439 nm = getIdMapping([header],numIDs,'ACC',ambiguity=True)
440 if nm != None:
441 ambiguous.append(nm)
442 if ambiguous == []:
443 ambiguous.append('No Match')
444 return (header, ambiguous)
445 for pat,cat in fmts:
446 m = re.match(pat,header)
447 if m:
448 #return (m.groups()[0],[cat])
449 return (header,[cat])
450 return (header,['ACC+ID'])
451
452
453 def read_tabular(filepath,col):
454 accessions = []
455 with open(filepath) as fp:
456 for i,line in enumerate(fp):
457 if line.strip() == '' or line.startswith('#'):
458 continue
459 fields = line.rstrip('\n').split('\t')
460 accession = fields[col]
461 accessions.append(accession)
462 return accessions
463
464 def get_fasta_entries(fp):
465 name, seq = None, []
466 for line in fp:
467 line = line.rstrip()
468 if line.startswith(">"):
469 if name: yield (name, ''.join(seq))
470 name, seq = line, []
471 else:
472 seq.append(line)
473 if name: yield (name, ''.join(seq))
474
475 def read_fasta(filepath):
476 accessions = []
477 with open(filepath) as fp:
478 for id, peptide in get_fasta_entries(fp):
479 accessions.append(getFastaAccession(id))
480 return accessions
481
482 def read_mzid(fp):
483 accessions = []
484 for event, elem in ET.iterparse(fp):
485 if event == 'end':
486 if re.search('DBSequence',elem.tag):
487 accessions.append(elem.attrib['accession'])
488 return accessions
489
490 def read_pepxml(fp):
491 accessions = []
492 for event, elem in ET.iterparse(fp):
493 if event == 'end':
494 if re.search('search_hit',elem.tag):
495 accessions.append(elem.get('protein'))
496 return accessions
497
498 def getUniprotSequence(uniprot_id):
499 url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
500 print url
501 fp = urllib2.urlopen(url)
502 for id, seq in get_fasta_entries(fp):
503 if seq:
504 return seq
505 return ''
506
507
508
509
510 def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,ambiguity=None,crossReference=None,idMaps=None):
511 # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
512 if not accessions:
513 return
514 url = 'http://www.uniprot.org/mapping/'
515 # Cross-referencing: mapping a non-UniProtKB ID to another non-UniProtKB ID is done in two hops through UniProtKB accessions ('ACC')
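# For example (IDs here are only illustrative): asked for PDB_ID -> ENSEMBL_PRO_ID,
# the block below first maps the PDB entry to a UniProtKB accession (PDB_ID -> ACC)
# and then maps that accession on to the requested type (ACC -> ENSEMBL_PRO_ID),
# passing the original PDB ID in via the crossReference argument so the final
# idMap stays keyed by the user's input ID rather than by the intermediate accession.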
516 if to_id != 'ACC' and from_id != 'ACC':
517 crMap = {}
518 getIdMapping(accessions,from_id,'ACC',idMap=crMap)
519 crMap2 = crMap.copy()
520 for x in crMap.keys():
521 for y in crMap[x].keys():
522 for z in crMap[x][y]:
523 crMap2[x][y] = []
524 getIdMapping([z],'ACC',to_id,idMap=crMap2,crossReference=[x,from_id])
525 idMap.update(crMap2)
526 return
527 params = {
528 'from': from_id,
529 'to': to_id,
530 'format':'tab',
531 'query': '\n'.join(accessions)
532 }
533 data = urllib.urlencode(params)
534 request = urllib2.Request(url, data)
535 contact = "" # Please set your email address here to help us debug in case of problems.
536 request.add_header('User-Agent', 'Python %s' % contact)
537 response = None
538 for i in range(3):
539 try:
540 response = urllib2.urlopen(request)
541 break
542 except Exception, e:
543 warn_err("%s\n" % e, exit_code=None)
544 if response:
545 response.next() # skip the header line of the tab-separated reply
546 print params
547
548 for line in response:
549 # print >> sys.stderr, "idMap: %s" % line
550 if fh:
551 fh.write(line)
552 if ambiguity: # if there was a response, then an ambiguous match can be made
553 return from_id
554 if idMap != None:
555 id1,id2 = line.strip().split('\t')
556 print id2
557 if crossReference != None:
558 id1, from_id = crossReference
559 # print >> sys.stderr, "idMap: %s:%s" % (id1,id2)
560 try:
561 idMap[id1][from_id].append(id2)
562 except:
563 idMap[id1] = {from_id:[id2]}
564 if ambiguity == None:
565 for line in response:
566 for acc in accessions:
567 idMap[acc] = {from_id:['N/A']}
568 return
569
570
571 #Parse Command Line
572 parser = optparse.OptionParser()
573 # input files
574 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a column of protein IDs/accessions' )
575 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) of the tabular file that contains the IDs/accessions' )
576 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file; IDs/accessions are parsed from the sequence headers' )
577 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='An mzIdentML file; protein accessions are read from its DBSequence elements' )
578 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepXML file; protein accessions are read from its search_hit elements' )
579 # Decoy pattern
580 parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs, e.g. _REVERSED' )
581 # filter patterns
582 parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
583 parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
584 # UniProt ID mapping options
585 parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
586 parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
587 # output files
588 parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
589 # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
590 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
591 (options, args) = parser.parse_args()
592 if options.version:
593 print >> sys.stdout,"%s" % version
594 sys.exit(0)
595
596 accessions = []
597 ## Get accessions
598 if options.mzid:
599 accessions += read_mzid(options.mzid)
600 if options.pepxml:
601 accessions += read_pepxml(options.pepxml)
602 if options.tabular:
603 accessions += read_tabular(options.tabular,options.column)
604 if options.fasta:
605 accessions += read_fasta(options.fasta)
606 if args and len(args) > 0:
607 for i,accession in enumerate(args):
608 accessions.append(accession)
609 # filter accessions
610 if options.decoy:
611 filtered_accs = [re.sub(options.decoy,'',x) for x in accessions]
612 accessions = filtered_accs
613 if options.include_pat:
614 filtered_accs = []
615 for acc in accessions:
616 if re.match(options.include_pat,acc):
617 filtered_accs.append(acc)
618 accessions = filtered_accs
619 if options.exclude_pat:
620 filtered_accs = []
621 for acc in accessions:
622 if not re.match(options.exclude_pat,acc):
623 filtered_accs.append(acc)
624 accessions = filtered_accs
625 if len(accessions) < 1:
626 warn_err("No accessions input!",exit_code=1)
627 if options.output != None:
628 try:
629 outputPath = os.path.abspath(options.output)
630 outputFile = open(outputPath, 'w')
631 except Exception, e:
632 print >> sys.stderr, "failed: %s" % e
633 exit(3)
634 else:
635 outputFile = sys.stdout
636
637
638 # Removes duplicates in accessions
639 seen = set()
640 seen_add = seen.add
641 accessions = [x for x in accessions if not (x in seen or seen_add(x))]
642 # Sort accessions into a dictionary keyed by inferred ID type
643 id_dict = {}
644 for header in accessions:
645 id , id_types = getAccession(header)
646 addAccession(id_dict,id_types,id)
647 idMaps = dict() # {to_id,idMap}
648 for to_id in options.to_id:
649 idMap = dict()
650 idMaps[to_id] = idMap
651 for (from_id,ids) in id_dict.items():
652 # limit the requests to 500 at a time
653 idx = range(0,len(ids),500)
654 idx.append(len(ids))
655 for i in range(len(idx)-1):
656 getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps)
657 print ids
658 #Write output
659 #Output Table Header
660 outputFile.write("\n#%-17s" % (options.from_id))
661 for t in options.to_id:
662 outputFile.write("%-18s" % t)
663 outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")
664
665 # Create an output-friendly matrix
666 idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))]
667 for a, acc in enumerate(accessions):
668 idLength = 0
669 for i, to_id in enumerate(options.to_id): # [[ids],[ids]]
670 idArrayColumn = []
671 idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3]
672 for from_id in idPairs:
673 ids = idPairs[from_id]
674 for id_ in ids:
675 idArrayColumn.append("%s[%s]" % (id_,from_id))
676 if idLength < len(idArrayColumn):
677 idLength = len(idArrayColumn)
678 idArray[a][i].extend(idArrayColumn)
679 for y in range(idLength):
680 outputFile.write("%-18s" % acc)
681 for x in range(len(options.to_id)):
682 try:
683 outputFile.write("%-18s" % idArray[a][x][y])
684 except:
685 outputFile.write(" " * 18)
686 outputFile.write("\n")
687 # Output Matrix
688
689
690
691 #print idMaps
692 # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
693 stop = timeit.default_timer()
694 outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))
695
696 if __name__ == "__main__" : __main__()
697