Mercurial > repos > rsajulga > uniprot_id_mapper
changeset 1:a9a1b182bc6d draft
Deleted selected files
| author | rsajulga | 
|---|---|
| date | Thu, 16 Jul 2015 16:30:25 -0400 | 
| parents | 4961ddd2f782 | 
| children | d0311668d442 | 
| files | uniprot_id_mapping.py | 
| diffstat | 1 files changed, 0 insertions(+), 697 deletions(-) [+] | 
line wrap: on
 line diff
--- a/uniprot_id_mapping.py Thu Jul 16 16:28:42 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,697 +0,0 @@ -#!/usr/bin/env python -""" -# -#------------------------------------------------------------------------------ -# University of Minnesota -# Copyright 2015, Regents of the University of Minnesota -#------------------------------------------------------------------------------ -# Author: -# -# James E Johnson -# -#------------------------------------------------------------------------------ -""" -import json -import logging -import optparse -from optparse import OptionParser -import os -import sys -import re -import urllib -import urllib2 -import timeit -start = timeit.default_timer() -try: - import xml.etree.cElementTree as ET -except ImportError: - import xml.etree.ElementTree as ET - -"""print -http://www.uniprot.org/help/programmatic_access#id_mapping_examples -http://www.uniprot.org/help/uploadlists - -From input files or stdin get accessions/IDs to map. -An input option can specify what id_type the input accessions/IDs are, -otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type - -input_dict = dict() -for id_string in input_ids: - (id,id_type) = getAccession(id_string) - if not id_type in input_dict: - input_dict[id_type] = [] - input_dict[id_type].append(id) - - -# We need to retrieve by from_id_type and to_id_type -for id_type in input_dict: - for - - -tabular output: -#input_id to_id to_id - - -""" -def warn_err(msg,exit_code=1): - sys.stderr.write(msg) - if exit_code: - sys.exit(exit_code) - -def __main__(): - version = '1.0' - idDict = { - #Category:UniProt - 'ACC+ID':'UniProtKB AC/ID', - 'ACC':'UniProtKB AC', - 'ID':'UniProtKB ID', - 'UPARC':'UniParc', - 'NF50':'UniRef50', - 'NF90':'UniRef90', - 'NF100':'UniRef100', - 'GENENAME':'Gene name', - #Category:Other sequence databases - 'EMBL_ID':'EMBL/GenBank/DDBJ', - 'EMBL':'EMBL/GenBank/DDBJ CDS', - 'PIR':'PIR', - 'UNIGENE_ID':'UniGene', - 'P_ENTREZGENEID':'Entrez Gene (GeneID)', - 'P_GI':'GI number*', - 'P_REFSEQ_AC':'RefSeq Protein', - 'REFSEQ_NT_ID':'RefSeq Nucleotide', - #Category:3D structure databases - 'PDB_ID':'PDB', - 'DISPROT_ID':'DisProt', - #Category:Protein-protein interaction databases - 'BIOGRID_ID':'BioGrid', - 'DIP_ID':'DIP', - 'MINT_ID':'MINT', - 'STRING_ID':'STRING', - #Category:Chemistry - 'CHEMBL_ID':'ChEMBL', - 'DRUGBANK_ID':'DrugBank', - 'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY', - #Category:Protein family/group databases - 'ALLERGOME_ID':'Allergome', - 'MEROPS_ID':'MEROPS', - 'MYCOCLAP_ID':'mycoCLAP', - 'PEROXIBASE_ID':'PeroxiBase', - 'REBASE_ID':'REBASE', - 'TCDB_ID':'TCDB', - #Category:Polymorphism databases - 'DMDM_ID':'DMDM', - #Category:2D gel databases - 'WORLD_2DPAGE_ID':'World-2DPAGE', - #Category:Protocols and materials databases - 'DNASU_ID':'DNASU', - #Category:Genome annotation databases - 'ENSEMBL_ID':'Ensembl', - 'ENSEMBL_PRO_ID':'Ensembl Protein', - 'ENSEMBL_TRS_ID':'Ensembl Transcript', - 'ENSEMBLGENOME_ID':'Ensembl Genomes', - 'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein', - 'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript', - 'P_ENTREZGENEID':'GeneID', - 'KEGG_ID':'KEGG', - 'PATRIC_ID':'PATRIC', - 'UCSC_ID':'UCSC', - 'VECTORBASE_ID':'VectorBase', - #Category:Organism-specific gene databases - 'ARACHNOSERVER_ID':'ArachnoServer', - 'CGD':'CGD', - 'CONOSERVER_ID':'ConoServer', - 'CYGD_ID':'CYGD', - 'DICTYBASE_ID':'dictyBase', - 'ECHOBASE_ID':'EchoBASE', - 'ECOGENE_ID':'EcoGene', - 'EUHCVDB_ID':'euHCVdb', - 'EUPATHDB_ID':'EuPathDB', - 'FLYBASE_ID':'FlyBase', - 'GENECARDS_ID':'GeneCards', - 'GENEFARM_ID':'GeneFarm', - 'GENOLIST_ID':'GenoList', - 'H_INVDB_ID':'H-InvDB', - 'HGNC_ID':'HGNC', - 'HPA_ID':'HPA', - 'LEGIOLIST_ID':'LegioList', - 'LEPROMA_ID':'Leproma', - 'MAIZEGDB_ID':'MaizeGDB', - 'MIM_ID':'MIM', - 'MGI_ID':'MGI', - 'NEXTPROT_ID':'neXtProt', - 'ORPHANET_ID':'Orphanet', - 'PHARMGKB_ID':'PharmGKB', - 'POMBASE_ID':'PomBase', - 'PSEUDOCAP_ID':'PseudoCAP', - 'RGD_ID':'RGD', - 'SGD_ID':'SGD', - 'TAIR_ID':'TAIR', - 'TUBERCULIST_ID':'TubercuList', - 'WORMBASE_ID':'WormBase', - 'WORMBASE_TRS_ID':'WormBase Transcript', - 'WORMBASE_PRO_ID':'WormBase Protein', - 'XENBASE_ID':'Xenbase', - 'ZFIN_ID':'ZFIN', - #Category:Phylogenomic databases - 'EGGNOG_ID':'eggNOG', - 'GENETREE_ID':'GeneTree', - 'HOGENOM_ID':'HOGENOM', - 'HOVERGEN_ID':'HOVERGEN', - 'KO_ID':'KO', - 'OMA_ID':'OMA', - 'ORTHODB_ID':'OrthoDB', - 'PROTCLUSTDB_ID':'ProtClustDB', - 'TREEFAM_ID':'TreeFam', - #Category:Enzyme and pathway databases - 'BIOCYC_ID':'BioCyc', - 'REACTOME_ID':'Reactome', - 'UNIPATHWAY_ID':'UniPathWay', - #Category:Gene expression databases - 'CLEANEX_ID':'CleanEx', - #Category:Other - 'CHITARS_ID':'ChiTaRS', - 'GENOMERNAI_ID':'GenomeRNAi', - 'GENEWIKI_ID':'GeneWiki', - 'NEXTBIO_ID':'NextBio' - } - - """ - TODO: - Would be better to be able to infer the from_id type for the input accession/ID especially for a fasta file - - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession. - - # rather than an array of input accessions, we need to put them in a dict() by type - def addAccession(id_dict,id_type,id): - if id_type not in id_dict: - id_dict[id_type] = []-++ - id_dict[id_type] = [].append(id) - - # returns(accession, id_type) - def getAccession(header, matchType): - # (format regex pattern, FROM_ID) - # TODO try to determine which type of accession ID we have by matching by regular expressions - # each regex match should have a groups[0] that given the accession - # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples - fmts = [ - ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), - ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), - ('>/UniRef\d+_(\w+).*','ACC+ID'), - ('>/(UPI\d+).*','ACC+ID'), - ('NP_\d+\.\d+','ACC+ID'), - ('ENSP\d+','ENSEMBL_PRO_ID'), - ('ENST\d+','ENSEMBL_TRS_ID'), - ] - for pat,cat in fmts: - m = re.match(pat,header) - if m: - matchType.append(cat) - return (m.groups()[0],cat) - matchType.append('ACC+ID') - return (header,'ACC+ID') - - # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types - idMaps = dict() # {to_id,idMap} - for to_id in options.to_id: - idMap = dict() - idMaps[to_id] = idMap - for (from_id,ids) id_dict.items(): - # limit the requests to 500 at a time - idx = range(0,len(ids),500) - idx.append(len(ids)) - for i in range(len(idx)-1): - getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap) - """ - - """ - Some examples of fasta ID lines From: - https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java ->sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ... ->gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ... ->IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr ->HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein. ->OExyz (OExyz) xxx xxx xxx ->hflu_lsi_xxxx xxx xxx xxx ->C_tr_Lx_x [xxx - xxx] | xxx xxx ->M. tub.xxx|Rvxxx| xxx xxx -// Drosophile DB. -// We need to find two elements: -// - the accession String (retrieved as the trimmed version of everything -// up to (and NOT including) " pep:" -// - the description (everything (trimmed) starting from (and including) the " pep:". ->CGxxxxxx pep:xxxxxx ->xxxx xxx SGDID:xxxx xxx ->generic_some_tag|proten_accession|a description for this protein -// Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as -// present in the Expasy FTP FASTA file. -// Is formatted something like this: ->XXX_YYYY (acc) rest ->sp|accession|ID descr rest (including taxonomy, if available ->tr|accession|ID descr rest (including taxonomy, if available) ->nxp|NX_P02768-1|ALB|Serum albumin|Iso -// New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as -// present in the Expasy FTP FASTA file. -// Is formatted something like this: ->accession|ID descr rest (including taxonomy, if available) -// Flybase FASTA format. ->FBxxx type=xxx -// A header translating a genome sequence into a protein sequences. -// We need to find two elements, separated by a space: -// - the accession string (retrieved as the first part of a space delimited String). -// - the nucleic acid start and stop site (between brackets, separated by a '-'). ->dm345_3L-sense [234353534-234353938] ->dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database ->synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF -// The Arabidopsis thaliana database; TAIR format ->AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166 -... -// Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all. -// Testing for this is somewhat more complicated. -// Often used simple header; looks like: -// >NP0465 (NP0465) A description for this protein. -// We need to find two elements: -// - the accession String (easily retrieved as the next String until a space is encountered). -// - the description ->NP0465 (NP0465) A description for this protein. - GenBank gb|accession|locus - EMBL Data Library emb|accession|locus - DDBJ, DNA Database of Japan dbj|accession|locus - NBRF PIR pir||entry - Protein Research Foundation prf||name - SWISS-PROT sp|accession|entry name - Brookhaven Protein Data Bank pdb|entry|chain - Patents pat|country|number - GenInfo Backbone Id bbs|number - General database identifier gnl|database|identifier - NCBI Reference Sequence ref|accession|locus - Local Sequence identifier lcl|identifier - """ - - - def getFastaAccession(header): - # TODO parse the ID and return the accession - # (format regex pattern, FROM_ID) - # TODO try to determine which type of accession ID we have by matching by reg|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})ular expressions - # each regex match should have a groups[0] that given the accession - # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples - fmts = [ - ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), - ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'), - ('NP_\d+\.\d+','ACC+ID'), - - ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), - ('(...*)_....*','ID'), - ('>/(UPI\d+).*','UPARC'), - ('>/UniRef50_(\w+).*','NF50'), - ('UniRef90_(\w+).*','NF90'), - ('UniRef100_(\w+).*','NF100'), - ('([A-C][A-Z]?\d{6})|(DM-Z)\d{5})','EMBL_ID'), - ('ENSP\d+','ENSEMBL_PRO_ID'), - ('ENST\d+','ENSEMBL_TRS_ID') - ] - for pat,cat in fmts: - m = re.match(pat,header) - if m: - return m.groups()[0] - return header - - # rather than an array of input accessions, we need to put them in a dict() by type - def addAccession(id_dict,id_type,id): - for idtype in id_type: - if idtype not in id_dict: - id_dict[idtype] = [] - id_dict[idtype].append(id) - - # returns(accession, id_type) - def getAccession(header): - # (format regex pattern, FROM_ID) - # TODO try to determine which type of accession ID we have by matching by regular expressions - # each regex match should have a groups[0] that given the accession - # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples - fmts = [ - ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), - ('.*_[A-Z]*','ID'), - ('UPI(\d+).*','UPARC'), - ('(UniRef50_\w+.*)','NF50'), - ('UniRef90_(\w+).*','NF90'), - ('(UniRef100_\w+.*)','NF100'), - ('a[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}','GENENAME'), - - ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'), - ('K\d*','KO_ID'), - - ('([A-Z]*\d*\.\d$)','EMBL'), - ('([IBC]\d{5})','PIR'), - ('(Hs\.\d*)','UNIGENE_ID'), - ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'), - ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'), - ('(\d[A-Z0-9]{3})','PDB_ID'), - ('(DP\d{5})','DISPROT_ID'), - ('(DIP-\d*N$)','DIP_ID'), - ('(MINT-\d*)','MINT_ID'), - ('(9606\.ENSP\d*)','STRING_ID'), - ('(CHEMBL\d*)','CHEMBL_ID'), - ('(DB\d*)','DRUGBANK_ID'), - ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'), - - ('[A-Z]*_[A-Z]*','MYCOCLAP_ID'), - - ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'), - ('\d{4}:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','WORLD_2DPAGE_ID'), - ('ENS.?*G\d*','ENSEMBL_ID'), - ('ENS.?*P\d*','ENSEMBL_PRO_ID'), - ('ENS.?*T\d*','ENSEMBL_TRS_ID'), - (' ','ENSEMBLGENOME_ID'), - (' ','ENSEMBLGENOME_PRO_ID'), - (' ','ENSEMBLGENOME_TRS_ID'), - ('(hsa:\d*)','KEGG_ID'), - ('(uc\d*[a-z]*\.\d$)','UCSC_ID'), - ('VEL:J:LDKJFS','VECTORBASE_ID'), - ('AS\d*','ARACHNOSERVER_ID'), - ('CAL\d*','CGD'), - ('Y[A-Z]{2}[0-9]{3}[cw].*|Q\d{4}|R\d{4}[wc]','CYGD_ID'), - ('DDB_G\d*','DICTYBASE_ID'), - ('EB\d{4}','ECHOBASE_ID'), - ('EG\d{5}','ECOGENE_ID'), - ('(?:HM|G[QU]|F[JMN]|E[FU]|DQ|A[A-Z])\d{6}|[DLMSUXYZ]\d{5}','EUHCVDB_ID'), - ('[A-Z][a-z]*DB:.*','EUPATHDB_ID'), - ('FBgn\d*','FLYBASE_ID'), - ('GC[A-Z0-9]{9}','GENECARDS_ID'), - ('(?:BSU\d|gbs|LIN|LMO|MUL_|MYPU_|pl[iu])\d{4}|MUP\d{3}c?','GENOLIST_ID'), - ('HIX\d*','H_INVDB_ID'), - ('HGNC:\d*','HGNC_ID'), - ('(?:CAB|HPA)\d{6}','HPA_ID'), - ('lp[lp]\d{4}','LEGIOLIST_ID'), - ('ML\d{4}','LEPROMA_ID'), - ('MGI:\d*','MGI_ID'), - ('NX_(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','NEXTPROT_ID'), - ('PA\d{3,5,9}','PHARMGKB_ID'), - ('SP.*\.\d\dc?,'POMBASE_ID'), - ('PA\d{4}','PSEUDOCAP_ID'), - ('S\d{9}','SGD_ID'), - ('AT.G\d{5}','TAIR_ID'), - ('Rv\d{4}c?','TUBERCULIST_ID'), - ('WBGene\d{8}','WORMBASE_ID'), - ('(\dR(SSE|\d\d?)|A[CH]\d\d?|[BC]\d\d[A-Z]\d)\.\d.[0-9a-z]?|CBG\d{5}|C[CD][48]\.\d.[a-z]?|CE7X_3\.1|cTel5\dX\.1[ab]?|D\d{4}\.\d.[a-z]?,'WORMBASE_ID'), - ('C(BP|E)\d{5}','WORMBASE_PRO_ID'), - ('XB-GENE-\d*','XENBASE_ID'), - ('ZDB-GENE-\d{6}-\d{4}','ZFIN_ID'), - - - ('(.*[CKN]OG\d*)','EGGNOG_ID'), - ('(ENSGT\d*)','GENETREE_ID'), - ('HOG\d*','HOGENOM_ID'), - ('HBG\d*','HOVERGEN_ID'), - - ('[A-Z]{7}','OMA_ID'), - - ('EOG\d*','ORTHODB_ID'), - ('asdfsadfasdf','PROTCLUSTDB_ID'), - ('TF\d*','TREEFAM_ID'), - ('REACT_\d*','REACTOME_ID'), - ('asdfasl;DF:LJk','UNIPATHWAY_ID'), - ('HS_\d*','CLEANEX_ID'), - ('SAMEAS GENE NAME','CHITARS_ID'), - ('GENENAME_\(gene\)','GENEWIKI_ID'), - ] - # remove the need for .groups() (i.e. parantheses) - if re.match('\d*$', header): # For ambiguous number only ID types - numIDtypes = [ - 'P_ENTREZGENEID', - 'P_GI', - 'DMDM_ID', - 'BIOGRID_ID', - 'GUIDETOPHARMACOLOGY_ID', - 'ALLERGOME_ID', - 'PEROXIBASE_ID', - 'REBASE_ID', - 'DNASU_ID', - 'GENOMERNAI_ID', - 'NEXTBIO_ID', - 'CONOSERVER_ID', - 'GENEFARM_ID', - 'MAIZEGDB_ID', - 'MIM_ID', - 'MGI_ID', - 'ORPHANET_ID', - 'RGD_ID'] - ambiguous = [] - for numIDs in numIDtypes: - nm = getIdMapping([header],numIDs,'ACC',ambiguity=True) - if nm != None: - ambiguous.append(nm) - if ambiguous == []: - ambiguous.append('No Match') - return (header, ambiguous) - for pat,cat in fmts: - m = re.match(pat,header) - if m: - #return (m.groups()[0],[cat]) - return (header,[cat]) - return (header,['ACC+ID']) - - - def read_tabular(filepath,col): - accessions = [] - with open(filepath) as fp: - for i,line in enumerate(fp): - if line.strip() == '' or line.startswith('#'): - continue - fields = line.rstrip('\n').split('\t') - accession = fields[col] - accessions.append(accession) - return accessions - - def get_fasta_entries(fp): - name, seq = None, [] - for line in fp: - line = line.rstrip() - if line.startswith(">"): - if name: yield (name, ''.join(seq)) - name, seq = line, [] - else: - seq.append(line) - if name: yield (name, ''.join(seq)) - - def read_fasta(filepath): - accessions = [] - with open(filepath) as fp: - for id, peptide in get_fasta_entries(fp): - accessions.append(getFastaAccession(id)) - return accessions - - def read_mzid(fp): - accessions = [] - for event, elem in ET.iterparse(fp): - if event == 'end': - if re.search('DBSequence',elem.tag): - accessions.append(elem.attrib['accession']) - return accessions - - def read_pepxml(fp): - accessions = [] - for event, elem in ET.iterparse(fp): - if event == 'end': - if re.search('search_hit',elem.tag): - accessions.append(elem.get('protein')) - return accessions - - def getUniprotSequence(uniprot_id): - url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id - print url - fp = urllib2.urlopen(url) - for id, seq in get_fasta_entries(fp): - if seq: - return seq - return '' - - - - - def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,ambiguity=None,crossReference=None,idMaps=None): - # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)])) - if not accessions: - return - url = 'http://www.uniprot.org/mapping/' - # Cross Referencing: Mapping to non-UniprotKB ('ACC') IDs to other non-UniprotKB ('ACC') IDs - if to_id != 'ACC' and from_id != 'ACC': - crMap = {} - getIdMapping(accessions,from_id,'ACC',idMap=crMap) - crMap2 = crMap.copy() - for x in crMap.keys(): - for y in crMap[x].keys(): - for z in crMap[x][y]: - crMap2[x][y] = [] - getIdMapping([z],'ACC',to_id,idMap=crMap2,crossReference=[x,from_id]) - idMap.update(crMap2) - return - params = { - 'from': from_id, - 'to': to_id, - 'format':'tab', - 'query': '\n'.join(accessions) - } - data = urllib.urlencode(params) - request = urllib2.Request(url, data) - contact = "" # Please set your email address here to help us debug in case of problems. - request.add_header('User-Agent', 'Python %s' % contact) - response = None - for i in range(3): - try: - response = urllib2.urlopen(request) - break - except Exception, e: - warn_err("%s",exit_code=None) - if response: - response.next() - print params - - for line in response: - # print >> sys.stderr, "idMap: %s" % line - if fh: - fh.write(line) - if ambiguity: # if there was a response, then an ambiguous match can be made - return from_id - if idMap != None: - id1,id2 = line.strip().split('\t') - print id2 - if crossReference != None: - id1, from_id = crossReference - # print >> sys.stderr, "idMap: %s:%s" % (id1,id2) - try: - idMap[id1][from_id].append(id2) - except: - idMap[id1] = {from_id:[id2]} - if ambiguity == None: - for line in response: - for acc in accessions: - idMap[acc] = {from_id:['N/A']} - return - - - #Parse Command Line - parser = optparse.OptionParser() - # input files - parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) - parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) - parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) - parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) - parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) - # Decoy pattern - parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' ) - # filter patterns - parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' ) - parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' ) - # Unipept Flags - parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' ) - parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' ) - # output files - parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values') - # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' ) - parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' ) - (options, args) = parser.parse_args() - if options.version: - print >> sys.stdout,"%s" % version - sys.exit(0) - - accessions = [] - ## Get accessions - if options.mzid: - accessions += read_mzid(options.mzid) - if options.pepxml: - accessions += read_pepxml(options.pepxml) - if options.tabular: - accessions += read_tabular(options.tabular,options.column) - if options.fasta: - accessions += read_fasta(options.fasta) - if args and len(args) > 0: - for i,accession in enumerate(args): - accessions.append(accession) - # filter accessions - if options.decoy: - filtered_accs = [re.sub(options.decoy,'',x) for x in accessions] - accessions = filtered_accs - if options.include_pat: - filtered_accs = [] - for acc in accessions: - if re.match(options.include_pat,acc): - filtered_accs.append(acc) - accessions = filtered_accs - if options.exclude_pat: - filtered_accs = [] - for acc in accessions: - if not re.match(options.exclude_pat,acc): - filtered_accs.append(acc) - accessions = filtered_accs - if len(accessions) < 1: - warn_err("No accessions input!",exit_code=1) - if options.output != None: - try: - outputPath = os.path.abspath(options.output) - outputFile = open(outputPath, 'w') - except Exception, e: - print >> sys.stderr, "failed: %s" % e - exit(3) - else: - outputFile = sys.stdout - - - # Removes duplicates in accessions - seen = set() - seen_add = seen.add - accessions = [x for x in accessions if not (x in seen or seen_add(x))] - # Sorts accessions to inferred ID types i+n a dictionary - id_dict = {} - for header in accessions: - id , id_types = getAccession(header) - addAccession(id_dict,id_types,id) - idMaps = dict() # {to_id,idMap} - for to_id in options.to_id: - idMap = dict() - idMaps[to_id] = idMap - for (from_id,ids) in id_dict.items(): - # limit the requests to 500 at a time - idx = range(0,len(ids),500) - idx.append(len(ids)) - for i in range(len(idx)-1): - getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps) - print ids -#Write output - #Output Table Header - outputFile.write("\n#%-17s" % (options.from_id)) - for t in options.to_id: - outputFile.write("%-18s" % t) - outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n") - - # Create an output-friendly matrix - idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))] - for a, acc in enumerate(accessions): - idLength = 0 - for i, to_id in enumerate(options.to_id): # [[ids],[ids]] - idArrayColumn = [] - idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3] - for from_id in idPairs: - ids = idPairs[from_id] - for id_ in ids: - idArrayColumn.append("%s[%s]" % (id_,from_id)) - if idLength < len(idArrayColumn): - idLength = len(idArrayColumn) - idArray[a][i].extend(idArrayColumn) - for y in range(idLength): - outputFile.write("%-18s" % acc) - for x in range(len(options.to_id)): - try: - outputFile.write("%-18s" % idArray[a][x][y]) - except: - outputFile.write(" " * 18) - outputFile.write("\n") - # Output Matrix - - - - #print idMaps - # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv - stop = timeit.default_timer() - outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds")) - -if __name__ == "__main__" : __main__() -
