Mercurial > repos > rsajulga > uniprot_id_mapper

--- a/uniprot_id_mapping.py	Thu Jul 16 16:28:42 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,697 +0,0 @@
-#!/usr/bin/env python
-"""
-#
-#------------------------------------------------------------------------------
-#                         University of Minnesota
-#         Copyright 2015, Regents of the University of Minnesota
-#------------------------------------------------------------------------------
-# Author:
-#
-#  James E Johnson
-#
-#------------------------------------------------------------------------------
-"""
-import json
-import logging
-import optparse
-from optparse import OptionParser
-import os
-import sys
-import re
-import urllib
-import urllib2
-import timeit
-start = timeit.default_timer()
-try:
-    import xml.etree.cElementTree as ET
-except ImportError:
-    import xml.etree.ElementTree as ET
-
-"""print
-http://www.uniprot.org/help/programmatic_access#id_mapping_examples
-http://www.uniprot.org/help/uploadlists
-
-From input files or stdin get accessions/IDs to map.
-An input option can specify what id_type the input accessions/IDs are,
-otherwise, we should attempt to categorize the input accessions/IDs according to their inferred id_type
-
-input_dict = dict()
-for id_string in input_ids:
-  (id,id_type) = getAccession(id_string)
-  if not id_type in input_dict:
-    input_dict[id_type] = []
-  input_dict[id_type].append(id)
-
-
-# We need to retrieve by from_id_type and to_id_type
-for id_type in input_dict:
-  for
-
-
-tabular output:
-#input_id	to_id	to_id
-
-
-"""
-def warn_err(msg,exit_code=1):
-    sys.stderr.write(msg)
-    if exit_code:
-      sys.exit(exit_code)
-
-def __main__():
-  version = '1.0'
-  idDict = {
-    #Category:UniProt
-    'ACC+ID':'UniProtKB AC/ID',
-    'ACC':'UniProtKB AC',
-    'ID':'UniProtKB ID',
-    'UPARC':'UniParc',
-    'NF50':'UniRef50',
-    'NF90':'UniRef90',
-    'NF100':'UniRef100',
-    'GENENAME':'Gene name',
-    #Category:Other sequence databases
-    'EMBL_ID':'EMBL/GenBank/DDBJ',
-    'EMBL':'EMBL/GenBank/DDBJ CDS',
-    'PIR':'PIR',
-    'UNIGENE_ID':'UniGene',
-    'P_ENTREZGENEID':'Entrez Gene (GeneID)',
-    'P_GI':'GI number*',
-    'P_REFSEQ_AC':'RefSeq Protein',
-    'REFSEQ_NT_ID':'RefSeq Nucleotide',
-    #Category:3D structure databases
-    'PDB_ID':'PDB',
-    'DISPROT_ID':'DisProt',
-    #Category:Protein-protein interaction databases
-    'BIOGRID_ID':'BioGrid',
-    'DIP_ID':'DIP',
-    'MINT_ID':'MINT',
-    'STRING_ID':'STRING',
-    #Category:Chemistry
-    'CHEMBL_ID':'ChEMBL',
-    'DRUGBANK_ID':'DrugBank',
-    'GUIDETOPHARMACOLOGY_ID':'GuidetoPHARMACOLOGY',
-    #Category:Protein family/group databases
-    'ALLERGOME_ID':'Allergome',
-    'MEROPS_ID':'MEROPS',
-    'MYCOCLAP_ID':'mycoCLAP',
-    'PEROXIBASE_ID':'PeroxiBase',
-    'REBASE_ID':'REBASE',
-    'TCDB_ID':'TCDB',
-    #Category:Polymorphism databases
-    'DMDM_ID':'DMDM',
-    #Category:2D gel databases
-    'WORLD_2DPAGE_ID':'World-2DPAGE',
-    #Category:Protocols and materials databases
-    'DNASU_ID':'DNASU',
-    #Category:Genome annotation databases
-    'ENSEMBL_ID':'Ensembl',
-    'ENSEMBL_PRO_ID':'Ensembl Protein',
-    'ENSEMBL_TRS_ID':'Ensembl Transcript',
-    'ENSEMBLGENOME_ID':'Ensembl Genomes',
-    'ENSEMBLGENOME_PRO_ID':'Ensembl Genomes Protein',
-    'ENSEMBLGENOME_TRS_ID':'Ensembl Genomes Transcript',
-    'P_ENTREZGENEID':'GeneID',
-    'KEGG_ID':'KEGG',
-    'PATRIC_ID':'PATRIC',
-    'UCSC_ID':'UCSC',
-    'VECTORBASE_ID':'VectorBase',
-    #Category:Organism-specific gene databases
-    'ARACHNOSERVER_ID':'ArachnoServer',
-    'CGD':'CGD',
-    'CONOSERVER_ID':'ConoServer',
-    'CYGD_ID':'CYGD',
-    'DICTYBASE_ID':'dictyBase',
-    'ECHOBASE_ID':'EchoBASE',
-    'ECOGENE_ID':'EcoGene',
-    'EUHCVDB_ID':'euHCVdb',
-    'EUPATHDB_ID':'EuPathDB',
-    'FLYBASE_ID':'FlyBase',
-    'GENECARDS_ID':'GeneCards',
-    'GENEFARM_ID':'GeneFarm',
-    'GENOLIST_ID':'GenoList',
-    'H_INVDB_ID':'H-InvDB',
-    'HGNC_ID':'HGNC',
-    'HPA_ID':'HPA',
-    'LEGIOLIST_ID':'LegioList',
-    'LEPROMA_ID':'Leproma',
-    'MAIZEGDB_ID':'MaizeGDB',
-    'MIM_ID':'MIM',
-    'MGI_ID':'MGI',
-    'NEXTPROT_ID':'neXtProt',
-    'ORPHANET_ID':'Orphanet',
-    'PHARMGKB_ID':'PharmGKB',
-    'POMBASE_ID':'PomBase',
-    'PSEUDOCAP_ID':'PseudoCAP',
-    'RGD_ID':'RGD',
-    'SGD_ID':'SGD',
-    'TAIR_ID':'TAIR',
-    'TUBERCULIST_ID':'TubercuList',
-    'WORMBASE_ID':'WormBase',
-    'WORMBASE_TRS_ID':'WormBase Transcript',
-    'WORMBASE_PRO_ID':'WormBase Protein',
-    'XENBASE_ID':'Xenbase',
-    'ZFIN_ID':'ZFIN',
-    #Category:Phylogenomic databases
-    'EGGNOG_ID':'eggNOG',
-    'GENETREE_ID':'GeneTree',
-    'HOGENOM_ID':'HOGENOM',
-    'HOVERGEN_ID':'HOVERGEN',
-    'KO_ID':'KO',
-    'OMA_ID':'OMA',
-    'ORTHODB_ID':'OrthoDB',
-    'PROTCLUSTDB_ID':'ProtClustDB',
-    'TREEFAM_ID':'TreeFam',
-    #Category:Enzyme and pathway databases
-    'BIOCYC_ID':'BioCyc',
-    'REACTOME_ID':'Reactome',
-    'UNIPATHWAY_ID':'UniPathWay',
-    #Category:Gene expression databases
-    'CLEANEX_ID':'CleanEx',
-    #Category:Other
-    'CHITARS_ID':'ChiTaRS',
-    'GENOMERNAI_ID':'GenomeRNAi',
-    'GENEWIKI_ID':'GeneWiki',
-    'NEXTBIO_ID':'NextBio'
-  }
-
-  """
-  TODO:
-    Would be better to be able to infer the from_id type for the input accession/ID  especially for a fasta file
-    - Allow the options.from_id to be unspecified and empty, and try to determine the from_id by the input ID/Accession.
-
-  # rather than an array of input accessions, we need to put them in a dict() by type
-  def addAccession(id_dict,id_type,id):
-    if id_type not in id_dict:
-      id_dict[id_type] = []-++
-    id_dict[id_type] = [].append(id)
-
-  # returns(accession, id_type)
-  def getAccession(header, matchType):
-    # (format regex pattern, FROM_ID)
-    # TODO  try to determine which type of accession ID we have by matching by regular expressions
-    # each regex match should have a groups[0] that given the accession
-    # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
-    fmts = [
-      ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
-      ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
-      ('>/UniRef\d+_(\w+).*','ACC+ID'),
-      ('>/(UPI\d+).*','ACC+ID'),
-      ('NP_\d+\.\d+','ACC+ID'),
-      ('ENSP\d+','ENSEMBL_PRO_ID'),
-      ('ENST\d+','ENSEMBL_TRS_ID'),
-    ]
-    for pat,cat in fmts:
-      m = re.match(pat,header)
-      if m:
-        matchType.append(cat)
-        return (m.groups()[0],cat)
-    matchType.append('ACC+ID')
-    return (header,'ACC+ID')
-
-    # Then when we are retrieving the id mappings, we need to incrementally fetch by from_id / to_id types
-  idMaps = dict() # {to_id,idMap}
-  for to_id in options.to_id:
-    idMap = dict()
-    idMaps[to_id] = idMap
-    for (from_id,ids) id_dict.items():
-      # limit the requests to 500 at a time
-      idx = range(0,len(ids),500)
-      idx.append(len(ids))
-      for i in range(len(idx)-1):
-        getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap)
-  """
-
-  """
-  Some examples of fasta ID lines From:
-    https://code.google.com/p/compomics-utilities/source/browse/trunk/src/main/java/com/compomics/util/protein/Header.java
->sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...
->gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...
->IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr
->HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein.
->OExyz (OExyz) xxx xxx xxx
->hflu_lsi_xxxx xxx xxx xxx
->C_tr_Lx_x [xxx - xxx] | xxx xxx
->M. tub.xxx|Rvxxx| xxx xxx
-// Drosophile DB.
-// We need to find two elements:
-//   - the accession String (retrieved as the trimmed version of everything
-//     up to (and NOT including) " pep:"
-//   - the description (everything (trimmed) starting from (and including) the " pep:".
->CGxxxxxx pep:xxxxxx
->xxxx xxx SGDID:xxxx xxx
->generic_some_tag|proten_accession|a description for this protein
-// Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as
-// present in the Expasy FTP FASTA file.
-// Is formatted something like this:
->XXX_YYYY (acc) rest
->sp|accession|ID descr rest (including taxonomy, if available
->tr|accession|ID descr rest (including taxonomy, if available)
->nxp|NX_P02768-1|ALB|Serum albumin|Iso
-// New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as
-// present in the Expasy FTP FASTA file.
-// Is formatted something like this:
->accession|ID descr rest (including taxonomy, if available)
-// Flybase FASTA format.
->FBxxx type=xxx
-// A header translating a genome sequence into a protein sequences.
-// We need to find two elements, separated by a space:
-//   - the accession string (retrieved as the first part of a space delimited String).
-//   - the nucleic acid start and stop site (between brackets, separated by a '-').
->dm345_3L-sense [234353534-234353938]
->dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database
->synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089]  ORF
-// The Arabidopsis thaliana database; TAIR format
->AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166
-...
-// Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all.
-// Testing for this is somewhat more complicated.
-// Often used simple header; looks like:
-// >NP0465 (NP0465) A description for this protein.
-// We need to find two elements:
-//   - the accession String (easily retrieved as the next String until a space is encountered).
-//   - the description
->NP0465 (NP0465) A description for this protein.
- GenBank                           gb|accession|locus
- EMBL Data Library                 emb|accession|locus
- DDBJ, DNA Database of Japan       dbj|accession|locus
- NBRF PIR                          pir||entry
- Protein Research Foundation       prf||name
- SWISS-PROT                        sp|accession|entry name
- Brookhaven Protein Data Bank      pdb|entry|chain
- Patents                           pat|country|number
- GenInfo Backbone Id               bbs|number
- General database identifier       gnl|database|identifier
- NCBI Reference Sequence           ref|accession|locus
- Local Sequence identifier         lcl|identifier
-  """
-
-
-  def getFastaAccession(header):
-    # TODO parse the ID and return the accession
-    # (format regex pattern, FROM_ID)
-    # TODO  try to determine which type of accession ID we have by matching by reg|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})ular expressions
-    # each regex match should have a groups[0] that given the accession
-    # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
-    fmts = [
-      ('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'),
-      ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID'),
-      ('NP_\d+\.\d+','ACC+ID'),
-
-      ('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
-      ('(...*)_....*','ID'),
-      ('>/(UPI\d+).*','UPARC'),
-      ('>/UniRef50_(\w+).*','NF50'),
-      ('UniRef90_(\w+).*','NF90'),
-      ('UniRef100_(\w+).*','NF100'),
-      ('([A-C][A-Z]?\d{6})|(DM-Z)\d{5})','EMBL_ID'),
-      ('ENSP\d+','ENSEMBL_PRO_ID'),
-      ('ENST\d+','ENSEMBL_TRS_ID')
-    ]
-    for pat,cat in fmts:
-      m = re.match(pat,header)
-      if m:
-        return m.groups()[0]
-    return header
-
-  # rather than an array of input accessions, we need to put them in a dict() by type
-  def addAccession(id_dict,id_type,id):
-    for idtype in id_type:
-      if idtype not in id_dict:
-        id_dict[idtype] = []
-      id_dict[idtype].append(id)
-
-  # returns(accession, id_type)
-  def getAccession(header):
-    # (format regex pattern, FROM_ID)
-    # TODO  try to determine which type of accession ID we have by matching by regular expressions
-    # each regex match should have a groups[0] that given the accession
-    # for id_types, see: http://www.uniprot.org/help/programmatic_access#id_mapping_examples
-    fmts = [
-      ('([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'),
-      ('.*_[A-Z]*','ID'),
-      ('UPI(\d+).*','UPARC'),
-      ('(UniRef50_\w+.*)','NF50'),
-      ('UniRef90_(\w+).*','NF90'),
-      ('(UniRef100_\w+.*)','NF100'),
-      ('a[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}','GENENAME'),
-
-      ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'),
-      ('K\d*','KO_ID'),
-
-      ('([A-Z]*\d*\.\d$)','EMBL'),
-      ('([IBC]\d{5})','PIR'),
-      ('(Hs\.\d*)','UNIGENE_ID'),
-      ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'),
-      ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'),
-      ('(\d[A-Z0-9]{3})','PDB_ID'),
-      ('(DP\d{5})','DISPROT_ID'),
-      ('(DIP-\d*N$)','DIP_ID'),
-      ('(MINT-\d*)','MINT_ID'),
-      ('(9606\.ENSP\d*)','STRING_ID'),
-      ('(CHEMBL\d*)','CHEMBL_ID'),
-      ('(DB\d*)','DRUGBANK_ID'),
-      ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'),
-
-      ('[A-Z]*_[A-Z]*','MYCOCLAP_ID'),
-
-      ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'),
-      ('\d{4}:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','WORLD_2DPAGE_ID'),
-      ('ENS.?*G\d*','ENSEMBL_ID'),
-      ('ENS.?*P\d*','ENSEMBL_PRO_ID'),
-      ('ENS.?*T\d*','ENSEMBL_TRS_ID'),
-      (' ','ENSEMBLGENOME_ID'),
-      (' ','ENSEMBLGENOME_PRO_ID'),
-      (' ','ENSEMBLGENOME_TRS_ID'),
-      ('(hsa:\d*)','KEGG_ID'),
-      ('(uc\d*[a-z]*\.\d$)','UCSC_ID'),
-      ('VEL:J:LDKJFS','VECTORBASE_ID'),
-      ('AS\d*','ARACHNOSERVER_ID'),
-      ('CAL\d*','CGD'),
-      ('Y[A-Z]{2}[0-9]{3}[cw].*|Q\d{4}|R\d{4}[wc]','CYGD_ID'),
-      ('DDB_G\d*','DICTYBASE_ID'),
-      ('EB\d{4}','ECHOBASE_ID'),
-      ('EG\d{5}','ECOGENE_ID'),
-      ('(?:HM|G[QU]|F[JMN]|E[FU]|DQ|A[A-Z])\d{6}|[DLMSUXYZ]\d{5}','EUHCVDB_ID'),
-      ('[A-Z][a-z]*DB:.*','EUPATHDB_ID'),
-      ('FBgn\d*','FLYBASE_ID'),
-      ('GC[A-Z0-9]{9}','GENECARDS_ID'),
-      ('(?:BSU\d|gbs|LIN|LMO|MUL_|MYPU_|pl[iu])\d{4}|MUP\d{3}c?','GENOLIST_ID'),
-      ('HIX\d*','H_INVDB_ID'),
-      ('HGNC:\d*','HGNC_ID'),
-      ('(?:CAB|HPA)\d{6}','HPA_ID'),
-      ('lp[lp]\d{4}','LEGIOLIST_ID'),
-      ('ML\d{4}','LEPROMA_ID'),
-      ('MGI:\d*','MGI_ID'),
-      ('NX_(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','NEXTPROT_ID'),
-      ('PA\d{3,5,9}','PHARMGKB_ID'),
-      ('SP.*\.\d\dc?,'POMBASE_ID'),
-      ('PA\d{4}','PSEUDOCAP_ID'),
-      ('S\d{9}','SGD_ID'),
-      ('AT.G\d{5}','TAIR_ID'),
-      ('Rv\d{4}c?','TUBERCULIST_ID'),
-      ('WBGene\d{8}','WORMBASE_ID'),
-      ('(\dR(SSE|\d\d?)|A[CH]\d\d?|[BC]\d\d[A-Z]\d)\.\d.[0-9a-z]?|CBG\d{5}|C[CD][48]\.\d.[a-z]?|CE7X_3\.1|cTel5\dX\.1[ab]?|D\d{4}\.\d.[a-z]?,'WORMBASE_ID'),
-      ('C(BP|E)\d{5}','WORMBASE_PRO_ID'),
-      ('XB-GENE-\d*','XENBASE_ID'),
-      ('ZDB-GENE-\d{6}-\d{4}','ZFIN_ID'),
-
-
-      ('(.*[CKN]OG\d*)','EGGNOG_ID'),
-      ('(ENSGT\d*)','GENETREE_ID'),
-      ('HOG\d*','HOGENOM_ID'),
-      ('HBG\d*','HOVERGEN_ID'),
-
-      ('[A-Z]{7}','OMA_ID'),
-
-      ('EOG\d*','ORTHODB_ID'),
-      ('asdfsadfasdf','PROTCLUSTDB_ID'),
-      ('TF\d*','TREEFAM_ID'),
-      ('REACT_\d*','REACTOME_ID'),
-      ('asdfasl;DF:LJk','UNIPATHWAY_ID'),
-      ('HS_\d*','CLEANEX_ID'),
-      ('SAMEAS GENE NAME','CHITARS_ID'),
-      ('GENENAME_\(gene\)','GENEWIKI_ID'),
-    ]
-    # remove the need for .groups() (i.e. parantheses)
-    if re.match('\d*$', header): # For ambiguous number only ID types
-         numIDtypes = [
-           'P_ENTREZGENEID',
-           'P_GI',
-           'DMDM_ID',
-           'BIOGRID_ID',
-           'GUIDETOPHARMACOLOGY_ID',
-           'ALLERGOME_ID',
-           'PEROXIBASE_ID',
-           'REBASE_ID',
-           'DNASU_ID',
-           'GENOMERNAI_ID',
-           'NEXTBIO_ID',
-           'CONOSERVER_ID',
-           'GENEFARM_ID',
-           'MAIZEGDB_ID',
-           'MIM_ID',
-           'MGI_ID',
-           'ORPHANET_ID',
-           'RGD_ID']
-         ambiguous = []
-         for numIDs in numIDtypes:
-           nm = getIdMapping([header],numIDs,'ACC',ambiguity=True)
-           if nm != None:
-              ambiguous.append(nm)
-         if ambiguous == []:
-           ambiguous.append('No Match')
-         return (header, ambiguous)
-    for pat,cat in fmts:
-      m = re.match(pat,header)
-      if m:
-        #return (m.groups()[0],[cat])
-        return (header,[cat])
-    return (header,['ACC+ID'])
-
-
-  def read_tabular(filepath,col):
-    accessions = []
-    with open(filepath) as fp:
-      for i,line in enumerate(fp):
-        if line.strip() == '' or line.startswith('#'):
-          continue
-        fields = line.rstrip('\n').split('\t')
-        accession = fields[col]
-        accessions.append(accession)
-    return accessions
-
-  def get_fasta_entries(fp):
-    name, seq = None, []
-    for line in fp:
-      line = line.rstrip()
-      if line.startswith(">"):
-        if name: yield (name, ''.join(seq))
-        name, seq = line, []
-      else:
-        seq.append(line)
-    if name: yield (name, ''.join(seq))
-
-  def read_fasta(filepath):
-    accessions = []
-    with open(filepath) as fp:
-      for id, peptide in get_fasta_entries(fp):
-        accessions.append(getFastaAccession(id))
-    return accessions
-
-  def read_mzid(fp):
-    accessions = []
-    for event, elem in ET.iterparse(fp):
-      if event == 'end':
-        if re.search('DBSequence',elem.tag):
-          accessions.append(elem.attrib['accession'])
-    return accessions
-
-  def read_pepxml(fp):
-    accessions = []
-    for event, elem in ET.iterparse(fp):
-      if event == 'end':
-        if re.search('search_hit',elem.tag):
-          accessions.append(elem.get('protein'))
-    return accessions
-
-  def getUniprotSequence(uniprot_id):
-    url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
-    print url
-    fp = urllib2.urlopen(url)
-    for id, seq in get_fasta_entries(fp):
-      if seq:
-        return seq
-    return ''
-
-
-
-
-  def getIdMapping(accessions,from_id,to_id,fh=None,idMap=None,ambiguity=None,crossReference=None,idMaps=None):
-    # print >> sys.stderr, "%s accessions(%d): %s ..." % (to_id,len(accessions),' '.join(accessions[:min(len(accessions),3)]))
-    if not accessions:
-      return
-    url = 'http://www.uniprot.org/mapping/'
-    # Cross Referencing: Mapping to non-UniprotKB ('ACC') IDs to other non-UniprotKB ('ACC') IDs
-    if to_id != 'ACC' and from_id != 'ACC':
-      crMap = {}
-      getIdMapping(accessions,from_id,'ACC',idMap=crMap)
-      crMap2 = crMap.copy()
-      for x in crMap.keys():
-         for y in crMap[x].keys():
-            for z in crMap[x][y]:
-               crMap2[x][y] = []
-               getIdMapping([z],'ACC',to_id,idMap=crMap2,crossReference=[x,from_id])
-      idMap.update(crMap2)
-      return
-    params = {
-    'from': from_id,
-    'to': to_id,
-    'format':'tab',
-    'query': '\n'.join(accessions)
-    }
-    data = urllib.urlencode(params)
-    request = urllib2.Request(url, data)
-    contact = "" # Please set your email address here to help us debug in case of problems.
-    request.add_header('User-Agent', 'Python %s' % contact)
-    response = None
-    for i in range(3):
-      try:
-        response = urllib2.urlopen(request)
-        break
-      except Exception, e:
-        warn_err("%s",exit_code=None)
-    if response:
-      response.next()
-    print params
-
-    for line in response:
-      # print >> sys.stderr, "idMap: %s" % line
-      if fh:
-        fh.write(line)
-      if ambiguity: # if there was a response, then an ambiguous match can be made
-        return from_id
-      if idMap != None:
-        id1,id2 = line.strip().split('\t')
-        print id2
-        if crossReference != None:
-          id1, from_id = crossReference
-        # print >> sys.stderr, "idMap: %s:%s" % (id1,id2)
-        try:
-           idMap[id1][from_id].append(id2)
-        except:
-           idMap[id1] = {from_id:[id2]}
-    if ambiguity == None:
-      for line in response:
-        for acc in accessions:
-          idMap[acc] = {from_id:['N/A']}
-        return
-
-
-  #Parse Command Line
-  parser = optparse.OptionParser()
-  # input files
-  parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
-  parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
-  parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
-  parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
-  parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
-  # Decoy pattern
-  parser.add_option( '-D', '--decoy', dest='decoy', default=None, help='Decoy pattern to be trimmed from IDs , e.g. _REVERSED' )
-  # filter patterns
-  parser.add_option( '-I', '--include', dest='include_pat', default=None, help='Include pattern to filter IDs' )
-  parser.add_option( '-X', '--exclude', dest='exclude_pat', default=None, help='Exclude pattern to filter IDs' )
-  # Unipept Flags
-  parser.add_option( '-F', '--from', dest='from_id', default='ACC+ID', choices=idDict.keys(), help='From ID type' )
-  parser.add_option( '-T', '--to', dest='to_id', default=[], action="append", choices=idDict.keys(), help='To ID type' )
-  # output files
-  parser.add_option( '-o', '--output', dest='output', default=None, help='Output file path for TAB-separated-values')
-  # parser.add_option( '-O', '--format', dest='format', default='tab', choices=['list','tab','json'], help='output format' )
-  parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
-  (options, args) = parser.parse_args()
-  if options.version:
-    print >> sys.stdout,"%s" % version
-    sys.exit(0)
-
-  accessions = []
-  ## Get accessions
-  if options.mzid:
-    accessions += read_mzid(options.mzid)
-  if options.pepxml:
-    accessions += read_pepxml(options.pepxml)
-  if options.tabular:
-    accessions += read_tabular(options.tabular,options.column)
-  if options.fasta:
-    accessions += read_fasta(options.fasta)
-  if args and len(args) > 0:
-    for i,accession in enumerate(args):
-      accessions.append(accession)
-  # filter accessions
-  if options.decoy:
-    filtered_accs = [re.sub(options.decoy,'',x) for x in accessions]
-    accessions = filtered_accs
-  if options.include_pat:
-    filtered_accs = []
-    for acc in accessions:
-      if re.match(options.include_pat,acc):
-        filtered_accs.append(acc)
-    accessions = filtered_accs
-  if options.exclude_pat:
-    filtered_accs = []
-    for acc in accessions:
-      if not re.match(options.exclude_pat,acc):
-        filtered_accs.append(acc)
-    accessions = filtered_accs
-  if len(accessions) < 1:
-    warn_err("No accessions input!",exit_code=1)
-  if options.output != None:
-    try:
-      outputPath = os.path.abspath(options.output)
-      outputFile = open(outputPath, 'w')
-    except Exception, e:
-      print >> sys.stderr, "failed: %s" % e
-      exit(3)
-  else:
-    outputFile = sys.stdout
-
-
-  # Removes duplicates in accessions
-  seen = set()
-  seen_add = seen.add
-  accessions = [x for x in accessions if not (x in seen or seen_add(x))]
-  # Sorts accessions to inferred ID types i+n a dictionary
-  id_dict = {}
-  for header in accessions:
-      id , id_types = getAccession(header)
-      addAccession(id_dict,id_types,id)
-  idMaps = dict() # {to_id,idMap}
-  for to_id in options.to_id:
-    idMap = dict()
-    idMaps[to_id] = idMap
-    for (from_id,ids) in id_dict.items():
-      # limit the requests to 500 at a time
-      idx = range(0,len(ids),500)
-      idx.append(len(ids))
-      for i in range(len(idx)-1):
-        getIdMapping(ids[idx[i]:idx[i+1]],from_id,to_id,idMap=idMap,idMaps=idMaps)
-        print ids
-#Write output
-  #Output Table Header
-  outputFile.write("\n#%-17s" % (options.from_id))
-  for t in options.to_id:
-    outputFile.write("%-18s" % t)
-  outputFile.write("\n" + ("=" * 18) + ("=" * 18 * len(options.to_id)) + "\n")
-
-  # Create an output-friendly matrix
-  idArray = [[[] for x in range(len(options.to_id))] for x in range(len(accessions))]
-  for a, acc in enumerate(accessions):
-     idLength = 0
-     for i, to_id in enumerate(options.to_id): # [[ids],[ids]]
-        idArrayColumn = []
-        idPairs = idMaps[to_id][acc] #{from_id:[IDs]} -> [from_id1,from_id2,from_id3]
-        for from_id in idPairs:
-            ids = idPairs[from_id]
-            for id_ in ids:
-                idArrayColumn.append("%s[%s]" % (id_,from_id))
-        if idLength < len(idArrayColumn):
-            idLength = len(idArrayColumn)
-        idArray[a][i].extend(idArrayColumn)
-     for y in range(idLength):
-        outputFile.write("%-18s" % acc)
-        for x in range(len(options.to_id)):
-           try:
-             outputFile.write("%-18s" % idArray[a][x][y])
-           except:
-             outputFile.write(" " * 18)
-        outputFile.write("\n")
-  # Output Matrix
-
-
-
-  #print idMaps
- # python uniprot_id_mapping.py -T ACC -T PDB_ID -T ENSEMBL_PRO_ID -T ENSEMBL_TRS_ID -t test-data/old-inputs/var.tsv
-  stop = timeit.default_timer()
-  outputFile.write("%s %s %-s\n\n" % ("run time: ", stop - start, "seconds"))
-
-if __name__ == "__main__" : __main__()
-