changeset 4:4953dcd7dd39 draft

planemo upload for repository http://unipept.ugent.be/apidocs commit e91b0fe16bf468b34884508652359b91847d1f95-dirty
author galaxyp
date Wed, 23 Jan 2019 09:16:38 -0500
parents 34758ab8aaa4
children 917fd3ebc223
files unipept.py unipept.xml
diffstat 2 files changed, 593 insertions(+), 110 deletions(-) [+]
line wrap: on
line diff
--- a/unipept.py	Mon Feb 20 10:32:03 2017 -0500
+++ b/unipept.py	Wed Jan 23 09:16:38 2019 -0500
@@ -21,6 +21,18 @@
 import re
 import urllib
 import urllib2
+
+"""
+pept2taxa	json
+pept2lca	json
+pept2prot	
+pept2ec		ecjson	ec
+pept2go			go
+pept2funct	go	ec
+peptinfo	json ecjson ec go
+
+"""
+
 try:
     import xml.etree.cElementTree as ET
 except ImportError:
@@ -31,11 +43,87 @@
     if exit_code:
       sys.exit(exit_code)
 
+go_types = ['biological process', 'molecular function', 'cellular component']
+ec_name_dict = {
+'1' : 'Oxidoreductase',
+'1.1' : 'act on the CH-OH group of donors',
+'1.2' : 'act on the aldehyde or oxo group of donors',
+'1.3' : 'act on the CH-CH group of donors',
+'1.4' : 'act on the CH-NH2 group of donors',
+'1.5' : 'act on CH-NH group of donors',
+'1.6' : 'act on NADH or NADPH',
+'1.7' : 'act on other nitrogenous compounds as donors',
+'1.8' : 'act on a sulfur group of donors',
+'1.9' : 'act on a heme group of donors',
+'1.10' : 'act on diphenols and related substances as donors',
+'1.11' : 'act on peroxide as an acceptor -- peroxidases',
+'1.12' : 'act on hydrogen as a donor',
+'1.13' : 'act on single donors with incorporation of molecular oxygen',
+'1.14' : 'act on paired donors with incorporation of molecular oxygen',
+'1.15' : 'act on superoxide radicals as acceptors',
+'1.16' : 'oxidize metal ions',
+'1.17' : 'act on CH or CH2 groups',
+'1.18' : 'act on iron-sulfur proteins as donors',
+'1.19' : 'act on reduced flavodoxin as donor',
+'1.20' : 'act on phosphorus or arsenic as donors',
+'1.21' : 'act on X-H and Y-H to form an X-Y bond',
+'1.97' : 'other oxidoreductases',
+'2' : 'Transferase',
+'2.1' : 'transfer one-carbon groups, Methylase',
+'2.2' : 'transfer aldehyde or ketone groups',
+'2.3' : 'acyltransferases',
+'2.4' : 'glycosyltransferases',
+'2.5' : 'transfer alkyl or aryl groups, other than methyl groups',
+'2.6' : 'transfer nitrogenous groups',
+'2.7' : 'transfer phosphorus-containing groups',
+'2.8' : 'transfer sulfur-containing groups',
+'2.9' : 'transfer selenium-containing groups',
+'3' : 'Hydrolase',
+'3.1' : 'act on ester bonds',
+'3.2' : 'act on sugars - glycosylases',
+'3.3' : 'act on ether bonds',
+'3.4' : 'act on peptide bonds - Peptidase',
+'3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds',
+'3.6' : 'act on acid anhydrides',
+'3.7' : 'act on carbon-carbon bonds',
+'3.8' : 'act on halide bonds',
+'3.9' : 'act on phosphorus-nitrogen bonds',
+'3.10' : 'act on sulfur-nitrogen bonds',
+'3.11' : 'act on carbon-phosphorus bonds',
+'3.12' : 'act on sulfur-sulfur bonds',
+'3.13' : 'act on carbon-sulfur bonds',
+'4' : 'Lyase',
+'4.1' : 'carbon-carbon lyases',
+'4.2' : 'carbon-oxygen lyases',
+'4.3' : 'carbon-nitrogen lyases',
+'4.4' : 'carbon-sulfur lyases',
+'4.5' : 'carbon-halide lyases',
+'4.6' : 'phosphorus-oxygen lyases',
+'5' : 'Isomerase',
+'5.1' : 'racemases and epimerases',
+'5.2' : 'cis-trans-isomerases',
+'5.3' : 'intramolecular oxidoreductases',
+'5.4' : 'intramolecular transferases -- mutases',
+'5.5' : 'intramolecular lyases',
+'5.99' : 'other isomerases',
+'6' : 'Ligase',
+'6.1' : 'form carbon-oxygen bonds',
+'6.2' : 'form carbon-sulfur bonds',
+'6.3' : 'form carbon-nitrogen bonds',
+'6.4' : 'form carbon-carbon bonds',
+'6.5' : 'form phosphoric ester bonds',
+'6.6' : 'form nitrogen-metal bonds',
+}
 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
 pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
+pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
+pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
+pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
+pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
+pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go']
 
 def __main__():
   version = '2.0'
@@ -95,7 +183,7 @@
       return None
     elif len(matches) == 1:
       return matches[0].copy()
-    else:
+    elif 'taxon_rank' in matches[0]:
       # find the most specific match (peptide is always the first column order field)
       for col in reversed(pept2lca_extra_column_order[1:]):
         col_id = col+"_id" if options.extra else col
@@ -104,12 +192,229 @@
             return match.copy()
           if col_id in match and match[col_id]:
             return match.copy()
+    else:
+      return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy()
     return None
 
+  def get_taxon_json(resp):
+    found_keys = set()
+    for i,pdict in enumerate(resp):
+      found_keys |= set(pdict.keys())
+    taxa_cols = []
+    for col in pept2lca_extra_column_order[-1:0:-1]:
+      if col+'_id' in found_keys:
+        taxa_cols.append(col)
+    id_to_node = dict()
+    def get_node(id,name,rank,child,seq):
+      if id not in id_to_node:
+        data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1,  'rank' : rank, 'sequences' : [] }
+        node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
+        id_to_node[id] = node
+      else:
+        node = id_to_node[id]
+      node['data']['count'] += 1
+      if seq is not None and seq not in node['data']['sequences']:
+         node['data']['sequences'].append(seq)
+      if child is None:
+        node['data']['self_count'] += 1
+      elif child['id'] not in node['kids']:
+        node['kids'].append(child['id'])
+        node['children'].append(child)
+      return node
+    root = get_node(1,'root','no rank',None,None)
+    for i,pdict in enumerate(resp):
+      sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
+      seq = sequence
+      child = None
+      for col in taxa_cols:
+        col_id = col+'_id'
+        if col_id in pdict and pdict.get(col_id):
+          col_name = col if col in found_keys else col+'_name'
+          child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
+          seq = None
+      if child:
+        get_node(1,'root','no rank',child,None)
+    return root
+
+  def get_ec_json(resp):
+    ecMap = dict()
+    for pdict in resp:
+      if 'ec' in pdict:
+        for ec in pdict['ec']:
+          ec_number = ec['ec_number']
+          if ec_number not in ecMap:
+            ecMap[ec_number] = []
+          ecMap[ec_number].append(pdict)
+    def get_ids(ec):
+      ids = []
+      i = len(ec)
+      while i >= 0:
+        ids.append(ec[:i])
+        i = ec.rfind('.',0,i - 1)
+      return ids
+    id_to_node = dict()
+    def get_node(id,name,child,seq):
+      if id not in id_to_node:
+        data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
+        node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
+        id_to_node[id] = node
+      else:
+        node = id_to_node[id]
+      node['data']['count'] += 1
+      if seq is not None and seq not in node['data']['sequences']:
+         node['data']['sequences'].append(seq)
+      if child is None:
+        node['data']['self_count'] += 1
+      elif child['id'] not in node['kids']:
+        node['kids'].append(child['id'])
+        node['children'].append(child)
+      return node
+    root = get_node(0,'-.-.-.-',None,None)
+    for i in range(1,7):
+      child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None)
+      get_node(0,'-.-.-.-',child,None)
+    for i,pdict in enumerate(resp):
+      sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
+      seq = sequence
+      if 'ec' in pdict:
+        for ec in pdict['ec']:
+          child = None
+          protein_count = ec['protein_count']
+          ec_number = ec['ec_number']
+          for ec_id in get_ids(ec_number):
+            ec_name = str(ec_id)
+            ## if len(ec_id) == 3:
+            ##   ec_name = '%s\n%s\n%s' %(str(ec_id), ec_name_dict[str(ec_id[0])],  ec_name_dict[str(ec_id)])
+            child = get_node(ec_id,ec_name,child,seq)
+            seq = None
+          if child:
+            get_node(0,'-.-.-.-',child,None)
+    return root
+
+  def get_taxon_dict(resp, column_order, extra=False, names=False):
+    found_keys = set()
+    results = []
+    for i,pdict in enumerate(resp):
+      results.append(pdict)
+      found_keys |= set(pdict.keys())
+      # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
+    column_names = []
+    column_keys = []
+    for col in column_order:
+      if col in found_keys:
+        column_names.append(col)
+        column_keys.append(col)
+      elif names:
+        col_id = col+'_id'
+        col_name = col+'_name'
+        if extra:
+          if col_id in found_keys:
+            column_names.append(col_id)
+            column_keys.append(col_id)
+        if names:
+          if col_name in found_keys:
+            column_names.append(col)
+            column_keys.append(col_name)
+      else:
+        if col+'_name' in found_keys:
+          column_names.append(col)
+          column_keys.append(col+'_name')
+        elif col+'_id' in found_keys:
+          column_names.append(col)
+          column_keys.append(col+'_id')
+    # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
+    taxa = dict() ## peptide : [taxonomy]
+    for i,pdict in enumerate(results):
+      peptide = pdict['peptide'] if 'peptide' in pdict else None
+      if peptide and peptide not in taxa:
+          vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
+          taxa[peptide] = vals
+    return (taxa,column_names)
+
+  def get_ec_dict(resp, extra=False):
+    ec_cols = ['ec_numbers', 'ec_protein_counts']
+    if extra:
+      ec_cols.append('ec_names')
+    ec_dict = dict()
+    for i,pdict in enumerate(resp):
+      peptide = pdict['peptide']
+      ec_numbers = []
+      protein_counts = []
+      ec_names = []
+      if 'ec' in pdict:
+        for ec in pdict['ec']:
+          ec_numbers.append(ec['ec_number'])
+          protein_counts.append(str(ec['protein_count']))
+          if extra:
+            ec_names.append(ec['name'] if 'name' in ec else '')
+      vals = [','.join(ec_numbers),','.join(protein_counts)]
+      if extra:
+        vals.append(','.join(ec_names))
+      ec_dict[peptide] = vals
+    return (ec_dict, ec_cols)
+
+  def get_go_dict(resp, extra=False):
+    go_cols = ['go_terms', 'go_protein_counts']
+    if extra:
+      go_cols.append('go_names')
+    go_dict = dict()
+    for i,pdict in enumerate(resp):
+      peptide = pdict['peptide']
+      go_terms = []
+      protein_counts = []
+      go_names = []
+      if 'go' in pdict:
+        for go in pdict['go']:
+          if 'go_term' in go:
+            go_terms.append(go['go_term'])
+            protein_counts.append(str(go['protein_count']))
+            if extra:
+              go_names.append(go['name'] if 'name' in go else '')
+          else:
+            for go_type in go_types:
+              if go_type in go:
+                for _go in go[go_type]:
+                  go_terms.append(_go['go_term'])
+                  protein_counts.append(str(_go['protein_count']))
+                  if extra:
+                    go_names.append(_go['name'] if 'name' in _go else '')
+      vals = [','.join(go_terms),','.join(protein_counts)]
+      if extra:
+        vals.append(','.join(go_names))
+      go_dict[peptide] = vals
+    return (go_dict, go_cols)
+
+  def write_ec_table(outfile, resp, column_order):
+    with open(outfile,'w') as fh:
+      for i,pdict in enumerate(resp):
+        if 'ec' in pdict:
+          tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
+          for ec in pdict['ec']:
+            vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
+            fh.write('%s\n' % '\t'.join(tvals + vals)) 
+
+  def write_go_table(outfile, resp, column_order):
+    with open(outfile,'w') as fh:
+      for i,pdict in enumerate(resp):
+        if 'go' in pdict:
+          tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
+          for go in pdict['go']:
+            if 'go_term' in go:
+              vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
+              fh.write('%s\n' % '\t'.join(tvals + vals)) 
+            else:
+              for go_type in go_types:
+                if go_type in go:
+                  for _go in go[go_type]:
+                    vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
+                    vals.append(go_type)
+                    fh.write('%s\n' % '\t'.join(tvals + vals)) 
+
   #Parse Command Line
   parser = optparse.OptionParser()
   # unipept API choice
-  parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
+  parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'], 
+      help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' )
   # input files
   parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
   parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
@@ -120,6 +425,7 @@
   parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
   parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
   parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
+  parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component' )
   parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
   
   # output fields
@@ -128,10 +434,16 @@
   parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
   # output files
   parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
+  parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results')
+  parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
+  parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
+  parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
   parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
   parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
   parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
+  parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' )
   # debug
+  parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' )
   parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
   parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
   (options, args) = parser.parse_args()
@@ -188,15 +500,32 @@
       post_data.append(("names","true"))
     elif options.extra or options.json:
       post_data.append(("extra","true"))
+    if options.domains:
+      post_data.append(("domains","true"))
     post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
+    if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data))
     headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
-    url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
-    req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
-    unipept_resp += json.loads( urllib2.urlopen( req ).read() )
+    ## headers = {'Accept': 'application/json'}
+    url = '%s/%s' % (options.url.rstrip('/'),options.unipept)
+    if options.get:
+      params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data])
+      url = '%s.json?%s' % (url,params)
+      req = urllib2.Request( url )
+    else:
+      url = '%s.json' % (url)
+      req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
+    if options.debug: print >> sys.stdout, "url: %s\n" % (str(url))
+    try:
+      resp = urllib2.urlopen( req ) 
+      if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode()))
+      if resp.getcode() == 200:
+        unipept_resp += json.loads( urllib2.urlopen( req ).read() )
+    except Exception, e:
+      warn_err('HTTP Error %s\n' % (str(e)),exit_code=None)
   unmatched_peptides = []
   peptideMatches = []
   if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
-  if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
+  if options.unipept in ['pept2prot', 'pept2taxa']:
     dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
     ## multiple entries per trypticPeptide for pep2prot or pep2taxa
     mapping = {}
@@ -222,7 +551,7 @@
           match['tryptic_peptide'] = match['peptide']
           match['peptide'] = peptide
           peptideMatches.append(match)
-  else:
+  elif options.unipept in ['pept2lca', 'peptinfo']:
     ## should be one response per trypticPeptide for pep2lca
     respMap = {v['peptide']:v for v in unipept_resp}
     ## map resp back to peptides
@@ -239,6 +568,22 @@
       match['tryptic_peptide'] = match['peptide']
       match['peptide'] = peptide
       peptideMatches.append(match)
+  else:
+    respMap = {v['peptide']:v for v in unipept_resp}
+    ## map resp back to peptides
+    for peptide in peptides:
+      matches = list()
+      for part in pepToParts[peptide]:
+        if part in respMap and 'total_protein_count' in respMap[part]:
+          matches.append(respMap[part])
+      match = best_match(peptide,matches)
+      if not match:
+        unmatched_peptides.append(peptide)
+        longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
+        match = {'peptide' : longest_tryptic_peptide}
+      match['tryptic_peptide'] = match['peptide']
+      match['peptide'] = peptide
+      peptideMatches.append(match)
   resp = peptideMatches
   if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
   ## output results
@@ -250,96 +595,69 @@
         if peptide in unmatched_peptides:
           outputFile.write("%s\n" % peptide)
   if options.json:
-    if options.unipept == 'pept2prot':
-      with open(options.json,'w') as outputFile:
-        outputFile.write(str(resp))
-    else:
-      found_keys = set()
-      for i,pdict in enumerate(resp):
-        found_keys |= set(pdict.keys())
-      taxa_cols = []
-      for col in pept2lca_extra_column_order[-1:0:-1]:
-        if col+'_id' in found_keys:
-          taxa_cols.append(col)
-      id_to_node = dict()
-      def get_node(id,name,rank,child,seq):
-        if id not in id_to_node:
-          data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1,  'rank' : rank, 'sequences' : [] }
-          node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
-          id_to_node[id] = node
-        else:
-          node = id_to_node[id]
-        node['data']['count'] += 1
-        if seq is not None and seq not in node['data']['sequences']:
-           node['data']['sequences'].append(seq)
-        if child is None:
-          node['data']['self_count'] += 1
-        elif child['id'] not in node['kids']:
-          node['kids'].append(child['id'])
-          node['children'].append(child)
-        return node
-      root = get_node(1,'root','no rank',None,None)   
-      for i,pdict in enumerate(resp):
-        sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
-        seq = sequence
-        child = None
-        for col in taxa_cols:
-          col_id = col+'_id'
-          if col_id in pdict and pdict.get(col_id): 
-            col_name = col if col in found_keys else col+'_name'
-            child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
-            seq = None
-        if child:
-          get_node(1,'root','no rank',child,None)
+    if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
+      root = get_taxon_json(resp)
       with open(options.json,'w') as outputFile:
         outputFile.write(json.dumps(root))  
+    elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
+      with open(options.json,'w') as outputFile:
+        outputFile.write(str(resp))
+  if options.ec_json:
+    if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
+      root = get_ec_json(resp)
+      with open(options.ec_json,'w') as outputFile:
+        outputFile.write(json.dumps(root))
   if options.tsv or options.csv:
-    # 'pept2lca','pept2taxa','pept2prot'
-    found_keys = set()
-    results = []
-    for i,pdict in enumerate(resp):
-      results.append(pdict)
-      found_keys |= set(pdict.keys())
-      # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
-    column_names = []
-    column_keys = []
-    for col in column_order:
-      if col in found_keys:
-        column_names.append(col)
-        column_keys.append(col)
-      elif options.extra or options.names:
-        col_id = col+'_id'
-        col_name = col+'_name'
-        if options.extra:
-          if col_id in found_keys:
-            column_names.append(col_id)
-            column_keys.append(col_id)
-        if options.names:
-          if col_name in found_keys:
-            column_names.append(col)
-            column_keys.append(col_name)
-      else:
-        if col+'_name' in found_keys:
-          column_names.append(col)
-          column_keys.append(col+'_name')
-        elif col+'_id' in found_keys:
-          column_names.append(col)
-          column_keys.append(col+'_id')
-    # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
-    taxa = []
-    for i,pdict in enumerate(results):
-      vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
-      if vals not in taxa:
-        taxa.append(vals)
+    rows = []
+    column_names = None
+    if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
+      taxa = None
+      ec_dict = None
+      go_dict = None
+      if options.unipept in ['peptinfo']:
+        (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
+      if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
+        (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra)
+      if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
+        (go_dict,go_cols) = get_go_dict(resp, extra=options.extra)
+      for i,pdict in enumerate(resp):
+        peptide = pdict['peptide'] 
+        total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
+        column_names = ['peptide', 'total_protein_count']
+        vals = [peptide,total_protein_count] 
+        if ec_dict:
+          vals += ec_dict[peptide]
+          column_names += ec_cols
+        if go_dict:
+          vals += go_dict[peptide]
+          column_names += go_cols
+        if taxa:
+          vals += taxa[peptide][1:]
+          column_names += taxon_cols[1:]
+        rows.append(vals)
+    elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
+      (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
+      column_names = taxon_cols
+      rows = taxa.values()
+      for peptide,vals in taxa.iteritems():
+        rows.append(vals)
     if options.tsv:
       with open(options.tsv,'w') as outputFile:
-        outputFile.write("#%s\n"% '\t'.join(column_names))
-        for vals in taxa:
+        if column_names:
+          outputFile.write("#%s\n"% '\t'.join(column_names))
+        for vals in rows:
           outputFile.write("%s\n"% '\t'.join(vals))
     if options.csv:
       with open(options.csv,'w') as outputFile:
-        outputFile.write("%s\n"% ','.join(column_names))
-        for vals in taxa:
+        if column_names:
+          outputFile.write("%s\n"% ','.join(column_names))
+        for vals in rows:
           outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals]))
+  if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
+    column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
+    write_ec_table(options.ec_tsv, resp, column_order)
+  if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
+    column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
+    write_go_table(options.go_tsv, resp, column_order)
 
 if __name__ == "__main__" : __main__()
--- a/unipept.xml	Mon Feb 20 10:32:03 2017 -0500
+++ b/unipept.xml	Wed Jan 23 09:16:38 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="unipept" name="Unipept" version="2.0.1">
+<tool id="unipept" name="Unipept" version="4.0.0">
     <description>retrieve taxonomy for peptides</description>
     <macros>
         <xml name="equate_il">
@@ -24,6 +24,20 @@
                 <help>include fields for most specific taxonomic classification: taxon_rank,taxon_id,taxon_name before lineage</help>
             </param>
         </xml>
+        <xml name="domains">
+            <param name="domains" type="boolean" truevalue="-D" falsevalue="" checked="false" label="group responses by GO namespace (biological process, molecular function, cellular component)">
+                <yield/>
+            </param>
+        </xml>
+        <xml name="selected_outputs">
+            <param name="selected_outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs">
+                <option value="tsv" selected="true">Tabular with one line per peptide</option>
+                <option value="csv">Comma Separated Values (.csv) with one line per peptide</option>
+                <option value="json">JSON Taxomony Tree (for pept2lca, pep2taxa, and peptinfo)</option>
+                <yield/>
+                <option value="unmatched">Unmatched peptides</option>
+            </param>
+        </xml>
     </macros>
     <requirements>
         <requirement type="package" version="2.7">python</requirement>
@@ -33,11 +47,15 @@
     </stdio>
     <command><![CDATA[
       python '$__tool_directory__/unipept.py' 
+      ## --url 'http://morty.ugent.be/api/v1' -g -M 1  
       --api=$unipept.api
       $unipept.equate_il $unipept.extra 
-      #if $unipept.api != 'pept2prot':
+      #if $unipept.api in ['pept2lca', 'pept2taxa', 'peptinfo']:
         $unipept.names $unipept.allfields
       #end if
+      #if $unipept.api in ['pept2go', 'pept2funct', 'peptinfo']:
+        $unipept.domains
+      #end if
       $strict
       #if str($peptide_src.fmt) == 'proteomic':
         #if $peptide_src.input.datatype.file_ext == 'fasta':
@@ -58,16 +76,25 @@
       #elif str($peptide_src.fmt) == 'pepxml':
         --pepxml="$peptide_src.input_pepxml"
       #end if
-      #if 'json' in str($outputs).split(',') and str($unipept.api) != 'pept2prot':
+      #if 'json' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2lca', 'pept2taxa', 'peptinfo']:
         --json $output_json
       #end if
-      #if 'tsv' in str($outputs).split(','):
+      #if 'ec_json' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2ec', 'pept2funct', 'peptinfo']:
+        --ec_json $output_ec_json
+      #end if
+      #if 'tsv' in str($selected_outputs).split(','):
         --tsv $output_tsv
       #end if
-      #if 'csv' in str($outputs).split(','):
+      #if 'csv' in str($selected_outputs).split(','):
         --csv $output_csv
       #end if
-      #if 'unmatched' in str($outputs).split(','):
+      #if 'ec_tsv' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2ec', 'pept2funct', 'peptinfo']:
+        --ec_tsv $output_ec_tsv
+      #end if
+      #if 'go_tsv' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2go', 'pept2funct', 'peptinfo']:
+        --go_tsv $output_go_tsv
+      #end if
+      #if 'unmatched' in str($selected_outputs).split(','):
         --unmatched $output_unmatched
       #end if
     ]]></command>
@@ -77,6 +104,10 @@
               <option value="pept2lca" selected="true">pept2lca: lowest common ancestor</option>
               <option value="pept2taxa">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>
               <option value="pept2prot">pept2prot: UniProt entries containing a given tryptic peptide</option>
+              <option value="pept2ec">pept2ec: Tryptic peptides and associated EC terms</option>
+              <option value="pept2go">pept2go: Tryptic peptides and associated GO terms</option>
+              <option value="pept2funct">pept2funct: Tryptic peptides and associated EC and GO terms</option>
+              <option value="peptinfo">peptinfo: Tryptic peptides and associated EC and GO terms and lowest common ancestor taxonomy</option>
           </param>
           <when value="pept2lca">
               <expand macro="equate_il" />
@@ -100,6 +131,38 @@
                   </help>
               </expand>
           </when>
+          <when value="pept2ec">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name of the EC-number.
+                  </help>
+              </expand>
+          </when>
+          <when value="pept2go">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name of the GO-term.
+                  </help>
+              </expand>
+              <expand macro="domains" />
+          </when>
+          <when value="pept2funct">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name of the EC-number and GO-term.
+                  </help>
+              </expand>
+              <expand macro="domains" />
+          </when>
+          <when value="peptinfo">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name of the EC-number and GO-term.
+                  </help>
+              </expand>
+              <expand macro="domains" />
+              <expand macro="names" />
+          </when>
       </conditional>
       <conditional name="peptide_src">
         <param name="fmt" type="select" label="Peptides input format" >
@@ -126,29 +189,66 @@
           <param name="input_pepxml" type="data" format="pepxml" label="mzIndetML Input" />
         </when>
       </conditional>
-      <param name="outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs">
-        <option value="tsv" selected="true">tabular</option>
-        <option value="csv">Comma Separated Values (.csv)</option>
-        <option value="json">JSON Taxomony Tree (ignored for pept2prot)</option>
+      <param name="selected_outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs">
+       <option value="tsv" selected="true">Tabular with one line per peptide</option>
+       <option value="csv">Comma Separated Values (.csv) with one line per peptide</option>
+        <option value="json">JSON Taxonomy Tree (for pept2lca, pept2taxa, and peptinfo)</option>
+        <option value="go_tsv">Peptide GO terms in normalized tabular (for pept2go, pept2funct, and peptinfo)</option>
+        <option value="ec_tsv">Peptide EC terms in normalized tabular (for pept2ec, pept2funct, and peptinfo)</option>
+        <option value="ec_json">JSON EC Coverage Tree (for pept2ec, pept2funct, and peptinfo)</option>
         <option value="unmatched">Unmatched peptides</option>
       </param>
       <param name="strict" type="boolean" truevalue="--strict" falsevalue="" checked="false" label="Exit with error on invalid peptides, otherwise ignore them"/>
     </inputs>
     <outputs>
-      <data name="output_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} json"> 
-        <filter>'json' in outputs and unipept['api'] != 'pept2prot'</filter>
+      <data name="output_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} Taxonomy json"> 
+        <filter>'json' in selected_outputs and unipept['api'] in ('pept2lca', 'pept2taxa', 'peptinfo')</filter>
         <change_format>
             <when input="api" value="pept2prot" format="json" />
         </change_format>
       </data> 
+      <data name="output_ec_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} EC json">
+        <filter>'ec_json' in selected_outputs and unipept['api'] in ('pept2ec', 'pept2funct', 'peptinfo')</filter>
+      </data>
       <data name="output_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} tsv"> 
-        <filter>'tsv' in outputs</filter>
+        <filter>'tsv' in selected_outputs</filter>
+        <actions>
+            <action name="comment_lines" type="metadata" default="1" />
+            <!--
+            <conditional name="unipept.api">
+                <when value="pept2funct">
+                    <action name="column_names" type="metadata" default="peptide,total_protein_count,ec_numbers,ec_protein_counts,ec_names,go_terms,go_protein_counts,go_names" />
+                </when>
+                <when value="pept2go">
+                    <action name="column_names" type="metadata" default="peptide,total_protein_count,go_terms,go_protein_counts,go_names" />
+                </when>
+                <when value="pept2ec">
+                    <action name="column_names" type="metadata" default="peptide,total_protein_count,ec_numbers,ec_protein_counts,ec_names" />
+                </when>
+            </conditional>
+            -->
+        </actions>
       </data> 
       <data name="output_csv" format="csv" label="${tool.name} ${unipept.api} on ${on_string} csv"> 
-        <filter>'csv' in outputs</filter>
+        <filter>'csv' in selected_outputs</filter>
+      </data> 
+      <data name="output_ec_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} EC tsv"> 
+        <filter>'ec_tsv' in selected_outputs and unipept['api'] in ('pept2ec', 'pept2funct', 'peptinfo')</filter>
+        <actions>
+            <action name="column_names" type="metadata" default="Peptide,Total Protein Count,EC Number,Protein Count,EC Name" />
+        </actions>
+      </data> 
+      <data name="output_go_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} GO tsv"> 
+        <filter>'go_tsv' in selected_outputs and unipept['api'] in ('pept2go', 'pept2funct', 'peptinfo')</filter>
+        <actions>
+            <action name="column_names" type="metadata" default="Peptide,Total Protein Count,GO Term,Protein Count,GO Name" />
+        </actions>
       </data> 
       <data name="output_unmatched" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} unmatched"> 
-        <filter>'unmatched' in outputs</filter>
+        <filter>'unmatched' in selected_outputs</filter>
+        <actions>
+            <action name="column_names" type="metadata" default="Unmatched Peptide" />
+        </actions>
       </data> 
     </outputs>
     <tests>
@@ -159,7 +259,7 @@
         <param name="column" value="2"/>
         <param name="extra" value="True"/>
         <param name="names" value="True"/>
-        <param name="outputs" value="tsv,unmatched"/>
+        <param name="selected_outputs" value="tsv,unmatched"/>
         <output name="output_tsv">
             <assert_contents>
               <has_text text="Homininae" />
@@ -178,7 +278,7 @@
         <param name="equate_il" value="True"/>
         <param name="extra" value="True"/>
         <param name="names" value="True"/>
-        <param name="outputs" value="json,tsv"/>
+        <param name="selected_outputs" value="json,tsv"/>
         <output name="output_json">
             <assert_contents>
               <has_text text="VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKGSNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV" />
@@ -187,7 +287,7 @@
         <output name="output_tsv">
             <assert_contents>
               <has_text text="9606" />
-              <has_text text="9598" />
+              <has_text text="9596" />
             </assert_contents>
         </output>
       </test>
@@ -198,13 +298,37 @@
         <param name="equate_il" value="True"/>
         <param name="extra" value="False"/>
         <param name="names" value="False"/>
-        <param name="outputs" value="tsv"/>
+        <param name="selected_outputs" value="tsv"/>
         <output name="output_tsv">
             <assert_contents>
               <has_text text="sapiens" />
-              <has_text text="troglodytes" />
+              <has_text text="paniscus" />
               <has_text text="Gorilla" />
-              <has_text text="Macaca" />
+            </assert_contents>
+        </output>
+      </test>
+      <test>
+        <param name="api" value="pept2funct"/>
+        <param name="fmt" value="tabular"/>
+        <param name="input_tsv" value="input.tsv"/>
+        <param name="column" value="2"/>
+        <param name="extra" value="True"/>
+        <param name="names" value="True"/>
+        <param name="selected_outputs" value="tsv,ec_tsv,go_tsv,unmatched"/>
+        <output name="output_tsv">
+            <assert_contents>
+              <has_text text="GO:0004802" />
+              <has_text text="2.2.1.1" />
+            </assert_contents>
+        </output>
+        <output name="output_ec_tsv">
+            <assert_contents>
+              <has_text text="2.2.1.1" />
+            </assert_contents>
+        </output>
+        <output name="output_go_tsv">
+            <assert_contents>
+              <has_text text="GO:0004802" />
             </assert_contents>
         </output>
       </test>
@@ -335,6 +459,47 @@
         varietas_id
         forma_id
 
+    **pept2ec**  - http://unipept.ugent.be/apidocs/pept2ec
+
+    Returns the functional EC-numbers associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
+
+        peptide: the peptide that matched this record
+        total_protein_count: Total number of proteins matched with the given peptide
+        ec_number: EC-number associated with the current tryptic peptide.
+        protein_count: number of proteins matched with the given tryptic peptide that are labeled with the current EC-number.
+        name: Optional, name of the EC-number. Included when the extra parameter is set to true.
+
+
+    **pept2go**  - http://unipept.ugent.be/apidocs/pept2go
+
+    Returns the functional GO-terms associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
+
+        peptide: the peptide that matched this record
+        total_protein_count: Total number of proteins matched with the given peptide
+        go_term: The GO-term associated with the current tryptic peptide.
+        protein_count: number of proteins matched with the given tryptic peptide that are labeled with the current GO-term.
+        name: Optional, name of the GO-term. Included when the extra parameter is set to true.
+
+
+    **pept2funct**  - http://unipept.ugent.be/apidocs/pept2funct
+
+    Returns the functional EC-numbers and GO-terms associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
+
+        peptide: the peptide that matched this record
+        total_protein_count: Total number of proteins matched with the given peptide
+        ec_number: EC-number associated with the current tryptic peptide.
+        protein_count: number of proteins matched with the given tryptic peptide that are labeled with the current EC-number.
+        name: Optional, name of the EC-number. Included when the extra parameter is set to true.
+        go_term: The GO-term associated with the current tryptic peptide.
+        protein_count: number of proteins matched with the given tryptic peptide that are labeled with the current GO-term.
+        name: Optional, name of the GO-term. Included when the extra parameter is set to true.
+
 
     **Attributions**