Mercurial > repos > galaxyp > unipept

--- a/unipept.py	Wed Dec 07 16:44:07 2016 -0500
+++ b/unipept.py	Mon Feb 20 10:32:03 2017 -0500
@@ -38,7 +38,7 @@
 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']

 def __main__():
-  version = '1.1'
+  version = '2.0'
   pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'

   def read_tabular(filepath,col):
@@ -120,6 +120,8 @@
   parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
   parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
   parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
+  parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
+
   # output fields
   parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
   # Warn vs Error Flag
@@ -174,19 +176,23 @@
         partToPeps.setdefault(part,[]).append(peptide)
   trypticPeptides = partToPeps.keys()
   ## unipept
-  post_data = []
-  if options.equate_il:
-    post_data.append(("equate_il","true"))
-  if options.names:
-    post_data.append(("extra","true"))
-    post_data.append(("names","true"))
-  elif options.extra:
-    post_data.append(("extra","true"))
-  post_data += [('input[]', x) for x in trypticPeptides]
-  headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
-  url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
-  req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
-  unipept_resp = json.loads( urllib2.urlopen( req ).read() )
+  unipept_resp = []
+  idx = range(0,len(trypticPeptides),options.max_request)
+  idx.append(len(trypticPeptides))
+  for i in range(len(idx)-1):
+    post_data = []
+    if options.equate_il:
+      post_data.append(("equate_il","true"))
+    if options.names or options.json:
+      post_data.append(("extra","true"))
+      post_data.append(("names","true"))
+    elif options.extra or options.json:
+      post_data.append(("extra","true"))
+    post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
+    headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
+    url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
+    req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
+    unipept_resp += json.loads( urllib2.urlopen( req ).read() )
   unmatched_peptides = []
   peptideMatches = []
   if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
@@ -244,8 +250,49 @@
         if peptide in unmatched_peptides:
           outputFile.write("%s\n" % peptide)
   if options.json:
-    with open(options.json,'w') as outputFile:
-      outputFile.write(str(resp))
+    if options.unipept == 'pept2prot':
+      with open(options.json,'w') as outputFile:
+        outputFile.write(str(resp))
+    else:
+      found_keys = set()
+      for i,pdict in enumerate(resp):
+        found_keys |= set(pdict.keys())
+      taxa_cols = []
+      for col in pept2lca_extra_column_order[-1:0:-1]:
+        if col+'_id' in found_keys:
+          taxa_cols.append(col)
+      id_to_node = dict()
+      def get_node(id,name,rank,child,seq):
+        if id not in id_to_node:
+          data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1,  'rank' : rank, 'sequences' : [] }
+          node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
+          id_to_node[id] = node
+        else:
+          node = id_to_node[id]
+        node['data']['count'] += 1
+        if seq is not None and seq not in node['data']['sequences']:
+           node['data']['sequences'].append(seq)
+        if child is None:
+          node['data']['self_count'] += 1
+        elif child['id'] not in node['kids']:
+          node['kids'].append(child['id'])
+          node['children'].append(child)
+        return node
+      root = get_node(1,'root','no rank',None,None)
+      for i,pdict in enumerate(resp):
+        sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
+        seq = sequence
+        child = None
+        for col in taxa_cols:
+          col_id = col+'_id'
+          if col_id in pdict and pdict.get(col_id):
+            col_name = col if col in found_keys else col+'_name'
+            child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
+            seq = None
+        if child:
+          get_node(1,'root','no rank',child,None)
+      with open(options.json,'w') as outputFile:
+        outputFile.write(json.dumps(root))
   if options.tsv or options.csv:
     # 'pept2lca','pept2taxa','pept2prot'
     found_keys = set()
--- a/unipept.xml	Wed Dec 07 16:44:07 2016 -0500
+++ b/unipept.xml	Mon Feb 20 10:32:03 2017 -0500
@@ -1,33 +1,38 @@
-<tool id="unipept" name="Unipept" version="1.1.0">
+<tool id="unipept" name="Unipept" version="2.0.1">
     <description>retrieve taxonomy for peptides</description>
     <macros>
         <xml name="equate_il">
             <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="true" label="Equate isoleucine and leucine">
                 <help>isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records</help>
-            </param >
+            </param>
         </xml>
         <xml name="extra">
             <param name="extra" type="boolean" truevalue="-x" falsevalue="" checked="false" label="retrieve extra information">
                 <yield/>
-            </param >
+            </param>
+        </xml>
+        <xml name="extra_true">
+            <param name="extra" type="boolean" truevalue="-x" falsevalue="" checked="true" label="retrieve extra information">
+                <yield/>
+            </param>
         </xml>
         <xml name="names">
             <param name="names" type="boolean" truevalue="-n" falsevalue="" checked="true" label="names" >
                 <help>return the names in complete taxonomic lineage</help>
-            </param >
+            </param>
             <param name="allfields" type="boolean" truevalue="-A" falsevalue="" checked="false" label="allfields" >
                 <help>include fields for most specific taxonomic classification: taxon_rank,taxon_id,taxon_name before lineage</help>
-            </param >
+            </param>
         </xml>
     </macros>
     <requirements>
-        <requirement type="package" version="2.7.12">python</requirement>
+        <requirement type="package" version="2.7">python</requirement>
     </requirements>
     <stdio>
         <exit_code range="1:" />
     </stdio>
-    <command interpreter="python"><![CDATA[
-      unipept.py
+    <command><![CDATA[
+      python '$__tool_directory__/unipept.py'
       --api=$unipept.api
       $unipept.equate_il $unipept.extra
       #if $unipept.api != 'pept2prot':
@@ -53,7 +58,7 @@
       #elif str($peptide_src.fmt) == 'pepxml':
         --pepxml="$peptide_src.input_pepxml"
       #end if
-      #if 'json' in str($outputs).split(','):
+      #if 'json' in str($outputs).split(',') and str($unipept.api) != 'pept2prot':
         --json $output_json
       #end if
       #if 'tsv' in str($outputs).split(','):
@@ -82,8 +87,7 @@
           </when>
           <when value="pept2taxa">
               <expand macro="equate_il" />
-              <expand macro="extra">
-                  <checked>true</checked>
+              <expand macro="extra_true">
                   <help>Return the complete lineage of each organism, and include ID fields.</help>
               </expand>
               <expand macro="names" />
@@ -125,14 +129,17 @@
       <param name="outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs">
         <option value="tsv" selected="true">tabular</option>
         <option value="csv">Comma Separated Values (.csv)</option>
-        <option value="json">JSON</option>
+        <option value="json">JSON Taxomony Tree (ignored for pept2prot)</option>
         <option value="unmatched">Unmatched peptides</option>
       </param>
       <param name="strict" type="boolean" truevalue="--strict" falsevalue="" checked="false" label="Exit with error on invalid peptides, otherwise ignore them"/>
     </inputs>
     <outputs>
-      <data name="output_json" format="json" label="${tool.name} ${unipept.api} on ${on_string} json">
-        <filter>'json' in outputs</filter>
+      <data name="output_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} json">
+        <filter>'json' in outputs and unipept['api'] != 'pept2prot'</filter>
+        <change_format>
+            <when input="api" value="pept2prot" format="json" />
+        </change_format>
       </data>
       <data name="output_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} tsv">
         <filter>'tsv' in outputs</filter>