Repository 'unipept'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/unipept

Changeset 0:6430407e5869 (2015-04-03)
Next changeset 1:0c1ee95282fa (2015-04-14)
Commit message:
Uploaded
added:
.shed.yml
repository_dependencies.xml
test-data/input.fasta
test-data/input.tsv
test-data/input_bad.fasta
unipept.py
unipept.xml
b
diff -r 000000000000 -r 6430407e5869 .shed.yml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml Fri Apr 03 14:55:49 2015 -0400
b
@@ -0,0 +1,18 @@
+categories: 
+ - Proteomics
+ - Metaproteomics
+description: Unipept retrieves metaproteomics information
+homepage_url: https://github.com/galaxyproteomics/tools-galaxyp
+long_description: 'Unipept retrieves taxonomy for tryptic peptides using the unipept API. 
+  http://unipept.ugent.be
+  http://unipept.ugent.be/apidocs 
+
+  The Unipept metaproteomics analysis pipeline
+  Bart Mesuere1,*, Griet Debyser2, Maarten Aerts3, Bart Devreese2, Peter Vandamme3 and Peter Dawyndt1
+  Article first published online: 11 FEB 2015
+  DOI: 10.1002/pmic.201400361
+  http://onlinelibrary.wiley.com/doi/10.1002/pmic.201400361/abstract;jsessionid=BFF1994E4C14DA73D7C907EB208AD710.f04t04
+  '
+name: unipept
+owner: galaxyp
+remote_repository_url: http://unipept.ugent.be/apidocs
b
diff -r 000000000000 -r 6430407e5869 repository_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/repository_dependencies.xml Fri Apr 03 14:55:49 2015 -0400
b
@@ -0,0 +1,4 @@
+<?xml version="1.0"?>
+<repositories description="Required proteomics dependencies.">
+    <repository changeset_revision="de34893b3834" name="proteomics_datatypes" owner="iracooke" toolshed="https://toolshed.g2.bx.psu.edu" />
+</repositories>
b
diff -r 000000000000 -r 6430407e5869 test-data/input.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.fasta Fri Apr 03 14:55:49 2015 -0400
b
@@ -0,0 +1,10 @@
+>1
+AIPQLEVARPADAYETAEAYR
+>2
+AAEGGLSR
+>3
+APVLSDSSCK
+>4
+DQIAHEGK
+>5
+ATLTSGAAR
b
diff -r 000000000000 -r 6430407e5869 test-data/input.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.tsv Fri Apr 03 14:55:49 2015 -0400
b
@@ -0,0 +1,5 @@
+1 AIPQLEVARPADAYETAEAYR AIPQLEVARPADAYETAEAYR
+2 AAEGGLSR AAEGQLSR
+3 APVLSDSSCK APVLJDSSCK
+4 DQIAHEGK DQUAHEGK
+5 ATLTSGAAR ATLTSGAAR
b
diff -r 000000000000 -r 6430407e5869 test-data/input_bad.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_bad.fasta Fri Apr 03 14:55:49 2015 -0400
b
@@ -0,0 +1,10 @@
+>1
+AIPQLEVARPADAYETAEAYR
+>2
+AAEGQLSR
+>3
+APVLJDSSCK
+>4
+DQUAHEGK
+>5
+ATLTSGAAR
b
diff -r 000000000000 -r 6430407e5869 unipept.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/unipept.py Fri Apr 03 14:55:49 2015 -0400
[
b'@@ -0,0 +1,194 @@\n+#!/usr/bin/env python\n+"""\n+#\n+#------------------------------------------------------------------------------\n+#                         University of Minnesota\n+#         Copyright 2015, Regents of the University of Minnesota\n+#------------------------------------------------------------------------------\n+# Author:\n+#\n+#  James E Johnson\n+#\n+#------------------------------------------------------------------------------\n+"""\n+\n+import json\n+import logging\n+import optparse\n+from optparse import OptionParser\n+import os\n+import sys\n+import re\n+import urllib\n+import urllib2\n+try:\n+    import xml.etree.cElementTree as ET\n+except ImportError:\n+    import xml.etree.ElementTree as ET\n+\n+def warn_err(msg,exit_code=1):\n+    sys.stderr.write(msg)\n+    if exit_code:\n+      sys.exit(exit_code)\n+\n+def read_fasta(fp):\n+    name, seq = None, []\n+    for line in fp:\n+        line = line.rstrip()\n+        if line.startswith(">"):\n+            if name: yield (name, \'\'.join(seq))\n+            name, seq = line, []\n+        else:\n+            seq.append(line)\n+    if name: yield (name, \'\'.join(seq))\n+\n+def read_mzid(fp):\n+  peptides = []\n+  for event, elem in ET.iterparse(fp):\n+    if event == \'end\':\n+      if re.search(\'PeptideSequence\',elem.tag):\n+        peptides.append(elem.text)\n+  return peptides\n+\n+def read_pepxml(fp):\n+  peptides = []\n+  for event, elem in ET.iterparse(fp):\n+    if event == \'end\':\n+      if re.search(\'search_hit\',elem.tag):\n+        peptides.append(elem.get(\'peptide\'))\n+  return peptides\n+\n+def __main__():\n+  #Parse Command Line\n+  parser = optparse.OptionParser()\n+  # unipept API\n+  parser.add_option( \'-A\', \'--api\', dest=\'unipept\', default=\'pept2lca\', choices=[\'pept2lca\',\'pept2taxa\',\'pept2prot\'], help=\'The unipept application: pept2lca, pept2taxa, or pept2prot\' )\n+  # files\n+  parser.add_option( \'-t\', \'--tabular\', dest=\'tabular\', 
default=None, help=\'A tabular file that contains a peptide column\' )\n+  parser.add_option( \'-c\', \'--column\', dest=\'column\', type=\'int\', default=0, help=\'The column (zero-based) in the tabular file that contains peptide sequences\' )\n+  parser.add_option( \'-f\', \'--fasta\', dest=\'fasta\', default=None, help=\'A fasta file containing peptide sequences\' )\n+  parser.add_option( \'-m\', \'--mzid\', dest=\'mzid\', default=None, help=\'A mxIdentML file containing peptide sequences\' )\n+  parser.add_option( \'-p\', \'--pepxml\', dest=\'pepxml\', default=None, help=\'A pepxml file containing peptide sequences\' )\n+  # Unipept Flags\n+  parser.add_option( \'-e\', \'--equate_il\', dest=\'equate_il\', action=\'store_true\', default=False, help=\'isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records\' )\n+  parser.add_option( \'-x\', \'--extra\', dest=\'extra\', action=\'store_true\', default=False, help=\'return the complete lineage of the taxonomic lowest common ancestor\' )\n+  parser.add_option( \'-n\', \'--names\', dest=\'names\', action=\'store_true\', default=False, help=\'return the names of all ranks in the lineage of the taxonomic lowest common ancestor\' )\n+  # Warn vs Error Flag\n+  parser.add_option( \'-S\', \'--strict\', dest=\'strict\', action=\'store_true\', default=False, help=\'Print exit on invalid peptide\' )\n+  # outputs\n+  parser.add_option( \'-J\', \'--json\', dest=\'json\', default=None, help=\'Output file path for json formatted results\')\n+  parser.add_option( \'-T\', \'--tsv\', dest=\'tsv\', default=None, help=\'Output file path for TAB-separated-values (.tsv) formatted results\')\n+  parser.add_option( \'-C\', \'--csv\', dest=\'csv\', default=None, help=\'Output file path for Comma-separated-values (.csv) formatted results\')\n+  parser.add_option( \'-M\', \'--mismatch\', dest=\'mismatch\', default=None, help=\'Output file path for peptide with no matches\' )\n+  (options, args) = 
parser.parse_args()\n+  invalid_ec = 2 if options.strict else None\n+  peptides = []\n+  pep_pat = \'^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$\'\n+  ## Get peptide sequences\n+  if options.mzid:\n+    peptides += read_mzid(options.mzid)\n+  if options.pepxml:\n+  '..b'id_ec)\n+        peptides.append(peptide) \n+  if args and len(args) > 0:\n+    for i,peptide in enumerate(args):\n+      if not re.match(pep_pat,peptide):\n+        warn_err(\'"%s" is not a peptide (arg %d)\\n\' % (peptide,i),exit_code=invalid_ec)\n+      peptides.append(peptide) \n+  if len(peptides) < 1:\n+    warn_err("No peptides input!",exit_code=1)\n+  ## unipept\n+  post_data = []\n+  if options.equate_il:\n+    post_data.append(("equate_il","true"))\n+  if options.names:\n+    post_data.append(("extra","true"))\n+    post_data.append(("names","true"))\n+  elif options.extra:\n+    post_data.append(("extra","true"))\n+  post_data += [(\'input[]\', x) for x in peptides]\n+  headers = {\'Content-Type\': \'application/x-www-form-urlencoded\',  \'Accept\': \'application/json\'}\n+  url = \'http://api.unipept.ugent.be/api/v1/%s\' % options.unipept\n+  req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )\n+  resp = json.loads( urllib2.urlopen( req ).read() )\n+  ## output results\n+  if not (options.mismatch or options.json or options.tsv or options.csv):\n+    print >> sys.stdout, str(resp)\n+  if options.mismatch:\n+    peptides_matched = []\n+    for i,pdict in enumerate(resp):\n+      peptides_matched.append(pdict[\'peptide\'])\n+    with open(options.mismatch,\'w\') as outputFile:\n+      for peptide in peptides:\n+        if not peptide in peptides_matched:\n+          outputFile.write("%s\\n" % peptide)\n+  if options.json:\n+    with open(options.json,\'w\') as outputFile:\n+      outputFile.write(str(resp))  \n+  if options.tsv or options.csv:\n+    # \'pept2lca\',\'pept2taxa\',\'pept2prot\'\n+    pept2lca_column_order = [ 
\'peptide\',\'superkingdom\',\'kingdom\',\'subkingdom\',\'superphylum\',\'phylum\',\'subphylum\',\'superclass\',\'class_\',\'subclass\',\'infraclass\',\'superorder\',\'order\',\'suborder\',\'infraorder\',\'parvorder\',\'superfamily\',\'family\',\'subfamily\',\'tribe\',\'subtribe\',\'genus\',\'subgenus\',\'species_group\',\'species_subgroup\',\'species\',\'subspecies\',\'varietas\',\'forma\' ]\n+    pept2prot_column_order = [ \'peptide\',\'uniprot_id\',\'taxon_id\',\'taxon_name\',\'ec_references\',\'go_references\',\'refseq_ids\',\'refseq_protein_ids\',\'insdc_ids\',\'insdc_protein_ids\']\n+    column_order = pept2prot_column_order if options.unipept == \'pept2prot\' else pept2lca_column_order\n+    found_keys = set()\n+    results = []\n+    for i,pdict in enumerate(resp):\n+      results.append(pdict)\n+      found_keys |= set(pdict.keys())\n+      # print >> sys.stderr, "%s\\n%s" % (pdict.keys(),found_keys)\n+    column_names = []\n+    column_keys = []\n+    for col in column_order:\n+      if col in found_keys:\n+        column_names.append(col)\n+        column_keys.append(col)\n+      elif options.extra or options.names:\n+        col_id = col+\'_id\'\n+        col_name = col+\'_name\'\n+        if options.extra:\n+          if col_id in found_keys:\n+            column_names.append(col_id)\n+            column_keys.append(col_id)\n+        if options.names:\n+          if col_name in found_keys:\n+            column_names.append(col)\n+            column_keys.append(col_name)\n+      else:\n+        if col+\'_name\' in found_keys:\n+          column_names.append(col)\n+          column_keys.append(col+\'_name\')\n+        elif col+\'_id\' in found_keys:\n+          column_names.append(col)\n+          column_keys.append(col+\'_id\')\n+    # print >> sys.stderr, "%s\\n%s" % (column_names,column_keys)\n+    taxa = []\n+    for i,pdict in enumerate(results):\n+      vals = [str(pdict[x]) if x in pdict and pdict[x] else \'\' for x in column_keys]\n+      
taxa.append(vals)\n+    if options.tsv:\n+      with open(options.tsv,\'w\') as outputFile:\n+        outputFile.write("#%s\\n"% \'\\t\'.join(column_names))\n+        for vals in taxa:\n+          outputFile.write("%s\\n"% \'\\t\'.join(vals))\n+    if options.csv:\n+      with open(options.csv,\'w\') as outputFile:\n+        outputFile.write("%s\\n"% \',\'.join(column_names))\n+        for vals in taxa:\n+          outputFile.write("%s\\n"% \',\'.join([\'"%s"\' % (v if v else \'\') for v in vals]))\n+\n+if __name__ == "__main__" : __main__()\n'
b
diff -r 000000000000 -r 6430407e5869 unipept.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/unipept.xml Fri Apr 03 14:55:49 2015 -0400
[
b'@@ -0,0 +1,306 @@\n+<tool id="unipept" name="Unipept" version="0.1.0">\n+    <description>retrieve taxonomy for peptides</description>\n+    <macros>\n+        <xml name="equate_il">\n+            <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="false" label="Equate isoleucine and leucine">\n+                <help>isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records</help>\n+            </param >\n+        </xml>\n+        <xml name="extra">\n+            <param name="extra" type="boolean" truevalue="-x" falsevalue="" checked="false" label="retrieve extra information">\n+                <yield/>\n+            </param >\n+        </xml>\n+        <xml name="names">\n+            <param name="names" type="boolean" truevalue="-n" falsevalue="" checked="true" label="names" >\n+                <help>return the names of taxons</help>\n+            </param >\n+        </xml>\n+    </macros>\n+    <requirements>\n+    </requirements>\n+    <stdio>\n+        <exit_code range="1:" />\n+    </stdio>\n+    <command interpreter="python"><![CDATA[\n+      unipept.py \n+      --api=$unipept.api\n+      $unipept.equate_il $unipept.extra \n+      #if $unipept.api != \'pept2prot\':\n+        $unipept.names \n+      #end if\n+      $strict\n+      #if str($peptide_src.fmt) == \'proteomic\':\n+        #if $peptide_src.input.datatype.file_ext == \'fasta\':\n+          --fasta="$peptide_src.input"\n+        #elif $peptide_src.input.datatype.file_ext == \'mzid\':\n+          --mzid="$peptide_src.input"\n+        #elif $peptide_src.input.datatype.file_ext == \'pepxml\':\n+          --pepxml="$peptide_src.input"\n+        #end if\n+      #elif str($peptide_src.fmt) == \'tabular\':\n+        --tabular="$peptide_src.input_tsv"\n+        #set $col = int(str($peptide_src.column)) - 1\n+        --column=$col\n+      #elif str($peptide_src.fmt) == \'fasta\':\n+        --fasta="$peptide_src.input_fasta"\n+      #elif 
str($peptide_src.fmt) == \'mzid\':\n+        --mzid="$peptide_src.input_mzid"\n+      #elif str($peptide_src.fmt) == \'pepxml\':\n+        --pepxml="$peptide_src.input_pepxml"\n+      #end if\n+      #if \'json\' in str($outputs).split(\',\'):\n+        --json $output_json\n+      #end if\n+      #if \'tsv\' in str($outputs).split(\',\'):\n+        --tsv $output_tsv\n+      #end if\n+      #if \'csv\' in str($outputs).split(\',\'):\n+        --csv $output_csv\n+      #end if\n+      #if \'mismatch\' in str($outputs).split(\',\'):\n+        --mismatch $output_mismatch\n+      #end if\n+    ]]></command>\n+    <inputs>\n+      <conditional name="unipept">\n+          <param name="api" type="select" label="Unipept application" >\n+              <option value="pept2taxa" selected="true">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>\n+              <option value="pept2lca">pept2lca: lowest common ancestor</option>\n+              <option value="pept2prot">pept2prot: UniProt entries containing a given tryptic peptide</option>\n+          </param>\n+          <when value="pept2taxa">\n+              <expand macro="equate_il" />\n+              <expand macro="extra">\n+                  <checked>true</checked>\n+                  <help>Return the complete lineage of each organism.</help>\n+              </expand>\n+              <expand macro="names" />\n+          </when>\n+          <when value="pept2lca">\n+              <expand macro="equate_il" />\n+              <expand macro="extra">\n+                  <help>Return the complete lineage of the taxonomic lowest common ancestor.</help>\n+              </expand>\n+              <expand macro="names" />\n+          </when>\n+          <when value="pept2prot">\n+              <expand macro="equate_il" />\n+              <expand macro="extra">\n+                  <help>Return additional information fields: taxon_name, ec_references, go_references, refseq_ids, 
refseq_protein_ids, insdc_ids, insdc_protein_ids\n+                        WARNING: Huge perfomance penalty!  Only use for small number of peptide'..b'ted RefSeq protein accession numbers\n+        insdc_ids: a space separated list of associated insdc accession numbers\n+        insdc_protein_ids: a space separated list of associated insdc protein accession numbers\n+\n+    http://unipept.ugent.be/apidocs/pept2prot\n+\n+    **pept2taxa**\n+    Returns the set of organisms associated with the UniProt entries containing a given tryptic peptide. This is the same information as provided on the Lineage table tab when performing a search with the Tryptic Peptide Analysis in the web interface.\n+\n+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::\n+\n+        peptide: the peptide that matched this record\n+        taxon_id: the NCBI taxon id of the organism associated with the matching record\n+        taxon_name: the name of the organism associated with the matching record\n+        taxon_rank: the taxonomic rank of the organism associated with the matching record\n+\n+    When the extra parameter is set to true, objects contain additional information about the lineages of the organism extracted from the NCBI taxonomy. 
The taxon id of each rank in the lineage is specified using the following information fields::\n+\n+        superkingdom_id\n+        kingdom_id\n+        subkingdom_id\n+        superphylum_id\n+        phylum_id\n+        subphylum_id\n+        superclass_id\n+        class_id\n+        subclass_id\n+        infraclass_id\n+        superorder_id\n+        order_id\n+        suborder_id\n+        infraorder_id\n+        parvorder_id\n+        superfamily_id\n+        family_id\n+        subfamily_id\n+        tribe_id\n+        subtribe_id\n+        genus_id\n+        subgenus_id\n+        species_group_id\n+        species_subgroup_id\n+        species_id\n+        subspecies_id\n+        varietas_id\n+        forma_id\n+\n+    http://unipept.ugent.be/apidocs/pept2taxa\n+\n+    **pept2lca** \n+    Returns the taxonomic lowest common ancestor for a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.\n+\n+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::\n+\n+        peptide: the peptide that matched this record\n+        taxon_id: the NCBI taxon id of the organism associated with the matching record\n+        taxon_name: the name of the organism associated with the matching record\n+        taxon_rank: the taxonomic rank of the organism associated with the matching record\n+\n+    When the extra parameter is set to true, objects contain additional information about the lineage of the taxonomic lowest common ancestor extracted from the NCBI taxonomy. 
The taxon id of each rank in the lineage is specified using the following information fields::\n+\n+        superkingdom_id\n+        kingdom_id\n+        subkingdom_id\n+        superphylum_id\n+        phylum_id\n+        subphylum_id\n+        superclass_id\n+        class_id\n+        subclass_id\n+        infraclass_id\n+        superorder_id\n+        order_id\n+        suborder_id\n+        infraorder_id\n+        parvorder_id\n+        superfamily_id\n+        family_id\n+        subfamily_id\n+        tribe_id\n+        subtribe_id\n+        genus_id\n+        subgenus_id\n+        species_group_id\n+        species_subgroup_id\n+        species_id\n+        subspecies_id\n+        varietas_id\n+        forma_id\n+\n+    http://unipept.ugent.be/apidocs/pept2lca\n+\n+    **Attributions**\n+\n+    The Unipept metaproteomics analysis pipeline\n+    Bart Mesuere1,*, Griet Debyser2, Maarten Aerts3, Bart Devreese2, Peter Vandamme3 andPeter Dawyndt1\n+    Article first published online: 11 FEB 2015\n+    DOI: 10.1002/pmic.201400361\n+    http://onlinelibrary.wiley.com/doi/10.1002/pmic.201400361/abstract;jsessionid=BFF1994E4C14DA73D7C907EB208AD710.f04t04\n+\n+    ]]></help>\n+  <citations>\n+    <citation type="doi">doi:10.1002/pmic.201400361</citation>\n+  </citations>\n+\n+</tool>\n'