Repository 'unipept'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/unipept

Changeset 1:0c1ee95282fa (2015-04-14)
Previous changeset 0:6430407e5869 (2015-04-03) Next changeset 2:503ab8a39006 (2016-12-07)
Commit message:
Uploaded
modified:
unipept.py
unipept.xml
added:
test-data/peptide.fa
test-data/tryptic.fa
test-data/tryptic.tsv
removed:
.shed.yml
b
diff -r 6430407e5869 -r 0c1ee95282fa .shed.yml
--- a/.shed.yml Fri Apr 03 14:55:49 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,18 +0,0 @@
-categories: 
- - Proteomics
- - Metaproteomics
-description: Unipept retrieves metaproteomics information
-homepage_url: https://github.com/galaxyproteomics/tools-galaxyp
-long_description: 'Unipept retrieves taxonomy for tryptic peptides using the unipept API. 
-  http://unipept.ugent.be
-  http://unipept.ugent.be/apidocs 
-
-  The Unipept metaproteomics analysis pipeline
-  Bart Mesuere1,*, Griet Debyser2, Maarten Aerts3, Bart Devreese2, Peter Vandamme3 andPeter Dawyndt1
-  Article first published online: 11 FEB 2015
-  DOI: 10.1002/pmic.201400361
-  http://onlinelibrary.wiley.com/doi/10.1002/pmic.201400361/abstract;jsessionid=BFF1994E4C14DA73D7C907EB208AD710.f04t04
-  '
-name: unipept
-owner: galaxyp
-remote_repository_url: http://unipept.ugent.be/apidocs
b
diff -r 6430407e5869 -r 0c1ee95282fa test-data/peptide.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/peptide.fa Tue Apr 14 16:44:22 2015 -0400
b
@@ -0,0 +1,9 @@
+>tr|G3RWV1|G3RWV1_GORGO
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV
+>sp|Q9BYE9|CDHR2_HUMAN
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPRIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAVGSASVQVLVRVSALVDYERQTAMAV
+>tr|H2QS28|H2QS28_PANTR
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPRIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSGLVDYERQTAMAV
b
diff -r 6430407e5869 -r 0c1ee95282fa test-data/tryptic.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tryptic.fa Tue Apr 14 16:44:22 2015 -0400
b
@@ -0,0 +1,19 @@
+>trypticQTAMAV
+QTAMAV
+>trypticAAGSASVQVLVR
+AAGSASVQVLVR
+>trypticAVGSASVQVLVR
+AVGSASVQVLVR
+>trypticIPIDDLTMVVYDPDK
+IPIDDLTMVVYDPDK
+>trypticVSGLVDYER
+VSGLVDYER
+>trypticVSALVDYER
+VSALVDYER
+>trypticGSNGTFLLSLGGPDAEAFSVSPER
+GSNGTFLLSLGGPDAEAFSVSPER
+>trypticVMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
+
+
+
b
diff -r 6430407e5869 -r 0c1ee95282fa test-data/tryptic.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tryptic.tsv Tue Apr 14 16:44:22 2015 -0400
b
@@ -0,0 +1,8 @@
+1 QTAMAV QTAMAV
+2 AAGSASVQVLVR AAGSASJQVLVR
+3 AVGSASVQVLVR AVGSASVQVLVR
+4 IPIDDLTMVVYDPDK IPIDDLTMVVYDPDK
+5 GSNGTFLLSLGGPDAEAFSVSPER GSNGTFLLSLGGPDAEAFSVSPE
+6 VSGLVDYER VSGLVDYER
+7 VSALVDYER VSALVDYER
+8 VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
b
diff -r 6430407e5869 -r 0c1ee95282fa unipept.py
--- a/unipept.py Fri Apr 03 14:55:49 2015 -0400
+++ b/unipept.py Tue Apr 14 16:44:22 2015 -0400
[
b'@@ -31,39 +31,86 @@\n     if exit_code:\n       sys.exit(exit_code)\n \n-def read_fasta(fp):\n+pept2lca_column_order = [\'peptide\',\'taxon_rank\',\'taxon_id\',\'taxon_name\']\n+pept2lca_extra_column_order = [\'peptide\',\'superkingdom\',\'kingdom\',\'subkingdom\',\'superphylum\',\'phylum\',\'subphylum\',\'superclass\',\'class\',\'subclass\',\'infraclass\',\'superorder\',\'order\',\'suborder\',\'infraorder\',\'parvorder\',\'superfamily\',\'family\',\'subfamily\',\'tribe\',\'subtribe\',\'genus\',\'subgenus\',\'species_group\',\'species_subgroup\',\'species\',\'subspecies\',\'varietas\',\'forma\' ]\n+pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]\n+pept2prot_column_order = [\'peptide\',\'uniprot_id\',\'taxon_id\']\n+pept2prot_extra_column_order = pept2prot_column_order + [\'taxon_name\',\'ec_references\',\'go_references\',\'refseq_ids\',\'refseq_protein_ids\',\'insdc_ids\',\'insdc_protein_ids\']\n+\n+def __main__():\n+  version = \'1.1\'\n+  pep_pat = \'^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$\'\n+\n+  def read_tabular(filepath,col):\n+    peptides = []\n+    with open(filepath) as fp:\n+      for i,line in enumerate(fp):\n+        if line.strip() == \'\' or line.startswith(\'#\'):\n+          continue\n+        fields = line.rstrip(\'\\n\').split(\'\\t\')\n+        peptide = fields[col]\n+        if not re.match(pep_pat,peptide):\n+          warn_err(\'"%s" is not a peptide (line %d column %d of tabular file: %s)\\n\' % (peptide,i,col,filepath),exit_code=invalid_ec)\n+        peptides.append(peptide)\n+    return peptides\n+\n+  def get_fasta_entries(fp):\n     name, seq = None, []\n     for line in fp:\n-        line = line.rstrip()\n-        if line.startswith(">"):\n-            if name: yield (name, \'\'.join(seq))\n-            name, seq = line, []\n-        else:\n-            seq.append(line)\n+      line = line.rstrip()\n+      if line.startswith(">"):\n+        if name: yield (name, \'\'.join(seq))\n+        name, seq = line, []\n+      else:\n+        seq.append(line)\n     if name: yield (name, \'\'.join(seq))\n \n-def read_mzid(fp):\n-  peptides = []\n-  for event, elem in ET.iterparse(fp):\n-    if event == \'end\':\n-      if re.search(\'PeptideSequence\',elem.tag):\n-        peptides.append(elem.text)\n-  return peptides\n+  def read_fasta(filepath):\n+    peptides = []\n+    with open(filepath) as fp:\n+      for id, peptide in get_fasta_entries(fp):\n+        if not re.match(pep_pat,peptide):\n+          warn_err(\'"%s" is not a peptide (id %s of fasta file: %s)\\n\' % (peptide,id,filepath),exit_code=invalid_ec)\n+        peptides.append(peptide)\n+    return peptides\n+\n+  def read_mzid(fp):\n+    peptides = []\n+    for event, elem in ET.iterparse(fp):\n+      if event == \'end\':\n+        if re.search(\'PeptideSequence\',elem.tag):\n+          peptides.append(elem.text)\n+    return peptides\n \n-def read_pepxml(fp):\n-  peptides = []\n-  for event, elem in ET.iterparse(fp):\n-    if event == \'end\':\n-      if re.search(\'search_hit\',elem.tag):\n-        peptides.append(elem.get(\'peptide\'))\n-  return peptides\n+  def read_pepxml(fp):\n+    peptides = []\n+    for event, elem in ET.iterparse(fp):\n+      if event == \'end\':\n+        if re.search(\'search_hit\',elem.tag):\n+          peptides.append(elem.get(\'peptide\'))\n+    return peptides\n \n-def __main__():\n+  def best_match(peptide,matches):\n+    if not matches:\n+      return None\n+    elif len(matches) == 1:\n+      return matches[0].copy()\n+    else:\n+      # find the most specific match (peptide is always the first column order field)\n+      for col in reversed(pept2lca_extra_column_order[1:]):\n+        col_id = col+"_id" if options.extra else col\n+        for match in matches:\n+          if \'taxon_rank\' in match and match[\'taxon_rank\'] == col:\n+            return match.copy()\n+          if col_id in match and match[col_id]:\n+            return match.copy()\n+    return None\n+\n   #Parse Command Line\n   parser = optparse.OptionParser()\n-  # unipept API\n-  parser.add_option( \'-A\', \'--api\', dest=\'unipept\', default=\'pept2lca\', choices=[\'pept2lca\',\'pept2taxa\',\'pept2prot\'], help=\'The unipept application: pept2lca, pept2taxa, or pep'..b'rint >> sys.stdout,"unipept response: %s\\n" % str(unipept_resp)\n+  if options.unipept == \'pept2prot\' or options.unipept == \'pept2taxa\':\n+    dupkey = \'uniprot_id\' if options.unipept == \'pept2prot\' else \'taxon_id\' ## should only keep one of these per input peptide\n+    ## multiple entries per trypticPeptide for pep2prot or pep2taxa\n+    mapping = {}\n+    for match in unipept_resp:\n+      mapping.setdefault(match[\'peptide\'],[]).append(match)\n+    for peptide in peptides:\n+      # Get the intersection of matches to the tryptic parts\n+      keyToMatch = None\n+      for part in pepToParts[peptide]:\n+        if part in mapping:\n+          temp = {match[dupkey] : match  for match in mapping[part]}\n+          if keyToMatch:\n+            dkeys = set(keyToMatch.keys()) - set(temp.keys())\n+            for k in dkeys:\n+              del keyToMatch[k]\n+          else:\n+            keyToMatch = temp\n+          ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp\n+      if not keyToMatch:\n+        unmatched_peptides.append(peptide)\n+      else:\n+        for key,match in keyToMatch.iteritems():\n+          match[\'tryptic_peptide\'] = match[\'peptide\']\n+          match[\'peptide\'] = peptide\n+          peptideMatches.append(match)\n+  else:\n+    ## should be one response per trypticPeptide for pep2lca\n+    respMap = {v[\'peptide\']:v for v in unipept_resp}\n+    ## map resp back to peptides\n+    for peptide in peptides:\n+      matches = list()\n+      for part in pepToParts[peptide]:\n+        if part in respMap:\n+          matches.append(respMap[part])\n+      match = best_match(peptide,matches)\n+      if not match:\n+        unmatched_peptides.append(peptide)\n+        longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]\n+        match = {\'peptide\' : longest_tryptic_peptide}\n+      match[\'tryptic_peptide\'] = match[\'peptide\']\n+      match[\'peptide\'] = peptide\n+      peptideMatches.append(match)\n+  resp = peptideMatches\n+  if options.debug: print >> sys.stdout,"\\nmapped response: %s\\n" % str(resp)\n   ## output results\n-  if not (options.mismatch or options.json or options.tsv or options.csv):\n+  if not (options.unmatched or options.json or options.tsv or options.csv):\n     print >> sys.stdout, str(resp)\n-  if options.mismatch:\n-    peptides_matched = []\n-    for i,pdict in enumerate(resp):\n-      peptides_matched.append(pdict[\'peptide\'])\n-    with open(options.mismatch,\'w\') as outputFile:\n+  if options.unmatched:\n+    with open(options.unmatched,\'w\') as outputFile:\n       for peptide in peptides:\n-        if not peptide in peptides_matched:\n+        if peptide in unmatched_peptides:\n           outputFile.write("%s\\n" % peptide)\n   if options.json:\n     with open(options.json,\'w\') as outputFile:\n       outputFile.write(str(resp))  \n   if options.tsv or options.csv:\n     # \'pept2lca\',\'pept2taxa\',\'pept2prot\'\n-    pept2lca_column_order = [ \'peptide\',\'superkingdom\',\'kingdom\',\'subkingdom\',\'superphylum\',\'phylum\',\'subphylum\',\'superclass\',\'class_\',\'subclass\',\'infraclass\',\'superorder\',\'order\',\'suborder\',\'infraorder\',\'parvorder\',\'superfamily\',\'family\',\'subfamily\',\'tribe\',\'subtribe\',\'genus\',\'subgenus\',\'species_group\',\'species_subgroup\',\'species\',\'subspecies\',\'varietas\',\'forma\' ]\n-    pept2prot_column_order = [ \'peptide\',\'uniprot_id\',\'taxon_id\',\'taxon_name\',\'ec_references\',\'go_references\',\'refseq_ids\',\'refseq_protein_ids\',\'insdc_ids\',\'insdc_protein_ids\']\n-    column_order = pept2prot_column_order if options.unipept == \'pept2prot\' else pept2lca_column_order\n     found_keys = set()\n     results = []\n     for i,pdict in enumerate(resp):\n@@ -179,7 +282,8 @@\n     taxa = []\n     for i,pdict in enumerate(results):\n       vals = [str(pdict[x]) if x in pdict and pdict[x] else \'\' for x in column_keys]\n-      taxa.append(vals)\n+      if vals not in taxa:\n+        taxa.append(vals)\n     if options.tsv:\n       with open(options.tsv,\'w\') as outputFile:\n         outputFile.write("#%s\\n"% \'\\t\'.join(column_names))\n'
b
diff -r 6430407e5869 -r 0c1ee95282fa unipept.xml
--- a/unipept.xml Fri Apr 03 14:55:49 2015 -0400
+++ b/unipept.xml Tue Apr 14 16:44:22 2015 -0400
b
b'@@ -1,8 +1,8 @@\n-<tool id="unipept" name="Unipept" version="0.1.0">\n+<tool id="unipept" name="Unipept" version="1.1.0">\n     <description>retrieve taxonomy for peptides</description>\n     <macros>\n         <xml name="equate_il">\n-            <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="false" label="Equate isoleucine and leucine">\n+            <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="true" label="Equate isoleucine and leucine">\n                 <help>isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records</help>\n             </param >\n         </xml>\n@@ -13,7 +13,10 @@\n         </xml>\n         <xml name="names">\n             <param name="names" type="boolean" truevalue="-n" falsevalue="" checked="true" label="names" >\n-                <help>return the names of taxons</help>\n+                <help>return the names in complete taxonomic lineage</help>\n+            </param >\n+            <param name="allfields" type="boolean" truevalue="-A" falsevalue="" checked="false" label="allfields" >\n+                <help>include fields for most specific taxonomic classification: taxon_rank,taxon_id,taxon_name before lineage</help>\n             </param >\n         </xml>\n     </macros>\n@@ -27,7 +30,7 @@\n       --api=$unipept.api\n       $unipept.equate_il $unipept.extra \n       #if $unipept.api != \'pept2prot\':\n-        $unipept.names \n+        $unipept.names $unipept.allfields\n       #end if\n       $strict\n       #if str($peptide_src.fmt) == \'proteomic\':\n@@ -58,29 +61,29 @@\n       #if \'csv\' in str($outputs).split(\',\'):\n         --csv $output_csv\n       #end if\n-      #if \'mismatch\' in str($outputs).split(\',\'):\n-        --mismatch $output_mismatch\n+      #if \'unmatched\' in str($outputs).split(\',\'):\n+        --unmatched $output_unmatched\n       #end if\n     ]]></command>\n     <inputs>\n       <conditional name="unipept">\n           <param name="api" type="select" label="Unipept application" >\n-              <option value="pept2taxa" selected="true">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>\n-              <option value="pept2lca">pept2lca: lowest common ancestor</option>\n+              <option value="pept2lca" selected="true">pept2lca: lowest common ancestor</option>\n+              <option value="pept2taxa">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>\n               <option value="pept2prot">pept2prot: UniProt entries containing a given tryptic peptide</option>\n           </param>\n+          <when value="pept2lca">\n+              <expand macro="equate_il" />\n+              <expand macro="extra">\n+                  <help>Return the complete lineage of the taxonomic lowest common ancestor, and include ID fields.</help>\n+              </expand>\n+              <expand macro="names" />\n+          </when>\n           <when value="pept2taxa">\n               <expand macro="equate_il" />\n               <expand macro="extra">\n                   <checked>true</checked>\n-                  <help>Return the complete lineage of each organism.</help>\n-              </expand>\n-              <expand macro="names" />\n-          </when>\n-          <when value="pept2lca">\n-              <expand macro="equate_il" />\n-              <expand macro="extra">\n-                  <help>Return the complete lineage of the taxonomic lowest common ancestor.</help>\n+                  <help>Return the complete lineage of each organism, and include ID fields.</help>\n               </expand>\n               <expand macro="names" />\n           </when>\n@@ -122,7 +125,7 @@\n         <option value="tsv" selected="true">tabular</option>\n         <option value="csv">Comma Separated Values (.csv)</option>\n         <option value="json">JSON</option>\n-        <option value="mismatch">Mismatches</option>\n+        <option value="unmatched">Unmatched peptides</option>\n       </par'..b'   <param name="input_fasta" value="peptide.fa"/>\n         <param name="equate_il" value="True"/>\n         <param name="extra" value="True"/>\n         <param name="names" value="True"/>\n-        <param name="outputs" value="json,mismatch"/>\n+        <param name="outputs" value="json,tsv"/>\n         <output name="output_json">\n             <assert_contents>\n-              <has_text text="AIPQLEVARPADAYETAEAYR" />\n+              <has_text text="VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKGSNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV" />\n+            </assert_contents>\n+        </output>\n+        <output name="output_tsv">\n+            <assert_contents>\n+              <has_text text="9606" />\n+              <has_text text="9598" />\n             </assert_contents>\n         </output>\n-        <output name="output_mismatch">\n+      </test>\n+      <test>\n+        <param name="api" value="pept2taxa"/>\n+        <param name="fmt" value="fasta"/>\n+        <param name="input_fasta" value="peptide.fa"/>\n+        <param name="equate_il" value="True"/>\n+        <param name="extra" value="False"/>\n+        <param name="names" value="False"/>\n+        <param name="outputs" value="tsv"/>\n+        <output name="output_tsv">\n             <assert_contents>\n-              <has_text text="DQIAHEGK" />\n+              <has_text text="sapiens" />\n+              <has_text text="troglodytes" />\n+              <has_text text="Gorilla" />\n+              <has_text text="Macaca" />\n             </assert_contents>\n         </output>\n       </test>\n@@ -182,8 +205,24 @@\n     **Unipept** \n \n     Retrieve Uniprot and taxanomic information for trypic peptides.\n+    \n+    Unipept API documentation - http://unipept.ugent.be/apidocs \n \n-    **pept2prot**\n+    **Input**\n+\n+    Input peptides can be retrieved from tabular, fasta, mzid, or pepxml datasets.  \n+ \n+    Processing deatils::\n+\n+        The input peptides are split into typtic peptide fragments in order to match the Unipept records.   \n+        Only fragments that are complete tryptic peptides between 5 and 50 animo acid in length will be matched by Unipept.\n+        The match to the most specific tryptic fragment is reported.\n+\n+\n+    **Unipept APIs**\n+\n+    **pept2prot**  - http://unipept.ugent.be/apidocs/pept2prot\n+\n     Returns the list of UniProt entries containing a given tryptic peptide. This is the same information as provided on the Protein matches tab when performing a search with the Tryptic Peptide Analysis in the web interface. \n \n     By default, each object contains the following information fields extracted from the UniProt record::\n@@ -202,9 +241,9 @@\n         insdc_ids: a space separated list of associated insdc accession numbers\n         insdc_protein_ids: a space separated list of associated insdc protein accession numbers\n \n-    http://unipept.ugent.be/apidocs/pept2prot\n \n-    **pept2taxa**\n+    **pept2taxa**  - http://unipept.ugent.be/apidocs/pept2taxa\n+\n     Returns the set of organisms associated with the UniProt entries containing a given tryptic peptide. This is the same information as provided on the Lineage table tab when performing a search with the Tryptic Peptide Analysis in the web interface.\n \n     By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::\n@@ -245,9 +284,9 @@\n         varietas_id\n         forma_id\n \n-    http://unipept.ugent.be/apidocs/pept2taxa\n \n-    **pept2lca** \n+    **pept2lca**  - http://unipept.ugent.be/apidocs/pept2lca\n+\n     Returns the taxonomic lowest common ancestor for a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.\n \n     By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::\n@@ -288,7 +327,6 @@\n         varietas_id\n         forma_id\n \n-    http://unipept.ugent.be/apidocs/pept2lca\n \n     **Attributions**\n \n'