changeset 1:0c1ee95282fa draft

Uploaded
author galaxyp
date Tue, 14 Apr 2015 16:44:22 -0400
parents 6430407e5869
children 503ab8a39006
files .shed.yml test-data/peptide.fa test-data/tryptic.fa test-data/tryptic.tsv unipept.py unipept.xml
diffstat 6 files changed, 268 insertions(+), 108 deletions(-) [+]
line wrap: on
line diff
--- a/.shed.yml	Fri Apr 03 14:55:49 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-categories: 
- - Proteomics
- - Metaproteomics
-description: Unipept retrieves metaproteomics information
-homepage_url: https://github.com/galaxyproteomics/tools-galaxyp
-long_description: 'Unipept retrieves taxonomy for tryptic peptides using the unipept API. 
-  http://unipept.ugent.be
-  http://unipept.ugent.be/apidocs 
-
-  The Unipept metaproteomics analysis pipeline
-  Bart Mesuere1,*, Griet Debyser2, Maarten Aerts3, Bart Devreese2, Peter Vandamme3 andPeter Dawyndt1
-  Article first published online: 11 FEB 2015
-  DOI: 10.1002/pmic.201400361
-  http://onlinelibrary.wiley.com/doi/10.1002/pmic.201400361/abstract;jsessionid=BFF1994E4C14DA73D7C907EB208AD710.f04t04
-  '
-name: unipept
-owner: galaxyp
-remote_repository_url: http://unipept.ugent.be/apidocs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/peptide.fa	Tue Apr 14 16:44:22 2015 -0400
@@ -0,0 +1,9 @@
+>tr|G3RWV1|G3RWV1_GORGO
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV
+>sp|Q9BYE9|CDHR2_HUMAN
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPRIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAVGSASVQVLVRVSALVDYERQTAMAV
+>tr|H2QS28|H2QS28_PANTR
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPRIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSGLVDYERQTAMAV
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tryptic.fa	Tue Apr 14 16:44:22 2015 -0400
@@ -0,0 +1,19 @@
+>trypticQTAMAV
+QTAMAV
+>trypticAAGSASVQVLVR
+AAGSASVQVLVR
+>trypticAVGSASVQVLVR
+AVGSASVQVLVR
+>trypticIPIDDLTMVVYDPDK
+IPIDDLTMVVYDPDK
+>trypticVSGLVDYER
+VSGLVDYER
+>trypticVSALVDYER
+VSALVDYER
+>trypticGSNGTFLLSLGGPDAEAFSVSPER
+GSNGTFLLSLGGPDAEAFSVSPER
+>trypticVMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tryptic.tsv	Tue Apr 14 16:44:22 2015 -0400
@@ -0,0 +1,8 @@
+1	QTAMAV	QTAMAV
+2	AAGSASVQVLVR	AAGSASJQVLVR
+3	AVGSASVQVLVR	AVGSASVQVLVR
+4	IPIDDLTMVVYDPDK	IPIDDLTMVVYDPDK
+5	GSNGTFLLSLGGPDAEAFSVSPER	GSNGTFLLSLGGPDAEAFSVSPE
+6	VSGLVDYER	VSGLVDYER
+7	VSALVDYER	VSALVDYER
+8	VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR	VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
--- a/unipept.py	Fri Apr 03 14:55:49 2015 -0400
+++ b/unipept.py	Tue Apr 14 16:44:22 2015 -0400
@@ -31,39 +31,86 @@
     if exit_code:
       sys.exit(exit_code)
 
-def read_fasta(fp):
+pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
+pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
+pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
+pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
+pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
+
+def __main__():
+  version = '1.1'
+  pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
+
+  def read_tabular(filepath,col):
+    peptides = []
+    with open(filepath) as fp:
+      for i,line in enumerate(fp):
+        if line.strip() == '' or line.startswith('#'):
+          continue
+        fields = line.rstrip('\n').split('\t')
+        peptide = fields[col]
+        if not re.match(pep_pat,peptide):
+          warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
+        peptides.append(peptide)
+    return peptides
+
+  def get_fasta_entries(fp):
     name, seq = None, []
     for line in fp:
-        line = line.rstrip()
-        if line.startswith(">"):
-            if name: yield (name, ''.join(seq))
-            name, seq = line, []
-        else:
-            seq.append(line)
+      line = line.rstrip()
+      if line.startswith(">"):
+        if name: yield (name, ''.join(seq))
+        name, seq = line, []
+      else:
+        seq.append(line)
     if name: yield (name, ''.join(seq))
 
-def read_mzid(fp):
-  peptides = []
-  for event, elem in ET.iterparse(fp):
-    if event == 'end':
-      if re.search('PeptideSequence',elem.tag):
-        peptides.append(elem.text)
-  return peptides
+  def read_fasta(filepath):
+    peptides = []
+    with open(filepath) as fp:
+      for id, peptide in get_fasta_entries(fp):
+        if not re.match(pep_pat,peptide):
+          warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
+        peptides.append(peptide)
+    return peptides
+
+  def read_mzid(fp):
+    peptides = []
+    for event, elem in ET.iterparse(fp):
+      if event == 'end':
+        if re.search('PeptideSequence',elem.tag):
+          peptides.append(elem.text)
+    return peptides
 
-def read_pepxml(fp):
-  peptides = []
-  for event, elem in ET.iterparse(fp):
-    if event == 'end':
-      if re.search('search_hit',elem.tag):
-        peptides.append(elem.get('peptide'))
-  return peptides
+  def read_pepxml(fp):
+    peptides = []
+    for event, elem in ET.iterparse(fp):
+      if event == 'end':
+        if re.search('search_hit',elem.tag):
+          peptides.append(elem.get('peptide'))
+    return peptides
 
-def __main__():
+  def best_match(peptide,matches):
+    if not matches:
+      return None
+    elif len(matches) == 1:
+      return matches[0].copy()
+    else:
+      # find the most specific match (peptide is always the first column order field)
+      for col in reversed(pept2lca_extra_column_order[1:]):
+        col_id = col+"_id" if options.extra else col
+        for match in matches:
+          if 'taxon_rank' in match and match['taxon_rank'] == col:
+            return match.copy()
+          if col_id in match and match[col_id]:
+            return match.copy()
+    return None
+
   #Parse Command Line
   parser = optparse.OptionParser()
-  # unipept API
-  parser.add_option( '-A', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
-  # files
+  # unipept API choice
+  parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
+  # input files
   parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
   parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
   parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
@@ -73,38 +120,33 @@
   parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
   parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
   parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
+  # output fields
+  parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
   # Warn vs Error Flag
   parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
-  # outputs
+  # output files
   parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
   parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
   parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
-  parser.add_option( '-M', '--mismatch', dest='mismatch', default=None, help='Output file path for peptide with no matches' )
+  parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
+  # debug
+  parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
+  parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
   (options, args) = parser.parse_args()
+  if options.version:
+    print >> sys.stdout,"%s" % version
+    sys.exit(0)
   invalid_ec = 2 if options.strict else None
   peptides = []
-  pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
   ## Get peptide sequences
   if options.mzid:
     peptides += read_mzid(options.mzid)
   if options.pepxml:
     peptides += read_pepxml(options.pepxml)
   if options.tabular:
-    with open(options.tabular) as fp:
-      for i,line in enumerate(fp):
-        if line.strip() == '' or line.startswith('#'):
-          continue
-        fields = line.rstrip('\n').split('\t')
-        peptide = fields[options.column]
-        if not re.match(pep_pat,peptide):
-          warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,options.column,options.tabular),exit_code=invalid_ec)
-        peptides.append(peptide) 
+    peptides += read_tabular(options.tabular,options.column) 
   if options.fasta:
-    with open(options.fasta) as fp:
-      for id, peptide in read_fasta(fp):
-        if not re.match(pep_pat,peptide):
-          warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,options.fasta),exit_code=invalid_ec)
-        peptides.append(peptide) 
+    peptides += read_fasta(options.fasta) 
   if args and len(args) > 0:
     for i,peptide in enumerate(args):
       if not re.match(pep_pat,peptide):
@@ -112,6 +154,25 @@
       peptides.append(peptide) 
   if len(peptides) < 1:
     warn_err("No peptides input!",exit_code=1)
+  column_order = pept2lca_column_order
+  if options.unipept == 'pept2prot':
+    column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
+  else:
+    if options.extra or options.names:
+      column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
+    else:
+      column_order = pept2lca_column_order
+  ## map to tryptic peptides
+  pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
+  partToPeps = {}
+  for peptide, parts in pepToParts.iteritems():
+    if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
+    for part in parts:
+      if len(part) > 50:
+        warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
+      if 5 <= len(part) <= 50:
+        partToPeps.setdefault(part,[]).append(peptide)
+  trypticPeptides = partToPeps.keys()
   ## unipept
   post_data = []
   if options.equate_il:
@@ -121,30 +182,72 @@
     post_data.append(("names","true"))
   elif options.extra:
     post_data.append(("extra","true"))
-  post_data += [('input[]', x) for x in peptides]
+  post_data += [('input[]', x) for x in trypticPeptides]
   headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
   url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
   req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
-  resp = json.loads( urllib2.urlopen( req ).read() )
+  unipept_resp = json.loads( urllib2.urlopen( req ).read() )
+  unmatched_peptides = []
+  peptideMatches = []
+  if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
+  if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
+    dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
+    ## multiple entries per trypticPeptide for pept2prot or pept2taxa
+    mapping = {}
+    for match in unipept_resp:
+      mapping.setdefault(match['peptide'],[]).append(match)
+    for peptide in peptides:
+      # Get the intersection of matches to the tryptic parts
+      keyToMatch = None
+      for part in pepToParts[peptide]:
+        if part in mapping:
+          temp = {match[dupkey] : match  for match in mapping[part]}
+          if keyToMatch:
+            dkeys = set(keyToMatch.keys()) - set(temp.keys())
+            for k in dkeys:
+              del keyToMatch[k]
+          else:
+            keyToMatch = temp
+          ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
+      if not keyToMatch:
+        unmatched_peptides.append(peptide)
+      else:
+        for key,match in keyToMatch.iteritems():
+          match['tryptic_peptide'] = match['peptide']
+          match['peptide'] = peptide
+          peptideMatches.append(match)
+  else:
+    ## should be one response per trypticPeptide for pept2lca
+    respMap = {v['peptide']:v for v in unipept_resp}
+    ## map resp back to peptides
+    for peptide in peptides:
+      matches = list()
+      for part in pepToParts[peptide]:
+        if part in respMap:
+          matches.append(respMap[part])
+      match = best_match(peptide,matches)
+      if not match:
+        unmatched_peptides.append(peptide)
+        longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
+        match = {'peptide' : longest_tryptic_peptide}
+      match['tryptic_peptide'] = match['peptide']
+      match['peptide'] = peptide
+      peptideMatches.append(match)
+  resp = peptideMatches
+  if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
   ## output results
-  if not (options.mismatch or options.json or options.tsv or options.csv):
+  if not (options.unmatched or options.json or options.tsv or options.csv):
     print >> sys.stdout, str(resp)
-  if options.mismatch:
-    peptides_matched = []
-    for i,pdict in enumerate(resp):
-      peptides_matched.append(pdict['peptide'])
-    with open(options.mismatch,'w') as outputFile:
+  if options.unmatched:
+    with open(options.unmatched,'w') as outputFile:
       for peptide in peptides:
-        if not peptide in peptides_matched:
+        if peptide in unmatched_peptides:
           outputFile.write("%s\n" % peptide)
   if options.json:
     with open(options.json,'w') as outputFile:
       outputFile.write(str(resp))  
   if options.tsv or options.csv:
     # 'pept2lca','pept2taxa','pept2prot'
-    pept2lca_column_order = [ 'peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class_','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
-    pept2prot_column_order = [ 'peptide','uniprot_id','taxon_id','taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
-    column_order = pept2prot_column_order if options.unipept == 'pept2prot' else pept2lca_column_order
     found_keys = set()
     results = []
     for i,pdict in enumerate(resp):
@@ -179,7 +282,8 @@
     taxa = []
     for i,pdict in enumerate(results):
       vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
-      taxa.append(vals)
+      if vals not in taxa:
+        taxa.append(vals)
     if options.tsv:
       with open(options.tsv,'w') as outputFile:
         outputFile.write("#%s\n"% '\t'.join(column_names))
--- a/unipept.xml	Fri Apr 03 14:55:49 2015 -0400
+++ b/unipept.xml	Tue Apr 14 16:44:22 2015 -0400
@@ -1,8 +1,8 @@
-<tool id="unipept" name="Unipept" version="0.1.0">
+<tool id="unipept" name="Unipept" version="1.1.0">
     <description>retrieve taxonomy for peptides</description>
     <macros>
         <xml name="equate_il">
-            <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="false" label="Equate isoleucine and leucine">
+            <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="true" label="Equate isoleucine and leucine">
                 <help>isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records</help>
             </param >
         </xml>
@@ -13,7 +13,10 @@
         </xml>
         <xml name="names">
             <param name="names" type="boolean" truevalue="-n" falsevalue="" checked="true" label="names" >
-                <help>return the names of taxons</help>
+                <help>return the names in complete taxonomic lineage</help>
+            </param >
+            <param name="allfields" type="boolean" truevalue="-A" falsevalue="" checked="false" label="allfields" >
+                <help>include fields for most specific taxonomic classification: taxon_rank,taxon_id,taxon_name before lineage</help>
             </param >
         </xml>
     </macros>
@@ -27,7 +30,7 @@
       --api=$unipept.api
       $unipept.equate_il $unipept.extra 
       #if $unipept.api != 'pept2prot':
-        $unipept.names 
+        $unipept.names $unipept.allfields
       #end if
       $strict
       #if str($peptide_src.fmt) == 'proteomic':
@@ -58,29 +61,29 @@
       #if 'csv' in str($outputs).split(','):
         --csv $output_csv
       #end if
-      #if 'mismatch' in str($outputs).split(','):
-        --mismatch $output_mismatch
+      #if 'unmatched' in str($outputs).split(','):
+        --unmatched $output_unmatched
       #end if
     ]]></command>
     <inputs>
       <conditional name="unipept">
           <param name="api" type="select" label="Unipept application" >
-              <option value="pept2taxa" selected="true">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>
-              <option value="pept2lca">pept2lca: lowest common ancestor</option>
+              <option value="pept2lca" selected="true">pept2lca: lowest common ancestor</option>
+              <option value="pept2taxa">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>
               <option value="pept2prot">pept2prot: UniProt entries containing a given tryptic peptide</option>
           </param>
+          <when value="pept2lca">
+              <expand macro="equate_il" />
+              <expand macro="extra">
+                  <help>Return the complete lineage of the taxonomic lowest common ancestor, and include ID fields.</help>
+              </expand>
+              <expand macro="names" />
+          </when>
           <when value="pept2taxa">
               <expand macro="equate_il" />
               <expand macro="extra">
                   <checked>true</checked>
-                  <help>Return the complete lineage of each organism.</help>
-              </expand>
-              <expand macro="names" />
-          </when>
-          <when value="pept2lca">
-              <expand macro="equate_il" />
-              <expand macro="extra">
-                  <help>Return the complete lineage of the taxonomic lowest common ancestor.</help>
+                  <help>Return the complete lineage of each organism, and include ID fields.</help>
               </expand>
               <expand macro="names" />
           </when>
@@ -122,7 +125,7 @@
         <option value="tsv" selected="true">tabular</option>
         <option value="csv">Comma Separated Values (.csv)</option>
         <option value="json">JSON</option>
-        <option value="mismatch">Mismatches</option>
+        <option value="unmatched">Unmatched peptides</option>
       </param>
       <param name="strict" type="boolean" truevalue="--strict" falsevalue="" checked="false" label="Exit with error on invalid peptides, otherwise ignore them"/>
     </inputs>
@@ -136,44 +139,64 @@
       <data name="output_csv" format="csv" label="${tool.name} ${unipept.api} on ${on_string} csv"> 
         <filter>'csv' in outputs</filter>
       </data> 
-      <data name="output_mismatch" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} mismatch"> 
-        <filter>'mismatch' in outputs</filter>
+      <data name="output_unmatched" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} unmatched"> 
+        <filter>'unmatched' in outputs</filter>
       </data> 
     </outputs>
     <tests>
       <test>
+        <param name="api" value="pept2lca"/>
         <param name="fmt" value="tabular"/>
-        <param name="input_tsv" value="input.tsv"/>
+        <param name="input_tsv" value="tryptic.tsv"/>
         <param name="column" value="2"/>
         <param name="extra" value="True"/>
         <param name="names" value="True"/>
-        <param name="outputs" value="tsv,mismatch"/>
+        <param name="outputs" value="tsv,unmatched"/>
         <output name="output_tsv">
             <assert_contents>
-              <has_text text="AIPQLEVARPADAYETAEAYR" />
+              <has_text text="Homininae" />
             </assert_contents>
         </output>
-        <output name="output_mismatch">
+        <output name="output_unmatched">
             <assert_contents>
-              <has_text text="DQIAHEGK" />
+              <has_text text="QTAMAV" />
             </assert_contents>
         </output>
       </test>
       <test>
+        <param name="api" value="pept2lca"/>
         <param name="fmt" value="fasta"/>
-        <param name="input_tsv" value="input.fasta"/>
+        <param name="input_fasta" value="peptide.fa"/>
         <param name="equate_il" value="True"/>
         <param name="extra" value="True"/>
         <param name="names" value="True"/>
-        <param name="outputs" value="json,mismatch"/>
+        <param name="outputs" value="json,tsv"/>
         <output name="output_json">
             <assert_contents>
-              <has_text text="AIPQLEVARPADAYETAEAYR" />
+              <has_text text="VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKGSNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV" />
+            </assert_contents>
+        </output>
+        <output name="output_tsv">
+            <assert_contents>
+              <has_text text="9606" />
+              <has_text text="9598" />
             </assert_contents>
         </output>
-        <output name="output_mismatch">
+      </test>
+      <test>
+        <param name="api" value="pept2taxa"/>
+        <param name="fmt" value="fasta"/>
+        <param name="input_fasta" value="peptide.fa"/>
+        <param name="equate_il" value="True"/>
+        <param name="extra" value="False"/>
+        <param name="names" value="False"/>
+        <param name="outputs" value="tsv"/>
+        <output name="output_tsv">
             <assert_contents>
-              <has_text text="DQIAHEGK" />
+              <has_text text="sapiens" />
+              <has_text text="troglodytes" />
+              <has_text text="Gorilla" />
+              <has_text text="Macaca" />
             </assert_contents>
         </output>
       </test>
@@ -182,8 +205,24 @@
     **Unipept** 
 
     Retrieve Uniprot and taxanomic information for trypic peptides.
+    
+    Unipept API documentation - http://unipept.ugent.be/apidocs 
 
-    **pept2prot**
+    **Input**
+
+    Input peptides can be retrieved from tabular, fasta, mzid, or pepxml datasets.  
+ 
+    Processing details::
+
+        The input peptides are split into tryptic peptide fragments in order to match the Unipept records.   
+        Only fragments that are complete tryptic peptides between 5 and 50 amino acids in length will be matched by Unipept.
+        The match to the most specific tryptic fragment is reported.
+
+
+    **Unipept APIs**
+
+    **pept2prot**  - http://unipept.ugent.be/apidocs/pept2prot
+
     Returns the list of UniProt entries containing a given tryptic peptide. This is the same information as provided on the Protein matches tab when performing a search with the Tryptic Peptide Analysis in the web interface. 
 
     By default, each object contains the following information fields extracted from the UniProt record::
@@ -202,9 +241,9 @@
         insdc_ids: a space separated list of associated insdc accession numbers
         insdc_protein_ids: a space separated list of associated insdc protein accession numbers
 
-    http://unipept.ugent.be/apidocs/pept2prot
 
-    **pept2taxa**
+    **pept2taxa**  - http://unipept.ugent.be/apidocs/pept2taxa
+
     Returns the set of organisms associated with the UniProt entries containing a given tryptic peptide. This is the same information as provided on the Lineage table tab when performing a search with the Tryptic Peptide Analysis in the web interface.
 
     By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
@@ -245,9 +284,9 @@
         varietas_id
         forma_id
 
-    http://unipept.ugent.be/apidocs/pept2taxa
 
-    **pept2lca** 
+    **pept2lca**  - http://unipept.ugent.be/apidocs/pept2lca
+
     Returns the taxonomic lowest common ancestor for a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
 
     By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
@@ -288,7 +327,6 @@
         varietas_id
         forma_id
 
-    http://unipept.ugent.be/apidocs/pept2lca
 
     **Attributions**