diff tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml @ 11:4c4a0da938ff draft

Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25. Supports $GALAXY_SLOTS. Includes more tests and heavy use of macros.
author peterjc
date Thu, 05 Dec 2013 06:55:59 -0500
parents 70e7dcbf6573
children 623f727cdff1
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml	Mon Sep 23 06:14:13 2013 -0400
+++ b/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml	Thu Dec 05 06:55:59 2013 -0500
@@ -1,156 +1,62 @@
-<tool id="ncbi_blastp_wrapper" name="NCBI BLAST+ blastp" version="0.0.20">
+<tool id="ncbi_blastp_wrapper" name="NCBI BLAST+ blastp" version="0.0.22">
     <description>Search protein database with protein query sequence(s)</description>
     <!-- If job splitting is enabled, break up the query file into parts -->
-    <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject,histdb" merge_outputs="output1"></parallelism>
-    <requirements>
-        <requirement type="binary">blastp</requirement>
-        <requirement type="package" version="2.2.26+">blast+</requirement>
-    </requirements>
-    <version_command>blastp -version</version_command>
+    <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1" />
+    <macros>
+        <token name="@BINARY@">blastp</token>
+        <import>ncbi_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
     <command>
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 blastp
 -query "$query"
-#if $db_opts.db_opts_selector == "db":
-  -db "${db_opts.database.fields.path}"
-#elif $db_opts.db_opts_selector == "histdb":
-  -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}"
-#else:
-  -subject "$db_opts.subject"
-#end if
+@BLAST_DB_SUBJECT@
 -task $blast_type
 -evalue $evalue_cutoff
--out "$output1"
-##Set the extended list here so if/when we add things, saved workflows are not affected
-#if str($out_format)=="ext":
-    -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"
-#else:
-    -outfmt $out_format
-#end if
--num_threads 8
+@BLAST_OUTPUT@
+@THREADS@
 #if $adv_opts.adv_opts_selector=="advanced":
-$adv_opts.filter_query
 -matrix $adv_opts.matrix
-## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
-## Note -max_target_seqs overrides -num_descriptions and -num_alignments
-#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
--max_target_seqs $adv_opts.max_hits
-#end if
-#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
--word_size $adv_opts.word_size
-#end if
+@ADVANCED_OPTIONS@
 ##Ungapped disabled for now - see comments below
 ##$adv_opts.ungapped
-$adv_opts.parse_deflines
 ## End of advanced options:
 #end if
     </command>
-    <stdio>
-        <!-- Anything other than zero is an error -->
-        <exit_code range="1:" />
-        <exit_code range=":-1" />
-        <!-- In case the return code has not been set propery check stderr too -->
-        <regex match="Error:" />
-        <regex match="Exception:" />
-    </stdio>
+
+    <expand macro="stdio" />
+
     <inputs>
         <param name="query" type="data" format="fasta" label="Protein query sequence(s)"/> 
-        <conditional name="db_opts">
-            <param name="db_opts_selector" type="select" label="Subject database/sequences">
-              <option value="db" selected="True">Locally installed BLAST database</option>
-              <option value="histdb">BLAST database from your history</option>
-              <option value="file">FASTA file from your history (see warning note below)</option>
-            </param>
-            <when value="db">
-                <param name="database" type="select" label="Protein BLAST database">
-                    <options from_file="blastdb_p.loc">
-                      <column name="value" index="0"/>
-                      <column name="name" index="1"/>
-                      <column name="path" index="2"/>
-                    </options>
-                </param>
-                <param name="histdb" type="hidden" value="" />
-                <param name="subject" type="hidden" value="" /> 
-            </when>
-            <when value="histdb">
-                <param name="database" type="hidden" value="" />
-                <param name="histdb" type="data" format="blastdbp" label="Protein BLAST database" />
-                <param name="subject" type="hidden" value="" />
-            </when>
-            <when value="file">
-                <param name="database" type="hidden" value="" /> 
-                <param name="histdb" type="hidden" value="" />
-                <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/> 
-            </when>
-        </conditional>
+
+        <expand macro="input_conditional_protein_db" />
+
         <param name="blast_type" type="select" display="radio" label="Type of BLAST">
             <option value="blastp">blastp</option>
             <option value="blastp-short">blastp-short</option>
         </param>
-        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />
-        <param name="out_format" type="select" label="Output format">
-            <option value="6">Tabular (standard 12 columns)</option>
-            <option value="ext" selected="True">Tabular (extended 24 columns)</option>
-            <option value="5">BLAST XML</option>
-            <option value="0">Pairwise text</option>
-            <option value="0 -html">Pairwise HTML</option>
-            <option value="2">Query-anchored text</option>
-            <option value="2 -html">Query-anchored HTML</option>
-            <option value="4">Flat query-anchored text</option>
-            <option value="4 -html">Flat query-anchored HTML</option>
+        <expand macro="input_evalue" />
+        <expand macro="input_out_format" />
+        <expand macro="advanced_options">
+            <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' -->
+            <expand macro="input_filter_query_default_false" />
+            <expand macro="input_scoring_matrix" />
+            <expand macro="input_max_hits" />
+            <expand macro="input_word_size" />
             <!--
-            <option value="-outfmt 11">BLAST archive format (ASN.1)</option>
+            Can't use '-ungapped' on its own, error back is:
+            Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search
+            Tried using '-ungapped -comp_based_stats F' and blastp crashed with 'Attempt to access NULL pointer.'
+            <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" />
             -->
-        </param>
-        <conditional name="adv_opts">
-            <param name="adv_opts_selector" type="select" label="Advanced Options">
-              <option value="basic" selected="True">Hide Advanced Options</option>
-              <option value="advanced">Show Advanced Options</option>
-            </param>
-            <when value="basic" />
-            <when value="advanced">
-                <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' -->
-                <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" />
-                <param name="matrix" type="select" label="Scoring matrix">
-                    <option value="BLOSUM90">BLOSUM90</option>
-                    <option value="BLOSUM80">BLOSUM80</option>
-                    <option value="BLOSUM62" selected="true">BLOSUM62 (default)</option>
-                    <option value="BLOSUM50">BLOSUM50</option> 
-                    <option value="BLOSUM45">BLOSUM45</option>
-                    <option value="PAM250">PAM250</option>
-                    <option value="PAM70">PAM70</option>
-                    <option value="PAM30">PAM30</option>
-                </param>
-                <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer -->
-                <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits">
-                    <validator type="in_range" min="0" />
-                </param>
-                <!-- I'd like word_size to be optional, with minimum 2 for blastp -->
-                <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2.">
-                    <validator type="in_range" min="0" />
-                </param>
-                <!--
-                Can't use '-ungapped' on its own, error back is:
-                Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search
-                Tried using '-ungapped -comp_based_stats F' and blastp crashed with 'Attempt to access NULL pointer.'
-                <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" />
-                -->
-                <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/>
-            </when>
-        </conditional>
+            <expand macro="input_parse_deflines" />
+        </expand>
     </inputs>
     <outputs>
-        <data name="output1" format="tabular" label="${blast_type.value_label} on ${on_string}">
-            <change_format>
-                <when input="out_format" value="0" format="txt"/>
-                <when input="out_format" value="0 -html" format="html"/>
-                <when input="out_format" value="2" format="txt"/>
-                <when input="out_format" value="2 -html" format="html"/>
-                <when input="out_format" value="4" format="txt"/>
-                <when input="out_format" value="4 -html" format="html"/>
-                <when input="out_format" value="5" format="blastxml"/>
-            </change_format>
+        <data name="output1" format="tabular" label="${blast_type.value_label} $query.name vs @ON_DB_SUBJECT@">
+            <expand macro="output_change_format" />
         </data>
     </outputs>
     <tests>
@@ -216,83 +122,18 @@
     </tests>
     <help>
     
-.. class:: warningmark
-
-**Note**. Database searches may take a substantial amount of time.
-For large input datasets it is advisable to allow overnight processing.  
-
------
+@SEARCH_TIME_WARNING@
 
 **What it does**
 
 Search a *protein database* using a *protein query*,
 using the NCBI BLAST+ blastp command line tool.
 
-.. class:: warningmark
-
-You can also search against a FASTA file of subject protein
-sequences. This is *not* advised because it is slower (only one
-CPU is used), but more importantly gives e-values for pairwise
-searches (very small e-values which will look overly signficiant).
-In most cases you should instead turn the other FASTA file into a
-database first using *makeblastdb* and search against that.
+@FASTA_WARNING@
 
 -----
 
-**Output format**
-
-Because Galaxy focuses on processing tabular data, the default output of this
-tool is tabular. The standard BLAST+ tabular output contains 12 columns:
-
-====== ========= ============================================
-Column NCBI name Description
------- --------- --------------------------------------------
-     1 qseqid    Query Seq-id (ID of your sequence)
-     2 sseqid    Subject Seq-id (ID of the database hit)
-     3 pident    Percentage of identical matches
-     4 length    Alignment length
-     5 mismatch  Number of mismatches
-     6 gapopen   Number of gap openings
-     7 qstart    Start of alignment in query
-     8 qend      End of alignment in query
-     9 sstart    Start of alignment in subject (database hit)
-    10 send      End of alignment in subject (database hit)
-    11 evalue    Expectation value (E-value)
-    12 bitscore  Bit score
-====== ========= ============================================
-
-The BLAST+ tools can optionally output additional columns of information,
-but this takes longer to calculate. Most (but not all) of these columns are
-included by selecting the extended tabular output. The extra columns are
-included *after* the standard 12 columns. This is so that you can write
-workflow filtering steps that accept either the 12 or 24 column tabular
-BLAST output. Galaxy now uses this extended 24 column output by default.
-
-====== ============= ===========================================
-Column NCBI name     Description
------- ------------- -------------------------------------------
-    13 sallseqid     All subject Seq-id(s), separated by a ';'
-    14 score         Raw score
-    15 nident        Number of identical matches
-    16 positive      Number of positive-scoring matches
-    17 gaps          Total number of gaps
-    18 ppos          Percentage of positive-scoring matches
-    19 qframe        Query frame
-    20 sframe        Subject frame
-    21 qseq          Aligned part of query sequence
-    22 sseq          Aligned part of subject sequence
-    23 qlen          Query sequence length
-    24 slen          Subject sequence length
-====== ============= ===========================================
-
-The third option is BLAST XML output, which is designed to be parsed by
-another program, and is understood by some Galaxy tools.
-
-You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
-The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
-The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
-The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
-and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+@OUTPUT_FORMAT@
 
 -------
 
@@ -301,17 +142,6 @@
 If you use this Galaxy tool in work leading to a scientific publication please
 cite the following papers:
 
-Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
-Galaxy tools and workflows for sequence analysis with applications
-in molecular plant pathology. PeerJ 1:e167
-http://dx.doi.org/10.7717/peerj.167
-
-Christiam Camacho et al. (2009).
-BLAST+: architecture and applications.
-BMC Bioinformatics. 15;10:421.
-http://dx.doi.org/10.1186/1471-2105-10-421
-
-This wrapper is available to install into other Galaxy Instances via the Galaxy
-Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus
+@REFERENCES@
     </help>
 </tool>