diff tools/ncbi_blast_plus/ncbi_macros.xml @ 25:e25d3acf6e68 draft

v0.3.1 completed gzip support
author peterjc
date Tue, 23 Oct 2018 08:48:19 -0400
parents 31e517610e1f
children 2889433c7ae1
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/ncbi_macros.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@WRAPPER_VERSION@">0.3.0</token>
+    <token name="@WRAPPER_VERSION@">0.3.1</token>
     <xml name="parallelism">
         <!-- If job splitting is enabled, break up the query file into parts -->
         <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1" />
@@ -12,6 +12,14 @@
         <version_command>@BINARY@ -version</version_command>
     </xml>
 
+    <xml name="nucl_query"> <!-- Reusable -query input for nucleotide queries; accepts plain or gzip-compressed FASTA. -->
+        <param argument="-query" type="data" format="fasta,fasta.gz" label="Nucleotide query sequence(s)"/>
+    </xml>
+
+    <xml name="prot_query"> <!-- Reusable -query input for protein queries; accepts plain or gzip-compressed FASTA. -->
+        <param argument="-query" type="data" format="fasta,fasta.gz" label="Protein query sequence(s)"/>
+    </xml>
+
     <xml name="output_change_format">
         <change_format>
             <when input="output.out_format" value="0" format="txt"/>
@@ -443,7 +451,7 @@
     </xml>
 
     <xml name="input_max_hits">
-        <param name="max_hits" type="integer" min="0" value="0" label="Maximum hits to show" help="Use zero for default limits" />
+        <param name="max_hits" type="integer" min="0" value="0" label="Maximum hits to consider/show" help="Use zero for default limits. For HTML and plain text output this value is passed via -num_descriptions and -num_alignments, but for XML, tabular, etc. it is used with -max_target_seqs instead. In either case, in addition to limiting the final output, this alters internal limits during the search, which can in some cases exclude matches which would otherwise become the best hit." />
         <param argument="-max_hsps" type="integer" min="1" optional="true" value="" label="Maximum number of HSPs (alignments) to keep for any single query-subject pair" help="The HSPs shown will be the best as judged by expect value. If this option is not set, BLAST shows all HSPs meeting the expect value criteria" />
     </xml>
 
@@ -558,13 +566,21 @@
 
     <token name="@THREADS@">-num_threads "\${GALAXY_SLOTS:-8}"</token>
 
+    <!-- Emits the -query option: a fasta.gz dataset (whose ext is not plain fasta) is streamed through gunzip -c via shell process substitution; anything else is passed as a quoted path. --> <token name="@QUERY@"><![CDATA[
+#if $query.is_of_type('fasta.gz') and $query.ext != "fasta":
+  -query <(gunzip -c '${query}')
+#else:
+  -query '${query}'
+#end if
+    ]]></token>
+
     <token name="@BLAST_DB_SUBJECT@"><![CDATA[
 #if $db_opts.db_opts_selector == "db":
   -db '${" ".join(str($db_opts.database.fields.path).split(","))}'
 #elif $db_opts.db_opts_selector == "histdb":
   -db '${os.path.join($db_opts.histdb.extra_files_path, "blastdb")}'
 #else:
-    #if $db_opts.subject.is_of_type('fasta.gz'):
+    #if $db_opts.subject.is_of_type('fasta.gz') and $db_opts.subject.ext != "fasta":
         -subject <(gunzip -c '${$db_opts.subject}')
     #else:
         -subject '${db_opts.subject}'
@@ -589,15 +605,41 @@
     <token name="@ADV_FILTER_QUERY@">$adv_opts.filter_query</token>
     <token name="@ADV_MAX_HITS@"><![CDATA[
 ## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
-## Note -max_target_seqs used to simply override -num_descriptions and -num_alignments
-## but this was changed in BLAST+ 2.2.27 onwards to force their use (raised with NCBI)
+##
+## Quoting BLAST 2.7.1+ output from "blastp --help" or "blastn --help":
+##
+##  *** Formatting options
+##   -num_descriptions <Integer, >=0>
+##    Number of database sequences to show one-line descriptions for
+##    Not applicable for outfmt > 4
+##    Default = `500'
+##     * Incompatible with:  max_target_seqs
+##  -num_alignments <Integer, >=0>
+##    Number of database sequences to show alignments for
+##    Default = `250'
+##     * Incompatible with:  max_target_seqs
+##
+##  *** Restrict search or results
+##
+##  -max_target_seqs <Integer, >=1>
+##   Maximum number of aligned sequences to keep
+##   Not applicable for outfmt <= 4
+##   Default = `500'
+##    * Incompatible with:  num_descriptions, num_alignments
+##
+## So, taken at face value, we do still need to treat the Text and HTML output
+## differently from the Tabular and XML. Note that these limits may be applied
+## at different stages (during the search, or afterwards when writing output):
+## https://blastedbio.blogspot.com/2015/12/blast-max-target-sequences-bug.html
+##
+## See also our user-facing help text.
 #if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
     #if str($output.out_format) in ["6", "ext", "cols", "5"]:
         ## Most output formats use this, including tabular and XML:
         -max_target_seqs '${adv_opts.max_hits}'
     #else
         ## Text and HTML output formats 0-4 currently need this instead:
-        -num_descriptions $adv_opts.max_hits -num_alignments $adv_opts.max_hits
+        -num_descriptions '${adv_opts.max_hits}' -num_alignments '${adv_opts.max_hits}'
     #end if
 #end if
 #if str($adv_opts.max_hsps)