changeset 25:e25d3acf6e68 draft

v0.3.1 completed gzip support
author peterjc
date Tue, 23 Oct 2018 08:48:19 -0400
parents c877294f8025
children 2889433c7ae1
files tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/blastxml_to_tabular.py tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml tools/ncbi_blast_plus/ncbi_macros.xml tools/ncbi_blast_plus/ncbi_makeblastdb.xml tools/ncbi_blast_plus/ncbi_makeprofiledb.xml tools/ncbi_blast_plus/ncbi_rpsblast_wrapper.xml tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml tools/ncbi_blast_plus/tool_dependencies.xml
diffstat 13 files changed, 75 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/README.rst	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/README.rst	Tue Oct 23 08:48:19 2018 -0400
@@ -213,7 +213,7 @@
           setup via ``tool-data/tool_data_table_conf.xml.sample``
         - Replace ``.extra_files_path`` with ``.files_path`` (internal change,
           thanks to Bjoern Gruening and John Chilton).
-        - Added "NCBI BLAST+ integrated into Galaxy" preprint citation.
+        - Added *"NCBI BLAST+ integrated into Galaxy"* preprint citation.
 v0.1.03 - Reorder XML elements (internal change only).
         - Planemo for Tool Shed upload (``.shed.yml``, internal change only).
 v0.1.04 - Fixed regression using BLAST databases from the history. Currently
@@ -252,13 +252,18 @@
         - Support for per-matrix recommended gaps settings (``-gapopen`` and
           ``-gapextend``, contribution from Caleb Easterly and Jim Johnson).
         - Support for ``-window_size``, ``-threshold``, ``-comp_based_stats``
-          and revising ``-word_size`` to avoid using zero to mean  default
+          and revising ``-word_size`` to avoid using zero to mean default
           (contribution from Caleb Easterly).
 v0.3.0  - Updated for NCBI BLAST+ 2.7.1,
         - Depends on BioConda or legacy ToolShed ``package_blast_plus_2_7_1``.
         - Document the BLAST+ 2.6.0 change in the standard 12 column output
           from ``qacc,sacc,...`` to ``qaccver,saccver,...`` instead.
-        - Accept gzipped FASTA inputs (contribution from Anton Nekrutenko).
+        - Accept gzipped FASTA inputs for subject files, queries to ``blastn``
+          and input to ``makeblastdb`` (contribution from Anton Nekrutenko).
+v0.3.1  - Clarify help text for max hits option, which was confusing because
+          it maps to different command line arguments per output format.
+        - Extend gzipped query support to all the command line tools.
+        - Workaround for gzipped support under Galaxy release 16.01 or older.
 ======= ======================================================================
 
 
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py	Tue Oct 23 08:48:19 2018 -0400
@@ -168,6 +168,7 @@
 
 
 def convert(blastxml_filename, output_handle):
+    """Convert BLAST XML input from a file to tabular on given handle."""
     blast_program = None
     # get an iterable
     try:
--- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -11,11 +11,7 @@
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 blastn
-#if $query.is_of_type('fasta.gz'):
--query <(gunzip -c '${query}')
-#else:
--query '${query}'
-#end if
+@QUERY@
 @BLAST_DB_SUBJECT@
 -task '${blast_type}'
 -evalue '${evalue_cutoff}'
@@ -43,7 +39,7 @@
 ]]>
     </command>
     <inputs>
-        <param argument="-query" type="data" format="fasta,fasta.gz" label="Nucleotide query sequence(s)"/>
+        <expand macro="nucl_query" />
         <expand macro="input_conditional_nucleotide_db" />
         <param name="blast_type" argument="-task" type="select" display="radio" label="Type of BLAST">
             <option value="megablast">megablast - Traditional megablast used to find very similar (e.g., intraspecies or closely related species) sequences</option>
--- a/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -10,7 +10,7 @@
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 blastp
--query '$query'
+@QUERY@
 @BLAST_DB_SUBJECT@
 -task $blast_type
 -evalue $evalue_cutoff
@@ -33,7 +33,7 @@
 #end if
     </command>
     <inputs>
-        <param argument="-query" type="data" format="fasta" label="Protein query sequence(s)"/>
+        <expand macro="prot_query" />
 
         <expand macro="input_conditional_protein_db" />
 
--- a/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -10,7 +10,7 @@
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 blastx
--query '$query'
+@QUERY@
 @BLAST_DB_SUBJECT@
 -query_gencode $query_gencode
 -task $blast_type
@@ -33,7 +33,7 @@
 #end if
     </command>
     <inputs>
-        <param argument="-query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
+        <expand macro="nucl_query" />
 
         <expand macro="input_conditional_protein_db" />
         <expand macro="input_query_gencode" />
--- a/tools/ncbi_blast_plus/ncbi_macros.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@WRAPPER_VERSION@">0.3.0</token>
+    <token name="@WRAPPER_VERSION@">0.3.1</token>
     <xml name="parallelism">
         <!-- If job splitting is enabled, break up the query file into parts -->
         <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1" />
@@ -12,6 +12,14 @@
         <version_command>@BINARY@ -version</version_command>
     </xml>
 
+    <xml name="nucl_query">
+        <param argument="-query" type="data" format="fasta,fasta.gz" label="Nucleotide query sequence(s)"/>
+    </xml>
+
+    <xml name="prot_query">
+        <param argument="-query" type="data" format="fasta,fasta.gz" label="Protein query sequence(s)"/>
+    </xml>
+
     <xml name="output_change_format">
         <change_format>
             <when input="output.out_format" value="0" format="txt"/>
@@ -443,7 +451,7 @@
     </xml>
 
     <xml name="input_max_hits">
-        <param name="max_hits" type="integer" min="0" value="0" label="Maximum hits to show" help="Use zero for default limits" />
+        <param name="max_hits" type="integer" min="0" value="0" label="Maximum hits to consider/show" help="Use zero for default limits. For HTML and plain text output this value is passed -num_descriptions and -num_alignments but for XML and tabular etc, this is used with -max_target_seqs instead. In either case, in addition to limiting the final output, this alters internal limits during the search, which can in some cases exclude matches which would otherwise become the best hit." />
         <param argument="-max_hsps" type="integer" min="1" optional="true" value="" label="Maximum number of HSPs (alignments) to keep for any single query-subject pair" help="The HSPs shown will be the best as judged by expect value. If this option is not set, BLAST shows all HSPs meeting the expect value criteria" />
     </xml>
 
@@ -558,13 +566,21 @@
 
     <token name="@THREADS@">-num_threads "\${GALAXY_SLOTS:-8}"</token>
 
+    <token name="@QUERY@"><![CDATA[
+#if $query.is_of_type('fasta.gz') and $query.ext != "fasta":
+  -query <(gunzip -c '${query}')
+#else:
+  -query '${query}'
+#end if
+    ]]></token>
+
     <token name="@BLAST_DB_SUBJECT@"><![CDATA[
 #if $db_opts.db_opts_selector == "db":
   -db '${" ".join(str($db_opts.database.fields.path).split(","))}'
 #elif $db_opts.db_opts_selector == "histdb":
   -db '${os.path.join($db_opts.histdb.extra_files_path, "blastdb")}'
 #else:
-    #if $db_opts.subject.is_of_type('fasta.gz'):
+    #if $db_opts.subject.is_of_type('fasta.gz') and $db_opts.subject.ext != "fasta":
         -subject <(gunzip -c '${$db_opts.subject}')
     #else:
         -subject '${db_opts.subject}'
@@ -589,15 +605,41 @@
     <token name="@ADV_FILTER_QUERY@">$adv_opts.filter_query</token>
     <token name="@ADV_MAX_HITS@"><![CDATA[
 ## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
-## Note -max_target_seqs used to simply override -num_descriptions and -num_alignments
-## but this was changed in BLAST+ 2.2.27 onwards to force their use (raised with NCBI)
+##
+## Quoting BLAST 2.7.1+ output from "blastp --help" or "blastn --help":
+##
+##  *** Formatting options
+##   -num_descriptions <Integer, >=0>
+##    Number of database sequences to show one-line descriptions for
+##    Not applicable for outfmt > 4
+##    Default = `500'
+##     * Incompatible with:  max_target_seqs
+##  -num_alignments <Integer, >=0>
+##    Number of database sequences to show alignments for
+##    Default = `250'
+##     * Incompatible with:  max_target_seqs
+##
+##  *** Restrict search or results
+##
+##  -max_target_seqs <Integer, >=1>
+##   Maximum number of aligned sequences to keep
+##   Not applicable for outfmt <= 4
+##   Default = `500'
+##    * Incompatible with:  num_descriptions, num_alignments
+##
+## So, taken at face value we do still need to treat the Text and HTML output
+## differently from the Tabular and XML, yet the treatment of these limits is
+## different (during search or after the search when writing the output):
+## https://blastedbio.blogspot.com/2015/12/blast-max-target-sequences-bug.html
+##
+## See also our user-facing help text.
 #if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
     #if str($output.out_format) in ["6", "ext", "cols", "5"]:
         ## Most output formats use this, including tabular and XML:
         -max_target_seqs '${adv_opts.max_hits}'
     #else
         ## Text and HTML output formats 0-4 currently need this instead:
-        -num_descriptions $adv_opts.max_hits -num_alignments $adv_opts.max_hits
+        -num_descriptions '${adv_opts.max_hits}' -num_alignments '${adv_opts.max_hits}'
     #end if
 #end if
 #if str($adv_opts.max_hsps)
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -17,7 +17,7 @@
 ##into a single cat command below
 cat
 #for i in $input_file:
-    #if $i.is_of_type('fasta.gz'):
+    #if $i.is_of_type('fasta.gz') and $i.ext != "fasta":
         <(gunzip -c ${i})
     #else:
         ${i}
--- a/tools/ncbi_blast_plus/ncbi_makeprofiledb.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_makeprofiledb.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -90,7 +90,7 @@
             <param name="contain_pssm_scores_type" value="yes" />
             <output name="outfile" file="empty_file.dat" ftype="blastdbd" >
                 <extra_files type="file" value="cd00003_and_cd00008.phr" name="blastdb.phr" />
-                <extra_files type="file" value="cd00003_and_cd00008.pin" name="blastdb.pin" lines_diff="2" />
+                <extra_files type="file" value="cd00003_and_cd00008.pin" name="blastdb.pin" compare="sim_size" delta="0" />
                 <extra_files type="file" value="cd00003_and_cd00008.psq" name="blastdb.psq" />
                 <extra_files type="file" value="cd00003_and_cd00008.freq" name="blastdb.freq" />
                 <extra_files type="file" value="cd00003_and_cd00008.loo" name="blastdb.loo" />
--- a/tools/ncbi_blast_plus/ncbi_rpsblast_wrapper.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_rpsblast_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -10,7 +10,7 @@
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 rpsblast
--query '$query'
+@QUERY@
 #if $db_opts.db_opts_selector == "db":
   -db '${db_opts.database.fields.path}'
 #elif $db_opts.db_opts_selector == "histdb":
@@ -28,7 +28,7 @@
 #end if
     </command>
     <inputs>
-        <param argument="-query" type="data" format="fasta" label="Protein query sequence(s)"/>
+        <expand macro="prot_query" />
 
         <expand macro="input_conditional_pssm" />
 
--- a/tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -10,7 +10,7 @@
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 rpstblastn
--query '$query'
+@QUERY@
 #if $db_opts.db_opts_selector == "db":
   -db '${db_opts.database.fields.path}'
 #elif $db_opts.db_opts_selector == "histdb":
@@ -28,7 +28,7 @@
 #end if
     </command>
     <inputs>
-        <param argument="-query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
+        <expand macro="nucl_query" />
 
         <expand macro="input_conditional_pssm" />
 
--- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -10,7 +10,7 @@
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 tblastn
--query '$query'
+@QUERY@
 @BLAST_DB_SUBJECT@
 -task $blast_type
 -evalue $evalue_cutoff
@@ -33,7 +33,7 @@
 #end if
     </command>
     <inputs>
-        <param argument="-query" type="data" format="fasta" label="Protein query sequence(s)"/>
+        <expand macro="prot_query" />
 
         <expand macro="input_conditional_nucleotide_db" />
         <param name="blast_type" argument="-task" type="select" display="radio" label="Type of BLAST">
--- a/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -10,7 +10,7 @@
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 tblastx
--query '$query'
+@QUERY@
 @BLAST_DB_SUBJECT@
 -query_gencode $query_gencode
 -evalue $evalue_cutoff
@@ -30,7 +30,7 @@
 #end if
     </command>
     <inputs>
-        <param argument="-query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
+        <expand macro="nucl_query" />
 
         <expand macro="input_conditional_nucleotide_db" />
         <expand macro="input_query_gencode" />
--- a/tools/ncbi_blast_plus/tool_dependencies.xml	Mon Jul 09 10:08:16 2018 -0400
+++ b/tools/ncbi_blast_plus/tool_dependencies.xml	Tue Oct 23 08:48:19 2018 -0400
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <package name="blast" version="2.7.1">
-        <repository changeset_revision="2e9109a8924f" name="package_blast_plus_2_7_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    <package name="blast" version="2.5.0">
+        <repository changeset_revision="5dd2b68c7d04" name="package_blast_plus_2_5_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>