changeset 30:acf4e47b734c draft

"2.10.1+galaxy1 with taxid improvements"
author peterjc
date Tue, 29 Mar 2022 14:54:02 +0000
parents 5edc472ec434
children 0e3cf9594bb7
files tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/get_species_taxids.xml tools/ncbi_blast_plus/ncbi_macros.xml
diffstat 3 files changed, 144 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/README.rst	Thu Sep 10 13:23:34 2020 +0000
+++ b/tools/ncbi_blast_plus/README.rst	Tue Mar 29 14:54:02 2022 +0000
@@ -136,6 +136,15 @@
 ============== ===============================================================
 Version        Changes
 -------------- ---------------------------------------------------------------
+2.10.1+galaxy1 - Add tool `NCBI get species taxids` that wraps NCBI's
+                 ``get_species_taxids.sh`` script
+                 (https://www.ncbi.nlm.nih.gov/books/NBK546209/).
+                 It determines all species taxids below a given
+                 taxon.
+               - Add the possibility to restrict BLAST searches taxonomically
+                 by species taxids given in a file.
+               - Properly quote cached databases.
+               - Make locally installed database selector non-optional.
 2.10.1+galaxy0 - Updated for NCBI BLAST+ 2.10.1 release.
                - Supports locally installed v4 or v5 format BLAST databases
                  (listed in the ``blastdb*.loc`` files).
@@ -248,6 +257,15 @@
         - Supports setting a taxonomy ID in ``makeblastdb`` wrapper.
         - Subtle changes like new conditional settings will require some old
           workflows be updated to cope.
+v0.0.22 - More use of macros to simplify the wrappers.
+        - Set number of threads via ``$GALAXY_SLOTS`` environment variable.
+        - More descriptive default output names.
+        - Tests require updated BLAST DB definitions (``blast_datatypes``
+          v0.0.18).
+        - Pre-check for duplicate identifiers in ``makeblastdb`` wrapper.
+        - Tests updated for BLAST+ 2.2.28 instead of BLAST+ 2.2.27.
+        - Now depends on ``package_blast_plus_2_2_28`` in ToolShed.
+        - Extended tabular output includes 'salltitles' as column 25.
 v0.0.21 - Use macros to simplify the XML wrappers (by John Chilton).
         - Added wrapper for dustmasker.
         - Enabled masking for makeblastdb (Nicola Soranzo).
@@ -288,15 +306,6 @@
           e-values
 v0.0.11 - Final revision as part of the Galaxy main repository, and the
           first release via the Tool Shed
-v0.0.22 - More use of macros to simplify the wrappers.
-        - Set number of threads via ``$GALAXY_SLOTS`` environment variable.
-        - More descriptive default output names.
-        - Tests require updated BLAST DB definitions (``blast_datatypes``
-          v0.0.18).
-        - Pre-check for duplicate identifiers in ``makeblastdb`` wrapper.
-        - Tests updated for BLAST+ 2.2.28 instead of BLAST+ 2.2.27.
-        - Now depends on ``package_blast_plus_2_2_28`` in ToolShed.
-        - Extended tabular output includes 'salltitles' as column 25.
 ======= ======================================================================
 
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/get_species_taxids.xml	Tue Mar 29 14:54:02 2022 +0000
@@ -0,0 +1,106 @@
+<tool id="get_species_taxids" name="NCBI get species taxids" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description></description>
+    <macros>
+        <import>ncbi_macros.xml</import>
+    </macros>
+    <expand macro="preamble"/>
+    <command detect_errors="aggressive"><![CDATA[
+## Collect the species taxids of every requested taxon in species_ids.txt,
+## then write the numerically sorted, de-duplicated list to the output dataset.
+#if $type_cond.type_sel == 'names'
+    ## Names mode: look up the taxid of each name first and abort with exit
+    ## status 1 when the lookup yields nothing. The emptiness test quotes the
+    ## shell variable so it stays a valid one-argument test even if the lookup
+    ## printed several words or lines. The later -t call leaves it unquoted on
+    ## purpose: sed only removes the "Taxid:" prefix, and shell word splitting
+    ## drops any blank that followed it.
+    #for name in $type_cond.names.split(',')
+        taxid=\$(get_species_taxids.sh -n '$name' | grep Taxid | sed 's/Taxid://') &&
+        if [ -z "\$taxid" ]; then
+            >&2 echo "could not find taxid for $name" && exit 1;
+        else
+            echo " $name -> \$taxid";
+        fi &&
+        get_species_taxids.sh -t \$taxid >> species_ids.txt &&
+    #end for
+#else
+    ## Ids mode: the given taxon ids are passed to the script directly.
+    #for taxid in $type_cond.ids.split(',')
+       get_species_taxids.sh -t $taxid >> species_ids.txt &&
+    #end for
+#end if
+sort -n -u  species_ids.txt > '$output'
+    ]]></command>
+    <inputs>
+        <!-- The user picks whether taxa are given by name or by numeric id.
+             Both validators require a comma separated list without zero-length
+             items (no leading, trailing or doubled commas), because the command
+             template passes every item to get_species_taxids.sh as an argument
+             of its own. -->
+        <conditional name="type_cond">
+            <param name="type_sel" type="select" label="Get taxids by">
+                <option value="names">Taxon names</option>
+                <option value="ids">Taxon ids</option>
+            </param>
+            <when value="names">
+                <param name="names" type="text" label="Taxon names" help="comma separated">
+                    <validator type="regex" message="Enter a comma separated list of taxon names">^[a-zA-Z ]+(,[a-zA-Z ]+)*$</validator>
+                </param>
+            </when>
+            <when value="ids">
+                <param name="ids" type="text" label="Taxon ids" help="comma separated">
+                    <validator type="regex" message="Enter a comma separated list of taxids">^[0-9]+(,[0-9]+)*$</validator>
+                </param>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <!-- Single plain-text dataset; the command template redirects the result of
+             "sort -n -u species_ids.txt" into it. -->
+        <data format="txt" name="output"/>
+    </outputs>
+    <tests>
+        <!-- Names mode with a single taxon name. -->
+        <test>
+            <param name="type_cond|type_sel" value="names"/>
+            <param name="type_cond|names" value="Enterobacterales"/>
+            <output name="output" ftype="txt">
+                <assert_contents>
+                    <has_line line="9"/>
+                    <has_line line="2791989"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Ids mode with a single taxon id; asserts the same lines as the single-name
+             test (presumably 91347 is the taxid of Enterobacterales; confirm). -->
+        <test>
+            <param name="type_cond|type_sel" value="ids"/>
+            <param name="type_cond|ids" value="91347"/>
+            <output name="output" ftype="txt">
+                <assert_contents>
+                    <has_line line="9"/>
+                    <has_line line="2791989"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Names mode with two comma separated names; additionally expects line 9606. -->
+        <test>
+            <param name="type_cond|type_sel" value="names"/>
+            <param name="type_cond|names" value="Enterobacterales,Hominidae"/>
+            <output name="output" ftype="txt">
+                <assert_contents>
+                    <has_line line="9"/>
+                    <has_line line="9606"/>
+                    <has_line line="2791989"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Ids mode with two comma separated ids; same assertions as the two-name test. -->
+        <test>
+            <param name="type_cond|type_sel" value="ids"/>
+            <param name="type_cond|ids" value="91347,9604"/>
+            <output name="output" ftype="txt">
+                <assert_contents>
+                    <has_line line="9"/>
+                    <has_line line="9606"/>
+                    <has_line line="2791989"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+Returns a sorted list of the unique species taxids below one or more taxa, which are given as comma separated taxon names or taxon ids. It relies on the get_species_taxids.sh script of the BLAST+ package, see https://www.ncbi.nlm.nih.gov/books/NBK546209/
+
+-------
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following papers:
+
+@REFERENCES@
+    </help>
+    <expand macro="blast_citations"/>
+</tool>
--- a/tools/ncbi_blast_plus/ncbi_macros.xml	Thu Sep 10 13:23:34 2020 +0000
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml	Tue Mar 29 14:54:02 2022 +0000
@@ -1,6 +1,6 @@
 <macros>
     <token name="@TOOL_VERSION@">2.10.1</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">16.10</token>
     <xml name="parallelism">
         <!-- If job splitting is enabled, break up the query file into parts -->
@@ -353,7 +353,7 @@
               <option value="file">FASTA file from your history (see warning note below)</option>
             </param>
             <when value="db">
-                <param name="database" type="select" multiple="true" label="Nucleotide BLAST database">
+                <param name="database" type="select" multiple="true" optional="false" label="Nucleotide BLAST database">
                     <options from_data_table="blastdb" />
                 </param>
                 <param name="histdb" type="hidden" value="" />
@@ -381,7 +381,7 @@
               <option value="file">FASTA file from your history (see warning note below)</option>
             </param>
             <when value="db">
-                <param name="database" type="select" multiple="true" label="Protein BLAST database">
+                <param name="database" type="select" multiple="true" optional="false" label="Protein BLAST database">
                     <options from_data_table="blastdb_p" />
                 </param>
                 <param name="histdb" type="hidden" value="" />
@@ -452,7 +452,7 @@
                           <option value="histdb">BLAST database from your history</option>
                       </param>
                       <when value="db">
-                          <param name="database" argument="-db" type="select" multiple="true" label="Protein BLAST database">
+                          <param name="database" argument="-db" type="select" multiple="true" optional="false" label="Protein BLAST database">
                             <options from_data_table="blastdb_p" />
                         </param>
                         <param name="histdb" type="hidden" value="" />
@@ -558,6 +558,8 @@
                 <option value="gilist">GI identifiers</option>
                 <option value="negative_gilist">Negative GI identifiers</option>
                 <option value="seqidlist">Sequence identifiers (SeqId's)</option>
+                <option value="taxidlist">Taxonomy identifiers (TaxId's)</option>
+                <option value="negative_taxidlist">Negative taxonomy identifiers (TaxId's)</option>
             </param>
             <when value="none" />
             <when value="gilist">
@@ -572,8 +574,17 @@
                 <param argument="-seqidlist" type="data" format="txt" label=" Restrict search of database to list of SeqId's"
                        help="This option is only available for database searches."/>
             </when>
+            <when value="taxidlist">
+                <param argument="-taxidlist" type="data" format="txt" label="Restrict search of database to list of TaxId's"
+                       help="This option is only available for database searches."/>
+            </when>
+            <when value="negative_taxidlist">
+                <!-- Negative list: the search is restricted to everything except the given TaxIds,
+                     so the label is worded as an exclusion rather than repeating the -taxidlist label. -->
+                <param argument="-negative_taxidlist" type="data" format="txt" label="Restrict search of database to everything except the listed TaxId's"
+                       help="This option is only available for database searches."/>
+            </when>
         </conditional>
     </xml>
+
 <!--Tokens-->
     <token name="@ADV_MATRIX_GAPCOSTS@"><![CDATA[
 #if str($adv_opts.matrix_gapcosts.matrix):
@@ -595,6 +606,10 @@
     -gilist '{$adv_opts.adv_optional_id_files_opts.gilist}'
 #elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'seqidlist':
     -seqidlist '${adv_opts.adv_optional_id_files_opts.seqidlist}'
+#elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'taxidlist':
+    -taxidlist '${adv_opts.adv_optional_id_files_opts.taxidlist}'
+#elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'negative_taxidlist':
+    -negative_taxidlist '${adv_opts.adv_optional_id_files_opts.negative_taxidlist}'
 #end if
     ]]></token>
 
@@ -621,7 +636,7 @@
     <!-- Implement -db ... / -subject ... command line options -->
     <token name="@BLAST_DB_SUBJECT@"><![CDATA[
 #if $db_opts.db_opts_selector == "db":
-  -db '${" ".join(str($db_opts.database.fields.path).split(","))}'
+  -db '"${'" "'.join(str($db_opts.database.fields.path).split(","))}"'
 #elif $db_opts.db_opts_selector == "histdb":
   -db '${os.path.join($db_opts.histdb.extra_files_path, "blastdb")}'
 #else: