diff tools/ncbi_blast_plus/ncbi_makeblastdb.xml @ 23:31e517610e1f draft

v0.3.0 Updated for NCBI BLAST+ 2.7.1
author peterjc
date Sat, 30 Jun 2018 17:22:46 -0400
parents 6f386c5dc4fb
children e25d3acf6e68
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml	Mon Sep 18 06:21:27 2017 -0400
+++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml	Sat Jun 30 17:22:46 2018 -0400
@@ -5,21 +5,30 @@
         <import>ncbi_macros.xml</import>
     </macros>
     <expand macro="preamble" />
-    <command detect_errors="aggressive" strict="true">
+    <command detect_errors="aggressive" strict="true"><![CDATA[
 python $__tool_directory__/check_no_duplicates.py
 ##First check for duplicates (since BLAST+ 2.2.28 fails to do so)
 ##and abort (via the ampersand ampersand trick) if any are found.
 #for i in $input_file#'${i}' #end for#
-&amp;&amp;
-makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}'
+&&
+##makeblastdb does not accept process substitution for -in, e.g.
+##makeblastdb -in <(gunzip -c gzipped_fasta_file)
+##so every input (gunzipping the fasta.gz ones) is concatenated by a
+##single cat command and piped to makeblastdb, which reads stdin via -in -
+cat
+#for i in $input_file:
+    #if $i.is_of_type('fasta.gz'):
+        <(gunzip -c '${i}')
+    #else:
+        '${i}'
+    #end if
+#end for
+| makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}'
 $parse_seqids
 $hash_index
-## Single call to -in with multiple filenames space separated with outer quotes
-## (presumably any filenames with spaces would be a problem). Note this gives
-## some extra spaces, e.g. -in "file1 file2 file3 " but BLAST seems happy:
--in '#for i in $input_file#${i} #end for#'
+-in -
 #if $title:
--title '$title'
+-title '${title}'
 #else:
 ##Would default to being based on the cryptic Galaxy filenames, which is unhelpful
 -title 'BLAST Database'
@@ -46,8 +55,8 @@
 #end if
 ## --------------------------------------------------------------------
 ## Capture the stdout log information to the primary file (plain text):
-&gt; "$outfile"
-    </command>
+> '$outfile'
+    ]]></command>
     <inputs>
         <param argument="-dbtype" type="select" display="radio" label="Molecule type of input">
             <option value="prot">protein</option>
@@ -57,7 +66,7 @@
              NOTE Double check the new database would be self contained first
         -->
         <!-- Note this is a mandatory parameter - default should be most recent FASTA file -->
-        <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta" label="Input FASTA files(s)" help="One or more FASTA files" />
+        <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta,fasta.gz" label="Input FASTA files(s)" help="One or more FASTA files" />
         <param argument="-title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" />
         <param argument="-parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="false" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" />
         <param argument="-hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." />
@@ -110,7 +119,7 @@
             <param name="hash_index" value="true" />
             <output name="outfile" compare="contains" file="four_human_proteins.fasta.log.txt" ftype="blastdbp">
                 <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" />
-                <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" />
+                <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" compare="sim_size" delta="0" />
                 <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" />
                 <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" />
                 <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" />
@@ -129,7 +138,7 @@
             <param name="taxid" value="9606" />
             <output name="outfile" compare="contains" file="four_human_proteins_taxid.fasta.log.txt" ftype="blastdbp">
                 <extra_files type="file" value="four_human_proteins_taxid.fasta.phr" name="blastdb.phr" />
-                <extra_files type="file" value="four_human_proteins_taxid.fasta.pin" name="blastdb.pin" lines_diff="2" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.pin" name="blastdb.pin" compare="sim_size" delta="0" />
                 <extra_files type="file" value="four_human_proteins_taxid.fasta.psq" name="blastdb.psq" />
                 <extra_files type="file" value="four_human_proteins_taxid.fasta.pog" name="blastdb.pog" />
                 <extra_files type="file" value="four_human_proteins_taxid.fasta.phd" name="blastdb.phd" />
@@ -147,7 +156,7 @@
             <param name="mask_data_file" value="segmasker_four_human.maskinfo-asn1" ftype="maskinfo-asn1" />
             <output name="outfile" compare="contains" file="four_human_proteins.fasta.log.txt" ftype="blastdbp">
                 <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" />
-                <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" />
+                <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" compare="sim_size" delta="0" />
                 <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" />
                 <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" />
                 <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" />
@@ -158,7 +167,7 @@
         </test>
         <test>
             <param name="dbtype" value="nucl" />
-            <param name="input_file" value="three_human_mRNA.fasta" ftype="fasta" />
+            <param name="input_file" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" />
             <param name="title" value="Just 3 human mRNA sequences" />
             <param name="parse_seqids" value="" />
             <param name="hash_index" value="true" />
@@ -166,7 +175,7 @@
             <param name="taxid" value="9606" />
             <output name="outfile" compare="contains" file="three_human_mRNA.fasta.log.txt" ftype="blastdbn">
                 <extra_files type="file" value="three_human_mRNA.fasta.nhr" name="blastdb.nhr" />
-                <extra_files type="file" value="three_human_mRNA.fasta.nin" name="blastdb.nin" lines_diff="2" />
+                <extra_files type="file" value="three_human_mRNA.fasta.nin" name="blastdb.nin" compare="sim_size" delta="8" />
                 <extra_files type="file" value="three_human_mRNA.fasta.nsq" name="blastdb.nsq" />
                 <extra_files type="file" value="three_human_mRNA.fasta.nog" name="blastdb.nog" />
                 <extra_files type="file" value="three_human_mRNA.fasta.nhd" name="blastdb.nhd" />
@@ -184,15 +193,16 @@
 This is a wrapper for the NCBI BLAST+ tool 'makeblastdb', which is the
 replacement for the 'formatdb' tool in the NCBI 'legacy' BLAST suite.
 
+More information about makeblastdb can be found in the `BLAST Command Line Applications User Manual`_.
+
+.. _BLAST Command Line Applications User Manual: https://www.ncbi.nlm.nih.gov/books/NBK279690/
+
+
 <!--
 Applying masks to an existing BLAST database will not change the original database; a new database will be created.
 For this reason, it's best to apply all masks at once to minimize the number of unnecessary intermediate databases.
 -->
 
-**Documentation**
-
-https://www.ncbi.nlm.nih.gov/books/NBK279690/
-
 **References**
 
 If you use this Galaxy tool in work leading to a scientific publication please