diff tools/ncbi_blast_plus/ncbi_macros.xml @ 11:4c4a0da938ff draft

Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25. Supports $GALAXY_SLOTS. Includes more tests and heavy use of macros.
author peterjc
date Thu, 05 Dec 2013 06:55:59 -0500
parents
children 623f727cdff1
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml	Thu Dec 05 06:55:59 2013 -0500
@@ -0,0 +1,382 @@
+<macros>
+    <xml name="output_change_format">
+        <change_format>
+            <when input="out_format" value="0" format="txt"/>
+            <when input="out_format" value="0 -html" format="html"/>
+            <when input="out_format" value="2" format="txt"/>
+            <when input="out_format" value="2 -html" format="html"/>
+            <when input="out_format" value="4" format="txt"/>
+            <when input="out_format" value="4 -html" format="html"/>
+            <when input="out_format" value="5" format="blastxml"/>
+        </change_format>
+    </xml>
+    <xml name="input_out_format">
+        <param name="out_format" type="select" label="Output format">
+            <option value="6">Tabular (standard 12 columns)</option>
+            <option value="ext" selected="True">Tabular (extended 25 columns)</option>
+            <option value="5">BLAST XML</option>
+            <option value="0">Pairwise text</option>
+            <option value="0 -html">Pairwise HTML</option>
+            <option value="2">Query-anchored text</option>
+            <option value="2 -html">Query-anchored HTML</option>
+            <option value="4">Flat query-anchored text</option>
+            <option value="4 -html">Flat query-anchored HTML</option>
+            <!--
+            <option value="-outfmt 11">BLAST archive format (ASN.1)</option>
+            -->
+        </param>
+    </xml>
+    <xml name="input_scoring_matrix">
+        <param name="matrix" type="select" label="Scoring matrix">
+            <option value="BLOSUM90">BLOSUM90</option>
+            <option value="BLOSUM80">BLOSUM80</option>
+            <option value="BLOSUM62" selected="true">BLOSUM62 (default)</option>
+            <option value="BLOSUM50">BLOSUM50</option> 
+            <option value="BLOSUM45">BLOSUM45</option>
+            <option value="PAM250">PAM250</option>
+            <option value="PAM70">PAM70</option>
+            <option value="PAM30">PAM30</option>
+        </param>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+            <!-- Anything other than zero is an error -->
+            <exit_code range="1:" />
+            <exit_code range=":-1" />
+            <!-- In case the return code has not been set propery check stderr too -->
+            <regex match="Error:" />
+            <regex match="Exception:" />
+        </stdio>
+    </xml>
+    <xml name="input_query_gencode">
+        <param name="query_gencode" type="select" label="Query genetic code">
+            <!-- See http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi for details -->
+            <option value="1" select="True">1. Standard</option>
+            <option value="2">2. Vertebrate Mitochondrial</option>
+            <option value="3">3. Yeast Mitochondrial</option>
+            <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+            <option value="5">5. Invertebrate Mitochondrial</option>
+            <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+            <option value="9">9. Echinoderm Mitochondrial</option>
+            <option value="10">10. Euplotid Nuclear</option>
+            <option value="11">11. Bacteria and Archaea</option>
+            <option value="12">12. Alternative Yeast Nuclear</option>
+            <option value="13">13. Ascidian Mitochondrial</option>
+            <option value="14">14. Flatworm Mitochondrial</option>
+            <option value="15">15. Blepharisma Macronuclear</option>
+            <option value="16">16. Chlorophycean Mitochondrial Code</option>
+            <option value="21">21. Trematode Mitochondrial Code</option>
+            <option value="22">22. Scenedesmus obliquus mitochondrial Code</option>
+            <option value="23">23. Thraustochytrium Mitochondrial Code</option>
+            <option value="24">24. Pterobranchia mitochondrial code</option>
+        </param>
+    </xml>
+    <xml name="input_db_gencode">
+        <param name="db_gencode" type="select" label="Database/subject genetic code">
+            <!-- See http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi for details -->
+            <option value="1" select="True">1. Standard</option>
+            <option value="2">2. Vertebrate Mitochondrial</option>
+            <option value="3">3. Yeast Mitochondrial</option>
+            <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+            <option value="5">5. Invertebrate Mitochondrial</option>
+            <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+            <option value="9">9. Echinoderm Mitochondrial</option>
+            <option value="10">10. Euplotid Nuclear</option>
+            <option value="11">11. Bacteria and Archaea</option>
+            <option value="12">12. Alternative Yeast Nuclear</option>
+            <option value="13">13. Ascidian Mitochondrial</option>
+            <option value="14">14. Flatworm Mitochondrial</option>
+            <option value="15">15. Blepharisma Macronuclear</option>
+            <option value="16">16. Chlorophycean Mitochondrial Code</option>
+            <option value="21">21. Trematode Mitochondrial Code</option>
+            <option value="22">22. Scenedesmus obliquus mitochondrial Code</option>
+            <option value="23">23. Thraustochytrium Mitochondrial Code</option>
+            <option value="24">24. Pterobranchia mitochondrial code</option>
+        </param>
+    </xml>
+    <xml name="input_conditional_nucleotide_db">
+        <conditional name="db_opts">
+            <param name="db_opts_selector" type="select" label="Subject database/sequences">
+              <option value="db" selected="True">Locally installed BLAST database</option>
+              <option value="histdb">BLAST database from your history</option>
+              <option value="file">FASTA file from your history (see warning note below)</option>
+            </param>
+            <when value="db">
+                <param name="database" type="select" label="Nucleotide BLAST database">
+                    <options from_file="blastdb.loc">
+                      <column name="value" index="0"/>
+                      <column name="name" index="1"/>
+                      <column name="path" index="2"/>
+                    </options>
+                </param>
+                <param name="histdb" type="hidden" value="" />
+                <param name="subject" type="hidden" value="" /> 
+            </when>
+            <when value="histdb">
+                <param name="database" type="hidden" value="" />
+                <param name="histdb" type="data" format="blastdbn" label="Nucleotide BLAST database" />
+                <param name="subject" type="hidden" value="" />
+            </when>
+            <when value="file">
+                <param name="database" type="hidden" value="" />
+                <param name="histdb" type="hidden" value="" />
+                <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/> 
+            </when>
+        </conditional>
+    </xml>
+    <xml name="input_conditional_protein_db">
+        <conditional name="db_opts">
+            <param name="db_opts_selector" type="select" label="Subject database/sequences">
+              <option value="db" selected="True">Locally installed BLAST database</option>
+              <option value="histdb">BLAST database from your history</option>
+              <option value="file">FASTA file from your history (see warning note below)</option>
+            </param>
+            <when value="db">
+                <param name="database" type="select" label="Protein BLAST database">
+                    <options from_file="blastdb_p.loc">
+                      <column name="value" index="0"/>
+                      <column name="name" index="1"/>
+                      <column name="path" index="2"/>
+                    </options>
+                </param>
+                <param name="histdb" type="hidden" value="" />
+                <param name="subject" type="hidden" value="" />
+            </when>
+            <when value="histdb">
+                <param name="database" type="hidden" value="" />
+                <param name="histdb" type="data" format="blastdbp" label="Protein BLAST database" />
+                <param name="subject" type="hidden" value="" />
+            </when>
+            <when value="file">
+                <param name="database" type="hidden" value="" />
+                <param name="histdb" type="hidden" value="" />
+                <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/> 
+            </when>
+        </conditional>
+    </xml>
+    <xml name="input_conditional_pssm">
+        <conditional name="db_opts">
+            <param name="db_opts_selector" type="select" label="Protein domain database (PSSM)">
+              <option value="db" selected="True">Locally installed BLAST database</option>
+              <!-- TODO - define new datatype
+              <option value="histdb">BLAST protein domain database from your history</option>
+              -->
+            </param>
+            <when value="db">
+                <param name="database" type="select" label="Protein domain database">
+                    <options from_file="blastdb_d.loc">
+                      <column name="value" index="0"/>
+                      <column name="name" index="1"/>
+                      <column name="path" index="2"/>
+                    </options>
+                </param>
+                <param name="histdb" type="hidden" value="" />
+                <param name="subject" type="hidden" value="" />
+            </when>
+            <!-- TODO - define new datatype
+            <when value="histdb">
+                <param name="database" type="hidden" value="" />
+                <param name="histdb" type="data" format="blastdbd" label="Protein domain database" />
+                <param name="subject" type="hidden" value="" />
+            </when>
+            -->
+        </conditional>
+    </xml>
+    <xml name="input_conditional_choose_db_type">
+        <conditional name="db_opts">
+            <param name="db_type" type="select" label="Type of BLAST database">
+              <option value="nucl" selected="True">Nucleotide</option>
+              <option value="prot">Protein</option>
+            </param>
+            <when value="nucl">
+                <param name="database" type="select" label="Nucleotide BLAST database">
+                    <options from_file="blastdb.loc">
+                      <column name="value" index="0"/>
+                      <column name="name" index="1"/>
+                      <column name="path" index="2"/>
+                    </options>
+                </param>
+            </when>
+            <when value="prot">
+                <param name="database" type="select" label="Protein BLAST database">
+                    <options from_file="blastdb_p.loc">
+                      <column name="value" index="0"/>
+                      <column name="name" index="1"/>
+                      <column name="path" index="2"/>
+                    </options>
+                </param>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="input_parse_deflines">
+        <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/>
+    </xml>
+    <xml name="input_filter_query_default_false">
+        <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" />
+    </xml>
+    <xml name="input_filter_query_default_true">
+        <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="true" />
+    </xml>
+    <xml name="input_max_hits">
+        <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits">
+            <validator type="in_range" min="0" />
+        </param>        
+    </xml>
+    <xml name="input_evalue">
+        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />
+    </xml>
+    <xml name="input_word_size">
+        <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2.">
+            <validator type="in_range" min="0" />
+        </param>        
+    </xml>
+    <xml name="input_strand">
+        <param name="strand" type="select" label="Query strand(s) to search against database/subject">
+            <option value="-strand both">Both</option>
+            <option value="-strand plus">Plus (forward)</option>
+            <option value="-strand minus">Minus (reverse complement)</option>
+        </param>
+    </xml>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="binary">@BINARY@</requirement>
+            <requirement type="package" version="2.2.28">blast+</requirement>
+        </requirements>
+        <version_command>@BINARY@ -version</version_command>
+    </xml>
+    <xml name="advanced_options">
+        <conditional name="adv_opts">
+            <param name="adv_opts_selector" type="select" label="Advanced Options">
+              <option value="basic" selected="True">Hide Advanced Options</option>
+              <option value="advanced">Show Advanced Options</option>
+            </param>
+            <when value="basic" />
+            <when value="advanced">
+                <yield />
+            </when>
+        </conditional>        
+    </xml>
+    <token name="@THREADS@">-num_threads "\${GALAXY_SLOTS:-8}"</token>
+    <token name="@BLAST_DB_SUBJECT@">
+#if $db_opts.db_opts_selector == "db":
+  -db "${db_opts.database.fields.path}"
+#elif $db_opts.db_opts_selector == "histdb":
+  -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}"
+#else:
+  -subject "$db_opts.subject"
+#end if
+    </token>
+    <token name="@BLAST_OUTPUT@">-out "$output1"
+##Set the extended list here so when we add things, saved workflows are not affected
+#if str($out_format)=="ext":
+    -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen salltitles"
+#else:
+    -outfmt $out_format
+#end if
+    </token>
+    <token name="@ADVANCED_OPTIONS@">$adv_opts.filter_query
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+$adv_opts.parse_deflines
+    </token>
+    <!-- @ON_DB_SUBJECT@ is for use with @BLAST_DB_SUBJECT@ -->
+    <token name="@ON_DB_SUBJECT@">#if str($db_opts.db_opts_selector)=='db'
+${db_opts.database}
+#elif str($db_opts.db_opts_selector)=='histdb'
+${db_opts.histdb.name}
+#else
+${db_opts.subject.name}
+#end if</token>
+    <token name="@REFERENCES@">
+Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
+Galaxy tools and workflows for sequence analysis with applications
+in molecular plant pathology. PeerJ 1:e167
+http://dx.doi.org/10.7717/peerj.167
+
+Christiam Camacho et al. (2009).
+BLAST+: architecture and applications.
+BMC Bioinformatics. 15;10:421.
+http://dx.doi.org/10.1186/1471-2105-10-421
+
+This wrapper is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus
+    </token>
+    <token name="@OUTPUT_FORMAT@">**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+     1 qseqid    Query Seq-id (ID of your sequence)
+     2 sseqid    Subject Seq-id (ID of the database hit)
+     3 pident    Percentage of identical matches
+     4 length    Alignment length
+     5 mismatch  Number of mismatches
+     6 gapopen   Number of gap openings
+     7 qstart    Start of alignment in query
+     8 qend      End of alignment in query
+     9 sstart    Start of alignment in subject (database hit)
+    10 send      End of alignment in subject (database hit)
+    11 evalue    Expectation value (E-value)
+    12 bitscore  Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 25 column tabular
+BLAST output. Galaxy now uses this extended 25 column output by default.
+
+====== ============= ===========================================
+Column NCBI name     Description
+------ ------------- -------------------------------------------
+    13 sallseqid     All subject Seq-id(s), separated by ';'
+    14 score         Raw score
+    15 nident        Number of identical matches
+    16 positive      Number of positive-scoring matches
+    17 gaps          Total number of gaps
+    18 ppos          Percentage of positive-scoring matches
+    19 qframe        Query frame
+    20 sframe        Subject frame
+    21 qseq          Aligned part of query sequence
+    22 sseq          Aligned part of subject sequence
+    23 qlen          Query sequence length
+    24 slen          Subject sequence length
+    25 salltitles    All subject title(s), separated by '&lt;&gt;'
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+    </token>
+    <token name="@FASTA_WARNING@">.. class:: warningmark
+
+You can also search against a FASTA file of subject (target)
+sequences. This is *not* advised because it is slower (only one
+CPU is used), but more importantly gives e-values for pairwise
+searches (very small e-values which will look overly signficiant).
+In most cases you should instead turn the other FASTA file into a
+database first using *makeblastdb* and search against that.
+    </token>
+    <token name="@SEARCH_TIME_WARNING@">.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.  
+
+-----
+    </token>
+</macros>