Mercurial > repos > devteam > ncbi_blast_plus
diff tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml @ 11:4c4a0da938ff draft
Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25.
Supports $GALAXY_SLOTS.
Includes more tests and heavy use of macros.
author | peterjc |
---|---|
date | Thu, 05 Dec 2013 06:55:59 -0500 |
parents | 70e7dcbf6573 |
children | 623f727cdff1 |
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Mon Sep 23 06:14:13 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Thu Dec 05 06:55:59 2013 -0500 @@ -1,91 +1,40 @@ -<tool id="ncbi_blastn_wrapper" name="NCBI BLAST+ blastn" version="0.0.20"> +<tool id="ncbi_blastn_wrapper" name="NCBI BLAST+ blastn" version="0.0.22"> <description>Search nucleotide database with nucleotide query sequence(s)</description> <!-- If job splitting is enabled, break up the query file into parts --> - <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject,histdb" merge_outputs="output1"></parallelism> - <requirements> - <requirement type="binary">blastn</requirement> - <requirement type="package" version="2.2.26+">blast+</requirement> - </requirements> - <version_command>blastn -version</version_command> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1"></parallelism> + <macros> + <token name="@BINARY@">blastn</token> + <import>ncbi_macros.xml</import> + </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces blastn -query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#elif $db_opts.db_opts_selector == "histdb": - -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" -#else: - -subject "$db_opts.subject" -#end if +@BLAST_DB_SUBJECT@ -task $blast_type -evalue $evalue_cutoff --out "$output1" -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 +@BLAST_OUTPUT@ +@THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query $adv_opts.strand -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if +@ADVANCED_OPTIONS@ #if (str($adv_opts.identity_cutoff) and float(str($adv_opts.identity_cutoff)) > 0 ): -perc_identity $adv_opts.identity_cutoff #end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if $adv_opts.ungapped -$adv_opts.parse_deflines ## End of advanced options: #end if </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - <!-- In case the return code has not been set propery check stderr too --> - <regex match="Error:" /> - <regex match="Exception:" /> - </stdio> + + <expand macro="stdio" /> + <inputs> <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> - <conditional name="db_opts"> - <param name="db_opts_selector" type="select" label="Subject database/sequences"> - <option value="db" selected="True">Locally installed BLAST database</option> - <option value="histdb">BLAST database from your history</option> - <option value="file">FASTA file from your history (see warning note below)</option> - </param> - <when value="db"> - <param name="database" type="select" label="Nucleotide BLAST database"> - <options from_file="blastdb.loc"> - <column name="value" index="0"/> - <column name="name" index="1"/> - <column name="path" index="2"/> - </options> - </param> - <param name="histdb" type="hidden" value="" /> - <param name="subject" type="hidden" value="" /> - </when> - <when value="histdb"> - <param name="database" type="hidden" value="" /> - <param name="histdb" type="data" format="blastdbn" label="Nucleotide BLAST database" /> - <param name="subject" type="hidden" value="" /> - </when> - <when value="file"> - <param name="database" type="hidden" value="" /> - <param name="histdb" type="hidden" value="" /> - <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/> - </when> - </conditional> + + <expand macro="input_conditional_nucleotide_db" /> + <param name="blast_type" type="select" display="radio" label="Type of BLAST"> <option value="megablast">megablast</option> <option value="blastn">blastn</option> @@ -96,60 +45,26 @@ <option value="vecscreen">vecscreen</option> --> </param> - <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> - <param name="out_format" type="select" label="Output format"> - <option value="6">Tabular (standard 12 columns)</option> - <option value="ext" selected="True">Tabular (extended 24 columns)</option> - <option value="5">BLAST XML</option> - <option value="0">Pairwise text</option> - <option value="0 -html">Pairwise HTML</option> - <option value="2">Query-anchored text</option> - <option value="2 -html">Query-anchored HTML</option> - <option value="4">Flat query-anchored text</option> - <option value="4 -html">Flat query-anchored HTML</option> - <!-- - <option value="-outfmt 11">BLAST archive format (ASN.1)</option> - --> - </param> - <conditional name="adv_opts"> - <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> - <option value="advanced">Show Advanced Options</option> + <expand macro="input_evalue" /> + <expand macro="input_out_format" /> + <expand macro="advanced_options"> + <!-- Could use a select (yes, no, other) where other allows setting 'level window linker' --> + <param name="filter_query" type="boolean" label="Filter out low complexity regions (with DUST)" truevalue="-dust yes" falsevalue="-dust no" checked="true" /> + <expand macro="input_strand" /> + <expand macro="input_max_hits" /> + <param name="identity_cutoff" type="float" min="0" max="100" value="0" label="Percent identity cutoff (-perc_identity)" help="Use zero for no cutoff" /> + + <!-- I'd like word_size to be optional, with minimum 4 for blastn --> + <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 4."> + <validator type="in_range" min="0" /> </param> - <when value="basic" /> - <when value="advanced"> - <!-- Could use a select (yes, no, other) where other allows setting 'level window linker' --> - <param name="filter_query" type="boolean" label="Filter out low complexity regions (with DUST)" truevalue="-dust yes" falsevalue="-dust no" checked="true" /> - <param name="strand" type="select" label="Query strand(s) to search against database/subject"> - <option value="-strand both">Both</option> - <option value="-strand plus">Plus (forward)</option> - <option value="-strand minus">Minus (reverse complement)</option> - </param> - <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> - <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> - <validator type="in_range" min="0" /> - </param> - <param name="identity_cutoff" type="float" min="0" max="100" value="0" label="Percent identity cutoff (-perc_identity)" help="Use zero for no cutoff" /> - <!-- I'd like word_size to be optional, with minimum 4 for blastn --> - <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 4."> - <validator type="in_range" min="0" /> - </param> - <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped" falsevalue="" checked="false" /> - <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> - </when> - </conditional> + <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped" falsevalue="" checked="false" /> + <expand macro="input_parse_deflines" /> + </expand> </inputs> <outputs> - <data name="output1" format="tabular" label="${blast_type.value_label} on ${on_string}"> - <change_format> - <when input="out_format" value="0" format="txt"/> - <when input="out_format" value="0 -html" format="html"/> - <when input="out_format" value="2" format="txt"/> - <when input="out_format" value="2 -html" format="html"/> - <when input="out_format" value="4" format="txt"/> - <when input="out_format" value="4 -html" format="html"/> - <when input="out_format" value="5" format="blastxml"/> - </change_format> + <data name="output1" format="tabular" label="${blast_type.value_label} $query.name vs @ON_DB_SUBJECT@"> + <expand macro="output_change_format" /> </data> </outputs> <tests> @@ -166,12 +81,7 @@ </tests> <help> -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ +@SEARCH_TIME_WARNING@ **What it does** @@ -179,71 +89,11 @@ using the NCBI BLAST+ blastn command line tool. Algorithms include blastn, megablast, and discontiguous megablast. -.. class:: warningmark - -You can also search against a FASTA file of subject nucleotide -sequences. This is *not* advised because it is slower (only one -CPU is used), but more importantly gives e-values for pairwise -searches (very small e-values which will look overly signficiant). -In most cases you should instead turn the other FASTA file into a -database first using *makeblastdb* and search against that. +@FASTA_WARNING@ ----- -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. Galaxy now uses this extended 24 column output by default. - -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). +@OUTPUT_FORMAT@ ------- @@ -252,17 +102,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. -http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>