comparison bwa-mem.xml @ 0:ff1ae217ccc2 draft

Uploaded
author devteam
date Tue, 16 Dec 2014 15:22:56 -0500
parents
children c71dd035971e
comparison
equal deleted inserted replaced
-1:000000000000 0:ff1ae217ccc2
1 <?xml version="1.0"?>
2 <tool id="bwa_mem" name="BWA-MEM" version="0.1">
3
4 <macros> <!-- shared definitions are imported from bwa_macros.xml; presumably this is where the @dataset_collections@, @RG@ and @info@ tokens used in the help section come from (verify against that file) -->
5 <import>bwa_macros.xml</import>
6 </macros>
7
8 <requirements> <!-- external packages invoked by the command section: bwa (index, mem) and samtools (view, sort) -->
9 <requirement type="package" version="0.7.10.039ea20639">bwa</requirement>
10 <requirement type="package" version="1.1">samtools</requirement>
11 </requirements>
12 <description>- map medium and long reads (&gt; 100 bp) against reference genome</description>
13 <command>
14
15 #set $reference_fasta_filename = "localref.fa"
16
17 #if str( $reference_source.reference_source_selector ) == "history":
18
19 ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &amp;&amp;
20
21 ## Decide which of the BWA indexing algorithms (IS or BWTSW) will be run, depending on the size of the input FASTA dataset.
22 ## The size is read with "wc -c" through a redirect: this follows the symlink created above (stat would report the size of the link itself)
23 ## and works unchanged on Linux and OSX. The index is built exactly once, and a failing "bwa index" makes the subshell fail so mapping is not attempted.
24 (
25 size=\$(wc -c &lt; "${reference_fasta_filename}") &amp;&amp;
26 if [ \$size -lt 2000000000 ];
27 then
28 echo "Generating BWA index with is algorithm";
29 bwa index -a is "${reference_fasta_filename}";
30 else
31 echo "Generating BWA index with bwtsw algorithm";
32 bwa index -a bwtsw "${reference_fasta_filename}";
33 fi;
50 ) &amp;&amp;
51
52 #else:
53 #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
54 #end if
55
56 ## Begin BWA-MEM command line; the options, reference and read files emitted further down are appended to this invocation
57
58 bwa mem
59 -t "\${GALAXY_SLOTS:-1}" ## thread count is taken from the GALAXY_SLOTS environment variable, falling back to 1 when it is unset
60 -v 1 ## Verbosity is set to 1 (errors only)
61
62 #if str( $fastq_input.fastq_input_selector ) == "paired_iv": ## For interleaved fastq files set -p option
63 -p
64 #if str( $fastq_input.iset_stats ): ## pass -I only when the optional insert-size statistics field is non-empty
65 -I "${fastq_input.iset_stats}"
66 #end if
67 #end if
68
69 #if str( $analysis_type.analysis_type_selector ) == "pacbio":
70 -x pacbio ## -x takes a read-type preset name; a bare -x would consume the next token on the command line
71
72 #elif str( $analysis_type.analysis_type_selector ) == "full":
73
74 #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "set": ## Algorithmic options; the selector values are "set" / "do_not_set"
75
76 -k "${analysis_type.algorithmic_options.k}"
77 -w "${analysis_type.algorithmic_options.w}"
78 -d "${analysis_type.algorithmic_options.d}"
79 -r "${analysis_type.algorithmic_options.r}"
80 -y "${analysis_type.algorithmic_options.y}"
81 -c "${analysis_type.algorithmic_options.c}"
82 -D "${analysis_type.algorithmic_options.D}"
83 -W "${analysis_type.algorithmic_options.W}"
84 -m "${analysis_type.algorithmic_options.m}"
85 ${analysis_type.algorithmic_options.S}
86 ${analysis_type.algorithmic_options.P}
87 ${analysis_type.algorithmic_options.e}
88
89 #end if
90
91 #if str( $analysis_type.scoring_options.scoring_options_selector ) == "set": ## Scoring options
92
93 -A "${analysis_type.scoring_options.A}"
94 -B "${analysis_type.scoring_options.B}"
95 -O "${analysis_type.scoring_options.O}"
96 -E "${analysis_type.scoring_options.E}"
97 -L "${analysis_type.scoring_options.L}"
98 -U "${analysis_type.scoring_options.U}"
99
100 #end if
101
102 #if str( $analysis_type.io_options.io_options_selector ) == "set": ## IO options
103
104 -T "${analysis_type.io_options.T}"
105 -h "${analysis_type.io_options.h}"
106 ${analysis_type.io_options.a}
107 ${analysis_type.io_options.C}
108 ${analysis_type.io_options.V}
109 ${analysis_type.io_options.Y}
110 ${analysis_type.io_options.M}
111
112 #end if
113
114 #end if
115
116 #if str( $rg.rg_selector ) == "set": ## the rg selector offers "set" / "do_not_set"; emit the @RG header line only when it is set
117 -R "@RG\tID:$rg.ID\tSM:$rg.SM"
118 #end if
119
120 ## Insert-size statistics (-I) for the two-file paired and paired-collection inputs; emitted only when the field is non-empty
121
122 #if str( $fastq_input.fastq_input_selector ) in ( "paired", "paired_collection" ):
123 #if str( $fastq_input.iset_stats ):
124 -I "${fastq_input.iset_stats}"
125 #end if
126 #end if
127
128 ## The reference always comes first, followed by one or two read files depending on the input type
129
130 "${reference_fasta_filename}"
131
132 #if str( $fastq_input.fastq_input_selector ) == "paired":
133 "${fastq_input.fastq_input1}" "${fastq_input.fastq_input2}"
134 #elif str( $fastq_input.fastq_input_selector ) == "paired_collection":
135 "${fastq_input.fastq_input1.forward}" "${fastq_input.fastq_input1.reverse}"
136 #else:
137 "${fastq_input.fastq_input1}"
138 #end if
149
150 | samtools view -Sb - &gt; temporary_bam_file.bam &amp;&amp; ## convert the SAM stream from bwa mem to BAM in the job working directory
151
152 samtools sort -f temporary_bam_file.bam "${bam_output}" ## -f: treat the last argument as the full output file name; quoted like every other path in this command
153
154 </command>
155
156 <inputs>
157
158 <conditional name="reference_source"> <!-- reference comes either from a data-table entry or from a FASTA dataset in the history -->
159 <param type="select" name="reference_source_selector" label="Load reference genome from">
160 <option value="cached">Local cache</option>
161 <option value="history">History</option>
162 </param>
163 <when value="cached"> <!-- ref_file is a select populated from the bwa_mem_indexes data table -->
164 <param type="select" name="ref_file" label="Using reference genome" help="Select genome from the list">
165 <options from_data_table="bwa_mem_indexes">
166 <filter column="2" type="sort_by"/>
167 <validator message="No indexes are available" type="no_options"/>
168 </options>
169 <validator message="A built-in reference genome is not available for the build associated with the selected input file" type="no_options"/>
170 </param>
171 </when>
172 <when value="history"> <!-- ref_file is a FASTA dataset; the command section indexes it before mapping -->
173 <param type="data" name="ref_file" format="fasta" label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence to the history and use it as reference"/>
174 </when>
175 </conditional>
176 <conditional name="fastq_input"> <!-- read input layout; iset_stats (-I) is offered for every paired layout and restricted to digits and commas -->
177 <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
178 <option value="paired">Paired</option>
179 <option value="single">Single</option>
180 <option value="paired_collection">Paired Collection</option>
181 <option value="paired_iv">Paired Interleaved</option>
182 </param>
183 <when value="paired">
184 <param name="fastq_input1" type="data" format="fastqsanger" label="Select first set of reads" help="Specify dataset with forward reads"/>
185 <param name="fastq_input2" type="data" format="fastqsanger" label="Select second set of reads" help="Specify dataset with reverse reads"/>
186 <param name="iset_stats" type="text" optional="True" size="10" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
187 <sanitizer invalid_char="">
188 <valid initial="string.digits"><add value=","/> </valid>
189 </sanitizer>
190 </param>
191 </when>
192 <when value="single">
193 <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with single reads"/>
194 </when>
195 <when value="paired_collection">
196 <param name="fastq_input1" format="fastqsanger" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/>
197 <param name="iset_stats" type="text" optional="True" size="10" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
198 <sanitizer invalid_char="">
199 <valid initial="string.digits"><add value=","/> </valid>
200 </sanitizer>
201 </param>
202 </when>
203 <when value="paired_iv">
204 <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
205 <param name="iset_stats" type="text" optional="True" size="10" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
206 <sanitizer invalid_char="">
207 <valid initial="string.digits"><add value=","/> </valid>
208 </sanitizer>
209 </param>
210 </when>
211 </conditional>
212
213
214 <conditional name="rg"> <!-- optional read-group header; ID and SM are interpolated into a double-quoted shell argument by the command section -->
215 <param name="rg_selector" type="select" label="Set read groups information?" help="Specifying readgroup information can greatly simplify your downstream analyses by allowing combining multiple datasets. See help below for more details">
216 <option value="set">Set</option>
217 <option value="do_not_set" selected="True">Do not set</option>
218 </param>
219 <when value="set"> <!-- the characters that stay active inside double quotes in the shell (double quote, backtick, dollar, backslash) are removed from the whitelist so user text cannot break out of the -R argument -->
220 <param name="ID" type="text" value="readgroup1" size="20" label="Specify readgroup ID" help="This value must be unique among multiple samples in your experiment">
221 <sanitizer invalid_char="">
222 <valid initial="string.printable"><remove value="&quot;"/><remove value="`"/><remove value="$"/><remove value="\"/></valid>
223 </sanitizer>
224 </param>
225 <param name="SM" type="text" value="blood" size="20" label="Specify readgroup sample name (SM)" help="This value should be descriptive">
226 <sanitizer invalid_char="">
227 <valid initial="string.printable"><remove value="&quot;"/><remove value="`"/><remove value="$"/><remove value="\"/></valid>
228 </sanitizer>
229 </param>
230 </when>
231 <when value="do_not_set">
232 <!-- do nothing -->
233 </when>
234 </conditional>
235
236 <conditional name="analysis_type"> <!-- analysis mode; the "full" mode exposes three independent option groups, each gated by its own set / do_not_set selector -->
237 <param name="analysis_type_selector" type="select" label="Select analysis mode">
238 <option value="illumina">1.Simple Illumina mode</option>
239 <option value="pacbio">2.PacBio mode</option>
240 <option value="full">3.Full list of options</option>
241 </param>
242 <when value="illumina">
243 <!-- do nothing -->
244 </when>
245 <when value="pacbio">
246 <!-- do nothing. all magic happens within <command> tag -->
247 </when>
248 <when value="full">
249 <conditional name="algorithmic_options">
250 <param name="algorithmic_options_selector" type="select" label="Set algorithmic options?" help="Sets -k, -w, -d, -r, -y, -c, -D, -W, -m, -S, -P, and -e options.">
251 <option value="set">Set</option>
252 <option value="do_not_set" selected="True">Do not set</option>
253 </param>
254 <when value="set">
255 <param name="k" type="integer" value="19" label="minimum seed length" help="-k; default=19"/>
256 <param name="w" type="integer" value="100" label="band width for banded alignment" help="-w; default=100"/>
257 <param name="d" type="integer" value="100" label="off-diagonal X-dropoff" help="-d; default=100"/>
258 <param name="r" type="float" value="1.5" label="look for internal seeds inside a seed longer than -k * THIS VALUE" help="-r; default=1.5"/>
259 <param name="y" type="integer" value="0" label="find maximum exact matches (MEMs) longer than -k * -r with size less than THIS VALUE" help="-y; default=0"/>
260 <param name="c" type="integer" value="500" label="skip seeds with more than that many occurrences" help="-c; default=500"/>
261 <param name="D" type="float" value="0.5" label="drop chains shorter than this fraction of the longest overlapping chain" help="-D; default=0.5"/>
262 <param name="W" type="integer" value="0" label="discard a chain if seeded bases shorter than" help="-W; default=0"/>
263 <param name="m" type="integer" value="50" label="perform at most this many rounds of mate rescues for each read" help="-m; default=50"/>
264 <param name="S" type="boolean" truevalue="-S" falsevalue="" label="skip mate rescue" help="-S"/>
265 <param name="P" type="boolean" truevalue="-P" falsevalue="" label="skip pairing; mate rescue performed unless -S also in use" help="-P"/>
266 <param name="e" type="boolean" truevalue="-e" falsevalue="" label="discard full-length exact matches" help="-e"/>
267 </when>
268 <when value="do_not_set">
269 <!-- do nothing -->
270 </when>
271 </conditional>
272
273 <conditional name="scoring_options">
274 <param name="scoring_options_selector" type="select" label="Set scoring options?" help="Sets -A, -B, -O, -E, -L, and -U options.">
275 <option value="set">Set</option>
276 <option value="do_not_set" selected="True">Do not set</option>
277 </param>
278 <when value="set">
279 <param name="A" type="integer" value="1" label="score for a sequence match" help="-A; scales options -T, -d, -B, -O, -E, -L, and -U; default=1"/>
280 <param name="B" type="integer" value="4" label="penalty for mismatch" help="-B; default=4"/>
281 <param name="O" type="text" value="6,6" label="gap open penalty for deletions and insertions" help="-O; default=6,6">
282 <sanitizer invalid_char="">
283 <valid initial="string.digits"><add value=","/> </valid>
284 </sanitizer>
285 </param>
286 <param name="E" type="text" value="1,1" label="gap extension penalty; a gap of size k cost &#39;-O + -E*k&#39; " help="-E; default=1,1">
287 <sanitizer invalid_char="">
288 <valid initial="string.digits"><add value=","/> </valid>
289 </sanitizer>
290 </param>
291 <param name="L" type="text" value="5,5" label="penalty for 5&#39;-end and 3&#39;-end clipping" help="-L; default=5,5">
292 <sanitizer invalid_char="">
293 <valid initial="string.digits"><add value=","/> </valid>
294 </sanitizer>
295 </param>
296 <param name="U" type="integer" value="17" label="penalty for an unpaired read pair" help="-U; default=17"/>
297 </when>
298 <when value="do_not_set">
299 <!-- do nothing -->
300 </when>
301 </conditional>
302
303 <conditional name="io_options">
304 <param name="io_options_selector" type="select" label="Set input/output options" help="Sets -T, -h, -a, -C, -V, -Y, and -M options.">
305 <option value="set">Set</option>
306 <option value="do_not_set" selected="True">Do not set</option>
307 </param>
308 <when value="set">
309 <param name="T" type="integer" value="30" label="minimum score to output" help="-T; default=30"/>
310 <param name="h" type="integer" value="5" label="if there are fewer than this many hits with score &gt;80% of the max score, output all in XA tag" help="-h; default=5"/>
311 <param name="a" type="boolean" truevalue="-a" falsevalue="" label="output all alignments for single-ends or unpaired paired-ends" help="-a"/>
312 <param name="C" type="boolean" truevalue="-C" falsevalue="" label="append FASTA/FASTQ comment to BAM output" help="-C"/>
313 <param name="V" type="boolean" truevalue="-V" falsevalue="" label="output the reference FASTA header in the XR tag" help="-V"/>
314 <param name="Y" type="boolean" truevalue="-Y" falsevalue="" label="use soft clipping for supplementary alignments" help="-Y"/>
315 <param name="M" type="boolean" truevalue="-M" falsevalue="" label="mark shorter split hits as secondary" help="-M"/>
316 </when>
317 <when value="do_not_set">
318 <!-- do nothing -->
319 </when>
320 </conditional>
321 </when>
322 </conditional>
323 </inputs>
324
325 <outputs> <!-- one BAM dataset; the command section writes it with samtools sort -->
326 <data name="bam_output" format="bam" label="${tool.name} on ${on_string} (mapped reads in BAM format)"/>
327 </outputs>
328
329 <tests> <!-- one functional test: paired reads against a history FASTA reference in the simple Illumina mode -->
330 <test>
331 <param value="history" name="reference_source_selector"/>
332 <param value="bwa-mem-mt-genome.fa" name="ref_file" ftype="fasta"/>
333 <param value="paired" name="fastq_input_selector"/>
334 <param value="bwa-mem-fastq1.fq" name="fastq_input1" ftype="fastqsanger"/>
335 <param value="bwa-mem-fastq2.fq" name="fastq_input2" ftype="fastqsanger"/>
336 <param value="illumina" name="analysis_type_selector"/>
337 <output file="bwa-mem-test1.bam" name="bam_output" ftype="bam" lines_diff="2"/>
338 </test>
339 </tests>
340 <stdio> <!-- any non-zero exit code fails the job; output on stderr alone does not -->
341 <exit_code range="1:" level="fatal"/>
342 </stdio>
343 <help>
344
345 **What it does**
346
347 From http://arxiv.org/abs/1303.3997:
348
349 BWA-MEM is a new alignment algorithm for aligning sequence reads or long query sequences against a large reference genome such as human.
350 It automatically chooses between local and end-to-end alignments, supports paired-end reads and performs chimeric alignment.
351 The algorithm is robust to sequencing errors and applicable to a wide range of sequence lengths from 70bp to a few megabases.
352 For mapping 100bp sequences, BWA-MEM shows better performance than several state-of-art read aligners to date.
353
354 It is best suited for mapping long (>70 nt) reads against large reference genomes.
355
356 This Galaxy tool wraps the bwa-mem module of the bwa read mapping tool. The Galaxy implementation takes fastq files as input and produces output in BAM (not SAM) format, which can be further processed using various BAM utilities existing in Galaxy (BAMTools, SAMTools, Picard).
357
358 -----
359
360 **Galaxy-specific option**
361
362 Galaxy allows three levels of control over bwa-mem options provided by **Select analysis mode** menu option. These are:
363
364 1. *Simple Illumina mode*: The simplest possible bwa mem application in which it aligns single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem &lt;reference index&gt; &lt;fastq dataset1&gt; [fastq dataset2]
365 2. *PacBio mode*: The mode adjusted specifically for mapping of long PacBio subreads. Equivalent to the following command: bwa mem -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 &lt;reference index&gt; &lt;PacBio dataset in fastq format&gt;
366 3. *Full list of options*: Allows access to all options through Galaxy interface.
367
368 ------
369
370 **BWA MEM options**
371
372 Each Galaxy parameter widget corresponds to command line flags listed below:
373
374 Algorithm options::
375
376 -k INT minimum seed length [19]
377 -w INT band width for banded alignment [100]
378 -d INT off-diagonal X-dropoff [100]
379 -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [1.5]
380 -y INT find MEMs longer than {-k} * {-r} with size less than INT [0]
381 -c INT skip seeds with more than INT occurrences [500]
382 -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [0.50]
383 -W INT discard a chain if seeded bases shorter than INT [0]
384 -m INT perform at most INT rounds of mate rescues for each read [50]
385 -S skip mate rescue
386 -P skip pairing; mate rescue performed unless -S also in use
387 -e discard full-length exact matches
388
389 Scoring options::
390
391 -A INT score for a sequence match, which scales options -TdBOELU unless overridden [1]
392 -B INT penalty for a mismatch [4]
393 -O INT[,INT] gap open penalties for deletions and insertions [6,6]
394 -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [1,1]
395 -L INT[,INT] penalty for 5'- and 3'-end clipping [5,5]
396 -U INT penalty for an unpaired read pair [17]
397
398 Input/output options::
399
400 -p first query file consists of interleaved paired-end sequences
401 -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]
402
403 -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [3]
404 -T INT minimum score to output [30]
405 -h INT if there are &lt;INT hits with score &gt;80% of the max score, output all in XA [5]
406 -a output all alignments for SE or unpaired PE
407 -C append FASTA/FASTQ comment to SAM output
408 -V output the reference FASTA header in the XR tag
409 -Y use soft clipping for supplementary alignments
410 -M mark shorter split hits as secondary
411
412 -I FLOAT[,FLOAT[,INT[,INT]]]
413 specify the mean, standard deviation (10% of the mean if absent), max
414 (4 sigma from the mean if absent) and min of the insert size distribution.
415 FR orientation only. [inferred]
416
417
418 @dataset_collections@
419
420 @RG@
421
422 @info@
423
424
425
426 </help>
427 <citations> <!-- two DOI citations plus a BibTeX entry for the arXiv BWA-MEM preprint that the help section quotes -->
428 <citation type="doi">10.1093/bioinformatics/btp324</citation>
429 <citation type="doi">10.1093/bioinformatics/btp698</citation>
430 <citation type="bibtex">@misc{1303.3997,
431 Author = {Heng Li},
432 Title = {Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM},
433 Year = {2013},
434 Eprint = {arXiv:1303.3997},
435 url = {http://arxiv.org/abs/1303.3997},
436 }</citation>
437 </citations>
438 </tool>