comparison bwa.xml @ 0:5e72d136a39e draft

Uploaded
author devteam
date Mon, 29 Sep 2014 16:22:24 -0400
parents
children 6bfb657c8fe1
comparison
equal deleted inserted replaced
-1:000000000000 0:5e72d136a39e
1 <?xml version="1.0"?>
2 <tool id="bwa_aln_0_7_10" name="BWA" version="bwa-0.7.10-r837-dirty_galaxy_0.1">
3 <requirements>
4 <requirement type="package" version="0.7.10.039ea20639">bwa</requirement>
5 <requirement type="package" version="1.1">samtools</requirement>
6 </requirements>
7 <description>- map short reads (&lt; 100 bp) against reference genome</description>
8 <command>
9
10 #set $reference_fasta_filename = "localref.fa"
11 ## Name of the symlink that stands in for the reference when it is taken from the history.
12 #if str( $reference_source.reference_source_selector ) == "history":
13
14 ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &amp;&amp;
15
16 ## The following shell commands decide which of the BWA indexing algorithms (IS or BWTSW) will be run
17 ## depending on the size of the input FASTA dataset
18 ## The size is probed twice (Linux-style stat, then OSX-style stat); each probe that passes its status check runs its own bwa index call.
19 (
20 size=`stat -c %s "${reference_fasta_filename}" 2&gt;/dev/null`; ## Linux
21 if [ $? -eq 0 ];
22 then
23 if [ \$size -lt 2000000000 ];
24 then
25 bwa index -a is "${reference_fasta_filename}";
26 else
27 bwa index -a bwtsw "${reference_fasta_filename}";
28 fi;
29 fi;
30 ## NOTE(review): eval with empty output presumably still exits 0, so on Linux this second probe may index again with bwtsw - confirm.
31 eval \$(stat -s "${reference_fasta_filename}"); ## OSX
32 if [ $? -eq 0 ];
33 then
34 if [ \$st_size -lt 2000000000 ];
35 then
36 bwa index -a is "${reference_fasta_filename}";
37 echo "Generating BWA index with is algorithm";
38 else
39 bwa index -a bwtsw "${reference_fasta_filename}";
40 echo "Generating BWA index with bwtsw algorithm";
41 fi;
42 fi;
43 ) &amp;&amp;
44 ## Otherwise the reference is a cached one and the path field of the selected entry is used directly.
45 #else:
46 #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
47 #end if
48
49 ## Begin bwa command line
50
51 ####### Fastq paired
52 ## Each mate file is aligned on its own; sampe then combines the two .sai results with both fastq files.
53 #if str( $input_type.input_type_selector ) == "paired":
54
55 bwa aln
56 -t "\${GALAXY_SLOTS:-1}"
57
58 @command_options@
59
60 "${reference_fasta_filename}"
61 "${input_type.fastq_input1}"
62 > first.sai &amp;&amp;
63
64 bwa aln
65 -t "\${GALAXY_SLOTS:-1}"
66
67 @command_options@
68
69 "${reference_fasta_filename}"
70 "${input_type.fastq_input2}"
71 > second.sai &amp;&amp;
72
73 bwa sampe
74 ## The selector is a boolean declared with truevalue "set", so that is the string to test for.
75 #if str( $input_type.adv_pe_options.adv_pe_options_selector) == "set":
76
77 -a ${input_type.adv_pe_options.a}
78 -o ${input_type.adv_pe_options.o}
79 -n ${input_type.adv_pe_options.n}
80 -N ${input_type.adv_pe_options.N}
81
82 #end if
83
84 @read_group_options@
85
86 "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1}" "${input_type.fastq_input2}"
87
88 ####### Fastq single
89 ## One alignment pass over the single fastq dataset, followed by samse on the resulting .sai file.
90 #elif str( $input_type.input_type_selector ) == "single":
91
92 bwa aln
93 -t "\${GALAXY_SLOTS:-1}"
94
95 @command_options@
96
97 "${reference_fasta_filename}"
98 "${input_type.fastq_input1}"
99 > first.sai &amp;&amp;
100
101 bwa samse
102 ## The selector is a boolean declared with truevalue "set", so that is the string to test for.
103 #if str( $input_type.adv_se_options.adv_se_options_selector) == "set":
104
105 -n ${input_type.adv_se_options.n}
106
107 #end if
108
109 @read_group_options@
110
111 "${reference_fasta_filename}" first.sai "${input_type.fastq_input1}"
112
113 ####### BAM paired
114 ## The same BAM dataset is read twice: -b -1 takes the first read of each pair, -b -2 the second.
115 #elif str( $input_type.input_type_selector ) == "paired_bam":
116
117 bwa aln
118 -t "\${GALAXY_SLOTS:-1}"
119 -b
120 -1
121
122 @command_options@
123
124 "${reference_fasta_filename}"
125 "${input_type.bam_input}"
126 > first.sai &amp;&amp;
127
128 bwa aln
129 -t "\${GALAXY_SLOTS:-1}"
130 -b
131 -2
132 @command_options@
133 "${reference_fasta_filename}"
134 "${input_type.bam_input}"
135 > second.sai &amp;&amp;
136
137 bwa sampe
138 ## The selector is a boolean declared with truevalue "set", so that is the string to test for.
139 #if str( $input_type.adv_bam_pe_options.adv_pe_options_selector) == "set":
140
141 -a ${input_type.adv_bam_pe_options.a}
142 -o ${input_type.adv_bam_pe_options.o}
143 -n ${input_type.adv_bam_pe_options.n}
144 -N ${input_type.adv_bam_pe_options.N}
145
146 #end if
147
148 @read_group_options@
149
150 "${reference_fasta_filename}" first.sai second.sai "${input_type.bam_input}" "${input_type.bam_input}"
151
152 ####### BAM single
153 ## The BAM dataset is aligned with -b -0 (single-end reads only), followed by samse on the resulting .sai file.
154 #elif str( $input_type.input_type_selector ) == "single_bam":
155
156 bwa aln
157 -t "\${GALAXY_SLOTS:-1}"
158 -b
159 -0
160
161 @command_options@
162
163 "${reference_fasta_filename}"
164 "${input_type.bam_input}"
165 > first.sai &amp;&amp;
166
167 bwa samse
168 ## The selector is a boolean declared with truevalue "set", so that is the string to test for.
169 #if str( $input_type.adv_bam_se_options.adv_se_options_selector) == "set":
170
171 -n ${input_type.adv_bam_se_options.n}
172
173 #end if
174
175 @read_group_options@
176
177 "${reference_fasta_filename}" first.sai "${input_type.bam_input}"
178
179 #end if
180
181 | samtools view -Sb - > $bam_output
182
183 </command>
184
185 <macros>
186 <token name="@command_options@">
187 #if str( $analysis_type.analysis_type_selector ) == "illumina":
188
189 ## do nothing -> just align with default parameters
190
191 #elif str( $analysis_type.analysis_type_selector ) == "full":
192 ## Full mode: every aln option exposed in the form is passed explicitly.
193 -n ${analysis_type.n}
194 -o ${analysis_type.o}
195 -e ${analysis_type.e}
196 -i ${analysis_type.i}
197 -d ${analysis_type.d}
198 -l ${analysis_type.l}
199 -k ${analysis_type.k}
200 -m ${analysis_type.m}
201 -M ${analysis_type.M}
202 -O ${analysis_type.O}
203 -E ${analysis_type.E}
204 -R ${analysis_type.R}
205 -q ${analysis_type.q}
206 ## B and L are optional parameters; each flag is emitted only when a value was entered.
207 #if str( $analysis_type.B ):
208 -B ${analysis_type.B}
209 #end if
210
211 #if str( $analysis_type.L ):
212 -L ${analysis_type.L}
213 #end if
214
215 #elif str( $analysis_type.analysis_type_selector ) == "cline":
216 ${analysis_type.cline}
217 #end if
218 </token>
219 <token name="@read_group_options@">
220 ## The rg selector is a boolean declared with truevalue "set", so that is the string to test for.
221 #if str( $rg.rg_selector ) == "set":
222
223 -r "@RG\tID:$rg.ID\tSM:$rg.SM"
224
225 #end if
226 </token>
227
228 <xml name="advanced_pe_options">
229 <param name="adv_pe_options_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Set advanced paired end options?" help="Provides additional controls"/>
230 <when value="set">
231 <param name="a" type="integer" value="500" label="Maximum insert size for a read pair to be considered being mapped properly." help="sampe -a; This option is only used when there are not enough good alignment to infer the distribution of insert sizes; default=500"/>
232 <param name="o" type="integer" value="100000" label="Maximum occurrences of a read for pairing. A read with more occurrences will be treated as a single-end read." help="sampe -o; Reducing this parameter helps faster pairing; default=100000"/>
233 <param name="n" type="integer" value="3" label="Maximum number of alignments to output in the XA tag for reads paired properly." help="sampe -n; If a read has more than this many hits, the XA tag will not be written; default=3"/>
234 <param name="N" type="integer" value="10" label="Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons)." help="sampe -N; If a read has more than this many hits, the XA tag will not be written; default=10"/>
235 <param name="c" type="float" value="0.00005" label="Prior of chimeric rate (lower bound)" help="sampe -c"/>
236 </when>
237 <when value="do_not_set">
238 <!-- do nothing -->
239 </when>
240 </xml>
241 <xml name="advances_se_options">
242 <param name="adv_se_options_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Set advanced single end options?" help="Provides additional controls"/>
243 <when value="set">
244 <param name="n" type="integer" value="3" label="Maximum number of alignments to output in the XA tag." help="-n; If a read has more than this many hits, the XA tag will not be written; default=3"/>
245 </when>
246 <when value="do_not_set">
247 <!-- do nothing -->
248 </when>
249 </xml>
250 </macros>
251
252 <inputs>
253
254 <conditional name="reference_source">
255 <param name="reference_source_selector" type="select" label="Load reference genome from">
256 <option value="cached">Local cache</option>
257 <option value="history">History</option>
258 </param>
259 <when value="cached">
260 <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
261 <options from_data_table="bwa_mem_indexes">
262 <filter type="sort_by" column="2" />
263 <validator type="no_options" message="No indexes are available" />
264 </options>
265 <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
266 </param>
267 </when>
268 <when value="history">
269 <param name="ref_file" type="data" format="fasta" label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence to the history and use it as reference" />
270 </when>
271 </conditional>
272 <conditional name="input_type">
273 <param name="input_type_selector" type="select" label="Select input type" help="Select between fastq and bam datasets and between paired and single end data">
274 <option value="paired">Paired fastq</option>
275 <option value="single">Single fastq</option>
276 <option value="paired_bam">Paired BAM</option>
277 <option value="single_bam">Single BAM</option>
278 </param>
279 <when value="paired">
280 <param name="fastq_input1" type="data" format="fastqsanger" label="Select first set of reads" help="Specify dataset with forward reads"/>
281 <param name="fastq_input2" type="data" format="fastqsanger" label="Select second set of reads" help="Specify dataset with reverse reads"/>
282 <conditional name="adv_pe_options">
283
284 <expand macro="advanced_pe_options" />
285
286 </conditional>
287 </when>
288 <when value="single">
289 <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with single reads"/>
290 <conditional name="adv_se_options">
291
292 <expand macro="advances_se_options" />
293
294 </conditional>
295 </when>
296
297 <!-- the difference between single and paired bams is in the <command> tag portion and related to -0, -1, and -2 options -->
298
299 <when value="paired_bam">
300 <param name="bam_input" type="data" format="bam" label="Select BAM dataset" help="Specify BAM dataset with paired reads"/>
301 <conditional name="adv_bam_pe_options">
302
303 <expand macro="advanced_pe_options" />
304
305 </conditional>
306 </when>
307 <when value="single_bam">
308 <param name="bam_input" type="data" format="bam" label="Select BAM dataset" help="Specify BAM dataset with single reads"/>
309 <conditional name="adv_bam_se_options">
310
311 <expand macro="advances_se_options" />
312
313 </conditional>
314 </when>
315
316 </conditional>
317
318 <conditional name="rg">
319 <param name="rg_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Specify readgroup information?" help="Specifying readgroup information can greatly simplify your downstream analyses by allowing combining multiple datasets. See help below for more details"/>
320 <when value="set">
321 <param name="ID" type="text" value="readgroup1" size="20" label="Specify readgroup ID" help="This value must be unique among multiple samples in your experiment">
322 <sanitizer invalid_char="">
323 <valid initial="string.printable"/>
324 </sanitizer>
325 </param>
326 <param name="SM" type="text" value="blood" size="20" label="Specify readgroup sample name (SM)" help="This value should be descriptive">
327 <sanitizer invalid_char="">
328 <valid initial="string.printable"/>
329 </sanitizer>
330 </param>
331 </when>
332 <when value="do_not_set">
333 <!-- do nothing -->
334 </when>
335 </conditional>
336
337 <conditional name="analysis_type">
338 <param name="analysis_type_selector" type="select" label="Select analysis mode">
339 <option value="illumina">1.Simple Illumina mode</option>
340 <option value="full">2.Full list of options</option>
341 <option value="cline">3.Input parameters on the command line</option>
342 </param>
343 <when value="illumina">
344 <!-- do nothing -->
345 </when>
346 <when value="full">
347 <param name="n" type="text" value="0.04" label="maximum edit distance if the value is integer, or the fraction of missing alignments given 2% uniform base error rate if float. In the latter case, the maximum edit distance is automatically chosen for different read lengths." help="aln -n; default=0.04"/>
348 <param name="o" type="integer" value="1" label="maximum number of gap openings" help="aln -o; default=1"/>
349 <param name="e" type="integer" value="-1" label="maximum number of gap extensions" help="aln -e; -1 disables long gaps and invokes k-difference mode; default=-1"/>
350 <param name="i" type="integer" value="5" label="do not put an indel within this many bp towards the ends" help="aln -i; default=5"/>
351 <param name="d" type="integer" value="10" label="maximum occurrences for extending a long deletion" help="aln -d; default=10"/>
352 <param name="l" type="integer" value="32" label="seed length" help="aln -l; default=32"/>
353 <param name="k" type="integer" value="2" label="maximum differences in the seed" help="aln -k; default=2"/>
354 <param name="m" type="integer" value="2000000" label="maximum entries in the queue" help="aln -m; default=2000000"/>
355 <param name="M" type="integer" value="3" label="mismatch penalty" help="aln -M; default=3"/>
356 <param name="O" type="integer" value="11" label="gap open penalty" help="aln -O; default=11"/>
357 <param name="E" type="integer" value="4" label="gap extension penalty" help="aln -E; default=4"/>
358 <param name="R" type="integer" value="30" label="stop searching when there are more than this value of equally best hits" help="aln -R; default=30"/>
359 <param name="q" type="integer" value="0" label="quality threshold for read trimming down to 35bp" help="aln -q; default=0"/>
360 <param name="B" type="integer" optional="True" label="length of barcode" help="aln -B; optional parameter"/>
361 <param name="L" type="float" optional="True" label="log-scaled gap penalty for long deletions" help="aln -L; optional parameter"/>
362 </when>
363
364 <when value="cline">
365 <param name="cline" size="60" type="text" value="-n 0.04 -R 10" label="Type command line options here" help="All parameters that DO NOT involve filenames can be typed here.">
366 <sanitizer>
367 <valid initial="string.printable">
368 <remove value="&apos;"/>
369 </valid>
370 </sanitizer>
371 </param>
372 </when>
373 </conditional>
374 </inputs>
375
376 <outputs>
377 <data format="bam" name="bam_output" label="${tool.name} on ${on_string} (mapped reads in BAM format)"/>
378 </outputs>
379
380 <tests>
381 <test>
382 <param name="reference_source_selector" value="history" />
383 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
384 <param name="input_type_selector" value="paired"/>
385 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
386 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
387 <param name="analysis_type_selector" value="illumina"/>
388 <output name="bam_output" ftype="bam" file="bwa-aln-test1.bam" lines_diff="2" />
389 </test>
390 <test>
391 <param name="reference_source_selector" value="history" />
392 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
393 <param name="input_type_selector" value="paired_bam"/>
394 <param name="bam_input" ftype="bam" value="bwa-aln-bam-input.bam"/>
395 <param name="analysis_type_selector" value="illumina"/>
396 <output name="bam_output" ftype="bam" file="bwa-aln-test2.bam" lines_diff="2" />
397 </test>
398 </tests>
399 <stdio>
400 <exit_code range="1:" />
401 </stdio>
402 <help>
403
404 **What it does**
405
406 BWA is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome. The bwa-aln algorithm is designed for Illumina sequence reads up to 100bp. For longer reads use BWA-MEM algorithm distributed as separate Galaxy tool.
407
408 This Galaxy tool wraps bwa-aln, bwa-samse and -sampe modules of bwa read mapping tool:
409
410 - bwa aln - actual mapper placing reads onto the reference sequence
411 - bwa samse - post-processor converting suffix array coordinates into genome coordinates in SAM format for single reads
412 - bwa sampe - post-processor for paired reads
413
414 Galaxy implementation takes fastq or BAM (unaligned BAM) datasets as input and produces output in BAM (not SAM; in reality SAM produced by the bwa is converted to BAM on the fly by samtools view command) format, which can be further processed using various BAM utilities existing in Galaxy (BAMTools, SAMTools, Picard).
415
416 -----
417
418 **Galaxy-specific option**
419
420 Galaxy allows three levels of control over bwa-aln options provided by **Select analysis mode** menu option. These are:
421
422 1. *Simple Illumina mode*: The simplest possible bwa aln application, in which it aligns single or paired-end data to the reference using default parameters. It is equivalent to running bwa aln &lt;reference index&gt; &lt;fastq dataset&gt; on each input dataset with default options, followed by bwa samse (single reads) or bwa sampe (paired reads).
423 2. *Full list of options*: Allows access to all options through Galaxy interface.
424 3. *Input parameters on the command line*: Similar to the choice above but for those who do not like clicking. Here options can be directly typed into a text box.
425
426 ------
427
428 **bwa-aln options**
429
430 Each Galaxy parameter widget corresponds to command line flags listed below::
431
432 -n NUM max #diff (int) or missing prob under 0.02 err rate (float) [0.04]
433 -o INT maximum number or fraction of gap opens [1]
434 -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]
435 -i INT do not put an indel within INT bp towards the ends [5]
436 -d INT maximum occurrences for extending a long deletion [10]
437 -l INT seed length [32]
438 -k INT maximum differences in the seed [2]
439 -m INT maximum entries in the queue [2000000]
440 -M INT mismatch penalty [3]
441 -O INT gap open penalty [11]
442 -E INT gap extension penalty [4]
443 -R INT stop searching when there are >INT equally best hits [30]
444 -q INT quality threshold for read trimming down to 35bp [0]
445 -B INT length of barcode
446 -L log-scaled gap penalty for long deletions
447 -N non-iterative mode: search for all n-difference hits (slooow)
448 -I the input is in the Illumina 1.3+ FASTQ-like format
449 -b the input read file is in the BAM format
450 -0 use single-end reads only (effective with -b)
451 -1 use the 1st read in a pair (effective with -b)
452 -2 use the 2nd read in a pair (effective with -b)
453
454 **bwa-sampe options**::
455
456 -a INT maximum insert size [500]
457 -o INT maximum occurrences for one end [100000]
458 -n INT maximum hits to output for paired reads [3]
459 -N INT maximum hits to output for discordant pairs [10]
460 -c FLOAT prior of chimeric rate (lower bound) [1.0e-05]
461 -r STR read group header line [null]
462
463 **bwa-samse options**::
464
465 -n INT maximum hits to output for paired reads [3]
466 -r STR read group header line [null]
467
468 ------
469
470 .. class:: warningmark
471
472 **An important note on Read Groups**
473
474 One of the recommended best practices in NGS analysis is adding read group information to BAM files. You can do this directly in BWA interface using the
475 **Specify readgroup information?** widget. If you are not familiar with readgroups you should know that this is effectively a way to tag reads with an additional ID.
476 This allows you to combine BAM files from, for example, multiple BWA runs into a single dataset. This significantly simplifies downstream processing as
477 instead of dealing with multiple datasets you only have to handle one. This is possible because the readgroup information allows you to identify
478 data from different experiments even if they are combined in one file. Many downstream analysis tools such as variant callers (e.g., FreeBayes or Naive Variant Caller
479 present in Galaxy) are aware of readgroups and will automatically generate calls for each individual sample even if they are combined within a single file.
480
481 -----
482
483 .. class:: infomark
484
485 **More info**
486
487 To obtain more information about BWA and ask questions use these resources:
488
489 1. https://biostar.usegalaxy.org/
490 2. https://www.biostars.org/
491 3. https://github.com/lh3/bwa
492 4. http://bio-bwa.sourceforge.net/
493
494
495 </help>
496 <citations>
497 <citation type="doi">10.1093/bioinformatics/btp324</citation>
498 <citation type="doi">10.1093/bioinformatics/btp698</citation>
499 </citations>
500 </tool>