comparison bwa-mem.xml @ 0:5e72d136a39e draft

Uploaded
author devteam
date Mon, 29 Sep 2014 16:22:24 -0400
parents
children 86c73f0eb389
comparison
equal deleted inserted replaced
-1:000000000000 0:5e72d136a39e
1 <?xml version="1.0"?>
2 <tool id="bwa_mem_0_7_10" name="BWA-MEM" version="bwa-0.7.10-r837-dirty_galaxy_0.1">
3 <requirements>
4 <requirement type="package" version="0.7.10.039ea20639">bwa</requirement>
5 <requirement type="package" version="1.1">samtools</requirement>
6 </requirements>
7 <description>- map medium and long reads (&gt; 100 bp) against reference genome</description>
8 <command>
9
10 #set $reference_fasta_filename = "localref.fa"
11
12 #if str( $reference_source.reference_source_selector ) == "history":
13
14 ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &amp;&amp;
15
16 ## The following shell commands decide which of the BWA indexing algorithms (IS or BWTSW) will be run
17 ## depending on the size of the input FASTA dataset. stat -L follows the symlink created above so the FASTA itself is measured.
18 ## Exactly one of the two stat dialects below is expected to produce a size; the second branch is skipped when the first succeeded.
19 (
20 size=`stat -L -c %s "${reference_fasta_filename}" 2&gt;/dev/null`; ## Linux (GNU stat)
21 if [ $? -eq 0 ];
22 then
23 if [ "\$size" -lt 2000000000 ];
24 then
25 bwa index -a is "${reference_fasta_filename}";
26 echo "Generating BWA index with is algorithm";
27 else
28 bwa index -a bwtsw "${reference_fasta_filename}";
29 echo "Generating BWA index with bwtsw algorithm";
30 fi;
31 fi;
32
33 eval \$(stat -L -s "${reference_fasta_filename}" 2&gt;/dev/null); ## OSX (BSD stat), sets st_size
34 if [ -z "\$size" ] &amp;&amp; [ -n "\$st_size" ]; ## eval of an empty substitution returns 0, so test the variables instead of the exit status
35 then
36 if [ "\$st_size" -lt 2000000000 ];
37 then
38 bwa index -a is "${reference_fasta_filename}";
39 echo "Generating BWA index with is algorithm";
40 else
41 bwa index -a bwtsw "${reference_fasta_filename}";
42 echo "Generating BWA index with bwtsw algorithm";
43 fi;
44 fi;
45 ) &amp;&amp;
46
47 #else:
48 #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
49 #end if
50
51 ## Begin BWA-MEM command line
52
53 bwa mem
54 -t "\${GALAXY_SLOTS:-1}"
55 -v 1 ## Verbosity is set to 1 (errors only)
56
57 #if str( $fastq_input.fastq_input_selector ) == "paired_iv": ## For interleaved fastq files set -p option
58 -p
59 #if str( $fastq_input.iv_stats.iv_stats_selector ) == "set": ## insert statistics requested; the boolean renders as its truevalue "set"
60 -I "${fastq_input.iv_stats.iset_stats}"
61 #end if
62 #end if
63
64 #if str( $analysis_type.analysis_type_selector ) == "pacbio":
65 -x pacbio
66 ## -x requires the read-type preset name as its argument; a bare -x would swallow the next token
67 #elif str( $analysis_type.analysis_type_selector ) == "full":
68 ## The three selectors below are booleans declared with truevalue="set", so they are compared against "set"
69 #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "set": ## Algorithmic options
70
71 -k "${analysis_type.algorithmic_options.k}"
72 -w "${analysis_type.algorithmic_options.w}"
73 -d "${analysis_type.algorithmic_options.d}"
74 -r "${analysis_type.algorithmic_options.r}"
75 -y "${analysis_type.algorithmic_options.y}"
76 -c "${analysis_type.algorithmic_options.c}"
77 -D "${analysis_type.algorithmic_options.D}"
78 -W "${analysis_type.algorithmic_options.W}"
79 -m "${analysis_type.algorithmic_options.m}"
80 ${analysis_type.algorithmic_options.S}
81 ${analysis_type.algorithmic_options.P}
82 ${analysis_type.algorithmic_options.e}
83
84 #end if
85
86 #if str( $analysis_type.scoring_options.scoring_options_selector ) == "set": ## Scoring options
87
88 -A "${analysis_type.scoring_options.A}"
89 -B "${analysis_type.scoring_options.B}"
90 -O "${analysis_type.scoring_options.O}"
91 -E "${analysis_type.scoring_options.E}"
92 -L "${analysis_type.scoring_options.L}"
93 -U "${analysis_type.scoring_options.U}"
94
95 #end if
96
97 #if str( $analysis_type.io_options.io_options_selector ) == "set": ## IO options
98
99 -T "${analysis_type.io_options.T}"
100 -h "${analysis_type.io_options.h}"
101 ${analysis_type.io_options.a}
102 ${analysis_type.io_options.C}
103 ${analysis_type.io_options.V}
104 ${analysis_type.io_options.Y}
105 ${analysis_type.io_options.M}
106
107 #end if
108
109 #elif str( $analysis_type.analysis_type_selector ) == "cline":
110 ## Free-form options typed by the user; inserted unquoted so that several flags can be given
111 ${analysis_type.cline}
112
113 #end if
114
115 #if str( $rg.rg_selector ) == "set": ## rg_selector is a boolean whose truevalue is "set"; add the @RG header line
116 -R "@RG\tID:${rg.ID}\tSM:${rg.SM}"
117 #end if
118
119 #if str( $fastq_input.fastq_input_selector ) == "paired":
120
121 #if str( $fastq_input.paired_stats.paired_stats_selector ) == "set": ## insert statistics requested; the boolean renders as its truevalue "set"
122 -I "${fastq_input.paired_stats.iset_stats}"
123 #end if
124 ## Positional arguments: reference first, then forward and reverse reads
125 "${reference_fasta_filename}"
126
127 "${fastq_input.fastq_input1}" "${fastq_input.fastq_input2}"
128
129 #else:
130 ## Single-end and interleaved inputs both pass one fastq dataset
131 "${reference_fasta_filename}"
132
133 "${fastq_input.fastq_input1}"
134
135 #end if
136
137 | samtools view -Sb - > $bam_output
138
139 </command>
140
141 <inputs>
142
143 <conditional name="reference_source">
144 <param name="reference_source_selector" type="select" label="Load reference genome from">
145 <option value="cached">Local cache</option>
146 <option value="history">History</option>
147 </param>
148 <when value="cached">
149 <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
150 <options from_data_table="bwa_mem_indexes">
151 <filter type="sort_by" column="2" />
152 <validator type="no_options" message="No indexes are available" />
153 </options>
154 <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
155 </param>
156 </when>
157 <when value="history"> <!-- reference FASTA taken from the user's history -->
158 <param name="ref_file" type="data" format="fasta" label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence to the history and use it as reference" />
159 </when>
160 </conditional>
161 <conditional name="fastq_input"> <!-- read layout: paired (two datasets), single, or paired interleaved (one dataset) -->
162 <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
163 <option value="paired">Paired</option>
164 <option value="single">Single</option>
165 <option value="paired_iv">Paired Interleaved</option>
166 </param>
167 <when value="paired">
168 <param name="fastq_input1" type="data" format="fastqsanger" label="Select first set of reads" help="Specify dataset with forward reads"/>
169 <param name="fastq_input2" type="data" format="fastqsanger" label="Select second set of reads" help="Specify dataset with reverse reads"/>
170
171 <!-- PE stat selection block 1: If you make any changes in this conditional block, copy them to PE stat selection block 2 below as well -->
172
173 <conditional name="paired_stats">
174 <param name="paired_stats_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Specify insert size statistics?" help="-I; if you choose to not specify, it will be inferred from the data"/>
175 <when value="set">
176 <!-- value order follows bwa mem -I: mean,sd,max,min; "." is allowed because mean and sd are FLOAT -->
177 <param name="iset_stats" type="text" value="250" size="10" label="Enter mean, standard deviation, max, and min for insert lengths in the form mean,sd,max,min" help="-I; only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
178 <sanitizer invalid_char="">
179 <valid initial="string.digits"><add value=","/><add value="."/> </valid>
180 </sanitizer>
181 </param>
182
183 </when>
184 <when value="do_not_set">
185 <!-- do nothing -->
186 </when>
187 </conditional>
188
189 <!-- end of PE stat selection block 1 -->
190
191 </when>
192 <when value="single">
193 <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with single reads"/>
194 </when>
195 <when value="paired_iv">
196 <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
197
198 <!-- PE stat selection block 2: If you make any changes in this conditional block, copy them to PE stat selection block 1 above as well -->
199
200 <conditional name="iv_stats">
201 <param name="iv_stats_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Specify insert size statistics?" help="-I; if you choose to not specify, it will be inferred from the data"/>
202 <when value="set">
203 <!-- value order follows bwa mem -I: mean,sd,max,min; "." is allowed because mean and sd are FLOAT -->
204 <param name="iset_stats" type="text" value="250" size="10" label="Enter mean, standard deviation, max, and min for insert lengths in the form mean,sd,max,min" help="-I; only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
205 <sanitizer invalid_char="">
206 <valid initial="string.digits"><add value=","/><add value="."/> </valid>
207 </sanitizer>
208 </param>
209
210 </when>
211 <when value="do_not_set">
212 <!-- do nothing -->
213 </when>
214 </conditional>
215
216 <!-- end of PE stat selection block 2 -->
217
218 </when>
219 </conditional>
220
221 <conditional name="rg"> <!-- optional read group (ID and SM) passed to bwa mem through -R -->
222 <param name="rg_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Specify readgroup information?" help="Specifying readgroup information can greatly simplify your downstream analyses by allowing combining multiple datasets. See help below for more details"/>
223 <when value="set">
224 <param name="ID" type="text" value="readgroup1" size="20" label="Specify readgroup ID" help="This value must be unique among multiple samples in your experiment">
225 <sanitizer invalid_char=""> <!-- the value lands inside a double-quoted shell argument, so characters that stay active there are dropped -->
226 <valid initial="string.printable"><remove value="&quot;"/><remove value="$"/><remove value="`"/><remove value="\"/></valid>
227 </sanitizer>
228 </param>
229 <param name="SM" type="text" value="blood" size="20" label="Specify readgroup sample name (SM)" help="This value should be descriptive">
230 <sanitizer invalid_char=""> <!-- same restriction as for ID above -->
231 <valid initial="string.printable"><remove value="&quot;"/><remove value="$"/><remove value="`"/><remove value="\"/></valid>
232 </sanitizer>
233 </param>
234 </when>
235 <when value="do_not_set">
236 <!-- do nothing -->
237 </when>
238 </conditional>
239
240 <conditional name="analysis_type">
241 <param name="analysis_type_selector" type="select" label="Select analysis mode">
242 <option value="illumina">1.Simple Illumina mode</option>
243 <option value="pacbio">2.PacBio mode</option>
244 <option value="full">3.Full list of options</option>
245 <option value="cline">4.Input parameters on the command line</option>
246 </param>
247 <when value="illumina">
248 <!-- do nothing -->
249 </when>
250 <when value="pacbio">
251 <!-- do nothing. all magic happens within <command> tag -->
252 </when>
253 <when value="full">
254 <conditional name="algorithmic_options"> <!-- optional algorithmic flags; each param's help attribute names the bwa mem flag it maps to -->
255 <param name="algorithmic_options_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Set algorithmic options?" help="Sets -k, -w, -d, -r, -y, -c, -D, -W, -m, -S, -P, and -e options." />
256 <when value="set"> <!-- numeric params carry the defaults quoted in their help text; the three booleans render as the flag itself or as an empty string -->
257 <param name="k" type="integer" value="19" label="minimum seed length" help="-k; default=19"/>
258 <param name="w" type="integer" value="100" label="band width for banded alignment" help="-w; default=100"/>
259 <param name="d" type="integer" value="100" label="off-diagonal X-dropoff" help="-d; default=100"/>
260 <param name="r" type="float" value="1.5" label="look for internal seeds inside a seed longer than -k * THIS VALUE" help="-r; default=1.5"/>
261 <param name="y" type="integer" value="0" label="find maximum exact matches (MEMs) longer than -k * -r with size less than THIS VALUE" help="-y; default=0"/>
262 <param name="c" type="integer" value="500" label="skip seeds with more than that many occurrences" help="-c; default=500"/>
263 <param name="D" type="float" value="0.5" label="drop chains shorter than this fraction of the longest overlapping chain" help="-D; default=0.5"/>
264 <param name="W" type="integer" value="0" label="discard a chain if seeded bases shorter than" help="-W; default=0"/>
265 <param name="m" type="integer" value="50" label="perform at most this many rounds of mate rescues for each read" help="-m; default=50"/>
266 <param name="S" type="boolean" truevalue="-S" falsevalue="" label="skip mate rescue" help="-S"/>
267 <param name="P" type="boolean" truevalue="-P" falsevalue="" label="skip pairing; mate rescue performed unless -S also in use" help="-P"/>
268 <param name="e" type="boolean" truevalue="-e" falsevalue="" label="discard full-length exact matches" help="-e"/>
269 </when>
270 <when value="do_not_set">
271 <!-- do nothing -->
272 </when>
273 </conditional>
274 <conditional name="scoring_options"> <!-- optional scoring flags; each param's help attribute names the bwa mem flag it maps to -->
275 <param name="scoring_options_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Set scoring options?" help="Sets -A, -B, -O, -E, -L, and -U options." />
276 <when value="set"> <!-- O, E and L are text params because they take comma-separated pairs; their sanitizers keep only digits and "," -->
277 <param name="A" type="integer" value="1" label="score for a sequence match" help="-A; scales options -T, -d, -B, -O, -E, -L, and -U; default=1"/>
278 <param name="B" type="integer" value="4" label="penalty for mismatch" help="-B; default=4"/>
279 <param name="O" type="text" value="6,6" label="gap open penalty for deletions and insertions" help="-O; default=6,6">
280 <sanitizer invalid_char="">
281 <valid initial="string.digits"><add value=","/> </valid>
282 </sanitizer>
283 </param>
284 <param name="E" type="text" value="1,1" label="gap extension penalty; a gap of size k cost &#39;-O + -E*k&#39; " help="-E; default=1,1">
285 <sanitizer invalid_char="">
286 <valid initial="string.digits"><add value=","/> </valid>
287 </sanitizer>
288 </param>
289 <param name="L" type="text" value="5,5" label="penalty for 5&#39;-end and 3&#39;-end clipping" help="-L; default=5,5">
290 <sanitizer invalid_char="">
291 <valid initial="string.digits"><add value=","/> </valid>
292 </sanitizer>
293 </param>
294 <param name="U" type="integer" value="17" label="penalty for an unpaired read pair" help="-U; default=17"/>
295 </when>
296 <when value="do_not_set">
297 <!-- do nothing -->
298 </when>
299 </conditional>
300 <conditional name="io_options"> <!-- optional input/output flags; each param's help attribute names the bwa mem flag it maps to -->
301 <param name="io_options_selector" type="boolean" truevalue="set" falsevalue="do_not_set" label="Set input/output options" help="Sets -T, -h, -a, -C, -V, -Y, and -M options." />
302 <when value="set"> <!-- the boolean params render as the flag itself or as an empty string -->
303 <param name="T" type="integer" value="30" label="minimum score to output" help="-T; default=30"/>
304 <param name="h" type="integer" value="5" label="if there are this many hits with score >80% of the max score, output all in XA tag" help="-h; default=5"/>
305 <param name="a" type="boolean" truevalue="-a" falsevalue="" label="output all alignments for single-ends or unpaired paired-ends" help="-a"/>
306 <param name="C" type="boolean" truevalue="-C" falsevalue="" label="append FASTA/FASTQ comment to BAM output" help="-C"/>
307 <param name="V" type="boolean" truevalue="-V" falsevalue="" label="output the reference FASTA header in the XR tag" help="-V"/>
308 <param name="Y" type="boolean" truevalue="-Y" falsevalue="" label="use soft clipping for supplementary alignments" help="-Y"/>
309 <param name="M" type="boolean" truevalue="-M" falsevalue="" label="mark shorter split hits as secondary" help="-M"/>
310 </when>
311 <when value="do_not_set">
312 <!-- do nothing -->
313 </when>
314 </conditional>
315 </when>
316 <when value="cline"> <!-- free-form bwa mem options typed by the user -->
317 <param name="cline" size="60" type="text" value="-T 30 -c 250" label="Type command line options here" help="All parameters that DO NOT involve filenames can be typed here.">
318 <sanitizer> <!-- the value is inserted unquoted into the shell command, so quotes and shell control characters are removed -->
319 <valid initial="string.printable">
320 <remove value="&apos;"/><remove value="&quot;"/><remove value=";"/><remove value="|"/><remove value="&amp;"/><remove value="$"/><remove value="`"/><remove value="\"/><remove value="("/><remove value=")"/><remove value="&lt;"/><remove value="&gt;"/><remove value="&#10;"/><remove value="&#13;"/>
321 </valid>
322 </sanitizer>
323 </param>
324 </when>
325 </conditional>
326 </inputs>
327
328 <outputs> <!-- single BAM dataset; the command template redirects the output of samtools view into $bam_output -->
329 <data format="bam" name="bam_output" label="${tool.name} on ${on_string} (mapped reads in BAM format)"/>
330 </outputs>
331
332 <tests> <!-- one functional test: history FASTA reference, paired fastqsanger reads, simple Illumina mode -->
333 <test>
334 <param name="reference_source_selector" value="history" />
335 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
336 <param name="fastq_input_selector" value="paired"/>
337 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
338 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
339 <param name="analysis_type_selector" value="illumina"/>
340 <output name="bam_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="2" /> <!-- compared with the stored BAM, lines_diff set to 2 -->
341 </test>
342 </tests>
343 <stdio> <!-- NOTE(review): presumably makes Galaxy judge job failure by exit code (range "1:") rather than by stderr output; confirm against the Galaxy stdio documentation -->
344 <exit_code range="1:" />
345 </stdio>
346 <help>
347
348 **What it does**
349
350 From http://arxiv.org/abs/1303.3997:
351
352 BWA-MEM is a new alignment algorithm for aligning sequence reads or long query sequences against a large reference genome such as human.
353 It automatically chooses between local and end-to-end alignments, supports paired-end reads and performs chimeric alignment.
354 The algorithm is robust to sequencing errors and applicable to a wide range of sequence lengths from 70bp to a few megabases.
355 For mapping 100bp sequences, BWA-MEM shows better performance than several state-of-art read aligners to date.
356
357 It is best suited for mapping long (>70 nt) reads against large reference genomes.
358
359 This Galaxy tool wraps the bwa-mem module of the bwa read mapping tool. The Galaxy implementation takes fastq files as input and produces output in BAM (not SAM) format, which can be further processed using various BAM utilities existing in Galaxy (BAMTools, SAMTools, Picard).
360
361 -----
362
363 **Galaxy-specific option**
364
365 Galaxy allows four levels of control over bwa-mem options provided by **Select analysis mode** menu option. These are:
366
367 1. *Simple Illumina mode*: The simplest possible bwa mem application in which it aligns single or paired-end data to a reference using default parameters. It is equivalent to the following command: bwa mem &lt;reference index&gt; &lt;fastq dataset1&gt; [fastq dataset2]
368 2. *PacBio mode*: The mode adjusted specifically for mapping of long PacBio subreads. Equivalent to the following command: bwa mem -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 &lt;reference index&gt; &lt;PacBio dataset in fastq format&gt;
369 3. *Full list of options*: Allows access to all options through Galaxy interface.
370 4. *Input parameters on the command line*: Similar to the choice above but for those who do not like clicking. Here options can be directly typed into a text box.
371
372 ------
373
374 **BWA MEM options**
375
376 Each Galaxy parameter widget corresponds to command line flags listed below:
377
378 Algorithm options::
379
380 -k INT minimum seed length [19]
381 -w INT band width for banded alignment [100]
382 -d INT off-diagonal X-dropoff [100]
383 -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [1.5]
384 -y INT find MEMs longer than {-k} * {-r} with size less than INT [0]
385 -c INT skip seeds with more than INT occurrences [500]
386 -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [0.50]
387 -W INT discard a chain if seeded bases shorter than INT [0]
388 -m INT perform at most INT rounds of mate rescues for each read [50]
389 -S skip mate rescue
390 -P skip pairing; mate rescue performed unless -S also in use
391 -e discard full-length exact matches
392
393 Scoring options::
394
395 -A INT score for a sequence match, which scales options -TdBOELU unless overridden [1]
396 -B INT penalty for a mismatch [4]
397 -O INT[,INT] gap open penalties for deletions and insertions [6,6]
398 -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [1,1]
399 -L INT[,INT] penalty for 5'- and 3'-end clipping [5,5]
400 -U INT penalty for an unpaired read pair [17]
401
402 Input/output options::
403
404 -p first query file consists of interleaved paired-end sequences
405 -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]
406
407 -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [3]
408 -T INT minimum score to output [30]
409 -h INT if there are &lt;INT hits with score &gt;80% of the max score, output all in XA [5]
410 -a output all alignments for SE or unpaired PE
411 -C append FASTA/FASTQ comment to SAM output
412 -V output the reference FASTA header in the XR tag
413 -Y use soft clipping for supplementary alignments
414 -M mark shorter split hits as secondary
415
416 -I FLOAT[,FLOAT[,INT[,INT]]]
417 specify the mean, standard deviation (10% of the mean if absent), max
418 (4 sigma from the mean if absent) and min of the insert size distribution.
419 FR orientation only. [inferred]
420
421 ------
422
423 .. class:: warningmark
424
425 **An important note on Read Groups**
426
427 One of the recommended best practices in NGS analysis is adding read group information to BAM files. You can do this directly in the BWA MEM interface using the
428 **Specify readgroup information?** widget. If you are not familiar with readgroups you should know that this is effectively a way to tag reads with an additional ID.
429 This allows you to combine BAM files from, for example, multiple BWA MEM runs into a single dataset. This significantly simplifies downstream processing as
430 instead of dealing with multiple datasets you have to handle only one. This is possible because the readgroup information allows you to identify
431 data from different experiments even if they are combined in one file. Many downstream analysis tools such as variant callers (e.g., FreeBayes or Naive Variant Caller
432 present in Galaxy) are aware of readgroups and will automatically generate calls for each individual sample even if they are combined within a single file.
433
434 -----
435
436 .. class:: infomark
437
438 **More info**
439
440 To obtain more information about BWA MEM and ask questions use these resources:
441
442 1. https://biostar.usegalaxy.org/
443 2. https://www.biostars.org/
444 3. https://github.com/lh3/bwa
445 4. http://bio-bwa.sourceforge.net/
446
447
448 </help>
449 <citations>
450 <citation type="doi">10.1093/bioinformatics/btp324</citation>
451 <citation type="doi">10.1093/bioinformatics/btp698</citation>
452 <citation type="bibtex">@misc{1303.3997,
453 Author = {Heng Li},
454 Title = {Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM},
455 Year = {2013},
456 Eprint = {arXiv:1303.3997},
457 url = {http://arxiv.org/abs/1303.3997},
458 }</citation>
459 </citations>
460 </tool>