0
|
1 <tool id="bwa_wrapper_stacks" name="Map with BWA for STACKS" version="1.2.3">
|
|
<!-- Short description of the tool. A tool carries a single <description> element; do not add a second one. -->
2 <description>from zip file with fastqsanger files</description>
|
|
<!-- External package required by the wrapper: bwa, version 0.6.2. -->
3 <requirements>
|
|
4 <requirement type="package" version="0.6.2">bwa</requirement>
|
|
5 </requirements>
|
|
<!-- Job parallelism declaration (method "basic"). -->
7 <parallelism method="basic"></parallelism>
|
|
<!-- Command line for bwa_wrapper.py, written as a Cheetah template: "##" lines are template comments, "#if/#else/#end if" select options from the values declared in <inputs>. -->
8 <command interpreter="python">
|
|
9 bwa_wrapper.py
|
|
## use the slot count allocated to the job; falls back to 4 threads when GALAXY_SLOTS is unset
10 --threads="\${GALAXY_SLOTS:-4}"
|
|
11
|
|
## NOTE(review): input1 is declared inside the "paired" conditional with format="zip"; confirm this test can ever match
12 #if $input1.ext == "fastqillumina":
|
|
13 --illumina1.3
|
|
14 #end if
|
|
15
|
|
16 ## reference source
|
|
17 --fileSource="${genomeSource.refGenomeSource}"
|
|
18 #if $genomeSource.refGenomeSource == "history":
|
|
19 ##build index on the fly
|
|
20 --ref="${genomeSource.ownFile}"
|
|
21 --dbkey="${dbkey}"
|
|
22 #else:
|
|
23 ##use precomputed indexes
|
|
24 --ref="${genomeSource.indices.fields.path}"
|
|
25 --do_not_build_index
|
|
26 #end if
|
|
27
|
|
28 ## input file(s)
|
|
29 --input1="${paired.input1}"
|
|
30
|
|
31 ## output file
|
|
32 --output="${output}"
|
|
33
|
|
34 ## run parameters
|
|
35 --params="${params.source_select}"
|
|
## the detailed options below are only passed when the full parameter list was selected
36 #if $params.source_select != "pre_set":
|
|
37 --maxEditDist="${params.maxEditDist}"
|
|
38 --fracMissingAligns="${params.fracMissingAligns}"
|
|
39 --maxGapOpens="${params.maxGapOpens}"
|
|
40 --maxGapExtens="${params.maxGapExtens}"
|
|
41 --disallowLongDel="${params.disallowLongDel}"
|
|
42 --disallowIndel="${params.disallowIndel}"
|
|
43 --seed="${params.seed}"
|
|
44 --maxEditDistSeed="${params.maxEditDistSeed}"
|
|
45 --mismatchPenalty="${params.mismatchPenalty}"
|
|
46 --gapOpenPenalty="${params.gapOpenPenalty}"
|
|
47 --gapExtensPenalty="${params.gapExtensPenalty}"
|
|
48 --suboptAlign="${params.suboptAlign}"
|
|
49 --noIterSearch="${params.noIterSearch}"
|
|
50 --outputTopN="${params.outputTopN}"
|
|
51 --outputTopNDisc="${params.outputTopNDisc}"
|
|
52 --maxInsertSize="${params.maxInsertSize}"
|
|
53 --maxOccurPairing="${params.maxOccurPairing}"
|
|
## read group fields are only passed when the user chose to specify a read group
54 #if $params.readGroup.specReadGroup == "yes":
|
|
55 --rgid="${params.readGroup.rgid}"
|
|
56 --rgcn="${params.readGroup.rgcn}"
|
|
57 --rgds="${params.readGroup.rgds}"
|
|
58 --rgdt="${params.readGroup.rgdt}"
|
|
59 --rgfo="${params.readGroup.rgfo}"
|
|
60 --rgks="${params.readGroup.rgks}"
|
|
61 --rglb="${params.readGroup.rglb}"
|
|
62 --rgpg="${params.readGroup.rgpg}"
|
|
63 --rgpi="${params.readGroup.rgpi}"
|
|
64 --rgpl="${params.readGroup.rgpl}"
|
|
65 --rgpu="${params.readGroup.rgpu}"
|
|
66 --rgsm="${params.readGroup.rgsm}"
|
|
67 #end if
|
|
68 #end if
|
|
69
|
|
70 ## suppress output SAM header
|
|
71 --suppressHeader="${suppressHeader}"
|
|
72 </command>
|
|
73 <inputs>
|
|
<!-- Reference genome selection. "indexed" offers the entries of the bwa_indexes data table (sorted by column 2; a validator reports when no index is available). "history" takes a FASTA dataset from the history. -->
74 <conditional name="genomeSource">
|
|
75 <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
|
|
76 <option value="indexed">Use a built-in index</option>
|
|
77 <option value="history">Use one from the history</option>
|
|
78 </param>
|
|
79 <when value="indexed">
|
|
80 <param name="indices" type="select" label="Select a reference genome">
|
|
81 <options from_data_table="bwa_indexes">
|
|
82 <filter type="sort_by" column="2" />
|
|
83 <validator type="no_options" message="No indexes are available" />
|
|
84 </options>
|
|
85 </param>
|
|
86 </when>
|
|
87 <when value="history">
|
|
88 <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
|
|
89 </when>
|
|
90 </conditional>
|
|
<!-- Library type. Only the "single" (single-end) option is offered; it exposes input1, a zip dataset containing the FASTQ files. -->
91 <conditional name="paired">
|
|
92 <param name="sPaired" type="select" label="Is this library mate-paired?">
|
|
93 <option value="single">Single-end</option>
|
|
94 </param>
|
|
95 <when value="single">
|
|
96 <param name="input1" type="data" format="zip" label="Zip file" help="Zip file with several FASTQ with either Sanger-scaled quality values (fastqsanger) or Illumina-scaled quality values (fastqillumina)" />
|
|
97 </when>
|
|
98 </conditional>
|
|
<!-- BWA settings. "pre_set" declares no additional parameters; "full" exposes the individual aln / samse / sampe options (the corresponding BWA flag is given in each label) plus an optional read group. -->
99 <conditional name="params">
|
|
100 <param name="source_select" type="select" label="BWA settings to use" help="For most mapping needs use Commonly Used settings. If you want full control use Full Parameter List">
|
|
101 <option value="pre_set">Commonly Used</option>
|
|
102 <option value="full">Full Parameter List</option>
|
|
103 </param>
|
|
104 <when value="pre_set" />
|
|
105 <when value="full">
|
|
106 <param name="maxEditDist" type="integer" value="0" label="Maximum edit distance (aln -n)" help="Enter this value OR a fraction of missing alignments, not both" />
|
|
107 <param name="fracMissingAligns" type="float" value="0.04" label="Fraction of missing alignments given 2% uniform base error rate (aln -n)" help="Enter this value OR maximum edit distance, not both" />
|
|
108 <param name="maxGapOpens" type="integer" value="1" label="Maximum number of gap opens (aln -o)" />
|
|
109 <param name="maxGapExtens" type="integer" value="-1" label="Maximum number of gap extensions (aln -e)" help="-1 for k-difference mode (disallowing long gaps)" />
|
|
110 <param name="disallowLongDel" type="integer" value="16" label="Disallow long deletion within [value] bp towards the 3'-end (aln -d)" />
|
|
111 <param name="disallowIndel" type="integer" value="5" label="Disallow insertion/deletion within [value] bp towards the end (aln -i)" />
|
|
112 <param name="seed" type="integer" value="-1" label="Number of first subsequences to take as seed (aln -l)" help="Enter -1 for infinity" />
|
|
113 <param name="maxEditDistSeed" type="integer" value="2" label="Maximum edit distance in the seed (aln -k)" />
|
|
114 <param name="mismatchPenalty" type="integer" value="3" label="Mismatch penalty (aln -M)" help="BWA will not search for suboptimal hits with a score lower than [value]" />
|
|
115 <param name="gapOpenPenalty" type="integer" value="11" label="Gap open penalty (aln -O)" />
|
|
116 <param name="gapExtensPenalty" type="integer" value="4" label="Gap extension penalty (aln -E)" />
|
|
117 <param name="suboptAlign" type="integer" optional="True" label="Proceed with suboptimal alignments if there are no more than INT equally best hits. (aln -R)" help="For paired-end reads only. By default, BWA only searches for suboptimal alignments if the top hit is unique. Using this option has no effect on accuracy for single-end reads. It is mainly designed for improving the alignment accuracy of paired-end reads. However, the pairing procedure will be slowed down, especially for very short reads (~32bp)" />
|
|
118 <param name="noIterSearch" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Disable iterative search (aln -N)" help="All hits with no more than maxDiff differences will be found. This mode is much slower than the default" />
|
|
119 <param name="outputTopN" type="integer" value="3" label="Maximum number of alignments to output in the XA tag for reads paired properly (samse/sampe -n)" help="If a read has more than INT hits, the XA tag will not be written" />
|
|
120 <param name="outputTopNDisc" type="integer" value="10" label="Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) (sampe -N)" help="For paired-end reads only. If a read has more than INT hits, the XA tag will not be written" />
|
|
121 <param name="maxInsertSize" type="integer" value="500" label="Maximum insert size for a read pair to be considered as being mapped properly (sampe -a)" help="For paired-end reads only. Only used when there are not enough good alignments to infer the distribution of insert sizes" />
|
|
122 <param name="maxOccurPairing" type="integer" value="100000" label="Maximum occurrences of a read for pairing (sampe -o)" help="For paired-end reads only. A read with more occurrences will be treated as a single-end read. Reducing this parameter helps faster pairing" />
|
|
<!-- Optional read group (samse/sampe -r). The text fields below are only declared for the "yes" case; "no" is the selected default. -->
123 <conditional name="readGroup">
|
|
124 <param name="specReadGroup" type="select" label="Specify the read group for this file? (samse/sampe -r)">
|
|
125 <option value="yes">Yes</option>
|
|
126 <option value="no" selected="True">No</option>
|
|
127 </param>
|
|
128 <when value="yes">
|
|
129 <param name="rgid" type="text" size="25" label="Read group identifier (ID). Each @RG line must have a unique ID. The value of ID is used in the RG
|
|
130 tags of alignment records. Must be unique among all read groups in header section." help="Required if RG specified. Read group
|
|
131 IDs may be modified when merging SAM files in order to handle collisions." />
|
|
132 <param name="rgcn" type="text" size="25" label="Sequencing center that produced the read (CN)" help="Optional" />
|
|
133 <param name="rgds" type="text" size="25" label="Description (DS)" help="Optional" />
|
|
134 <param name="rgdt" type="text" size="25" label="Date that run was produced (DT)" help="Optional. ISO8601 format date or date/time, like YYYY-MM-DD" />
|
|
135 <param name="rgfo" type="text" size="25" label="Flow order (FO). The array of nucleotide bases that correspond to the nucleotides used for each
|
|
136 flow of each read." help="Optional. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by
|
|
137 various other characters. Format : /\*|[ACMGRSVTWYHKDBN]+/" />
|
|
138 <param name="rgks" type="text" size="25" label="The array of nucleotide bases that correspond to the key sequence of each read (KS)" help="Optional" />
|
|
139 <param name="rglb" type="text" size="25" label="Library name (LB)" help="Required if RG specified" />
|
|
140 <param name="rgpg" type="text" size="25" label="Programs used for processing the read group (PG)" help="Optional" />
|
|
141 <param name="rgpi" type="text" size="25" label="Predicted median insert size (PI)" help="Optional" />
|
|
142 <param name="rgpl" type="text" size="25" label="Platform/technology used to produce the reads (PL)" help="Required if RG specified. Valid values : CAPILLARY, LS454, ILLUMINA,
|
|
143 SOLID, HELICOS, IONTORRENT and PACBIO" />
|
|
144 <param name="rgpu" type="text" size="25" label="Platform unit (PU)" help="Optional. Unique identifier (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)" />
|
|
145 <param name="rgsm" type="text" size="25" label="Sample (SM)" help="Required if RG specified. Use pool name where a pool is being sequenced" />
|
|
146 </when>
|
|
147 <when value="no" />
|
|
148 </conditional>
|
|
149 </when>
|
|
150 </conditional>
|
|
<!-- Boolean switch, unchecked by default; its "true"/"false" value is passed to the wrapper as the suppressHeader option. -->
151 <param name="suppressHeader" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Suppress the header in the output SAM file" help="BWA produces SAM with several lines of header information" />
|
|
152 </inputs>
|
|
<!-- Single output dataset named "output", declared with format zip; its label template uses ${tool.name} and ${on_string}. -->
<!-- NOTE(review): the help text describes SAM output while the declared format is zip (presumably an archive of SAM files); confirm against bwa_wrapper.py. -->
153 <outputs>
|
|
154 <data format="zip" name="output" label="${tool.name} on ${on_string}: mapped reads"/>
|
|
155 </outputs>
|
|
156 <help>
|
|
157
|
|
158 **What it does**
|
|
159
|
|
160 BWA is a fast, lightweight tool that aligns relatively short sequences (queries) to a sequence database (large), such as the human reference genome. It is developed by Heng Li at the Sanger Institute. Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-60.
|
|
161
|
|
162 ------
|
|
163
|
|
164 **Know what you are doing**
|
|
165
|
|
166 .. class:: warningmark
|
|
167
|
|
168 There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
|
|
169
|
|
170 .. __: http://bio-bwa.sourceforge.net/
|
|
171
|
|
172
|
|
173 Instructions for adding archive-management functionality to Galaxy are available on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_.
|
|
174
|
|
175 ------
|
|
176
|
|
177 **Input formats**
|
|
178
|
|
179 BWA accepts files in either Sanger FASTQ format (galaxy type *fastqsanger*) or Illumina FASTQ format (galaxy type *fastqillumina*). Use the FASTQ Groomer to prepare your files.
|
|
180
|
|
181 ------
|
|
182
|
|
183 **A Note on Built-in Reference Genomes**
|
|
184
|
|
185 The default variant for all genomes is "Full", defined as all primary chromosomes (or scaffolds/contigs) including mitochondrial plus associated unmapped, plasmid, and other segments. When only one version of a genome is available in this tool, it represents the default "Full" variant. Some genomes will have more than one variant available. The "Canonical Male" or sometimes simply "Canonical" variant contains the primary chromosomes for a genome. For example a human "Canonical" variant contains chr1-chr22, chrX, chrY, and chrM. The "Canonical Female" variant contains the primary chromosomes excluding chrY.
|
|
186
|
|
187 ------
|
|
188
|
|
189 **Outputs**
|
|
190
|
|
191 The output is in SAM format, and has the following columns::
|
|
192
|
|
193 Column Description
|
|
194 -------- --------------------------------------------------------
|
|
195 1 QNAME Query (pair) NAME
|
|
196 2 FLAG bitwise FLAG
|
|
197 3 RNAME Reference sequence NAME
|
|
198 4 POS 1-based leftmost POSition/coordinate of clipped sequence
|
|
199 5 MAPQ MAPping Quality (Phred-scaled)
|
|
200 6 CIGAR extended CIGAR string
|
|
201 7 MRNM Mate Reference sequence NaMe ('=' if same as RNAME)
|
|
202 8 MPOS 1-based Mate POSition
|
|
203 9 ISIZE Inferred insert SIZE
|
|
204 10 SEQ query SEQuence on the same strand as the reference
|
|
205 11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
|
|
206 12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE
|
|
207
|
|
208 The flags are as follows::
|
|
209
|
|
210 Flag Description
|
|
211 ------ -------------------------------------
|
|
212 0x0001 the read is paired in sequencing
|
|
213 0x0002 the read is mapped in a proper pair
|
|
214 0x0004 the query sequence itself is unmapped
|
|
215 0x0008 the mate is unmapped
|
|
216 0x0010 strand of the query (1 for reverse)
|
|
217 0x0020 strand of the mate
|
|
218 0x0040 the read is the first read in a pair
|
|
219 0x0080 the read is the second read in a pair
|
|
220 0x0100 the alignment is not primary
|
|
221
|
|
222 It looks like this (scroll sideways to see the entire example)::
|
|
223
|
|
224 QNAME FLAG RNAME POS MAPQ CIGAR MRNM MPOS ISIZE SEQ QUAL OPT
|
|
225 HWI-EAS91_1_30788AAXX:1:1:1761:343 4 * 0 0 * * 0 0 AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh
|
|
226 HWI-EAS91_1_30788AAXX:1:1:1578:331 4 * 0 0 * * 0 0 GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh
|
|
227
|
|
228 -------
|
|
229
|
|
230 **BWA settings**
|
|
231
|
|
232 All of the options have a default value. You can change any of them. All of the options in BWA have been implemented here.
|
|
233
|
|
234 ------
|
|
235
|
|
236 **BWA parameter list**
|
|
237
|
|
238 This is an exhaustive list of BWA options:
|
|
239
|
|
240 For **aln**::
|
|
241
|
|
242 -n NUM Maximum edit distance if the value is INT, or the fraction of missing
|
|
243 alignments given 2% uniform base error rate if FLOAT. In the latter
|
|
244 case, the maximum edit distance is automatically chosen for different
|
|
245 read lengths. [0.04]
|
|
246 -o INT Maximum number of gap opens [1]
|
|
247 -e INT Maximum number of gap extensions, -1 for k-difference mode
|
|
248 (disallowing long gaps) [-1]
|
|
249 -d INT Disallow a long deletion within INT bp towards the 3'-end [16]
|
|
250 -i INT Disallow an indel within INT bp towards the ends [5]
|
|
251 -l INT Take the first INT subsequence as seed. If INT is larger than the
|
|
252 query sequence, seeding will be disabled. For long reads, this option
|
|
253 is typically ranged from 25 to 35 for '-k 2'. [inf]
|
|
254 -k INT Maximum edit distance in the seed [2]
|
|
255 -t INT Number of threads (multi-threading mode) [1]
|
|
256 -M INT Mismatch penalty. BWA will not search for suboptimal hits with a score
|
|
257 lower than (bestScore-misMsc). [3]
|
|
258 -O INT Gap open penalty [11]
|
|
259 -E INT Gap extension penalty [4]
|
|
260 -c Reverse query but not complement it, which is required for alignment
|
|
261 in the color space.
|
|
262 -R Proceed with suboptimal alignments even if the top hit is a repeat. By
|
|
263 default, BWA only searches for suboptimal alignments if the top hit is
|
|
264 unique. Using this option has no effect on accuracy for single-end
|
|
265 reads. It is mainly designed for improving the alignment accuracy of
|
|
266 paired-end reads. However, the pairing procedure will be slowed down,
|
|
267 especially for very short reads (~32bp).
|
|
268 -N Disable iterative search. All hits with no more than maxDiff
|
|
269 differences will be found. This mode is much slower than the default.
|
|
270
|
|
271 For **samse**::
|
|
272
|
|
273 -n INT Maximum number of alignments to output in the XA tag for reads paired
|
|
274 properly. If a read has more than INT hits, the XA tag will not be
|
|
275 written. [3]
|
|
276 -r STR Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null]
|
|
277
|
|
278 For **sampe**::
|
|
279
|
|
280 -a INT Maximum insert size for a read pair to be considered as being mapped
|
|
281 properly. Since version 0.4.5, this option is only used when there
|
|
282 are not enough good alignments to infer the distribution of insert
|
|
283 sizes. [500]
|
|
284 -n INT Maximum number of alignments to output in the XA tag for reads paired
|
|
285 properly. If a read has more than INT hits, the XA tag will not be
|
|
286 written. [3]
|
|
287 -N INT Maximum number of alignments to output in the XA tag for disconcordant
|
|
288 read pairs (excluding singletons). If a read has more than INT hits,
|
|
289 the XA tag will not be written. [10]
|
|
290 -o INT Maximum occurrences of a read for pairing. A read with more
|
|
291 occurrences will be treated as a single-end read. Reducing this
|
|
292 parameter helps faster pairing. [100000]
|
|
293 -r STR Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null]
|
|
294
|
|
295 For specifying the read group in **samse** or **sampe**, use the following::
|
|
296
|
|
297 @RG Read group. Unordered multiple @RG lines are allowed.
|
|
298 ID Read group identifier. Each @RG line must have a unique ID. The value of
|
|
299 ID is used in the RG tags of alignment records. Must be unique among all
|
|
300 read groups in header section. Read group IDs may be modified when
|
|
301 merging SAM files in order to handle collisions.
|
|
302 CN Name of sequencing center producing the read.
|
|
303 DS Description.
|
|
304 DT Date the run was produced (ISO8601 date or date/time).
|
|
305 FO Flow order. The array of nucleotide bases that correspond to the
|
|
306 nucleotides used for each flow of each read. Multi-base flows are encoded
|
|
307 in IUPAC format, and non-nucleotide flows by various other characters.
|
|
308 Format : /\*|[ACMGRSVTWYHKDBN]+/
|
|
309 KS The array of nucleotide bases that correspond to the key sequence of each read.
|
|
310 LB Library.
|
|
311 PG Programs used for processing the read group.
|
|
312 PI Predicted median insert size.
|
|
313 PL Platform/technology used to produce the reads. Valid values : CAPILLARY,
|
|
314 LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT and PACBIO.
|
|
315 PU Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for
|
|
316 SOLiD). Unique identifier.
|
|
317 SM Sample. Use pool name where a pool is being sequenced.
|
|
318
|
|
319 </help>
|
|
<!-- Publications to cite when using this tool: six DOI references followed by one BibTeX entry (JOBIM 2013 proceedings). -->
320 <citations>
|
|
321 <citation type="doi">10.1111/mec.12354</citation>
|
|
322 <citation type="doi">10.1111/mec.12330</citation>
|
|
323 <citation type="doi">10.1534/g3.111.000240</citation>
|
|
324 <citation type="doi">10.1534/genetics.111.127324</citation>
|
|
325 <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation>
|
|
326 <citation type="doi">10.1073/pnas.1006538107</citation>
|
|
327
|
|
328 <citation type="bibtex">@INPROCEEDINGS{JOBIM2013,
|
|
329 author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.},
|
|
330 title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France},
|
|
331 booktitle = {JOBIM 2013 Proceedings},
|
|
332 year = {2013},
|
|
333 url = {https://www.e-biogenouest.org/resources/128},
|
|
334 pages = {97-106}
|
|
335 }</citation>
|
|
336 </citations>
|
|
337 </tool>
|
|
338
|
|
339
|