comparison snap_caller.xml @ 0:d801b0675eb5 draft

planemo upload for repository https://github.com/wm75/mimodd_galaxy_wrappers commit b36048cd608ede0ec6f6559648525c9350caae34-dirty
author wolma
date Sat, 11 Nov 2017 18:18:54 -0500
parents
children e76e813f615a
comparison
equal deleted inserted replaced
-1:000000000000 0:d801b0675eb5
1 <tool id="mimodd_align" name="MiModD Read Alignment" version="@MIMODD_WRAPPER_VERSION@">
2 <description>maps sequence reads to a reference genome using SNAP</description>
3 <macros>
4 <import>macros.xml</import>
5 <macro name="require_metadata">
6 <param name="header" type="data" format="sam"
7 label="metadata source for this sample" />
8 </macro>
9 <macro name="sam_bam_selector" token_format="sam">
10 <param name="ifile" type="data" format="@FORMAT@"
11 label="input file"/>
12 <param name="header" type="data" format="sam" optional="true"
13 label="(optional) metadata source for this sample"
14 help="a SAM format dataset providing information about the sequences in the input data in its header; do NOT provide this dataset if the information is already part of your input dataset unless you want to have the original metadata overwritten. If needed, a metadata source dataset can be generated with the MiModD Run Annotation tool." />
15 </macro>
16 </macros>
17 <expand macro="requirements" />
18 <expand macro="stdio" />
19 <expand macro="version_command" />
20 <command><![CDATA[
21 ## Currently Galaxy does not autoconvert collections of fastq.gz files.
22 ## This tool wrapper fixes that by allowing fastq and fastq.gz as input
23 ## collection formats.
24 ## gz_input is then used as flag to indicate a fastq.gz input file;
25 ## it is reset for every dataset so the flag cannot leak between samples.
26 mimodd snap-batch -s
27 #if str($reference.source) == "cached":
28 #set ref_genome = $reference.genome.fields.path
29 #else:
30 #set ref_genome = $reference.genome
31 #end if
32 #for $i in $datasets
33 #set gz_input = False
34 "snap ${i.mode_choose.mode} '$ref_genome'
35 #if $str($i.mode_choose.mode) == "paired" and $str($i.mode_choose.input.iformat) == "fastq":
36 #if $str($i.mode_choose.input.pe_source.type) == 'collection':
37 ## PE input provided as a paired collection - if the forward
38 ## dataset is gzipped we assume the reverse dataset is too.
39 '${i.mode_choose.input.pe_source.input_data.forward}'
40 '${i.mode_choose.input.pe_source.input_data.reverse}'
41 #if $i.mode_choose.input.pe_source.input_data.forward.is_of_type('fastq.gz'):
42 #set gz_input = True
43 #end if
44 #else
45 ## PE input provided as separate fastq datasets
46 '${i.mode_choose.input.pe_source.ifile1}'
47 '${i.mode_choose.input.pe_source.ifile2}'
48 #end if
49 #else:
50 ## Input is either SE data or not in fastq format =>
51 ## only one input dataset
52 '${i.mode_choose.input.ifile}'
53 #end if
54 #if $gz_input:
55 ## a gzipped fastq input dataset was encountered
56 --iformat gz
57 #else
58 --iformat ${i.mode_choose.input.iformat}
59 #end if
60 --ofile '$ofile' --oformat ${output_options.oformat}
61 ${output_options.sort} ${output_options.explicit_mmatch_notation}
62 --idx-seedsize $indexing.seedsize
63 --idx-slack $indexing.slack
64 --idx-overflow $indexing.overflow
65 #set $aln_spec = $i.mode_choose.aln_options
66 #if $str($i.mode_choose.mode) == "paired":
67 #set $aln_global = $alignment.paired
68 #set $treat_overlaps = $aln_spec.discard_overlapping_mates or $aln_global.discard_overlapping_mates
69 --spacing #if $aln_spec.sp_min then $aln_spec.sp_min else $aln_global.sp_min
70 #if $aln_spec.sp_max then $aln_spec.sp_max else $aln_global.sp_max
71 #else
72 #set $aln_global = $alignment.single
73 #set $treat_overlaps = ""
74 #end if
75 --maxseeds #if $aln_spec.maxseeds then $aln_spec.maxseeds else $aln_global.maxseeds
76 --maxhits #if $aln_spec.maxhits then $aln_spec.maxhits else $aln_global.maxhits
77 --clipping #if $aln_spec.clipping then $aln_spec.clipping else $aln_global.clipping
78 --maxdist #if $aln_spec.maxdist then $aln_spec.maxdist else $aln_global.maxdist
79 --confdiff #if $aln_spec.confdiff then $aln_spec.confdiff else $aln_global.confdiff
80 --confadapt #if $aln_spec.confadpt then $aln_spec.confadpt else $aln_global.confadpt
81 #if $i.mode_choose.input.header:
82 --header '${i.mode_choose.input.header}'
83 #end if
84 --selectivity $output_options.selectivity
85 #if $str($output_options.filter_output) != "off":
86 --filter-output $output_options.filter_output
87 #end if
88 #if $treat_overlaps:
89 --discard-overlapping-mates
90 ## remove ',' (and possibly adjacent whitespace) and replace with ' '
91 '#echo ("' '".join($treat_overlaps.replace(" ", "").split(',')))#'
92 #end if
93 --verbose"
94 #end for
95 ]]></command>
96
97 <inputs>
98 <conditional name="reference">
99 <param name="source" type="select"
100 label="Will you select a reference genome from your history or use a built-in genome?">
101 <option value="cached">Use a built-in genome</option>
102 <option value="history">Use a genome from my history</option>
103 </param>
104 <when value="cached">
105 <param name="genome" type="select"
106 label="reference genome"
107 help="The fasta reference genome that SNAP should align reads against.">
108 <options from_data_table="all_fasta" />
109 </param>
110 </when>
111 <when value="history">
112 <param name="genome" type="data" format="fasta"
113 label="reference genome"
114 help="The fasta reference genome that SNAP should align reads against."/>
115 </when>
116 </conditional>
117 <section name="indexing" title="Parameters affecting reference genome indexing" expanded="false">
118 <param name="seedsize" type="integer" value="20"
119 label="seed size (default: 20)"
120 help="Length of the seeds used in the reference genome hash table (SNAP index option -s)."/>
121 <param name="slack" type="float" value="0.3"
122 label="hash table slack size (default: 0.3)"
123 help="Corresponds to the -h option of SNAP index."/>
124 <param name="overflow" type="integer" min="1" max="1000" value="40"
125 label="index overflow factor (default: 40)"
126 help="Factor (between 1 and 1000) that controls the size of the index build overflow space. For certain genomes you may have to increase this value if you are getting a corresponding error from the tool." />
127 </section>
128 <section name="alignment" title="Alignment parameters" expanded="false"
129 help="The global alignment parameters in this section will be used for samples for which you do not provide their own sample-specific settings.">
130 <section name="single" title="Parameters applied to single-end samples"
131 help="These parameters will affect the alignments for any single-end sample
132 for which you do not provide sample-specific settings.">
133 <param name="maxdist" type="integer" value="8"
134 label="edit distance (default: 8)"
135 help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments."/>
136 <param name="confdiff" type="integer" value="2"
137 label="confidence threshold (default: 2)"
138 help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads."/>
139 <param name="confadpt" type="integer" value="7"
140 label="adaptive confdiff behaviour (default: 7)"
141 help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read (SNAP option -a); helps fine-tuning alignment accuracy in repetitive regions of the genome."/>
142 <param name="maxseeds" type="integer" value="25"
143 label="maximum seeds per read (default: 25)"
144 help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance."/>
145 <param name="maxhits" type="integer" value="250"
146 label="maximum hits per seed (default: 250)"
147 help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance."/>
148 <param name="clipping" type="select" display="radio"
149 label="read clipping (default: from back and front)"
150 help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)">
151 <option value="++">from back and front</option>
152 <option value="x+">from back only</option>
153 <option value="+x">from front only</option>
154 <option value="xx">no clipping</option>
155 </param>
156 </section>
157 <section name="paired" title="Parameters applied to paired-end samples"
158 help="These parameters will affect the alignments for any paired-end sample
159 for which you do not provide sample-specific settings.">
160 <param name="sp_min" type="integer" value="100"
161 label="minimum spacing to allow between paired ends (default: 100)"
162 help="Corresponds to the first value of the SNAP option -s."/>
163 <param name="sp_max" type="integer" value="10000"
164 label="maximum spacing to allow between paired ends (default: 10000)"
165 help="Corresponds to the second value of the SNAP option -s."/>
166 <param name="discard_overlapping_mates" type="text" optional="true"
167 label="discard overlapping read pairs of type"
168 help="Consider overlapping mate pairs of the given orientation type(s) anomalous and discard them; allowed values: RF, FR, FF, RR; multiple types may be specified as a comma-separated list and ALL can be used as a shortcut for discarding all overlapping mate pairs; leave blank to retain all overlapping pairs." />
169 <param name="maxdist" type="integer" value="8"
170 label="edit distance (default: 8)"
171 help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments."/>
172 <param name="confdiff" type="integer" value="2"
173 label="confidence threshold (default: 2)"
174 help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads."/>
175 <param name="confadpt" type="integer" value="7"
176 label="adaptive confdiff behaviour (default: 7)"
177 help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read (SNAP option -a); helps fine-tuning alignment accuracy in repetitive regions of the genome."/>
178 <param name="maxseeds" type="integer" value="25"
179 label="maximum seeds per read (default: 25)"
180 help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance."/>
181 <param name="maxhits" type="integer" value="250"
182 label="maximum hits per seed (default: 250)"
183 help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance."/>
184 <param name="clipping" type="select" display="radio"
185 label="read clipping (default: from back and front)"
186 help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)">
187 <option value="++">from back and front</option>
188 <option value="x+">from back only</option>
189 <option value="+x">from front only</option>
190 <option value="xx">no clipping</option>
191 </param>
192 </section>
193 </section>
194 <conditional name="output_options">
195 <param name="config" type="select"
196 label="Output options"
197 help="No matter how many input datasets you specify below and what their formats are, this tool will produce a single output file with the aligned reads from all samples. In this section you can configure some aspects of what the output should look like. Unless you have a really special usecase, you can (and probably should) just go with the default settings though.">
198 <option value="default">Just go with the defaults</option>
199 <option value="custom">Show detailed output options</option>
200 </param>
201 <when value="default">
202 <param name="oformat" type="hidden" value="bam" />
203 <param name="sort" type="hidden" value=""/>
204 <param name="explicit_mmatch_notation" type="hidden" value=""/>
205 <param name="filter_output" type="hidden" value="off"/>
206 <param name="selectivity" type="hidden" value="1"/>
207 </when>
208 <when value="custom">
209 <param name="oformat" type="select" display="radio"
210 label="Output format">
211 <option value="bam">BAM</option>
212 <option value="sam">SAM</option>
213 </param>
214 <param name="sort" type="boolean" falsevalue="--no-sort" truevalue="" checked="true"
215 label="Sort aligned reads in the output by coordinates"
216 help="Turn off if you want to retain the read order of the input file(s) (mimodd snap option --no-sort)." />
217 <param name="explicit_mmatch_notation" type="boolean" truevalue="-X" falsevalue="" checked="false"
218 label="Use = and X to indicate matches/mismatches in CIGAR strings explicitly instead of using M for both"
219 help="Warning: Downstream tools may still rely on the classic M notation! Turn this on at your own risk (mimodd snap option -X)." />
220 <param name="selectivity" type="integer" min="1" value="1"
221 label="selectivity (default: 1)"
222 help="randomly choose 1/selectivity of the reads to score (SNAP option -S). The default of 1 indicates that all reads should be worked with." />
223 <param name="filter_output" type="select" display="radio"
224 label="filter output (default: no filtering)"
225 help="filter output (SNAP option -F) to retain only specific classes of reads.">
226 <option value="off">no filtering</option>
227 <option value="a">aligned only</option>
228 <option value="s">single-aligned only</option>
229 <option value="u">unaligned only</option>
230 </param>
231 </when>
232 </conditional>
233 <repeat name="datasets" title="datasets" default="1" min="1">
234 <conditional name="mode_choose">
235 <param name="mode" type="select" label="choose mode"
236 help="Reads obtained from single-end sequencing runs should be aligned in 'single' mode, paired-end reads in 'paired' mode. **WARNING**: if the read input file is in SAM/BAM format, the current version of this tool will **not** verify the mode and may produce erroneous alignments with wrong settings!">
237 <option value="single">single-end</option>
238 <option value="paired">paired-end</option>
239 </param>
240 <when value="single">
241 <conditional name="input">
242 <param name="iformat" type="select" label="input file format">
243 <option value="bam">BAM</option>
244 <option value="sam">SAM</option>
245 <option value="fastq">fastq</option>
246 </param>
247 <when value="bam">
248 <expand macro="sam_bam_selector" format="bam" />
249 </when>
250 <when value="sam">
251 <expand macro="sam_bam_selector" format="sam" />
252 </when>
253 <when value="fastq">
254 <param name="ifile" type="data" format="fastq"
255 label="input file"/>
256 <expand macro="require_metadata" />
257 </when>
258 </conditional>
259 <section name="aln_options" title="Alignment options for this sample" expanded="false"
260 help="Any options you specify here will overwrite the global alignment settings defined for all single-end samples above.">
261 <param name="maxdist" type="integer" optional="true" value=""
262 label="edit distance"
263 help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments."/>
264 <param name="confdiff" type="integer" optional="true" value=""
265 label="confidence threshold"
266 help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads."/>
267 <param name="confadpt" type="integer" optional="true" value=""
268 label="adaptive confdiff behaviour"
269 help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read (SNAP option -a); helps fine-tuning alignment accuracy in repetitive regions of the genome."/>
270 <param name="maxseeds" type="integer" optional="true" value=""
271 label="maximum seeds per read"
272 help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance."/>
273 <param name="maxhits" type="integer" optional="true" value=""
274 label="maximum hits per seed"
275 help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance."/>
276 <param name="clipping" type="select" display="radio"
277 label="read clipping (default: from back and front)"
278 help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)">
279 <option value="">use global setting</option>
280 <option value="++">from back and front</option>
281 <option value="x+">from back only</option>
282 <option value="+x">from front only</option>
283 <option value="xx">no clipping</option>
284 </param>
285 </section>
286 </when>
287 <when value="paired">
288 <conditional name="input">
289 <param name="iformat" type="select" label="input file format">
290 <option value="bam">BAM</option>
291 <option value="sam">SAM</option>
292 <option value="fastq">fastq</option>
293 </param>
294 <when value="bam">
295 <expand macro="sam_bam_selector" format="bam" />
296 </when>
297 <when value="sam">
298 <expand macro="sam_bam_selector" format="sam" />
299 </when>
300 <when value="fastq">
301 <conditional name="pe_source">
302 <param name="type" type="select"
303 label="the paired-end fastq input is provided as">
304 <option value="individual">Individual datasets</option>
305 <option value="collection">a Paired collection</option>
306 </param>
307 <when value="individual">
308 <param name="ifile1" type="data" format="fastq"
309 label="inputfile with the first set of reads of paired-end data"/>
310 <param name="ifile2" type="data" format="fastq"
311 label="inputfile with the second set of reads of paired-end data"/>
312 </when>
313 <when value="collection">
314 <param name="input_data" type="data_collection"
315 collection_type="paired" format="fastq, fastq.gz"
316 label="paired input dataset collection"/>
317 </when>
318 </conditional>
319 <expand macro="require_metadata" />
320 </when>
321 </conditional>
322 <section name="aln_options" title="Alignment options for this sample" expanded="false"
323 help="Any options you specify here will overwrite the global alignment settings defined for all paired-end samples above.">
324 <param name="sp_min" type="integer" optional="true" value=""
325 label="minimum spacing to allow between paired ends"
326 help="Corresponds to the first value of the SNAP option -s."/> <!-- left empty, the global paired-end setting is used -->
327 <param name="sp_max" type="integer" optional="true" value=""
328 label="maximum spacing to allow between paired ends"
329 help="Corresponds to the second value of the SNAP option -s."/>
330 <param name="discard_overlapping_mates" type="text" optional="true" value=""
331 label="discard overlapping read pairs of type"
332 help="Consider overlapping mate pairs of the given orientation type(s) anomalous and discard them; allowed values: RF, FR, FF, RR; multiple types may be specified as a comma-separated list and ALL can be used as a shortcut for discarding all overlapping mate pairs; leave blank to retain all overlapping pairs." />
333 <param name="maxdist" type="integer" optional="true" value=""
334 label="edit distance"
335 help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments."/>
336 <param name="confdiff" type="integer" optional="true" value=""
337 label="confidence threshold"
338 help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads."/>
339 <param name="confadpt" type="integer" optional="true" value=""
340 label="adaptive confdiff behaviour"
341 help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read (SNAP option -a); helps fine-tuning alignment accuracy in repetitive regions of the genome."/>
342 <param name="maxseeds" type="integer" optional="true" value=""
343 label="maximum seeds per read"
344 help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance."/>
345 <param name="maxhits" type="integer" optional="true" value=""
346 label="maximum hits per seed"
347 help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance."/>
348 <param name="clipping" type="select" display="radio"
349 label="read clipping (default: from back and front)"
350 help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)">
351 <option value="">use global setting</option>
352 <option value="++">from back and front</option>
353 <option value="x+">from back only</option>
354 <option value="+x">from front only</option>
355 <option value="xx">no clipping</option>
356 </param>
357 </section>
358 </when>
359 </conditional>
360 </repeat>
361 </inputs>
362
363 <outputs>
364 <data name="ofile" format="bam"
365 label="Aligned reads from ${tool.name} on ${on_string}"> <!-- tool.name already contains "MiModD" -->
366 <change_format>
367 <when input="output_options.oformat" value="sam" format="sam"/>
368 </change_format>
369 <actions>
370 <conditional name="reference.source">
371 <when value="cached">
372 <action type="metadata" name="dbkey">
373 <option type="from_data_table" name="all_fasta" column="1" offset="0">
374 <filter type="param_value" ref="reference.genome" column="0" />
375 </option>
376 </action>
377 </when>
378 </conditional>
379 </actions>
380 </data>
381 </outputs>
382
383 <tests>
384 <test> <!-- single-end BAM input with all global defaults -->
385 <conditional name="reference">
386 <param name="source" value="history" />
387 <param name="genome" value="a.fa" />
388 </conditional>
389 <repeat name="datasets">
390 <conditional name="mode_choose">
391 <param name="mode" value="single" />
392 <conditional name="input">
393 <param name="iformat" value="bam" />
394 <param name="ifile" value="a_part1.bam" />
395 </conditional>
396 </conditional>
397 </repeat>
398 <assert_command>
399 <has_text text="snap single" />
400 <has_text text="--iformat bam" />
401 <has_text text="--oformat bam" />
402 <has_text text="--idx-seedsize 20" />
403 <has_text text="--idx-slack 0.3" />
404 <has_text text="--idx-overflow 40" />
405 <has_text text="--maxseeds 25" />
406 <has_text text="--maxhits 250" />
407 <has_text text="--clipping ++" />
408 <has_text text="--maxdist 8" />
409 <has_text text="--confdiff 2" />
410 <has_text text="--confadapt 7" />
411 <has_text text="--selectivity 1" />
412 </assert_command>
413 </test>
414 <test> <!-- sample-specific maxdist must override the global default -->
415 <conditional name="reference">
416 <param name="source" value="history" />
417 <param name="genome" value="a.fa" />
418 </conditional>
419 <repeat name="datasets">
420 <conditional name="mode_choose">
421 <param name="mode" value="single" />
422 <conditional name="input">
423 <param name="iformat" value="bam" />
424 <param name="ifile" value="a_part1.bam" />
425 </conditional>
426 <section name="aln_options">
427 <param name="maxdist" value="7" />
428 </section>
429 </conditional>
430 </repeat>
431 <assert_command>
432 <has_text text="snap single" />
433 <has_text text="--iformat bam" />
434 <has_text text="--oformat bam" />
435 <has_text text="--idx-seedsize 20" />
436 <has_text text="--idx-slack 0.3" />
437 <has_text text="--idx-overflow 40" />
438 <has_text text="--maxseeds 25" />
439 <has_text text="--maxhits 250" />
440 <has_text text="--clipping ++" />
441 <has_text text="--maxdist 7" />
442 <has_text text="--confdiff 2" />
443 <has_text text="--confadapt 7" />
444 <has_text text="--selectivity 1" />
445 </assert_command>
446 </test>
447 </tests>
448
449 <help><![CDATA[
450 .. class:: infomark
451
452 **What it does**
453
454 The tool aligns the sequenced reads in an arbitrary number of input datasets
455 against a common reference genome and stores the results in a single, possibly
456 multi-sample output dataset.
457
458 Internally, the tool uses the ultrafast, hashtable-based aligner SNAP (http://snap.cs.berkeley.edu).
459
460 ----------
461
462 **Notes:**
463
464 *Input formats*
465
466 - The tool accepts SAM, BAM, fastq and fastq.gz input datasets of sequenced
467 reads and supports both single-end and paired-end data.
468
469 The recommended approach with MiModD is to store NGS datasets in SAM/BAM
470 format with *Run Metadata* (see below) stored in the file header. You can use
471 the *MiModD Run Annotation* and *MiModD Convert* tools to convert data from
472 fastq format to SAM/BAM format while attaching run metadata to it.
473
474 While alignments **directly from fastq format** are supported, this **is less
475 reliable** due to less strict specifications of this format. If you find
476 the tool complaining about malformed fastq input, it is likely that you can
477 fix this problem by converting the data to SAM/BAM format first.
478
479 - If you wish to align paired-end data directly from fastq format, the mate
480 sequence data has to be split over two datasets as is mostly standard today.
481 If you have your paired-end data as a single dataset you may look into the
482 *FASTQ splitter* and *FASTQ de-interlacer* tools for Galaxy, which are
483 available from the `Fastq Manipulation category`_ of the Galaxy Tool Shed and
484 may be able to convert your files to the expected format.
485
486 *Run Metadata*
487
488 - **Every input file requires accompanying Run Metadata!** Most importantly,
489 this includes a *read-group ID* (an identifier of the sequencing run that
490 produced the data) and a *sample name* (identifying the
491 biological sample sequenced in the run).
492
493 - If an input dataset does not provide this information directly (fastq
494 datasets never do; SAM/BAM datasets may provide it in their header), you need
495 to specify a separate SAM/BAM dataset with an appropriate header as the
496 source of the Run Metadata.
497
498 You can use the *MiModD Run Annotation* tool to generate such a file.
499
500 - If a SAM/BAM input dataset already provides Run Metadata, you can still
501 specify a different Run Metadata source, which will then overwrite the
502 information already present in the input. This is useful, for example, to
503 resolve read-group ID conflicts between multiple input datasets.
504
505 - Every input dataset can only contain reads from a single read-group. If you
506 would like, for example, to realign the reads in a multi-sample SAM/BAM
507 dataset, you should first use the *MiModD Sort* tool to sort the data by read
508 names (this step is only necessary for paired-end data), then split the reads
509 into new per-read-group datasets using the *MiModD Convert* tool.
510
511 - Several input datasets can declare identical read-group IDs and/or sample
512 names.
513
514 Identical read-group IDs mean that the datasets were produced in the
515 same sequencing run, as is the case, for example, with partial fastq
516 sequencing data. In the output dataset, the corresponding reads will be
517 merged and it will not be possible to trace back their source.
518
519 Identical sample names (but different read-group IDs) indicate that the same
520 sample has been sequenced multiple times. In the output dataset, the
521 corresponding reads will be tagged appropriately and tools like the
522 *MiModD Variant Calling* tool will let you decide whether you want to treat
523 them together or separately.
524
525 ----------
526
527 **Tool Options**
528
529 The section *Alignment parameters* lets you configure global settings for the
530 alignment job that will be applied to all input datasets. For each input
531 dataset, however, you can overwrite some or all of these settings by specifying
532 new values in the section *Alignment options for this sample*. Some of the
533 alignment parameters may have **big** effects on the alignment quality, but
534 these effects are very dependent on the type of input sequences. You are
535 strongly encouraged to consult the in-depth `tool documentation`_ for detailed
536 explanations of the available options.
537
538 .. _Fastq Manipulation category: https://toolshed.g2.bx.psu.edu/repository/browse_repositories_in_category?id=310ff67d4caf6531
539 .. _recipe for using gzipped fastq files in Galaxy: http://mimodd.readthedocs.org/en/latest/recipes.html#use-gzipped-fastq-files-in-galaxy
540 .. _tool documentation: http://mimodd.readthedocs.io/en/@MIMODD_REAL_VERSION@/tool_doc.html#snap
541
542 @HELP_FOOTER@
543 ]]></help>
544 <expand macro="citations" />
545 </tool>
546