comparison winnowmap.xml @ 0:1c070debf549 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/winnowmap commit f3c75fcbec417c749fca98a8d0cb3415c427dbb8"
author iuc
date Thu, 29 Apr 2021 21:35:38 +0000
parents
children 35ea4a8b1ea6
comparison
equal deleted inserted replaced
-1:000000000000 0:1c070debf549
1 <tool id="winnowmap" name="Winnowmap" version="@TOOL_VERSION@+galaxy0" profile="20.01">
2 <description>a mapping tool optimized for repetitive sequences</description>
3 <macros> <!-- imports macros.xml (not shown here); presumably it defines the edam_ontology and requirements macros expanded below - confirm -->
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="edam_ontology"/>
7 <expand macro="requirements"/>
8 <stdio> <!-- job failure rules: any exit code of 1 or higher is fatal, and so is stderr output matching the literal text [ERROR] -->
9 <exit_code range="1:" level="fatal" />
10 <regex match="\[ERROR\]" source="stderr" level="fatal" />
11 </stdio>
12 <version_command>winnowmap --version</version_command> <!-- command run to obtain the winnowmap version string -->
13 <command>
14 <![CDATA[
15 #if $reference_source.reference_source_selector == 'history':
16 ln -f -s '$reference_source.ref_file' reference.fa &&
17 #else:
18 ln -f -s '$reference_source.ref_file.fields.path' reference.fa &&
19 #end if
20 winnowmap
21 #if str($analysis_type_selector) == 'self-homology':
22 -DP -k19 -w19 -m200
23 #elif $analysis_type_selector:
24 -x ${analysis_type_selector}
25 #end if
26 #if $highfreq_kmers:
27 -W '$highfreq_kmers'
28 #end if
29 ## Indexing options. An unselected optional dataset stringifies to 'None', so $highfreq_kmers is tested for truthiness instead of via str(); -k is only forwarded when no -W dataset is selected
30 $indexing_options.H
31 #if str($indexing_options.k) and not $highfreq_kmers:
32 -k $indexing_options.k
33 #end if
34 #if str($indexing_options.w):
35 -w $indexing_options.w
36 #end if
37 #if str($indexing_options.I):
38 -I $indexing_options.I
39 #end if
40 ## Unset optional numeric parameters render as an empty string, hence the str(...) guards around each option below
41 ## Mapping options
42 #if str($mapping_options.f):
43 -f $mapping_options.f
44 #end if
45 #if str($mapping_options.min_occ_floor):
46 --min-occ-floor $mapping_options.min_occ_floor
47 #end if
48 #if str($mapping_options.g):
49 -g $mapping_options.g
50 #end if
51 #if str($mapping_options.r):
52 -r $mapping_options.r
53 #end if
54 #if str($mapping_options.n):
55 -n $mapping_options.n
56 #end if
57 #if str($mapping_options.m):
58 -m $mapping_options.m
59 #end if
60 #if str($mapping_options.max_chain_skip):
61 --max-chain-skip $mapping_options.max_chain_skip
62 #end if
63 #if str($mapping_options.max_chain_iter):
64 --max-chain-iter $mapping_options.max_chain_iter
65 #end if
66 $mapping_options.X
67 #if str($mapping_options.p):
68 -p $mapping_options.p
69 #end if
70 #if str($mapping_options.sv_off):
71 --sv-off
72 #end if
73 ## --frag=yes / -F is emitted only when splicing is not enabled in this section and the selected preset is not a splice profile
74 ## Alignment options
75 #if str($alignment_options.splicing.splice_mode) == '--splice':
76 --frag=no --splice
77 #if str($alignment_options.splicing.G):
78 -G $alignment_options.splicing.G
79 #end if
80 #if str($alignment_options.splicing.C):
81 -C $alignment_options.splicing.C
82 #end if
83 #if $alignment_options.splicing.u:
84 -u $alignment_options.splicing.u
85 #end if
86 $alignment_options.splicing.splice_flank
87 #if str($alignment_options.splicing.splice_site_annotations.use_annotations) == 'yes':
88 --junc-bed '$alignment_options.splicing.splice_site_annotations.junc_bed'
89 --junc-bonus $alignment_options.splicing.splice_site_annotations.junc_bonus
90 #end if
91 #elif str($mapping_options.F) and 'splice' not in str($analysis_type_selector):
92 --frag=yes -F $mapping_options.F
93 #end if
94 #if str($alignment_options.A):
95 -A $alignment_options.A
96 #end if
97 #if str($alignment_options.B):
98 -B $alignment_options.B
99 #end if
100 #if str($alignment_options.O):
101 #if str($alignment_options.O2):
102 -O $alignment_options.O,$alignment_options.O2
103 #else
104 -O $alignment_options.O
105 #end if
106 #end if
107 #if str($alignment_options.E):
108 #if str($alignment_options.E2):
109 -E $alignment_options.E,$alignment_options.E2
110 #else
111 -E $alignment_options.E
112 #end if
113 #end if
114 #if str($alignment_options.z):
115 #if str($alignment_options.z2):
116 -z $alignment_options.z,$alignment_options.z2
117 #else
118 -z $alignment_options.z
119 #end if
120 #end if
121 #if str($alignment_options.s):
122 -s $alignment_options.s
123 #end if
124 $alignment_options.no_end_flt
125 ## Output options: these are winnowmap flags and must appear before the pipe into samtools; --cs takes its value attached with '='
126 $io_options.Q
127 $io_options.L
128 $io_options.c
129 $io_options.eqx
130 #if $io_options.cs:
131 --cs=$io_options.cs
132 #end if
133 $io_options.Y
134 #if $io_options.K:
135 -K $io_options.K
136 #end if
137 -t \${GALAXY_SLOTS:-4}
138 reference.fa
139 #if $fastq_input.fastq_input_selector in ['single', 'paired_iv']:
140 '$fastq_input.fastq_input1'
141 #else if $fastq_input.fastq_input_selector == 'paired':
142 '$fastq_input.fastq_input1' '$fastq_input.fastq_input2'
143 #else if $fastq_input.fastq_input_selector == 'paired_collection':
144 '$fastq_input.fastq_input1.forward' '$fastq_input.fastq_input1.reverse'
145 #end if
146 #if $io_options.output_format == 'BAM':
147 -a
148 | samtools sort
149 -@\${GALAXY_SLOTS:-2}
150 -T "\${TMPDIR:-.}"
151 -O $io_options.output_format
152 -o '$alignment_output'
153 #else if $io_options.output_format == 'CRAM':
154 -a
155 | samtools sort
156 -T "\${TMPDIR:-.}"
157 -@\${GALAXY_SLOTS:-2}
158 -O $io_options.output_format
159 --reference reference.fa
160 --output-fmt-option no_ref
161 -o '$alignment_output'
162 #else:
163 > '$alignment_output'
164 #end if
165 ]]>
166 </command>
167 <inputs>
168 <conditional name="reference_source"> <!-- chooses where the reference comes from; the command template symlinks the chosen file to reference.fa -->
169 <param name="reference_source_selector" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options. See `Indexes` section of help below. If you would like to perform self-mapping select `history` here, then choose your input file as reference.">
170 <option value="cached">Use a built-in genome index</option>
171 <option value="history">Use a genome from history and build index</option>
172 </param>
173 <when value="cached"> <!-- ref_file is an entry of the all_fasta data table, listed sorted by column 2; the command template reads its path field -->
174 <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
175 <options from_data_table="all_fasta">
176 <filter type="sort_by" column="2" />
177 <validator type="no_options" message="No reference genomes are available" />
178 </options>
179 <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
180 </param>
181 </when>
182 <when value="history"> <!-- ref_file is a FASTA or FASTQ dataset taken from the history -->
183 <param name="ref_file" type="data" format="fasta,fastq" label="Use the following dataset as the reference sequence" help="You can upload a FASTA or FASTQ sequence to the history and use it as reference" />
184 </when>
185 </conditional>
186 <!-- start unchanged copy from bwa-mem -->
187 <conditional name="fastq_input"> <!-- read input layout; the command template passes one file for single and paired_iv, two files for paired and paired_collection -->
188 <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
189 <option value="single">Single</option>
190 <option value="paired">Paired</option>
191 <option value="paired_collection">Paired Collection</option>
192 <option value="paired_iv">Paired Interleaved</option>
193 </param>
194 <!-- NOTE(review): this remark is presumably carried over from the minimap2 wrapper, where
195 preset options were only offered for single-end input because paired-end alignment in minimap2
196 is only enabled with -x sr (see https://github.com/lh3/minimap2/issues/190); in this file the preset selector sits outside this conditional -->
197 <when value="single">
198 <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with single reads"/>
199 </when>
200 <when value="paired">
201 <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select first set of reads" help="Specify dataset with forward reads"/>
202 <param name="fastq_input2" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select second set of reads" help="Specify dataset with reverse reads"/>
203 </when>
204 <when value="paired_collection"> <!-- a single paired collection; the command template uses its forward and reverse elements -->
205 <param name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/>
206 </when>
207 <when value="paired_iv">
208 <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
209 </when>
210 </conditional>
211 <param name="highfreq_kmers" argument="-W" type="data" format="tabular" label="High frequency k-mers dataset" optional="True" help="Input file containing list of high freq. k-mers generated by meryl"/> <!-- optional tabular dataset; the command template hands it to winnowmap via -W -->
212 <param name="analysis_type_selector" type="select" optional="True"
213 label="Select a profile of preset options"
214 help="Each profile comes with the preconfigured settings mentioned in parentheses. You can customize each profile further in the indexing, mapping and alignment options sections below. If you do not select a profile here, the tool will use the per-parameter defaults listed in the below sections unless you customize them." > <!-- optional preset; the command template passes the value via -x, except self-homology which expands to explicit flags -->
215 <option value="map-pb">PacBio/Oxford Nanopore read to reference mapping (-Hk19) (map-pb)</option>
216 <option value="map-ont">Oxford Nanopore read to reference mapping. Slightly more sensitive for Oxford Nanopore to reference mapping (-k15). For PacBio reads, HPC minimizers consistently leads to faster performance and more sensitive results in comparison to normal minimizers. For Oxford Nanopore data, normal minimizers are better, though not much. The effectiveness of HPC is determined by the sequencing error mode. (map-ont)</option>
217 <option value="map-pb-clr">Turn off SV-aware mode for (relatively) short and noisy reads (map-pb-clr)</option>
218 <option value="asm5">Long assembly to reference mapping (-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 --min-occ-floor=100). Typically, the alignment will not extend to regions with 5% or higher sequence divergence. Only use this preset if the average divergence is far below 5%. (asm5)</option>
219 <option value="asm10">Long assembly to reference mapping (-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 --min-occ-floor=100). Up to 10% sequence divergence. (asm10)</option>
220 <option value="asm20">Long assembly to reference mapping (-k19 -w10 -A1 -B6 -O6,26 -E2,1 -s200 -z200 --min-occ-floor=100). Up to 20% sequence divergence. (asm20)</option>
221 <option value="splice">Long-read spliced alignment (-k15 -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --splice-flank=yes). In the splice mode, 1) long deletions are taken as introns and represented as the `N' CIGAR operator 2) long insertions are disabled 3) deletion and insertion gap costs are different during chaining 4) the computation of the `ms` tag ignores introns to demote hits to pseudogenes. (splice)</option>
222 <option value="splice:hq">Long-read splice alignment for PacBio CCS reads (same as `splice` but with -C5 -O6,24 -B4) (splice:hq)</option>
223 <option value="self-homology">Construct a self-homology map - use same genome as query and reference (-DP -k19 -w19 -m200) (self-homology)</option>
224 </param>
225 <section name="indexing_options" title="Indexing options"> <!-- all four settings are optional; the command template only emits the ones that are set -->
226 <param argument="-H" name="H" type="boolean" optional="true" truevalue="-H" falsevalue="" label="Use homopolymer-compressed k-mer ?"/>
227 <param argument="-k" type="integer" min="4" max="28" optional="true" label="K-mer size" help="k-mer size (no larger than 28). "/> <!-- the command template additionally ties -k to the absence of a -W k-mer dataset -->
228 <param argument="-w" type="integer" min="1" optional="true" label="Minimizer window size" help=""/>
229 <param argument="-I" type="integer" min="1" optional="true" label="Split index for every N input gigabases" help=""/>
230 </section>
231 <section name="mapping_options" title="Mapping options" help="Sets -f, -g, -F, -r, -n, -m, -X, -p, --sv-off, --min-occ-floor, --max-chain-skip and --max-chain-iter options." expanded="False"> <!-- seeding and chaining settings; each optional value is only passed to winnowmap when filled in -->
232 <param name="sv_off" argument="--sv-off" type="boolean" truevalue="--sv-off" falsevalue="" checked="False" label="Turn off SV-aware mode" help="SV aware k-mer search allows to find approximate mapping locations for a read"/>
233 <param argument="-F" type="integer" min="0" value="" optional="true"
234 label="Max fragment length for PE alignment"
235 help="The maximum apparent fragment length up to which paired-end reads are aligned together; at higher fragment lengths the mates will be aligned independent of each other; effective only for paired-end data and when spliced alignment mode is turned off; default=800" />
236 <param argument="-f" type="float" value="" optional="true" label="Filter out top FLOAT fraction of repetitive minimizers" help="default=0.0002"/>
237 <param argument="--min-occ-floor" name="min_occ_floor" type="integer" label="Force winnowmap to always use k-mers occurring this many times or fewer" help="Maximum occurrence is the number of repetitive minimizers determined by '-f' or this value, whichever is higher." optional="true" />
238 <param argument="-g" type="integer" value="" optional="true" label="Stop chain elongation if there are no minimizers in INT-bp" help="default=5000"/>
239 <param argument="-r" type="integer" value="" optional="true" label="Bandwidth used in chaining and DP-based alignment" help="default=500" />
240 <param argument="-n" type="integer" value="" optional="true" label="Minimal number of minimizers on a chain" help="default=3"/>
241 <param argument="-m" type="integer" value="" optional="true" label="Minimal chaining score (matching bases minus log gap penalty)" help="default=40"/>
242 <param argument="--max-chain-skip" type="integer" value="" optional="true"
243 label="Maximum seed skips during chaining"
244 help="A heuristic that stops chaining early. Winnowmap uses dynamic programming for chaining. The time complexity is quadratic in the number of seeds. This option makes winnowmap exit the inner loop if it repeatedly sees seeds already on chains. Set to a large number to switch off this heuristic effectively. default=25" />
245 <param argument="--max-chain-iter" type="integer" value="" optional="true"
246 label="Maximum number of partial chains checked during chaining"
247 help="A heuristic to avoid quadratic time complexity in the worst case. default=5000" />
248 <param argument="-X" type="boolean" truevalue="-X" falsevalue="" optional="true" label="Skip self and dual mappings (for the all-vs-all mode)"/>
249 <param argument="-p" type="float" value="" max="1" optional="true" label="Min secondary-to-primary score ratio" help="default=0.8"/>
250 </section>
251 <section name="alignment_options" title="Alignment options" help="Sets -A, -B, -O, -E, -z, -s, and spliced alignments options." expanded="False"> <!-- scoring, Z-drop and spliced-alignment settings -->
252 <conditional name="splicing">
253 <param name="splice_mode" type="select"
254 label="Customize spliced alignment mode?"
255 help="Warning: Winnowmap cannot currently perform spliced alignments of read pairs. If you enable spliced alignment for paired-end data it will be treated as single-end!" >
256 <option value="preset">No, use profile setting or leave turned off</option>
257 <option value="">Disable spliced alignments (overwrite profile setting if necessary)</option>
258 <option value="--splice">Yes, enable spliced alignments (--splice)</option>
259 </param>
260 <when value="preset" />
261 <when value="" /> <!-- NOTE(review): the command template in this file emits no flag for this choice, so it appears to behave like "preset" - confirm whether an explicit override was intended -->
262 <when value="--splice"> <!-- only this branch exposes the splice parameters; the command template also forces frag=no here -->
263 <param argument="-G" type="integer" value="" optional="true"
264 label="Maximum allowed gap on the reference"
265 help="Higher values cause slower spliced alignment. When in use, this option causes -r (in mapping options) to be set to the same value. default=200k" />
266 <param argument="-C" type="integer" min="0" optional="true"
267 label="Cost of non-canonical (non-GT-AG) splicing"
268 help="default=0" />
269 <param argument="-u" type="select" optional="true"
270 label="how to find GT-AG"
271 help="default=n (don't match GT-AG)">
272 <option value="n">don't match GT-AG (-un)</option>
273 <option value="f">transcript strand (-uf)</option>
274 <option value="b">both strands (-ub)</option>
275 </param>
276 <param argument="--splice-flank" type="boolean" truevalue="--splice-flank=yes" falsevalue="--splice-flank=no" checked="true"
277 label="Assume conserved flanking region of splice sites?"
278 help="Assume the next base to a GT donor site tends to be A/G (91% in human and 92% in mouse) and the preceding base to a AG acceptor tends to be C/T. This trend is evolutionarily conserved, all the way to S. cerevisiae (PMID:18688272). Specifying this option generally leads to higher junction accuracy by several percents, so it is applied by default with --splice. However, the SIRV control does not honor this trend (only ~60%) so this option reduces accuracy. If you are benchmarking winnowmap on SIRV data, please disable this option." />
279 <conditional name="splice_site_annotations">
280 <param name="use_annotations" type="select"
281 label="Use previously annotated splice sites to guide the alignment?"
282 help="">
283 <option value="no">No, perform unbiased alignment</option>
284 <option value="yes">Yes, favor annotated splice sites</option>
285 </param>
286 <when value="no" />
287 <when value="yes"> <!-- both values below are passed to winnowmap together when annotations are used -->
288 <param argument="--junc-bed" type="data" format="bed"
289 label="Dataset with annotated genes or introns"
290 help="Gene annotations should be provided in BED12 (aka 12-column BED), intron positions in 5-column BED format." />
291 <param argument="--junc-bonus" type="integer" min="1" value="1"
292 label="Annotated splice site bonus"
293 help="Score bonus for a splice donor or acceptor found in annotation." />
294 </when>
295 </conditional>
296 </when>
297 </conditional>
298 <param argument="-A" type="integer" min="0" optional="true"
299 label="Score for a sequence match" help="default=2"/>
300 <param argument="-B" type="integer" min="0" optional="true"
301 label="Penalty for a mismatch" help="-B; default=4" />
302 <param argument="-O" type="integer" min="1" optional="true"
303 label="Gap open penalties for deletions" help="-O; default=4"/>
304 <param name="O2" type="integer" min="0" optional="true"
305 label="Gap open penalties for insertions" help="-O; default=24"/> <!-- second value of -O; the command template only uses it when -O itself is set -->
306 <param argument="-E" type="integer" min="1" optional="true"
307 label="Gap extension penalties; a gap of size k cost &#39;-O + -E*k&#39;. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion"
308 help="-E; default=2"/>
309 <param name="E2" type="integer" min="0" optional="true"
310 label="Gap extension penalty for extending an insertion; if left empty uses the value specified for Gap extension penalties above"
311 help="-E; default=1"/> <!-- second value of -E; the command template only uses it when -E itself is set -->
312 <param argument="-z" type="integer" min="0" optional="true"
313 label="Z-drop threshold for truncating an alignment"
314 help="Increase to improve the contiguity of alignments at the cost of poorer alignments in the middle. default=400" />
315 <param name="z2" type="integer" min="0" optional="true"
316 label="Z-drop threshold for reverse-complementing the query"
317 help="Decrease to find small inversions at the cost of performance and false positives. default=200" /> <!-- second value of -z; the command template only uses it when -z itself is set -->
318 <param argument="-s" type="integer" min="0" optional="true"
319 label="minimal peak DP alignment score" help="default=80"/>
320 <param name="no_end_flt" type="boolean" falsevalue="--no-end-flt" truevalue="" checked="true"
321 label="Filter seeds towards the ends of chains before performing base-level alignment?" /> <!-- inverted flag: checked (the default) passes nothing, unchecked passes the no-end-flt option -->
322 </section>
323 <section name="io_options" title="Set advanced output options" help="Sets the output format and the -Q, -L, -c, --cs, --eqx, -Y and -K options." expanded="False"> <!-- output_format drives both the samtools post-processing in the command template and the datatype of the output dataset -->
324 <param name="output_format" type="select" label="Select an output format">
325 <option value="BAM">BAM</option>
326 <option value="CRAM">CRAM</option>
327 <option value="paf">paf</option>
328 </param>
329 <param argument="-Q" type="boolean" truevalue="-Q" falsevalue="" label="don't output base quality"/>
330 <param argument="-L" type="boolean" truevalue="-L" falsevalue="" label="write CIGAR with >65535 ops to the CG tag" help="Useful for very long reads in SAM/BAM format"/>
331 <param argument="-K" type="integer" optional="true" label="minibatch size for mapping (in megabyte)" help="default=500M"/>
332 <param argument="--cs" type="select" optional="true" label="Output cs tag?" help="The cs tag is a more compact standalone representation of the MD tag, see help below.">
333 <option value="none">no</option>
334 <option value="short">short</option>
335 <option value="long">long</option>
336 </param>
337 <param argument="-c" type="boolean" truevalue="-c" falsevalue="" label="Generate CIGAR"
338 help="In PAF, the CIGAR is written to the ‘cg’ custom tag." />
339
340 <param argument="--eqx" type="boolean" truevalue="--eqx" falsevalue="" label="write =/X CIGAR operators"/>
341 <param argument="-Y" type="boolean" truevalue="-Y" falsevalue="" label="use soft clipping for supplementary alignments ?"/>
342 </section>
343 </inputs>
344 <outputs>
345 <data format="bam" name="alignment_output" label="${tool.name} on ${on_string} (mapped reads in ${io_options.output_format} format)"> <!-- single output; declared as bam and switched to tabular or cram by change_format below -->
346 <actions>
347 <conditional name="reference_source.reference_source_selector"> <!-- dbkey metadata comes from the all_fasta data table for a cached reference, or from the reference dataset for a history reference -->
348 <when value="cached">
349 <action type="metadata" name="dbkey">
350 <option type="from_data_table" name="all_fasta" column="1" offset="0">
351 <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
352 <filter type="param_value" ref="reference_source.ref_file" column="0"/>
353 </option>
354 </action>
355 </when>
356 <when value="history">
357 <action type="metadata" name="dbkey">
358 <option type="from_param" name="reference_source.ref_file" param_attribute="dbkey" />
359 </action>
360 </when>
361 </conditional>
362 </actions>
363 <change_format> <!-- BAM needs no entry because bam is already the declared format -->
364 <when input="io_options.output_format" value="paf" format="tabular" />
365 <when input="io_options.output_format" value="CRAM" format="cram" />
366 </change_format>
367 </data>
368 </outputs>
369 <tests>
370 <test>
371 <!-- single-end reads against a history reference, map-ont preset, BAM output; NOTE(review): the reads file is named .fa but declared as fastqsanger - confirm -->
372 <param name="reference_source_selector" value="history" />
373 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
374 <param name="fastq_input_selector" value="single"/>
375 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
376 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
377 <param name="analysis_type_selector" value="map-ont"/>
378 <output name="alignment_output" ftype="bam" file="winnowmap-test1-fasta.bam" lines_diff="2" />
379 </test>
380 <test>
381 <!-- same inputs as the single-input test but with CRAM output; the result is compared by size only (sim_size) -->
382 <param name="reference_source_selector" value="history" />
383 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
384 <param name="fastq_input_selector" value="single"/>
385 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
386 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
387 <param name="analysis_type_selector" value="map-ont"/>
388 <param name="output_format" value="CRAM"/>
389 <output name="alignment_output" ftype="cram" file="winnowmap-test1-fasta.cram" compare="sim_size" />
390 </test>
391 <test>
392 <!-- paired input: forward and reverse reads supplied as two separate datasets -->
393 <param name="reference_source_selector" value="history" />
394 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
395 <param name="fastq_input_selector" value="paired"/>
396 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
397 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
398 <param name="analysis_type_selector" value="map-ont"/>
399 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
400 <output name="alignment_output" ftype="bam" file="winnowmap-test1.bam" lines_diff="2" />
401 </test>
402 <test>
403 <!-- paired input with the forward file gzip-compressed; expects the same BAM as the uncompressed paired test -->
404 <param name="reference_source_selector" value="history" />
405 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
406 <param name="fastq_input_selector" value="paired"/>
407 <param name="fastq_input1" ftype="fastqsanger.gz" value="bwa-mem-fastq1.fq.gz"/>
408 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
409 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
410 <param name="analysis_type_selector" value="map-ont"/>
411 <output name="alignment_output" ftype="bam" file="winnowmap-test1.bam" lines_diff="2" />
412 </test>
413 <test>
414 <!-- paired reads supplied as a paired collection; no preset profile is selected in this test -->
415 <param name="reference_source_selector" value="history" />
416 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
417 <param name="fastq_input_selector" value="paired_collection"/>
418 <param name="fastq_input1">
419 <collection type="paired">
420 <element name="forward" value="bwa-mem-fastq1.fq" />
421 <element name="reverse" value="bwa-mem-fastq2.fq" />
422 </collection>
423 </param>
424 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
425 <output name="alignment_output" ftype="bam" file="winnowmap-test2.bam" lines_diff="2" />
426 </test>
427 <test>
428 <!-- reference chosen from the all_fasta data table (cached) instead of the history; expects the same BAM as the single-input test -->
429 <param name="reference_source_selector" value="cached" />
430 <param name="ref_file" value="bwa-mem-mt-genome"/>
431 <param name="fastq_input_selector" value="single"/>
432 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
433 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
434 <param name="analysis_type_selector" value="map-ont"/>
435 <output name="alignment_output" ftype="bam" file="winnowmap-test1-fasta.bam" lines_diff="2" />
436 </test>
437 <test>
438 <!-- explicit alignment scoring options plus min_occ_floor, using a cached reference -->
439 <param name="reference_source_selector" value="cached" />
440 <param name="min_occ_floor" value="1000"/>
441 <param name="ref_file" value="bwa-mem-mt-genome"/>
442 <param name="fastq_input_selector" value="single"/>
443 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
444 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
445 <param name="analysis_type_selector" value="map-ont"/>
446 <section name="alignment_options">
447 <!-- the following settings correspond to the defaults for "sr"
448 mode. The purpose is to check that all alignment params get
449 parsed correctly. -->
450 <param name="A" value="2" />
451 <param name="B" value="8" />
452 <param name="O" value="12" />
453 <param name="O2" value="32" />
454 <param name="E" value="2" />
455 <param name="E2" value="1" />
456 <param name="z" value="400" />
457 <param name="s" value="40" />
458 </section>
459 <output name="alignment_output" ftype="bam" file="winnowmap-test1-fasta.bam" lines_diff="2" />
460 </test>
461 <test>
462 <!-- PAF (tabular) output; the same FASTQ dataset is used as both reference and reads -->
463 <param name="reference_source_selector" value="history" />
464 <param name="ref_file" ftype="fastqsanger" value="mini_reads.fq" />
465 <param name="fastq_input_selector" value="single"/>
466 <param name="fastq_input1" ftype="fastqsanger" value="mini_reads.fq" />
467 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
468 <param name="analysis_type_selector" value="map-ont"/>
469 <param name="output_format" value="paf"/>
470 <output name="alignment_output" ftype="tabular" file="mini_reads.paf" />
471 </test>
472 <test>
473 <!-- self-homology preset: the same FASTA serves as reference and query, and no high-frequency k-mer dataset is given -->
474 <param name="reference_source_selector" value="history" />
475 <param name="ref_file" ftype="fasta" value="winnowmap-self-homology.fasta" />
476 <param name="fastq_input_selector" value="single" />
477 <param name="fastq_input1" ftype="fasta" value="winnowmap-self-homology.fasta" />
478 <param name="analysis_type_selector" value="self-homology" />
479 <output name="alignment_output" ftype="bam" file="winnowmap-self-homology.bam" lines_diff="2" />
480 </test>
481 <test>
482 <!-- sv_off switched on; otherwise the same inputs as the single-input test, with its own expected BAM -->
483 <param name="reference_source_selector" value="history" />
484 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
485 <param name="fastq_input_selector" value="single"/>
486 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
487 <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
488 <param name="analysis_type_selector" value="map-ont"/>
489 <param name="sv_off" value="True"/>
490 <output name="alignment_output" ftype="bam" file="winnowmap-test2-fasta.bam" lines_diff="2" />
491 </test>
492 </tests>
493 <help>
494
495 Users’ Guide
496 ------------
497
498 Winnowmap is a long-read mapping algorithm optimized for mapping ONT and
499 PacBio reads to repetitive reference sequences. Winnowmap development began
500 on top of minimap2 codebase, and since then we have incorporated the
501 following two ideas to improve mapping accuracy within repeats.
502
503 Winnowmap implements a novel weighted minimizer sampling algorithm (>=v1.0).
504 This optimization was motivated by the need to avoid masking of frequently
505 occurring k-mers during the seeding stage in an efficient manner, and achieve
506 better mapping accuracy in complex repeats (e.g., long tandem repeats) of
507 the human genome. Using weighted minimizers, Winnowmap down-weights
508 frequently occurring k-mers, thus reducing their chance of getting selected
509 as minimizers. Users can refer to this paper for more details. This idea is
510 helpful to preserve the theoretical guarantee of minimizer sampling technique,
511 i.e., if two sequences share a substring of a specified length, then they
512 must be guaranteed to have a matching minimizer.
513
514 We noticed that the highest scoring alignment doesn't necessarily correspond
515 to correct placement of reads in repetitive regions of T2T human chromosomes.
516 In the presence of a non-reference allele within a repeat, a read sampled
517 from that region could be mapped to an incorrect repeat copy because the
518 standard pairwise sequence alignment scoring system penalizes true variants.
519 This is also sometimes referred to as allelic bias. To address this bias,
520 we introduced and implemented an idea of using minimal confidently alignable
521 substrings (>=v2.0). These are minimal-length substrings in a read that align
522 end-to-end to a reference with mapping quality score above a user-specified
523 threshold. This approach treats each read mapping as a collection of
524 confident sub-alignments, which is more tolerant of structural variation
525 and more sensitive to paralog-specific variants (PSVs). Our most recent
526 paper describes this concept and benchmarking results.
527
528 General usage
529 ~~~~~~~~~~~~~
530
531 For either mapping long reads or computing whole-genome alignments, Winnowmap
532 requires pre-computing high frequency k-mers (e.g., top 0.02% most frequent)
533 in a reference. Winnowmap uses meryl k-mer counting tool for this purpose.
534
535 Mapping ONT or PacBio-hifi WGS reads
536
537
538 .. code::
539
540 meryl count k=15 output merylDB ref.fa
541 meryl print greater-than distinct=0.9998 merylDB > repetitive_k15.txt
542
543
544 .. code::
545
546 winnowmap -W repetitive_k15.txt -ax map-ont ref.fa ont.fq.gz > output.sam [OR]
547 winnowmap -W repetitive_k15.txt -ax map-pb ref.fa hifi.fq.gz > output.sam
548
549 Mapping genome assemblies
550
551
552 .. code::
553
554 meryl count k=19 output merylDB asm1.fa
555 meryl print greater-than distinct=0.9998 merylDB > repetitive_k19.txt
556
557
558 .. code::
559
560 winnowmap -W repetitive_k19.txt -ax asm20 asm1.fa asm2.fa > output.sam
561
562 For the genome-to-genome use case, it may be useful to visualize the dot plot. This perl script can be used to generate a dot plot from paf-formatted output. In both usage cases, pre-computing repetitive k-mers using meryl is quite fast, e.g., it typically takes 2-3 minutes for the human genome reference.
563
564 Use cases
565 ~~~~~~~~~
566
567 Winnowmap uses the same base algorithm for all applications. However, due
568 to the different data types it supports, Winnowmap needs to be tuned for
569 optimal performance and accuracy. It is usually recommended to choose a
570 preset with option **-x**, which sets multiple parameters at the same
571 time. The default setting is the same as ``map-ont``.
572
573 Map long noisy genomic reads
574 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
575
576 The difference between ``map-pb`` and ``map-ont`` is that ``map-pb``
577 uses homopolymer-compressed (HPC) minimizers as seeds, while ``map-ont``
578 uses ordinary minimizers as seeds. Empirical evaluation suggests HPC
579 minimizers improve performance and sensitivity when aligning PacBio
580 reads, but hurt when aligning Nanopore reads.
581
582 Map long mRNA/cDNA reads
583 ^^^^^^^^^^^^^^^^^^^^^^^^
584
585 There are different long-read RNA-seq technologies, including
586 traditional full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq
587 and Direct RNA-seq. They produce data of varying quality and properties.
588 By default, ``-x splice`` assumes the read orientation relative to the
589 transcript strand is unknown. It tries two rounds of alignment to infer
590 the orientation and write the strand to the ``ts`` SAM/PAF tag if
591 possible. For Iso-seq, Direct RNA-seq and traditional full-length
592 cDNAs, it would be desired to apply ``-u f`` to force Winnowmap to
593 consider the forward transcript strand only. This speeds up alignment
594 with slight improvement to accuracy. For noisy Nanopore Direct RNA-seq
595 reads, it is recommended to use a smaller k-mer size for increased
596 sensitivity to the first or the last exons.
597
598 It is worth noting that by default ``-x splice`` prefers
599 GT[A/G]..[C/T]AG over GT[C/T]..[A/G]AG, and then over other splicing
600 signals. Considering one additional base improves the junction accuracy
601 for noisy reads, but reduces the accuracy when aligning against the
602 widely used SIRV control data. This is because SIRV does not honor the
603 evolutionarily conservative splicing signal. If you are studying SIRV,
604 you may apply ``--splice-flank=no`` to let Winnowmap only model GT..AG,
605 ignoring the additional base.
606
607
608 Map short accurate genomic reads
609 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
610
611 When two read files are specified, Winnowmap reads from each file in turn
612 and merges them into an interleaved stream internally. Two reads are
613 considered to be paired if they are adjacent in the input stream and
614 have the same name (with the ``/[0-9]`` suffix trimmed if present).
615 Single- and paired-end reads can be mixed.
616
617 Winnowmap does not work well with short spliced reads. There are many
618 capable RNA-seq mappers for short reads.
619
620 Full genome/assembly alignment
621 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
622
623 For cross-species full-genome alignment, the scoring system needs to be
624 tuned according to the sequence divergence.
625
626 Self-homology map creation
627 ^^^^^^^^^^^^^^^^^^^^^^^^^^
628
629 A self-homology map is created by mapping a genome (e.g. that of E. coli)
630 against itself. When this option is used the same FASTA file should
631 be used for reference and for the (single ended mode) query.
632
633 Advanced features
634 ~~~~~~~~~~~~~~~~~
635
636 Working with >65535 CIGAR operations
637 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
638
639 Due to a design flaw, BAM does not work with CIGAR strings with >65535
640 operations (SAM and CRAM work). However, for ultra-long nanopore reads
641 Winnowmap may align ~1% of read bases with long CIGARs beyond the
642 capability of BAM. If you convert such SAM/CRAM to BAM, Picard and
643 recent samtools will throw an error and abort. Older samtools and other
644 tools may create corrupted BAM.
645
646 To avoid this issue, you can add option ``-L`` at the Winnowmap command line.
647 This option moves a long CIGAR to the ``CG`` tag and leaves a fully clipped
648 CIGAR at the SAM CIGAR column. Current tools that don’t read CIGAR
649 (e.g. merging and sorting) still work with such BAM records; tools that read
650 CIGAR will effectively ignore these records. It has been decided that future
651 tools will seamlessly recognize long-cigar records generated by option ``-L``.
652
653 **TL;DR**: if you work with ultra-long reads and use tools that only
654 process BAM files, please add option ``-L``.
655
656 The cs optional tag
657 ^^^^^^^^^^^^^^^^^^^
658
659 The ``cs`` SAM/PAF tag encodes bases at mismatches and INDELs. It
660 matches regular expression
661 ``/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/``. Like CIGAR, ``cs``
662 consists of a series of operations. Each leading character specifies the
663 operation; the following sequence is the one involved in the operation.
664
665 The ``cs`` tag is enabled by command line option ``--cs``. The following
666 alignment, for example:
667
668 .. code::
669
670 CGATCGATAAATAGAGTAG---GAATAGCA
671 |||||| |||||||||| |||| |||
672 CGATCG---AATAGAGTAGGTCGAATtGCA
673
674 is represented as ``:6-ata:10+gtc:4*at:3``, where ``:[0-9]+`` represents
675 an identical block, ``-ata`` represents a deletion, ``+gtc`` an insertion
676 and ``*at`` indicates reference base ``a`` is substituted with a query
677 base ``t``. It is similar to the ``MD`` SAM tag but is standalone and
678 easier to parse.
679
680 If ``--cs=long`` is used, the ``cs`` string also contains identical
681 sequences in the alignment. The above example will become
682 ``=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA``. The long form of ``cs``
683 encodes both reference and query sequences in one string.
684
685 Benchmarking
686 ~~~~~~~~~~~~
687
688 When comparing Winnowmap (v1.0) to minimap2 (v2.17-r954), we observed
689 a reduction in the mapping error-rate from 0.14% to 0.06% in the recently
690 finished human X chromosome, and from 3.6% to 0% within the highly repetitive
691 X centromere (3.1 Mbp). Winnowmap improves mapping accuracy within repeats
692 and achieves these results with sparser sampling, leading to better index
693 compression and competitive runtimes. By avoiding masking, we show that
694 Winnowmap maintains uniform minimizer density.
695
696 </help>
697 <expand macro="citations"/>
698 </tool>