comparison minimap2.xml @ 0:2445d53549ba draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/minimap2 commit 7cb87c310b34cb2af2547ad8a14679107fd86d5d
author iuc
date Sat, 04 Nov 2017 05:41:11 -0400
parents
children b103bc946f57
comparison
equal deleted inserted replaced
-1:000000000000 0:2445d53549ba
1 <?xml version="1.0"?>
2 <tool id="minimap2" name="Map with minimap2" version="2.3" profile="17.01">
3 <description>A fast pairwise aligner for genomic and spliced nucleotide sequences</description>
4 <requirements> <!-- minimap2 performs the mapping; samtools sorts the piped alignments and writes the final file (see the command template) -->
5 <requirement type="package" version="2.3">minimap2</requirement>
6 <requirement type="package" version="1.6">samtools</requirement>
7 </requirements>
8 <version_command>minimap2 --version</version_command>
9 <command>
10 <![CDATA[
11 #if $reference_source.reference_source_selector == 'history':
12 ln -f -s '$reference_source.ref_file' reference.fa &&
13 #else:
14 ln -f -s '$reference_source.ref_file.fields.path' reference.fa &&
15 #end if
16 minimap2 -a
17 -x $analysis_type_selector
18 ## Indexing options: each optional flag below is emitted only when the user supplied a value
19 #if $indexing_options.k:
20 -k $indexing_options.k
21 #end if
22 #if $indexing_options.w:
23 -w $indexing_options.w
24 #end if
25 #if $indexing_options.I:
26 -I $indexing_options.I
27 #end if
28 ## Mapping options (X is a boolean that expands to the literal flag or to nothing)
29 #if $mapping_options.f:
30 -f $mapping_options.f
31 #end if
32 #if $mapping_options.g:
33 -g $mapping_options.g
34 #end if
35 #if $mapping_options.G:
36 -G $mapping_options.G
37 #end if
38 #if $mapping_options.F:
39 -F $mapping_options.F
40 #end if
41 #if $mapping_options.r:
42 -r $mapping_options.r
43 #end if
44 #if $mapping_options.n:
45 -n $mapping_options.n
46 #end if
47 #if $mapping_options.m:
48 -m $mapping_options.m
49 #end if
50 $mapping_options.X
51 #if $mapping_options.p:
52 -p $mapping_options.p
53 #end if
54 #if $mapping_options.N:
55 -N $mapping_options.N
56 #end if
57 ## Alignment options (O2 and E2 are the optional second values of -O and -E)
58 #if $alignment_options.A:
59 -A $alignment_options.A
60 #end if
61 #if $alignment_options.B:
62 -B $alignment_options.B
63 #end if
64 ## Emit -O exactly once: with the second value when O2 is set, otherwise with O alone
65 #if $alignment_options.O and $alignment_options.O2:
66 -O $alignment_options.O,$alignment_options.O2
67 #else if $alignment_options.O:
68 -O $alignment_options.O
69 #end if
70 #if $alignment_options.E:
71 #if $alignment_options.E2:
72 -E $alignment_options.E,$alignment_options.E2
73 #else
74 -E $alignment_options.E
75 #end if
76 #end if
77 #if $alignment_options.z:
78 -z $alignment_options.z
79 #end if
80 #if $alignment_options.s:
81 -s $alignment_options.s
82 #end if
83 #if $alignment_options.u:
84 -u $alignment_options.u
85 #end if
86 ## Output options; the minimap2 output is piped to samtools sort, which writes the format chosen in output_format
87 $io_options.Q
88 $io_options.L
89 #if $io_options.cs:
90 --cs=$io_options.cs
91 #end if
92 #if $io_options.K:
93 -K $io_options.K
94 #end if
95 -t \${GALAXY_SLOTS:-4}
96 reference.fa
97 #if $fastq_input.fastq_input_selector in ['single', 'paired_iv']:
98 '$fastq_input.fastq_input1'
99 #else if $fastq_input.fastq_input_selector == 'paired':
100 '$fastq_input.fastq_input1' '$fastq_input.fastq_input2'
101 #else if $fastq_input.fastq_input_selector == 'paired_collection':
102 '$fastq_input.fastq_input1.forward' '$fastq_input.fastq_input1.reverse'
103 #end if
104 | samtools sort
105 -@\${GALAXY_SLOTS:-2}
106 -O $io_options.output_format
107 #if $io_options.output_format == 'CRAM':
108 --reference reference.fa
109 #end if
110 -o '$alignment_output'
111 ]]>
112 </command>
113 <inputs>
114 <conditional name="reference_source"> <!-- Where the reference comes from; in both cases the command symlinks it to reference.fa -->
115 <param name="reference_source_selector" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options. See `Indexes` section of help below">
116 <option value="cached">Use a built-in genome index</option>
117 <option value="history">Use a genome from history and build index</option>
118 </param>
119 <when value="cached"> <!-- ref_file is an entry of the all_fasta data table; the command links its path field -->
120 <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
121 <options from_data_table="all_fasta">
122 <filter type="sort_by" column="2" />
123 <validator type="no_options" message="No reference genomes are available" />
124 </options>
125 <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
126 </param>
127 </when>
128 <when value="history"> <!-- ref_file is a FASTA dataset from the history; the command links the dataset file itself -->
129 <param name="ref_file" type="data" format="fasta" label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence to the history and use it as reference" />
130 </when>
131 </conditional>
132 <section name="indexing_options" title="Indexing options"> <!-- Optional k, w and I values; the command passes each one only when it is set -->
133 <!-- Disabled: the -H (homopolymer-compressed k-mer) switch does not seem to override the sr preset properly
134 <param argument="-H" name="H" type="boolean" optional="true" truevalue="-H" falsevalue="" label="Use homopolymer-compressed k-mer ?"/>
135 -->
136 <param argument="-k" type="integer" min="4" max="28" optional="true" label="k-mer size" help=""/>
137 <param argument="-w" type="integer" min="1" optional="true" label="minimizer window size" help=""/>
138 <param argument="-I" type="integer" min="1" optional="true" label="split index for every N input gigabases" help=""/>
139 </section>
140 <!-- start of the read-input selector, copied unchanged from the bwa-mem wrapper -->
141 <conditional name="fastq_input">
142 <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
143 <option value="single">Single</option>
144 <option value="paired">Paired</option>
145 <option value="paired_collection">Paired Collection</option>
146 <option value="paired_iv">Paired Interleaved</option>
147 </param>
148 <when value="paired"> <!-- two datasets; the command passes fastq_input1 then fastq_input2 -->
149 <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select first set of reads" help="Specify dataset with forward reads"/>
150 <param name="fastq_input2" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select second set of reads" help="Specify dataset with reverse reads"/>
151 </when>
152 <when value="single"> <!-- one dataset, passed as the only read file -->
153 <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with single reads"/>
154 </when>
155 <when value="paired_collection"> <!-- one paired collection; the command passes its forward and reverse elements -->
156 <param name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/>
157 </when>
158 <when value="paired_iv"> <!-- interleaved reads in one dataset; the command handles it exactly like "single" -->
159 <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
160 </when>
161 </conditional>
162 <!-- end of the read-input selector copied unchanged from the bwa-mem wrapper -->
163 <param name="analysis_type_selector" type="select" label="Select analysis mode (sets default)"> <!-- The selected value is passed to minimap2 as the -x preset -->
164 <option value="map-pb">-Hk19 (PacBio vs reference mapping)</option>
165 <option value="map-ont">-k15 (Oxford Nanopore vs reference mapping)</option>
166 <option value="asm5">-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5% div.)</option>
167 <option value="asm10">-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10% div.)</option>
168 <option value="ava-pb">-Hk19 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (PacBio read overlap)</option>
169 <option value="ava-ont">-k15 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (ONT read overlap)</option>
170 <option value="splice">long-read spliced alignment</option>
171 <option value="sr">short single-end reads without splicing</option>
172 </param>
173 <section name="mapping_options" title="Set advanced mapping options" help="Sets -f, -g, -G, -F, -r, -n, -m, -X, -p and -N options." expanded="False"> <!-- All values are optional; the command adds a numeric flag only when a value was entered, and X expands to the flag itself or to nothing -->
174 <param argument="-f" type="float" value="" optional="true" label="filter out top FLOAT fraction of repetitive minimizers" help="default=0.0002"/>
175 <param argument="-g" type="integer" value="" optional="true" label="stop chain elongation if there are no minimizers in INT-bp" help="default=5000"/>
176 <param argument="-G" type="integer" value="" optional="true" label="max intron length in thousand (effective with -xsplice; changing -r)" help="default=200"/>
177 <param argument="-F" type="integer" value="" optional="true" label="max fragment length (effective with -xsr or in the fragment mode)" help="default=800" />
178 <param argument="-r" type="integer" value="" optional="true" label="bandwidth used in chaining and DP-based alignment" help="default=500" />
179 <param argument="-n" type="integer" value="" optional="true" label="minimal number of minimizers on a chain" help="default=3"/>
180 <param argument="-m" type="integer" value="" optional="true" label="minimal chaining score (matching bases minus log gap penalty)" help="default=40"/>
181 <param argument="-X" type="boolean" truevalue="-X" falsevalue="" optional="true" label="skip self and dual mappings (for the all-vs-all mode)"/>
182 <param argument="-p" type="float" value="" max="1" optional="true" label="min secondary-to-primary score ratio" help="default=0.8"/>
183 <param argument="-N" type="integer" min="0" optional="true" label="retain at most INT secondary alignments" help="default=5"/>
184 </section>
185 <section name="alignment_options" title="Set advanced alignment options" help="Sets -A, -B, -O, -E, -z, -s and -u options." expanded="False"> <!-- O2 and E2 have no flag of their own: the command appends them to -O and -E as a comma-separated second value, so their names must match what the template reads -->
186 <param argument="-A" type="integer" optional="true" label="Score for a sequence match" help="default=2"/>
187 <param argument="-B" type="integer" optional="true" label="Penalty for a mismatch" help="-B; default=4" />
188 <param argument="-O" type="integer" min="0" optional="true" label="Gap open penalties for deletions" help="-O; default=4"/>
189 <param name="O2" type="integer" min="0" optional="true" label="Gap open penalties for insertions" help="-O; default=24"/>
190 <param argument="-E" type="integer" min="0" optional="true" label="Gap extension penalties; a gap of size k cost &#39;-O + -E*k&#39;. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion" help="-E; default=2"/>
191 <param name="E2" type="integer" min="0" optional="true" label="Gap extension penalty for extending an insertion; if left empty uses the value specified for Gap extension penalties above" help="-E; default=1"/>
192 <param argument="-z" type="integer" optional="true" label="Z-drop score" help="default=400"/>
193 <param argument="-s" type="integer" optional="true" label="minimal peak DP alignment score" help="default=80"/>
194 <param argument="-u" type="select" optional="true" label="how to find GT-AG">
195 <option value="n">don't match GT-AG</option>
196 <option value="f">transcript strand</option>
197 <option value="b">both strands</option>
198 </param>
199 </section>
200 <section name="io_options" title="Set advanced output options" help="Sets the output format and the -Q, -L, -K and --cs options." expanded="False"> <!-- output_format is consumed by samtools sort and by the change_format rule of the output; the other values go to minimap2 -->
201 <param name="output_format" type="select" label="Produce BAM or CRAM file?">
202 <option value="BAM">BAM</option>
203 <option value="CRAM">CRAM</option>
204 </param>
205 <param argument="-Q" type="boolean" truevalue="-Q" falsevalue="" optional="true" label="don't output base quality"/>
206 <param argument="-L" type="boolean" truevalue="-L" falsevalue="" optional="true" label="write CIGAR with >65535 ops to the CG tag" help="Useful for very long reads in SAM/BAM format"/>
207 <param argument="-K" type="integer" optional="true" label="minibatch size for mapping (in megabyte)" help="default=500M"/>
208 <param argument="--cs" type="select" optional="true" label="Output cs tag?" help="The cs tag is a more compact standalone representation of the MD tag, see help below.">
209 <option value="none">no</option>
210 <option value="short">short</option>
211 <option value="long">long</option>
212 </param>
213 </section>
214 </inputs>
215 <outputs>
216 <data format="bam" name="alignment_output" label="${tool.name} on ${on_string} (mapped reads in ${io_options.output_format} format)"> <!-- Declared as bam; the change_format rule below switches it to cram when CRAM is selected -->
217 <actions>
218 <conditional name="reference_source.reference_source_selector">
219 <when value="cached"> <!-- dbkey is read from column 1 of the all_fasta table, filtered on column 0 by reference_source.ref_file -->
220 <action type="metadata" name="dbkey">
221 <option type="from_data_table" name="all_fasta" column="1" offset="0">
222 <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
223 <filter type="param_value" ref="reference_source.ref_file" column="0"/>
224 </option>
225 </action>
226 </when>
227 <when value="history"> <!-- dbkey is taken from the dbkey attribute of the reference dataset chosen from the history -->
228 <action type="metadata" name="dbkey">
229 <option type="from_param" name="reference_source.ref_file" param_attribute="dbkey" />
230 </action>
231 </when>
232 </conditional>
233 </actions>
234 <change_format>
235 <when input="io_options.output_format" value="CRAM" format="cram" />
236 </change_format>
237 </data>
238 </outputs>
239 <tests>
240 <test>
241 <!-- test single input: FASTA reads against a reference from history, sr preset; the read fixture is FASTA and is declared as such -->
242 <param name="reference_source_selector" value="history" />
243 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
244 <param name="fastq_input_selector" value="single"/>
245 <param name="fastq_input1" ftype="fasta" value="bwa-mem-fasta1.fa"/>
246 <param name="analysis_type_selector" value="sr"/>
247 <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="2" />
248 </test>
249 <test>
250 <!-- test cram output: same inputs as the single-input test with output_format set to CRAM; compared by size only (sim_size) -->
251 <param name="reference_source_selector" value="history" />
252 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
253 <param name="fastq_input_selector" value="single"/>
254 <param name="fastq_input1" ftype="fasta" value="bwa-mem-fasta1.fa"/>
255 <param name="analysis_type_selector" value="sr"/>
256 <param name="output_format" value="CRAM"/>
257 <output name="alignment_output" ftype="cram" file="minimap2-test1-fasta.cram" compare="sim_size" />
258 </test>
259 <test>
260 <!-- test paired input: the two mates are supplied as separate fastqsanger datasets -->
261 <param name="reference_source_selector" value="history" />
262 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
263 <param name="fastq_input_selector" value="paired"/>
264 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
265 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
266 <param name="analysis_type_selector" value="sr"/>
267 <output name="alignment_output" ftype="bam" file="minimap2-test1.bam" lines_diff="2" />
268 </test>
269 <test>
270 <!-- test paired input where only the first mate file is gzip-compressed; expects the same BAM as the uncompressed paired test -->
271 <param name="reference_source_selector" value="history" />
272 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
273 <param name="fastq_input_selector" value="paired"/>
274 <param name="fastq_input1" ftype="fastqsanger.gz" value="bwa-mem-fastq1.fq.gz"/>
275 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
276 <param name="analysis_type_selector" value="sr"/>
277 <output name="alignment_output" ftype="bam" file="minimap2-test1.bam" lines_diff="2" />
278 </test>
279 <test>
280 <!-- test collection input: forward and reverse mates are supplied as one paired collection -->
281 <param name="reference_source_selector" value="history" />
282 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
283 <param name="fastq_input_selector" value="paired_collection"/>
284 <param name="fastq_input1">
285 <collection type="paired">
286 <element name="forward" value="bwa-mem-fastq1.fq" />
287 <element name="reverse" value="bwa-mem-fastq2.fq" />
288 </collection>
289 </param>
290 <param name="analysis_type_selector" value="sr"/>
291 <output name="alignment_output" ftype="bam" file="minimap2-test2.bam" lines_diff="2" />
292 </test>
293 <test>
294 <!-- test data table reference: the reference is picked from the built-in all_fasta table instead of the history; the read fixture is FASTA and is declared as such -->
295 <param name="reference_source_selector" value="cached" />
296 <param name="ref_file" value="bwa-mem-mt-genome"/>
297 <param name="fastq_input_selector" value="single"/>
298 <param name="fastq_input1" ftype="fasta" value="bwa-mem-fasta1.fa"/>
299 <param name="analysis_type_selector" value="sr"/>
300 <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="2" />
301 </test>
302 </tests>
303 <help>
304
305 Users’ Guide
306 ------------
307
308 Minimap2 is a versatile sequence alignment program that aligns DNA or
309 mRNA sequences against a large reference database. Typical use cases
310 include: (1) mapping PacBio or Oxford Nanopore genomic reads to the
311 human genome; (2) finding overlaps between long reads with error rate up
312 to ~15%; (3) splice-aware alignment of PacBio Iso-Seq or Nanopore cDNA
313 or Direct RNA reads against a reference genome; (4) aligning Illumina
314 single- or paired-end reads; (5) assembly-to-assembly alignment; (6)
315 full-genome alignment between two closely related species with
316 divergence below ~15%.
317
318 For ~10kb noisy read sequences, minimap2 is tens of times faster than
319 mainstream long-read mappers such as BLASR, BWA-MEM, NGMLR and GMAP. It
320 is more accurate on simulated long reads and produces biologically
321 meaningful alignment ready for downstream analyses. For >100bp Illumina
322 short reads, minimap2 is three times as fast as BWA-MEM and Bowtie2, and
323 as accurate on simulated data. Detailed evaluations are available from
324 the `minimap2 preprint`.
325
326 General usage
327 ~~~~~~~~~~~~~
328
329 Minimap2 seamlessly works with gzip’d FASTA and FASTQ formats as input.
330 You don’t need to convert between FASTA and FASTQ or decompress gzip’d
331 files first.
332
333 For the human reference genome, minimap2 takes a few minutes to generate
334 a minimizer index for the reference before mapping. To reduce indexing
335 time, you can optionally save the index with option **-d** and replace
336 the reference sequence file with the index file on the minimap2 command
337 line.
338
339 ***Importantly***, it should be noted that once you build the index,
340 indexing parameters such as **-k**, **-w**, **-H** and **-I** can’t be
341 changed during mapping. If you are running minimap2 for different data
342 types, you will probably need to keep multiple indexes generated with
343 different parameters. This makes minimap2 different from BWA which
344 always uses the same index regardless of query data types.
345
346 Use cases
347 ~~~~~~~~~
348
349 Minimap2 uses the same base algorithm for all applications. However, due
350 to the different data types it supports (e.g. short vs long reads; DNA
351 vs mRNA reads), minimap2 needs to be tuned for optimal performance and
352 accuracy. It is usually recommended to choose a preset with option
353 **-x**, which sets multiple parameters at the same time. The default
354 setting is the same as ``map-ont``.
355
356 Map long noisy genomic reads
357 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
358
359 The difference between ``map-pb`` and ``map-ont`` is that ``map-pb``
360 uses homopolymer-compressed (HPC) minimizers as seeds, while ``map-ont``
361 uses ordinary minimizers as seeds. Empirical evaluation suggests HPC
362 minimizers improve performance and sensitivity when aligning PacBio
363 reads, but hurt when aligning Nanopore reads.
364
365 Map long mRNA/cDNA reads
366 ^^^^^^^^^^^^^^^^^^^^^^^^
367
368
369 There are different long-read RNA-seq technologies, including
370 traditional full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq
371 and Direct RNA-seq. They produce data of varying quality and properties.
372 By default, ``-x splice`` assumes the read orientation relative to the
373 transcript strand is unknown. It tries two rounds of alignment to infer
374 the orientation and writes the strand to the ``ts`` SAM/PAF tag if
375 possible. For Iso-seq, Direct RNA-seq and traditional full-length
376 cDNAs, it would be desired to apply ``-u f`` to force minimap2 to
377 consider the forward transcript strand only. This speeds up alignment
378 with slight improvement to accuracy. For noisy Nanopore Direct RNA-seq
379 reads, it is recommended to use a smaller k-mer size for increased
380 sensitivity to the first or the last exons.
381
382 It is worth noting that by default ``-x splice`` prefers
383 GT[A/G]..[C/T]AG over GT[C/T]..[A/G]AG, and then over other splicing
384 signals. Considering one additional base improves the junction accuracy
385 for noisy reads, but reduces the accuracy when aligning against the
386 widely used SIRV control data. This is because SIRV does not honor the
387 evolutionarily conservative splicing signal. If you are studying SIRV,
388 you may apply ``--splice-flank=no`` to let minimap2 only model GT..AG,
389 ignoring the additional base.
390
391 Find overlaps between long reads
392 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
393
394 Similarly, ``ava-pb`` uses HPC minimizers while ``ava-ont`` uses
395 ordinary minimizers. It is usually not recommended to perform base-level
396 alignment in the overlapping mode because it is slow and may produce
397 false positive overlaps. However, if performance is not a concern, you
398 may try to add ``-a`` or ``-c`` anyway.
399
400 Map short accurate genomic reads
401 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
402
403
404 When two read files are specified, minimap2 reads from each file in turn
405 and merges them into an interleaved stream internally. Two reads are
406 considered to be paired if they are adjacent in the input stream and
407 have the same name (with the ``/[0-9]`` suffix trimmed if present).
408 Single- and paired-end reads can be mixed.
409
410 Minimap2 does not work well with short spliced reads. There are many
411 capable RNA-seq mappers for short reads.
412
413 Full genome/assembly alignment
414 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
415
416 For cross-species full-genome alignment, the scoring system needs to be
417 tuned according to the sequence divergence.
418
419 Advanced features
420 ~~~~~~~~~~~~~~~~~
421
422 Working with >65535 CIGAR operations
423 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
424
425 Due to a design flaw, BAM does not work with CIGAR strings with >65535
426 operations (SAM and CRAM work). However, for ultra-long nanopore reads
427 minimap2 may align ~1% of read bases with long CIGARs beyond the
428 capability of BAM. If you convert such SAM/CRAM to BAM, Picard and
429 recent samtools will throw an error and abort. Older samtools and other
430 tools may create corrupted BAM.
431
432 To avoid this issue, you can add option ``-L`` at the minimap2 command
433 line. This option moves a long CIGAR to the ``CG`` tag and leaves a
434 fully clipped CIGAR at the SAM CIGAR column. Current tools that don’t
435 read CIGAR (e.g. merging and sorting) still work with such BAM records;
436 tools that read CIGAR will effectively ignore these records. I have pull
437 requests to the SAM spec, htslib, htsjdk, bedtools2, Rsamtools and
438 igv.js. If they are accepted, future versions of these tools will
439 seamlessly recognize long-cigar records generated by option ``-L``.
440
441 **TL;DR**: if you work with ultra-long reads and use tools that only
442 process BAM files, please add option ``-L``.
443
444 The cs optional tag
445 ^^^^^^^^^^^^^^^^^^^
446
447 The ``cs`` SAM/PAF tag encodes bases at mismatches and INDELs. It
448 matches regular expression
449 ``/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/``. Like CIGAR, ``cs``
450 consists of series of operations. Each leading character specifies the
451 operation; the following sequence is the one involved in the operation.
452
453 The ``cs`` tag is enabled by command line option ``--cs``. The following
454 alignment, for example:
455
456 .. code::
457
458 CGATCGATAAATAGAGTAG---GAATAGCA
459 |||||| |||||||||| |||| |||
460 CGATCG---AATAGAGTAGGTCGAATtGCA
461
462 is represented as ``:6-ata:10+gtc:4*at:3``, where ``:[0-9]+`` represents
463 an identical block, ``-ata`` represents a deletion, ``+gtc`` an insertion
464 and ``*at`` indicates reference base ``a`` is substituted with a query
465 base ``t``. It is similar to the ``MD`` SAM tag but is standalone and
466 easier to parse.
467
468 If ``--cs=long`` is used, the ``cs`` string also contains identical
469 sequences in the alignment. The above example will become
470 ``=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA``. The long form of ``cs``
471 encodes both reference and query sequences in one string.
472
473 Algorithm overview
474 ~~~~~~~~~~~~~~~~~~
475
476 In the following, minimap2 command line options have a dash ahead and
477 are highlighted in bold. The description may help to tune minimap2
478 parameters.
479
480 1. Read **-I** [=*4G*] reference bases, extract
481 (**-k**,\ **-w**)-minimizers and index them in a hash table.
482
483 2. Read **-K** [=*200M*] query bases. For each query sequence, do step 3
484 through 7:
485
486 3. For each (**-k**,\ **-w**)-minimizer on the query, check against the
487 reference index. If a reference minimizer is not among the top **-f**
488 [=*2e-4*] most frequent, collect its occurrences in the
489 reference, which are called *seeds*.
490
491 4. Sort seeds by position in the reference. Chain them with dynamic
492 programming. Each chain represents a potential mapping. For read
493 overlapping, report all chains and then go to step 8. For reference
494 mapping, do step 5 through 7:
495
496 5. Let *P* be the set of primary mappings, which is an empty set
497 initially. For each chain from the best to the worst according to
498 their chaining scores: if on the query, the chain overlaps with a
499 chain in *P* by **--mask-level** [=*0.5*] or higher fraction of the
500 shorter chain, mark the chain as *secondary* to the chain in *P*;
501 otherwise, add the chain to *P*.
502
503 6. Retain all primary mappings. Also retain up to **-N** [=*5*] top
504 secondary mappings if their chaining scores are higher than **-p**
505 [=*0.8*] of their corresponding primary mappings.
506
507 7. If alignment is requested, filter out an internal seed if it
508 potentially leads to both a long insertion and a long deletion.
509 Extend from the left-most seed. Perform global alignments between
510 internal seeds. Split the chain if the accumulative score along the
511 global alignment drops by **-z** [=*400*], disregarding long gaps.
512 Extend from the right-most seed. Output chains and their alignments.
513
514 8. If there are more query sequences in the input, go to step 2 until no
515 more queries are left.
516
517 9. If there are more reference sequences, reopen the query file from the
518 start and go to step 1; otherwise stop.
519
520 Limitations
521 -----------
522
523 - Minimap2 may produce suboptimal alignments through long
524 low-complexity regions where seed positions may be suboptimal. This
525 should not be a big concern because even the optimal alignment may be
526 wrong in such regions.
527 </help>
528 <citations> <!-- NOTE(review): the two DOI entries look inherited from the bwa-mem wrapper; confirm they are meant to stay. The bibtex key now matches the entry's own arXiv eprint id. -->
529 <citation type="doi">10.1093/bioinformatics/btp324</citation>
530 <citation type="doi">10.1093/bioinformatics/btp698</citation>
531 <citation type="bibtex">@misc{1708.01492,
532 Author = {Heng Li},
533 Title = {Minimap2: fast pairwise alignment for long nucleotide sequences},
534 Year = {2017},
535 Eprint = {arXiv:1708.01492},
536 url = {https://arxiv.org/abs/1708.01492},
537 }</citation>
538 </citations>
539 </tool>