Mercurial > repos > iuc > minimap2

diff minimap2.xml @ 0:2445d53549ba draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/minimap2 commit 7cb87c310b34cb2af2547ad8a14679107fd86d5d
author: iuc
date: Sat, 04 Nov 2017 05:41:11 -0400
children: b103bc946f57
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/minimap2.xml	Sat Nov 04 05:41:11 2017 -0400
@@ -0,0 +1,539 @@
+<?xml version="1.0"?>
+<tool id="minimap2" name="Map with minimap2" version="2.3" profile="17.01">
+    <description>A fast pairwise aligner for genomic and spliced nucleotide sequences</description>
+    <!-- Runtime dependencies: the aligner itself and samtools, which the command uses to sort its output. -->
+    <requirements>
+        <requirement version="2.3" type="package">minimap2</requirement>
+        <requirement version="1.6" type="package">samtools</requirement>
+    </requirements>
+    <!-- Command used to report the aligner version. -->
+    <version_command>minimap2 --version</version_command>
+    <command>
+<![CDATA[
+    ## Link the reference (history dataset or all_fasta data table entry) to a
+    ## fixed local name so the rest of the command can refer to reference.fa.
+    #if $reference_source.reference_source_selector == 'history':
+        ln -f -s '$reference_source.ref_file' reference.fa &&
+    #else:
+        ln -f -s '$reference_source.ref_file.fields.path' reference.fa &&
+    #end if
+    minimap2 -a
+    -x $analysis_type_selector
+    ## Optional numeric parameters are tested with str() so that an explicit 0
+    ## is still passed on; an unset optional parameter renders as an empty string.
+    ## NOTE(review): -I, -K and -G are passed as bare numbers although the form
+    ## labels speak of gigabases, megabytes and thousands; confirm the units.
+    ## indexing options
+    #if str($indexing_options.k):
+        -k $indexing_options.k
+    #end if
+    #if str($indexing_options.w):
+        -w $indexing_options.w
+    #end if
+    #if str($indexing_options.I):
+        -I $indexing_options.I
+    #end if
+    ## Mapping options
+    #if str($mapping_options.f):
+        -f $mapping_options.f
+    #end if
+    #if str($mapping_options.g):
+        -g $mapping_options.g
+    #end if
+    #if str($mapping_options.G):
+        -G $mapping_options.G
+    #end if
+    #if str($mapping_options.F):
+        -F $mapping_options.F
+    #end if
+    #if str($mapping_options.r):
+        -r $mapping_options.r
+    #end if
+    #if str($mapping_options.n):
+        -n $mapping_options.n
+    #end if
+    #if str($mapping_options.m):
+        -m $mapping_options.m
+    #end if
+    ## boolean: renders as -X or as nothing
+    $mapping_options.X
+    #if str($mapping_options.p):
+        -p $mapping_options.p
+    #end if
+    #if str($mapping_options.N):
+        -N $mapping_options.N
+    #end if
+    ## Alignment options
+    #if str($alignment_options.A):
+        -A $alignment_options.A
+    #end if
+    #if str($alignment_options.B):
+        -B $alignment_options.B
+    #end if
+    ## -O and -E take either one value or a deletion,insertion pair;
+    ## exactly one form of each flag is emitted.
+    #if str($alignment_options.O):
+        #if str($alignment_options.O2):
+            -O $alignment_options.O,$alignment_options.O2
+        #else
+            -O $alignment_options.O
+        #end if
+    #end if
+    #if str($alignment_options.E):
+        #if str($alignment_options.E2):
+            -E $alignment_options.E,$alignment_options.E2
+        #else
+            -E $alignment_options.E
+        #end if
+    #end if
+    #if str($alignment_options.z):
+        -z $alignment_options.z
+    #end if
+    #if str($alignment_options.s):
+        -s $alignment_options.s
+    #end if
+    #if $alignment_options.u:
+        -u $alignment_options.u
+    #end if
+    ## Output options
+    ## booleans: render as their flag or as nothing
+    $io_options.Q
+    $io_options.L
+    ## The value is attached with '=' so it cannot be read as a positional
+    ## argument; the 'none' choice simply omits the flag.
+    #if $io_options.cs and str($io_options.cs) != 'none':
+        --cs=$io_options.cs
+    #end if
+    #if str($io_options.K):
+        -K $io_options.K
+    #end if
+    -t \${GALAXY_SLOTS:-4}
+    reference.fa
+    ## single and interleaved inputs are both passed as one file
+    #if $fastq_input.fastq_input_selector in ['single', 'paired_iv']:
+        '$fastq_input.fastq_input1'
+    #else if $fastq_input.fastq_input_selector == 'paired':
+         '$fastq_input.fastq_input1' '$fastq_input.fastq_input2'
+    #else if $fastq_input.fastq_input_selector == 'paired_collection':
+        '$fastq_input.fastq_input1.forward' '$fastq_input.fastq_input1.reverse'
+    #end if
+    ## Sort the SAM stream and write it in the requested container format.
+    | samtools sort
+    -@\${GALAXY_SLOTS:-2}
+    -O $io_options.output_format
+    #if $io_options.output_format == 'CRAM':
+        --reference reference.fa
+    #end if
+    -o '$alignment_output'
+]]>
+    </command>
+    <inputs>
+        <!-- Reference genome: either an entry of the all_fasta data table or a FASTA dataset from the history. -->
+        <conditional name="reference_source">
+            <param name="reference_source_selector"
+                   type="select"
+                   label="Will you select a reference genome from your history or use a built-in index?"
+                   help="Built-ins were indexed using default options. See `Indexes` section of help below">
+                <option value="cached">Use a built-in genome index</option>
+                <option value="history">Use a genome from history and build index</option>
+            </param>
+            <!-- Built-in reference, listed from the data table and sorted by its column 2. -->
+            <when value="cached">
+                <param name="ref_file"
+                       type="select"
+                       label="Using reference genome"
+                       help="Select genome from the list">
+                    <options from_data_table="all_fasta">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No reference genomes are available"/>
+                    </options>
+                    <validator type="no_options"
+                               message="A built-in reference genome is not available for the build associated with the selected input file"/>
+                </param>
+            </when>
+            <!-- Reference supplied as a FASTA dataset from the history. -->
+            <when value="history">
+                <param name="ref_file"
+                       type="data"
+                       format="fasta"
+                       label="Use the following dataset as the reference sequence"
+                       help="You can upload a FASTA sequence to the history and use it as reference"/>
+            </when>
+        </conditional>
+        <!-- Index settings; each value is optional and only reaches the command line when filled in. -->
+        <section title="Indexing options" name="indexing_options">
+            <!-- Homopolymer setting seems to not properly overwrite sr preset
+            <param argument="-H" name="H" type="boolean" optional="true" truevalue="-H" falsevalue="" label="Use homopolymer-compressed k-mer ?"/>
+            -->
+            <param argument="-k" type="integer" optional="true" min="4" max="28" label="k-mer size" help=""/>
+            <param argument="-w" type="integer" optional="true" min="1" label="minimizer window size" help=""/>
+            <param argument="-I" type="integer" optional="true" min="1" label="split index for every N input gigabases" help=""/>
+        </section>
+        <!-- start unchanged copy from bwa-mem -->
+        <!-- Layout of the read input. Every branch declares fastq_input1; only the "paired" branch
+             adds fastq_input2, and "paired_collection" takes a paired data collection instead of a
+             single dataset. All branches accept fastqsanger, fastqsanger.gz and fasta. -->
+        <conditional name="fastq_input">
+            <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
+                <option value="single">Single</option>
+                <option value="paired">Paired</option>
+                <option value="paired_collection">Paired Collection</option>
+                <option value="paired_iv">Paired Interleaved</option>
+            </param>
+            <!-- Two separate datasets: forward and reverse reads. -->
+            <when value="paired">
+                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select first set of reads" help="Specify dataset with forward reads"/>
+                <param name="fastq_input2" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select second set of reads" help="Specify dataset with reverse reads"/>
+            </when>
+            <!-- One dataset of single reads. -->
+            <when value="single">
+                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with single reads"/>
+            </when>
+            <!-- One paired collection; the command reads its forward and reverse elements. -->
+            <when value="paired_collection">
+                <param name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/>
+            </when>
+            <!-- One interleaved dataset; the command passes it the same way as single reads. -->
+            <when value="paired_iv">
+                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
+            </when>
+        </conditional>
+        <!-- end unchanged copy from bwa-mem -->
+        <!-- Preset handed to the aligner through its -x option. -->
+        <param name="analysis_type_selector"
+               type="select"
+               label="Select analysis mode (sets default)">
+            <!-- reference mapping -->
+            <option value="map-pb">-Hk19 (PacBio vs reference mapping)</option>
+            <option value="map-ont">-k15 (Oxford Nanopore vs reference mapping)</option>
+            <!-- assembly to reference -->
+            <option value="asm5">-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5% div.)</option>
+            <option value="asm10">-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10% div.)</option>
+            <!-- read overlap -->
+            <option value="ava-pb">-Hk19 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (PacBio read overlap)</option>
+            <option value="ava-ont">-k15 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (ONT read overlap)</option>
+            <!-- spliced and short-read modes -->
+            <option value="splice">long-read spliced alignment</option>
+            <option value="sr">short single-end reads without splicing</option>
+        </param>
+        <!-- Advanced mapping settings. Every parameter is optional; a flag is only added to the
+             command line when its field is filled in (or, for -X, ticked). -->
+        <section name="mapping_options" title="Set advanced mapping options" help="Sets -f, -g, -G, -F, -r, -n, -m, -X, -p and -N options." expanded="False">
+            <param argument="-f" type="float" value="" optional="true" label="filter out top FLOAT fraction of repetitive minimizers" help="default=0.0002"/>
+            <param argument="-g" type="integer" value="" optional="true" label="stop chain elongation if there are no minimizers in INT-bp" help="default=5000"/>
+            <param argument="-G" type="integer" value="" optional="true" label="max intron length in thousand (effective with -xsplice; changing -r)" help="default=200"/>
+            <param argument="-F" type="integer" value="" optional="true" label="max fragment length (effective with -xsr or in the fragment mode)" help="default=800" />
+            <param argument="-r" type="integer" value="" optional="true" label="bandwidth used in chaining and DP-based alignment" help="default=500" />
+            <param argument="-n" type="integer" value="" optional="true" label="minimal number of minimizers on a chain" help="default=3"/>
+            <param argument="-m" type="integer" value="" optional="true" label="minimal chaining score (matching bases minus log gap penalty)" help="default=40"/>
+            <param argument="-X" type="boolean" truevalue="-X" falsevalue="" optional="true" label="skip self and dual mappings (for the all-vs-all mode)"/>
+            <param argument="-p" type="float" value="" max="1" optional="true" label="min secondary-to-primary score ratio" help="default=0.8"/>
+            <param argument="-N" type="integer" min="0" optional="true" label="retain at most INT secondary alignments" help="default=5"/>
+        </section>
+        <!-- Advanced alignment scoring. O2 and E2 have no flag of their own: the command appends them
+             to -O and -E as the second (insertion) value, so they are declared by name, not by argument. -->
+        <section name="alignment_options" title="Set advanced alignment options" help="Sets -A, -B, -O, -E, -z, -s and -u options." expanded="False">
+            <param argument="-A" type="integer" optional="true" label="Score for a sequence match" help="default=2"/>
+            <param argument="-B" type="integer" optional="true" label="Penalty for a mismatch" help="-B; default=4" />
+            <param argument="-O" type="integer" min="0" optional="true" label="Gap open penalties for deletions" help="-O; default=4"/>
+            <!-- The name must be "O2" to match the reference in the command template. -->
+            <param name="O2" type="integer" min="0" optional="true" label="Gap open penalties for insertions" help="-O; default=24"/>
+            <param argument="-E" type="integer" min="0" optional="true" label="Gap extension penalties; a gap of size k cost &#39;-O + -E*k&#39;. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion" help="-E; default=2"/>
+            <param name="E2" type="integer" min="0" optional="true" label="Gap extension penalty for extending an insertion; if left empty uses the value specified for Gap extension penalties above" help="-E; default=1"/>
+            <param argument="-z" type="integer" optional="true" label="Z-drop score" help="default=400"/>
+            <param argument="-s" type="integer" optional="true" label="minimal peak DP alignment score" help="default=80"/>
+            <param argument="-u" type="select" optional="true" label="how to find GT-AG">
+                <option value="n">don't match GT-AG</option>
+                <option value="f">transcript strand</option>
+                <option value="b">both strands</option>
+            </param>
+        </section>
+        <!-- Output settings: the container format written by samtools sort plus the aligner's output flags.
+             The section help lists only the options that are actually defined here. -->
+        <section name="io_options" title="Set advanced output options" help="Sets the output format and the -Q, -L, -K and --cs options." expanded="False">
+            <param name="output_format" type="select" label="Produce BAM or CRAM file?">
+                <option value="BAM">BAM</option>
+                <option value="CRAM">CRAM</option>
+            </param>
+            <param argument="-Q" type="boolean" truevalue="-Q" falsevalue="" optional="true" label="don't output base quality"/>
+            <param argument="-L" type="boolean" truevalue="-L" falsevalue="" optional="true" label="write CIGAR with >65535 ops to the CG tag" help="Useful for very long reads in SAM/BAM format"/>
+            <param argument="-K" type="integer" optional="true" label="minibatch size for mapping (in megabyte)" help="default=500M"/>
+            <param argument="--cs" type="select" optional="true" label="Output cs tag?" help="The cs tag is a more compact standalone representation of the MD tag, see help below.">
+                <option value="none">no</option>
+                <option value="short">short</option>
+                <option value="long">long</option>
+            </param>
+        </section>
+    </inputs>
+    <!-- One sorted alignment dataset: BAM by default, CRAM when that output format is selected. -->
+    <outputs>
+        <data name="alignment_output"
+              format="bam"
+              label="${tool.name} on ${on_string} (mapped reads in ${io_options.output_format} format)">
+            <actions>
+                <!-- Set the dbkey metadata from whichever reference source was used. -->
+                <conditional name="reference_source.reference_source_selector">
+                    <when value="cached">
+                        <action name="dbkey" type="metadata">
+                            <option name="all_fasta" type="from_data_table" column="1" offset="0">
+                                <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+                                <filter type="param_value" column="0" ref="reference_source.ref_file"/>
+                            </option>
+                        </action>
+                    </when>
+                    <when value="history">
+                        <action name="dbkey" type="metadata">
+                            <option name="reference_source.ref_file" type="from_param" param_attribute="dbkey"/>
+                        </action>
+                    </when>
+                </conditional>
+            </actions>
+            <change_format>
+                <when input="io_options.output_format" value="CRAM" format="cram"/>
+            </change_format>
+        </data>
+    </outputs>
+    <!-- Functional tests. The single-read tests upload a FASTA file (.fa), so it is declared as fasta. -->
+    <tests>
+        <test>
+            <!-- test single input -->
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="single"/>
+            <param name="fastq_input1" ftype="fasta" value="bwa-mem-fasta1.fa"/>
+            <param name="analysis_type_selector" value="sr"/>
+            <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="2" />
+        </test>
+        <test>
+            <!-- test cram output -->
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="single"/>
+            <param name="fastq_input1" ftype="fasta" value="bwa-mem-fasta1.fa"/>
+            <param name="analysis_type_selector" value="sr"/>
+            <param name="output_format" value="CRAM"/>
+            <output name="alignment_output" ftype="cram" file="minimap2-test1-fasta.cram" compare="sim_size" />
+        </test>
+        <test>
+            <!-- test paired input -->
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="paired"/>
+            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
+            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
+            <param name="analysis_type_selector" value="sr"/>
+            <output name="alignment_output" ftype="bam" file="minimap2-test1.bam" lines_diff="2" />
+        </test>
+        <test>
+            <!-- test paired input with one pair compressed -->
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="paired"/>
+            <param name="fastq_input1" ftype="fastqsanger.gz" value="bwa-mem-fastq1.fq.gz"/>
+            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
+            <param name="analysis_type_selector" value="sr"/>
+            <output name="alignment_output" ftype="bam" file="minimap2-test1.bam" lines_diff="2" />
+        </test>
+        <test>
+            <!-- test collection input -->
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="paired_collection"/>
+            <param name="fastq_input1">
+                <collection type="paired">
+                    <element name="forward" value="bwa-mem-fastq1.fq" />
+                    <element name="reverse" value="bwa-mem-fastq2.fq" />
+                </collection>
+            </param>
+            <param name="analysis_type_selector" value="sr"/>
+            <output name="alignment_output" ftype="bam" file="minimap2-test2.bam" lines_diff="2" />
+        </test>
+        <test>
+            <!-- test data table reference -->
+            <param name="reference_source_selector" value="cached" />
+            <param name="ref_file" value="bwa-mem-mt-genome"/>
+            <param name="fastq_input_selector" value="single"/>
+            <param name="fastq_input1" ftype="fasta" value="bwa-mem-fasta1.fa"/>
+            <param name="analysis_type_selector" value="sr"/>
+            <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="2" />
+        </test>
+    </tests>
+    <help>
+
+Users’ Guide
+------------
+
+Minimap2 is a versatile sequence alignment program that aligns DNA or
+mRNA sequences against a large reference database. Typical use cases
+include: (1) mapping PacBio or Oxford Nanopore genomic reads to the
+human genome; (2) finding overlaps between long reads with error rate up
+to ~15%; (3) splice-aware alignment of PacBio Iso-Seq or Nanopore cDNA
+or Direct RNA reads against a reference genome; (4) aligning Illumina
+single- or paired-end reads; (5) assembly-to-assembly alignment; (6)
+full-genome alignment between two closely related species with
+divergence below ~15%.
+
+For ~10kb noisy read sequences, minimap2 is tens of times faster than
+mainstream long-read mappers such as BLASR, BWA-MEM, NGMLR and GMAP. It
+is more accurate on simulated long reads and produces biologically
+meaningful alignment ready for downstream analyses. For >100bp Illumina
+short reads, minimap2 is three times as fast as BWA-MEM and Bowtie2, and
+as accurate on simulated data. Detailed evaluations are available from
+the `minimap2 preprint <https://arxiv.org/abs/1708.01492>`_.
+
+General usage
+~~~~~~~~~~~~~
+
+Minimap2 seamlessly works with gzip’d FASTA and FASTQ formats as input.
+You don’t need to convert between FASTA and FASTQ or decompress gzip’d
+files first.
+
+For the human reference genome, minimap2 takes a few minutes to generate
+a minimizer index for the reference before mapping. To reduce indexing
+time, you can optionally save the index with option **-d** and replace
+the reference sequence file with the index file on the minimap2 command
+line.
+
+***Importantly***, it should be noted that once you build the index,
+indexing parameters such as **-k**, **-w**, **-H** and **-I** can’t be
+changed during mapping. If you are running minimap2 for different data
+types, you will probably need to keep multiple indexes generated with
+different parameters. This makes minimap2 different from BWA which
+always uses the same index regardless of query data types.
+
+Use cases
+~~~~~~~~~
+
+Minimap2 uses the same base algorithm for all applications. However, due
+to the different data types it supports (e.g. short vs long reads; DNA
+vs mRNA reads), minimap2 needs to be tuned for optimal performance and
+accuracy. It is usually recommended to choose a preset with option
+**-x**, which sets multiple parameters at the same time. The default
+setting is the same as ``map-ont``.
+
+Map long noisy genomic reads
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The difference between ``map-pb`` and ``map-ont`` is that ``map-pb``
+uses homopolymer-compressed (HPC) minimizers as seeds, while ``map-ont``
+uses ordinary minimizers as seeds. Empirical evaluation suggests HPC
+minimizers improve performance and sensitivity when aligning PacBio
+reads, but hurt when aligning Nanopore reads.
+
+Map long mRNA/cDNA reads
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+There are different long-read RNA-seq technologies, including
+traditional full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq
+and Direct RNA-seq. They produce data of varying quality and properties.
+By default, ``-x splice`` assumes the read orientation relative to the
+transcript strand is unknown. It tries two rounds of alignment to infer
+the orientation and write the strand to the ``ts`` SAM/PAF tag if
+possible. For Iso-seq, Direct RNA-seq and traditional full-length
+cDNAs, it would be desired to apply ``-u f`` to force minimap2 to
+consider the forward transcript strand only. This speeds up alignment
+with slight improvement to accuracy. For noisy Nanopore Direct RNA-seq
+reads, it is recommended to use a smaller k-mer size for increased
+sensitivity to the first or the last exons.
+
+It is worth noting that by default ``-x splice`` prefers
+GT[A/G]..[C/T]AG over GT[C/T]..[A/G]AG, and then over other splicing
+signals. Considering one additional base improves the junction accuracy
+for noisy reads, but reduces the accuracy when aligning against the
+widely used SIRV control data. This is because SIRV does not honor the
+evolutionarily conservative splicing signal. If you are studying SIRV,
+you may apply ``--splice-flank=no`` to let minimap2 only model GT..AG,
+ignoring the additional base.
+
+Find overlaps between long reads
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Similarly, ``ava-pb`` uses HPC minimizers while ``ava-ont`` uses
+ordinary minimizers. It is usually not recommended to perform base-level
+alignment in the overlapping mode because it is slow and may produce
+false positive overlaps. However, if performance is not a concern, you
+may try to add ``-a`` or ``-c`` anyway.
+
+Map short accurate genomic reads
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+When two read files are specified, minimap2 reads from each file in turn
+and merges them into an interleaved stream internally. Two reads are
+considered to be paired if they are adjacent in the input stream and
+have the same name (with the ``/[0-9]`` suffix trimmed if present).
+Single- and paired-end reads can be mixed.
+
+Minimap2 does not work well with short spliced reads. There are many
+capable RNA-seq mappers for short reads.
+
+Full genome/assembly alignment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For cross-species full-genome alignment, the scoring system needs to be
+tuned according to the sequence divergence.
+
+Advanced features
+~~~~~~~~~~~~~~~~~
+
+Working with >65535 CIGAR operations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Due to a design flaw, BAM does not work with CIGAR strings with >65535
+operations (SAM and CRAM work). However, for ultra-long nanopore reads
+minimap2 may align ~1% of read bases with long CIGARs beyond the
+capability of BAM. If you convert such SAM/CRAM to BAM, Picard and
+recent samtools will throw an error and abort. Older samtools and other
+tools may create corrupted BAM.
+
+To avoid this issue, you can add option ``-L`` at the minimap2 command
+line. This option moves a long CIGAR to the ``CG`` tag and leaves a
+fully clipped CIGAR at the SAM CIGAR column. Current tools that don’t
+read CIGAR (e.g. merging and sorting) still work with such BAM records;
+tools that read CIGAR will effectively ignore these records. I have pull
+requests to the SAM spec, htslib, htsjdk, bedtools2, Rsamtools and
+igv.js. If they are accepted, future versions of these tools will
+seamlessly recognize long-cigar records generated by option ``-L``.
+
+**TL;DR**: if you work with ultra-long reads and use tools that only
+process BAM files, please add option ``-L``.
+
+The cs optional tag
+^^^^^^^^^^^^^^^^^^^
+
+The ``cs`` SAM/PAF tag encodes bases at mismatches and INDELs. It
+matches regular expression
+``/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/``. Like CIGAR, ``cs``
+consists of series of operations. Each leading character specifies the
+operation; the following sequence is the one involved in the operation.
+
+The ``cs`` tag is enabled by command line option ``--cs``. The following
+alignment, for example:
+
+.. code::
+
+    CGATCGATAAATAGAGTAG---GAATAGCA
+    ||||||   ||||||||||   |||| |||
+    CGATCG---AATAGAGTAGGTCGAATtGCA
+
+is represented as ``:6-ata:10+gtc:4*at:3``, where ``:[0-9]+`` represents
+an identical block, ``-ata`` represents a deletion, ``+gtc`` an insertion
+and ``*at`` indicates reference base ``a`` is substituted with a query
+base ``t``. It is similar to the ``MD`` SAM tag but is standalone and
+easier to parse.
+
+If ``--cs=long`` is used, the ``cs`` string also contains identical
+sequences in the alignment. The above example will become
+``=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA``. The long form of ``cs``
+encodes both reference and query sequences in one string.
+
+Algorithm overview
+~~~~~~~~~~~~~~~~~~
+
+In the following, minimap2 command line options have a dash ahead and
+are highlighted in bold. The description may help to tune minimap2
+parameters.
+
+1. Read **-I** [=*4G*] reference bases, extract
+   (**-k**,\ **-w**)-minimizers and index them in a hash table.
+
+2. Read **-K** [=*200M*] query bases. For each query sequence, do step 3
+   through 7:
+
+3. For each (**-k**,\ **-w**)-minimizer on the query, check against the
+   reference index. If a reference minimizer is not among the top **-f**
+   [=*2e-4*] most frequent, collect its occurrences in the
+   reference, which are called *seeds*.
+
+4. Sort seeds by position in the reference. Chain them with dynamic
+   programming. Each chain represents a potential mapping. For read
+   overlapping, report all chains and then go to step 8. For reference
+   mapping, do step 5 through 7:
+
+5. Let *P* be the set of primary mappings, which is an empty set
+   initially. For each chain from the best to the worst according to
+   their chaining scores: if on the query, the chain overlaps with a
+   chain in *P* by **--mask-level** [=*0.5*] or higher fraction of the
+   shorter chain, mark the chain as *secondary* to the chain in *P*;
+   otherwise, add the chain to *P*.
+
+6. Retain all primary mappings. Also retain up to **-N** [=*5*] top
+   secondary mappings if their chaining scores are higher than **-p**
+   [=*0.8*] of their corresponding primary mappings.
+
+7. If alignment is requested, filter out an internal seed if it
+   potentially leads to both a long insertion and a long deletion.
+   Extend from the left-most seed. Perform global alignments between
+   internal seeds. Split the chain if the accumulative score along the
+   global alignment drops by **-z** [=*400*], disregarding long gaps.
+   Extend from the right-most seed. Output chains and their alignments.
+
+8. If there are more query sequences in the input, go to step 2 until no
+   more queries are left.
+
+9. If there are more reference sequences, reopen the query file from the
+   start and go to step 1; otherwise stop.
+
+Limitations
+-----------
+
+-  Minimap2 may produce suboptimal alignments through long
+   low-complexity regions where seed positions may be suboptimal. This
+   should not be a big concern because even the optimal alignment may be
+   wrong in such regions.
+    </help>
+    <!-- Publications to cite. The BibTeX key matches the arXiv identifier given in the entry's own Eprint field.
+         NOTE(review): the two DOIs appear to be the BWA papers; confirm they are intended for this tool. -->
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btp324</citation>
+        <citation type="doi">10.1093/bioinformatics/btp698</citation>
+        <citation type="bibtex">@misc{1708.01492,
+            Author = {Heng Li},
+            Title = {Minimap2: fast pairwise alignment for long nucleotide sequences},
+            Year = {2017},
+            Eprint = {arXiv:1708.01492},
+            url = {https://arxiv.org/abs/1708.01492},
+            }</citation>
+    </citations>
+</tool>
author	iuc
date	Sat, 04 Nov 2017 05:41:11 -0400
parents
children	b103bc946f57