Mercurial > repos > iuc > minimap2

<?xml version="1.0"?>
<tool id="minimap2" name="Map with minimap2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
    <description>A fast pairwise aligner for genomic and spliced nucleotide sequences</description>
    <xrefs>
        <xref type="bio.tools">minimap2</xref>
    </xrefs>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <expand macro="requirements"/>
    <!-- Failure detection: an exit code of 1 or higher is fatal, and so is a
         literal "[ERROR]" marker anywhere on stderr. -->
    <stdio>
        <exit_code level="fatal" range="1:"/>
        <regex level="fatal" source="stderr" match="\[ERROR\]"/>
    </stdio>
    <version_command>minimap2 --version</version_command>
    <command>
<![CDATA[
    ## Link the reference (history dataset or cached FASTA) to a fixed local
    ## name; minimap2 and the samtools steps below all read reference.fa.
    #if $reference_source.reference_source_selector == 'history':
        ln -f -s '$reference_source.ref_file' reference.fa &&
    #else:
        ln -f -s '$reference_source.ref_file.fields.path' reference.fa &&
    #end if
    minimap2
    ## Preset profile. 'self-homology' is expanded into explicit options
    ## instead of being handed to -x.
    #if str($fastq_input.analysis_type_selector) == 'self-homology':
        -DP -k19 -w19 -m200
    #elif $fastq_input.analysis_type_selector:
        -x ${fastq_input.analysis_type_selector}
    #end if

    ## indexing options
    $indexing_options.H
    #if str($indexing_options.k):
        -k $indexing_options.k
    #end if
    #if str($indexing_options.w):
        -w $indexing_options.w
    #end if
    #if str($indexing_options.I):
        ## The form asks for gigabases, but minimap2 reads a bare number as
        ## bases, so the G suffix is appended explicitly.
        -I ${indexing_options.I}G
    #end if

    ## Mapping options
    #if str($mapping_options.f):
        -f $mapping_options.f
    #end if
    #if str($mapping_options.min_occ_floor):
        --min-occ-floor $mapping_options.min_occ_floor
    #end if
    --q-occ-frac $mapping_options.q_occ_frac
    #if str($mapping_options.g):
        -g $mapping_options.g
    #end if
    #if str($mapping_options.r):
        -r $mapping_options.r
    #end if
    #if str($mapping_options.n):
        -n $mapping_options.n
    #end if
    #if str($mapping_options.m):
        -m $mapping_options.m
    #end if
    #if str($mapping_options.max_chain_skip):
        --max-chain-skip $mapping_options.max_chain_skip
    #end if
    #if str($mapping_options.max_chain_iter):
        --max-chain-iter $mapping_options.max_chain_iter
    #end if
    $mapping_options.X
    #if str($mapping_options.p):
        -p $mapping_options.p
    #end if
    #if str($mapping_options.N):
        -N $mapping_options.N
    #end if
    #if str($mapping_options.mask_len):
        --mask-len $mapping_options.mask_len
    #end if
    #if str($mapping_options.kmer_ocurrence_interval.interval):
        -U $mapping_options.kmer_ocurrence_interval.lower_limit,$mapping_options.kmer_ocurrence_interval.upper_limit
    #end if
    ## Alignment options
    ## Explicit spliced mode switches fragment mode off; otherwise -F (with
    ## fragment mode on) is only passed when no splice preset is selected.
    #if str($alignment_options.splicing.splice_mode) == '--splice':
        --frag=no --splice
        #if str($alignment_options.splicing.G):
            -G $alignment_options.splicing.G
        #end if
        #if str($alignment_options.splicing.C):
            -C $alignment_options.splicing.C
        #end if
        #if $alignment_options.splicing.u:
            -u $alignment_options.splicing.u
        #end if
        $alignment_options.splicing.splice_flank
        #if str($alignment_options.splicing.splice_site_annotations.use_annotations) == 'yes':
            --junc-bed '$alignment_options.splicing.splice_site_annotations.junc_bed'
            --junc-bonus $alignment_options.splicing.splice_site_annotations.junc_bonus
        #end if
    #elif str($mapping_options.F) and 'splice' not in str($fastq_input.analysis_type_selector):
        --frag=yes -F $mapping_options.F
    #end if
    #if str($alignment_options.A):
        -A $alignment_options.A
    #end if
    #if str($alignment_options.B):
        -B $alignment_options.B
    #end if
    ## -O, -E and -z accept "INT" or "INT,INT"; the second value is only
    ## appended when the first one is set as well.
    #if str($alignment_options.O):
        #if str($alignment_options.O2):
            -O $alignment_options.O,$alignment_options.O2
        #else
            -O $alignment_options.O
        #end if
    #end if
    #if str($alignment_options.E):
        #if str($alignment_options.E2):
            -E $alignment_options.E,$alignment_options.E2
        #else
            -E $alignment_options.E
        #end if
    #end if
    #if str($alignment_options.z):
        #if str($alignment_options.z2):
            -z $alignment_options.z,$alignment_options.z2
        #else
            -z $alignment_options.z
        #end if
    #end if
    #if str($alignment_options.s):
        -s $alignment_options.s
    #end if
    $alignment_options.no_end_flt
    ## Output options
    $io_options.Q
    $io_options.L
    $io_options.c
    ## --eqx is a minimap2 option. It has to be given here, before the pipe,
    ## so that it applies to every output format and is never handed to
    ## samtools sort (which does not know it).
    $io_options.eqx
    #if $io_options.cs:
        --cs=$io_options.cs
    #end if
    $io_options.Y
    #if $io_options.K:
        ## The form asks for megabytes, but minimap2 reads a bare number as
        ## bases, so the M suffix is appended explicitly.
        -K ${io_options.K}M
    #end if
    -t \${GALAXY_SLOTS:-4}
    reference.fa
    #if $fastq_input.fastq_input_selector in ['single', 'paired_iv']:
        '$fastq_input.fastq_input1'
    #else if $fastq_input.fastq_input_selector == 'paired':
         '$fastq_input.fastq_input1' '$fastq_input.fastq_input2'
    #else if $fastq_input.fastq_input_selector == 'paired_collection':
         '$fastq_input.fastq_input1.forward' '$fastq_input.fastq_input1.reverse'
    #end if
    ## BAM and CRAM: emit SAM (-a) and convert/sort with samtools.
    ## Anything else (PAF): redirect minimap2's stdout straight to the output.
    #if str($io_options.output_format) in ('BAM', 'CRAM'):
        -a | samtools view --no-PG -hT reference.fa
    #end if
    #if $io_options.output_format == 'BAM':
        | samtools sort
        -@\${GALAXY_SLOTS:-2}
        -T "\${TMPDIR:-.}"
        -O $io_options.output_format
        -o '$alignment_output'
    #else if $io_options.output_format == 'CRAM':
        | samtools sort
        -T "\${TMPDIR:-.}"
        -@\${GALAXY_SLOTS:-2}
        -O $io_options.output_format
        --reference reference.fa
        --output-fmt-option no_ref
        -o '$alignment_output'
    #else:
        > '$alignment_output'
    #end if
]]>
    </command>
    <inputs>
        <!-- Reference selection: either a FASTA registered in the all_fasta
             data table ("cached") or a dataset from the history ("history").
             Both branches expose the choice under the same name, ref_file. -->
        <conditional name="reference_source">
            <param name="reference_source_selector"
                   type="select"
                   label="Will you select a reference genome from your history or use a built-in index?"
                   help="Built-ins were indexed using default options. See `Indexes` section of help below. If you would like to perform self-mapping select `history` here, then choose your input file as reference.">
                <option value="cached">Use a built-in genome index</option>
                <option value="history">Use a genome from history and build index</option>
            </param>
            <when value="cached">
                <param name="ref_file"
                       type="select"
                       label="Using reference genome"
                       help="Select genome from the list">
                    <!-- entries come from the all_fasta data table, sorted on column 2 -->
                    <options from_data_table="all_fasta">
                        <filter column="2" type="sort_by"/>
                        <validator message="No reference genomes are available" type="no_options"/>
                    </options>
                    <validator message="A built-in reference genome is not available for the build associated with the selected input file"
                               type="no_options"/>
                </param>
            </when>
            <when value="history">
                <param name="ref_file"
                       type="data"
                       format="fasta,fastq"
                       label="Use the following dataset as the reference sequence"
                       help="You can upload a FASTA or FASTQ sequence to the history and use it as reference"/>
            </when>
        </conditional>
        <!-- start unchanged copy from bwa-mem -->
        <!-- Read input layout. Every branch names its primary input
             fastq_input1 (a dataset, or a paired collection for
             paired_collection); only the "paired" branch adds fastq_input2.
             The command template picks the files to pass to minimap2 based on
             fastq_input_selector and reads the preset from
             analysis_type_selector. -->
        <conditional name="fastq_input">
            <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
                <option value="single">Single</option>
                <option value="paired">Paired</option>
                <option value="paired_collection">Paired Collection</option>
                <option value="paired_iv">Paired Interleaved</option>
            </param>
            <!-- below, preset options are only offered for single-end input
            because paired-end alignment in minimap2 is only enabled with -x sr
            (see https://github.com/lh3/minimap2/issues/190) -->
            <when value="single">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with single reads"/>
                <!-- Optional: leaving the preset unselected makes the command
                     template omit -x entirely. The "self-homology" entry is
                     expanded by the command template into explicit options
                     rather than passed to -x. -->
                <param name="analysis_type_selector" type="select" optional="True"
                label="Select a profile of preset options"
                help="Each profile comes with the preconfigured settings mentioned in parentheses. You can customize each profile further in the indexing, mapping and alignment options sections below. If you do not select a profile here, the tool will use the per-parameter defaults listed in the below sections unless you customize them." >
                    <option value="map-pb">PacBio/Oxford Nanopore read to reference mapping (-Hk19) (map-pb)</option>
                    <option value="map-ont">Oxford Nanopore read to reference mapping. Slightly more sensitive for Oxford Nanopore to reference mapping (-k15). For PacBio reads, HPC minimizers consistently leads to faster performance and more sensitive results in comparison to normal minimizers. For Oxford Nanopore data, normal minimizers are better, though not much. The effectiveness of HPC is determined by the sequencing error mode. (map-ont)</option>
                    <option value="map-hifi">PacBio HiFi reads vs reference mapping (-k19 -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1 -s200 ) (map-hifi)</option>
                    <option value="ava-pb">PacBio all-vs-all overlap mapping (-Hk19 -Xw5 -m100 -g10000 --max-chain-skip 25) (ava-pb)</option>
                    <option value="ava-ont">Oxford Nanopore all-vs-all overlap mapping (-k15 -Xw5 -m100 -g10000 -r2000 --max-chain-skip 25). Similarly, the major difference from ava-pb is that this preset is not using HPC minimizers. (ava-ont)</option>
                    <option value="asm5">Long assembly to reference mapping (-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 --min-occ-floor=100). Typically, the alignment will not extend to regions with 5% or higher sequence divergence. Only use this preset if the average divergence is far below 5%. (asm5)</option>
                    <option value="asm10">Long assembly to reference mapping (-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 --min-occ-floor=100). Up to 10% sequence divergence. (asm10)</option>
                    <option value="asm20">Long assembly to reference mapping (-k19 -w10 -A1 -B6 -O6,26 -E2,1 -s200 -z200 --min-occ-floor=100). Up to 20% sequence divergence. (asm20)</option>
                    <option value="splice">Long-read spliced alignment (-k15 -w5 --splice -g2000 -G200k  -A1 -B2  -O2,32  -E1,0  -C9  -z200  -ub  --splice-flank=yes). In the splice mode, 1) long deletions are taken as  introns  and  represented as the `N' CIGAR operator 2) long insertions are disabled 3) deletion and insertion gap costs are different during chaining 4) the computation of the `ms` tag ignores introns to demote hits to pseudogenes. (splice)</option>
                    <option value="splice:hq">Long-read splice alignment for PacBio CCS reads (same as `splice` but with -C5 -O6,24 -B4) (splice:hq)</option>
                    <option value="sr">Short single-end reads without splicing (-k21 -w11 --sr -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g200 -2K50m --heap-sort=yes --secondary=no) (sr)</option>
                    <option value="self-homology">Construct a self-homology map - use same genome as query and reference (-DP -k19 -w19 -m200) (self-homology)</option>
                </param>
            </when>
            <!-- The three paired layouts below get their preset selector from
                 the pe_anaylsis_fixed_selector macro; its definition is not in
                 this file (presumably in the imported macros.xml). -->
            <when value="paired">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select first set of reads" help="Specify dataset with forward reads"/>
                <param name="fastq_input2" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select second set of reads" help="Specify dataset with reverse reads"/>
                <expand macro="pe_anaylsis_fixed_selector" />
            </when>
            <when value="paired_collection">
                <param name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/>
                <expand macro="pe_anaylsis_fixed_selector" />
            </when>
            <when value="paired_iv">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
                <expand macro="pe_anaylsis_fixed_selector" />
            </when>
        </conditional>
        <!-- Indexing parameters. All are optional: the command template adds
             -k, -w and -I only when a value was entered, and -H only when the
             box is ticked. -->
        <section title="Indexing options" name="indexing_options">
            <param name="H" argument="-H" type="boolean" optional="true"
                   truevalue="-H" falsevalue=""
                   label="Use homopolymer-compressed k-mer ?"/>
            <param argument="-k" type="integer" optional="true" min="4" max="28"
                   label="K-mer size"
                   help=""/>
            <param argument="-w" type="integer" optional="true" min="1"
                   label="Minimizer window size"
                   help=""/>
            <param argument="-I" type="integer" optional="true" min="1"
                   label="Split index for every N input gigabases"
                   help=""/>
        </section>
        <!-- Mapping (seeding and chaining) parameters. Apart from q_occ_frac,
             which has a concrete default and is always passed, the command
             template forwards each value only when it is set. -->
        <section name="mapping_options" title="Mapping options" help="Sets -f, -g, -F, -r, -n, -m, -X, -p, -N, -U, --min-occ-floor, --q-occ-frac, --max-chain-skip, --max-chain-iter and --mask-len options." expanded="False">
            <param argument="-N" type="integer" min="0" optional="true" label="Retain at most INT secondary alignments" help="default=5"/>
            <param argument="-F" type="integer" min="0" value="" optional="true"
            label="Max fragment length for PE alignment"
            help="The maximum apparent fragment length up to which paired-end reads are aligned together; at higher fragment lengths the mates will be aligned independent of each other; effective only for paired-end data and when spliced alignment mode is turned off; default=800" />
            <param argument="-f" type="float" value="" optional="true" label="Filter out top FLOAT fraction of repetitive minimizers" help="default=0.0002"/>
            <!-- The conditional keeps its historical (misspelled) name because
                 the command template references it by that name; only the
                 user-visible labels are corrected. -->
            <conditional name="kmer_ocurrence_interval">
                <param name="interval" argument="-U" type="select" label="Specify an interval of k-mer occurrences" help="Allows to specify an interval of k-mer occurrences with -U. For repeat-rich genomes, the automatic k-mer occurrence threshold determined by -f may be too large and makes alignment impractically slow. The new option protects against such cases. Enabled for asm* and map-hifi.">
                    <option value="enabled">Enabled</option>
                    <option value="" selected="true">Disabled</option>
                </param>
                <when value="enabled">
                    <param name="lower_limit" type="integer" min="0" max="10000" value="" label="K-mer occurrence lower limit"/>
                    <param name="upper_limit" type="integer" min="0" max="10000" value="" label="K-mer occurrence upper limit"/>
                </when>
                <when value="">
                </when>
            </conditional>
            <param argument="--min-occ-floor" type="integer" label="Force minimap2 to always use k-mers occurring this many times or fewer" help="Maximum occurrence is the number of repetitive minimizers determined by '-f' or this value, whichever is higher." optional="true" />
            <param argument="--q-occ-frac" type="float" value="0.01" label="Discard a query minimizer"
                help="Discard a query minimizer if its occurrence is higher than this fraction of query minimizers and than the reference occurrence threshold. It allows to avoid extremely long mapping time for pathologic reads with highly repeated k-mers not in the reference. Set to 0 to disable the new heuristic." />

            <param argument="-g" type="integer" value="" optional="true" label="Stop chain elongation if there are no minimizers in INT-bp" help="default=5000"/>
            <param argument="-r" type="integer" value="" optional="true" label="Bandwidth used in chaining and DP-based alignment" help="default=500" />
            <param argument="-n" type="integer" value="" optional="true" label="Minimal number of minimizers on a chain" help="default=3"/>
            <param argument="-m" type="integer" value="" optional="true" label="Minimal chaining score (matching bases minus log gap penalty)" help="default=40"/>
            <param argument="--max-chain-skip" type="integer" value="" optional="true"
            label="Maximum seed skips during chaining"
            help="A heuristic that stops chaining early. Minimap2 uses dynamic programming for chaining. The time complexity is quadratic in the number of seeds. This option makes minimap2 exit the inner loop if it repeatedly sees seeds already on chains. Set to a large number to switch off this heuristic effectively. default=25" />
            <param argument="--max-chain-iter" type="integer" value="" optional="true"
            label="Maximum number of partial chains checked during chaining"
            help="A heuristic to avoid quadratic time complexity in the worst case. default=5000" />
            <param argument="-X" type="boolean" truevalue="-X" falsevalue="" optional="true" label="Skip self and dual mappings (for the all-vs-all mode)"/>
            <param argument="-p" type="float" value="" max="1" optional="true" label="Min secondary-to-primary score ratio" help="default=0.8"/>
            <param argument="--mask-len" type="integer" min="0" max="10000" value="" optional="true" label="Fine control the removal of redundant hits" help="Keep an alignment if dropping it leaves an unaligned region on query longer than INT"/>
        </section>
        <!-- Base-level alignment parameters: scoring values plus the optional
             spliced-alignment sub-form. Every scoring value is optional and is
             only forwarded by the command template when it is set. -->
        <section name="alignment_options" title="Alignment options" help="Sets -A, -B, -O, -E, -z, -s, and spliced alignments options." expanded="False">
            <!-- Only the third choice makes the command template emit the
                 splice flag and the nested options below; "preset" and the
                 empty value add no splice-specific arguments. -->
            <conditional name="splicing">
                <param name="splice_mode" type="select"
                label="Customize spliced alignment mode?"
                help="Warning: Minimap2 cannot currently perform spliced alignments of read pairs. If you enable spliced alignment for paired-end data it will be treated as single-end!" >
                    <option value="preset">No, use profile setting or leave turned off</option>
                    <option value="">Disable spliced alignments (overwrite profile setting if necessary)</option>
                    <option value="--splice">Yes, enable spliced alignments (--splice)</option>
                </param>
                <when value="preset" />
                <when value="" />
                <when value="--splice">
                    <param argument="-G" type="integer" value="" optional="true"
                    label="Maximum allowed gap on the reference"
                    help="Higher values cause slower spliced alignment. When in use, this option causes -r (in mapping options) to be set to the same value. default=200k" />
                    <param argument="-C" type="integer" min="0" optional="true"
                    label="Cost of non-canonical (non-GT-AG) splicing"
                    help="default=0" />
                    <param argument="-u" type="select" optional="true"
                    label="How to find GT-AG"
                    help="default=n (don't match GT-AG)">
                        <option value="n">Don't match GT-AG (-un)</option>
                        <option value="f">Transcript strand (-uf)</option>
                        <option value="b">Both strands (-ub)</option>
                    </param>
                    <!-- Always emits an explicit yes/no value (checked by default). -->
                    <param argument="--splice-flank" type="boolean" truevalue="--splice-flank=yes" falsevalue="--splice-flank=no" checked="true"
                    label="Assume conserved flanking region of splice sites?"
                    help="Assume the next base to a GT donor site tends to be A/G (91% in human and 92% in mouse) and the preceding base to a AG acceptor tends to be C/T. This trend is evolutionarily conserved, all the way to S. cerevisiae (PMID:18688272). Specifying this option generally leads to higher junction accuracy by several percents, so it is applied by default with --splice. However, the SIRV control does not honor this trend (only ~60%) so this option reduces accuracy. If you are benchmarking minimap2 on SIRV data, please disable this option." />
                    <!-- Optional BED annotation; the BED file and the bonus are
                         passed together when "yes" is selected. -->
                    <conditional name="splice_site_annotations">
                        <param name="use_annotations" type="select"
                        label="Use previously annotated splice sites to guide the alignment?"
                        help="">
                            <option value="no">No, perform unbiased alignment</option>
                            <option value="yes">Yes, favor annotated splice sites</option>
                        </param>
                        <when value="no" />
                        <when value="yes">
                            <param argument="--junc-bed" type="data" format="bed"
                            label="Dataset with annotated genes or introns"
                            help="Gene annotations should be provided in BED12 (aka 12-column BED), intron positions in 5-column BED format." />
                            <param argument="--junc-bonus" type="integer" min="1" value="1"
                            label="Annotated splice site bonus"
                            help="Score bonus for a splice donor or acceptor found in annotation." />
                        </when>
                    </conditional>
                </when>
            </conditional>
            <param argument="-A" type="integer" min="0" optional="true"
            label="Score for a sequence match" help="default=2"/>
            <param argument="-B" type="integer" min="0" optional="true"
            label="Penalty for a mismatch" help="-B; default=4" />
            <!-- O2, E2 and z2 have no command-line flag of their own: the
                 command template appends each one, after a comma, to the value
                 of -O, -E and -z respectively, and ignores it when the
                 corresponding first value is empty. -->
            <param argument="-O" type="integer" min="1" optional="true"
            label="Gap open penalties for deletions" help="-O; default=4"/>
            <param name="O2" type="integer" min="0" optional="true"
            label="Gap open penalties for insertions" help="-O; default=24"/>
            <param argument="-E" type="integer" min="1" optional="true"
            label="Gap extension penalties; a gap of size k cost &#39;-O + -E*k&#39;. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion"
            help="-E; default=2"/>
            <param name="E2" type="integer" min="0" optional="true"
            label="Gap extension penalty for extending an insertion; if left empty uses the value specified for Gap extension penalties above"
            help="-E; default=1"/>
            <param argument="-z" type="integer" min="0" optional="true"
            label="Z-drop threshold for truncating an alignment"
            help="Increase to improve the contiguity of alignments at the cost of poorer alignments in the middle. default=400" />
            <param name="z2" type="integer" min="0" optional="true"
            label="Z-drop threshold for reverse-complementing the query"
            help="Decrease to find small inversions at the cost of performance and false positives. default=200" />
            <param argument="-s" type="integer" min="0" optional="true"
            label="Minimal peak DP alignment score" help="default=80"/>
            <!-- Inverted boolean: the box is ticked by default and emits
                 nothing; un-ticking it emits the no-end-flt long option. -->
            <param name="no_end_flt" type="boolean" falsevalue="--no-end-flt" truevalue="" checked="true"
            label="Filter seeds towards the ends of chains before performing base-level alignment?" />
        </section>
        <!-- Output format plus output-related minimap2 flags. output_format
             also drives the datatype of the result through change_format in
             the outputs section (BAM is the first, i.e. default, choice). -->
        <section name="io_options" title="Set advanced output options" help="Sets -Q, -L, -K, --cs, -c, --eqx and -Y options." expanded="False">
            <param name="output_format" type="select" label="Select an output format">
                <option value="BAM">BAM</option>
                <option value="CRAM">CRAM</option>
                <option value="paf">PAF</option>
            </param>
            <param argument="-Q" type="boolean" truevalue="-Q" falsevalue="" label="Don't output base quality"/>
            <param argument="-L" type="boolean" truevalue="-L" falsevalue="" label="Write CIGAR with >65535 ops to the CG tag" help="Useful for very long reads in SAM/BAM format"/>
            <param argument="-K" type="integer" optional="true" label="Minibatch size for mapping (in megabyte)" help="default=500M"/>
            <!-- Optional select: when nothing is chosen the cs option is left
                 off the command line altogether. -->
            <param argument="--cs" type="select" optional="true" label="Output cs tag?" help="The cs tag is a more compact standalone representation of the MD tag, see help below.">
                <option value="none">no</option>
                <option value="short">short</option>
                <option value="long">long</option>
            </param>
            <param argument="-c" type="boolean" truevalue="-c" falsevalue="" label="Generate CIGAR"
                help="In PAF, the CIGAR is written to the ‘cg’ custom tag." />

            <param argument="--eqx" type="boolean" truevalue="--eqx" falsevalue="" label="Write =/X CIGAR operators"/>
            <param argument="-Y" type="boolean" truevalue="-Y" falsevalue="" label="Use soft clipping for supplementary alignments ?"/>
        </section>
    </inputs>
    <!-- One result dataset. It is declared as BAM and switched to PAF or CRAM
         by change_format when that output format is selected. -->
    <outputs>
        <data name="alignment_output"
              format="bam"
              label="${tool.name} on ${on_string} (mapped reads in ${io_options.output_format} format)">
            <!-- Propagate the genome build (dbkey) of the reference to the output. -->
            <actions>
                <conditional name="reference_source.reference_source_selector">
                    <!-- built-in reference: take column 1 of the all_fasta row
                         whose column 0 equals the selected ref_file, dropping
                         rows that start with "#" -->
                    <when value="cached">
                        <action name="dbkey" type="metadata">
                            <option name="all_fasta" type="from_data_table" column="1" offset="0">
                                <filter column="0" type="param_value" value="#" compare="startswith" keep="False"/>
                                <filter column="0" type="param_value" ref="reference_source.ref_file"/>
                            </option>
                        </action>
                    </when>
                    <!-- history reference: copy the dbkey of the chosen dataset -->
                    <when value="history">
                        <action name="dbkey" type="metadata">
                            <option name="reference_source.ref_file" type="from_param" param_attribute="dbkey"/>
                        </action>
                    </when>
                </conditional>
            </actions>
            <change_format>
                <when input="io_options.output_format" value="paf" format="paf"/>
                <when input="io_options.output_format" value="CRAM" format="cram"/>
            </change_format>
        </data>
    </outputs>
    <tests>
        <test>
            <!-- single-end reads, history reference, "sr" preset, default BAM output -->
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history"/>
                <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            </conditional>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single"/>
                <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
                <param name="analysis_type_selector" value="sr"/>
            </conditional>
            <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="4"/>
        </test>
        <test>
            <!-- same inputs as the single-end test, but written as CRAM;
                 the result is compared by size similarity only -->
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history"/>
                <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            </conditional>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single"/>
                <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
                <param name="analysis_type_selector" value="sr"/>
            </conditional>
            <section name="io_options">
                <param name="output_format" value="CRAM"/>
            </section>
            <output name="alignment_output" ftype="cram" file="minimap2-test1-fasta.cram" compare="sim_size"/>
        </test>
        <test>
            <!-- paired-end reads supplied as two separate datasets -->
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history"/>
                <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            </conditional>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="paired"/>
                <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
                <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
            </conditional>
            <output name="alignment_output" ftype="bam" file="minimap2-test1.bam" lines_diff="4"/>
        </test>
        <test>
            <!-- paired-end reads where only the first mate file is gzip-compressed;
                 expected output is the same file as for the uncompressed pair -->
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history"/>
                <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            </conditional>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="paired"/>
                <param name="fastq_input1" ftype="fastqsanger.gz" value="bwa-mem-fastq1.fq.gz"/>
                <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
            </conditional>
            <output name="alignment_output" ftype="bam" file="minimap2-test1.bam" lines_diff="4"/>
        </test>
        <test>
            <!-- paired-end reads supplied as a paired dataset collection -->
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history"/>
                <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            </conditional>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="paired_collection"/>
                <param name="fastq_input1">
                    <collection type="paired">
                        <element name="forward" value="bwa-mem-fastq1.fq"/>
                        <element name="reverse" value="bwa-mem-fastq2.fq"/>
                    </collection>
                </param>
            </conditional>
            <output name="alignment_output" ftype="bam" file="minimap2-test2.bam" lines_diff="4"/>
        </test>
        <test>
            <!-- reference taken from the data table instead of the history -->
            <conditional name="reference_source">
                <param name="reference_source_selector" value="cached"/>
                <param name="ref_file" value="bwa-mem-mt-genome"/>
            </conditional>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single"/>
                <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
                <param name="analysis_type_selector" value="sr"/>
            </conditional>
            <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="4"/>
        </test>
        <test>
            <!-- explicit mapping/alignment/output options on top of the "sr" preset -->
            <conditional name="reference_source">
                <param name="reference_source_selector" value="cached"/>
                <param name="ref_file" value="bwa-mem-mt-genome"/>
            </conditional>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single"/>
                <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
                <param name="analysis_type_selector" value="sr"/>
            </conditional>
            <section name="mapping_options">
                <param name="min_occ_floor" value="1000"/>
            </section>
            <section name="alignment_options">
                <!-- These values are the defaults of the "sr" mode; setting
                     them explicitly checks that every alignment parameter is
                     parsed correctly without changing the expected result. -->
                <param name="A" value="2"/>
                <param name="B" value="8"/>
                <param name="O" value="12"/>
                <param name="O2" value="32"/>
                <param name="E" value="2"/>
                <param name="E2" value="1"/>
                <param name="z" value="400"/>
                <param name="s" value="40"/>
            </section>
            <section name="io_options">
                <!-- A no-op for BAM output; it only verifies that the cs
                     option yields a valid command line. -->
                <param name="cs" value="none"/>
            </section>
            <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="4"/>
        </test>
        <test>
            <!-- PAF output: the same FASTQ file is used as reference and as
                 reads, with the "ava-ont" preset -->
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history"/>
                <param name="ref_file" ftype="fastqsanger" value="mini_reads.fq"/>
            </conditional>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single"/>
                <param name="fastq_input1" ftype="fastqsanger" value="mini_reads.fq"/>
                <param name="analysis_type_selector" value="ava-ont"/>
            </conditional>
            <section name="io_options">
                <param name="output_format" value="paf"/>
            </section>
            <output name="alignment_output" ftype="paf" file="mini_reads.paf"/>
        </test>
        <test>
            <!-- Test self-homology mode: the same FASTA file is used as the
            history reference and as the single-end query. For this
            analysis type the command template emits "-DP -k19 -w19 -m200"
            instead of a -x preset. -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="minimap2-self-homology.fasta" />
            <param name="fastq_input_selector" value="single" />
            <param name="fastq_input1" ftype="fasta" value="minimap2-self-homology.fasta" />
            <param name="analysis_type_selector" value="self-homology" />
            <output name="alignment_output" ftype="bam" file="minimap2-self-homology.bam" lines_diff="4" />
        </test>
        <test>
            <!-- Test the mask-len option: "sr" preset against a history
            reference, with mask_len in the mapping_options section set to
            100. The result is compared against minimap2-test-mask_len.bam. -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
            <param name="analysis_type_selector" value="sr"/>
            <section name="mapping_options">
                <param name="mask_len" value="100"/>
            </section>
            <output name="alignment_output" ftype="bam" file="minimap2-test-mask_len.bam" lines_diff="4" />
        </test>
        <test>
            <!-- Test the map-hifi preset with a reference file named
            pacbio_hifi_assembly.fa.gz taken from the history.
            NOTE(review): ref_file is declared ftype "fasta" although its name
            ends in .fa.gz, and fastq_input1 is declared "fastqsanger"
            although its name ends in .fasta.gz; confirm the declared
            datatypes are intended. -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="pacbio_hifi_assembly.fa.gz"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="pacbio_hifi_reads.fasta.gz"/>
            <param name="analysis_type_selector" value="map-hifi"/>
            <output name="alignment_output" ftype="bam" file="minimap2-test_hifi-fasta.bam" lines_diff="4" />
        </test>
        <test>
            <!-- Test the map-hifi preset with an uncompressed reference
            (pacbio_hifi_assembly.fa) from the history; the query file is the
            same pacbio_hifi_reads.fasta.gz used by the preceding map-hifi
            test.
            NOTE(review): fastq_input1 is declared ftype "fastqsanger" although
            its name ends in .fasta.gz; confirm the declared datatype is
            intended. -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="pacbio_hifi_assembly.fa"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="pacbio_hifi_reads.fasta.gz"/>
            <param name="analysis_type_selector" value="map-hifi"/>
            <output name="alignment_output" ftype="bam" file="minimap2-test_hifi-2-fasta.bam" lines_diff="4" />
        </test>
        <test>
            <!-- Test the k-mer occurrence interval option: "sr" preset against
            a history reference, with the interval conditional enabled and
            lower_limit=10, upper_limit=30. The identifiers below keep the
            tool's own "ocurrence" spelling; they are parameter names and must
            not be corrected in this test alone. -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
            <param name="analysis_type_selector" value="sr"/>
            <conditional name="kmer_ocurrence_interval">
                <param name="interval" value="enabled"/>
                <param name="lower_limit" value="10"/>
                <param name="upper_limit" value="30"/>
            </conditional>
            <output name="alignment_output" ftype="bam" file="minimap2-test-kmer_ocurrence.bam" lines_diff="4" />
        </test>
    </tests>
    <help>

Users’ Guide
------------

Minimap2 is a versatile sequence alignment program that aligns DNA or
mRNA sequences against a large reference database. Typical use cases
include: (1) mapping PacBio or Oxford Nanopore genomic reads to the
human genome; (2) finding overlaps between long reads with error rate up
to ~15%; (3) splice-aware alignment of PacBio Iso-Seq or Nanopore cDNA
or Direct RNA reads against a reference genome; (4) aligning Illumina
single- or paired-end reads; (5) assembly-to-assembly alignment; (6)
full-genome alignment between two closely related species with
divergence below ~15%.

For ~10kb noisy read sequences, minimap2 is tens of times faster than
mainstream long-read mappers such as BLASR, BWA-MEM, NGMLR and GMAP. It
is more accurate on simulated long reads and produces biologically
meaningful alignment ready for downstream analyses. For >100bp Illumina
short reads, minimap2 is three times as fast as BWA-MEM and Bowtie2, and
as accurate on simulated data. Detailed evaluations are available from
the `minimap2 preprint &lt;https://arxiv.org/abs/1708.01492&gt;`_.

General usage
~~~~~~~~~~~~~

Minimap2 seamlessly works with gzip’d FASTA and FASTQ formats as input.
You don’t need to convert between FASTA and FASTQ or decompress gzip’d
files first.

For the human reference genome, minimap2 takes a few minutes to generate
a minimizer index for the reference before mapping. To reduce indexing
time, you can optionally save the index with option **-d** and replace
the reference sequence file with the index file on the minimap2 command
line.

**Importantly**, it should be noted that once you build the index,
indexing parameters such as **-k**, **-w**, **-H** and **-I** can’t be
changed during mapping. If you are running minimap2 for different data
types, you will probably need to keep multiple indexes generated with
different parameters. This makes minimap2 different from BWA which
always uses the same index regardless of query data types.

Use cases
~~~~~~~~~

Minimap2 uses the same base algorithm for all applications. However, due
to the different data types it supports (e.g. short vs long reads; DNA
vs mRNA reads), minimap2 needs to be tuned for optimal performance and
accuracy. It is usually recommended to choose a preset with option
**-x**, which sets multiple parameters at the same time. The default
setting is the same as ``map-ont``.

Map long noisy genomic reads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The difference between ``map-pb`` and ``map-ont`` is that ``map-pb``
uses homopolymer-compressed (HPC) minimizers as seeds, while ``map-ont``
uses ordinary minimizers as seeds. Empirical evaluation suggests HPC
minimizers improve performance and sensitivity when aligning PacBio
reads, but hurt when aligning Nanopore reads.

Map long mRNA/cDNA reads
^^^^^^^^^^^^^^^^^^^^^^^^

There are different long-read RNA-seq technologies, including
traditional full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq
and Direct RNA-seq. They produce data of varying quality and properties.
By default, ``-x splice`` assumes the read orientation relative to the
transcript strand is unknown. It tries two rounds of alignment to infer
the orientation and write the strand to the ``ts`` SAM/PAF tag if
possible. For Iso-seq, Direct RNA-seq and traditional full-length
cDNAs, it is desirable to apply ``-u f`` to force minimap2 to
consider the forward transcript strand only. This speeds up alignment
with slight improvement to accuracy. For noisy Nanopore Direct RNA-seq
reads, it is recommended to use a smaller k-mer size for increased
sensitivity to the first or the last exons.

It is worth noting that by default ``-x splice`` prefers
GT[A/G]..[C/T]AG over GT[C/T]..[A/G]AG, and then over other splicing
signals. Considering one additional base improves the junction accuracy
for noisy reads, but reduces the accuracy when aligning against the
widely used SIRV control data. This is because SIRV does not honor the
evolutionarily conserved splicing signal. If you are studying SIRV,
you may apply ``--splice-flank=no`` to let minimap2 only model GT..AG,
ignoring the additional base.

Find overlaps between long reads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Similarly, ``ava-pb`` uses HPC minimizers while ``ava-ont`` uses
ordinary minimizers. It is usually not recommended to perform base-level
alignment in the overlapping mode because it is slow and may produce
false positive overlaps. However, if performance is not a concern, you
may try to add ``-a`` or ``-c`` anyway.

Map short accurate genomic reads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When two read files are specified, minimap2 reads from each file in turn
and merges them into an interleaved stream internally. Two reads are
considered to be paired if they are adjacent in the input stream and
have the same name (with the ``/[0-9]`` suffix trimmed if present).
Single- and paired-end reads can be mixed.

Minimap2 does not work well with short spliced reads. There are many
capable RNA-seq mappers for short reads.

Full genome/assembly alignment
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For cross-species full-genome alignment, the scoring system needs to be
tuned according to the sequence divergence.

Self-homology map creation
^^^^^^^^^^^^^^^^^^^^^^^^^^

A self-homology map is created by mapping a genome (e.g. that of E. coli)
against itself. When this option is used the same FASTA file should
be used for reference and for the (single ended mode) query.

Advanced features
~~~~~~~~~~~~~~~~~

Working with >65535 CIGAR operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Due to a design flaw, BAM does not work with CIGAR strings with >65535
operations (SAM and CRAM work). However, for ultra-long nanopore reads
minimap2 may align ~1% of read bases with long CIGARs beyond the
capability of BAM. If you convert such SAM/CRAM to BAM, Picard and
recent samtools will throw an error and abort. Older samtools and other
tools may create corrupted BAM.

To avoid this issue, you can add option ``-L`` at the minimap2 command line.
This option moves a long CIGAR to the ``CG`` tag and leaves a fully clipped
CIGAR at the SAM CIGAR column. Current tools that don’t read CIGAR
(e.g. merging and sorting) still work with such BAM records; tools that read
CIGAR will effectively ignore these records. It has been decided that future
tools will seamlessly recognize long-cigar records generated by option ``-L``.

**TL;DR**: if you work with ultra-long reads and use tools that only
process BAM files, please add option ``-L``.

The cs optional tag
^^^^^^^^^^^^^^^^^^^

The ``cs`` SAM/PAF tag encodes bases at mismatches and INDELs. It
matches regular expression
``/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/``. Like CIGAR, ``cs``
consists of a series of operations. Each leading character specifies the
operation; the following sequence is the one involved in the operation.

The ``cs`` tag is enabled by command line option ``--cs``. The following
alignment, for example:

.. code::

    CGATCGATAAATAGAGTAG---GAATAGCA
    ||||||   ||||||||||   |||| |||
    CGATCG---AATAGAGTAGGTCGAATtGCA

is represented as ``:6-ata:10+gtc:4*at:3``, where ``:[0-9]+`` represents
an identical block, ``-ata`` represents a deletion, ``+gtc`` an insertion
and ``*at`` indicates reference base ``a`` is substituted with a query
base ``t``. It is similar to the ``MD`` SAM tag but is standalone and
easier to parse.

If ``--cs=long`` is used, the ``cs`` string also contains identical
sequences in the alignment. The above example will become
``=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA``. The long form of ``cs``
encodes both reference and query sequences in one string.

Algorithm overview
~~~~~~~~~~~~~~~~~~

In the following, minimap2 command line options have a dash ahead and
are highlighted in bold. The description may help to tune minimap2
parameters.

1. Read **-I** [=*4G*] reference bases, extract
   (**-k**,\ **-w**)-minimizers and index them in a hash table.

2. Read **-K** [=*200M*] query bases. For each query sequence, do step 3
   through 7:

3. For each (**-k**,\ **-w**)-minimizer on the query, check against the
   reference index. If a reference minimizer is not among the top **-f**
   [=*2e-4*] most frequent, collect its occurrences in the
   reference, which are called *seeds*.

4. Sort seeds by position in the reference. Chain them with dynamic
   programming. Each chain represents a potential mapping. For read
   overlapping, report all chains and then go to step 8. For reference
   mapping, do step 5 through 7:

5. Let *P* be the set of primary mappings, which is an empty set
   initially. For each chain from the best to the worst according to
   their chaining scores: if on the query, the chain overlaps with a
   chain in *P* by **--mask-level** [=*0.5*] or higher fraction of the
   shorter chain, mark the chain as *secondary* to the chain in *P*;
   otherwise, add the chain to *P*.

6. Retain all primary mappings. Also retain up to **-N** [=*5*] top
   secondary mappings if their chaining scores are higher than **-p**
   [=*0.8*] of their corresponding primary mappings.

7. If alignment is requested, filter out an internal seed if it
   potentially leads to both a long insertion and a long deletion.
   Extend from the left-most seed. Perform global alignments between
   internal seeds. Split the chain if the accumulative score along the
   global alignment drops by **-z** [=*400*], disregarding long gaps.
   Extend from the right-most seed. Output chains and their alignments.

8. If there are more query sequences in the input, go to step 2 until no
   more queries are left.

9. If there are more reference sequences, reopen the query file from the
   start and go to step 1; otherwise stop.

Limitations
-----------

-  Minimap2 may produce suboptimal alignments through long
   low-complexity regions where seed positions may be suboptimal. This
   should not be a big concern because even the optimal alignment may be
   wrong in such regions.
    </help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Fri, 19 Nov 2021 14:35:04 +0000
parents	7db8d4bc1eea
children	92678fcb1a5f