<!-- Mercurial > repos > iuc > winnowmap -->

<tool id="winnowmap" name="Winnowmap" version="@TOOL_VERSION@+galaxy1" profile="20.05">
    <description>a mapping tool optimized for repetitive sequences</description>
    <!-- The edam_ontology and requirements macros expanded below are expected
         to be defined in macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology" />
    <expand macro="requirements" />
    <!-- Fail the job on any non-zero exit code, or when stderr contains the
         literal text [ERROR]. -->
    <stdio>
        <exit_code level="fatal" range="1:" />
        <regex level="fatal" source="stderr" match="\[ERROR\]" />
    </stdio>
    <version_command>winnowmap --version</version_command>
    <command>
<![CDATA[
    ## Expose the reference under a fixed local name; the same name is reused
    ## by the CRAM writer further down.
    #if $reference_source.reference_source_selector == 'history':
        ln -f -s '$reference_source.ref_file' reference.fa &&
    #else:
        ln -f -s '$reference_source.ref_file.fields.path' reference.fa &&
    #end if
    winnowmap
    ## The self-homology profile is not passed through -x; it expands to explicit flags.
    #if str($analysis_type_selector) == 'self-homology':
        -DP -k19 -w19 -m200
    #elif $analysis_type_selector:
        -x ${analysis_type_selector}
    #end if
    ## An optional dataset that was left empty stringifies to the word None,
    ## which is a non-empty (truthy) string. Compare against that literal so
    ## that -W is only emitted for a real dataset.
    #set $has_highfreq_kmers = str($highfreq_kmers) != 'None'
    #if $has_highfreq_kmers:
        -W '$highfreq_kmers'
    #end if
    ## indexing options
    $indexing_options.H
    ## -k is only passed when no high frequency k-mer list is supplied.
    #if str($indexing_options.k) and not $has_highfreq_kmers:
        -k $indexing_options.k
    #end if
    #if str($indexing_options.w):
        -w $indexing_options.w
    #end if
    #if str($indexing_options.I):
        -I $indexing_options.I
    #end if

    ## Mapping options
    #if str($mapping_options.f):
        -f $mapping_options.f
    #end if
    #if str($mapping_options.min_occ_floor):
        --min-occ-floor $mapping_options.min_occ_floor
    #end if
    #if str($mapping_options.g):
        -g $mapping_options.g
    #end if
    #if str($mapping_options.r):
        -r $mapping_options.r
    #end if
    #if str($mapping_options.n):
        -n $mapping_options.n
    #end if
    #if str($mapping_options.m):
        -m $mapping_options.m
    #end if
    #if str($mapping_options.max_chain_skip):
        --max-chain-skip $mapping_options.max_chain_skip
    #end if
    #if str($mapping_options.max_chain_iter):
        --max-chain-iter $mapping_options.max_chain_iter
    #end if
    $mapping_options.X
    #if str($mapping_options.p):
        -p $mapping_options.p
    #end if
    ## Boolean whose true value is the flag itself and whose false value is empty.
    $mapping_options.sv_off

    ## Alignment options
    #if str($alignment_options.splicing.splice_mode) == '--splice':
        --frag=no --splice
        #if str($alignment_options.splicing.G):
            -G $alignment_options.splicing.G
        #end if
        #if str($alignment_options.splicing.C):
            -C $alignment_options.splicing.C
        #end if
        #if $alignment_options.splicing.u:
            -u $alignment_options.splicing.u
        #end if
        $alignment_options.splicing.splice_flank
        #if str($alignment_options.splicing.splice_site_annotations.use_annotations) == 'yes':
            --junc-bed '$alignment_options.splicing.splice_site_annotations.junc_bed'
            --junc-bonus $alignment_options.splicing.splice_site_annotations.junc_bonus
        #end if
    #elif str($mapping_options.F) and 'splice' not in str($analysis_type_selector):
        --frag=yes -F $mapping_options.F
    #end if
    #if str($alignment_options.A):
        -A $alignment_options.A
    #end if
    #if str($alignment_options.B):
        -B $alignment_options.B
    #end if
    ## -O, -E and -z take an optional second value that is appended after a comma.
    #if str($alignment_options.O):
        #if str($alignment_options.O2):
            -O $alignment_options.O,$alignment_options.O2
        #else
            -O $alignment_options.O
        #end if
    #end if
    #if str($alignment_options.E):
        #if str($alignment_options.E2):
            -E $alignment_options.E,$alignment_options.E2
        #else
            -E $alignment_options.E
        #end if
    #end if
    #if str($alignment_options.z):
        #if str($alignment_options.z2):
            -z $alignment_options.z,$alignment_options.z2
        #else
            -z $alignment_options.z
        #end if
    #end if
    #if str($alignment_options.s):
        -s $alignment_options.s
    #end if
    $alignment_options.no_end_flt
    ## Output options
    $io_options.Q
    $io_options.L
    $io_options.c
    ## --eqx is an aligner option, so it has to be emitted before the pipe
    ## into samtools and for every output format.
    $io_options.eqx
    #if $io_options.cs:
        --cs $io_options.cs
    #end if
    $io_options.Y
    #if $io_options.K:
        -K $io_options.K
    #end if
    -t \${GALAXY_SLOTS:-4}
    reference.fa
    #if $fastq_input.fastq_input_selector in ['single', 'paired_iv']:
        '$fastq_input.fastq_input1'
    #else if $fastq_input.fastq_input_selector == 'paired':
         '$fastq_input.fastq_input1' '$fastq_input.fastq_input2'
    #else if $fastq_input.fastq_input_selector == 'paired_collection':
         '$fastq_input.fastq_input1.forward' '$fastq_input.fastq_input1.reverse'
    #end if
    ## BAM and CRAM: request SAM with -a and sort it with samtools.
    ## Any other format: write the aligner output directly.
    #if $io_options.output_format == 'BAM':
        -a
        | samtools sort
        --no-PG
        -@\${GALAXY_SLOTS:-2}
        -T "\${TMPDIR:-.}"
        -O $io_options.output_format
        -o '$alignment_output'
    #else if $io_options.output_format == 'CRAM':
        -a
        | samtools sort
        --no-PG
        -T "\${TMPDIR:-.}"
        -@\${GALAXY_SLOTS:-2}
        -O $io_options.output_format
        --reference reference.fa
        --output-fmt-option no_ref
        -o '$alignment_output'
    #else:
        > '$alignment_output'
    #end if
]]>
    </command>
    <inputs>
        <!-- Reference sequence: an entry of the all_fasta data table, or a dataset from the history. -->
        <conditional name="reference_source">
            <param name="reference_source_selector" type="select"
                   label="Will you select a reference genome from your history or use a built-in index?"
                   help="Built-ins were indexed using default options. See `Indexes` section of help below. If you would like to perform self-mapping select `history` here, then choose your input file as reference.">
                <option value="cached">Use a built-in genome index</option>
                <option value="history">Use a genome from history and build index</option>
            </param>
            <when value="cached">
                <param name="ref_file" type="select"
                       label="Using reference genome"
                       help="Select genome from the list">
                    <options from_data_table="all_fasta">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No reference genomes are available"/>
                    </options>
                    <validator type="no_options"
                               message="A built-in reference genome is not available for the build associated with the selected input file"/>
                </param>
            </when>
            <when value="history">
                <param name="ref_file" type="data" format="fasta,fastq"
                       label="Use the following dataset as the reference sequence"
                       help="You can upload a FASTA or FASTQ sequence to the history and use it as reference"/>
            </when>
        </conditional>
        <!-- Read input selection; the structure of this conditional was copied from the bwa-mem wrapper. -->
        <conditional name="fastq_input">
            <param name="fastq_input_selector" type="select"
                   label="Single or Paired-end reads"
                   help="Select between paired and single end data">
                <option value="single">Single</option>
                <option value="paired">Paired</option>
                <option value="paired_collection">Paired Collection</option>
                <option value="paired_iv">Paired Interleaved</option>
            </param>
            <!-- NOTE(review): an inherited comment here said that preset options
            are only offered for single-end input, because paired-end alignment in
            minimap2 is only enabled with -x sr
            (see https://github.com/lh3/minimap2/issues/190).
            No branch below contains a preset selector, so that statement does not
            describe this conditional; confirm whether it should be dropped. -->
            <when value="single">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta"
                       label="Select fastq dataset"
                       help="Specify dataset with single reads"/>
            </when>
            <when value="paired">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta"
                       label="Select first set of reads"
                       help="Specify dataset with forward reads"/>
                <param name="fastq_input2" type="data" format="fastqsanger,fastqsanger.gz,fasta"
                       label="Select second set of reads"
                       help="Specify dataset with reverse reads"/>
            </when>
            <when value="paired_collection">
                <param name="fastq_input1" type="data_collection" collection_type="paired"
                       format="fastqsanger,fastqsanger.gz,fasta"
                       label="Select a paired collection"
                       help="See help section for an explanation of dataset collections"/>
            </when>
            <when value="paired_iv">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta"
                       label="Select fastq dataset"
                       help="Specify dataset with interleaved reads"/>
            </when>
        </conditional>
        <!-- Optional list of high frequency k-mers, passed to winnowmap with -W. -->
        <param name="highfreq_kmers" argument="-W" type="data" format="tabular" optional="True"
               label="High frequency k-mers dataset"
               help="Input file containing list of high freq. k-mers generated by meryl"/>
        <!-- Preset profiles. Every value except self-homology is handed to winnowmap
             via -x; self-homology is expanded to explicit flags in the command. -->
        <param name="analysis_type_selector" type="select" optional="True"
            label="Select a profile of preset options"
            help="Each profile comes with the preconfigured settings mentioned in parentheses. You can customize each profile further in the indexing, mapping and alignment options sections below. If you do not select a profile here, the tool will use the per-parameter defaults listed in the below sections unless you customize them." >
            <option value="map-pb">PacBio/Oxford Nanopore read to reference mapping (-Hk19) (map-pb)</option>
            <option value="map-ont">Oxford Nanopore read to reference mapping. Slightly more sensitive for Oxford Nanopore to reference mapping (-k15). For PacBio reads, HPC minimizers consistently leads to faster performance and more sensitive results in comparison to normal minimizers. For Oxford Nanopore data, normal minimizers are better, though not much. The effectiveness of HPC is determined by the sequencing error mode. (map-ont)</option>
            <!-- The label must name the preset exactly as it is passed to -x. -->
            <option value="map-pb-clr">Turn off SV-aware mode for (relatively) short and noisy reads (map-pb-clr)</option>
            <option value="asm5">Long assembly to reference mapping (-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 --min-occ-floor=100). Typically, the alignment will not extend to regions with 5% or higher sequence divergence. Only use this preset if the average divergence is far below 5%. (asm5)</option>
            <option value="asm10">Long assembly to reference mapping (-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 --min-occ-floor=100). Up to 10% sequence divergence. (asm10)</option>
            <option value="asm20">Long assembly to reference mapping (-k19 -w10 -A1 -B6 -O6,26 -E2,1 -s200 -z200 --min-occ-floor=100). Up to 20% sequence divergence. (asm20)</option>
            <option value="splice">Long-read spliced alignment (-k15 -w5 --splice -g2000 -G200k  -A1 -B2  -O2,32  -E1,0  -C9  -z200  -ub  --splice-flank=yes). In the splice mode, 1) long deletions are taken as  introns  and  represented as the `N' CIGAR operator 2) long insertions are disabled 3) deletion and insertion gap costs are different during chaining 4) the computation of the `ms` tag ignores introns to demote hits to pseudogenes. (splice)</option>
            <option value="splice:hq">Long-read splice alignment for PacBio CCS reads (same as `splice` but with -C5 -O6,24 -B4) (splice:hq)</option>
            <option value="self-homology">Construct a self-homology map - use same genome as query and reference (-DP -k19 -w19 -m200) (self-homology)</option>
        </param>
        <!-- Indexing flags; every parameter is optional and only emitted when set. -->
        <section name="indexing_options" title="Indexing options">
            <param name="H" argument="-H" type="boolean" optional="true"
                   truevalue="-H" falsevalue=""
                   label="Use homopolymer-compressed k-mer ?"/>
            <param argument="-k" type="integer" optional="true" min="4" max="28"
                   label="K-mer size"
                   help="k-mer size (no larger than 28). "/>
            <param argument="-w" type="integer" optional="true" min="1"
                   label="Minimizer window size"
                   help=""/>
            <param argument="-I" type="integer" optional="true" min="1"
                   label="Split index for every N input gigabases"
                   help=""/>
        </section>
        <!-- Seeding and chaining flags; every parameter is optional and only emitted when set. -->
        <section name="mapping_options" title="Mapping options" help="Sets -f, -g, -F, -r, -n, -m, -X, -p, --sv-off, --min-occ-floor, --max-chain-skip and --max-chain-iter options." expanded="False">
            <param argument="--sv-off" type="boolean" truevalue="--sv-off" falsevalue=""  checked="False" label="Turn off SV-aware mode" help="SV aware k-mer search allows to find approximate mapping locations for a read"/>
            <param argument="-F" type="integer" min="0" value="" optional="true"
            label="Max fragment length for PE alignment"
            help="The maximum apparent fragment length up to which paired-end reads are aligned together; at higher fragment lengths the mates will be aligned independent of each other; effective only for paired-end data and when spliced alignment mode is turned off; default=800" />
            <param argument="-f" type="float" value="" optional="true" label="Filter out top FLOAT fraction of repetitive minimizers" help="default=0.0002"/>
            <param argument="--min-occ-floor" name="min_occ_floor" type="integer" label="Force winnowmap to always use k-mers occurring this many times or fewer" help="Maximum occurrence is the number of repetitive minimizers determined by '-f' or this value, whichever is higher." optional="true" />
            <param argument="-g" type="integer" value="" optional="true" label="Stop chain elongation if there are no minimizers in INT-bp" help="default=5000"/>
            <param argument="-r" type="integer" value="" optional="true" label="Bandwidth used in chaining and DP-based alignment" help="default=500" />
            <param argument="-n" type="integer" value="" optional="true" label="Minimal number of minimizers on a chain" help="default=3"/>
            <param argument="-m" type="integer" value="" optional="true" label="Minimal chaining score (matching bases minus log gap penalty)" help="default=40"/>
            <param argument="--max-chain-skip" type="integer" value="" optional="true"
            label="Maximum seed skips during chaining"
            help="A heuristic that stops chaining early. Winnowmap uses dynamic programming for chaining. The time complexity is quadratic in the number of seeds. This option makes winnowmap exit the inner loop if it repeatedly sees seeds already on chains. Set to a large number to switch off this heuristic effectively. default=25" />
            <param argument="--max-chain-iter" type="integer" value="" optional="true"
            label="Maximum number of partial chains checked during chaining"
            help="A heuristic to avoid quadratic time complexity in the worst case. default=5000" />
            <param argument="-X" type="boolean" truevalue="-X" falsevalue="" optional="true" label="Skip self and dual mappings (for the all-vs-all mode)"/>
            <param argument="-p" type="float" value="" max="1" optional="true" label="Min secondary-to-primary score ratio" help="default=0.8"/>
        </section>
        <!-- Base-level alignment scoring plus the spliced-alignment sub-options. -->
        <section name="alignment_options" title="Alignment options" expanded="False"
                 help="Sets -A, -B, -O, -E, -z, -s, and spliced alignments options.">
            <conditional name="splicing">
                <!-- NOTE(review): the command template only reacts to the value
                     "- -splice" written without the space; the "preset" and empty
                     values emit nothing. Confirm the empty value can really
                     override a splice profile. -->
                <param name="splice_mode" type="select"
                       label="Customize spliced alignment mode?"
                       help="Warning: Winnowmap cannot currently perform spliced alignments of read pairs. If you enable spliced alignment for paired-end data it will be treated as single-end!">
                    <option value="preset">No, use profile setting or leave turned off</option>
                    <option value="">Disable spliced alignments (overwrite profile setting if necessary)</option>
                    <option value="--splice">Yes, enable spliced alignments (--splice)</option>
                </param>
                <when value="preset"/>
                <when value=""/>
                <when value="--splice">
                    <param argument="-G" type="integer" optional="true" value=""
                           label="Maximum allowed gap on the reference"
                           help="Higher values cause slower spliced alignment. When in use, this option causes -r (in mapping options) to be set to the same value. default=200k"/>
                    <param argument="-C" type="integer" optional="true" min="0"
                           label="Cost of non-canonical (non-GT-AG) splicing"
                           help="default=0"/>
                    <param argument="-u" type="select" optional="true"
                           label="how to find GT-AG"
                           help="default=n (don't match GT-AG)">
                        <option value="n">don't match GT-AG (-un)</option>
                        <option value="f">transcript strand (-uf)</option>
                        <option value="b">both strands (-ub)</option>
                    </param>
                    <param argument="--splice-flank" type="boolean" checked="true"
                           truevalue="--splice-flank=yes" falsevalue="--splice-flank=no"
                           label="Assume conserved flanking region of splice sites?"
                           help="Assume the next base to a GT donor site tends to be A/G (91% in human and 92% in mouse) and the preceding base to a AG acceptor tends to be C/T. This trend is evolutionarily conserved, all the way to S. cerevisiae (PMID:18688272). Specifying this option generally leads to higher junction accuracy by several percents, so it is applied by default with --splice. However, the SIRV control does not honor this trend (only ~60%) so this option reduces accuracy. If you are benchmarking winnowmap on SIRV data, please disable this option."/>
                    <conditional name="splice_site_annotations">
                        <param name="use_annotations" type="select"
                               label="Use previously annotated splice sites to guide the alignment?"
                               help="">
                            <option value="no">No, perform unbiased alignment</option>
                            <option value="yes">Yes, favor annotated splice sites</option>
                        </param>
                        <when value="no"/>
                        <when value="yes">
                            <param argument="--junc-bed" type="data" format="bed"
                                   label="Dataset with annotated genes or introns"
                                   help="Gene annotations should be provided in BED12 (aka 12-column BED), intron positions in 5-column BED format."/>
                            <param argument="--junc-bonus" type="integer" min="1" value="1"
                                   label="Annotated splice site bonus"
                                   help="Score bonus for a splice donor or acceptor found in annotation."/>
                        </when>
                    </conditional>
                </when>
            </conditional>
            <param argument="-A" type="integer" optional="true" min="0"
                   label="Score for a sequence match"
                   help="default=2"/>
            <param argument="-B" type="integer" optional="true" min="0"
                   label="Penalty for a mismatch"
                   help="-B; default=4"/>
            <!-- O2, E2 and z2 are the optional second values of -O, -E and -z. -->
            <param argument="-O" type="integer" optional="true" min="1"
                   label="Gap open penalties for deletions"
                   help="-O; default=4"/>
            <param name="O2" type="integer" optional="true" min="0"
                   label="Gap open penalties for insertions"
                   help="-O; default=24"/>
            <param argument="-E" type="integer" optional="true" min="1"
                   label="Gap extension penalties; a gap of size k cost &#39;-O + -E*k&#39;. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion"
                   help="-E; default=2"/>
            <param name="E2" type="integer" optional="true" min="0"
                   label="Gap extension penalty for extending an insertion; if left empty uses the value specified for Gap extension penalties above"
                   help="-E; default=1"/>
            <param argument="-z" type="integer" optional="true" min="0"
                   label="Z-drop threshold for truncating an alignment"
                   help="Increase to improve the contiguity of alignments at the cost of poorer alignments in the middle. default=400"/>
            <param name="z2" type="integer" optional="true" min="0"
                   label="Z-drop threshold for reverse-complementing the query"
                   help="Decrease to find small inversions at the cost of performance and false positives. default=200"/>
            <param argument="-s" type="integer" optional="true" min="0"
                   label="minimal peak DP alignment score"
                   help="default=80"/>
            <!-- Inverted boolean: the flag is emitted when the box is unchecked. -->
            <param name="no_end_flt" type="boolean" checked="true"
                   truevalue="" falsevalue="--no-end-flt"
                   label="Filter seeds towards the ends of chains before performing base-level alignment?"/>
        </section>
        <!-- Output format and output-related flags. The section help lists exactly
             the parameters defined in this section. -->
        <section name="io_options" title="Set advanced output options" help="Sets the output format and the -Q, -L, -K, --cs, -c, --eqx and -Y options." expanded="False">
            <param name="output_format" type="select" label="Select an output format">
                <option value="BAM">BAM</option>
                <option value="CRAM">CRAM</option>
                <option value="paf">paf</option>
            </param>
            <param argument="-Q" type="boolean" truevalue="-Q" falsevalue="" label="don't output base quality"/>
            <param argument="-L" type="boolean" truevalue="-L" falsevalue="" label="write CIGAR with >65535 ops to the CG tag" help="Useful for very long reads in SAM/BAM format"/>
            <param argument="-K" type="integer" optional="true" label="minibatch size for mapping (in megabyte)" help="default=500M"/>
            <param argument="--cs" type="select" optional="true" label="Output cs tag?" help="The cs tag is a more compact standalone representation of the MD tag, see help below.">
                <option value="none">no</option>
                <option value="short">short</option>
                <option value="long">long</option>
            </param>
            <param argument="-c" type="boolean" truevalue="-c" falsevalue="" label="Generate CIGAR"
                help="In PAF, the CIGAR is written to the ‘cg’ custom tag." />

            <param argument="--eqx" type="boolean" truevalue="--eqx" falsevalue="" label="write =/X CIGAR operators"/>
            <param argument="-Y" type="boolean" truevalue="-Y" falsevalue="" label="use soft clipping for supplementary alignments ?"/>
        </section>
    </inputs>
    <outputs>
        <!-- Single output dataset: declared as bam, switched to paf or cram
             according to the selected output format. -->
        <data name="alignment_output" format="bam"
              label="${tool.name} on ${on_string} (mapped reads in ${io_options.output_format} format)">
            <!-- dbkey is taken from the data table entry for a built-in reference,
                 or from the reference dataset when it comes from the history. -->
            <actions>
                <conditional name="reference_source.reference_source_selector">
                    <when value="cached">
                        <action type="metadata" name="dbkey">
                            <option type="from_data_table" name="all_fasta" column="1" offset="0">
                                <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
                                <filter type="param_value" ref="reference_source.ref_file" column="0"/>
                            </option>
                        </action>
                    </when>
                    <when value="history">
                        <action type="metadata" name="dbkey">
                            <option type="from_param" name="reference_source.ref_file" param_attribute="dbkey"/>
                        </action>
                    </when>
                </conditional>
            </actions>
            <change_format>
                <when input="io_options.output_format" value="paf" format="paf"/>
                <when input="io_options.output_format" value="CRAM" format="cram"/>
            </change_format>
        </data>
    </outputs>
    <tests>
        <test>
            <!-- test single input -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
            <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
            <param name="analysis_type_selector" value="map-ont"/>
            <!-- min_occ_floor is a mapping option, so it is set once, inside
                 the section that defines it. -->
            <section name="mapping_options">
                <param name="min_occ_floor" value="1000"/>
            </section>
            <section name="alignment_options">
                <param name="A" value="2"/>
                <param name="B" value="8"/>
                <param name="O" value="12"/>
                <param name="O2" value="32"/>
                <param name="E" value="2"/>
                <param name="E2" value="1"/>
                <param name="z" value="400"/>
                <param name="s" value="40"/>
            </section>
            <output name="alignment_output" ftype="bam" file="winnowmap-test1-fasta.bam" lines_diff="2" >
                <metadata name="dbkey" value="?" />
            </output>
        </test>
        <test>
            <!-- CRAM output; the dbkey of the history reference must reach the output -->
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="bwa-mem-mt-genome.fa" ftype="fasta" dbkey="hg19"/>
            <param name="highfreq_kmers" value="repetitive_k15.txt" ftype="tabular"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" value="bwa-mem-fasta1.fa" ftype="fastqsanger"/>
            <param name="analysis_type_selector" value="map-ont"/>
            <param name="output_format" value="CRAM"/>
            <output name="alignment_output" file="winnowmap-test1-fasta.cram" ftype="cram" lines_diff="2">
                <metadata name="dbkey" value="hg19"/>
            </output>
        </test>
        <test>
            <!-- paired input given as two separate datasets -->
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="bwa-mem-mt-genome.fa" ftype="fasta"/>
            <param name="highfreq_kmers" value="repetitive_k15.txt" ftype="tabular"/>
            <param name="fastq_input_selector" value="paired"/>
            <param name="fastq_input1" value="bwa-mem-fastq1.fq" ftype="fastqsanger"/>
            <param name="fastq_input2" value="bwa-mem-fastq2.fq" ftype="fastqsanger"/>
            <param name="analysis_type_selector" value="map-ont"/>
            <output name="alignment_output" file="winnowmap-test1.bam" ftype="bam" lines_diff="2"/>
        </test>
        <test>
            <!-- paired input where only the first mate file is gzip-compressed -->
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="bwa-mem-mt-genome.fa" ftype="fasta"/>
            <param name="highfreq_kmers" value="repetitive_k15.txt" ftype="tabular"/>
            <param name="fastq_input_selector" value="paired"/>
            <param name="fastq_input1" value="bwa-mem-fastq1.fq.gz" ftype="fastqsanger.gz"/>
            <param name="fastq_input2" value="bwa-mem-fastq2.fq" ftype="fastqsanger"/>
            <param name="analysis_type_selector" value="map-ont"/>
            <output name="alignment_output" file="winnowmap-test1.bam" ftype="bam" lines_diff="2"/>
        </test>
        <test>
            <!-- paired input given as a paired collection; no preset profile is selected -->
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="bwa-mem-mt-genome.fa" ftype="fasta"/>
            <param name="highfreq_kmers" value="repetitive_k15.txt" ftype="tabular"/>
            <param name="fastq_input_selector" value="paired_collection"/>
            <param name="fastq_input1">
                <collection type="paired">
                    <element name="forward" value="bwa-mem-fastq1.fq"/>
                    <element name="reverse" value="bwa-mem-fastq2.fq"/>
                </collection>
            </param>
            <output name="alignment_output" file="winnowmap-test2.bam" ftype="bam" lines_diff="2"/>
        </test>
        <test>
            <!-- test data table reference -->
            <param name="reference_source_selector" value="cached" />
            <param name="ref_file" value="bwa-mem-mt-genome" dbkey="hg19"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
            <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
            <param name="analysis_type_selector" value="map-ont"/>
            <!-- min_occ_floor is defined in mapping_options; inside the
                 alignment_options section it matched no parameter. -->
            <section name="mapping_options">
                <param name="min_occ_floor" value="1000"/>
            </section>
            <section name="alignment_options">
                <param name="A" value="2"/>
                <param name="B" value="8"/>
                <param name="O" value="12"/>
                <param name="O2" value="32"/>
                <param name="E" value="2"/>
                <param name="E2" value="1"/>
                <param name="z" value="400"/>
                <param name="s" value="40"/>
            </section>
            <output name="alignment_output" ftype="bam" file="winnowmap-test1-fasta.bam" lines_diff="2" >
                <metadata name="dbkey" value="bwa-mem-mt-genome-dbkey" />
            </output>
        </test>
        <test>
            <!-- test alignment options -->
            <param name="reference_source_selector" value="cached" />
            <param name="ref_file" value="bwa-mem-mt-genome"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
            <param name="highfreq_kmers" ftype="tabular" value="repetitive_k15.txt"/>
            <param name="analysis_type_selector" value="map-ont"/>
            <!-- set the mapping option inside the section that defines it -->
            <section name="mapping_options">
                <param name="min_occ_floor" value="1000"/>
            </section>
            <section name="alignment_options">
                <!-- the following settings correspond to the defaults for "sr"
                mode. The purpose is to check that all alignment params get
                parsed correctly. -->
                <param name="A" value="2" />
                <param name="B" value="8" />
                <param name="O" value="12" />
                <param name="O2" value="32" />
                <param name="E" value="2" />
                <param name="E2" value="1" />
                <param name="z" value="400" />
                <param name="s" value="40" />
            </section>
            <output name="alignment_output" ftype="bam" file="winnowmap-test1-fasta.bam" lines_diff="2" />
            <assert_command>
                <has_text text="-A 2"/>
                <has_text text="-B 8"/>
                <has_text text="-O 12,32"/>
                <has_text text="-E 2,1"/>
                <has_text text="-z 400"/>
                <has_text text="-s 40"/>
            </assert_command>
        </test>
        <test>
            <!-- PAF output; the same FASTQ dataset serves as reference and as reads -->
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="mini_reads.fq" ftype="fastqsanger"/>
            <param name="highfreq_kmers" value="repetitive_k15.txt" ftype="tabular"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" value="mini_reads.fq" ftype="fastqsanger"/>
            <param name="analysis_type_selector" value="map-ont"/>
            <param name="output_format" value="paf"/>
            <output name="alignment_output" file="mini_reads.paf" ftype="paf"/>
        </test>
        <test>
            <!-- self-homology profile: the same FASTA is used as reference and query,
                 and no high frequency k-mer dataset is supplied -->
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="winnowmap-self-homology.fasta" ftype="fasta"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" value="winnowmap-self-homology.fasta" ftype="fasta"/>
            <param name="analysis_type_selector" value="self-homology"/>
            <output name="alignment_output" file="winnowmap-self-homology.bam" ftype="bam" lines_diff="2"/>
        </test>
        <test>
            <!-- sv_off switched on; compared against its own expected BAM -->
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="bwa-mem-mt-genome.fa" ftype="fasta"/>
            <param name="highfreq_kmers" value="repetitive_k15.txt" ftype="tabular"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" value="bwa-mem-fasta1.fa" ftype="fastqsanger"/>
            <param name="analysis_type_selector" value="map-ont"/>
            <param name="sv_off" value="True"/>
            <output name="alignment_output" file="winnowmap-test2-fasta.bam" ftype="bam" lines_diff="2"/>
        </test>
    </tests>
    <help>

Users’ Guide
------------

Winnowmap is a long-read mapping algorithm optimized for mapping ONT and
PacBio reads to repetitive reference sequences. Winnowmap development began
on top of minimap2 codebase, and since then we have incorporated the
following two ideas to improve mapping accuracy within repeats.

Winnowmap implements a novel weighted minimizer sampling algorithm (>=v1.0).
This optimization was motivated by the need to avoid masking of frequently
occurring k-mers during the seeding stage in an efficient manner, and achieve
better mapping accuracy in complex repeats (e.g., long tandem repeats) of
the human genome. Using weighted minimizers, Winnowmap down-weights
frequently occurring k-mers, thus reducing their chance of getting selected
as minimizers. Users can refer to this paper for more details. This idea is
helpful to preserve the theoretical guarantee of minimizer sampling technique,
i.e., if two sequences share a substring of a specified length, then they
must be guaranteed to have a matching minimizer.

We noticed that the highest scoring alignment doesn't necessarily correspond
to correct placement of reads in repetitive regions of T2T human chromosomes.
In the presence of a non-reference allele within a repeat, a read sampled
from that region could be mapped to an incorrect repeat copy because the
standard pairwise sequence alignment scoring system penalizes true variants.
This is also sometimes referred to as allelic bias. To address this bias,
we introduced and implemented an idea of using minimal confidently alignable
substrings (>=v2.0). These are minimal-length substrings in a read that align
end-to-end to a reference with mapping quality score above a user-specified
threshold. This approach treats each read mapping as a collection of
confident sub-alignments, which is more tolerant of structural variation
and more sensitive to paralog-specific variants (PSVs). Our most recent
paper describes this concept and benchmarking results.

General usage
~~~~~~~~~~~~~

For either mapping long reads or computing whole-genome alignments, Winnowmap
requires pre-computing high frequency k-mers (e.g., top 0.02% most frequent)
in a reference. Winnowmap uses the meryl k-mer counting tool for this purpose.

Mapping ONT or PacBio-hifi WGS reads


.. code::

        meryl count k=15 output merylDB ref.fa
        meryl print greater-than distinct=0.9998 merylDB > repetitive_k15.txt


.. code::

        winnowmap -W repetitive_k15.txt -ax map-ont ref.fa ont.fq.gz > output.sam  [OR]
        winnowmap -W repetitive_k15.txt -ax map-pb ref.fa hifi.fq.gz > output.sam

Mapping genome assemblies


.. code::

        meryl count k=19 output merylDB asm1.fa
        meryl print greater-than distinct=0.9998 merylDB > repetitive_k19.txt


.. code::

        winnowmap -W repetitive_k19.txt -ax asm20 asm1.fa asm2.fa > output.sam

For the genome-to-genome use case, it may be useful to visualize the dot plot. This perl script can be used to generate a dot plot from paf-formatted output. In both usage cases, pre-computing repetitive k-mers using meryl is quite fast, e.g., it typically takes 2-3 minutes for the human genome reference.

Use cases
~~~~~~~~~

Winnowmap uses the same base algorithm for all applications. However, due
to the different data types it supports, Winnowmap needs to be tuned for
optimal performance and accuracy. It is usually recommended to choose a
preset with option **-x**, which sets multiple parameters at the same
time. The default setting is the same as ``map-ont``.

Map long noisy genomic reads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The difference between ``map-pb`` and ``map-ont`` is that ``map-pb``
uses homopolymer-compressed (HPC) minimizers as seeds, while ``map-ont``
uses ordinary minimizers as seeds. Empirical evaluation suggests HPC
minimizers improve performance and sensitivity when aligning PacBio
reads, but hurt when aligning Nanopore reads.

Map long mRNA/cDNA reads
^^^^^^^^^^^^^^^^^^^^^^^^

There are different long-read RNA-seq technologies, including
traditional full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq
and Direct RNA-seq. They produce data of varying quality and properties.
By default, ``-x splice`` assumes the read orientation relative to the
transcript strand is unknown. It tries two rounds of alignment to infer
the orientation and write the strand to the ``ts`` SAM/PAF tag if
possible. For Iso-seq, Direct RNA-seq and traditional full-length
cDNAs, it is desirable to apply ``-u f`` to force Winnowmap to
consider the forward transcript strand only. This speeds up alignment
with slight improvement to accuracy. For noisy Nanopore Direct RNA-seq
reads, it is recommended to use a smaller k-mer size for increased
sensitivity to the first or the last exons.

It is worth noting that by default ``-x splice`` prefers
GT[A/G]..[C/T]AG over GT[C/T]..[A/G]AG, and then over other splicing
signals. Considering one additional base improves the junction accuracy
for noisy reads, but reduces the accuracy when aligning against the
widely used SIRV control data. This is because SIRV does not honor the
evolutionarily conservative splicing signal. If you are studying SIRV,
you may apply ``--splice-flank=no`` to let Winnowmap only model GT..AG,
ignoring the additional base.


Map short accurate genomic reads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When two read files are specified, Winnowmap reads from each file in turn
and merges them into an interleaved stream internally. Two reads are
considered to be paired if they are adjacent in the input stream and
have the same name (with the ``/[0-9]`` suffix trimmed if present).
Single- and paired-end reads can be mixed.

Winnowmap does not work well with short spliced reads. There are many
capable RNA-seq mappers for short reads.

Full genome/assembly alignment
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For cross-species full-genome alignment, the scoring system needs to be
tuned according to the sequence divergence.

Self-homology map creation
^^^^^^^^^^^^^^^^^^^^^^^^^^

A self-homology map is created by mapping a genome (e.g. that of E. coli)
against itself. When this option is used the same FASTA file should
be used for reference and for the (single ended mode) query.

Advanced features
~~~~~~~~~~~~~~~~~

Working with >65535 CIGAR operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Due to a design flaw, BAM does not work with CIGAR strings with >65535
operations (SAM and CRAM work). However, for ultra-long nanopore reads
Winnowmap may align ~1% of read bases with long CIGARs beyond the
capability of BAM. If you convert such SAM/CRAM to BAM, Picard and
recent samtools will throw an error and abort. Older samtools and other
tools may create corrupted BAM.

To avoid this issue, you can add option ``-L`` at the Winnowmap command line.
This option moves a long CIGAR to the ``CG`` tag and leaves a fully clipped
CIGAR at the SAM CIGAR column. Current tools that don’t read CIGAR
(e.g. merging and sorting) still work with such BAM records; tools that read
CIGAR will effectively ignore these records. It has been decided that future
tools will seamlessly recognize long-cigar records generated by option ``-L``.

**TL;DR**: if you work with ultra-long reads and use tools that only
process BAM files, please add option ``-L``.

The cs optional tag
^^^^^^^^^^^^^^^^^^^

The ``cs`` SAM/PAF tag encodes bases at mismatches and INDELs. It
matches regular expression
``/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/``. Like CIGAR, ``cs``
consists of a series of operations. Each leading character specifies the
operation; the following sequence is the one involved in the operation.

The ``cs`` tag is enabled by command line option ``--cs``. The following
alignment, for example:

.. code::

    CGATCGATAAATAGAGTAG---GAATAGCA
    ||||||   ||||||||||   |||| |||
    CGATCG---AATAGAGTAGGTCGAATtGCA

is represented as ``:6-ata:10+gtc:4*at:3``, where ``:[0-9]+`` represents
an identical block, ``-ata`` represents a deletion, ``+gtc`` an insertion
and ``*at`` indicates reference base ``a`` is substituted with a query
base ``t``. It is similar to the ``MD`` SAM tag but is standalone and
easier to parse.

If ``--cs=long`` is used, the ``cs`` string also contains identical
sequences in the alignment. The above example will become
``=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA``. The long form of ``cs``
encodes both reference and query sequences in one string.

Benchmarking
~~~~~~~~~~~~

When comparing Winnowmap (v1.0) to minimap2 (v2.17-r954), we observed
a reduction in the mapping error-rate from 0.14% to 0.06% in the recently
finished human X chromosome, and from 3.6% to 0% within the highly repetitive
X centromere (3.1 Mbp). Winnowmap improves mapping accuracy within repeats
and achieves these results with sparser sampling, leading to better index
compression and competitive runtimes. By avoiding masking, we show that
Winnowmap maintains uniform minimizer density.

    </help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Tue, 13 Jul 2021 12:16:19 +0000
parents	1c070debf549
children