Mercurial > repos > iuc > purge_dups

<tool id="purge_dups" name="Purge overlaps" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
    <description>and haplotigs in an assembly based on read depth (purge_dups)</description>
    <macros>
        <!-- Upstream purge_dups version; used in the tool's version attribute and to pin the purge_dups package requirement. -->
        <token name="@TOOL_VERSION@">1.2.5</token>
        <!-- Wrapper revision, appended after "+galaxy" in the tool's version attribute. -->
        <token name="@VERSION_SUFFIX@">3</token>
        <!-- Histogram plot options (axis limits and title); the values are rendered by the @HIST_PLOT@ token. -->
        <xml name="trimmers">
            <section name="section_hist" title="Histogram plot options">
                <!-- Disabled input: <param name="cutoffs_his" type="data" optional="true" format="txt" label="Read depth cutoffs file" /> -->
                <!-- Y axis limits -->
                <param argument="--ymin" type="integer" min="0" optional="true" label="Specify a minimum for the Y axis"/>
                <param argument="--ymax" type="integer" optional="true" label="Specify a maximum for the Y axis"/>
                <!-- X axis limits -->
                <param argument="--xmin" type="integer" min="0" optional="true" label="Specify a minimum for the X axis"/>
                <param argument="--xmax" type="integer" optional="true" label="Specify a maximum for the X axis"/>
                <!-- Plot title -->
                <param argument="--title" type="text" label="Histogram title" value="Read depth histogram plot"/>
            </section>
        </xml>
        <!-- Command fragment that runs hist_plot.py on depth.stat with cutoffs.tsv and writes hist.png.
             The optional axis limits are compared against the empty string rather than tested for
             truthiness, so that an explicit 0 (permitted for both axis minimums) is still passed on. -->
        <token name="@HIST_PLOT@"><![CDATA[
            python '$__tool_directory__/hist_plot.py'
            --cutoffs cutoffs.tsv
            #if str($function_select.section_hist.ymin) != ''
                --ymin $function_select.section_hist.ymin
            #end if
            #if str($function_select.section_hist.ymax) != ''
                --ymax $function_select.section_hist.ymax
            #end if
            #if str($function_select.section_hist.xmin) != ''
                --xmin $function_select.section_hist.xmin
            #end if
            #if str($function_select.section_hist.xmax) != ''
                --xmax $function_select.section_hist.xmax
            #end if
            #if $function_select.section_hist.title
                --title '${function_select.section_hist.title}'
            #end if
            depth.stat hist.png
        ]]></token>
        <!-- Command fragment for calcuts; the caller appends the stat file and the redirections.
             Optional numeric values are compared against the empty string rather than tested for
             truthiness, so that an explicit 0 is still passed on. The ploidy select value already
             contains its flag and is emitted verbatim. -->
        <token name="@CALCUTS@"><![CDATA[
            calcuts
            #if str($function_select.section_calcuts.min_depth) != '':
                -f $function_select.section_calcuts.min_depth
            #end if
            #if str($function_select.section_calcuts.low_depth) != '':
                -l $function_select.section_calcuts.low_depth
            #end if
            #if str($function_select.section_calcuts.transition) != '':
                -m $function_select.section_calcuts.transition
            #end if
            #if str($function_select.section_calcuts.upper_depth) != '':
                -u $function_select.section_calcuts.upper_depth
            #end if
            $function_select.section_calcuts.ploidy
        ]]></token>
        <!-- Shared "Calcuts options" section; its values are rendered by the @CALCUTS@ token. -->
        <xml name="calcuts">
            <section name="section_calcuts" title="Calcuts options">
                <param name="min_depth" type="float" label="Minimum depth count fraction to maximum depth count" min="0" max="1" argument="-f" optional="true" help="Default = 0.1"/>
                <param name="low_depth" label="Lower bound for read depth" type="integer" argument="-l" optional="true"/>
                <param name="transition" label="Transition between haploid and diploid" type="integer" argument="-m" optional="true"/>
                <param name="upper_depth" label="Upper bound for read depth" type="integer" argument="-u" optional="true"/>
                <!-- Each option value carries the complete flag because the token emits the value verbatim. -->
                <param name="ploidy" argument="-d" type="select" label="Ploidy">
                    <option value="-d 0" selected="true">Diploid [0]</option>
                    <option value="-d 1">Haploid [1]</option>
                </param>
            </section>
        </xml>
    </macros>
    <!-- Packages required by the commands this tool runs. -->
    <requirements>
        <!-- Version follows the @TOOL_VERSION@ token. -->
        <requirement type="package" version="@TOOL_VERSION@">purge_dups</requirement>
        <!-- NOTE(review): presumably needed by hist_plot.py, which the command runs to draw the histogram; confirm. -->
        <requirement type="package" version="3.4.2">matplotlib-base</requirement>
    </requirements>
    <!-- Cheetah template that assembles the command line for the selected function.
         For purge_dups and pbcstat every PAF input is staged as N.gz in the working directory:
         plain PAF is gzipped, anything else is symlinked.
         Optional numeric parameters are compared against the empty string instead of being tested
         for truthiness, so that an explicit 0 is still passed to the program.
         split_fa receives the boolean's own value, which already is the flag, so the flag is
         emitted once instead of twice. -->
    <command detect_errors="exit_code"><![CDATA[
        #if $function_select.functions == 'purge_dups':
            #for $i, $file in enumerate($function_select.input):
                #if $file.is_of_type("paf"):
                    gzip -c '${file}' > '${i}.gz' &&
                #else
                    ln -s '${file}' '${i}.gz' &&
                #end if
            #end for
            purge_dups
            #if $function_select.coverage:
                -c '${function_select.coverage}'
            #end if
            #if $function_select.cutoffs:
                -T '${function_select.cutoffs}'
            #end if
            #if str($function_select.min_bad) != '':
                -f $function_select.min_bad
            #end if
            #if str($function_select.min_align) != '':
                -a $function_select.min_align
            #end if
            #if str($function_select.min_match) != '':
                -b $function_select.min_match
            #end if
            #if str($function_select.min_chain) != '':
                -m $function_select.min_chain
            #end if
            #if str($function_select.max_gap) != '':
                -M $function_select.max_gap
            #end if
            #if $function_select.double_chain.chaining_rounds == 'two':
                -2
                #if str($function_select.double_chain.max_gap_2) != '':
                    -G $function_select.double_chain.max_gap_2
                #end if
            #end if
            #if str($function_select.min_chain_score) != '':
                -l $function_select.min_chain_score
            #end if
            #if str($function_select.max_extend) != '':
                -E $function_select.max_extend
            #end if
            #for $i, $file in enumerate($function_select.input):
                '${i}.gz'
            #end for
            > dups.bed 2> purge_dups.log
        #else if $function_select.functions == 'split_fa':
            split_fa
            $function_select.split
            '${function_select.input}' > split.fasta
        #else if $function_select.functions == 'pbcstat':
            #for $i, $file in enumerate($function_select.input):
                #if $file.is_of_type('paf'):
                    gzip -c '${file}' > '${i}.gz' &&
                #else
                    ln -s '${file}' '${i}.gz' &&
                #end if
            #end for
            pbcstat
            #if str($function_select.max_cov) != '':
                -M $function_select.max_cov
            #end if
            #if str($function_select.min_map_ratio) != '':
                -f $function_select.min_map_ratio
            #end if
            #if str($function_select.min_map_qual) != '':
                -q $function_select.min_map_qual
            #end if
            #if str($function_select.flank) != '':
                -l $function_select.flank
            #end if
            $function_select.primary_alignments

            #for $i, $file in enumerate($function_select.input):
                '${i}.gz'
            #end for
            && mv PB.stat depth.stat
            && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log
            && @HIST_PLOT@

        #else if $function_select.functions == 'ngscstat':
            ngscstat
            #if str($function_select.min_align_qual) != '':
                -q $function_select.min_align_qual
            #end if
    ##        #if $function_select.max_depth:
    ##            -M $function_select.max_depth
    ##        #end if
            #if str($function_select.max_insert) != '':
                -L $function_select.max_insert
            #end if
            '${function_select.input}'
            && mv TX.stat depth.stat
            && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log
            && @HIST_PLOT@

        #else if $function_select.functions == 'calcuts':
            @CALCUTS@ '${function_select.input}' > cutoffs.tsv 2>calcuts.log

        #else if $function_select.functions == 'get_seqs':
            get_seqs
            $function_select.coverage
            $function_select.haplotigs
            $function_select.end_trim
            $function_select.split
            #if str($function_select.length) != '':
                -l $function_select.length
            #end if
            #if str($function_select.min_ratio) != '':
                -m $function_select.min_ratio
            #end if
            #if str($function_select.min_gap) != '':
                -g $function_select.min_gap
            #end if
            '${function_select.bed_input}' '${function_select.fasta_input}'
        #end if
    ]]></command>
    <inputs>
        <conditional name="function_select">
            <!-- Chooses which purge_dups utility runs; the value selects the matching "when" block, the command branch and the output filters. -->
            <param type="select" name="functions" label="Select the purge_dups function">
                <option value="purge_dups">Purge haplotigs and overlaps for an assembly (purge_dups)</option>
                <option value="split_fa">Split FASTA file by 'N's (split_fa)</option>
                <option value="pbcstat">Calculate coverage cutoff and create read depth histogram and base-level read depth for PacBio data (calcuts+pbcstat)</option>
                <option value="ngscstat">Calculate coverage cutoff and create read depth histogram and base-level read depth for Illumina data (calcuts+ngscstat)</option>
                <option value="calcuts">Calculate coverage cutoffs (calcuts)</option>
                <option value="get_seqs">Obtain sequences after purging (get_seqs)</option>
            </param>
            <!-- Inputs for purge_dups: PAF alignments, optional coverage and cutoff tables, and chaining thresholds. -->
            <when value="purge_dups">
                <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/>
                <param name="coverage" argument="-c" type="data" format="tabular" optional="true" label="Base-level coverage file"/>
                <param name="cutoffs" argument="-T" type="data" format="tabular" optional="true" label="Cutoffs file"/>
                <param name="min_bad" argument="-f" type="float" min="0" max="1" optional="true" label="Minimum fraction of haploid/diploid/bad/repetitive bases in a sequence" help="Default = 0.8"/>
                <param name="min_align" argument="-a" type="integer" optional="true" label="Minimum alignment score"/>
                <param name="min_match" argument="-b" type="integer" optional="true" label="Minimum max match score"/>
                <param name="min_chain" argument="-m" type="integer" optional="true" label="Minimum matching bases for chaining"/>
                <param name="max_gap" argument="-M" type="integer" optional="true" label="Maximum gap size for chaining"/>
                <!-- The second-round gap size is only offered when two chaining rounds are selected. -->
                <conditional name="double_chain">
                    <param name="chaining_rounds" type="select" label="Rounds of chaining">
                        <option value="one">1 round</option>
                        <option value="two">2 rounds</option>
                    </param>
                    <when value="one"/>
                    <when value="two">
                        <param name="max_gap_2" argument="-G" type="integer" optional="true" label="Maximum gap size for second round of chaining"/>
                    </when>
                </conditional>
                <param name="min_chain_score" argument="-l" type="integer" optional="true" label="Minimum chaining score for a match"/>
                <param name="max_extend" argument="-E" type="integer" optional="true" label="Maximum extension for contig ends"/>
            </when>
            <!-- Inputs for split_fa. Both labels previously read "Base-level coverage file", copied from another parameter. -->
            <when value="split_fa">
                <param name="input" type="data" format="fasta" label="FASTA input file"/>
                <!-- NOTE(review): the meaning of split_fa's -n flag is not visible in this file; replace this neutral label with the upstream description once confirmed. -->
                <param name="split" argument="-n" type="boolean" truevalue="-n" falsevalue="" checked="false" label="Enable the -n flag of split_fa"/>
            </when>
            <!-- Inputs for pbcstat, followed by the shared calcuts and histogram sections used by the same command branch. -->
            <when value="pbcstat">
                <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/>
                <param name="max_cov" argument="-M" type="integer" optional="true" label="Maximum coverage"/>
                <param name="min_map_ratio" argument="-f" type="float" min="0" max="1" value="0" label="Minimum mapping length ratio"/>
                <param name="min_map_qual" argument="-q" type="integer" optional="true" label="Minimum mapping quality"/>
                <param name="flank" argument="-l" type="integer" optional="true" label="Flanking space"/>
                <param name="primary_alignments" argument="-p" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Use only primary alignments"/>
                <expand macro="calcuts"/>
                <expand macro="trimmers"/>
            </when>
            <!-- Inputs for ngscstat, followed by the shared calcuts and histogram sections used by the same command branch. -->
            <when value="ngscstat">
                <param name="input" type="data" format="bam" label="BAM input file"/>
                <param name="min_align_qual" argument="-q" type="integer" optional="true" label="Minimum alignment quality"/>
                <!-- Disabled: the parameter exists in the help text, but isn't actually part of the code. Maybe in the next release?
                     <param name="max_depth" type="integer" label="Maximum read depth" argument="-M" optional="true"/> -->
                <param name="max_insert" argument="-L" type="integer" optional="true" label="Maximum insert size"/>
                <expand macro="calcuts"/>
                <expand macro="trimmers"/>
            </when>

            <!-- Standalone calcuts: a stat file plus the shared calcuts section. -->
            <when value="calcuts">
                <param name="input" type="data" format="tabular" label="STAT input file"/>
                <expand macro="calcuts"/>
            </when>

            <!-- Inputs for get_seqs: the assembly FASTA, the BED file of duplications, and filtering flags.
                 Each boolean's truevalue is the flag itself, which the command emits verbatim. -->
            <when value="get_seqs">
                <param name="fasta_input" type="data" format="fasta" label="Fasta input file"/>
                <param name="bed_input" type="data" format="bed" label="Bed input file"/>
                <param name="coverage" type="boolean" argument="-c" truevalue="-c" falsevalue="" checked="false" label="Keep high coverage contigs in the primary contig set"/>
                <param name="haplotigs" type="boolean" argument="-a" truevalue="-a" falsevalue="" checked="false" label="Do not add prefix to haplotigs"/>
                <param name="length" type="integer" argument="-l" optional="true" label="Minimum primary contig length" help="Default: 1000"/>
                <param name="min_ratio" type="float" min="0" max="1" argument="-m" optional="true" label="Minimum ratio of remaining primary contig length to the original contig length"/>
                <param name="end_trim" type="boolean" argument="-e" truevalue="-e" falsevalue="" checked="true" label="Trim end sequences" help="Only remove sequences at the end of haplotigs. If you also want to remove the duplications in the middle, set to false; however, that may delete false positive duplications."/>
                <param name="split" type="boolean" argument="-s" truevalue="-s" falsevalue="" checked="false" label="Split contigs"/>
                <param name="min_gap" type="integer" argument="-g" optional="true" help="default=10k" label="Minimum gap size between duplications" />
            </when>
        </conditional>
    </inputs>
    <!-- Every output is collected from a fixed file name in the working directory and is only
         created when its filter matches the selected function. -->
    <outputs>
        <!-- get_seqs: haplotype and purged sequences -->
        <data name="get_seqs_hap" format="fasta" from_work_dir="hap.fa" label="${tool.name} on ${on_string}: get seqs haplotype fasta">
            <filter>function_select['functions'] == 'get_seqs'</filter>
        </data>
        <data name="get_seqs_purged" format="fasta" from_work_dir="purged.fa" label="${tool.name} on ${on_string}: get seqs purged fasta">
            <filter>function_select['functions'] == 'get_seqs'</filter>
        </data>

        <!-- split_fa -->
        <data name="split_fasta" format="fasta" from_work_dir="split.fasta" label="${tool.name} on ${on_string}: split fasta">
            <filter>function_select['functions'] == 'split_fa'</filter>
        </data>

        <!-- ngscstat: base coverage -->
        <data name="ngscstat_cov" format="tabular" from_work_dir="TX.base.cov" label="${tool.name} on ${on_string}: ngscstat base coverage file">
            <filter>function_select['functions'] == 'ngscstat'</filter>
        </data>

        <!-- Stat file shared by ngscstat and pbcstat; the command renames it to depth.stat -->
        <data name="stat_file" format="tabular" from_work_dir="depth.stat" label="${tool.name} on ${on_string}: stat file">
            <filter>function_select['functions'] in ('ngscstat', 'pbcstat')</filter>
        </data>

        <!-- pbcstat: base coverage and wig track -->
        <data name="pbcstat_cov" format="tabular" from_work_dir="PB.base.cov" label="${tool.name} on ${on_string}: pbcstat base coverage file">
            <filter>function_select['functions'] == 'pbcstat'</filter>
        </data>
        <data name="pbcstat_wig" format="wig" from_work_dir="PB.cov.wig" label="${tool.name} on ${on_string}: pbcstat base wig file">
            <filter>function_select['functions'] == 'pbcstat'</filter>
        </data>

        <!-- Histogram drawn after pbcstat or ngscstat -->
        <data name="hist" format="png" from_work_dir="hist.png" label="${tool.name} on ${on_string}: histogram plot">
            <filter>function_select['functions'] in ('pbcstat', 'ngscstat')</filter>
        </data>

        <!-- calcuts: log and cutoff table, also produced by the pbcstat and ngscstat branches -->
        <data name="calcuts_log" format="txt" from_work_dir="calcuts.log" label="${tool.name} on ${on_string}: calcuts log file">
            <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter>
        </data>
        <data name="calcuts_tab" format="tabular" from_work_dir="cutoffs.tsv" label="${tool.name} on ${on_string}: calcuts cutoff file">
            <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter>
        </data>

        <!-- purge_dups: log and BED file -->
        <data name="purge_dups_log" format="txt" from_work_dir="purge_dups.log" label="${tool.name} on ${on_string}: purge_dups log file">
            <filter>function_select['functions'] == 'purge_dups'</filter>
        </data>
        <data name="purge_dups_bed" format="bed" from_work_dir="dups.bed" label="${tool.name} on ${on_string}: purge_dups bed file">
            <filter>function_select['functions'] == 'purge_dups'</filter>
        </data>
    </outputs>
    <tests>
        <!-- purge_dups on a plain PAF file with every optional parameter set and two chaining rounds -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="purge_dups"/>
                <!-- data inputs -->
                <param name="input" value="test.paf"/>
                <param name="coverage" value="test.cov" ftype="tabular"/>
                <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/>
                <!-- thresholds -->
                <param name="min_bad" value="0.01"/>
                <param name="min_align" value="10"/>
                <param name="min_match" value="100"/>
                <param name="min_chain" value="1"/>
                <param name="min_chain_score" value="1"/>
                <param name="max_gap" value="1000"/>
                <param name="max_extend" value="100"/>
                <conditional name="double_chain">
                    <param name="chaining_rounds" value="two"/>
                    <param name="max_gap_2" value="1001"/>
                </conditional>
            </conditional>
            <output name="purge_dups_bed" value="purge_dups_out.bed"/>
        </test>
        <!-- purge_dups on a gzipped PAF file; same parameters and expected BED file as the plain PAF test -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="purge_dups"/>
                <!-- data inputs -->
                <param name="input" value="test.paf.gz" ftype="paf.gz"/>
                <param name="coverage" value="test.cov" ftype="tabular"/>
                <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/>
                <!-- thresholds -->
                <param name="min_bad" value="0.01"/>
                <param name="min_align" value="10"/>
                <param name="min_match" value="100"/>
                <param name="min_chain" value="1"/>
                <param name="min_chain_score" value="1"/>
                <param name="max_gap" value="1000"/>
                <param name="max_extend" value="100"/>
                <conditional name="double_chain">
                    <param name="chaining_rounds" value="two"/>
                    <param name="max_gap_2" value="1001"/>
                </conditional>
            </conditional>
            <output name="purge_dups_bed" value="purge_dups_out.bed"/>
        </test>
        <!-- purge_dups with two PAF inputs (one plain, one gzipped) and all other parameters left unset -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="purge_dups"/>
                <param name="input" value="test.paf,test2.paf.gz"/>
            </conditional>
            <output name="purge_dups_bed" value="purge_dups_out_2.bed"/>
        </test>
        <!-- split_fa with the -n flag enabled -->
        <test expect_num_outputs="1">
            <conditional name="function_select">
                <param name="functions" value="split_fa"/>
                <param name="split" value="-n"/>
                <param name="input" value="test.fasta"/>
            </conditional>
            <output name="split_fasta" value="split_out.fasta"/>
        </test>
        <!-- pbcstat on a plain PAF file: checks the cutoffs, coverage, wig, stat and histogram outputs -->
        <test expect_num_outputs="6">
            <conditional name="function_select">
                <param name="functions" value="pbcstat"/>
                <param name="input" value="test.paf"/>
                <!-- pbcstat options -->
                <param name="max_cov" value="1000"/>
                <param name="min_map_ratio" value="0.01"/>
                <param name="min_map_qual" value="1"/>
                <param name="flank" value="1"/>
                <param name="primary_alignments" value="-p"/>
                <!-- calcuts options -->
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
            </conditional>
            <output name="stat_file" value="pbcstats.tabular"/>
            <output name="calcuts_tab" value="calcuts_out.tsv"/>
            <output name="pbcstat_cov" value="out.cov"/>
            <output name="pbcstat_wig" value="out.wig"/>
            <output name="hist" value="hist.png" ftype="png" compare="sim_size"/>
        </test>
        <!-- pbcstat on a gzipped PAF file; same expected cutoffs, coverage and wig files as the plain PAF test -->
        <test expect_num_outputs="6">
            <conditional name="function_select">
                <param name="functions" value="pbcstat"/>
                <param name="input" value="test.paf.gz" ftype="paf.gz"/>
                <!-- pbcstat options -->
                <param name="max_cov" value="1000"/>
                <param name="min_map_ratio" value="0.01"/>
                <param name="min_map_qual" value="1"/>
                <param name="flank" value="1"/>
                <param name="primary_alignments" value="-p"/>
                <!-- calcuts options -->
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
            </conditional>
            <output name="calcuts_tab" value="calcuts_out.tsv"/>
            <output name="pbcstat_cov" value="out.cov"/>
            <output name="pbcstat_wig" value="out.wig"/>
        </test>
        <!-- pbcstat with two PAF inputs (one plain, one gzipped); pbcstat options are left unset -->
        <test expect_num_outputs="6">
            <conditional name="function_select">
                <param name="functions" value="pbcstat"/>
                <param name="input" value="test.paf,test2.paf.gz"/>
                <!-- calcuts options -->
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
            </conditional>
            <output name="stat_file" value="pbcstats2.tabular"/>
            <output name="calcuts_tab" value="calcuts_out.tsv"/>
            <output name="pbcstat_cov" value="out2.cov"/>
            <output name="pbcstat_wig" value="out2.wig"/>
        </test>
        <!-- ngscstat on a BAM file: checks the cutoffs, coverage, stat and histogram outputs -->
        <test expect_num_outputs="5">
            <conditional name="function_select">
                <param name="functions" value="ngscstat"/>
                <param name="input" value="test.bam"/>
                <!-- ngscstat options -->
                <param name="min_align_qual" value="10"/>
                <param name="max_insert" value="100"/>
                <!-- calcuts options -->
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
            </conditional>
            <output name="stat_file" value="tx_stats.tabular"/>
            <output name="calcuts_tab" value="calcuts_out.tsv"/>
            <output name="ngscstat_cov" value="ngsc_out.cov"/>
            <output name="hist" value="hist.png" ftype="png" compare="sim_size"/>
        </test>
        <!-- Standalone calcuts on a stat file -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="calcuts"/>
                <param name="input" value="test.stat"/>
                <!-- calcuts options -->
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
            </conditional>
            <output name="calcuts_tab" value="calcuts_out.tsv"/>
        </test>
        <!-- get_seqs with every flag enabled and every numeric option set; only the purged FASTA is compared -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="get_seqs"/>
                <!-- data inputs -->
                <param name="fasta_input" value="split_out.fasta"/>
                <param name="bed_input" value="dups.bed"/>
                <!-- boolean flags -->
                <param name="coverage" value="-c"/>
                <param name="haplotigs" value="-a"/>
                <param name="end_trim" value="-e"/>
                <param name="split" value="-s"/>
                <!-- numeric options -->
                <param name="length" value="10"/>
                <param name="min_ratio" value=".01"/>
                <param name="min_gap" value="100000"/>
            </conditional>
            <output name="get_seqs_purged" value="purged_out.fa"/>
        </test>
        <!-- pbcstat with explicit histogram axis maximums.
             The former "cutoffs_his" test parameter was removed: that input is commented out in the
             histogram section, so the tool has no parameter of that name. -->
        <test expect_num_outputs="6">
            <conditional name="function_select">
                <param name="functions" value="pbcstat"/>
                <param name="input" value="test.paf"/>
                <param name="max_cov" value="1000"/>
                <param name="min_map_ratio" value="0.01"/>
                <param name="min_map_qual" value="1"/>
                <param name="flank" value="1"/>
                <param name="primary_alignments" value="-p"/>
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
                <section name="section_hist">
                    <param name="ymax" value="100"/>
                    <param name="xmax" value="100"/>
                </section>
            </conditional>
            <output name="calcuts_tab" value="calcuts_out.tsv"/>
            <output name="pbcstat_cov" value="out_hist_options.cov"/>
            <output name="pbcstat_wig" value="out_hist_options.wig"/>
            <output name="stat_file" value="pbcstats_hist_options.tabular"/>
            <output name="hist" value="hist_options.png" ftype="png" compare="sim_size"/>
        </test>
    </tests>
    <help><![CDATA[
        .. class:: infomark

        **What it does**

        The purge_dups tools are designed to remove haplotigs and contig overlaps in a de novo assembly based on read depth.

    ]]></help>
        <!-- NOTE(review): presumably the DOI of the purge_dups publication; confirm. -->
        <citations>
        <citation type="doi">10.1093/bioinformatics/btaa025</citation>
    </citations>
</tool>
author	iuc
date	Mon, 14 Jun 2021 18:01:05 +0000
parents	17b378303f2d
children	a315c25dc813