<tool id="bbtools_bbnorm" name="BBTools: BBNorm" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Normalise sequencing coverage</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <expand macro="requirements"/>
    <stdio>
        <regex match="This table is (very|crazy|totally) full, which may reduce accuracy.  Ideal load is under" source="stderr" level="fatal_oom" description="Too low memory - generated results might be inaccurate."/>
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
## BBTools relies on a proper file extension to determine the input format,
## so we create appropriately named symlinks in the block below.
#if str($input_type_cond.input_type) == 'paired':
    #set read1 = $input_type_cond.reads_collection['forward']
    #set read2 = $input_type_cond.reads_collection['reverse']
#else:
    #set read1 = $input_type_cond.read1
    #if str($input_type_cond.input_type) == 'PE_2files':
        #set read2 = $input_type_cond.read2
    #end if
#end if

#if $read1.ext.endswith('.gz'):
    #set read1_file = 'forward.fastq.gz'
#else
    #set read1_file = 'forward.fastq'
#end if
ln -s '${read1}' '${read1_file}' &&

#if str($input_type_cond.input_type) in ['PE_2files', 'paired']:
    #if $read2.ext.endswith('.gz'):
        #set read2_file = 'reverse.fastq.gz'
    #else
        #set read2_file = 'reverse.fastq'
    #end if
    ln -s '${read2}' '${read2_file}' &&
#end if


## As the program relies on a count-min-sketch algorithm, potential hash collisions
## are ignored, which decreases the accuracy of the results. The allocated amount of
## memory therefore affects the numerical output: the more RAM, the more accurate the results.
## Check if a memory cap was set.
if [[ "\${_JAVA_OPTIONS}" != *-Xmx* && "\${JAVA_TOOL_OPTIONS}" != *-Xmx* ]]; then
    export _JAVA_OPTIONS="\${_JAVA_OPTIONS} -Xmx\${GALAXY_MEMORY_MB:-4096}m -Xms256m";
fi &&

bbnorm.sh tmpdir="\$TMPDIR" t="\${GALAXY_SLOTS:-2}"

#### Input parameters
#if str($input_type_cond.input_type) == 'single_end':
    in='${read1_file}'
    interleaved=f
#else:
    #if str($input_type_cond.input_type) == 'PE_1file':
        in='${read1_file}'
        interleaved=t
    #else:
        in1='${read1_file}'
        in2='${read2_file}'
        interleaved=f
    #end if
#end if

#### Output options
out=normalised_R1.fastq
#if str($input_type_cond.input_type) in ['PE_2files', 'paired']:
    out2=normalised_R2.fastq
#end if
#if $output_options.save_discarded_reads:
    outt=discarded.fastq
#end if
touppercase=t
#if $output_options.save_kmer_hists:
    hist=kmer_hist_input.tabular
    histout=kmer_hist_output.tabular
#end if

#### Hashing parameters
k=$hashing_params.k
bits=$hashing_params.bits
hashes=$hashing_params.hashes
#if str($hashing_params.prefilter.prefilter) == "true":
    prefilter=t
    prehashes=$hashing_params.prefilter.prehashes
    prefilterbits=$hashing_params.prefilter.prefilterbits
    prefiltersize=$hashing_params.prefilter.prefiltersize
#end if
buildpasses=$hashing_params.buildpasses
minq=$hashing_params.minq
minprob=$hashing_params.minprob
rdk=$hashing_params.rdk

#### Normalization parameters
fixspikes=$norm_params.fixspikes
target=$target
maxdepth=$norm_params.maxdepth
mindepth=$norm_params.mindepth
minkmers=$norm_params.minkmers
percentile=$norm_params.percentile
uselowerdepth=$norm_params.uselowerdepth
deterministic=$norm_params.deterministic
passes=$norm_params.passes

#### Error detection parameters
hdp=$error_det_params.hdp
ldp=$error_det_params.ldp
tossbadreads=$error_det_params.tossbadreads
requirebothbad=$error_det_params.requirebothbad
errordetectratio=$error_det_params.errordetectratio
highthresh=$error_det_params.highthresh
lowthresh=$error_det_params.lowthresh

#### Error correction parameters
#if str($error_corr_params.ecc.ecc) == "true":
    ecc=t
    ecclimit=$error_corr_params.ecc.ecclimit
    errorcorrectratio=$error_corr_params.ecc.errorcorrectratio
    echighthresh=$error_corr_params.ecc.echighthresh
    eclowthresh=$error_corr_params.ecc.eclowthresh
    eccmaxqual=$error_corr_params.ecc.eccmaxqual
    meo=$error_corr_params.ecc.meo
    mue=$error_corr_params.ecc.mue
    overlap=$error_corr_params.ecc.overlap
#end if
]]></command>
    <inputs>
        <conditional name="input_type_cond">
            <param name="input_type" type="select" label="Choose the category of inputs to be analyzed">
                <option value="single_end">Single-end reads</option>
                <option value="PE_1file">Paired-end reads as a single, interleaved dataset</option>
                <option value="PE_2files" selected="true">Paired-end reads as two separate datasets</option>
                <option value="paired">Paired-end reads as a collection</option>
            </param>
            <when value="single_end">
                <param name="read1" type="data" format="fastqsanger,fastqsanger.gz" label="Single-end data"/>
            </when>
            <when value="PE_1file">
                <param name="read1" type="data" format="fastqsanger,fastqsanger.gz" label="Interleaved paired-end data"/>
            </when>
            <when value="PE_2files">
                <param name="read1" type="data" format="fastqsanger,fastqsanger.gz" label="Forward reads"/>
                <param name="read2" type="data" format="fastqsanger,fastqsanger.gz" label="Reverse reads"/>
            </when>
            <when value="paired">
                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of forward and reverse reads"/>
            </when>
        </conditional>
        <param argument="target" type="integer" value="100" min="1" label="Target normalization depth" help="All depth parameters control kmer depth, not read depth. For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))"/>
        <section name="norm_params" title="Normalization parameters">
            <param argument="maxdepth" type="integer" value="-1" min="-1" label="Reads will not be downsampled when below this depth, even if they are above the target depth." help="All depth parameters control kmer depth, not read depth. For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))"/>
            <param argument="mindepth" type="integer" value="5" min="0" label="kmers with depth below this number will not be included when calculating the depth of a read." help="All depth parameters control kmer depth, not read depth. For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))"/>
            <param argument="minkmers" type="integer" value="15" min="0" label="Reads must have at least this many kmers over min depth to be retained."/>
            <param argument="percentile" type="integer" value="54" min="1" max="100" label="Percentile to infer read depth" help="Read depth is by default inferred from the 54th percentile of kmer depth, but this may be changed to any number 1-100."/>
            <param argument="uselowerdepth" type="boolean" checked="true" label="For pairs, use the depth of the lower read as the depth proxy."/>
            <param argument="deterministic" type="boolean" checked="true" label="Generate random numbers deterministically" help="This would ensure identical output between multiple runs. May decrease speed with a huge number of threads."/>
            <param argument="fixspikes" type="boolean" checked="false" label="Do a slower, high-precision bloom filter lookup of kmers that appear to have an abnormally high depth due to collisions."/>
            <param argument="passes" type="integer" value="2" label="Number of passes to perform" help=" pass is the basic mode. 2 passes allows greater accuracy, error detection, better contol of output depth."/>
        </section>
        <section name="hashing_params" title="Hashing parameters">
            <param argument="k" type="integer" value="31" min="1" label="kmer length" help="Values under 32 are most efficient, but arbitrarily high values are supported."/>
            <param argument="bits" type="select" label="Bits per cell in bloom filter" help="Maximum kmer depth recorded is 2^c bits. Large values decrease accuracy for a fixed amount of memory, so use the lowest number you can that will still capture highest-depth kmers.">
                <option value="2">2</option>
                <option value="4">4</option>
                <option value="8">8</option>
                <option value="16" selected="true">16</option>
                <option value="32">32</option>
            </param>
            <param argument="hashes" type="integer" value="3" min="1" label="Number of times each kmer is hashed and stored." help="Higher is slower. Higher is more accurate if there is enough memory, but less accurate if there is not enough memory."/>
            <conditional name="prefilter">
                <param argument="prefilter" type="select" label="Use a prefilter to eliminate low-depth kmers" help="True is slower, but generally more accurate; filters out low-depth kmers from the main hashtable. The prefilter is more memory-efficient because it uses 2-bit cells.">
                    <option value="true">Yes</option>
                    <option value="false" selected="true">No</option>
                </param>
                <when value="false"/>
                <when value="true">
                    <param argument="prehashes" type="integer" value="2" min="1" label="Number of hashes for the prefilter"/>
                    <param argument="prefilterbits" type="integer" value="2" min="1" label="Bits per cell in prefilter"/>
                    <param argument="prefiltersize" type="float" value="0.35" min="0" max="1" label="Fraction of memory to allocate for the prefilter."/>
                </when>
            </conditional>
            <param argument="buildpasses" type="integer" value="1" min="1" label="Number of passes" help="More passes can sometimes increase accuracy by iteratively removing low-depth kmers"/>
            <param argument="minq" type="integer" value="6" min="0" label="Ignore kmers containing bases with quality below this threshold"/>
            <param argument="minprob" type="float" value="0.5" min="0" max="1" label="Ignore kmers with overall probability of correctness below this threshold"/>
            <param argument="rdk" type="boolean" checked="true" label="Remove duplicate kmers" help="When true, a kmer's count will only be incremented once per read pair, even if that kmer occurs more than once."/>
        </section>
        <section name="error_det_params" title="Error detection parameters">
            <param argument="hdp" type="integer" value="90" min="0" max="100" label="highdepthpercentile" help="Position in sorted kmer depth array used as proxy of a read's high kmer depth."/>
            <param argument="ldp" type="integer" value="25" min="0" max="100" label="lowdepthpercentile" help="Position in sorted kmer depth array used as proxy of a read's low kmer depth."/>
            <param argument="tossbadreads" type="boolean" checked="false" label="Throw away reads detected as containing errors."/>
            <param argument="requirebothbad" type="boolean" checked="false" label="Only toss bad pairs if both reads are bad."/>
            <param argument="errordetectratio" type="integer" value="125" min="0" label="Error detection ratio" help="Reads with a ratio of at least this much between their high and low depth kmers will be classified as error reads."/>
            <param argument="highthresh" type="integer" value="12" min="0" label="Threshold for high kmer" help="A high kmer at this or above are considered non-error."/>
            <param argument="lowthresh" type="integer" value="3" min="0" label="Threshold for low kmer" help="Kmers at this and below are always considered errors."/>
        </section>
        
        <section name="error_corr_params" title="Error correction parameters">
            <conditional name="ecc">
                <param argument="ecc" type="select" label="What should be done with detected errors?" help="Tadpole is now preferred for error correction, as it does a better job.">
                    <option value="true" >Correct errors when possible</option>
                    <option value="false" selected="true">Do not attempt to correct errors</option>
                </param>
                <when value="false"/>
                <when value="true">
                    <param argument="ecclimit" type="integer" value="3" min="1" label="Correct up to this many errors per read." help="If more are detected, the read will remain unchanged."/>
                    <param argument="errorcorrectratio" type="integer" value="140" min="0" label="Depth ratio" help="Adjacent kmers with a depth ratio of at least this much between will be classified as an error."/>
                    <param argument="echighthresh" type="integer" value="22" min="0" label="Threshold for high kmer" help="A kmer at this or above may be considered non-error."/>
                    <param argument="eclowthresh" type="integer" value="2" min="0" label="Threshold for low kmer." help="kmers at this depth or below will be considered as errors."/>
                    <param argument="eccmaxqual" type="integer" value="127" min="0" label="Do not correct bases with quality above this value."/>
                    <param argument="meo" type="boolean" checked="false" label="Marks errors by reducing quality value of suspected errors; does not correct anything."/>
                    <param argument="mue" type="boolean" checked="true" label="Mark errors only on uncorrectable reads."/>
                    <param argument="overlap" type="boolean" checked="false" label="Correct errors by read overlap."/>
                </when>
            </conditional>
        </section>

        <section name="output_options" title="Output options">
            <param name="save_discarded_reads" type="boolean" checked="false" label="Save the reads that were eliminated from the input datasets to the history"/>
            <param name="save_kmer_hists" type="boolean" checked="false" label="Save the kmer histograms (in tabular format) for the input and output datasets to the history"/>
        </section>
    </inputs>
    <outputs>
        <data name="output_normalised_R1" format_source="read1" from_work_dir="normalised_R1.fastq" label="${tool.name} on ${on_string} (normalised R1 reads)">
            <filter>input_type_cond['input_type'] != 'paired'</filter>
        </data>
        <data name="output_normalised_R2" format_source="read2" from_work_dir="normalised_R2.fastq" label="${tool.name} on ${on_string} (normalised R2 reads)">
            <filter>input_type_cond['input_type'] == 'PE_2files'</filter>
        </data>
        <collection name="output_pair" type="paired" format_source="reads_collection" label="${tool.name} on ${on_string} (normalised reads)">
            <filter>input_type_cond['input_type'] == 'paired'</filter>
            <data name="forward" from_work_dir="normalised_R1.fastq" label="${tool.name} on ${on_string} (normalised R1 reads)"/>
            <data name="reverse" from_work_dir="normalised_R2.fastq" label="${tool.name} on ${on_string} (normalised R2 reads)"/>
        </collection>
        <data name="output_discarded" format="fastqsanger" from_work_dir="discarded.fastq" label="${tool.name} on ${on_string} (discarded reads)">
            <filter>output_options['save_discarded_reads'] is True</filter>
        </data>
        <data name="kmer_hist_input" format="tabular" from_work_dir="kmer_hist_input.tabular" label="${tool.name} on ${on_string} (kmer histogram input)">
            <filter>output_options['save_kmer_hists'] is True</filter>
        </data>
        <data name="kmer_hist_output" format="tabular" from_work_dir="kmer_hist_output.tabular" label="${tool.name} on ${on_string} (kmer histogram output)">
            <filter>output_options['save_kmer_hists'] is True</filter>
        </data>
    </outputs>
    <tests>
        <!-- Single end sequencing -->
        <test expect_num_outputs="1">
            <param name="input_type" value="single_end"/>
            <param name="read1" ftype="fastqsanger" value="bbnorm/input_R1.fastq"/>
            <param name="target" value="4"/>
            <section name="norm_params">
                <param name="deterministic" value="true"/>
                <param name="mindepth" value="0"/>
            </section>
            <output name="output_normalised_R1" ftype="fastqsanger" value="bbnorm/normalised_R1.fastq"/>
        </test>
        <!-- Single end sequencing, compressed -->
        <test expect_num_outputs="1">
            <param name="input_type" value="single_end"/>
            <param name="read1" ftype="fastqsanger.gz" value="bbnorm/input_R1.fastq.gz"/>
            <param name="target" value="4"/>
            <section name="norm_params">
                <param name="deterministic" value="true"/>
                <param name="mindepth" value="0"/>
            </section>
            <output name="output_normalised_R1" ftype="fastqsanger.gz" value="bbnorm/normalised_R1.fastq"/>
        </test>
        <!-- PE as an interleaved file -->
        <test expect_num_outputs="4">
            <param name="input_type" value="PE_1file"/>
            <param name="read1" ftype="fastqsanger" value="bbnorm/input_interleaved.fastq"/>
            <param name="target" value="4"/>
            <section name="norm_params">
                <param name="deterministic" value="true"/>
                <param name="mindepth" value="0"/>
            </section>
            <section name="output_options">
                <param name="save_discarded_reads" value="true"/>
                <param name="save_kmer_hists" value="true"/>
            </section>
            <output name="output_normalised_R1" ftype="fastqsanger" value="bbnorm/normalised_interleaved.fastq"/>
            <output name="output_discarded" ftype="fastqsanger" value="bbnorm/discarded.fastq"/>
            <output name="kmer_hist_input" ftype="tabular" file="bbnorm/kmer_hist_input.tabular"/>
            <output name="kmer_hist_output" ftype="tabular" file="bbnorm/kmer_hist_output.tabular"/>
        </test>
        <!-- PE as 2 files -->
        <test expect_num_outputs="2">
            <param name="input_type" value="PE_2files"/>
            <param name="read1" ftype="fastqsanger" value="bbnorm/input_R1.fastq"/>
            <param name="read2" ftype="fastqsanger" value="bbnorm/input_R2.fastq"/>
            <param name="target" value="4"/>
            <section name="norm_params">
                <param name="deterministic" value="true"/>
                <param name="mindepth" value="0"/>
            </section>
            <output name="output_normalised_R1" ftype="fastqsanger" value="bbnorm/normalised_R1.fastq"/>
            <output name="output_normalised_R2" ftype="fastqsanger" value="bbnorm/normalised_R2.fastq"/>
        </test>
        <!-- Paired end collection -->
        <test expect_num_outputs="3">
            <param name="input_type" value="paired"/>
            <param name="reads_collection">
                <collection type="paired">
                    <element name="forward" ftype="fastqsanger" value="bbnorm/input_R1.fastq"/>
                    <element name="reverse" ftype="fastqsanger" value="bbnorm/input_R2.fastq"/>
                </collection>
            </param>
            <param name="target" value="4"/>
            <section name="norm_params">
                <param name="deterministic" value="true"/>
                <param name="mindepth" value="0"/>
            </section>
            <output_collection name="output_pair" type="paired" count="2">
                <element name="forward" ftype="fastqsanger" value="bbnorm/normalised_R1.fastq"/>
                <element name="reverse" ftype="fastqsanger" value="bbnorm/normalised_R2.fastq"/>
            </output_collection>
        </test>
    </tests>
    <help>
**What it does**

BBNorm downsamples sequencing reads while accounting for heterogeneities in sequencing depth introduced by the wet-lab workflow. Reads from regions with low coverage are kept as-is, whereas some of the reads contributing to above-target coverage depth are subsampled. The resulting dataset is smaller, yet genome regions with low coverage levels remain represented. This yields a more uniform coverage depth across all genomic coordinates, so the computational resources needed for subsequent steps such as assembly can be reduced substantially without losing coverage anywhere.
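
Note that the target and all other depth parameters control kmer depth rather than read depth. As a worked example of the conversion formula quoted in the parameter help, using the default kmer length K=31 and the default target Dk=100 (a read length of R=150 is assumed purely for illustration), the corresponding read depth is::

    Dr = Dk * (R / (R - K + 1))
       = 100 * (150 / (150 - 31 + 1))
       = 100 * 1.25
       = 125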

-----

**If the target depth is 2X, a Martian genome sequencing dataset would be down-sampled as follows:**

input.fastq::

    @read_header_1
    AAAAATTTTTCCCCCGGGGGAAATTT
    +
    FFFFFFFFFFFFFFFEFFFFFF,FFE
    @read_header_2
    TTTTTCCCCCGGGGGAAATTTCCCGGG
    +
    FFFFFFFFFFFFFFFEFFFFFFEFFDD
    @read_header_3
    TTTTTCCCCCGGGGGAAATTTCCCGGG
    +
    FFFFFFFFFFFCEFFEFFFFFFEFFEE
    @read_header_4
    TTTTTCCCCCGGGGGAAATTTCCCGGG
    +
    FFFFFDDFFFFFFFFEFFFFFFEFFEF    
    @read_header_5
    TTTTTCCCCCGGGGGAAATTTCCCGGG
    +
    FFFFFEFFFFEEFFFEFFFFFFDFFFF
    @read_header_6
    AAAAATTTTTCCCCCGGGGGAAATTT
    +
    FFFFFFFFFFFFFFFEFFFFFFEFFD


output.fastq::

    @read_header_1
    AAAAATTTTTCCCCCGGGGGAAATTT
    +
    FFFFFFFFFFFFFFFEFFFFFF,FFE
    @read_header_2
    TTTTTCCCCCGGGGGAAATTTCCCGGG
    +
    FFFFFFFFFFFFFFFEFFFFFFEFFDD
    @read_header_3
    TTTTTCCCCCGGGGGAAATTTCCCGGG
    +
    FFFFFFFFFFFCEFFEFFFFFFEFFEE
    @read_header_6
    AAAAATTTTTCCCCCGGGGGAAATTT
    +
    FFFFFFFFFFFFFFFEFFFFFFEFFD
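
In this toy example, the sequence present in four copies (reads 2-5) is down-sampled to two copies, while the sequence already at 2X coverage (reads 1 and 6) is left untouched.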


**Indications**

BBNorm is mainly intended for short-read assembly pipelines. Subsampling the existing data can help when there is more data than needed, inflating computation time, or when the coverage distribution is highly skewed. As opposed to keeping a randomly selected subset of reads, such as retaining the first n reads, this is a weighted resampling that reduces coverage around coordinates of very high sequencing depth.
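
For orientation, here is a minimal sketch of the kind of bbnorm.sh command this wrapper builds for two paired-end files (the file names are illustrative; the actual invocation adds every parameter selected above)::

    bbnorm.sh in1=forward.fastq in2=reverse.fastq \
        out=normalised_R1.fastq out2=normalised_R2.fastq \
        target=100 mindepth=5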


**Contraindications**

* The data already has roughly uniform coverage that does not need to be normalised further.
* You do not have any excess data to discard: BBNorm does not increase data quantity by imputation or by repeatedly sampling with replacement.
* Your pipeline reports results that rely on quantification of abundance (e.g. differential expression profiling or ChIP-Seq).
* You want to do variant discovery. Reducing sequencing depth might bias significance levels, or even obscure the existence of rare variants altogether.
* The sequencing platform has a very high error rate (e.g. ONT) that might mislead this algorithm.

    </help>
    <expand macro="citations"/>
</tool>