Mercurial > repos > mvdbeek > dedup_hash

<tool id="dedup_hash" name="Deduplicate FASTQ files" version="0.1.1">
    <description>with fast and memory-efficient sequence hashes</description>
    <!--
        Tool dependencies: the smhasher package, pinned to version 0.150.1.
        NOTE(review): presumably this provides the hash function used by
        dedup_hash.py; confirm against the script.
    -->
    <requirements>
        <requirement type="package" version="0.150.1">smhasher</requirement>
    </requirements>
    <!--
        Command template (Cheetah). Runs the bundled dedup_hash.py and passes the
        input/output paths that belong to the selected read layout:
          se            : one input, one output
          pe_sep        : two separate inputs, two separate outputs
          pe_collection : forward/reverse members of the paired collection
        The boolean 'compress_fastq' is appended last; it renders either its
        true value (the write_gzip flag) or an empty string.
    -->
    <command><![CDATA[
    #set $layout = str($readtype.single_or_paired)
    python '$__tool_directory__/dedup_hash/dedup_hash.py'
    #if $layout == "se"
        --r1_in '${readtype.input_single}'
        --r1_out '${output_single}'
    #elif $layout == "pe_sep"
        --r1_in '${readtype.input_paired1}'
        --r2_in '${readtype.input_paired2}'
        --r1_out '${output_paired1}'
        --r2_out '${output_paired2}'
    #else
        --r1_in '${readtype.input_paired.forward}'
        --r2_in '${readtype.input_paired.reverse}'
        --r1_out '${output_paired_coll.forward}'
        --r2_out '${output_paired_coll.reverse}'
    #end if
    ${compress_fastq}
    ]]></command>
    <!--
        Inputs. The 'readtype' conditional selects the read layout and exposes only
        the dataset parameter(s) for that layout; the command template branches on
        the same 'single_or_paired' selector. The help hints name the command line
        options that the command template actually passes to dedup_hash.py.
    -->
    <inputs>
        <conditional name="readtype">
            <param name="single_or_paired" type="select" label="Single-end or paired-end reads?">
                <option value="se" selected="true">Single-end</option>
                <option value="pe_sep">Paired-end (two separate input files)</option>
                <option value="pe_collection">Paired-end (as collection)</option>
            </param>
            <when value="se">
                <param format="fastq,fastq.gz" name="input_single" type="data" label="Single-end FASTQ reads" help="(--r1_in)" />
            </when>
            <when value="pe_sep">
                <param format="fastq,fastq.gz" name="input_paired1" type="data" label="Paired-end forward strand FASTQ reads" help="(--r1_in)" />
                <param format="fastq,fastq.gz" name="input_paired2" type="data" label="Paired-end reverse strand FASTQ reads" help="(--r2_in)" />
            </when>
            <when value="pe_collection">
                <param name="input_paired" format="fastq,fastq.gz" type="data_collection" collection_type="paired" label="Paired-end FASTQ reads as paired collection" />
            </when>
        </conditional>
        <!-- Checked by default; renders the write_gzip flag when checked and an empty string otherwise. -->
        <param name="compress_fastq" type="boolean" checked="true" truevalue="--write_gzip" falsevalue="" label="Produce compressed fastq?"/>
    </inputs>
    <!--
        Outputs. Each <filter> keys on the 'single_or_paired' selector, so only the
        datasets for the chosen read layout are created. Every dataset defaults to
        fastq and is switched to fastq.gz when 'compress_fastq' renders its true
        value (the write_gzip flag).
    -->
    <outputs>
        <!-- Single-end mode: one output dataset. -->
        <data name="output_single" format="fastq" label="Single-end output of ${tool.name} on ${on_string}">
            <filter>readtype['single_or_paired'] == 'se'</filter>
            <change_format>
                <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
            </change_format>
        </data>
        <!-- Paired-end mode with separate files: one output dataset per mate. -->
        <data name="output_paired1" format="fastq" label="Paired-end forward strand output of ${tool.name} on ${on_string}">
            <filter>readtype['single_or_paired'] == 'pe_sep'</filter>
            <change_format>
                <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
            </change_format>
        </data>
        <data name="output_paired2" format="fastq" label="Paired-end reverse strand output of ${tool.name} on ${on_string}">
            <filter>readtype['single_or_paired'] == 'pe_sep'</filter>
            <change_format>
                <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
            </change_format>
        </data>
        <!--
            Paired collection mode. The forward and reverse members are declared
            explicitly, which fully defines the structure of the paired output, so
            no structured_like reference is used. (The former value,
            "readtype.pe_collection", named a select option rather than an input
            parameter.)
        -->
        <collection name="output_paired_coll" type="paired" label="Paired-end output of ${tool.name} on ${on_string}">
            <filter>readtype['single_or_paired'] == 'pe_collection'</filter>
            <data name="forward" format="fastq">
                <change_format>
                    <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
                </change_format>
            </data>
            <data name="reverse" format="fastq">
                <change_format>
                    <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
                </change_format>
            </data>
        </collection>
    </outputs>
    <!--
        Functional tests. Both cases exercise the 'pe_sep' layout.
        The boolean 'compress_fastq' is set with true/false rather than with its
        truevalue/falsevalue strings, which is the portable way to set boolean
        parameters in tool tests.
    -->
    <tests>
        <!-- Gzipped input, gzipped output; outputs are compared by size (sim_size). -->
        <test>
            <param name="single_or_paired" value="pe_sep"/>
            <param name="input_paired1" value="r1.fastq.gz" ftype="fastq.gz"/>
            <param name="input_paired2" value="r2.fastq.gz" ftype="fastq.gz"/>
            <param name="compress_fastq" value="true"/>
            <output name="output_paired1" file="r1_dedup.fastq.gz" ftype="fastq.gz" compare="sim_size"/>
            <output name="output_paired2" file="r2_dedup.fastq.gz" ftype="fastq.gz" compare="sim_size"/>
        </test>
        <!-- Plain input, plain output; outputs are compared with the default comparison. -->
        <test>
            <param name="single_or_paired" value="pe_sep"/>
            <param name="input_paired1" value="r1.fastq" ftype="fastq"/>
            <param name="input_paired2" value="r2.fastq" ftype="fastq"/>
            <param name="compress_fastq" value="false"/>
            <output name="output_paired1" file="r1_dedup.fastq" ftype="fastq"/>
            <output name="output_paired2" file="r2_dedup.fastq" ftype="fastq"/>
        </test>
    </tests>
    <help> <![CDATA[
**Deduplicate FASTQ files** is a fast and memory-efficient tool for removing duplicates from single-end or paired-end short DNA sequence reads in FASTQ format.
For paired-end data, it identifies duplicates by concatenating the sequences of a read pair and calculating a short hash that uniquely identifies the concatenated sequence.
Sequences that are not unique (i.e. a hash of the concatenated sequence has been seen previously) are discarded.

Compared to FastUniq, this tool requires only a fraction of the memory, but it does not identify pairs that are identical
except for a switch of R1 and R2. Such reads may nevertheless align to different places based on the seed-searching of the aligner,
so this may or may not be a problem for your application.

FastUniq consumed 76 GB of memory and took 4:01.52 on a typical dataset of 25 x 10^6 paired-end reads of 100 nt,
while this tool took 4.7 GB of memory and 3:23.27 for the same dataset.

Both tools produced exactly the same result, which suggests that, at least before quality and/or adapter trimming,
the previously mentioned limitation is of theoretical concern only.

     ]]> </help>
    <!-- A citation of type "doi" carries the bare DOI name, without a "doi:" prefix. -->
    <citations>
        <citation type="doi">10.1371/journal.pone.0052249</citation>
    </citations>
</tool>
author	mvdbeek
date	Wed, 23 Nov 2016 07:49:05 -0500
parents
children