Mercurial > repos > peterjc > sample_seqs

<tool id="sample_seqs" name="Sub-sample sequences files" version="0.2.1">
    <description>e.g. to reduce coverage</description>
    <!-- Dependencies: the "biopython" package (pinned to version 1.65) and the
         Python module "Bio" (presumably Biopython's importable module name;
         confirm against the wrapped script's imports). -->
    <requirements>
        <requirement type="package" version="1.65">biopython</requirement>
        <requirement type="python-module">Bio</requirement>
    </requirements>
    <!-- Reports the wrapped script's version by running sample_seqs.py with its
         version flag through the python interpreter. -->
    <version_command interpreter="python">sample_seqs.py --version</version_command>
    <!-- Command line template for sample_seqs.py. The input dataset's extension
         is passed with -f, and the input and output paths with -i and -o.
         Exactly one sampling option follows, chosen by the selected approach:
         -n for "everyNth", -p for "percentage", and -c in the else branch (the
         only remaining choice is "desired_count"). The interleaved flag is
         appended only when the "interleaved" checkbox is set. -->
    <command interpreter="python">
sample_seqs.py -f "$input_file.ext" -i "$input_file" -o "$output_file"
#if str($sampling.type) == "everyNth":
-n "${sampling.every_n}"
#elif str($sampling.type) == "percentage":
-p "${sampling.percent}"
#else
-c "${sampling.count}"
#end if
#if $interleaved
--interleaved
#end if
    </command>
    <stdio>
        <!-- Anything other than zero is an error: the first range covers exit
             codes of 1 and above, the second covers exit codes of -1 and below. -->
        <exit_code range="1:" />
        <exit_code range=":-1" />
    </stdio>
    <inputs>
        <!-- Sequence file to sub-sample; accepted datatypes are fasta, fastq and sff. -->
        <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file" help="FASTA, FASTQ, or SFF format." />
        <!-- Sampling approach. Each choice exposes exactly one numeric parameter,
             which the command template passes to the script. -->
        <conditional name="sampling">
            <param name="type" type="select" label="Sub-sampling approach">
                <option value="everyNth">Take every N-th sequence (or pair, e.g. every fifth sequence)</option>
                <option value="percentage">Take some percentage of the sequences (or pairs, e.g. 20% will take every fifth sequence)</option>
                <option value="desired_count">Take exactly N sequences (or pairs, e.g. 1000 sequences)</option>
                <!-- TODO - target coverage etc -->
            </param>
            <when value="everyNth">
                <param name="every_n" value="5" type="integer" min="2" label="N" help="At least 2, e.g. 5 will take every 5th sequence (taking 20% of the sequences)" />
            </when>
            <when value="percentage">
                <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" />
            </when>
            <when value="desired_count">
                <!-- Help text corrected: it previously read "number itotal n input file". -->
                <param name="count" value="1000" type="integer" min="1" label="N" help="Number of unique sequences to pick (between 1 and the total number in the input file)" />
            </when>
        </conditional>
        <!-- When ticked, records are sampled as pairs so read pairs stay together. -->
        <param name="interleaved" type="boolean" label="Interleaved paired reads" help="This mode keeps paired reads together (e.g. take every 5th read pair)" />
    </inputs>
    <outputs>
        <!-- The sub-sampled file takes both its datatype and its metadata from
             the input dataset. format_source names that input explicitly,
             replacing the legacy and ambiguous format="input". -->
        <data name="output_file" format_source="input_file" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/>
    </outputs>
    <tests>
        <!-- Protein FASTA, taking every 100th sequence. -->
        <test>
            <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
            <param name="type" value="everyNth" />
            <param name="every_n" value="100" />
            <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
        </test>
        <!-- FASTQ, taking every 100th sequence. -->
        <test>
            <param name="input_file" value="ecoli.fastq" />
            <param name="type" value="everyNth" />
            <param name="every_n" value="100" />
            <output name="output_file" file="ecoli.sample_N100.fastq" />
        </test>
        <!-- FASTQ, every 100th in interleaved (paired) mode. -->
        <test>
            <param name="input_file" value="ecoli.fastq" />
            <param name="type" value="everyNth" />
            <param name="every_n" value="100" />
            <param name="interleaved" value="true" />
            <output name="output_file" file="ecoli.pair_sample_N100.fastq" />
        </test>
        <!-- SFF, taking every 5th sequence. -->
        <test>
            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
            <param name="type" value="everyNth" />
            <param name="every_n" value="5" />
            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
        </test>
        <!-- Protein FASTA at 1 percent; the expected file is the same one used
             for the every-100th test above. -->
        <test>
            <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
            <param name="type" value="percentage" />
            <param name="percent" value="1.0" />
            <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
        </test>
        <!-- Protein FASTA, every 100th in interleaved (paired) mode. -->
        <test>
            <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
            <param name="type" value="everyNth" />
            <param name="every_n" value="100" />
            <param name="interleaved" value="true" />
            <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta" />
        </test>
        <!-- Protein FASTA with a desired count of 2910; the expected output is
             the input file itself. -->
        <test>
            <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
            <param name="type" value="desired_count" />
            <param name="count" value="2910" />
            <output name="output_file" file="get_orf_input.Suis_ORF.prot.fasta" />
        </test>
        <!-- Protein FASTA with a desired count of 10 in interleaved (paired) mode. -->
        <test>
            <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
            <param name="type" value="desired_count" />
            <param name="count" value="10" />
            <param name="interleaved" value="true" />
            <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_C10.fasta" />
        </test>
        <!-- FASTQ at 1 percent; the expected file is the same one used for the
             every-100th FASTQ test above. -->
        <test>
            <param name="input_file" value="ecoli.fastq" />
            <param name="type" value="percentage" />
            <param name="percent" value="1.0" />
            <output name="output_file" file="ecoli.sample_N100.fastq" />
        </test>
        <!-- FASTQ with a desired count of 10. -->
        <test>
            <param name="input_file" value="ecoli.fastq" />
            <param name="type" value="desired_count" />
            <param name="count" value="10" />
            <output name="output_file" file="ecoli.sample_C10.fastq" />
        </test>
        <!-- Feeds the expected output of the previous test back in with the same
             count of 10; the expected output is that same file. -->
        <test>
            <param name="input_file" value="ecoli.sample_C10.fastq" />
            <param name="type" value="desired_count" />
            <param name="count" value="10" />
            <output name="output_file" file="ecoli.sample_C10.fastq" />
        </test>
        <!-- SFF at 20 percent; the expected file is the same one used for the
             every-5th SFF test, and the stderr messages are checked too. -->
        <test>
            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
            <param name="type" value="percentage" />
            <param name="percent" value="20.0" />
            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
            <assert_stderr>
                <has_line line="Sampling 20.000% of sequences" />
                <has_line line="Selected 5 records" />
            </assert_stderr>
        </test>
        <!-- SFF, every 5th in interleaved (paired) mode; stderr should report
             3 selected pairs. -->
        <test>
            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
            <param name="type" value="everyNth" />
            <param name="every_n" value="5" />
            <param name="interleaved" value="true" />
            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff" ftype="sff"/>
            <assert_stderr>
                <has_line line="Sampling every 5th sequence" />
                <has_line line="Selected 3 pairs" />
            </assert_stderr>
        </test>
        <!-- SFF with a desired count of 25, which stderr reports as the number
             of sequences in the input; the expected output is the input file. -->
        <test>
            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
            <param name="type" value="desired_count" />
            <param name="count" value="25" />
            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff"/>
            <assert_stderr>
                <has_line line="Input file has 25 sequences" />
                <has_line line="Taking all the sequences" />
                <has_line line="Selected 25 records" />
            </assert_stderr>
        </test>
        <!-- SFF with a desired count of 1; stderr should report that only the
             first sequence is sampled. -->
        <test>
            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
            <param name="type" value="desired_count" />
            <param name="count" value="1" />
            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff" ftype="sff"/>
            <assert_stderr>
                <has_line line="Input file has 25 sequences" />
                <has_line line="Sampling just first sequence!" />
                <has_line line="Selected 1 records" />
            </assert_stderr>
        </test>
        <!-- Failure case: requesting 30 sequences from the 25-sequence SFF file
             must exit with code 1 and explain why on stderr. -->
        <test expect_failure="true" expect_exit_code="1">
            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
            <param name="type" value="desired_count" />
            <param name="count" value="30" />
            <assert_stderr>
                <has_line line="Input file has 25 sequences" />
                <has_line line="Requested 30 sequences, but file only has 25." />
            </assert_stderr>
        </test>
    </tests>
    <help>
**What it does**

Takes an input file of sequences (typically FASTA or FASTQ, but also
Standard Flowgram Format (SFF) is supported), and returns a new sequence
file sub-sampling uniformly from this (in the same format, preserving the
input order and selecting sequences evenly through the input file).

Several sampling modes are supported, all designed to do non-random
uniform sampling (i.e. evenly through the input file). This allows
reproducibility, and also works on paired sequence files (run the tool
twice, once on each file using the same settings).

By sampling uniformly (evenly) through the file, this avoids any bias
should reads in any part of the file be of lesser quality (e.g. for
high throughput sequencing the reads at the start and end of the file
can be of lower quality).

The simplest mode is to take every *N*-th sequence, for example taking
every 2nd sequence would sample half the file - while taking every 5th
sequence would take 20% of the file.

The target count method picks *N* sequences from the input file, which
again will be distributed uniformly (evenly) through the file. This works
by first counting the number of records, then calculating the desired
percentage of sequences to take. Note if your input file has exactly
*N* sequences this selects them all (effectively copying the input file).
If your input file has fewer than *N* sequences, this is treated as an
error.

If you tick the interleaved option, the file is processed as pairs of
records to ensure your read pairs are not separated by sampling.
For example using 20% would take every 5th pair of records, or you
could request 1000 read pairs.

.. class:: warningmark

Note interleaved/pair mode does *not* actually check that your read names
match a known pair naming scheme!

**Example Usage**

Suppose you have some Illumina paired end data as files ``R1.fastq`` and
``R2.fastq`` which give an estimated x200 coverage, and you wish to do a
*de novo* assembly with a tool like MIRA which recommends lower coverage.
Taking every 3rd read would reduce the estimated coverage to about x66,
and would preserve the pairing as well.

Similarly, if you had some Illumina paired end data interleaved into one
file with an estimated x200 coverage, you would run this tool in
interleaved mode, taking every 3rd read pair. This would again reduce
the estimated coverage to about x66, while preserving the read pairing.

Suppose you have a transcriptome assembly, and wish to look at the
species distribution of the top BLAST hits for an initial quality check.
Rather than using all your sequences, you could pick 1000 only for this.

**Citation**

This tool uses Biopython, so if you use this Galaxy tool in work leading to a
scientific publication please cite the following paper:

Cock et al (2009). Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

This tool is available to install into other Galaxy Instances via the Galaxy
Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs
    </help>
    <!-- Machine-readable citation: the DOI of the Biopython paper (Cock et al
         2009) that the help text asks users to cite. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btp163</citation>
    </citations>
</tool>
author	peterjc
date	Fri, 27 Mar 2015 09:34:27 -0400
parents	da64f6a9e32b
children	d3aa9f25c24c