Mercurial > repos > iuc > qiime_pick_otus

<tool id="qiime_pick_otus" name="Perform OTU picking" version="@WRAPPER_VERSION@.0" profile="@PROFILE@">
    <!-- Short text shown after the tool name; it names the wrapped script. -->
    <description> (pick_otus)</description>
    <!-- Macros expanded throughout this file are imported from macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Packages listed here are passed into the "requirements" macro from macros.xml. -->
    <expand macro="requirements">
        <requirement type="package" version="2.1b">sortmerna</requirement>
        <!-- mothur and blast-legacy are disabled, matching the mothur and blast options that are commented out of the method selector. -->
        <!--<requirement type="package" version="1.38.1.1">mothur</requirement>-->
        <!--<requirement type="package" version="2.2.22">blast-legacy</requirement>-->
        <!-- NOTE(review): cd-hit is still required although the cdhit option is commented out of the method selector; confirm it is needed. -->
        <requirement type="package" version="4.6.6">cd-hit</requirement>
    </expand>
    <!-- Command whose output is recorded as the version of the underlying script. -->
    <version_command>pick_otus.py --version</version_command>
    <command detect_errors="aggressive"><![CDATA[
## Run pick_otus.py; every result file is written into the pick_otus directory,
## which is where the outputs section collects them from.
pick_otus.py
    --input_seqs_filepath '$input_seqs_filepath'
    --output_dir pick_otus
    --otu_picking_method '$method.otu_picking_method'

    ## Method specific options. Boolean parameters expand either to their flag
    ## or to an empty string, so they are deliberately left unquoted.
    #if $method.otu_picking_method == "sortmerna"
        #if $method.references.source_selector == 'history'
            --refseqs_fp '$method.references.refseqs_fp'
        #else if $method.references.source_selector == 'cached'
            --refseqs_fp '$method.references.refseqs_fp.fields.path'
        #end if
        --sortmerna_e_value '$method.sortmerna_e_value'
        --sortmerna_coverage '$method.sortmerna_coverage'
        #if $method.sortmerna_tabular.test == 'yes'
            --sortmerna_tabular
            --sortmerna_best_N_alignments '$method.sortmerna_tabular.sortmerna_best_N_alignments'
        #end if
        --sortmerna_max_pos '$method.sortmerna_max_pos'
        --similarity '$method.similarity'
        $method.suppress_prefilter_exact_match
        --threads \${GALAXY_SLOTS:-2}
    #else if $method.otu_picking_method == "mothur"
        --clustering_algorithm '$method.clustering_algorithm'
    #else if $method.otu_picking_method == "trie"
        $method.trie_reverse_seqs
    #else if $method.otu_picking_method == "uclust_ref"
        #if $method.references.source_selector == 'history'
            --refseqs_fp '$method.references.refseqs_fp'
        #else if $method.references.source_selector == 'cached'
            --refseqs_fp '$method.references.refseqs_fp.fields.path'
        #end if
        --similarity '$method.similarity'
        $method.enable_rev_strand_match
        $method.suppress_presort_by_abundance_uclust
        $method.suppress_new_clusters
        --max_accepts '$method.max_accepts'
        --max_rejects '$method.max_rejects'
        --stepwords '$method.stepwords'
        --word_length '$method.word_length'
    #else if $method.otu_picking_method == "blast"
        #if $method.references.source_selector == 'history'
            --refseqs_fp '$method.references.refseqs_fp'
        #else if $method.references.source_selector == 'cached'
            --refseqs_fp '$method.references.refseqs_fp.fields.path'
        #end if
        --max_e_value_blast '$method.max_e_value_blast'
        --min_aligned_percent '$method.min_aligned_percent'
        --similarity '$method.similarity'
    #else if $method.otu_picking_method == "sumaclust"
        --similarity '$method.similarity'
        ## Unquoted on purpose: a quoted unchecked boolean would be passed to the
        ## script as an empty positional argument.
        $method.sumaclust_exact
        $method.sumaclust_l
        --denovo_otu_id_prefix '$method.denovo_otu_id_prefix'
        $method.suppress_prefilter_exact_match
        --threads \${GALAXY_SLOTS:-2}
    #else if $method.otu_picking_method == "swarm"
        --denovo_otu_id_prefix '$method.denovo_otu_id_prefix'
        --swarm_resolution '$method.swarm_resolution'
        --threads \${GALAXY_SLOTS:-2}
    #else if $method.otu_picking_method == "prefix_suffix"
        --prefix_length '$method.prefix_length'
        --suffix_length '$method.suffix_length'
    #else if $method.otu_picking_method == "cdhit"
        --similarity '$method.similarity'
    #else if $method.otu_picking_method == "uclust"
        --similarity '$method.similarity'
        --denovo_otu_id_prefix '$method.denovo_otu_id_prefix'
        $method.enable_rev_strand_match
        $method.suppress_presort_by_abundance_uclust
        $method.optimal_uclust
        $method.exact_uclust
        $method.user_sort
        --max_accepts '$method.max_accepts'
        --max_rejects '$method.max_rejects'
        --stepwords '$method.stepwords'
        --word_length '$method.word_length'
        $method.suppress_uclust_stable_sort
        $method.suppress_prefilter_exact_match
    #end if

    ## Options shared by every method. The prefilter length is optional and is
    ## only passed when the user supplied a value.
    #if str($prefix_prefilter_length) != ''
        --prefix_prefilter_length '$prefix_prefilter_length'
    #end if
    $trie_prefilter
    --non_chimeras_retention '$non_chimeras_retention'

## For sortmerna, remove sortmerna_otus.log so that a single file is left to
## match the *_otus.log pattern used by the log output.
#if $method.otu_picking_method == "sortmerna"
    &&
    rm pick_otus/sortmerna_otus.log
#end if
    ]]></command>
    <!-- Tool form: the input sequences, a method selector with per-method options, and options shared by all methods. -->
    <inputs>
        <param argument="--input_seqs_filepath" type="data" format="fasta" label="Input sequences file"/>
        <!-- The selected method decides which "when" block below is shown and which branch of the command template runs. -->
        <conditional name="method">
            <param argument="--otu_picking_method" type="select" label="Method for picking OTUs">
                <option value="sortmerna">sortmerna</option>
                <!--<option value="mothur">mothur (requires an input file of aligned sequences)</option>-->
                <option value="trie">trie</option>
                <option value="uclust_ref">uclust_ref</option>
                <!--<option value="blast">blast</option>-->
                <!--<option value="sumaclust">sumaclust</option>-->
                <option value="swarm">swarm</option>
                <option value="prefix_suffix">prefix_suffix</option>
                <!--<option value="cdhit">cdhit</option>-->
                <option value="uclust" selected="true">uclust</option>
            </param>
            <when value="sortmerna">
                <expand macro="pick_otus_reference_source"/>
                <param argument="--sortmerna_e_value" type="float" value="1" label="Max E-value when clustering"/>
                <param argument="--sortmerna_coverage" type="float" value="0.97" min="0" max="1" label="Minimum percent query coverage (of an alignment) to consider a hit"/>
                <conditional name="sortmerna_tabular">
                    <!-- Internal selector only (not a command-line option), so it is declared with "name" rather than "argument". -->
                    <param name="test" type="select" label="Output alignments in the Blast tabular format with two additional columns including the CIGAR string and the percent query coverage?">
                        <option value="yes">Yes</option>
                        <option value="no" selected="true">No</option>
                    </param>
                    <when value="yes">
                        <param argument="--sortmerna_best_N_alignments" type="integer" value="1" label="How many alignments per read will be written"/>
                    </when>
                    <when value="no"/>
                </conditional>
                <param argument="--sortmerna_max_pos" type="integer" value="10000" label="Maximum number of positions per seed to store in the indexed database"/>
                <expand macro="pick_otus_similarity"/>
                <expand macro="pick_otus_suppress_prefilter_exact_match"/>
            </when>
            <!-- Kept although the mothur option is commented out of the selector above. -->
            <when value="mothur">
                <param argument="--clustering_algorithm" type="select" label="Clustering algorithm">
                    <option value="furthest" selected="true">furthest</option>
                    <option value="nearest">nearest</option>
                    <option value="average">average</option>
                </param>
            </when>
            <when value="trie">
                <param argument="--trie_reverse_seqs" type="boolean" truevalue="--trie_reverse_seqs" falsevalue="" checked="false" label="Reverse seqs before picking OTUs for suffix (rather than prefix) collapsing?"/>
            </when>
            <when value="uclust_ref">
                <expand macro="pick_otus_reference_source"/>
                <expand macro="pick_otus_similarity"/>
                <expand macro="pick_otus_enable_rev_strand_match"/>
                <expand macro="pick_otus_suppress_presort_by_abundance_uclust"/>
                <param argument="--suppress_new_clusters" type="boolean" truevalue="--suppress_new_clusters" falsevalue="" checked="false" label="Suppress creation of new clusters using seqs that don't match reference?"/>
                <expand macro="pick_otus_max"/>
                <expand macro="pick_otus_stepwords"/>
                <expand macro="pick_otus_word_length"/>
            </when>
            <!-- Kept although the blast option is commented out of the selector above. -->
            <when value="blast">
                <expand macro="pick_otus_reference_source"/>
                <expand macro="pick_otus_similarity"/>
                <param argument="--max_e_value_blast" type="float" value="1e-10" label="Max E-value when clustering"/>
                <param argument="--min_aligned_percent" type="float" value="0.5" min="0" max="1" label="Minimum percent of query sequence that can be aligned to consider a hit"/>
            </when>
            <!-- Kept although the sumaclust option is commented out of the selector above. -->
            <when value="sumaclust">
                <expand macro="pick_otus_similarity"/>
                <param argument="--sumaclust_exact" type="boolean" truevalue="--sumaclust_exact" falsevalue="" checked="false" label="Assign a sequence to the best matching seed rather than the first matching seed passing the similarity threshold?"/>
                <param argument="--sumaclust_l" type="boolean" truevalue="--sumaclust_l" falsevalue="" checked="true" label="Reference sequence length if the shortest?"/>
                <expand macro="pick_otus_denovo_otu_id_prefix"/>
                <expand macro="pick_otus_suppress_prefilter_exact_match"/>
            </when>
            <when value="swarm">
                <expand macro="pick_otus_denovo_otu_id_prefix"/>
                <param argument="--swarm_resolution" type="integer" value="1" label="Maximum number of differences allowed between two amplicons" help="Two amplicons will be grouped if they have integer (or less) differences"/>
            </when>
            <when value="prefix_suffix">
                <param argument="--prefix_length" type="integer" value="50" label="Prefix length"/>
                <param argument="--suffix_length" type="integer" value="50" label="Suffix length"/>
            </when>
            <!-- Kept although the cdhit option is commented out of the selector above. -->
            <when value="cdhit">
                <expand macro="pick_otus_similarity"/>
            </when>
            <when value="uclust">
                <expand macro="pick_otus_similarity"/>
                <expand macro="pick_otus_denovo_otu_id_prefix"/>
                <expand macro="pick_otus_enable_rev_strand_match"/>
                <expand macro="pick_otus_suppress_presort_by_abundance_uclust"/>
                <param argument="--optimal_uclust" type="boolean" truevalue="--optimal_uclust" falsevalue="" checked="false" label="Pass the --optimal flag to uclust?"/>
                <param argument="--exact_uclust" type="boolean" truevalue="--exact_uclust" falsevalue="" checked="false" label="Pass the --exact flag to uclust?"/>
                <param argument="--user_sort" type="boolean" truevalue="--user_sort" falsevalue="" checked="false" label="Pass the -user_sort flag to uclust?"/>
                <expand macro="pick_otus_max"/>
                <expand macro="pick_otus_stepwords"/>
                <expand macro="pick_otus_word_length"/>
                <param argument="--suppress_uclust_stable_sort" type="boolean" truevalue="--suppress_uclust_stable_sort" falsevalue="" checked="false" label="Don't pass --stable-sort to uclust?"/>
                <expand macro="pick_otus_suppress_prefilter_exact_match"/>
            </when>
        </conditional>
        <!-- Optional with no default: the command template only passes it when a value was entered. -->
        <param argument="--prefix_prefilter_length" type="integer" label="Threshold to automatically group first identical prefix_prefilter_length into a single OTU" help="This is useful for large sequence collections where OTU picking doesn't scale well" optional="true"/>
        <param argument="--trie_prefilter" type="boolean" truevalue="--trie_prefilter" falsevalue="" checked="false" label="Prefilter data so seqs which are identical prefixes of a longer seq are automatically grouped into a single OTU?" help="This is useful for large sequence collections where OTU picking doesn't scale well"/>
        <param argument="--non_chimeras_retention" type="select" label="Selects subsets of sequences detected as non-chimeras to retain after de novo and reference based chimera detection">
            <option value="intersection">Intersection (retains only those sequences that are flagged as non-chimeras from both detection methods)</option>
            <option value="union" selected="true">Union (retains sequences that are flagged as non-chimeric from either filter)</option>
        </param>
    </inputs>
    <!-- Result datasets, all collected from the pick_otus directory that the command writes to. -->
    <outputs>
        <!-- The OTU map and the log are declared for every method. -->
        <data name="otus" format="txt" from_work_dir="pick_otus/*_otus.txt" label="${tool.name} on ${on_string}: OTUs"/>
        <data name="log" format="txt" from_work_dir="pick_otus/*_otus.log" label="${tool.name} on ${on_string}: Log"/>
        <!-- The failures file is only declared for the sortmerna and uclust_ref methods. -->
        <data name="failures" format="txt" from_work_dir="pick_otus/*_failures.txt" label="${tool.name} on ${on_string}: Failures">
            <filter>method['otu_picking_method'] == "sortmerna" or method['otu_picking_method'] == "uclust_ref"</filter>
        </data>
        <!-- The SortMeRNA alignment table is only declared when sortmerna is selected and tabular output was requested. -->
        <data name="blast_output" format="tabular" from_work_dir="pick_otus/sortmerna_otus.blast" label="${tool.name} on ${on_string}: SortMeRNA Blast output">
            <filter>method['otu_picking_method'] == "sortmerna" and method["sortmerna_tabular"]["test"] == "yes"</filter>
        </data>
    </outputs>
    <tests>
        <!-- uclust method with every boolean flag left empty; checks the OTU map checksum and the picker name in the log. -->
        <test>
            <param name="input_seqs_filepath" value="pick_otus/seqs.fna"/>
            <param name="otu_picking_method" value="uclust"/>
            <param name="similarity" value="0.97" />
            <param name="denovo_otu_id_prefix" value="denovo"/>
            <param name="enable_rev_strand_match" value=""/>
            <param name="suppress_presort_by_abundance_uclust" value=""/>
            <param name="optimal_uclust" value=""/>
            <param name="exact_uclust" value=""/>
            <param name="user_sort" value=""/>
            <param name="max_accepts" value="1"/>
            <param name="max_rejects" value="8"/>
            <param name="stepwords" value="8"/>
            <param name="word_length" value="8"/>
            <param name="suppress_uclust_stable_sort" value=""/>
            <param name="suppress_prefilter_exact_match" value=""/>
            <param name="trie_prefilter" value=""/>
            <param name="non_chimeras_retention" value="union"/>
            <output name="otus" md5="2ca01ee393e8a795e5d09a15c5ca77c3"/>
            <output name="log">
                <assert_contents>
                    <has_text text="UclustOtuPicker parameters"/>
                </assert_contents>
            </output>
        </test>
        <!-- sortmerna method with a reference from the history and tabular output enabled, so all four outputs are checked. -->
        <test>
            <param name="input_seqs_filepath" value="pick_otus/seqs.fna"/>
            <param name="otu_picking_method" value="sortmerna"/>
            <param name="source_selector" value="history"/>
            <param name="refseqs_fp" value="pick_otus/refseqs.fasta"/>
            <param name="sortmerna_e_value" value="1"/>
            <param name="sortmerna_coverage" value="0.97"/>
            <param name="test" value="yes"/>
            <param name="sortmerna_best_N_alignments" value="1"/>
            <param name="sortmerna_max_pos" value="10000"/>
            <param name="similarity" value="0.97"/>
            <param name="suppress_prefilter_exact_match" value=""/>
            <param name="trie_prefilter" value=""/>
            <param name="non_chimeras_retention" value="union"/>
            <output name="otus" md5="dbc3e13e7da742ac3df317054821f522"/>
            <output name="log">
                <assert_contents>
                    <has_text text="OtuPicker parameters"/>
                    <has_text text="SortMeRNA database"/>
                </assert_contents>
            </output>
            <output name="failures" md5="cce9699de546618df2db9de7a8098916"/>
            <output name="blast_output" md5="214b1674f5f84dcb68c0c0c1c8a001be"/>
        </test>
        <!-- trie method without sequence reversal; checks the OTU map checksum and the picker name in the log. -->
        <test>
            <param name="input_seqs_filepath" value="pick_otus/seqs.fna"/>
            <param name="otu_picking_method" value="trie"/>
            <param name="trie_reverse_seqs" value=""/>
            <param name="trie_prefilter" value=""/>
            <param name="non_chimeras_retention" value="union"/>
            <output name="otus" md5="441c2bc34cd6766e75585dbe2ea09f9b"/>
            <output name="log">
                <assert_contents>
                    <has_text text="TrieOtuPicker parameters:"/>
                </assert_contents>
            </output>
        </test>
        <!-- uclust_ref method with a reference from the history; the expected failures checksum is the MD5 of empty content. -->
        <test>
            <param name="input_seqs_filepath" value="pick_otus/seqs.fna"/>
            <param name="otu_picking_method" value="uclust_ref"/>
            <param name="source_selector" value="history"/>
            <param name="refseqs_fp" value="pick_otus/refseqs.fasta"/>
            <param name="similarity" value="0.97"/>
            <param name="enable_rev_strand_match" value=""/>
            <param name="suppress_presort_by_abundance_uclust" value=""/>
            <param name="suppress_new_clusters" value="" />
            <param name="max_accepts" value="1"/>
            <param name="max_rejects" value="8"/>
            <param name="stepwords" value="8"/>
            <param name="word_length" value="8"/>
            <param name="trie_prefilter" value=""/>
            <param name="non_chimeras_retention" value="union"/>
            <output name="otus" md5="e44b21022227e9fc213e13fd028efb81"/>
            <output name="log">
                <assert_contents>
                    <has_text text="OtuPicker parameters"/>
                    <has_text text="Application:uclust"/>
                </assert_contents>
            </output>
            <output name="failures" md5="d41d8cd98f00b204e9800998ecf8427e"/>
        </test>
        <!-- swarm method with a resolution of 1; checks the OTU map checksum and that the log lists picker parameters. -->
        <test>
            <param name="input_seqs_filepath" value="pick_otus/seqs.fna"/>
            <param name="otu_picking_method" value="swarm"/>
            <param name="denovo_otu_id_prefix" value="denovo"/>
            <param name="swarm_resolution" value="1"/>
            <param name="trie_prefilter" value=""/>
            <param name="non_chimeras_retention" value="union"/>
            <output name="otus" md5="9b603183a1116071e762e31525928f94"/>
            <output name="log">
                <assert_contents>
                    <has_text text="OtuPicker parameters"/>
                </assert_contents>
            </output>
        </test>
        <!-- prefix_suffix method with prefix and suffix lengths of 50; checks the OTU map checksum and the picker name in the log. -->
        <test>
            <param name="input_seqs_filepath" value="pick_otus/seqs.fna"/>
            <param name="otu_picking_method" value="prefix_suffix"/>
            <param name="prefix_length" value="50"/>
            <param name="suffix_length" value="50"/>
            <param name="trie_prefilter" value=""/>
            <param name="non_chimeras_retention" value="union"/>
            <output name="otus" md5="3bf79c70affb264a977fd8f2f34f2889"/>
            <output name="log">
                <assert_contents>
                    <has_text text="PrefixSuffixOtuPicker parameters:"/>
                </assert_contents>
            </output>
        </test>
        <!-- NOTE(review): uses the same parameters and expected outputs as the first uclust test in this file; confirm whether the repetition is intended. -->
        <test>
            <param name="input_seqs_filepath" value="pick_otus/seqs.fna"/>
            <param name="otu_picking_method" value="uclust"/>
            <param name="similarity" value="0.97"/>
            <param name="denovo_otu_id_prefix" value="denovo"/>
            <param name="enable_rev_strand_match" value=""/>
            <param name="suppress_presort_by_abundance_uclust" value=""/>
            <param name="optimal_uclust" value=""/>
            <param name="exact_uclust" value=""/>
            <param name="user_sort" value=""/>
            <param name="max_accepts" value="1"/>
            <param name="max_rejects" value="8"/>
            <param name="stepwords" value="8"/>
            <param name="word_length" value="8"/>
            <param name="suppress_uclust_stable_sort" value=""/>
            <param name="suppress_prefilter_exact_match" value=""/>
            <param name="trie_prefilter" value=""/>
            <param name="non_chimeras_retention" value="union"/>
            <output name="otus" md5="2ca01ee393e8a795e5d09a15c5ca77c3"/>
            <output name="log">
                <assert_contents>
                    <has_text text="UclustOtuPicker parameters"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

The OTU picking step assigns similar sequences to operational taxonomic units, or OTUs, by clustering sequences based on a user-defined similarity threshold. Sequences which are similar at or above the threshold level are taken to represent the presence of a taxonomic unit (e.g., a genus, when the similarity threshold is set at 0.94) in the sequence collection.

Currently, the following clustering methods have been implemented in QIIME:

1.  uclust, creates "seeds" of sequences which generate clusters based on percent identity.

2.  uclust_ref, as uclust, but takes a reference database to use as seeds.  New clusters can be toggled on or off.

3.  usearch, creates "seeds" of sequences which generate clusters based on percent identity, filters low abundance clusters, performs de novo and reference based chimera detection.

4.  usearch_ref, as usearch, but takes a reference database to use as seeds.  New clusters can be toggled on or off.

5. sumaclust, creates "seeds" of sequences which generate clusters based on similarity threshold.

6. swarm, creates "seeds" of sequences which generate clusters based on a resolution threshold.


Chimera checking with usearch 6.X is implemented in identify_chimeric_seqs.py.  Chimera checking should be done first with usearch 6.X, and the filtered resulting fasta file can then be clustered.


The primary inputs for pick_otus.py are:

1. A FASTA file containing sequences to be clustered

2. An OTU threshold (default is 0.97, roughly corresponding to species-level OTUs);

3. The method to be applied for clustering sequences into OTUs.

pick_otus.py takes a standard fasta file as input.


The output consists of two files (i.e. seqs_otus.txt and seqs_otus.log). The .txt file is composed of tab-delimited lines, where the first field on each line corresponds to an (arbitrary) cluster identifier, and the remaining fields correspond to sequence identifiers assigned to that cluster. Sequence identifiers correspond to those provided in the input FASTA file.  Usearch (i.e. usearch quality filter) can additionally have log files for each intermediate call to usearch.

Example lines from the resulting .txt file:

=   ====    ====    ====
0   seq1    seq5
1   seq2
2   seq3
3   seq4    seq6    seq7
=   ====    ====    ====

This result implies that four clusters were created based on 7 input sequences.
The first cluster (cluster id 0) contains two sequences, sequence ids seq1 and
seq5; the second cluster (cluster id 1) contains one sequence, sequence id seq2;
the third cluster (cluster id 2) contains one sequence, sequence id seq3, and the
final cluster (cluster id 3) contains three sequences, sequence ids seq4, seq6,
and seq7.

The resulting .log file contains a list of parameters passed to the pick_otus.py
script along with the output location of the resulting .txt file.
    ]]></help>
    <!-- Citations from the "citations" macro in macros.xml plus four DOIs listed directly for this tool. -->
    <citations>
        <expand macro="citations"/>
        <citation type="doi">10.1093/bioinformatics/btv231</citation>
        <citation type="doi">10.1093/bioinformatics/btq461</citation>
        <citation type="doi">10.1093/bioinformatics/bts611</citation>
        <!-- NOTE(review): this DOI ends in a supplementary-file suffix (supp-1); confirm it resolves to the intended publication. -->
        <citation type="doi">10.7287/peerj.preprints.386v1/supp-1</citation>
    </citations>
</tool>
author	iuc
date	Thu, 05 Dec 2019 07:47:04 -0500
parents	127947ced93f
children