Mercurial > repos > iuc > stacks_genotypes

<tool id="stacks_genotypes" name="Stacks: genotypes" version="@WRAPPER_VERSION@.0">
    <description>analyse haplotypes or genotypes in a genetic cross ('genotypes' program)</description>
    <expand macro="bio_tools"/>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <expand macro="stdio"/>
    <command><![CDATA[
        #import re

        mkdir stacks_outputs

        &&

        #for $input_file in $input_col
            #set $filename = str($input_file.element_identifier)
            #if not $filename.endswith('.tsv')
                #set $filename = $filename + ".tsv"
            #end if
            #if re.search('\.(tags|snps|alleles|matches)(\.tsv)?$', $filename)
                ln -s '${input_file}' 'stacks_outputs/${filename}' &&
            #end if
        #end for

        genotypes

            -P stacks_outputs
            -b $advanced_options.batchid

            -t $options_usage.cross_type
            -o $options_usage.map_out.map_out_type

            #if str( $options_usage.map_out.map_out_type ) == "genomic"
                -e ${options_usage.map_out.enzyme}
            #end if

            #if str($advanced_options.minprogeny)
                -r $advanced_options.minprogeny
            #end if

            #if str($advanced_options.mindepth)
                -m $advanced_options.mindepth
            #end if

            #if str($advanced_options.lnl)
                --lnl_lim $advanced_options.lnl
            #end if

            #if $advanced_options.blacklist
                -B '$advanced_options.blacklist'
            #end if
            #if $advanced_options.whitelist
                -W '$advanced_options.whitelist'
            #end if

            #if $advanced_options.manual_cor
                --corr_path '$advanced_options.manual_cor'
            #end if

            #if $options_autocorr.corrections
                -c
                --min_hom_seqs $options_autocorr.hom
                --min_het_seqs $options_autocorr.het
                --max_het_seqs $options_autocorr.hetmax
            #end if

            ## output SQL file (as denovo/refmap)
            -s

            @NORM_GENOTYPES_OUTPUT_FULL@

            &&

            stacks_summary.py --stacks-prog genotypes --res-dir stacks_outputs --summary stacks_outputs/summary.html
    ]]></command>
    <inputs>
        <param name="input_col" format="tabular,txt" type="data_collection" collection_type="list" label="Output from previous Stacks pipeline steps (e.g. denovo_map or refmap)" />

        <section name="options_usage" title="Genotyping options">

            <param name="cross_type" argument="-t" type="select" label="Cross type">
                <expand macro="cross_types"/>
            </param>

            <conditional name="map_out">
                <param argument="-o" name="map_out_type" type="select" label="Output file type" help="Output map file type to write" >
                    <option value="joinmap">JoinMap</option>
                    <option value="onemap">OneMap</option>
                    <option value="rqtl">R/QTL</option>
                    <option value="genomic">Genomic</option>
                </param>
                <when value="joinmap"/>
                <when value="onemap"/>
                <when value="rqtl"/>
                <when value="genomic">
                    <param name="enzyme" argument="-e" type="select" label="Restriction enzyme used" help="Only needed for Genomic output format">
                        <expand macro="enzymes"/>
                    </param>
                </when>
            </conditional>
        </section>

        <conditional name="options_autocorr">
            <param name="corrections" argument="-c" type="boolean" checked="true" truevalue="-c" falsevalue="" label="Make automated corrections to the data" />
            <when value="-c">
                <param name="hom" argument="--min_hom_seqs" type="integer" value="5" label="Minimum number of reads required at a stack to call a homozygous genotype" />
                <param name="het" argument="--min_het_seqs:" type="float" value="0.05" label="Heterozygote minor allele minimum frequency" help="below this minor allele frequency a stack is called a homozygote, above it (but below --max_het_seqs) it is called unknown" />
                <param name="hetmax" argument="--max_het_seqs:" type="float" value="0.1" label="Heterozygote minor allele maximum frequency" help="minimum frequency of minor allele to call a heterozygote" />
            </when>
            <when value="">
            </when>
        </conditional>

        <!-- Output options -->
        <section name="advanced_options" title="advanced options" expanded="False">
            <param name="minprogeny" type="integer" value="0" optional="true" argument="-r" label="Minimum number of progeny required to print a marker" />
            <param name="mindepth" type="integer" value="" optional="true" argument="-m" label="Minimum stack depth required before exporting a locus in a particular individual" />
            <param name="lnl" type="float" value="" optional="true" argument="--lnl_lim" label="Filter loci with log likelihood values below this threshold" />

            <param name="whitelist" argument="-W" format="txt,tabular" type="data" optional="true" label="Specify a file containing markers to include in the export" />
            <param name="blacklist" argument="-B" format="txt,tabular" type="data" optional="true" label="Specify a file containing markers to exclude from the export" />

            <param name="manual_cor" argument="--cor_path" type="data" format="tabular,txt" optional="true" label="Path to file containing manual genotype corrections from a Stacks SQL database to incorporate into output." />

            <param name="batchid" type="integer" value="1" label="Batch ID to examine when exporting from the catalog" help="Only useful if you analyse data that was processed outside galaxy" />
        </section>
    </inputs>
    <outputs>
        <expand macro="genotypes_output_full"/>

        <data format="html" name="output_summary" label="Summary from ${tool.name} on ${on_string}" from_work_dir="stacks_outputs/summary.html" />
    </outputs>

    <tests>
        <test>
            <param name="input_col">
                <collection type="list">
                    <element name="batch_1.catalog.alleles.tsv" ftype="tabular" value="genotypes/batch_1.catalog.alleles.tsv" />
                    <element name="batch_1.catalog.snps.tsv" ftype="tabular" value="genotypes/batch_1.catalog.snps.tsv" />
                    <element name="batch_1.catalog.tags.tsv" ftype="tabular" value="genotypes/batch_1.catalog.tags.tsv" />
                    <element name="PopA_01.alleles.tsv" ftype="tabular" value="genotypes/PopA_01.alleles.tsv" />
                    <element name="PopA_01.matches.tsv" ftype="tabular" value="genotypes/PopA_01.matches.tsv" />
                    <element name="PopA_01.snps.tsv" ftype="tabular" value="genotypes/PopA_01.snps.tsv" />
                    <element name="PopA_01.tags.tsv" ftype="tabular" value="genotypes/PopA_01.tags.tsv" />
                    <element name="PopA_02.alleles.tsv" ftype="tabular" value="genotypes/PopA_02.alleles.tsv" />
                    <element name="PopA_02.matches.tsv" ftype="tabular" value="genotypes/PopA_02.matches.tsv" />
                    <element name="PopA_02.snps.tsv" ftype="tabular" value="genotypes/PopA_02.snps.tsv" />
                    <element name="PopA_02.tags.tsv" ftype="tabular" value="genotypes/PopA_02.tags.tsv" />
                </collection>
            </param>
            <param name="map_out_type" value="joinmap" />
            <param name="cross_type" value="CP" />
            <param name="advanced_options|minprogeny" value="1" />

            <output name="output_summary">
                <assert_contents>
                    <has_text text="Stacks Statistics" />
                </assert_contents>
            </output>

            <!-- genotypes -->
            <output name="out_generic_haplo">
                <assert_contents>
                    <has_text text="Catalog ID" />
                </assert_contents>
            </output>
            <output name="out_sql_markers">
                <assert_contents>
                    <has_text text="Total Genotypes" />
                </assert_contents>
            </output>
            <output name="out_joinmap">
                <assert_contents>
                    <has_text text="batch_1.genotypes_" />
                </assert_contents>
            </output>
            <output name="out_sql_genotypes">
                <assert_contents>
                    <has_text text="SQL ID" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[
.. class:: infomark

**What it does**

This program exports a Stacks data set either as a set of observed haplotypes at each locus in the population, or with the haplotypes encoded into genotypes. The -r option allows only loci that exist in a certain number of population individuals to be exported. In a mapping context, raising or lowering this limit is an effective way to control the quality level of markers exported as genuine markers will be found in a large number of progeny. If exporting a set of observed haplotypes in a population, the "min stack depth" option can be used to restict exported loci to those that have a minimum depth of reads.

By default, when executing the pipeline (either denovo_map or ref_map) the genotypes program will be executed last and will identify mappable markers in the population and export both a set of observed haplotypes and a set of generic genotypes with "min number of progeny" option = 1.


Making Corrections

If enabled with the "make automated corrections to the data" option, the genotypes program will make automated corrections to the data. Since loci are matched up in the population, the script can correct false-negative heterozygote alleles since it knows the existence of alleles at a particular locus in the other individuals. For example, the program will identify loci with SNPs that didn’t have high enough coverage to be identified by the SNP caller. It will also check that homozygous tags have a minimum depth of coverage, since a low-coverage polymorphic locus may appear homozygous simply because the other allele wasn’t sequenced.


Correction Thresholds

The thresholds for automatic corrections can be modified by using the "automated corrections option" and changing the default values for the "min number of reads for homozygous genotype", "homozygote minor minimum allele frequency" and "heterozygote minor minimum allele frequency" parameters to genotypes. "min number of reads for homozygous genotype" is the minimum number of reads required to consider a stack homozygous (default of 5). The "homozygote minor minimum allele frequency" and "heterozygote minor minimum allele frequency" variables represent fractions. If the ratio of the depth of the the smaller allele to the bigger allele is greater than "heterozygote minor minimum allele frequency" (default of 1/10) a stack is called a het. If the ratio is less than homozygote minor minimum allele frequency (default of 1/20) a stack is called homozygous. If the ratio is in between the two values it is unknown and a genotype will not be assigned.

Automated corrections made by the program are shown in the output file in capital letters.

--------

**Input files**

Output from denovo_map or ref_map

**Output files:**

- XXX.tags.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence.


- XXX.snps.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one.


- XXX.alleles.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_


- XXX.matches.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample.


- other files:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

@STACKS_INFOS@
]]>
    </help>
    <expand macro="citation" />
</tool>
author	iuc
date	Tue, 22 Mar 2022 23:21:14 +0000
parents	00f4a4e553ae
children	62a4b8e5e139