view stacks_denovomap.xml @ 4:998dfec32bde draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stacks commit 643293a896a2ccfac10fe995f48c7f01c1a89a7f
author iuc
date Mon, 26 Sep 2016 11:44:20 -0400
parents bab631cc9f64
children 34000c653ae5
line wrap: on
line source

<tool id="stacks_denovomap" name="Stacks: de novo map" version="@WRAPPER_VERSION@.0">
    <description>the Stacks pipeline without a reference genome (denovo_map.pl)</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <expand macro="stdio"/>
    <command><![CDATA[
        #from os.path import splitext
        #import re

        #if str( $options_usage.rad_analysis_type ) == "genetic":
            #for $input_parent in $options_usage.parent_sequences:

                #if $input_parent.ext == "fastqsanger":
                    #set $data_path = splitext($input_parent.element_identifier)[0]
                    #set $data_path = re.sub(r'\.1$', '', $data_path)
                    #set $data_path = $data_path + ".fq"
                #else:
                    #set $data_path = splitext($input_parent.element_identifier)[0]
                    #set $data_path = re.sub(r'\.1$', '', $data_path)
                    #set $data_path = $data_path + ".fa"
                #end if

                ln -s "${input_parent}" "${data_path}" &&
            #end for

            #for $input_progeny in $options_usage.progeny_sequences:
                #if $input_progeny:
                    #if $input_progeny.ext == "fastqsanger":
                        #set $data_path = splitext($input_progeny.element_identifier)[0]
                        #set $data_path = re.sub(r'\.1$', '', $data_path)
                        #set $data_path = $data_path + ".fq"
                    #else:
                        #set $data_path = splitext($input_progeny.element_identifier)[0]
                        #set $data_path = re.sub(r'\.1$', '', $data_path)
                        #set $data_path = $data_path + ".fa"
                    #end if

                    ln -s "${input_progeny}" "${data_path}" &&
                #end if
            #end for
        #else:
            #for $input_indiv in $options_usage.individual_sample:

                #if $input_indiv.ext == "fastqsanger":
                    #set $data_path = splitext($input_indiv.element_identifier)[0]
                    #set $data_path = re.sub(r'\.1$', '', $data_path)
                    #set $data_path = $data_path + ".fq"
                #else:
                    #set $data_path = splitext($input_indiv.element_identifier)[0]
                    #set $data_path = re.sub(r'\.1$', '', $data_path)
                    #set $data_path = $data_path + ".fa"
                #end if

                ln -s "${input_indiv}" "${data_path}" &&
            #end for
        #end if

        mkdir stacks_outputs

        &&

        denovo_map.pl

            -T \${GALAXY_SLOTS:-1}

            #if str( $options_usage.rad_analysis_type ) == "genetic":
                #for $input_parent in $options_usage.parent_sequences:
                    #if $input_parent.ext == "fastqsanger":
                        #set $data_path = splitext($input_parent.element_identifier)[0]
                        #set $data_path = re.sub(r'\.1$', '', $data_path)
                        #set $data_path = $data_path + ".fq"
                    #else:
                        #set $data_path = splitext($input_parent.element_identifier)[0]
                        #set $data_path = re.sub(r'\.1$', '', $data_path)
                        #set $data_path = $data_path + ".fa"
                    #end if

                    -p "${data_path}"
                #end for

                -A $options_usage.cross_type

                #for $input_progeny in $options_usage.progeny_sequences:
                    #if $input_progeny:
                        #if $input_progeny.ext == "fastqsanger":
                            #set $data_path = splitext($input_progeny.element_identifier)[0]
                            #set $data_path = re.sub(r'\.1$', '', $data_path)
                            #set $data_path = $data_path + ".fq"
                        #else:
                            #set $data_path = splitext($input_progeny.element_identifier)[0]
                            #set $data_path = re.sub(r'\.1$', '', $data_path)
                            #set $data_path = $data_path + ".fa"
                        #end if

                        -r "${data_path}"
                    #end if
                #end for

                #if str($assembly_options.P):
                    -P $assembly_options.P
                #end if
            #else:
                #for $i_indiv, $input_indiv in enumerate($options_usage.individual_sample):

                    #if $input_indiv.ext == "fastqsanger":
                        #set $data_path = splitext($input_indiv.element_identifier)[0]
                        #set $data_path = re.sub(r'\.1$', '', $data_path)
                        #set $data_path = $data_path + ".fq"
                    #else:
                        #set $data_path = splitext($input_indiv.element_identifier)[0]
                        #set $data_path = re.sub(r'\.1$', '', $data_path)
                        #set $data_path = $data_path + ".fa"
                    #end if

                    -s "${data_path}"
                #end for
                -O "$options_usage.popmap"
            #end if

            #if str($assembly_options.m):
                -m $assembly_options.m
            #end if
            #if str($assembly_options.N):
                -N $assembly_options.N
            #end if
            -M $assembly_options.M
            -n $assembly_options.n
            $assembly_options.t
            $assembly_options.H

            ## Batch description
            -b 1

            ## No SQL recording
            -S

            ## snp_model
            #if str( $snp_options.select_model.model_type) == "bounded":
                --bound_low $snp_options.select_model.bound_low
                --bound_high $snp_options.select_model.bound_high
                --alpha $snp_options.select_model.alpha
            #else if str( $snp_options.select_model.model_type) == "snp":
                --alpha $snp_options.select_model.alpha
            #end if

            -o stacks_outputs

            #if str( $options_usage.rad_analysis_type ) == "genetic":
                @NORM_GENOTYPES_OUTPUT_LIGHT@
            #end if
    ]]></command>

    <inputs>
        <conditional name="options_usage">
            <param name="rad_analysis_type" type="select" label="Select your usage">
                <option value="genetic" selected="true">Genetic map</option>
                <option value="population">Population</option>
            </param>
            <when value="genetic">
                <param name="parent_sequences" argument="-p" format="fastqsanger,fasta" type="data" multiple="true" label="Files containing parent sequences" help="Files containing parent sequences from a mapping cross (only R1 reads). Dataset names will be used as sample name (no space allowed)." />

                <param name="progeny_sequences" argument="-r" format="fastqsanger,fasta" type="data" multiple="true" optional="true" label="Files containing progeny sequences" help="files containing progeny sequences from a mapping cross (only R1 reads). Dataset names will be used as sample name (no space allowed)." />

                <param name="cross_type" argument="-A" type="select" label="Cross type">
                    <expand macro="cross_types"/>
                </param>
            </when>
            <when value="population">
                <param name="individual_sample" argument="-s" format="fastqsanger,fasta" type="data" multiple="true" label="Files containing an individual sample from a population" help="files containing an individual sample from a population (only R1 reads). Dataset names will be used as sample name (no space allowed)." />
                <param name="popmap" argument="-O" type="data" format="tabular,txt" label="Specify a population map" />
            </when>
        </conditional>

        <!-- stack assembly options -->
        <section name="assembly_options" title="Assembly options" expanded="false">
            <param name="m" argument="-m" type="integer" value="" optional="true" label="Minimum number of identical raw reads required to create a stack" />
            <param name="P" argument="-P" type="integer" value="" optional="true" label="Minimum number of identical, raw reads required to create a stack in 'progeny' individuals" />
            <param name="M" argument="-M" type="integer" value="2" label="Number of mismatches allowed between loci when processing a single individual"/>
            <param name="N" argument="-N" type="integer" value="" optional="true" label="Number of mismatches allowed when aligning secondary reads" help="default is [-M]+2" />
            <param name="n" argument="-n" type="integer" value="1" label="Number of mismatches allowed between loci when building the catalog"/>

            <param name="t" argument="-t" type="boolean" checked="false" truevalue="-t" falsevalue="" label="Remove, or break up, highly repetitive RAD-Tags in the ustacks program" />
            <param name="H" argument="-H" type="boolean" checked="false" truevalue="-H" falsevalue="" label="Disable calling haplotypes from secondary reads" />
        </section>

        <!-- SNP Model options -->
        <section name="snp_options" title="SNP Model Options (ustacks options)" expanded="False">
            <expand macro="snp_options"/>
        </section>
    </inputs>
    <outputs>
        <data format="txt" name="output_log" label="denovo_map.log with ${tool.name} on ${on_string}" from_work_dir="stacks_outputs/denovo_map.log" />

        <data format="tabular" name="catalogtags" label="Catalog assembled loci (tags) with ${tool.name} on ${on_string}" from_work_dir="stacks_outputs/batch_1.catalog.tags.tsv" />
        <data format="tabular" name="catalogsnps" label="Catalog model calls (snps) with ${tool.name} on ${on_string}" from_work_dir="stacks_outputs/batch_1.catalog.snps.tsv" />
        <data format="tabular" name="catalogalleles" label="Catalog haplotypes (alleles) with ${tool.name} on ${on_string}" from_work_dir="stacks_outputs/batch_1.catalog.alleles.tsv" />

        <expand macro="genotypes_output_light"/>
        <expand macro="populations_output_light"/>

        <collection name="tags" type="list" label="Assembled loci from ${on_string}">
            <discover_datasets pattern="(?P&lt;name&gt;.+\.tags)\.tsv$" ext="tabular" directory="stacks_outputs" />
        </collection>

        <collection name="snps" type="list" label="Model calls from each locus on ${on_string}">
            <discover_datasets pattern="(?P&lt;name&gt;.+\.snps)\.tsv$" ext="tabular" directory="stacks_outputs" />
        </collection>

        <collection name="alleles" type="list" label="Haplotypes/alleles recorded from each locus on ${on_string}">
            <discover_datasets pattern="(?P&lt;name&gt;.+\.alleles)\.tsv$" ext="tabular" directory="stacks_outputs" />
        </collection>

        <collection name="matches" type="list" label="Matches to the catalog on ${on_string}">
            <discover_datasets pattern="(?P&lt;name&gt;.+\.matches)\.tsv$" ext="tabular" directory="stacks_outputs" />
        </collection>

        <collection name="all_output" type="list" label="Full output from denovo_map on ${on_string}">
            <discover_datasets pattern="(?P&lt;name&gt;.+\.(tags|snps|alleles|matches))\.tsv$" ext="tabular" directory="stacks_outputs" />
            <discover_datasets pattern="(?P&lt;name&gt;.+\.(haplotypes|genotypes|markers|hapstats|sumstats|sumstats_summary))\.tsv$" ext="tabular" directory="stacks_outputs" />
            <discover_datasets pattern="(?P&lt;name&gt;.+\.(genotypes))\.(loc|txt)$" ext="txt" directory="stacks_outputs" />
        </collection>
    </outputs>

    <tests>
        <test>
            <param name="options_usage|rad_analysis_type" value="genetic"/>
            <param name="options_usage|parent_sequences" value="demultiplexed/PopA_01.1.fq" ftype="fastqsanger" />
            <output name="output_log">
                <assert_contents>
                    <has_text text="denovo_map.pl completed" />
                </assert_contents>
            </output>

            <!-- catalog -->
            <output name="catalogtags">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>
            <output name="catalogsnps">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>
            <output name="catalogalleles">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>

            <!-- genotypes -->
            <output name="out_generic_haplo">
                <assert_contents>
                    <has_text text="Catalog ID" />
                </assert_contents>
            </output>
            <output name="out_sql_markers">
                <assert_contents>
                    <has_text text="Total Genotypes" />
                </assert_contents>
            </output>
            <output name="out_joinmap">
                <assert_contents>
                    <has_text text="batch_1.genotypes_" />
                </assert_contents>
            </output>
            <output name="out_sql_genotypes">
                <assert_contents>
                    <has_text text="SQL ID" />
                </assert_contents>
            </output>

            <!-- samples -->
            <output_collection name="tags">
                <element name="PopA_01.tags">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="snps">
                <element name="PopA_01.snps">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="alleles">
                <element name="PopA_01.alleles">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="matches">
                <element name="PopA_01.matches">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
        </test>

        <test>
            <param name="options_usage|rad_analysis_type" value="genetic"/>
            <param name="options_usage|parent_sequences" value="demultiplexed/PopA_01.1.fq" ftype="fastqsanger" />
            <param name="options_usage|progeny_sequences" value="demultiplexed/PopA_02.1.fq" ftype="fastqsanger" />
            <output name="output_log">
                <assert_contents>
                    <has_text text="denovo_map.pl completed" />
                </assert_contents>
            </output>

            <!-- catalog -->
            <output name="catalogtags">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>
            <output name="catalogsnps">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>
            <output name="catalogalleles">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>

            <!-- genotypes -->
            <output name="out_generic_haplo">
                <assert_contents>
                    <has_text text="Catalog ID" />
                </assert_contents>
            </output>
            <output name="out_sql_markers">
                <assert_contents>
                    <has_text text="Total Genotypes" />
                </assert_contents>
            </output>
            <output name="out_joinmap">
                <assert_contents>
                    <has_text text="batch_1.genotypes_" />
                </assert_contents>
            </output>
            <output name="out_sql_genotypes">
                <assert_contents>
                    <has_text text="SQL ID" />
                </assert_contents>
            </output>

            <!-- samples -->
            <output_collection name="tags">
                <element name="PopA_01.tags">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="snps">
                <element name="PopA_01.snps">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="alleles">
                <element name="PopA_01.alleles">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="matches">
                <element name="PopA_01.matches">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
        </test>

        <test>
            <param name="options_usage|rad_analysis_type" value="population"/>
            <param name="options_usage|individual_sample" value="demultiplexed/PopA_01.1.fq,demultiplexed/PopA_02.1.fq,demultiplexed/PopA_03.1.fq,demultiplexed/PopA_04.1.fq,demultiplexed/PopB_01.1.fq,demultiplexed/PopB_02.1.fq,demultiplexed/PopB_03.1.fq,demultiplexed/PopB_04.1.fq" ftype="fastqsanger" />
            <param name="options_usage|popmap" value="denovo_map/popmap.tsv" />
            <output name="output_log">
                <assert_contents>
                    <has_text text="denovo_map.pl completed" />
                </assert_contents>
            </output>

            <!-- catalog -->
            <output name="catalogtags">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>
            <output name="catalogsnps">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>
            <output name="catalogalleles">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>

            <!-- populations -->
            <output name="out_haplotypes">
                <assert_contents>
                    <has_text text="PopA_01" />
                </assert_contents>
            </output>
            <output name="out_hapstats">
                <assert_contents>
                    <has_text text="Smoothed Gene Diversity" />
                </assert_contents>
            </output>
            <output name="out_populations_log">
                <assert_contents>
                    <has_text text="populations version" />
                </assert_contents>
            </output>
            <output name="out_sumstats_sum">
                <assert_contents>
                    <has_text text="Polymorphic Sites" />
                </assert_contents>
            </output>
            <output name="out_sumstats">
                <assert_contents>
                    <has_text text="Smoothed Pi" />
                </assert_contents>
            </output>

            <!-- samples -->
            <output_collection name="tags">
                <element name="PopA_01.tags">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="snps">
                <element name="PopA_01.snps">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="alleles">
                <element name="PopA_01.alleles">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="matches">
                <element name="PopA_01.matches">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>

    <help>
<![CDATA[
.. class:: infomark

**What it does**

This program will run each of the Stacks components: first, running ustacks on each of the samples specified, building loci and calling SNPs in each. Second, cstacks will be run to create a catalog of all loci that were marked as 'parents' or 'samples' on the command line, and finally, sstacks will be executed to match each sample against the catalog. A bit more detail on this process can be found in the FAQ. The denovo_map.pl program will also load the results of each stage of the analysis: individual loci, the catalog, and matches against the catalog into the database (although this can be disabled). After matching, the program will build a database index to speed up access (index_radtags.pl) and enable web-based filtering.

--------

**Input files**

FASTQ, FASTA

- Population map::

    indv_01    1
    indv_02    1
    indv_03    1
    indv_04    2
    indv_05    2
    indv_06    2


**Output files**


- XXX.tags.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence.


- XXX.snps.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one.


- XXX.alleles.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_


- XXX.matches.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample.


- other files:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

@STACKS_INFOS@
]]>
    </help>
    <expand macro="citation" />
</tool>