view stacks_gstacks.xml @ 6:7b72fde3d27e draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stacks2 commit 2f4c9bfc48d63075ae18a1687e8d01ffea509084
author iuc
date Wed, 11 May 2022 06:40:42 +0000
parents f6e91108b7b1
children
line wrap: on
line source

<tool id="stacks2_gstacks" name="Stacks2: gstacks" profile="@PROFILE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <!-- Short description shown next to the tool name -->
    <description>Call variants, genotypes and haplotype</description>
    <!-- Shared macros and tokens used throughout this file are imported from macros.xml -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- The shared requirements macro is extended by samtools, which the command section
         uses to rewrite the BAM files that gstacks produces when alignments are written -->
    <expand macro="requirements">
        <requirement type="package" version="1.13">samtools</requirement>
    </expand>
    <expand macro="version_cmd"/>
    <command detect_errors="aggressive"><![CDATA[
@FASTQ_INPUT_FUNCTIONS@

## bam_inputs holds links to the input BAM files,
## stacks_outputs is the directory gstacks writes to (-O below)
mkdir bam_inputs stacks_outputs &&

## annoyingly gstacks creates stacks_outputs/gstacks.log
## instead of just writing to stderr as the other tools
## hence we do not use the tokens: if the log output dataset exists it is
## linked to the path of gstacks.log so that the log ends up in that dataset
#if $output_log
    ln -s '$output_log' stacks_outputs/gstacks.log &&
#end if

## de novo mode without population map: exactly one BAM file is accepted and
## linked as bam_inputs/catalog.bam; any other number of inputs aborts the job
#if $mode_cond.mode_select == "denovo" and not $popmap:
    ## since collections have no len .. yet
    #try:
        #set count = len($input_bam)
    #except:
        #set count = len($input_bam.keys())
    #end try
    #if count == 1:
        #for $bam in $input_bam:
            ln -s '$bam' bam_inputs/catalog.bam &&
        #end for
    #else
        >&2 echo "exactly one (merged) bam file is needed in denovo mode if no population map is given" &&
        exit 1 &&
    #end if
#else
    ## all other cases are handled by the BAM input token from macros.xml
    ## NOTE(review): presumably it links the inputs into bam_inputs and defines bamlist (used below) - confirm in macros.xml
    @BAM_INPUT@
#end if

gstacks

## mode specific input selection and options
#if $mode_cond.mode_select == "denovo":
    -P bam_inputs
    $mode_cond.ignore_pe_reads
    #if $mode_cond.advanced_cond.advanced_select == "yes":
        --kmer-length $mode_cond.advanced_cond.kmer_length
        --max-debruijn-reads $mode_cond.advanced_cond.max_debruijn_reads
        --min-kmer-cov $mode_cond.advanced_cond.min_kmer_cov
        $mode_cond.advanced_cond.write_alignments
    #end if
#else:
    ## with a population map the BAM files are read from the bam_inputs directory,
    ## otherwise the BAM files are passed via bamlist
    #if $popmap
        -I bam_inputs
    #else
        $bamlist
    #end if
    ## the empty select value means regular single/paired handling with two optional
    ## filters, any other value is itself the command line flag
    #if $mode_cond.paired_cond.paired_select == ''
        $mode_cond.paired_cond.rm_unpaired_reads
        $mode_cond.paired_cond.rm_pcr_duplicates
    #else:
        $mode_cond.paired_cond.paired_select
    #end if
    #if $mode_cond.advanced_cond.advanced_select == "yes":
        --min-mapq $mode_cond.advanced_cond.min_mapq
        --max-clipped $mode_cond.advanced_cond.max_clipped
        --max-insert-len $mode_cond.advanced_cond.max_insert_len
        $mode_cond.advanced_cond.details
        --phasing-cooccurrences-thr-range $mode_cond.advanced_cond.phasing_cooccurrences_thr_min,$mode_cond.advanced_cond.phasing_cooccurrences_thr_max
        $mode_cond.advanced_cond.phasing_dont_prune_hets
    #end if
#end if
#if $popmap
    -M '$popmap'
#end if
-O stacks_outputs
-t \${GALAXY_SLOTS:-1}

##Model options:
--model $model_cond.model
--var-alpha $model_cond.var_alpha
--gt-alpha $model_cond.gt_alpha


## the bam files generated by gstacks (--write-alignments) are seemingly buggy
## (https://groups.google.com/d/msg/stacks-users/CazwJY1DPGA/7vuahiB2GgAJ)
## so we fix them temporarily by piping them through samtools view (disabling all
## exit codes and stderr output) this adds the samtools requirement
## for later versions where this is fixed the output bam files could just be moved
## to stacks_outputs if this is still necessary
## with a population map one *alns.bam file per sample is rewritten,
## without a population map only the single alignments.bam
#if $mode_cond.mode_select == "denovo" and $mode_cond.advanced_cond.advanced_select == "yes" and $mode_cond.advanced_cond.write_alignments
    #if $popmap:
        && for b in stacks_outputs/*alns.bam; do (samtools view --no-PG -b "\$b" || true) 2> /dev/null > tmp && mv tmp "\$b"; done
    #else
        && (samtools view --no-PG -b stacks_outputs/alignments.bam || true) 2> /dev/null > tmp && mv tmp stacks_outputs/alignments.bam
    #end if
#end if

@EXTRACT_VCF@

## TODO extract individual distributions from stacks_outputs/gstacks.log.distribs
## alternative extra tool
## for i in \$(stacks-dist-extract stacks_outputs/gstacks.log.distribs)
## do
##     stacks-dist-extract stacks_outputs/gstacks.log.distribs \$i > stacks_outputs/gstacks.log.\$i.tsv
## done
## TODO make optional output collection
    ]]></command>

    <inputs>
        <!-- BAM input parameter(s), defined in macros.xml -->
        <expand macro="bam_input_macro"/>
        <!-- Optional population map; the command section switches its input handling depending on whether it is set -->
        <param name="popmap" type="data" format="tabular,txt" label="Population map" help="If set, matching will be done only for samples listed in this file" optional="true" argument="-M"/>

        <!-- De novo vs. reference-based mode; each mode carries its own set of advanced options -->
        <conditional name="mode_cond">
            <param name="mode_select" type="select" label="Mode">
                <option value="denovo" selected="true">De novo mode</option>
                <option value="refbased">Reference-based</option>
            </param>
            <when value="denovo">
                <param argument="--ignore-pe-reads" type="boolean" checked="false" truevalue="--ignore-pe-reads" falsevalue="" label="Ignore paired-end reads" help="ignore paired-end reads even if present in the input"/>
                <conditional name="advanced_cond">
                    <param name="advanced_select" type="select" label="Advanced options">
                        <option value="no">No</option>
                        <option value="yes">Yes</option>
                    </param>
                    <when value="yes">
                        <param argument="--kmer-length" type="integer" value="31" min="2" max="31" label="K-mer length for the de Bruijn graph"/>
                        <param argument="--max-debruijn-reads" type="integer" value="1000" min="1" label="Maximum number of reads to use in the de Bruijn graph"/>
                        <param argument="--min-kmer-cov" type="integer" value="2" label="Minimum coverage to consider a kmer"/>
                        <param argument="--write-alignments" type="boolean" checked="false" truevalue="--write-alignments" falsevalue="" label="save read alignments" help="heavy BAM files"/>
                    </when>
                    <when value="no"/>
                </conditional>
            </when>
            <when value="refbased">
                <conditional name="paired_cond">
                    <!-- Single select: exactly one option is preselected. The empty value (regular
                         single/paired handling) is the default; the other values are passed to
                         gstacks verbatim as command line flags. -->
                    <param name="paired_select" type="select" label="Paired end options" help="select single/paired for single end data or to select advanced paired end options, --unpaired: treat reverse reads as if they were forward reads; --ignore-pe-reads: ignore paired-end reads even if present in the input">
                        <option value="" selected="true">single/paired</option>
                        <option value="--unpaired">ignore read pairing (--unpaired)</option>
                        <option value="--ignore-pe-reads">ignore paired-end reads (--ignore-pe-reads)</option>
                    </param>
                    <when value="">
                        <param argument="--rm-unpaired-reads" type="boolean" checked="false" truevalue="--rm-unpaired-reads" falsevalue="" label="Discard unpaired reads"/>
                        <param argument="--rm-pcr-duplicates" type="boolean" checked="false" truevalue="--rm-pcr-duplicates" falsevalue="" label="Remove read pairs of the same sample that have the same insert length" help="implies --rm-unpaired-reads"/>
                    </when>
                    <when value="--unpaired"/>
                    <when value="--ignore-pe-reads"/>
                </conditional>
                <conditional name="advanced_cond">
                    <param name="advanced_select" type="select" label="Advanced options">
                        <option value="no">No</option>
                        <option value="yes">Yes</option>
                    </param>
                    <when value="yes">
                        <param argument="--min-mapq" type="integer" value="10" min="0" max="255" label="Minimum PHRED-scaled mapping quality to consider a read"/>
                        <!-- The soft-clipping level is a fraction of the read length, hence bounded by 1.0 -->
                        <param argument="--max-clipped" type="float" value="0.2" min="0.0" max="1.0" label="Maximum soft-clipping level" help="in fraction of read length"/>
                        <param argument="--max-insert-len" type="integer" value="1000" min="0" label="Maximum allowed sequencing insert length"/>
                        <param argument="--details" type="boolean" checked="false" truevalue="--details" falsevalue="" label="Write a heavier output"/>
                        <!-- Both thresholds are joined with a comma into one range argument in the command section -->
                        <param name="phasing_cooccurrences_thr_min" type="integer" value="1" min="0" label="Edge coverage min" help="(--phasing-cooccurrences-thr-range)"/>
                        <param name="phasing_cooccurrences_thr_max" type="integer" value="2" min="0" label="Edge coverage max" help="range of edge coverage thresholds to iterate over when building the graph of allele cooccurrences for SNP phasing (--phasing-cooccurrences-thr-range)"/>
                        <param argument="--phasing-dont-prune-hets" type="boolean" checked="false" truevalue="--phasing-dont-prune-hets" falsevalue="" label="Don't try to ignore dubious heterozygote genotypes during phasing"/>
                    </when>
                    <when value="no"/>
                </conditional>
            </when>
        </conditional>

        <!-- Variant calling model; the alpha parameters come from a shared macro, only
             marukilow overrides the macro's varalpha_default -->
        <conditional name="model_cond">
            <param argument="--model" type="select" label="Model to use to call variants and genotypes">
                <option value="marukilow" selected="true">marukilow</option>
                <option value="marukihigh">marukihigh</option>
                <option value="snp">snp</option>
            </param>
            <when value="marukilow">
                <expand macro="variant_calling_options_vg" varalpha_default="0.01"/>
            </when>
            <when value="marukihigh">
                <expand macro="variant_calling_options_vg"/>
            </when>
            <when value="snp">
                <expand macro="variant_calling_options_vg"/>
            </when>
        </conditional>
        <param name="add_log_distribs" type="boolean" checked="false" truevalue="yes" falsevalue="no" label="Add log distribs output as dataset"/>
        <expand macro="in_log"/>
    </inputs>
    <outputs>
        <!-- Log dataset, referenced as output_log in the command section and the tests -->
        <expand macro="out_log"/>
        <!-- The tests additionally reference distribs, gstacks_out, gstacks_alns_out and gstacks_aln_out.
             NOTE(review): presumably all of these are declared by this macro, confirm in macros.xml -->
        <expand macro="gstacks_outputs_full_macro"/>
    </outputs>

    <tests>
        <!-- De novo mode with population map: checks the log, the log distribs and both catalog files -->
        <test expect_num_outputs="3">
            <param name="input_bam"
                   value="tsv2bam/PopA_01.matches.bam,tsv2bam/PopA_02.matches.bam"
                   ftype="bam"/>
            <param name="popmap" value="denovo_map/popmap_cstacks.tsv" ftype="tabular"/>
            <conditional name="mode_cond">
                <param name="mode_select" value="denovo"/>
            </conditional>
            <param name="add_log" value="yes"/>
            <param name="add_log_distribs" value="yes"/>
            <output name="output_log" file="gstacks/gstacks.log" ftype="txt" lines_diff="8"/>
            <output name="distribs"
                    file="gstacks/gstacks.log.distribs"
                    ftype="txt"
                    compare="sim_size"
                    delta="10"/>
            <output_collection name="gstacks_out" type="list" count="2">
                <element name="catalog.calls.vcf"
                         file="gstacks/catalog.calls.vcf"
                         ftype="vcf"
                         lines_diff="4"/>
                <element name="catalog.fa.gz"
                         file="gstacks/catalog.fa.gz"
                         ftype="fasta.gz"
                         compare="diff"
                         decompress="true"/>
            </output_collection>
        </test>
        <!-- De novo mode with population map and written alignments: one alignment BAM per sample is expected -->
        <test expect_num_outputs="3">
            <param name="input_bam"
                   value="tsv2bam/PopA_01.matches.bam,tsv2bam/PopA_02.matches.bam"
                   ftype="bam"/>
            <param name="popmap" value="denovo_map/popmap_cstacks.tsv" ftype="tabular"/>
            <conditional name="mode_cond">
                <param name="mode_select" value="denovo"/>
                <conditional name="advanced_cond">
                    <param name="advanced_select" value="yes"/>
                    <param name="write_alignments" value="true"/>
                </conditional>
            </conditional>
            <param name="add_log" value="yes"/>
            <assert_command>
                <has_text text="--write-alignments"/>
            </assert_command>
            <output name="output_log" ftype="txt">
                <assert_contents>
                    <has_text text="done."/>
                </assert_contents>
            </output>
            <output_collection name="gstacks_out" type="list" count="2"/>
            <output_collection name="gstacks_alns_out" type="list" count="2">
                <element name="PopA_01" ftype="bam" file="gstacks/PopA_01.alns.bam"/>
                <element name="PopA_02" ftype="bam" file="gstacks/PopA_02.alns.bam"/>
            </output_collection>
        </test>
        <!-- De novo mode without population map (only a single input BAM is allowed):
             paired-end reads ignored, all advanced options set, alternative model -->
        <test expect_num_outputs="3">
            <param name="input_bam" ftype="bam" value="tsv2bam/PopA_01.matches.bam"/>
            <conditional name="mode_cond">
                <param name="mode_select" value="denovo"/>
                <param name="ignore_pe_reads" value="--ignore-pe-reads"/>
                <conditional name="advanced_cond">
                    <param name="advanced_select" value="yes"/>
                    <param name="kmer_length" value="23"/>
                    <param name="max_debruijn_reads" value="666"/>
                    <param name="min_kmer_cov" value="3"/>
                    <param name="write_alignments" value="--write-alignments"/>
                </conditional>
            </conditional>
            <conditional name="model_cond">
                <param name="model" value="marukihigh"/>
                <param name="var_alpha" value="0.1"/>
                <param name="gt_alpha" value="0.1"/>
            </conditional>
            <param name="add_log" value="yes"/>
            <!-- every non-default parameter has to show up on the command line -->
            <assert_command>
                <has_text text="--ignore-pe-reads"/>
                <has_text text="--kmer-length 23"/>
                <has_text text="--max-debruijn-reads 666"/>
                <has_text text="--min-kmer-cov 3"/>
                <has_text text="--write-alignments"/>
                <has_text text="--model marukihigh"/>
                <has_text text="--var-alpha 0.1"/>
                <has_text text="--gt-alpha 0.1"/>
            </assert_command>
            <output name="output_log" ftype="txt">
                <assert_contents>
                    <has_text text="done."/>
                </assert_contents>
            </output>
            <output_collection name="gstacks_out" type="list" count="2"/>
            <output name="gstacks_aln_out" file="gstacks/alignments.bam" ftype="bam"/>
        </test>
        <!-- refbased wo popmap, paired options, removing all unpaired reads results in an error;
             the input format is declared explicitly like in all other tests using these BAM files -->
        <test expect_failure="true" expect_exit_code="1">
            <param name="input_bam" ftype="bam" value="tsv2bam/PopA_01.bam,tsv2bam/PopA_02.bam"/>
            <conditional name="mode_cond">
                <param name="mode_select" value="refbased"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value=""/>
                    <!--<param name="rm_unpaired_reads" value="\-\-rm-unpaired-reads"/> removes too much of the test data and gstacks fails-->
                    <param name="rm_pcr_duplicates" value="--rm-pcr-duplicates"/>
                </conditional>
            </conditional>
            <param name="add_log" value="yes"/>
            <!-- without a population map the BAM files are passed as a list instead of an input directory -->
            <assert_command>
                <not_has_text text="-I bam_inputs"/>
                <has_text text="-B "/>
                <not_has_text text="--rm-unpaired-reads"/>
                <has_text text="--rm-pcr-duplicates"/>
            </assert_command>
        </test>
        <!-- refbased w popmap (here bam names need to be equal to sample names in popmap), \-\-unpaired, advanced, snp model;
             all parameters set to non-default values are asserted on the command line -->
        <test expect_num_outputs="2">
            <param name="input_bam" ftype="bam" value="tsv2bam/PopA_01.bam,tsv2bam/PopA_02.bam"/>
            <param name="popmap" ftype="tabular" value="denovo_map/popmap_cstacks.tsv"/>
            <conditional name="mode_cond">
                <param name="mode_select" value="refbased"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value="--unpaired"/>
                </conditional>
                <conditional name="advanced_cond">
                    <param name="advanced_select" value="yes"/>
                    <param name="min_mapq" value="23"/>
                    <param name="max_clipped" value="0.23"/>
                    <param name="max_insert_len" value="666"/>
                    <param name="details" value="--details"/>
                    <param name="phasing_cooccurrences_thr_min" value="2"/>
                    <param name="phasing_cooccurrences_thr_max" value="3"/>
                    <param name="phasing_dont_prune_hets" value="--phasing-dont-prune-hets"/>
                </conditional>
            </conditional>
            <param name="model_cond|model" value="snp"/>
            <param name="model_cond|gt_alpha" value="0.1"/>
            <param name="model_cond|var_alpha" value="0.1"/>
            <param name="add_log" value="yes"/>
            <assert_command>
                <has_text text="-I bam_inputs"/>
                <not_has_text text="-B "/>
                <has_text text="--unpaired"/>
                <has_text text="--min-mapq 23"/>
                <has_text text="--max-clipped 0.23"/>
                <has_text text="--max-insert-len 666"/>
                <has_text text="--details"/>
                <has_text text="--phasing-cooccurrences-thr-range 2,3"/>
                <has_text text="--phasing-dont-prune-hets"/>
                <has_text text="--model snp"/>
                <has_text text="--var-alpha 0.1"/>
                <has_text text="--gt-alpha 0.1"/>
            </assert_command>
            <output name="output_log" ftype="txt"><assert_contents><has_text text="done."/></assert_contents></output>
            <output_collection name="gstacks_out" type="list" count="2"/>
        </test>
        <!-- Reference-based mode without population map (BAM file names are irrelevant here), paired-end reads ignored -->
        <test expect_num_outputs="2">
            <param name="input_bam"
                   value="tsv2bam/PopA_01.bam,tsv2bam/PopA_02.bam"
                   ftype="bam"/>
            <conditional name="mode_cond">
                <param name="mode_select" value="refbased"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value="--ignore-pe-reads"/>
                </conditional>
            </conditional>
            <param name="add_log" value="yes"/>
            <!-- the BAM files are passed as a list, not as an input directory -->
            <assert_command>
                <not_has_text text="-I bam_inputs"/>
                <has_text text="-B "/>
                <has_text text="--ignore-pe-reads"/>
            </assert_command>
            <output name="output_log">
                <assert_contents>
                    <has_text text="gstacks is done."/>
                </assert_contents>
            </output>
            <output_collection name="gstacks_out" type="list" count="2"/>
        </test>
    </tests>

    <help>
<![CDATA[
.. class:: infomark

**What it does**

For de novo analyses, this program will pull in paired-end reads, if available,
assemble the paired-end contig and merge it with the single-end locus, align
reads to the locus, and call SNPs.

For reference-aligned analyses, this program will build loci from the single
and/or paired-end reads before calling SNPs. The single- and paired-end reads
must be aligned and stored together in the input BAM or SAM files and the
reads must be sorted. The gstacks program will detect if single- or paired-end
reads are present.

In either mode, gstacks is able to remove PCR duplicates if requested.

--------

**Input files**

If a population map is given, BAM records must be assigned to samples using BAM "read groups"
(gstacks uses the ID/identifier and SM/sample name fields). Read groups
must be consistent if repeated in different files.
Otherwise read groups are unneeded and ignored.

**Output files**

- Assembled contigs and variant sites

- Optional outputs: Read alignments and log.distribs

@STACKS_INFOS@
]]>
    </help>
    <expand macro="citation"/>
</tool>