view strelka_germline.xml @ 1:19481653a22f draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/strelka commit 8e20575721a463931a4b9fbbe905d90cc2863575"
author iuc
date Tue, 02 Mar 2021 17:46:39 +0000
parents 1fbe84e8a740
children
line wrap: on
line source

<?xml version="1.0"?>
<tool id="strelka_germline" name="Strelka Germline" version="@TOOL_VERSION@+@GALAXY_VERSION@">
    <description>@DESCRIPTION@ for germline variation in small cohorts</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
    ## Overview: link the inputs into the job working directory, prepare the
    ## optional bgzipped/tabixed helper files, then configure and run the
    ## Strelka germline workflow and move the main VCF to a fixed path.
    ## The INIT, CREATE and RUN placeholders used below are filled in from
    ## macros.xml, which is not part of this file.

    ## sanity check: refuse to run (message on stderr, exit code 1) when EVS
    ## feature reporting is requested together with more than one input sample
    #if len($bam) > 1 and str($expert_settings.evs.selector) == "enableEVS" and $expert_settings.evs.reportEVSFeatures
        echo "Reporting of EVS features can only be used with a single input sample" 1>&2; exit 1
    #else
        ## initialize
        ## Link every input dataset under an index-numbered name that keeps the
        ## datatype extension, put the matching index (.bai / .crai) right next
        ## to it, and collect one "--bam <file>" pair per sample.
        #set $bam_inputs = []
        #for $i, $s in enumerate($bam):
            #set $target_file = 'input_sample_%d.%s' % ($i, $s.ext)
            ln -s '$s' $target_file &&
            #if $s.is_of_type('bam')
                ln -s '$s.metadata.bam_index' ${target_file}.bai &&
            #elif $s.is_of_type('cram')
                ln -s '$s.metadata.cram_index' ${target_file}.crai &&
            #end if
            ## the silent directive evaluates the expression without writing
            ## its result into the command line
            #silent $bam_inputs.extend(['--bam', $target_file])
        #end for
        #set $bam_spec = ' '.join($bam_inputs)
        ## Strelka requires both the --ploidy vcf and the --noCompress bed
        ## to be bgzipped and tabixed.
        ## Same for the --callRegions bed, but that's handled inside
        ## the shared INIT code.
        #if $pl.ploidy
            #if $pl.ploidy.ext == 'vcf'
                ## plain-text VCF -> compress and index it here
                ln -s '$pl.ploidy' input_ploidy.vcf &&
                bgzip -c input_ploidy.vcf > input_ploidy.vcf.gz &&
                tabix -p vcf input_ploidy.vcf.gz &&
            #else
                ## File is bgzipped and tabixed already
                ## -> just symlink data and index
                ln -s '$pl.ploidy' input_ploidy.vcf.gz &&
                ln -s '$pl.ploidy.metadata.tabix_index' input_ploidy.vcf.gz.tbi &&
            #end if
        #end if
        ## the noCompress parameter only exists in the "yes" branch of the gvcf
        ## conditional, hence the combined test (repeated further down)
        #if $oo.gvcf.emit_gvcfs == 'yes' and $oo.gvcf.noCompress
            ln -s '$oo.gvcf.noCompress' input_nocompress.bed &&
            bgzip -c input_nocompress.bed > input_nocompress.bed.gz &&
            tabix -p bed input_nocompress.bed.gz &&
        #end if
        @INIT@

        ## create workflow
        configureStrelkaGermlineWorkflow.py
            $bam_spec
            ## an empty text field stringifies to '', which is false, so the
            ## option is only emitted when a chromosome name was entered
            #if str($pl.callContinuousVf)
                --callContinuousVf '$pl.callContinuousVf'
            #end if
            #if $pl.ploidy
                --ploidy input_ploidy.vcf.gz
            #end if
            #if $oo.gvcf.emit_gvcfs == 'yes' and $oo.gvcf.noCompress
                --noCompress input_nocompress.bed.gz
            #end if
            ## the select value is the literal command line flag, or an empty
            ## string for the default behaviour
            $expert_settings.s_e_e
            @CREATE@

        ## run workflow
        @RUN@

        ## decompress results
        ## Either way the main variants file ends up as "variants_out", the
        ## path the out_variants output picks up through from_work_dir.
        #if $oo.vcf_type == "decompressed"
            ## we decompress just the main variants file
            ## per-sample gvcf files are always emitted as a collection of
            ## compressed files.
            && bgzip -d results/results/variants/variants.vcf.gz
            && mv results/results/variants/variants.vcf results/results/variants/variants_out
        #else
            && mv results/results/variants/variants.vcf.gz results/results/variants/variants_out
        #end if
    #end if
    ]]></command>
    
    <configfiles>
        <!-- Strelka settings file rendered for each job; how it is handed to
             the workflow is not visible in this file (presumably via a shared
             macro token, to be confirmed against macros.xml). -->
        <configfile name="config_file">
## Keep every line of this file flush left: the parser reading the rendered
## file cannot handle indents.
[StrelkaGermline]
minMapq = $strelka.minMapq
@CONFIG@
        </configfile>
    </configfiles>

    <inputs>
        <!-- Alignment data: one or more BAM/CRAM datasets -->
        <param
            argument="--bam"
            type="data"
            format="bam,cram"
            multiple="true"
            label="Select sample file(s)"
            help=""/>
        <expand macro="input_required" ref="bam"/>

        <!-- Calling model; the germline caller adds an RNA-seq option to the shared macro -->
        <expand macro="calling_model">
            <option value="--rna">RNA sequencing data (--rna)</option>
        </expand>

        <!-- Expert settings; the option values of s_e_e are inserted verbatim into the command line -->
        <expand macro="calling_model_expert">
            <param
                name="s_e_e"
                type="select"
                label="Configure sequence error estimation for indels"
                help="By default, indel error rates are estimated from a subsample of the input sequencing data for each sample. This step can be disabled so that the tool reverts to precomputed indel error rates reflecting an intermediate point between different sequencing assays. Alternatively, all data (from sufficiently large chromosomes) can be used to obtain the estimate, but this may greatly increase runtime.">
                <option value="--disableSequenceErrorEstimation">Use default sequence error estimate (--disableSequenceErrorEstimation)</option>
                <option value="" selected="true">Estimate sequence error rate from a subsample of the input data (default)</option>
                <option value="--useAllDataForSequenceErrorEstimation">Estimate sequence error rate from the full input data (--useAllDataForSequenceErrorEstimation)</option>
            </param>
        </expand>

        <expand macro="regions_select"/>

        <!-- Ploidy handling: optional ploidy VCF and optional chromosome called without ploidy assumption -->
        <section name="pl" title="Ploidy configuration" expanded="false">
            <param
                argument="--ploidy"
                type="data"
                format="vcf,vcf_bgzip"
                optional="true"
                label="Select ploidy file"
                help="Provide ploidy file in VCF. The VCF should include one sample column per input sample labeled with the same sample names found in the input BAM/CRAM RG header sections. Ploidy should be provided in records using the FORMAT/CN field, which are interpreted to span the range [POS+1, INFO/END]. Any CN value besides 1 (haploid regions) or 0 (regions expected to be absent) will be treated as 2."/>
            <param
                argument="--callContinuousVf"
                type="text"
                value=""
                label="Call variants without ploidy assumption on this chromosome"
                help="For the specified chromosome the sequencing data will be treated as a pooled sample: variants on it will be called with continuous frequencies and scored using a simple Poisson noise model. May be applied to the mitochondrial genome, for example."/>
        </section>

        <!-- Output options: shared output settings plus the per-sample gVCF switch -->
        <section name="oo" title="Output options" expanded="false">
            <expand macro="input_output"/>
            <conditional name="gvcf">
                <param name="emit_gvcfs" type="select" label="Generate per-sample gVCFs?">
                    <option value="no">No</option>
                    <option value="yes">Yes</option>
                </param>
                <when value="no"/>
                <when value="yes">
                    <param
                        argument="--noCompress"
                        type="data"
                        format="bed"
                        optional="true"
                        label="In the gVCFs, do NOT block-compress sites that fall in these regions"
                        help="If you have regions of special interest, for which you do not want site-information to be binned into blocks in the gVCF output, but would want to have calling statistics reported separately, you can specify those regions through a BED dataset here."/>
                </when>
            </conditional>
        </section>

        <!-- Values written to the Strelka configuration file -->
        <section name="strelka" title="Strelka run configuration" expanded="false">
            <param
                argument="minMapq"
                name="minMapq"
                type="integer"
                value="20"
                label="Set minMapq"
                help="Don't use reads with MAPQ less than this value for variant calling."/>
            <expand macro="input_strelka"/>
        </section>
    </inputs>
    <outputs>
        <!-- Main variants file; its datatype switches to vcf_bgzip when compressed output is selected -->
        <data name="out_variants" label="${tool.name} on ${on_string}, Variants, vcf" format="vcf" from_work_dir="results/results/variants/variants_out">
            <change_format>
                <when input="oo.vcf_type" value="compressed" format="vcf_bgzip"/>
            </change_format>
        </data>
        <!-- Per-sample genome VCFs; the collection is only produced when gVCF generation is switched on -->
        <collection name="out_genome" type="list" label="${tool.name} on ${on_string}: Genome, vcf">
            <filter>oo['gvcf']['emit_gvcfs'] == 'yes'</filter>
            <discover_datasets directory="results/results/variants/" pattern="genome\.(?P&lt;designation&gt;.+)\.vcf\.gz&#36;" format="vcf_bgzip"/>
        </collection>
    </outputs>
    <tests>
        <!-- #1; three BAM inputs, decompressed main VCF, per-sample gVCFs requested
             (two outputs: the variants file plus the genome collection) -->
        <test expect_num_outputs="2">
            <param name="bam" value="sample1.bam,sample2.bam,sample3.bam"/>
            <conditional name="ref_cond">
                <param name="ref_sel" value="history"/>
                <param name="ref" value="hg98.fa" ftype="fasta"/>
            </conditional>
            <section name="expert_settings">
                <param name="s_e_e" value="--disableSequenceErrorEstimation" />
            </section>
            <section name="oo">
                <param name="vcf_type" value="decompressed"/>
                <conditional name="gvcf">
                    <param name="emit_gvcfs" value="yes" />
                </conditional>
            </section>
            <output name="out_variants" ftype="vcf">
                <assert_contents>
                    <has_n_lines n="62"/>
                    <has_line_matching expression="#CHROM&#009;POS&#009;.+"/>
                    <has_line_matching expression="demo20&#009;3664&#009;.+"/>
                </assert_contents>
            </output>
            <!-- one collection element per input sample; only the first one is compared against a reference file -->
            <output_collection name="out_genome" type="list" count="3">
                <element name="S1" ftype="vcf_bgzip" file="genome_test1.vcf" decompress="true" compare="diff" lines_diff="8" />
            </output_collection>
        </test>
        <!-- #2; two CRAM inputs, compressed main VCF, gVCF generation left at its
             default ("no"), hence a single expected output -->
        <test expect_num_outputs="1">
            <param name="bam" value="sample1.cram,sample2.cram"/>
            <conditional name="ref_cond">
                <param name="ref_sel" value="history"/>
                <param name="ref" value="hg98.fa" ftype="fasta"/>
            </conditional>
            <section name="expert_settings">
                <param name="s_e_e" value="--disableSequenceErrorEstimation" />
            </section>
            <section name="oo">
                <param name="vcf_type" value="compressed"/>
            </section>
            <output name="out_variants" ftype="vcf_bgzip" file="variants_test2.vcf" decompress="true" compare="diff" lines_diff="8" />
        </test>
        <!-- #3; input cram, non-default settings (note: s_e_e is left at its default here and
             without it the (genome) output seems to be non-deterministic, which is presumably
             why only loose assertions are used) -->
        <test expect_num_outputs="2">
            <param name="bam" value="sample1.cram,sample2.cram"/>
            <conditional name="ref_cond">
                <param name="ref_sel" value="history"/>
                <param name="ref" value="hg98.fa" ftype="fasta"/>
            </conditional>
            <param name="optimization" value="--rna" />
            <section name="oo">
                <param name="vcf_type" value="decompressed"/>
                <conditional name="gvcf">
                    <param name="emit_gvcfs" value="yes" />
                </conditional>
            </section>
            <section name="pl">
                <param name="callContinuousVf" value="Chr1"/>
            </section>
            <section name="strelka">
                <param name="minMapq" value="21"/>
                <param name="maxIndelSize" value="51"/>
            </section>
            <output name="out_variants" ftype="vcf">
                <assert_contents>
                    <has_n_lines n="81"/>
                    <has_line_matching expression="#CHROM&#009;POS&#009;.+"/>
                    <has_line_matching expression="demo20&#009;3664&#009;.+"/>
                </assert_contents>
            </output>
            <output_collection name="out_genome" type="list" count="2">
                <!-- assertions are applied to compressed output (even with decompress true)
                https://github.com/galaxyproject/galaxy/issues/11521
                <element name="S1" ftype="vcf_bgzip" decompress="true">
                    <assert_contents>
                        <has_n_lines n="139"/>
                        <has_line_matching expression="demo20&#009;.+"/>
                    </assert_contents>
                </element> -->
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

@HELP_STRELKA@

The germline caller employs an efficient tiered haplotype model to improve accuracy and provide read-backed phasing, adaptively selecting between assembly and a faster alignment-based haplotyping approach at each variant locus. The germline caller also analyzes input sequencing data using a mixture-model indel error estimation method to improve robustness to indel noise.

**Input**

@HELP_INPUT@

**Output**

*Variants*

This describes all potential variant loci across all samples. Note this file includes non-variant loci if they have a non-trivial level of variant evidence or contain one or more alleles for which genotyping has been forced. Please see the multi-sample variants VCF section of the Strelka user guide for additional details on interpreting this file.

*Genome*

This is the genome VCF output for sample N, which includes both variant records and compressed non-variant blocks. The sample index, N, is 1-indexed and corresponds to the input order of alignment files on the configuration command-line.

.. class:: infomark

**References**

@HELP_REFERENCES@
    ]]></help>
    <expand macro="citations"/>
</tool>