view deepvariant.xml @ 5:fd52f65372c9 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/deepvariant commit bf3bb5d5a1f442208f054523fbcf0be4cc366a35
author iuc
date Mon, 02 Feb 2026 12:34:14 +0000
parents 63b68fe4af85
children
line wrap: on
line source

<tool id='deepvariant' name='DeepVariant' version='@TOOL_VERSION@+galaxy@SUFFIX_VERSION@' profile='@PROFILE@'>
    <!-- Short tool description. -->
    <description>deep learning-based variant caller</description>
    <!-- Shared definitions are imported from macros.xml. The macros expanded in this file
         (edam_ontology, requirements, citations) and the @...@ tokens used on the tool
         element are presumably defined there; verify against macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro='edam_ontology' />
    <expand macro='requirements' />
    <!--
        Cheetah command template.
        1. Stages the alignment, its index and any optional BED inputs under fixed
           names in the job working directory.
        2. Resolves the reference: a history FASTA is linked and indexed locally,
           a built-in reference is used from its data table path.
        3. Runs run_deepvariant with the selected options.
        4. Decompresses the VCF (and gVCF) so they match the from_work_dir paths
           declared in the outputs section.
        Parameters declared inside the advanced_options and expert_options sections
        are referenced with their section prefix, which is how Galaxy exposes nested
        parameters to the template.
    -->
    <command detect_errors='exit_code'><![CDATA[
        ## Stage the alignment and its index side by side under fixed names.
        ln -s '${reads}' reads_alignment.bam
        && ln -s '${reads.metadata.bam_index}' reads_alignment.bam.bai
        #if $regions_conditional.regions_option == 'bed'
            && ln -s '${regions_conditional.bed_file}' region.bed
        #end if
        #if $advanced_options.par_regions_bed
            && ln -s '${advanced_options.par_regions_bed}' par_regions.bed
        #end if
        ## The logging directory is only created (and passed on) when a runtime report is requested.
        #set $logging_dir_value = None
        #if $advanced_options.create_runtime_report
            #set $logging_dir_value = 'logging'
            && mkdir -p '$logging_dir_value'
        #end if
        #if $reference_genome.source == 'history':
            ## History FASTA: link it into the job directory and build the .fai index there.
            #set $ref_genome = 'reference.fasta'
            && ln -s -f '${reference_genome.history_item}' '$ref_genome'
            && samtools faidx '$ref_genome'
        #else:
            ## Built-in reference: use the path column of the selected data table entry.
            #set $ref_genome = $reference_genome.index.fields.path
        #end if
        && run_deepvariant
        --model_type=$model_type
        --ref='$ref_genome'
        --reads='reads_alignment.bam'
        #if $sample_name
            --sample_name '$sample_name'
        #end if
        --output_vcf='./output.vcf.gz'
        #if $output_gvcf
            --output_gvcf='./output.g.vcf.gz'
        #end if
        #if $regions_conditional.regions_option == 'region'
            --regions '$regions_conditional.region_literal'
        #else if $regions_conditional.regions_option == 'bed'
            --regions 'region.bed'
        #end if
        ## Options from the "Advanced options" section.
        --disable_small_model=$advanced_options.disable_small_model
        #if $advanced_options.haploid_contigs
            --haploid_contigs='$advanced_options.haploid_contigs'
        #end if
        #if $advanced_options.par_regions_bed
            --par_regions_bed='par_regions.bed'
        #end if
        #if $logging_dir_value
            --logging_dir='$logging_dir_value'
        #end if
        #if $advanced_options.report_title
            --report_title='$advanced_options.report_title'
        #end if
        --runtime_report=$advanced_options.create_runtime_report
        ## Options from the "Expert options" section, forwarded verbatim to each pipeline step.
        #if $expert_options.make_examples_extra_args
            --make_examples_extra_args='$expert_options.make_examples_extra_args'
        #end if
        #if $expert_options.call_variants_extra_args
            --call_variants_extra_args='$expert_options.call_variants_extra_args'
        #end if
        #if $expert_options.postprocess_variants_extra_args
            --postprocess_variants_extra_args='$expert_options.postprocess_variants_extra_args'
        #end if
        ## The boolean renders as the complete flag (see its truevalue/falsevalue).
        $advanced_options.vcf_stats_report
        --num_shards=\${GALAXY_SLOTS:-2}
        ## Outputs are declared as uncompressed VCF, so decompress what DeepVariant wrote.
        && gunzip './output.vcf.gz'
        #if $output_gvcf
            && gunzip './output.g.vcf.gz'
        #end if
    ]]>    </command>
    <inputs>
        <!-- Reference genome: a built-in entry from the fasta_indexes data table or a FASTA dataset from the history. -->
        <conditional name="reference_genome">
            <param name="source" type="select" label="Source for the reference genome" help="Built-in references were created using default options.">
                <option value="indexed" selected="true">Use a built-in genome</option>
                <option value="history">Use a genome from history</option>
            </param>
            <when value="indexed">
                <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team.">
                    <options from_data_table="fasta_indexes">
                        <filter type="sort_by" column="2" />
                        <validator type="no_options" message="No genomes are available for the selected input dataset" />
                    </options>
                </param>
            </when>
            <when value="history">
                <param name="history_item" type="data" format="fasta" label="Reference genome" help="A reference genome in FASTA format" />
            </when>
        </conditional>
        <param argument="--reads" type="data" format="bam" label="BAM file" help="An aligned reads file in BAM format. The reads must be aligned to the reference genome" />
        <param argument="--sample_name" type="text" optional="true" label="Sample name" help="Sample name to use instead of the SM tag in the BAM header. Example: NA12878" />
        <param argument="--model_type" type="select" label="Model type" help="Type of model to use for variant calling">
            <option value="WGS">WGS: Illumina whole genome sequencing</option>
            <option value="WES">WES: Illumina whole exome sequencing</option>
            <option value="PACBIO">PacBio HiFi</option>
            <option value="HYBRID_PACBIO_ILLUMINA">Hybrid PacBio HiFi-Illumina</option>
            <option value="ONT_R104">ONT R10.4 simplex and duplex data</option>
        </param>
        <!-- Optional restriction of the analysis to region literals or to a BED file. -->
        <conditional name="regions_conditional">
            <param name="regions_option" type="select" label="Select specific regions to process" help="Restrict the analysis to specific regions. A space-separated list of chromosome regions to process. Individual elements can be region literals, such as chr20:10-20 or paths to BED files.">
                <option value="disabled" selected="True">Disabled</option>
                <option value="region">Specify region literals</option>
                <option value="bed">Provide a BED file</option>
            </param>
            <when value="disabled"/>
            <when value="region">
                <param name="region_literal" argument="--regions" type="text" label="Regions" help="This option refers to contigs present in the reference genome. These arguments accept space-separated lists, so all of the following examples are valid arguments: 'chr20:10,000,000-11,000,000', 'chr20 chr21' and 'chr20'">
                    <!-- Spaces separate list entries, and '_' and '.' occur in common contig names, so they must
                         survive sanitizing. The value is single-quoted in the command template. -->
                    <sanitizer invalid_char="">
                        <valid initial="string.letters,string.digits">
                            <add value="," />
                            <add value=":" />
                            <add value="-" />
                            <add value=" " />
                            <add value="_" />
                            <add value="." />
                        </valid>
                    </sanitizer>
                    <!-- Anchored on both ends so unsupported characters are rejected instead of being silently stripped. -->
                    <validator type="regex" message="Only letters, digits, spaces and the characters , : . _ - are allowed">^[0-9a-zA-Z,:._ -]+$</validator>
                </param>
            </when>
            <when value="bed">
                <param name="bed_file" argument="--regions" type="data" format="bed" label="BED file" help="The BED file should store the genomic regions of interest" />
            </when>
        </conditional>
        <param argument="--output_gvcf" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Generate genomic VCF (gVCF) output" help="The key difference between a regular VCF and a gVCF is that the gVCF has records for all sites, whether there is a variant call there or not. The goal is to have every site represented in the file in order to do joint analysis of a cohort in subsequent steps" />
        <!-- Parameters in the two sections below are nested: templates and filters must address them through the section name. -->
        <section name="advanced_options" title="Advanced options" expanded="false">
            <param argument="--disable_small_model" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Disable small model" help="Disable the use of the small model to call variants during make_examples." />
            <param argument="--haploid_contigs" type="text" optional="true" label="Haploid contigs" help="Comma-separated list of non-autosomal chromosomes. Example: chrX,chrY" />
            <param argument="--par_regions_bed" type="data" format="bed" optional="true" label="PAR regions BED file" help="BED file of pseudoautosomal regions." />
            <param argument="--report_title" type="text" optional="true" label="Report title" help="Title for the VCF stats report (HTML). Example: NA12878 DeepVariant report" />
            <param argument="--vcf_stats_report" type="boolean" truevalue="--vcf_stats_report=True" falsevalue="--vcf_stats_report=False" checked="False" label="Create VCF stats report (HTML)" help="Output a visual report (HTML) of statistics about the output VCF." />
            <param name="create_runtime_report" argument="--runtime_report" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Create runtime report" help="Output make_examples runtime metrics and create a visual runtime report." />
        </section>
        <section name="expert_options" title="Expert options (may increase runtime and disk usage)" expanded="false">
            <param argument="--make_examples_extra_args" type="text" optional="true" label="make_examples extra args" help="Comma-separated flag_name=flag_value for make_examples.py. Example: min_base_quality=10,min_mapping_quality=5" />
            <param argument="--call_variants_extra_args" type="text" optional="true" label="call_variants extra args" help="Comma-separated flag_name=flag_value for call_variants.py. Example: allow_empty_examples=true,batch_size=1024" />
            <param argument="--postprocess_variants_extra_args" type="text" optional="true" label="postprocess_variants extra args" help="Comma-separated flag_name=flag_value for postprocess_variants.py. Example: cnn_homref_call_min_gq=20.0" />
        </section>
    </inputs>
    <!-- Output datasets, each collected from a fixed path in the job working directory.
         Only vcf_file is unconditional; the others are kept when their filter expression is true. -->
    <outputs>
        <data name="vcf_file" format="vcf" from_work_dir="output.vcf" label="${tool.name} on ${on_string}: VCF file"/>
        <!-- Kept only when the VCF stats report option in the advanced_options section is enabled. -->
        <data name="html_report" format="html" from_work_dir="output.visual_report.html" label="${tool.name} on ${on_string}: HTML report">
            <filter>advanced_options['vcf_stats_report']</filter>
        </data>
        <!-- Kept only when the runtime report option is enabled; read from the logging directory. -->
        <data name="runtime_report" format="html" from_work_dir="logging/make_examples_runtime_by_region_report.html" label="${tool.name} on ${on_string}: runtime report">
            <filter>advanced_options['create_runtime_report']</filter>
        </data>
        <!-- Kept only when gVCF output is requested. -->
        <data name="gvcf_file" format="vcf" from_work_dir="output.g.vcf" label="${tool.name} on ${on_string}: gVCF file">
            <filter>output_gvcf</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test a history reference without region restriction or gVCF; the HTML stats report must carry the custom report title -->
        <test expect_num_outputs="2">
            <conditional name="reference_genome">
                <param name="source" value="history"/>
                <param name="history_item" value="reference.fasta"/>
            </conditional>
            <param name="reads" value="reads.bam"/>
            <param name="model_type" value="WGS"/>
            <param name="output_gvcf" value="False"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="disabled"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
                <param name="report_title" value="Test Report Title"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="##fileformat=VCFv4.2"/>
                    <has_text text="#CHROM"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Test Report Title"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test region literal option: passes K03455:1-2669 as the region and checks the VCF header and the HTML stats report -->
        <test expect_num_outputs="2">
            <conditional name="reference_genome">
                <param name="source" value="history"/>
                <param name="history_item" value="reference.fasta"/>
            </conditional>
            <param name="reads" value="reads.bam"/>
            <param name="model_type" value="WGS"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="region"/>
                <param name="region_literal" value="K03455:1-2669"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Variant types"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test region bed option: supplies region.bed as the regions input and checks the VCF header and the HTML stats report -->
        <test expect_num_outputs="2">
            <conditional name="reference_genome">
                <param name="source" value="history"/>
                <param name="history_item" value="reference.fasta"/>
            </conditional>
            <param name="reads" value="reads.bam"/>
            <param name="model_type" value="WGS"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="bed"/>
                <param name="bed_file" value="region.bed" ftype="bed"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Variant types"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test gvcf output option: with output_gvcf enabled three outputs are expected (VCF, gVCF and HTML stats report) -->
        <test expect_num_outputs="3">
            <conditional name="reference_genome">
                <param name="source" value="history"/>
                <param name="history_item" value="reference.fasta"/>
            </conditional>
            <param name="reads" value="reads.bam"/>
            <param name="model_type" value="WGS"/>
            <param name="output_gvcf" value="True"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="region"/>
                <param name="region_literal" value="K03455:1-2669"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="gvcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Variant types"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test CRAM format input: uses reads.cram as the alignment file.
             NOTE(review): the reads parameter declares format="bam" and the command links the dataset's bam_index metadata;
             confirm how a CRAM dataset reaches the tool here (presumably through a datatype conversion). -->
        <test expect_num_outputs="2">
            <conditional name="reference_genome">
                <param name="source" value="history"/>
                <param name="history_item" value="reference.fasta"/>
            </conditional>
            <param name="reads" value="reads.cram"/>
            <param name="model_type" value="WGS"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="disabled"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Variant types"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test indexed reference format input: selects the built-in entry phix174
             (presumably provided by a test copy of the fasta_indexes data table; verify against the test data configuration) -->
        <test expect_num_outputs="2">
            <conditional name="reference_genome">
                <param name="source" value="indexed"/>
                <param name="index" value="phix174"/>
            </conditional>
            <param name="reads" value="reads.bam"/>
            <param name="model_type" value="WGS"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="disabled"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Variant types"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test runtime report output: with create_runtime_report enabled three outputs are expected
             and the runtime report must contain the text "Overall runtime" -->
        <test expect_num_outputs="3">
            <conditional name="reference_genome">
                <param name="source" value="history"/>
                <param name="history_item" value="reference.fasta"/>
            </conditional>
            <param name="reads" value="reads.bam"/>
            <param name="model_type" value="WGS"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="disabled"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
                <param name="create_runtime_report" value="True"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Variant types"/>
                </assert_contents>
            </output>
            <output name="runtime_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_text text="Overall runtime"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test haploid contigs and PAR regions BED: sets haploid_contigs to K03455 and supplies phix174.par_regions.bed;
             only the VCF header and the HTML stats report are checked -->
        <test expect_num_outputs="2">
            <conditional name="reference_genome">
                <param name="source" value="history"/>
                <param name="history_item" value="reference.fasta"/>
            </conditional>
            <param name="reads" value="reads.bam"/>
            <param name="model_type" value="WGS"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="disabled"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
                <param name="haploid_contigs" value="K03455"/>
                <param name="par_regions_bed" value="phix174.par_regions.bed" ftype="bed"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Variant types"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test expert options extra-args: sets one extra argument for each of the three pipeline steps;
             only the VCF header and the HTML stats report are checked -->
        <test expect_num_outputs="2">
            <conditional name="reference_genome">
                <param name="source" value="history"/>
                <param name="history_item" value="reference.fasta"/>
            </conditional>
            <param name="reads" value="reads.bam"/>
            <param name="model_type" value="WGS"/>
            <conditional name="regions_conditional">
                <param name="regions_option" value="disabled"/>
            </conditional>
            <section name="advanced_options">
                <param name="vcf_stats_report" value="True"/>
            </section>
            <section name="expert_options">
                <param name="make_examples_extra_args" value="min_base_quality=10"/>
                <param name="call_variants_extra_args" value="allow_empty_examples=true"/>
                <param name="postprocess_variants_extra_args" value="cnn_homref_call_min_gq=20.0"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text="#CHROM"/>
                    <has_text text="##fileformat=VCFv4.2"/>
                </assert_contents>
            </output>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <is_valid_xml />
                    <has_n_lines n="34" delta="10" />
                    <has_text text="Variant types"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

**Purpose**

DeepVariant is a deep learning-based variant caller that takes aligned reads (in BAM or CRAM format), produces pileup image tensors from them, classifies each tensor using a convolutional neural network, and finally reports the results in a standard VCF or gVCF file.

DeepVariant runs a pipeline of three steps: **make_examples**, **call_variants**, and **postprocess_variants**. The **Expert options** section exposes extra-args fields for each step. Use these only with parameters that are accepted by the corresponding DeepVariant binary (see `run_deepvariant --helpfull` and the subcommand help, e.g. `/opt/deepvariant/bin/make_examples --helpfull` in the container).

DeepVariant supports germline variant-calling in diploid organisms. Case studies are available for the following data types:

- NGS (Illumina) data for either a `whole genome <https://github.com/google/deepvariant/blob/r1.9/docs/deepvariant-case-study.md>`_ or `whole exome <https://github.com/google/deepvariant/blob/r1.9/docs/deepvariant-exome-case-study.md>`_.
- PacBio HiFi data, see the `PacBio case study <https://github.com/google/deepvariant/blob/r1.9/docs/deepvariant-pacbio-model-case-study.md>`_.
- Hybrid PacBio HiFi + Illumina WGS, see the `hybrid case study <https://github.com/google/deepvariant/blob/r1.9/docs/deepvariant-hybrid-case-study.md>`_.

Please also note:

For somatic data or any other samples where the genotypes go beyond two copies of DNA, DeepVariant will not work out of the box because the only genotypes supported are hom-alt, het, and hom-ref.

The models included with DeepVariant are only trained on human data. For other organisms, see the blog post on `non-human variant-calling <https://google.github.io/deepvariant/posts/2018-12-05-improved-non-human-variant-calling-using-species-specific-deepvariant-models/>`_ for some possible pitfalls and how to handle them.

----

.. class:: infomark

**How DeepVariant works**

DeepVariant relies on `Nucleus <https://github.com/google/nucleus>`_, a library of Python and C++ code for reading and writing data in common genomics file formats (like SAM and VCF) designed for painless integration with the `TensorFlow <https://www.tensorflow.org/>`_ machine learning framework. Nucleus was built with DeepVariant in mind and open-sourced separately so it can be used by anyone in the genomics research community for other projects. See this blog post on `Using Nucleus and TensorFlow for DNA Sequencing Error Correction <https://google.github.io/deepvariant/posts/2019-01-31-using-nucleus-and-tensorflow-for-dna-sequencing-error-correction/>`_.


]]>    </help>
    <!-- Citation entries come from the citations macro (presumably defined in macros.xml, imported at the top of this tool). -->
    <expand macro="citations"/>
</tool>