view varscan_somatic.xml @ 4:00a6be52a45a draft

planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 3932667bb4f6770a671141f7aab1f326e6dd1919
author iuc
date Tue, 04 Dec 2018 05:53:45 -0500
parents 2fe9ebb98aad
children d37adcc2ec03
line wrap: on
line source

<tool id="varscan_somatic" name="VarScan somatic" version="@VERSION@.1">
    <description>Call germline/somatic and LOH variants from tumor-normal sample pairs</description>
    <macros>
        <import>macros.xml</import>
        <!-- Output assertion: the VCF header must declare the chrM contig with
             length 16571. The "." wildcards stand in for the angle brackets. -->
        <macro name="test_mentions_contig">
            <assert_contents>
                <has_line_matching expression="##contig=.ID=chrM,length=16571." />
            </assert_contents>
        </macro>
        <!-- Output assertion: the header must describe the VarCount, ReadLenDiff
             and RefDist3 filters. -->
        <macro name="test_mentions_filters">
            <assert_contents>
                <has_line_matching expression="##FILTER=.ID=VarCount,Description=.+" />
                <has_line_matching expression="##FILTER=.ID=ReadLenDiff,Description=.+" />
                <has_line_matching expression="##FILTER=.ID=RefDist3,Description=.+" />
            </assert_contents>
        </macro>
        <!-- Output assertion: none of those three filter declarations may appear. -->
        <macro name="test_not_mentions_filters">
            <assert_contents>
                <not_has_text text="##FILTER=&lt;ID=VarCount,Description=" />
                <not_has_text text="##FILTER=&lt;ID=ReadLenDiff,Description=" />
                <not_has_text text="##FILTER=&lt;ID=RefDist3,Description=" />
            </assert_contents>
        </macro>
    </macros>
    <!-- Extra packages passed into the "requirements" macro, which is not
         defined in this file (presumably it comes from macros.xml). -->
    <expand macro="requirements">
        <requirement version="3.6.7" type="package">python</requirement>
        <requirement version="0.15.1" type="package">pysam</requirement>
    </expand>
    <!-- Any exit status of 1 or above marks the job as failed. -->
    <stdio>
        <exit_code range="1:" level="fatal" />
    </stdio>
    <command><![CDATA[
        ## Reference genome: a history dataset is symlinked to a fixed local
        ## name, a built-in genome is used via its data table path.
        #if str($reference.source) == "history":
            #set ref_genome = 'ref.fa'
            ln -s -f '$reference.genome' '$ref_genome' &&
        #else:
            #set ref_genome = '$reference.genome.fields.path'
        #end if
        ## Link both BAM inputs and their indexes under predictable names so
        ## that each index sits next to its BAM file.
        #set normal_data = 'normal.bam'
        #set tumor_data = 'tumor.bam'
        ln -s -f '$normal_bam' '$normal_data' &&
        ln -s -f '$tumor_bam' '$tumor_data' &&
        ln -s -f '${normal_bam.metadata.bam_index}' '${normal_data}.bai' &&
        ln -s -f '${tumor_bam.metadata.bam_index}' '${tumor_data}.bai' &&
        ## All paths are quoted so that directories containing spaces or shell
        ## metacharacters cannot break the command line.
        python3 '$__tool_directory__/varscan.py'
            --normal '$normal_data'
            --tumor '$tumor_data'
            --normal-purity ${normal_purity}
            --tumor-purity ${tumor_purity}
            ## With split output the script writes to a fixed prefix that the
            ## outputs section collects from the working directory.
            #if str($split_output):
                --ofile variants_out
                $split_output
            #else:
                --ofile '$output'
            #end if
            --threads \${GALAXY_SLOTS:-2}
            #if str($call_params.settings) == "custom":
                ## samtools mpileup parameters
                --min-basequal ${call_params.min_avg_qual}
                --min-mapqual ${call_params.min_mapqual}
                ## VarScan parameters
                --min-coverage ${call_params.min_coverage}
                --min-var-count ${call_params.min_reads2}
                --min-var-freq ${call_params.min_var_freq}
                --min-hom-freq ${call_params.min_freq_for_hom}
                --p-value ${call_params.p_value}
                --somatic-p-value ${call_params.somatic_p_value}
            #end if
            ## Posterior filtering: nothing is passed for "varscan_defaults",
            ## so the script's own defaults apply in that case.
            #if str($filter_params.settings) == "no_filter":
                --no-filters
            #elif str($filter_params.settings) == "dream3_settings":
                --min-var-count2 3
                --min-var-count2-lc 1
                --min-var-freq2 0.05
                --max-somatic-p 0.05
                --max-somatic-p-depth 10
                --min-ref-readpos 0.2
                --min-var-readpos 0.15
                --min-ref-dist3 0.2
                --min-var-dist3 0.15
                --min-ref-len 90
                --min-var-len 90
                --max-len-diff 0.05
                --min-strandedness 0
                --min-strand-reads 5
                --min-ref-basequal 15
                --min-var-basequal 30
                --max-basequal-diff 50
                --min-ref-mapqual 20
                --min-var-mapqual 30
                --max-mapqual-diff 10
                --max-ref-mmqs 50
                --max-var-mmqs 100
                --min-mmqs-diff 0
                --max-mmqs-diff 50
            #elif str($filter_params.settings) == "custom":
                --min-var-count2 ${filter_params.min_var_count}
                --min-var-count2-lc ${filter_params.min_var_count_lc}
                --min-var-freq2 ${filter_params.min_var_freq2}
                --max-somatic-p ${filter_params.max_somatic_p}
                --max-somatic-p-depth ${filter_params.max_somatic_p_depth}
                --min-ref-readpos ${filter_params.min_ref_readpos}
                --min-var-readpos ${filter_params.min_var_readpos}
                --min-ref-dist3 ${filter_params.min_ref_dist3}
                --min-var-dist3 ${filter_params.min_var_dist3}
                --min-ref-len ${filter_params.min_ref_len}
                --min-var-len ${filter_params.min_var_len}
                --max-len-diff ${filter_params.max_len_diff}
                --min-strandedness ${filter_params.min_strandedness}
                --min-strand-reads ${filter_params.min_strand_reads}
                --min-ref-basequal ${filter_params.min_ref_basequal}
                --min-var-basequal ${filter_params.min_var_basequal}
                --max-basequal-diff ${filter_params.max_basequal_diff}
                --min-ref-mapqual ${filter_params.min_ref_mapqual}
                --min-var-mapqual ${filter_params.min_var_mapqual}
                --max-mapqual-diff ${filter_params.max_mapqual_diff}
                --max-ref-mmqs ${filter_params.max_ref_mmqs}
                --max-var-mmqs ${filter_params.max_var_mmqs}
                --min-mmqs-diff ${filter_params.min_mmqs_diff}
                --max-mmqs-diff ${filter_params.max_mmqs_diff}
            #end if
            --verbose
            '$ref_genome'
    ]]></command>

    <inputs>
        <!-- Reference genome: either an entry of the fasta_indexes data table
             or a fasta dataset from the user's history. -->
        <conditional name="reference">
            <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in genome?">
                <option value="cached">Use a built-in genome</option>
                <option value="history">Use a genome from my history</option>
            </param>
            <when value="cached">
                <param name="genome" type="select" label="reference genome" help="The fasta reference genome that variants should be called against.">
                    <options from_data_table="fasta_indexes" />
                </param>
            </when>
            <when value="history">
                <param name="genome" type="data" format="fasta" label="reference genome" help="The fasta reference genome that variants should be called against." />
            </when>
        </conditional>
        <!-- The two BAM datasets that form the normal/tumor pair. -->
        <param name="normal_bam" type="data" format="bam" label="aligned reads from normal sample" />
        <param name="tumor_bam" type="data" format="bam" label="aligned reads from tumor sample" />
        <!-- Purity estimates, each a fraction between 0 and 1. -->
        <param argument="--normal-purity" name="normal_purity"
               type="float" min="0" max="1.0" value="1.0"
               label="Estimated purity (non-tumor content) of normal sample" />
        <param argument="--tumor-purity" name="tumor_purity"
               type="float" min="0" max="1.0" value="1.0"
               label="Estimated purity (tumor content) of tumor sample" />
        <!-- When checked, the flag is passed through to the command line and
             the SNP and indel outputs replace the single combined output. -->
        <param name="split_output" type="boolean" checked="false"
               truevalue="--split-output" falsevalue=""
               label="Generate separate output datasets for SNP and indel calls?" />
        <!-- Variant calling settings. With "varscan_defaults" none of these
             values are put on the command line. -->
        <conditional name="call_params">
            <param name="settings" label="Settings for Variant Calling" type="select">
                <option value="varscan_defaults" selected="true">Use default values</option>
                <option value="custom">Customize settings</option>
            </param>
            <when value="custom">
                <!-- Read-level thresholds labelled with their samtools mpileup options. -->
                <param argument="samtools mpileup -Q" name="min_avg_qual" type="integer" value="13" min="0" max="50"
                label="Minimum base quality"
                help="The minimum base quality at the variant position required to use a read for calling" />
                <param argument="samtools mpileup -q" name="min_mapqual" type="integer" value="0" min="0" max="60"
                label="Minimum mapping quality"
                help="The minimum mapping quality required for a read to be considered in variant calling" />
                <!-- The expanded parameters below are not defined in this file
                     (presumably they come from macros.xml). -->
                <expand macro="min_coverage"
                help="Minimum site coverage required in the normal and in the tumor sample to call a variant. This threshold gets applied after eliminating reads with low base and mapping quality as defined above." />
                <expand macro="min_reads2" />
                <expand macro="min_var_freq" value="0.1" />
                <expand macro="min_freq_for_hom" />
                <expand macro="p_value" value="0.99"
                help="The p-value threshold used to determine if a variant should be called for either sample" />
                <param argument="--somatic-p-value" name="somatic_p_value" type="float" value="0.05" min="0" max="1"
                label="P-value threshold for calling somatic variants and LOH events"
                help="The p-value threshold used to determine if read count differences between the normal and the tumor sample justify classification of a variant as somatic or as an LOH event" />
            </when>
            <when value="varscan_defaults" />
        </conditional>
        <!-- Posterior (post-calling) filter settings. Only the "custom" branch
             exposes parameters; the other branches are resolved entirely in
             the command template. -->
        <conditional name="filter_params">
            <param name="settings" label="Settings for Posterior Variant Filtering" type="select">
                <option value="varscan_defaults" selected="true">Use default values</option>
                <option value="dream3_settings">Use settings optimized for DREAM-3</option>
                <option value="no_filter">Do not perform posterior filtering</option>
                <option value="custom">Customize settings</option>
            </param>
            <when value="varscan_defaults" />
            <when value="dream3_settings" />
            <when value="no_filter" />
            <when value="custom">
                <!-- Variant read count, allele frequency and somatic p-value limits -->
                <param argument="--min-var-count" name="min_var_count" type="integer" value="4" min="1" max="200"
                label="Minimum number of variant-supporting reads"
                help="" />
                <param argument="--min-var-count-lc" name="min_var_count_lc" type="integer" value="2" min="1" max="200"
                label="Low coverage minimum number of variant-supporting reads"
                help="Will be applied instead of the --min-var-count limit for sites with poor overall (less than --max-somatic-p-depth) coverage" />
                <param argument="--min-var-freq" name="min_var_freq2" type="float" value="0.05" min="0" max="1"
                label="Minimum variant allele frequency"
                help="" />
                <param argument="--max-somatic-p" name="max_somatic_p" type="float" value="0.05" min="0" max="1"
                label="Maximum somatic p-value allowed for a somatic call"
                help="" />
                <param argument="--max-somatic-p-depth" name="max_somatic_p_depth" type="integer" value="10" min="2" max="200"
                label="Depth required at variant site to run --max-somatic-p filter"
                help="" />
                <!-- Position of the variant site within the supporting reads -->
                <param argument="--min-ref-readpos" name="min_ref_readpos" type="float" value="0.1" min="0" max="1"
                label="Minimum relative variant position in ref-supporting reads"
                help="The minimum average relative distance from the ends of ref-supporting reads required for variant sites" />
                <param argument="--min-var-readpos" name="min_var_readpos" type="float" value="0.1" min="0" max="1"
                label="Minimum relative variant position in variant-supporting reads"
                help="The minimum average relative distance from the ends of variant-supporting reads required for variant sites" />
                <param argument="--min-ref-dist3" name="min_ref_dist3" type="float" value="0.1" min="0" max="1"
                label="Minimum distance of variant site from 3'-end of ref-supporting reads"
                help="The minimum average relative distance from the effective 3'end of ref-supporting reads required for variant sites" />
                <param argument="--min-var-dist3" name="min_var_dist3" type="float" value="0.1" min="0" max="1"
                label="Minimum distance of variant site from 3'-end of variant-supporting reads"
                help="The minimum average relative distance from the effective 3'end of variant-supporting reads required for variant sites" />
                <!-- Read length limits -->
                <param argument="--min-ref-avgrl" name="min_ref_len" type="integer" value="90" min="0" max="200"
                label="Minimum length of ref-supporting reads"
                help="The minimum average trimmed length required for reads supporting the reference allele" />
                <param argument="--min-var-avgrl" name="min_var_len" type="integer" value="90" min="0" max="200"
                label="Minimum length of variant-supporting reads"
                help="The minimum average trimmed length required for reads supporting the variant allele" />
                <param argument="--max-rl-diff" name="max_len_diff" type="float" value="0.25" min="0" max="1"
                label="Maximum relative read length difference"
                help="The maximum allowed average relative read length difference (ref - var) between reads supporting the reference and the variant allele" />
                <!-- Strand balance -->
                <param argument="--min-strandedness" name="min_strandedness" type="float" value="0.01" min="0" max="0.5"
                label="Minimum fraction of variant reads from each strand"
                help="The minimum fraction of variant reads that are required to come from the forward and from the reverse strand" />
                <param argument="--min-strand-reads" name="min_strand_reads" type="integer" value="5" min="2" max="200"
                label="Minimum variant allele depth required to apply the --min-strandedness filter"
                help="" />
                <!-- Base quality limits -->
                <param argument="--min-ref-basequal" name="min_ref_basequal" type="integer" value="15" min="1" max="50"
                label="Minimum average base quality for the ref allele"
                help="The minimum average base quality required at the variant site for reads supporting the reference allele" />
                <param argument="--min-var-basequal" name="min_var_basequal" type="integer" value="15" min="1" max="50"
                label="Minimum average base quality for the variant allele"
                help="The minimum average base quality required at the variant site for reads supporting the variant allele" />
                <!-- The option name carries the same double-dash prefix as its siblings. -->
                <param argument="--max-basequal-diff" name="max_basequal_diff" type="integer" value="50" min="0" max="50"
                label="Maximum base quality difference between ref- and variant-supporting reads"
                help="The maximum average base quality difference (ref - var) allowed between the variant site positions of reads supporting the reference and the variant allele" />
                <!-- Mapping quality limits -->
                <param argument="--min-ref-mapqual" name="min_ref_mapqual" type="integer" value="15" min="1" max="60"
                label="Minimum average mapping quality of ref-supporting reads"
                help="The minimum average mapping quality required for reads supporting the reference allele" />
                <param argument="--min-var-mapqual" name="min_var_mapqual" type="integer" value="15" min="1" max="60"
                label="Minimum average mapping quality of variant-supporting reads"
                help="The minimum average mapping quality required for reads supporting the variant allele" />
                <param argument="--max-mapqual-diff" name="max_mapqual_diff" type="integer" value="50" min="0" max="60"
                label="Maximum mapping quality difference between ref- and variant-supporting reads"
                help="The maximum average mapping quality difference (ref - var) allowed between reads supporting the reference and the variant allele" />
                <!-- Mismatch base quality sum (mmqs) limits -->
                <param argument="--max-ref-mmqs" name="max_ref_mmqs" type="integer" value="100" min="0"
                label="Maximum mismatch base quality sum of ref-supporting reads"
                help="The maximum mismatch base quality sum allowed for reads supporting the reference allele" />
                <param argument="--max-var-mmqs" name="max_var_mmqs" type="integer" value="100" min="0"
                label="Maximum mismatch base quality sum of var-supporting reads"
                help="The maximum mismatch base quality sum allowed for reads supporting the variant allele" />
                <param argument="--min-mmqs-diff" name="min_mmqs_diff" type="integer" value="0" min="0"
                label="Minimum difference between mismatch base quality sums of variant- and ref-supporting reads"
                help="The minimum difference in the mismatch base quality sums (var - ref) required between reads supporting the variant and the reference allele" />
                <param argument="--max-mmqs-diff" name="max_mmqs_diff" type="integer" value="50" min="1"
                label="Maximum difference between mismatch base quality sums of variant- and ref-supporting reads"
                help="The maximum difference in the mismatch base quality sums (var - ref) allowed between reads supporting the variant and the reference allele" />
            </when>
        </conditional>
    </inputs>
    <!-- One combined VCF when output splitting is off; otherwise the SNP and
         indel files are collected from the working directory. -->
    <outputs>
        <data name="output" format="vcf">
            <filter>not split_output</filter>
        </data>
        <data name="output_snp" format="vcf" from_work_dir="variants_out.snp" label="Varscan somatic SNP calls on ${on_string}">
            <filter>split_output</filter>
        </data>
        <data name="output_indel" format="vcf" from_work_dir="variants_out.indel" label="Varscan somatic indel calls on ${on_string}">
            <filter>split_output</filter>
        </data>
    </outputs>
    <tests>
        <!-- Default calling and default posterior filtering; one combined VCF
             whose header must mention the contig and the filters. -->
        <test expect_num_outputs="1">
            <conditional name="reference">
                <param name="source" value="history" />
                <param name="genome" value="hg19_chrM.fa" />
            </conditional>
            <param name="normal_bam" value="control_chrM.bam" />
            <param name="tumor_bam" value="tumor_chrM.bam" />
            <param name="split_output" value="false" />
            <conditional name="call_params">
                <param name="settings" value="varscan_defaults" />
            </conditional>
            <conditional name="filter_params">
                <param name="settings" value="varscan_defaults" />
            </conditional>
            <output name="output">
                <expand macro="test_mentions_contig" />
                <expand macro="test_mentions_filters" />
            </output>
        </test>
        <!-- Default settings with split output; the SNP and the indel dataset
             are both expected and both headers are checked. -->
        <test expect_num_outputs="2">
            <conditional name="reference">
                <param name="source" value="history" />
                <param name="genome" value="hg19_chrM.fa" />
            </conditional>
            <param name="normal_bam" value="control_chrM.bam" />
            <param name="tumor_bam" value="tumor_chrM.bam" />
            <param name="split_output" value="true" />
            <conditional name="call_params">
                <param name="settings" value="varscan_defaults" />
            </conditional>
            <conditional name="filter_params">
                <param name="settings" value="varscan_defaults" />
            </conditional>
            <output name="output_indel">
                <expand macro="test_mentions_contig" />
                <expand macro="test_mentions_filters" />
            </output>
            <output name="output_snp">
                <expand macro="test_mentions_contig" />
                <expand macro="test_mentions_filters" />
            </output>
        </test>
        <!-- Custom calling settings and purities; stderr is checked for a line
             echoing each customized value. -->
        <test expect_num_outputs="1">
            <conditional name="reference">
                <param name="source" value="history" />
                <param name="genome" value="hg19_chrM.fa" />
            </conditional>
            <param name="normal_bam" value="control_chrM.bam" />
            <param name="tumor_bam" value="tumor_chrM.bam" />
            <param name="normal_purity" value="0.6" />
            <param name="tumor_purity" value="0.6" />
            <param name="split_output" value="false" />
            <conditional name="call_params">
                <param name="settings" value="custom" />
                <param name="min_coverage" value="2" />
                <param name="min_reads2" value="1" />
                <param name="min_avg_qual" value="5" />
                <param name="min_var_freq" value="0.01" />
                <param name="min_freq_for_hom" value="0.66" />
                <param name="p_value" value="0.97" />
                <param name="somatic_p_value" value="0.09" />
            </conditional>
            <conditional name="filter_params">
                <param name="settings" value="varscan_defaults" />
            </conditional>
            <assert_stderr>
                <has_line_matching expression=" Min coverage:&#09;2x for Normal, 2x for Tumor" />
                <has_line_matching expression="Min reads2:&#09;1" />
                <has_line_matching expression="Min var freq:&#09;0.01" />
                <has_line_matching expression="Min freq for hom:&#09;0.66" />
                <has_line_matching expression="Normal purity:&#09;0.6" />
                <has_line_matching expression="Tumor purity:&#09;0.6" />
                <has_line_matching expression="Min avg qual:&#09;5" />
                <has_line_matching expression="P-value thresh:&#09;0.97" />
                <has_line_matching expression="Somatic p-value:&#09;0.09" />
            </assert_stderr>
            <output name="output">
                <expand macro="test_mentions_contig" />
                <expand macro="test_mentions_filters" />
            </output>
        </test>
        <!-- Default calling with the DREAM-3 filter preset; the header must
             still mention the contig and the filters. -->
        <test expect_num_outputs="1">
            <conditional name="reference">
                <param name="source" value="history" />
                <param name="genome" value="hg19_chrM.fa" />
            </conditional>
            <param name="normal_bam" value="control_chrM.bam" />
            <param name="tumor_bam" value="tumor_chrM.bam" />
            <param name="split_output" value="false" />
            <conditional name="call_params">
                <param name="settings" value="varscan_defaults" />
            </conditional>
            <conditional name="filter_params">
                <param name="settings" value="dream3_settings" />
            </conditional>
            <output name="output">
                <expand macro="test_mentions_contig" />
                <expand macro="test_mentions_filters" />
            </output>
        </test>
        <!-- Posterior filtering disabled; the contig line must be present but
             the filter declarations must be absent from the output. -->
        <test expect_num_outputs="1">
            <conditional name="reference">
                <param name="source" value="history" />
                <param name="genome" value="hg19_chrM.fa" />
            </conditional>
            <param name="normal_bam" value="control_chrM.bam" />
            <param name="tumor_bam" value="tumor_chrM.bam" />
            <param name="split_output" value="false" />
            <conditional name="call_params">
                <param name="settings" value="varscan_defaults" />
            </conditional>
            <conditional name="filter_params">
                <param name="settings" value="no_filter" />
            </conditional>
            <output name="output">
                <expand macro="test_mentions_contig" />
                <expand macro="test_not_mentions_filters" />
            </output>
        </test>
        <!-- Custom filter settings that override only the two base quality
             thresholds (both set to 28); other custom values keep their
             declared defaults. -->
        <test expect_num_outputs="1">
            <conditional name="reference">
                <param name="source" value="history" />
                <param name="genome" value="hg19_chrM.fa" />
            </conditional>
            <param name="normal_bam" value="control_chrM.bam" />
            <param name="tumor_bam" value="tumor_chrM.bam" />
            <param name="split_output" value="false" />
            <conditional name="call_params">
                <param name="settings" value="varscan_defaults" />
            </conditional>
            <conditional name="filter_params">
                <param name="settings" value="custom" />
                <param name="min_ref_basequal" value="28" />
                <param name="min_var_basequal" value="28" />
            </conditional>
            <output name="output">
                <expand macro="test_mentions_contig" />
                <expand macro="test_mentions_filters" />
            </output>
        </test>
    </tests>

    <help>
@HELP_HEADER@

**The Varscan Somatic tool for Galaxy**

This tool wraps the functionality of the ``varscan somatic`` and the
``varscan fpfilter`` command line tools.

.. class:: infomark

   The wrapper aims at providing the same functionality as the
   ``varscan fpfilter`` tool, but implements it using ``pysam`` internally.
   Note that, as one limitation compared to the original ``varscan`` tool,
   the current version does not apply filters to indels!

The tool is designed to detect genetic variants in a **pair of samples**
representing normal and tumor tissue from the same individual. It classifies
the variants, according to their most likely origin, as **somatic** (variant is
found in the tumor, but not in the normal sample, *i.e.*, is the consequence of
a somatic mutation event), **germline** (variant is found in both samples =>
germline mutation event) and **LOH** (variant is found in both samples, but
only the tumor sample appears to be homozygous for it => loss of heterozygosity
event).
This classification is encoded in the variant ``INFO`` fields of the VCF output
produced by the tool in the form of a status code ``SS`` (somatic status),
where:

- ``SS=1`` signifies a likely germline variant,
- ``SS=2`` a somatic variant
- ``SS=3`` a LOH variant

In addition, ``SS=0`` indicates a possible variant, but with insufficient
evidence for an, at least, heterozygous state in either individual sample, and
``SS=5`` is used for variants of unexplained origin (*e.g.*, variants found in
the normal, but not in the tumor tissue sample).

In a second step, following variant calling, the tool can try to detect likely
false-positive calls by re-inspecting the data at the variant sites more
carefully and looking for signs that may indicate problems with the
sequencing data or its mapping. If a called variant is deemed a possible
false-positive at this step, this gets indicated in the ``FILTER`` field of the
variant record in the VCF output. For high confidence variants passing all
posterior (applied after variant calling) filters the value of the field will
be ``PASS``, for variants failing any of the posterior filters the value will
be a ``;``-separated list of the problematic filters.


**Input**

The tool takes as input a reference genome (in fasta format) and a pair of
aligned reads datasets (bam format).

**Output**

A VCF dataset of called variants. When asked to *Generate separate output
datasets for SNP and indel calls*, the tool will behave like the
``varscan somatic`` command line tool and produce two VCF datasets - one with
just the single nucleotide variants, while the other one will store
insertion/deletion variants.

**Options**

*Estimated purity of normal sample / of tumor sample*

Since, in practice, it is often impossible to isolate tissue samples without
contamination from surrounding tissue or from invading cells, these two fields
let you indicate your estimate of the purity of the two samples (as fractions
between 0 and 1, where 1 would indicate a contamination-free sample and 0.5 a
sample to which the desired tissue contributes only 50%, while the other 50%
consist of cells from the other tissue type).

*Settings for Variant Calling*

Settings in this section will affect the steps of variant calling and
classification. You can accept VarScan's default values for the corresponding
parameters or customize them according to your needs.

*Settings for Posterior Variant Filtering*

Use the parameters in this section to configure the false-positive filtering
step that follows variant calling and classification. These settings will not
influence the number of variants detected nor their classification, but may
change the ``FILTER`` field of variant records to indicate which variants
failed to pass certain filters. You can use this information with downstream
tools to exclude certain variants from further analysis steps or include only
high confidence variants that passed all filters (those with ``PASS`` as their
``FILTER`` field value). You can accept the original filter defaults of the
``varscan fpfilter`` command line tool, use the settings established for the
tool in the `DREAM3 challenge`_, or choose to customize the settings.
Alternatively, you can also choose to skip posterior filtering entirely, in
which case all variants will have their ``FILTER`` field set to ``PASS``.

.. _DREAM3 challenge: https://www.synapse.org/#!Synapse:syn312572/wiki/58893
    </help>
    <!-- Citation entries come from the "citations" macro, which is not defined
         in this file (presumably it is provided by macros.xml). -->
    <expand macro="citations" />
</tool>