view snpfreqplot.xml @ 2:dc51db22310c draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit d1c54d077cfc0eeb9699719760e668948cb9bbbc"
author iuc
date Fri, 18 Dec 2020 23:48:01 +0000
parents e362b3143cde
children 3d0adeee3f2b
line wrap: on
line source

<tool id="snpfreqplot" name="Variant Frequency Plot" version="@VERSION@+galaxy@GALAXY_VERSION@" profile="20.09"
      license="GPL-3.0-or-later" >
    <description>Generates a heatmap of allele frequencies grouped by variant type for SnpEff-annotated SARS-CoV-2 data</description>
    <!-- Version tokens: both are substituted into the version attribute of the
         tool element, which is written as "@VERSION@+galaxy@GALAXY_VERSION@". -->
    <macros>
        <token name="@VERSION@">1.0</token>
        <token name="@GALAXY_VERSION@">2</token>
    </macros>
    <!-- Pinned packages for the job environment: the R runtime plus R libraries,
         presumably the ones loaded by the R scripts that the config script sources
         (TODO confirm against those scripts). -->
    <requirements>
        <requirement type="package" version="4.0">r-base</requirement>
        <requirement type="package" version="1.0.12">r-pheatmap</requirement>
        <requirement type="package" version="1.3.0">r-tidyverse</requirement>
        <requirement type="package" version="1.36.0">bioconductor-variantannotation</requirement>
    </requirements>
    <!-- EDAM ontology annotations (topics and operation) describing the tool. -->
    <edam_topics>
        <edam_topic>topic_0797</edam_topic>
        <edam_topic>topic_0092</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_3436</edam_operation>
    </edam_operations>
    <command detect_errors="exit_code"><![CDATA[
## Name of the plot file inside the job working directory; its extension
## follows the "Plot output format" selection.
#set $plot_name = "tmp_output.%s" % str($advanced.output_type)

## Reading each R script first makes the chain stop with a non-zero exit
## code if one of them cannot be read from the tool directory; only then
## is the rendered config script executed.
cat '$__tool_directory__/helperFunctions.R' > /dev/null &&
cat '$__tool_directory__/snpEffExtract.R' > /dev/null &&
cat '$__tool_directory__/heatmap_for_variants.R' > /dev/null &&
echo "output file: ${plot_name}" &&
Rscript '$configscript'
    ]]></command>
    <configfiles>
        <configfile name="configscript"><![CDATA[
## This template is rendered by Galaxy into the R script that the command
## section runs with Rscript. R dollar signs are written with a leading
## backslash so that the template engine leaves them untouched.

## 1. Set Sample Inputs
##    ------------------
##    Create a dataframe of sample ids, filetypes, and filenames
##    from the input collection. At this point, the list could be
##    of mixed type (vcf and tabular), though maybe Galaxy
##    restricts that.
samples = list(ids = c(), exts= c(), files = c())
#for $file in $sinputs:
## Sample ids come from user-chosen dataset names. Escape backslashes and
## single quotes so that a name can neither end the R string literal early
## nor be misread as an R escape sequence.
#set $sample_id = str($file.element_identifier).replace("\\", "\\\\").replace("'", "\\'")
samples\$ids = c(samples\$ids, '${sample_id}')
samples\$exts = c(samples\$exts, '${file.extension}')
samples\$files = c(samples\$files, '${file}')
#end for
## FALSE is spelled out because the shorthand F is an ordinary variable
## in R and can be reassigned.
samples = data.frame(samples, stringsAsFactors=FALSE)

## 2. Input Conversion (external script)
##    ----------------------------------
##    We source the input conversion script *after* the samples
##    have been populated, so that it performs an inplace replacement
##    of the vcf inputs with their converted tabular counterparts.
##
##    All samples are all tabular after this point
source('$__tool_directory__/helperFunctions.R')
source('$__tool_directory__/snpEffExtract.R')

## 3. Galaxy Params
##    --------------
##    Set the general script parameters from the UI
variant_frequency <- as.numeric( '$varfreq' )
brewer_color_gene_annotation <- as.character( '$advanced.color' )

#if str($clustering.do) == "TRUE":
pheat_clustering <- TRUE
pheat_clustering_method <- as.character( '$clustering.method' )
pheat_number_of_clusters <- as.integer( '$clustering.nclust' )
#else
## Clustering disabled: the method and cluster count still get fixed
## placeholder values so the variables always exist in the script.
pheat_clustering <- FALSE
pheat_clustering_method <- "ward.D2"
pheat_number_of_clusters <- 5
#end if

ratio = as.numeric('$advanced.ratio')
out_ext = '$advanced.output_type'
## Must stay in sync with the from_work_dir pattern of the tool output.
out_file = paste0("tmp_output.", out_ext)

## 4. Generate Heatmap (external script)
##    ----------------------------------
source('$__tool_directory__/heatmap_for_variants.R')

]]>
        </configfile>
    </configfiles>
    <inputs>
        <!-- One dataset per sample. The config script reads each dataset's
             element identifier (used as sample id), extension and file path. -->
        <param name="sinputs" format="tabular,vcf" type="data" multiple="true"
            collection_type="list" label="Variant lists data"
            help="Select at least two datasets (or a dataset collection) with variant lists (see the tool help below for format details). Datasets are expected to represent individual samples and dataset names will be used as sample identifiers." />
        <!-- Passed to the R script as variant_frequency. -->
        <param name="varfreq" type="float" min="0" max="1" value="0.1"
            label="Variant Frequency Threshold"
            help="Only plot variants with an intrasample frequency above this threshold in at least one sample." />
        <!-- Image options. output_type determines both the extension of the plot
             file written in the working directory and, through change_format,
             the datatype of the tool output. -->
        <section name="advanced" title="Image Properties" expanded="true">
            <param name="output_type" type="select" label="Plot output format" >
                <option value="pdf" selected="true" >PDF</option>
                <option value="png" >PNG</option>
                <option value="svg">SVG</option>
                <option value="tiff" >TIFF</option>
                <option value="bmp" >BMP</option>
                <option value="jpeg" >JPEG</option>
            </param>
            <!-- Passed to the R script as ratio. -->
            <param name="ratio" label="Cell Ratio" type="float"
                min="0.05" value="0.67" max="20"
                help="Width:Height ratio of individual heatmap cells" />
            <!-- Passed to the R script as brewer_color_gene_annotation.
                 NOTE(review): the values look like RColorBrewer palette names;
                 confirm against heatmap_for_variants.R before adding new ones. -->
            <param name="color" type="select" label="Color palette used for the gene annotations" >
                <option value="Set1" />
                <option value="Set2" />
                <option value="Set3" selected="true" />
                <option value="Pastel2" />
                <option value="Pastel1" />
                <option value="Paired" />
                <option value="Dark2" />
                <option value="Accent" />
                <option value="YlOrRd" />
                <option value="YlOrBr" />
                <option value="YlGnBu" />
                <option value="YlGn" />
                <option value="Reds" />
                <option value="RdPu" />
                <option value="Purples" />
                <option value="PuRd" />
                <option value="PuBuGn" />
                <option value="PuBu" />
                <option value="OrRd" />
                <option value="Oranges" />
                <option value="Greys" />
                <option value="Greens" />
                <option value="GnBu" />
                <option value="BuPu" />
                <option value="BuGn" />
                <option value="Blues" />
                <option value="Spectral" />
                <option value="RdYlGn" />
                <option value="RdYlBu" />
                <option value="RdGy" />
                <option value="RdBu" />
                <option value="PuOr" />
                <option value="PRGn" />
                <option value="PiYG" />
                <option value="BrBG" />
            </param>
        </section>
        <!-- Optional clustering. The config script compares the selected value
             with the string "TRUE"; for "FALSE" it disables clustering and sets
             the method to ward.D2 and the cluster count to 5 as fixed values. -->
        <conditional name="clustering">
            <param name="do" type="select" label="Perform Clustering?" >
                <option value="TRUE">Yes</option>
                <option value="FALSE" selected="true">No</option>
            </param>
            <when value="TRUE" >
                <param name="nclust" type="integer"
                    min="1" value="1" label="Number of clusters" />
                <param name="method" type="select" label="Clustering method" >
                    <option value="ward.D" />
                    <option value="ward.D2" selected="true" />
                    <option value="single" />
                    <option value="complete" />
                    <option value="average" >average (UPGMA)</option>
                    <option value="mcquitty" >mcquitty (WPGMA)</option>
                    <option value="median" >median (WPGMC)</option>
                    <option value="centroid" >centroid (UPGMC)</option>
                </param>
            </when>
            <when value="FALSE" />
        </conditional>
    </inputs>
    <outputs>
        <!-- Single plot output. The file is collected from the job working
             directory, where the R script writes tmp_output plus the extension
             of the selected format. -->
        <data name="outfile"
              format="pdf"
              from_work_dir="tmp_output.*"
              label="Variant-Frequency Plot on ${on_string}: ${advanced.output_type}">
            <!-- pdf is the declared default; every other selectable format
                 switches the datatype (the jpeg selection maps to datatype jpg). -->
            <change_format>
                <when input="advanced.output_type" value="png" format="png"/>
                <when input="advanced.output_type" value="svg" format="svg"/>
                <when input="advanced.output_type" value="tiff" format="tiff"/>
                <when input="advanced.output_type" value="bmp" format="bmp"/>
                <when input="advanced.output_type" value="jpeg" format="jpg"/>
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- PDF, PNG and JPEG outputs are compared to a reference file with
             compare="sim_size" and a per-test delta; SVG outputs are instead
             checked for an expected viewBox string in the file contents. -->
        <test expect_num_outputs="1">
            <!-- PDF, tabular inputs, every other parameter left at its default -->
            <param name="sinputs" ftype="tabular" value="input436.tabular,input437.tabular,input438.tabular,input439.tabular,input440.tabular,input441.tabular,input442.tabular,input443.tabular,input444.tabular" />
            <output name="outfile" ftype="pdf" value="heatmap.default.pdf" compare="sim_size" delta="250" />
        </test>
        <test expect_num_outputs="1">
            <!-- PDF, tabular inputs, short color palette (Set2).
                 NOTE(review): the expected file is the same heatmap.default.pdf
                 used by the first test although the palette differs; confirm
                 that reusing it for the size comparison is intentional. -->
            <param name="sinputs" ftype="tabular" value="input436.tabular,input437.tabular,input438.tabular,input439.tabular,input440.tabular,input441.tabular,input442.tabular,input443.tabular,input444.tabular" />
            <section name="advanced" >
                <param name="color" value="Set2" />
            </section>
            <output name="outfile" ftype="pdf" value="heatmap.default.pdf" compare="sim_size" delta="250" />
        </test>
        <test expect_num_outputs="1">
            <!-- PNG, multiple inputs, non-numeric IDS, frequency threshold raised to 0.5 -->
            <param name="sinputs" ftype="tabular" value="input436.tabular,input437.tabular,input443.tabular,input444.tabular" />
            <param name="varfreq" value="0.5" />
            <section name="advanced" >
                <param name="color" value="Spectral" />
                <param name="output_type" value="png" />
            </section>
            <output name="outfile" ftype="png" value="heatmap.imageopts.png" compare="sim_size" delta="100000" />
        </test>
        <test expect_num_outputs="1">
            <!-- SVG, clustering enabled with the default cluster count and method -->
            <param name="sinputs" ftype="tabular" value="input438.tabular,input439.tabular,input440.tabular,input441.tabular,input442.tabular" />
            <conditional name="clustering">
                <param name="do" value="TRUE" />
            </conditional>
            <section name="advanced" >
                <param name="color" value="Greys" />
                <param name="ratio" value="0.8" />
                <param name="output_type" value="svg" />
            </section>
            <output name="outfile" ftype="svg">
                <assert_contents>
                    <has_text text="viewBox=&quot;0 0 1156 361&quot;" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <!-- JPEG, clustering extras (2 clusters, centroid method), mixed alphanumeric labels -->
            <param name="sinputs" ftype="tabular" value="input436.tabular,input443.tabular,input438.tabular,input444.tabular" />
            <conditional name="clustering">
                <param name="do" value="TRUE" />
                <param name="nclust" value="2" />
                <param name="method" value="centroid" />
            </conditional>
            <section name="advanced" >
                <param name="color" value="Purples" />
                <param name="ratio" value="1.2" />
                <param name="output_type" value="jpeg" />
            </section>
            <output name="outfile" ftype="jpg" value="heatmap.clustering2.jpeg" compare="sim_size" delta="130000" />
        </test>
        <test expect_num_outputs="1">
            <!-- PDF, vcf test: VCF inputs instead of tabular ones -->
            <param name="sinputs" ftype="vcf" value="snpeff.123.vcf,snpeff.456.vcf,snpeff.789.vcf" />
            <section name="advanced" >
                <param name="color" value="PuBuGn" />
                <param name="output_type" value="pdf" />
            </section>
            <output name="outfile" ftype="pdf" value="heatmap.from_vcf.pdf" compare="sim_size" delta="250" />
        </test>
        <test expect_num_outputs="1">
            <!-- SVG, problematic vcf test -->
            <param name="sinputs" ftype="vcf" value="1084592.vcf,1085080.vcf,1085445.vcf,1085841.vcf,1085990.vcf" />
            <section name="advanced" >
                <param name="output_type" value="svg" />
            </section>
            <output name="outfile" ftype="svg">
                <assert_contents>
                    <has_text text="viewBox=&quot;0 0 754 292&quot;" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <!-- SVG, Vcf test with problematic splice+syn at snpeff789.vcf for threshold = 0.0222
            and an "empty" file (no variants) and a file with just one variant among its inputs -->
            <param name="sinputs" ftype="vcf" value="snpeff.123.vcf,snpeff.456.vcf,snpeff.789.vcf,no_variants.vcf,single_variant.vcf" />
            <param name="varfreq" value="0.0222" />
            <section name="advanced" >
                <param name="output_type" value="svg" />
            </section>
            <output name="outfile" ftype="svg">
                <assert_contents>
                    <has_text text="viewBox=&quot;0 0 1962 546&quot;" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

This tool generates multi-sample variant-frequency plots from SnpEff-annotated
viral variant lists with optional hierarchical clustering of the samples.

.. class:: warningmark

    Currently, this tool has been tested only on SARS-CoV-2 variant data.
    While the intention is to have it work for viral variant data in general,
    be prepared for unexpected behavior with other input data at the current
    development stage.

----

The tool expects input variant lists in one of the following two formats:

1. VCF datasets as produced by standard variant callers with

   - variant allele frequencies encoded in an ``AF`` INFO field
   - variant functional genomic effects annotated using SnpEff's EFF format (SnpEff's ANN format is not currently supported!)

2. tabular datasets with columns listing, at least, the following variant properties:

   - ``CHROM``
   - ``POS``
   - ``REF``
   - ``ALT``
   - ``AF``
   - ``EFF[*].AA``
   - ``EFF[*].GENE``
   - ``EFF[*].EFFECT``

   Such files can be produced with SnpSift Extract Fields and can be useful if
   preprocessing of the lists with standard text processing tools is required.

   .. class:: infomark

   To represent empty EFF fields in the tabular format you can choose between
   ``.`` and the empty string.

----

Example output:

.. image:: /static/images/example_output.png

    ]]></help>
    <!-- NOTE(review): the BibTeX entry below has an empty title field; confirm
         the intended title with the tool authors before filling it in. -->
    <citations>
        <citation type="bibtex">@unpublished{Fuchs2020,
            author = {Fuchs, Jonas},
            title = {},
            year = {2020},
            note = {Multi-sample annotated viral variant-frequency plots based on the R pheatmap package.},
            address = {Institute for Virology, University of Freiburg}
        }</citation>
    </citations>
</tool>