changeset 0:328710890963 draft

"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit 98e541a75678eee749261630610b946c258fd5f3"
author recetox
date Wed, 23 Mar 2022 11:35:30 +0000
parents
children b77023c41c76
files macros.xml test-data/incomplete_metadata_data.csv test-data/input_data.csv test-data/input_data_nobatch.csv test-data/na_data.csv test-data/normalized_data.tsv test-data/normalized_data_nobatch.tsv waveica.xml waveica_wrapper.R
diffstat 9 files changed, 423 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,127 @@
+<macros>
+    <token name="@TOOL_VERSION@">0.2.0</token>
+
+    <xml name="creator">
+        <creator>
+            <person
+                givenName="Maksym"
+                familyName="Skoryk"
+                url="https://github.com/maximskorik"
+                identifier="0000-0003-2056-8018" />
+            <organization
+                url="https://www.recetox.muni.cz/"
+                email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"
+                name="RECETOX MUNI" />
+        </creator>
+    </xml>
+
+    <xml name="general_parameters">
+        <param type="data" name="data" label="Feature table" format="csv" help=""/>
+        <param type="integer" value="20" name="k" label="Number of components to decompose" help="maximal component that ICA decomposes"/>
+        <param type="float" value="0" name="alpha" label="Alpha" help="trade-off value between the independence of samples (temporal ICA) and variables (spatial ICA), should be between 0 and 1"/>
+    </xml>
+    <xml name="batchwise_parameters">
+        <param type="float" value="0.05" name="t" label="Batch-association threshold" help="threshold to consider a component associated with the batch,
+        should be between 0 and 1"/>
+        <param type="float" value="0.05" name="t2" label="Group-association threshold" help="threshold to consider a component associated with the group,
+        should be between 0 and 1"/>
+    </xml>
+    <xml name="singlebatch_parameters">
+        <param type="float" value="0" name="cutoff" label="Cutoff" help="threshold of the variation explained by the injection order for independent components, should be between 0 and 1"/>
+    </xml>
+    <xml name="exclude_blanks">
+        <param name="exclude_blanks" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Remove blanks" help="Excludes blank samples from the output" />
+    </xml>
+    <xml name="wf">
+        <conditional name="wf">
+            <param type="select" name="wavelet_filter" label="Wavelet transform filter" help="wavelet function and filter length [1] (see footnotes for more details)">
+                <option value="d" selected="True">Daubechies</option>
+                <option value="la" >Least Asymmetric</option>
+                <option value="bl" >Best Localized</option>
+                <option value="c" >Coiflet</option>
+            </param>
+            <when value="d">
+                <param name="wavelet_length" type="select" label="filter length">
+                    <option value="2" selected="True">2</option>
+                    <option value="4">4</option>
+                    <option value="6">6</option>
+                    <option value="8">8</option>
+                    <option value="10">10</option>
+                    <option value="12">12</option>
+                    <option value="14">14</option>
+                    <option value="16">16</option>
+                    <option value="18">18</option>
+                    <option value="20">20</option>
+                </param>
+            </when>
+            <when value="la">
+                <param name="wavelet_length" type="select" label="filter length">
+                    <option value="8">8</option>
+                    <option value="10">10</option>
+                    <option value="12">12</option>
+                    <option value="14">14</option>
+                    <option value="16">16</option>
+                    <option value="18">18</option>
+                    <option value="20">20</option>
+                </param>
+            </when>
+            <when value="bl">
+                <param name="wavelet_length" type="select" label="filter length">
+                    <option value="14">14</option>
+                    <option value="18">18</option>
+                    <option value="20">20</option>
+                </param>
+            </when>
+            <when value="c">
+                <param name="wavelet_length" type="select" label="filter length">
+                    <option value="6">6</option>
+                    <option value="12">12</option>
+                    <option value="18">18</option>
+                    <option value="24">24</option>
+                    <option value="30">30</option>
+                </param>
+            </when>
+        </conditional>
+    </xml>
+
+    <xml name="outputs">
+        <outputs>
+            <data name="normalized_data" format="tsv" />
+        </outputs>
+    </xml>
+
+    <token name="@HELP@"><![CDATA[
+        **Description**
+
+        Removal of batch effects for large-scale untargeted metabolomics data based on wavelet analysis and independent
+        component analysis. The WaveICA method uses the time trend of samples over the injection order, decomposes the
+        original data into new multi-scale features, extracts and removes the batch effect resulting in normalized
+        intensities across samples.
+
+        The input is an intensity-by-feature table with metadata in the following format:
+
+        +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+
+        | sampleName    | class  | sampleType | injectionOrder | batch | M85T34     | M86T41       | M86T518     | M86T539     | ... |
+        +===============+========+============+================+=======+============+==============+=============+=============+=====+
+        | VT_160120_002 | sample | sample     | 1              | 1     | 228520.064 | 35646729.215 | 2386896.979 | 1026645.836 | ... |
+        +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+
+        | QC1           | sample | QC         | 2              | 1     | 90217.384  | 35735702.457 | 2456290.696 | 1089246.460 | ... |
+        +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+
+        | ...           | ...    | ...        | ...            | ...   | ...        | ...          | ...         | ...         | ... |
+        +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+
+
+
+        + The required columns are **sampleName**, **class**, **sampleType**, **injectionOrder**, and the **features** that you want to normalize.
+        + The **batch** column is required if batch correction mode is **Multiple batches** and optional otherwise.
+        + The presence of any additional columns (except features) will result in incorrect batch correction or job failure.
+        + The input table must not contain missing values. Missing intensities must be filled with 0.
+        + **sampleType** column accepts three possible values: [QC, sample, blank] (case insensitive).
+        + **class** column is used to denote a biological group of a sample (e.g., positive/negative species). The column accepts any values.
+        + The **output** is the same table with corrected feature intensities.
+
+        .. rubric:: **Footnotes**
+        .. [1] for details on wavelet-filter parameters refer to R `wavelets::wt.filter <https://www.rdocumentation.org/packages/wavelets/versions/0.3-0.2/topics/wt.filter>`_;
+        .. [2] when using 'Multiple batches', please cite the WaveICA (2019) paper; else, cite WaveICA 2.0 (2021) paper;
+    ]]>
+    </token>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/incomplete_metadata_data.csv	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,6 @@
+sampleName,class,sampleType,batch,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,sample,sample,1,228520.06430737,35646729.2154397,2386896.97966461,1026645.83653468
+VT_160120_004,sample,sample,1,90217.384387202,35735702.457216,2456290.69621518,1089246.46040563
+VT_160120_006,sample,sample,1,235656.752883839,37021134.4527116,8873450.40260241,837856.449608585
+VT_160120_008,sample,sample,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
+VT_160120_010,sample,sample,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_data.csv	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,6 @@
+sampleName,class,sampleType,injectionOrder,batch,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,sample,sample,1,1,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468
+VT_160120_004,sample,sample,2,1,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563
+VT_160120_006,sample,sample,3,1,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585
+VT_160120_008,sample,sample,4,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
+VT_160120_010,sample,sample,5,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_data_nobatch.csv	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,6 @@
+sampleName,class,sampleType,injectionOrder,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,sample,sample,1,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468
+VT_160120_004,sample,sample,2,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563
+VT_160120_006,sample,sample,3,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585
+VT_160120_008,sample,sample,4,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
+VT_160120_010,sample,sample,5,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/na_data.csv	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,6 @@
+sampleName,class,sampleType,injectionOrder,batch,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,sample,sample,1,1,NA,35646729.2154397,2386896.97966461,1026645.83653468
+VT_160120_004,sample,sample,2,1,90217.384387202,35735702.457216,2456290.69621518,1089246.46040563
+VT_160120_006,sample,sample,3,1,235656.752883839,37021134.4527116,8873450.40260241,837856.449608585
+VT_160120_008,sample,sample,4,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
+VT_160120_010,sample,sample,5,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/normalized_data.tsv	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,6 @@
+sampleName	class	sampleType	injectionOrder	batch	M85T34	M86T41	M86T518	M86T539
+VT_160120_002	sample	sample	1	1	355200.506508035	75115889.9077485	6101488.54615418	2007379.02604984
+VT_160120_004	sample	sample	2	1	216897.826587868	75204863.1495248	6170882.26270475	2069979.64992079
+VT_160120_006	sample	sample	3	1	362337.195084504	76490295.1450204	12588041.969092	1818589.63912375
+VT_160120_008	sample	sample	4	1	143303.377379009	83771659.9549148	6181538.46316058	1975712.25920485
+VT_160120_010	sample	sample	5	1	189065.516447239	84108898.7658797	6103964.42378424	1935671.32085241
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/normalized_data_nobatch.tsv	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,6 @@
+sampleName	class	sampleType	injectionOrder	M85T34	M86T41	M86T518	M86T539
+VT_160120_002	sample	sample	1	-9795801.68327296	29546678.5668352	-6207890.55898405	-8941748.93595845
+VT_160120_004	sample	sample	2	-9798910.74239713	29543569.5077111	-6210999.61810821	-8944857.99508262
+VT_160120_006	sample	sample	3	-9797307.93141959	29545172.3186886	-6209396.80713068	-8943255.18410509
+VT_160120_008	sample	sample	4	-9793706.69204694	29548773.5580612	-6205795.56775803	-8939653.94473244
+VT_160120_010	sample	sample	5	-9800711.45464277	29541768.7954654	-6212800.33035386	-8946658.70732827
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/waveica.xml	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,105 @@
+<tool id="waveica" name="WaveICA" version="@TOOL_VERSION@+galaxy1" python_template_version="3.5">
+    <description>removal of batch effects for untargeted metabolomics data</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="creator"/>
+
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">r-recetox-waveica</requirement>
+    </requirements>
+    <command detect_errors="aggressive"><![CDATA[
+        Rscript
+            -e 'source("${__tool_directory__}/waveica_wrapper.R")'
+
+            #if $batch_correction.mode == "batchwise":
+            -e 'normalized_data <- waveica(
+                data = "$data",
+                wavelet_filter = "$wf.wavelet_filter",
+                wavelet_length = "$wf.wavelet_length",
+                k = $k,
+                t = $batch_correction.t,
+                t2 = $batch_correction.t2,
+                alpha = $alpha,
+                exclude_blanks = $exclude_blanks
+            )'
+            #else if $batch_correction.mode == "single_batch":
+            -e 'normalized_data <- waveica_singlebatch(
+                data = "$data",
+                wavelet_filter = "$wf.wavelet_filter",
+                wavelet_length = "$wf.wavelet_length",
+                k = $k,
+                alpha = $alpha,
+                cutoff = $batch_correction.cutoff,
+                exclude_blanks = $exclude_blanks
+            )'
+            #end if
+
+            -e 'store_data(normalized_data,"$normalized_data")'
+    ]]></command>
+
+    <inputs>
+        <expand macro="general_parameters"/>
+        <expand macro="wf"/>
+        <conditional name="batch_correction">
+            <param name="mode" type="select" label="Batch correction mode" help="'multiple batches' takes into account
+            inter- and intrabatch intensity drift; 'single batch' relies only on the injection order of the samples and
+            requires no batch information [2]">
+                <option value="batchwise" selected="true">Multiple batches</option>
+                <option value="single_batch">Single batch (or no batch information)</option>
+            </param>
+            <when value="batchwise">
+                <expand macro="batchwise_parameters"/>
+            </when>
+            <when value="single_batch">
+                <expand macro="singlebatch_parameters"/>
+            </when>
+        </conditional>
+        <expand macro="exclude_blanks"/>
+    </inputs>
+
+    <expand macro="outputs"/>
+
+    <tests>
+        <test>
+            <!-- Multiple-batches mode; every parameter listed here is set to the tool's default value. -->
+            <param name="data" value="input_data.csv" ftype="csv"/>
+            <param name="mode" value="batchwise"/>
+            <param name="wavelet_filter" value="d"/>
+            <!-- the input inside the "wf" conditional is named wavelet_length, not filter_length -->
+            <param name="wavelet_length" value="2"/>
+            <param name="k" value="20"/>
+            <param name="t" value="0.05"/>
+            <param name="t2" value="0.05"/>
+            <param name="alpha" value="0"/>
+            <output name="normalized_data" file="normalized_data.tsv"/>
+        </test>
+        <!-- The following test produces different results on the three platforms I've tried -->
+        <!-- <test>
+            <param name="data" value="input_data_nobatch.csv" ftype="csv"/>
+            <param name="mode" value="single_batch"/>
+            <param name="wavelet_filter" value="d"/>
+            <param name="wavelet_length" value="2"/>
+            <param name="k" value="20"/>
+            <param name="alpha" value="0"/>
+            <param name="cutoff" value="0"/>
+            <output name="normalized_data" file="normalized_data_nobatch.tsv"/>
+        </test> -->
+        <test expect_failure="true">
+            <param name="data" value="na_data.csv" ftype="csv"/>
+        </test>
+        <test expect_failure="true">
+            <param name="data" value="incomplete_metadata_data.csv" ftype="csv"/>
+        </test>
+    </tests>
+
+    <help>
+        <![CDATA[
+        @HELP@
+        ]]>
+    </help>
+
+    <citations>
+        <citation type="doi">10.1016/j.aca.2019.02.010</citation>
+        <citation type="doi">10.1007/s11306-021-01839-7</citation>
+    </citations>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/waveica_wrapper.R	Wed Mar 23 11:35:30 2022 +0000
@@ -0,0 +1,155 @@
+# Batchwise WaveICA normalization of a feature table.
+#
+# `data` is the path to a CSV file holding the metadata columns sampleName,
+# class, sampleType, injectionOrder and batch; every other column is treated
+# as a feature. `wavelet_filter` and `wavelet_length` select the wavelet via
+# get_wf(); `k`, `t`, `t2` and `alpha` are forwarded to
+# recetox.waveica::waveica(). When `exclude_blanks` is TRUE, rows whose
+# sampleType matches "blank" are dropped from the result.
+# Returns the sorted table with the feature columns replaced by the
+# WaveICA output.
+waveica <- function(data,
+                    wavelet_filter,
+                    wavelet_length,
+                    k,
+                    t,
+                    t2,
+                    alpha,
+                    exclude_blanks) {
+  metadata_columns <- c("sampleName", "class", "sampleType", "injectionOrder", "batch")
+
+  # read the table handed over by Galaxy, validate it and sort its rows
+  data <- read.csv(data, header = TRUE)
+  verify_input_dataframe(data, metadata_columns)
+  data <- sort_by_injection_order(data)
+
+  # split the table into feature names, batch labels and enumerated sample types
+  all_columns <- colnames(data)
+  feature_columns <- all_columns[!(all_columns %in% metadata_columns)]
+  batch <- data$batch
+  group <- enumerate_groups(as.character(data$sampleType))
+
+  # run WaveICA and write the corrected intensities back in place
+  data[, feature_columns] <- recetox.waveica::waveica(
+    data = data[, feature_columns],
+    wf = get_wf(wavelet_filter, wavelet_length),
+    batch = batch,
+    group = group,
+    K = k,
+    t = t,
+    t2 = t2,
+    alpha = alpha
+  )
+
+  # optionally drop the blank samples
+  if (exclude_blanks) {
+    data <- exclude_group(data, group)
+  }
+
+  data
+}
+
+
+# Single-batch WaveICA normalization of a feature table.
+#
+# `data` is the path to a CSV file holding the metadata columns sampleName,
+# class, sampleType and injectionOrder (a batch column is tolerated but not
+# required); every other column is treated as a feature. `wavelet_filter`
+# and `wavelet_length` select the wavelet via get_wf(); `k`, `alpha` and
+# `cutoff` are forwarded to recetox.waveica::waveica_nonbatchwise().
+# When `exclude_blanks` is TRUE, rows whose sampleType matches "blank" are
+# dropped from the result.
+# Returns the sorted table with the feature columns replaced by the
+# WaveICA output.
+waveica_singlebatch <- function(data,
+                                wavelet_filter,
+                                wavelet_length,
+                                k,
+                                alpha,
+                                cutoff,
+                                exclude_blanks) {
+
+  # get input from the Galaxy, preprocess data
+  data <- read.csv(data, header = TRUE)
+
+  required_columns <- c("sampleName", "class", "sampleType", "injectionOrder")
+  optional_columns <- c("batch")
+  verify_input_dataframe(data, required_columns)
+
+  data <- sort_by_injection_order(data)
+
+  feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)]
+  features <- data[, feature_columns]
+  injection_order <- data$injectionOrder
+  # The enumerated sample types are needed to locate blank rows below. They are
+  # derived after sorting so that their positions line up with the rows of
+  # `data`. Previously `group` was never defined in this function and
+  # exclude_blanks = TRUE failed with "object 'group' not found".
+  group <- enumerate_groups(as.character(data$sampleType))
+
+  # run WaveICA
+  features <- recetox.waveica::waveica_nonbatchwise(
+    data = features,
+    wf = get_wf(wavelet_filter, wavelet_length),
+    injection_order = injection_order,
+    K = k,
+    alpha = alpha,
+    cutoff = cutoff
+  )
+
+  data[, feature_columns] <- features
+
+  # remove blanks from dataset
+  if (exclude_blanks) {
+    data <- exclude_group(data, group)
+  }
+
+  return(data)
+}
+
+
+# Return `data` with its rows in ascending order of injectionOrder.
+# If a batch column is present, rows are ordered by batch first and by
+# injectionOrder within each batch.
+sort_by_injection_order <- function(data) {
+  has_batch <- "batch" %in% colnames(data)
+
+  row_order <- if (has_batch) {
+    order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE)
+  } else {
+    order(data[, "injectionOrder"], decreasing = FALSE)
+  }
+
+  data[row_order, ]
+}
+
+
+# Validate the input table before normalization.
+# Stops with an error if `data` contains any NA, or (checked second) if any
+# of `required_columns` is missing from its column names.
+verify_input_dataframe <- function(data, required_columns) {
+  if (anyNA(data)) {
+    stop("Error: dataframe cannot contain NULL values!
+Make sure that your dataframe does not contain empty cells")
+  }
+
+  has_all_metadata <- all(required_columns %in% colnames(data))
+  if (!has_all_metadata) {
+    stop("Error: missing metadata!
+Make sure that the following columns are present in your dataframe: ", paste(required_columns, collapse = ", "))
+  }
+}
+
+
+# Replace sample-type labels by numeric codes (case-insensitive substring
+# match): "blank" -> 0, "sample" -> 1, "qc" -> 2.
+# The patterns are applied in that order on the progressively updated vector,
+# so a label matching several patterns receives the first matching code.
+# Labels matching none of the patterns are returned unchanged.
+enumerate_groups <- function(group) {
+  codes <- c(blank = 0, sample = 1, qc = 2)
+
+  for (pattern in names(codes)) {
+    group[grepl(pattern, tolower(group))] <- codes[[pattern]]
+  }
+
+  group
+}
+
+
+# Build the wavelet filter name passed on as `wf` by concatenating the filter
+# family and the filter length (e.g. "la" and 8 give "la8").
+get_wf <- function(wavelet_filter, wavelet_length) {
+  filter_name <- paste0(wavelet_filter, wavelet_length)
+
+  # the one exception: "d2" is requested under the name "haar"
+  if (filter_name == "d2") "haar" else filter_name
+}
+
+
+# Drop the rows of `data` whose entry in `group` equals 0 (the code assigned
+# to blanks by enumerate_groups) and report the removal on stdout.
+# When no entry of `group` equals 0, `data` is returned untouched and nothing
+# is printed.
+exclude_group <- function(data, group) {
+  blank_rows <- which(group %in% 0)
+
+  if (length(blank_rows) == 0) {
+    return(data)
+  }
+
+  remaining <- data[-blank_rows, ]
+  cat("Blank samples have been excluded from the dataframe.\n")
+  remaining
+}
+
+
+# Write `data` to the file `output` as tab-separated text (header row kept,
+# no row names, no quoting) and print a completion message to stdout.
+store_data <- function(data, output) {
+  write.table(
+    data,
+    file = output,
+    sep = "\t",
+    quote = FALSE,
+    row.names = FALSE
+  )
+  cat("Normalization has been completed.\n")
+}