changeset 0:939c59ab61cf draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ancombc commit 045979180e44c683b5e0760f802af66c05abcae8
author iuc
date Sat, 16 Jul 2022 21:40:04 +0000
parents
children 6fbedba4d983
files ancombc.R ancombc.xml macros.xml test-data/input1.phyloseq
diffstat 4 files changed, 289 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ancombc.R	Sat Jul 16 21:40:04 2022 +0000
@@ -0,0 +1,85 @@
+#!/usr/bin/env Rscript
+
+suppressPackageStartupMessages(library("ANCOMBC"))
+suppressPackageStartupMessages(library("data.table"))
+suppressPackageStartupMessages(library("optparse"))
+
+option_list <- list(
+    make_option(c("--phyloseq"), action = "store", dest = "phyloseq", help = "File containing a phyloseq object"),
+    make_option(c("--formula"), action = "store", dest = "formula", help = "Formula"),
+    make_option(c("--p_adj_method"), action = "store", dest = "p_adj_method", help = "Method to adjust p-values"),
+    make_option(c("--zero_cut"), action = "store", dest = "zero_cut", type = "double", help = "Minimum taxa prevalence"),
+    make_option(c("--lib_cut"), action = "store", dest = "lib_cut", type = "integer", help = "Thrshold for filtering samples based on library sizes"),
+    make_option(c("--group"), action = "store", dest = "group", help = "Name of the group variable in the metadata"),
+    make_option(c("--struc_zero"), action = "store", dest = "struc_zero", help = "Detect structural zeros based on group"),
+    make_option(c("--neg_lb"), action = "store", dest = "neg_lb", help = "Classify a taxon as a structural zero using its asymptotic lower bound"),
+    make_option(c("--tol"), action = "store", dest = "tol", type = "double", help = "Iteration convergence tolerance for the E-M algorithm"),
+    make_option(c("--max_iter"), action = "store", dest = "max_iter", help = "Maximum number of iterations for the E-M algorithm"),
+    make_option(c("--conserve"), action = "store", dest = "conserve", help = "Use a conservative variance estimator for the test statistic"),
+    make_option(c("--alpha"), action = "store", dest = "alpha", help = "Level of significance"),
+    make_option(c("--global"), action = "store", dest = "global", help = "Perform global test"),
+    make_option(c("--output_dir"), action = "store", dest = "output_dir", help = "Output directory")
+)
+
+parser <- OptionParser(usage = "%prog [options] file", option_list = option_list)
+args <- parse_args(parser, positional_arguments = TRUE)
+opt <- args$options
+
+get_boolean_value <- function(val) {
+    if (val == "true") {
+        return(TRUE)
+    } else {
+        return(FALSE)
+    }
+}
+
+get_file_path <- function(dir, file_name) {
+    file_path <- paste(dir, file_name, sep = "/")
+    return(file_path)
+}
+
+write_data_frame <- function(dir, file_name, data_frame) {
+    file_path <- get_file_path(dir, file_name)
+    write.table(data_frame, file = file_path, quote = FALSE, row.names = TRUE, col.names = TRUE, sep = "\t")
+}
+
+# Convert boolean values to boolean.
+struc_zero <- get_boolean_value(opt$struc_zero)
+neg_lb <- get_boolean_value(opt$neg_lb)
+conserve <- get_boolean_value(opt$conserve)
+global <- get_boolean_value(opt$global)
+
+# Construct a phyloseq object.
+phyloseq_obj <- readRDS(opt$phyloseq)
+
+# Construct an ANCOM-BC object.
+ancombc_obj <- ancombc(phyloseq = phyloseq_obj,
+                       formula = opt$formula,
+                       p_adj_method = opt$p_adj_method,
+                       zero_cut = opt$zero_cut,
+                       lib_cut = opt$lib_cut,
+                       group = opt$group,
+                       struc_zero = struc_zero,
+                       neg_lb = neg_lb,
+                       tol = opt$tol,
+                       max_iter = opt$max_iter,
+                       conserve = conserve,
+                       alpha = opt$alpha,
+                       global = global)
+
+res <- ancombc_obj$res
+
+# Write the outputs.
+write_data_frame(opt$output_dir, "feature_table.tabular", ancombc_obj$feature_table)
+write_data_frame(opt$output_dir, "zero_ind.tabular", ancombc_obj$zero_ind)
+write.csv2(ancombc_obj$samp_frac, file = get_file_path(opt$output_dir, "samp_frac.tabular"), row.names = FALSE, col.names = FALSE, sep = "\t")
+write_data_frame(opt$output_dir, "resid.tabular", ancombc_obj$resid)
+write(ancombc_obj$delta_em, file = get_file_path(opt$output_dir, "delta_em.tabular"))
+write(ancombc_obj$delta_wls, file = get_file_path(opt$output_dir, "delta_wls.tabular"))
+write_data_frame(opt$output_dir, "res_beta.tabular", res$beta)
+write_data_frame(opt$output_dir, "res_se.tabular", res$se)
+write_data_frame(opt$output_dir, "res_W.tabular", res$W)
+write_data_frame(opt$output_dir, "res_p_val.tabular", res$p_val)
+write_data_frame(opt$output_dir, "res_q_val.tabular", res$q_val)
+write_data_frame(opt$output_dir, "res_diff_abn.tabular", res$diff_abn)
+write_data_frame(opt$output_dir, "res_global.tabular", ancombc_obj$res_global)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ancombc.xml	Sat Jul 16 21:40:04 2022 +0000
@@ -0,0 +1,175 @@
+<tool id="ancombc" name="ANCOM-BC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+  <description>differential abundance analysis</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+mkdir 'output_dir' &&
+
+Rscript '${__tool_directory__}/ancombc.R' 
+--phyloseq '$phyloseq'
+--formula '$formula'
+--p_adj_method '$p_adj_method'
+--zero_cut $zero_cut
+--lib_cut $lib_cut
+--group '$group'
+--struc_zero '$struc_zero'
+--neg_lb '$neg_lb'
+--tol $tol
+--max_iter $max_iter
+--conserve '$conserve'
+--alpha $alpha
+--global '$global'
+--output_dir 'output_dir'
+    ]]></command>
+    <inputs>
+        <param argument="--phyloseq" type="data" format="phyloseq" label="File containing a phyloseq object"/>
+        <param argument="--formula" type="text" value="" label="Formula" help="Expresses how the microbial absolute abundances for each taxon depend on the variables in the metadata">
+            <expand macro="sanitize_query"/>
+        </param>
+        <param argument="--p_adj_method" type="select" label="Select method to adjust p-values">
+            <option value="holm" selected="true">holm</option>
+            <option value="hochberg">hochberg</option>
+            <option value="hommel">hommel</option>
+            <option value="bonferroni">bonferroni</option>
+            <option value="BH">BH</option>
+            <option value="BY">BY</option>
+            <option value="fdr">fdr</option>
+            <option value="none">none</option>
+        </param>
+        <param argument="--zero_cut" type="float" value="0.1" min="0" max="1" label="Minimum taxa prevalence" help="Taxa with prevalences less than this will be excluded"/>
+        <param argument="--lib_cut" type="integer" value="0" min="0" label="Threshold for filtering samples based on library sizes" help="Samples with library sizes less than this will be excluded"/>
+        <param argument="--group" type="text" value="" label="Name of the group variable in the metadata" help="Group should be discrete, specifying group is required for detecting structural zeros and performing global test"/>
+        <param argument="--struc_zero" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Detect structural zeros based on group?"/>
+        <param argument="--neg_lb" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Classify a taxon as a structural zero using its asymptotic lower bound?"/>
+        <param argument="--tol" type="float" value="0.00001" min="0" label="Iteration convergence tolerance for the E-M algorithm"/>
+        <param argument="--max_iter" type="integer" value="100" min="1" label="Maximum number of iterations for the E-M algorithm"/>
+        <param argument="--conserve" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Use a conservative variance estimator for the test statistic?" help="Recommended if the sample size is small and/or the number of differentially abundant taxa is believed to be large"/>
+        <param argument="--alpha" type="float" value="0.05" min="0" label="Level of significance"/>
+        <param argument="--global" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Perform global test?"/>
+    </inputs>
+    <outputs>
+        <collection name="output_collection" type="list">
+            <discover_datasets pattern="__name_and_ext__" directory="output_dir"/>
+        </collection>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="phyloseq" value="input1.phyloseq" ftype="phyloseq"/>
+            <param name="formula" value="patient_status"/>
+            <param name="p_adj_method" value="fdr"/>
+            <param name="zero_cut" value="0.9"/>
+            <param name="group" value="patient_status"/>
+            <param name="struc_zero" value="true"/>
+            <param name="neg_lb" value="true"/>
+            <param name="conserve" value="true"/>
+            <param name="global" value="true"/>
+            <output_collection name="output_collection" type="list" count="13">
+                <element name="delta_em" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="0.2855054"/>
+                    </assert_contents>
+                </element>
+                <element name="delta_wls" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="0.2992838"/>
+                    </assert_contents>
+                </element>
+                <element name="feature_table" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="A110"/>
+                        <has_size value="4270" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="res_W" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="patient_statusControl"/>
+                        <has_size value="1364" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="res_beta" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="patient_statusControl"/>
+                        <has_size value="1369" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="res_diff_abn" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="FALSE"/>
+                        <has_size value="760" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="res_global" ftype="tabular">
+                    <assert_contents>
+                        <has_size value="1"/>
+                    </assert_contents>
+                </element>
+                <element name="res_p_val" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="patient_statusControl"/>
+                        <has_size value="1180" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="res_q_val" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="patient_statusControl"/>
+                        <has_size value="1155" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="res_se" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="patient_statusControl"/>
+                        <has_size value="1348" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="resid" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="A110"/>
+                        <has_size value="24241" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="samp_frac" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="x"/>
+                        <has_size value="501" delta="100"/>
+                    </assert_contents>
+                </element>
+                <element name="zero_ind" ftype="tabular">
+                    <assert_contents>
+                        <has_text text="structural_zero"/>
+                        <has_size value="1113" delta="100"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Performs a differential abundance analysis for microbiome data. Microbiome data are typically subject
+to two sources of biases: unequal sampling fractions (sample-specific biases) and differential sequencing
+efficiencies (taxon-specific biases). ANCOMBC package includes methodologies that aim to correct these
+biases and construct statistically consistent estimators.
+
+A detaset containing a phyloseq object is a required input.  The phyloseq object must consist of a feature table (microbial observed abundance table), a sample metadata, a taxonomy table (optional), and a phylogenetic tree (optional). The row names of the metadata must match the sample names of the feature table, and the row names of the taxonomy table must match the taxon (feature) names of the feature table.
+
+The tool produces a collection consisting of the following items.
+
+ * **feature_table** - a pre-processed (based on --zero_cut and --lib_cut) microbial observed abundance table
+ * **zero_ind** - a logical matrix with TRUE indicating the taxon is identified as a structural zero for the specified group variable
+ * **samp_frac** - a numeric vector of estimated sampling fractions in log scale (natural log) - if any sample contains missing values for any variable specified in the formula, the corresponding sampling fraction estimate for this sample will be NA since the sampling fraction is not estimable with the presence of missing values
+ * **resid** - a matrix of residuals from the ANCOM-BC log-linear (natural log) model - rows are taxa and columns are samples
+ * **delta_em** - estimated sample-specific biases through E-M algorithm
+ * **delta_wls** - estimated sample-specific biases through weighted least squares (WLS) algorithm
+ * **res_lfc** - a table of log fold changes obtained from the ANCOM-BC log-linear (natural log) model
+ * **res_se** - a table of standard errors (SEs) of lfc
+ * **res_W** - a table of test statistics. W = lfc/se
+ * **res_pval** - a table of p-values obtained from two-sided Z-test using the test statistic W
+ * **res_qval** - a table of adjusted p-values obtained by applying p_adj_method to p_val
+ * **res_diff_abn** - a table of logical values; TRUE if the taxon has q_val less than alpha
+ * **res_global** - a table containing the ANCOM-BC global test result for the variable specified in the group
+    </help>
+    <expand macro="citations"/>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Sat Jul 16 21:40:04 2022 +0000
@@ -0,0 +1,29 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.4.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.01</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">bioconductor-ancombc</requirement>
+            <requirement type="package" version="1.14.2">r-data.table</requirement>
+            <requirement type="package" version="1.7.1">r-optparse</requirement>
+        </requirements>
+    </xml>
+    <xml name="sanitize_query" token_validinitial="string.printable">
+        <sanitizer>
+            <valid initial="@VALIDINITIAL@">
+                <remove value="&apos;"/>
+            </valid>
+            <mapping initial="none">
+                <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;"/>
+            </mapping>
+        </sanitizer>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1038/s41467-020-17041-7</citation>
+            <citation type="doi">10.3402/mehd.v26.27663</citation>
+        </citations>
+    </xml>
+</macros>
+
Binary file test-data/input1.phyloseq has changed