Mercurial > repos > iuc > micro_decon

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/decon.R	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,121 @@
+suppressPackageStartupMessages(library(microDecon))
+suppressPackageStartupMessages(library(optparse))
+
+# Command-line front end for microDecon: runs decon(), remove.cont(),
+# remove.thresh() or decon.diff() on a CSV table and writes the resulting
+# tables as CSV files into the current working directory.
+
+# Define command-line options
+option_list <- list(
+    make_option(c("-m", "--mode"), type = "character", help = "Mode of operation: decon, remove_cont, remove_thresh, decon_diff (the dotted function names, e.g. remove.cont, are accepted too)", metavar = "MODE"),
+    make_option(c("-d", "--data_file"), type = "character", help = "Path to the data file (CSV format expected)", metavar = "FILE"),
+    make_option(c("-o", "--output"), type = "character", default = NULL, help = "Output file from remove.cont or remove.thresh (used in decon.diff)", metavar = "FILE"),
+    make_option(c("-b", "--numb_blanks"), type = "integer", default = NULL, help = "Number of blank samples"),
+    make_option(c("-n", "--numb_ind"), type = "character", default = NULL, help = "Number of individuals per group as a comma-separated list, e.g. 3,2 or c(3,2)"),
+    make_option(c("-t", "--taxa"), type = "logical", default = FALSE, help = "Taxa flag (TRUE or FALSE)"),
+    make_option(c("-r", "--runs"), type = "integer", default = NULL, help = "Number of runs"),
+    make_option(c("-T", "--thresh"), type = "double", default = NULL, help = "Threshold value"),
+    make_option(c("-p", "--prop_thresh"), type = "double", default = NULL, help = "Proportional threshold value"),
+    make_option(c("-g", "--regression"), type = "double", default = NULL, help = "Regression value"),
+    make_option(c("-l", "--low_threshold"), type = "double", default = 40, help = "Low threshold [default: %default]"),
+    make_option(c("-u", "--up_threshold"), type = "double", default = 400, help = "Upper threshold [default: %default]")
+)
+
+# Parse arguments
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# Fail early with a readable message instead of an obscure error further down.
+if (is.null(opt$mode) || is.null(opt$data_file)) {
+    stop("--mode and --data_file must both be provided", call. = FALSE)
+}
+
+# Modes are spelled with underscores; dots are folded to underscores so the
+# microDecon function names themselves (e.g. "remove.cont") work as well.
+run_mode <- gsub(".", "_", opt$mode, fixed = TRUE)
+valid_modes <- c("decon", "remove_cont", "remove_thresh", "decon_diff")
+if (!run_mode %in% valid_modes) {
+    stop("Unknown --mode '", opt$mode, "'; expected one of: ", paste(valid_modes, collapse = ", "), call. = FALSE)
+}
+
+# Convert the --numb_ind text into a numeric vector of group sizes.
+#
+# Accepts "3,2" as well as the "c(3,2)" spelling. The text is validated and
+# converted directly instead of being run through eval(parse()), so a
+# command-line value can never execute arbitrary R code.
+parse_group_sizes <- function(spec) {
+    if (is.null(spec)) {
+        stop("--numb_ind must be provided for mode '", run_mode, "'", call. = FALSE)
+    }
+    # Drop whitespace, then unwrap an enclosing c( ... ) if there is one.
+    sizes_text <- sub("^c\\((.*)\\)$", "\\1", gsub("\\s", "", spec))
+    if (!grepl("^[0-9]+(,[0-9]+)*$", sizes_text)) {
+        stop("--numb_ind must be a comma-separated list of integers such as 3,2 (got '", spec, "')", call. = FALSE)
+    }
+    as.numeric(strsplit(sizes_text, ",", fixed = TRUE)[[1]])
+}
+
+# Write the five tables read from a decon() / decon.diff() result list.
+write_summary_tables <- function(result) {
+    write.csv(result$decon.table, "decon_table.csv", row.names = FALSE)
+    write.csv(result$reads.removed, "reads_removed.csv", row.names = FALSE)
+    write.csv(result$sum.per.group, "difference_sum.csv", row.names = FALSE)
+    write.csv(result$mean.per.group, "difference_mean.csv", row.names = FALSE)
+    write.csv(result$OTUs.removed, "OTUs_removed.csv", row.names = FALSE)
+}
+
+# remove_cont is the only mode that does not take group sizes; parse them up
+# front so a malformed value is reported before any computation starts.
+group_sizes <- if (run_mode != "remove_cont") parse_group_sizes(opt$numb_ind)
+
+# Read main input data
+microbe_data <- read.csv(opt$data_file, header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
+
+if (run_mode == "decon") {
+    result <- decon(
+        data = microbe_data,
+        numb.blanks = opt$numb_blanks,
+        numb.ind = group_sizes,
+        taxa = opt$taxa,
+        runs = opt$runs,
+        thresh = opt$thresh,
+        prop.thresh = opt$prop_thresh,
+        regression = opt$regression,
+        low.threshold = opt$low_threshold,
+        up.threshold = opt$up_threshold
+    )
+    write_summary_tables(result)
+} else if (run_mode == "remove_cont") {
+    result <- remove.cont(
+        data = microbe_data,
+        numb.blanks = opt$numb_blanks,
+        taxa = opt$taxa,
+        runs = opt$runs,
+        regression = opt$regression,
+        low.threshold = opt$low_threshold,
+        up.threshold = opt$up_threshold
+    )
+    write.csv(result, "decon_table.csv", row.names = FALSE)
+} else if (run_mode == "remove_thresh") {
+    result <- remove.thresh(
+        data = microbe_data,
+        numb.ind = group_sizes,
+        taxa = opt$taxa,
+        thresh = opt$thresh,
+        prop.thresh = opt$prop_thresh
+    )
+    write.csv(result, "decon_table.csv", row.names = FALSE)
+} else if (run_mode == "decon_diff") {
+    # decon.diff compares the original table with an already decontaminated one.
+    if (is.null(opt$output)) {
+        stop("--output must be provided for decon_diff mode", call. = FALSE)
+    }
+    output_data <- read.csv(opt$output, header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
+    result <- decon.diff(
+        data = microbe_data,
+        output = output_data,
+        numb.blanks = opt$numb_blanks,
+        numb.ind = group_sizes,
+        taxa = opt$taxa
+    )
+    write_summary_tables(result)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,41 @@
+<macros>
+    <!-- Tokens and parameter macros shared with micro_decon.xml, which pulls them in through its import and expand elements. -->
+    <token name="@TOOL_VERSION@">1.0.2</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@LICENSE@">GPL-2.0-or-later</token>
+    <xml name="requirements">
+        <requirements>
+            <container type="docker">sebimer/samba-v4-qiime2:2024.2</container>
+        </requirements>
+    </xml>
+    <!-- Parameter groups: each mode of the tool expands only the groups whose values it forwards to decon.R. -->
+    <xml name="numb_blanks_arg">
+        <param argument="--numb_blanks" type="integer" label="Number of blank samples" value="1" min="0" help="Specifies the number of blanks included in the data set"/>
+    </xml>
+    <!-- The validator only lets through comma-separated integers of at least 1 without leading zeros, as its message promises; micro_decon.xml wraps the value in c(...) for decon.R. -->
+    <xml name="numb_ind_arg">
+        <param argument="--numb_ind" type="text" label="Number of individuals per group" optional="false" help="A comma-separated vector (e.g. 3,2) indicating the number of individual samples in each group">
+            <validator type="regex" message="Enter a comma-separated list of positive integers (e.g., 3,2)">^[1-9]\d*(,[1-9]\d*)*$</validator>
+        </param>
+    </xml>
+    <xml name="runs_and_regression_args">
+        <param argument="--runs" type="integer" label="Number of runs" value="2" min="1" max="10" help="Specifies the number of times that the function should run the decontamination procedure on the data."/>
+        <param argument="--regression" type="integer" label="Regression" value="0" min="0" max="2" help="Specifies the regression equation used to calculate the constant. 0 = it chooses between regression 1 and regression 2 based on the lower threshold and upper threshold arguments. 1 = it always uses regression 1. 2 = it always uses regression 2"/>
+        <param argument="--low_threshold" type="integer" label="Lower threshold" value="40" min="0" help="Selects the lower point for switching between regression 1 and regression 2. It is usually best not to change this value"/>
+        <param argument="--up_threshold" type="integer" label="Upper threshold" value="400" min="0" help="Selects the higher point for switching between regression 1 and regression 2. It is usually best not to change this value"/>
+    </xml>
+    <xml name="thresh_args">
+        <param argument="--thresh" type="float" label="Zero proportion threshold" value="0.7" min="0" max="1" help="This is the threshold at which if that proportion of 0s are present for an OTU within a group, all samples will be set to 0 for that OTU for that group"/>
+        <param argument="--prop_thresh" type="float" label="Proportional threshold" value="0.00005" min="0" max="1" help="This is the threshold at which if the number of reads for a particular OTU are below this proportion, the OTU will be set to zero for all individuals in that group"/>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1002/edn3.11</citation>
+        </citations>
+    </xml>
+    <xml name="creators">
+        <creator>
+            <person givenName="Rand" familyName="Zoabi" url="https://github.com/RZ9082" identifier="https://orcid.org/0009-0000-2501-8053"/>
+        </creator>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/micro_decon.xml	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,170 @@
+<tool name="micro_decon" id="micro_decon" version="@TOOL_VERSION@+@VERSION_SUFFIX@" profile="24.2" license="@LICENSE@">
+    <description>removing contamination from metabarcoding</description>
+    <!-- Galaxy wrapper around decon.R: each job runs the one microDecon function picked in the function_cond conditional. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="creators"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Each option is emitted only for the modes whose parameters include it.
+        Rscript '$__tool_directory__/decon.R'
+        --mode '$function_cond.function_selector'
+        --data_file '$input_table'
+        #set mode = $function_cond.function_selector
+        #if $mode != 'remove_thresh':
+            --numb_blanks '$function_cond.numb_blanks'
+        #end if
+
+        ## The group sizes reach decon.R wrapped as an R vector literal, e.g. c(3,2).
+        #if $mode != 'remove_cont':
+            --numb_ind 'c($function_cond.numb_ind)'
+        #end if
+
+        #if $mode in ['decon', 'remove_cont']:
+            --runs '$function_cond.runs'
+            --regression '$function_cond.regression'
+            --low_threshold '$function_cond.low_threshold'
+            --up_threshold '$function_cond.up_threshold'
+        #end if
+
+        #if $mode in ['decon', 'remove_thresh']:
+            --thresh '$function_cond.thresh'
+            --prop_thresh '$function_cond.prop_thresh'
+        #end if
+
+        #if $mode == 'decon_diff':
+            --output '$function_cond.output'
+        #end if
+        --taxa '$taxa'
+    ]]></command>
+    <!-- Both tables are read by decon.R with read.csv, so only comma-separated datasets are offered. -->
+    <inputs>
+        <param name="input_table" type="data" format="csv" label="Input table"/>
+        <param argument="--taxa" type="boolean" label="Does the input table contain a taxonomy column?" checked="true" truevalue="T" falsevalue="F"/>
+        <conditional name="function_cond">
+            <param name="function_selector" type="select" label="Select the microDecon function to apply">
+                <option value="decon">decon</option>
+                <option value="remove_cont">remove.cont</option>
+                <option value="remove_thresh">remove.thresh</option>
+                <option value="decon_diff">decon.diff</option>
+            </param>
+            <when value="decon">
+                <expand macro="numb_blanks_arg"/>
+                <expand macro="numb_ind_arg"/>
+                <expand macro="runs_and_regression_args"/>
+                <expand macro="thresh_args"/>
+            </when>
+            <when value="remove_cont">
+                <expand macro="numb_blanks_arg"/>
+                <expand macro="runs_and_regression_args"/>
+            </when>
+            <when value="remove_thresh">
+                <expand macro="numb_ind_arg"/>
+                <expand macro="thresh_args"/>
+            </when>
+            <when value="decon_diff">
+                <param argument="--output" type="data" label="The output table of remove.cont or remove.thresh" format="csv"/>
+                <expand macro="numb_blanks_arg"/>
+                <expand macro="numb_ind_arg"/>
+            </when>
+        </conditional>
+    </inputs>
+    <!-- decon_table is collected for every mode; the four summary tables are filtered out for remove_cont and remove_thresh, which only write decon_table.csv. -->
+    <outputs>
+        <data name="decon_table" format="csv" label="${function_cond.function_selector} on ${on_string}: Decontaminated table" from_work_dir="decon_table.csv"/>
+        <data name="reads_removed" format="csv" label="${function_cond.function_selector} on ${on_string}: Reads removed" from_work_dir="reads_removed.csv">
+            <filter>function_cond['function_selector'] != "remove_cont" and function_cond['function_selector'] != "remove_thresh"</filter>
+        </data>
+        <data name="difference_sum" format="csv" label="${function_cond.function_selector} on ${on_string}: Difference sum" from_work_dir="difference_sum.csv">
+            <filter>function_cond['function_selector'] != "remove_cont" and function_cond['function_selector'] != "remove_thresh"</filter>
+        </data>
+        <data name="difference_mean" format="csv" label="${function_cond.function_selector} on ${on_string}: Difference mean" from_work_dir="difference_mean.csv">
+            <filter>function_cond['function_selector'] != "remove_cont" and function_cond['function_selector'] != "remove_thresh"</filter>
+        </data>
+        <data name="otus_removed" format="csv" label="${function_cond.function_selector} on ${on_string}: OTUs removed" from_work_dir="OTUs_removed.csv">
+            <filter>function_cond['function_selector'] != "remove_cont" and function_cond['function_selector'] != "remove_thresh"</filter>
+        </data>
+    </outputs>
+    <!-- One test per mode; the remove_thresh and decon_diff tests take the expected output files of the preceding tests as their inputs. -->
+    <tests>
+        <test expect_num_outputs="5">
+            <param name="input_table" value="input.csv"/>
+            <param name="taxa" value="true"/>
+            <conditional name="function_cond">
+                <param name="function_selector" value="decon"/>
+                <param name="numb_ind" value="3,2"/>
+            </conditional>
+            <output name="decon_table" file="decon_table.csv"/>
+            <output name="reads_removed" file="reads_removed.csv"/>
+            <output name="difference_sum" file="difference_sum.csv"/>
+            <output name="difference_mean" file="difference_mean.csv"/>
+            <output name="otus_removed" file="otus_removed.csv"/>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_table" value="input.csv"/>
+            <param name="taxa" value="true"/>
+            <conditional name="function_cond">
+                <param name="function_selector" value="remove_cont"/>
+            </conditional>
+            <output name="decon_table" file="remove_cont_decon_table.csv"/>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_table" value="remove_cont_decon_table.csv"/>
+            <param name="taxa" value="true"/>
+            <conditional name="function_cond">
+                <param name="function_selector" value="remove_thresh"/>
+                <param name="numb_ind" value="3,2"/>
+            </conditional>
+            <output name="decon_table" file="remove_thresh_decon_table.csv"/>
+        </test>
+        <test expect_num_outputs="5">
+            <param name="input_table" value="input.csv"/>
+            <param name="taxa" value="true"/>
+            <conditional name="function_cond">
+                <param name="function_selector" value="decon_diff"/>
+                <param name="numb_ind" value="3,2"/>
+                <param name="output" value="remove_thresh_decon_table.csv"/>
+            </conditional>
+            <output name="decon_table" file="decon_table.csv"/>
+            <output name="reads_removed" file="reads_removed.csv"/>
+            <output name="difference_sum" file="difference_sum.csv"/>
+            <output name="difference_mean" file="difference_mean.csv"/>
+            <output name="otus_removed" file="otus_removed.csv"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+microDecon
+==========
+The microDecon package is designed to remove contaminant reads from
+metabarcoding studies (e.g., from bacterial contamination in reagents during a microbiome
+study).
+
+Input
+------
+A table of metabarcoding data structured as a data
+frame where each row is an OTU (or ASV or other metabarcoding output), each column is an
+individual sample, and each cell contains the number of reads for a given OTU for a given
+individual. Additionally, the first column should contain OTU IDs (these can be numeric or
+characters), and the last column should (optionally) contain taxonomic information.
+
+Functions
+---------
+* **decon**: This is a wrapper function for all other microDecon functions. It first decontaminates the data using remove.cont(), then it runs remove.thresh(), then it runs decon.diff().
+* **remove.cont**: This is the primary function for removing contamination. It outputs a single OTU table of decontaminated results (OTUs that were entirely contamination are still included as rows of 0s).
+* **remove.thresh**: This function removes residual contamination in the output from remove.cont().
+* **decon.diff**: This function takes the output of either remove.cont() or remove.thresh() as well as your original, contaminated data, and it returns summary statistics of how many OTUs were removed.
+
+Outputs
+--------
+* **Decontaminated table**: A table of decontaminated OTU data.
+* **Reads removed**: An OTU table showing the number of removed reads from each OTU.
+* **Difference sum**: The total number of removed reads from each OTU that amplified in the blank.
+* **Difference mean**: The average number of removed reads from each OTU that amplified in the blank.
+* **OTUs removed**: A table including the OTU identifiers which were completely removed from either particular groups or the entire data set.
+
+For further information visit the complete `docs <https://github.com/donaldtmcknight/microDecon/blob/master/microDecon%20user's%20guide%201.0.2%20updated.pdf>`_
+
+]]></help>
+    <expand macro="citations"/>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/decon_table.csv	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,6 @@
+"OTU_ID","Blank1","Blank2","Blank3","Pop1_Sample1","Pop1_Sample2","Pop2_Sample3","Taxa"
+"OTU1",0,0,0,60,64,40,"K_Bacteria; P_Actinobacteria"
+"OTU2",200,100,0,380,328,340,"K_Bacteria; P_Proteobacteria"
+"OTU3",1000,200,80,40,40,0,"K_Bacteria; P_Proteobacteria"
+"OTU4",50,0,9,0,0,0,"K_Bacteria; P_Bacteroidetes"
+"OTU5",0,0,0,2400,1900,2100,"K_Bacteria"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/difference_mean.csv	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,5 @@
+"OTU_ID","Blank1","All.groups","Group1","Group2","Taxa"
+"OTU2",200,182.4,193.333333333333,166,"K_Bacteria; P_Proteobacteria"
+"OTU3",1000,976,1073.33333333333,830,"K_Bacteria; P_Proteobacteria"
+"OTU4",50,48.8,53.6666666666667,41.5,"K_Bacteria; P_Bacteroidetes"
+"OTU6",25,21,23.3333333333333,17.5,"K_Bacteria"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/difference_sum.csv	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,5 @@
+"OTU_ID","Blank1","All.groups","Group1","Group2","Taxa"
+"OTU2",200,912,580,332,"K_Bacteria; P_Proteobacteria"
+"OTU3",1000,4880,3220,1660,"K_Bacteria; P_Proteobacteria"
+"OTU4",50,244,161,83,"K_Bacteria; P_Bacteroidetes"
+"OTU6",25,105,70,35,"K_Bacteria"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.csv	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,7 @@
+"OTU_ID","Blank1","Blank2","Blank3","Pop1_Sample1","Pop1_Sample2","Pop2_Sample3","Taxa"
+"OTU1",0,0,0,60,64,40,"K_Bacteria; P_Actinobacteria"
+"OTU2",200,220,180,660,520,480,"K_Bacteria; P_Proteobacteria"
+"OTU3",1000,800,1300,1440,1000,700,"K_Bacteria; P_Proteobacteria"
+"OTU4",50,30,70,70,48,35,"K_Bacteria; P_Bacteroidetes"
+"OTU5",0,0,0,2400,1900,2100,"K_Bacteria"
+"OTU6",25,10,30,30,20,15,"K_Bacteria"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/otus_removed.csv	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,3 @@
+"OTU_ID","Blank1","All.groups","Group1","Group2","Taxa"
+"OTU4",50,"-","-","Totally.removed","K_Bacteria; P_Bacteroidetes"
+"OTU6",25,"Totally.removed","Totally.removed","Totally.removed","K_Bacteria"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reads_removed.csv	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,5 @@
+"OTU_ID","Blank1","Blank2","Blank3","Pop1_Sample1","Pop1_Sample2","Pop2_Sample3","Taxa"
+"OTU2",200,120,180,280,192,140,"K_Bacteria; P_Proteobacteria"
+"OTU3",1000,600,1220,1400,960,700,"K_Bacteria; P_Proteobacteria"
+"OTU4",50,30,61,70,48,35,"K_Bacteria; P_Bacteroidetes"
+"OTU6",25,10,30,30,20,15,"K_Bacteria"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/remove_cont_decon_table.csv	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,7 @@
+"OTU_ID","Blank1","Blank2","Blank3","Pop1_Sample1","Pop1_Sample2","Pop2_Sample3","Taxa"
+"OTU1",0,0,0,60,64,40,"K_Bacteria; P_Actinobacteria"
+"OTU2",200,100,0,380,328,340,"K_Bacteria; P_Proteobacteria"
+"OTU3",1000,200,80,40,40,0,"K_Bacteria; P_Proteobacteria"
+"OTU4",50,0,9,0,0,0,"K_Bacteria; P_Bacteroidetes"
+"OTU5",0,0,0,2400,1900,2100,"K_Bacteria"
+"OTU6",25,0,0,0,0,0,"K_Bacteria"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/remove_thresh_decon_table.csv	Wed Aug 06 08:38:24 2025 +0000
@@ -0,0 +1,7 @@
+"OTU_ID","Blank1","Blank2","Blank3","Pop1_Sample1","Pop1_Sample2","Pop2_Sample3","Taxa"
+"OTU1",0,0,0,60,64,40,"K_Bacteria; P_Actinobacteria"
+"OTU2",200,100,0,380,328,340,"K_Bacteria; P_Proteobacteria"
+"OTU3",1000,200,80,40,40,0,"K_Bacteria; P_Proteobacteria"
+"OTU4",50,0,9,0,0,0,"K_Bacteria; P_Bacteroidetes"
+"OTU5",0,0,0,2400,1900,2100,"K_Bacteria"
+"OTU6",25,0,0,0,0,0,"K_Bacteria"