changeset 6:8855361fcfc5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dropletutils/ commit ed0625fe59342d14a08745996e3e32c6f922a738"
author iuc
date Thu, 10 Dec 2020 13:50:06 +0000
parents cdf4443d5625
children 2c1200fba922
files dropletutils.xml scripts/dropletutils.Rscript test-data/defs_defaultdrops.h5 test-data/defs_emptydrops_150_0002.h5 test-data/defs_emptydrops_150_0002.png test-data/defs_emptydrops_150_0002a.h5 test-data/defs_emptydrops_150_0002a.png
diffstat 7 files changed, 132 insertions(+), 99 deletions(-) [+]
line wrap: on
line diff
--- a/dropletutils.xml	Wed Jan 29 15:07:38 2020 -0500
+++ b/dropletutils.xml	Thu Dec 10 13:50:06 2020 +0000
@@ -1,9 +1,22 @@
 <?xml version="1.0" encoding="utf-8"?>
-<tool id="dropletutils" name="DropletUtils" version="@PACKAGE_VERSION@+@GALAXY_VERSION@" >
+<tool id="dropletutils" name="DropletUtils" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" >
     <description>Utilities for handling droplet-based single-cell RNA-seq data</description>
+    <xrefs>
+        <xref type="bio.tools">dropletutils</xref>
+    </xrefs>
+    <edam_topics>
+        <edam_topic>topic_0203</edam_topic>
+        <edam_topic>topic_3168</edam_topic>
+        <edam_topic>topic_3170</edam_topic>
+        <edam_topic>topic_3308</edam_topic>
+    </edam_topics>
+    <edam_operations>
+        <edam_operation>operation_1812</edam_operation>
+        <edam_operation>operation_3200</edam_operation>
+    </edam_operations>
     <macros>
-        <token name="@PACKAGE_VERSION@" >1.2.1</token>
-        <token name="@GALAXY_VERSION@" >galaxy6</token>
+        <token name="@TOOL_VERSION@">1.10.0</token>
+        <token name="@VERSION_SUFFIX@">0</token>
         <token name="@TXIN@">tenx.input</token>
         <token name="@TXOUT@">tenx.output</token>
         <xml name="test_dirin" >
@@ -16,9 +29,10 @@
         </xml>
     </macros>
     <requirements>
-        <requirement type="package" version="@PACKAGE_VERSION@">bioconductor-dropletutils</requirement>
-        <requirement type="package" version="1.2_17" >r-matrix</requirement>
-        <requirement type="package" version="1.10.1" >bioconductor-scater</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">bioconductor-dropletutils</requirement>
+        <requirement type="package" version="1.18.0" >bioconductor-scater</requirement>
+        <requirement type="package" version="4.0">r-base</requirement>
+        <requirement type="package" version="1.2_18">r-matrix</requirement>
         <requirement type="package" version="1">fonts-conda-ecosystem</requirement>
     </requirements>
     <version_command><![CDATA[
@@ -45,14 +59,14 @@
     <configfiles>
         <configfile name="droplet_conf" >
 ## defaults
-empty.fdr_threshold = 0.01
-eparams=formals(emptyDrops)
-dparams=formals(defaultDrops)
-bparams=formals(barcodeRanks)
+empty_fdr_threshold = 0.01
+eparams = formals(emptyDrops)
+dparams = formals(defaultDrops)
+bparams = formals(barcodeRanks)
 
 ## File params
-in.type='$tenx_format.use'
-out.type=NULL
+intype='$tenx_format.use'
+outtype=NULL
 
 files=list()
 files\$table='$table'
@@ -74,10 +88,10 @@
     #else if str($operation.method.use) == 'emptydrops':
 do.method="emptyDrops"
 eparams\$lower=as.integer('$operation.method.lower')
-empty.fdr_threshold=as.numeric('$operation.method.fdr_thresh')
+empty_fdr_threshold=as.numeric('$operation.method.fdr_thresh')
     #end if
 
-out.type='$operation.outformat'
+outtype='$operation.outformat'
     #if str($operation.outformat) == 'directory':
 files\$out='@TXOUT@'
     #else if str($operation.outformat) == 'h5':
@@ -172,8 +186,8 @@
     </outputs>
     <tests>
         <!-- Directory input tests -->
-        <!-- ::: Default Drops -->
         <test expect_num_outputs="1">
+            <!-- ::: Default Drops -->
             <expand macro="test_dirin" />
             <conditional name="operation">
                 <param name="use" value="filter" />
@@ -196,17 +210,17 @@
                 </assert_contents>
             </output>
         </test>
-        <!-- :: Barcode Ranks -->
         <test expect_num_outputs="1">
+            <!-- :: Barcode Ranks -->
             <expand macro="test_dirin" />
             <conditional name="operation">
                 <param name="use" value="barcode_rank" />
                 <param name="lower" value="120" />
             </conditional>
-            <output name="plot" value="defs_barcoderankings.png" compare="sim_size" delta="400"/>
+            <output name="plot" value="defs_barcoderankings.png" compare="sim_size" delta="600"/>
         </test>
-        <!-- ::: Empty Drops -->
         <test expect_num_outputs="3">
+            <!-- ::: Empty Drops -->
             <expand macro="test_dirin" />
             <conditional name="operation">
                 <param name="use" value="filter" />
@@ -221,16 +235,16 @@
             <output name="table" >
                 <assert_contents>
                     <has_n_columns n="9" />
-                    <has_line_matching expression="^\sbar.names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis.Cell\sis.CellAndLimited" />
-                    <has_line_matching expression="^994\sGGCATTACAA\s338\s-246.922772388055\s9.99900009999e-05\sTRUE\s9.99900009999e-05\sTRUE\sTRUE" />
-                    <has_line_matching expression="^998\sCATGAAGCAA\s151\s-166.644236503983\s9.99900009999e-05\sTRUE\s9.99900009999e-05\sTRUE\sTRUE" />
+                    <has_line_matching expression="^\sbar_names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis_cell\sis_cellandlimited" />
+                    <has_line_matching expression="^994\sGGCATTACAA\s338\s-246\.(.*TRUE){3}$" />
+                    <has_line_matching expression="^998\sCATGAAGCAA\s151\s-166\.(.*TRUE){3}$" />
                 </assert_contents>
             </output>
             <output name="plot" value="defs_emptydrops_150_0002.png" compare="sim_size" delta="400" />
         </test>
         <!-- Other format input tests -->
-        <!-- ::: Empty Drops, same as above but input is h5 -->
         <test expect_num_outputs="3">
+            <!-- ::: Empty Drops, same as above but input is h5 -->
             <conditional name="tenx_format" >
                 <param name="use" value="h5" />
                 <param name="input" value="in_matrix.h5" />
@@ -248,9 +262,9 @@
             <output name="table" >
                 <assert_contents>
                     <has_n_columns n="9" />
-                    <has_line_matching expression="^\sbar.names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis.Cell\sis.CellAndLimited" />
-                    <has_line_matching expression="^1100\sCCGGAAGCAA\s169\s-198.117943099773\s9.99900009999e-05\sTRUE\s0.000126279506880773\sTRUE\sTRUE" />
-                    <has_line_matching expression="^1114\sTCCGAAGCAA\s182\s-196.181449214729\s9.99900009999e-05\sTRUE\s0.000126279506880773\sTRUE\sTRUE" />
+                    <has_line_matching expression="^\sbar_names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis_cell\sis_cellandlimited" />
+                    <has_line_matching expression="^1100\sCCGGAAGCAA\s169\s-198\.(.*TRUE){3}$" />
+                    <has_line_matching expression="^1114\sTCCGAAGCAA\s182\s-196\.(.*TRUE){3}$" />
                 </assert_contents>
             </output>
             <output name="plot" value="defs_emptydrops_150_0002a.png" compare="sim_size" delta="400" />
--- a/scripts/dropletutils.Rscript	Wed Jan 29 15:07:38 2020 -0500
+++ b/scripts/dropletutils.Rscript	Thu Dec 10 13:50:06 2020 +0000
@@ -1,6 +1,6 @@
 ## Load in data
-args = commandArgs(trailingOnly = T)
-if (length(args) != 1){
+args <- commandArgs(trailingOnly = T)
+if (length(args) != 1) {
     stop("Please provide the config file")
 }
 
@@ -11,57 +11,60 @@
 source(args[1])
 
 ## Helper functions
-setSparse <- function(obj){
+set_sparse <- function(obj) {
     return(as(obj, "dgCMatrix"))
 }
 
-writeTSV <- function(fileout, obj){
-    write.table(as.matrix(obj), file=fileout, col.names=NA, sep='\t', quote=FALSE)
+write_tsv <- function(fileout, obj) {
+    write.table(as.matrix(obj), file = fileout,
+                col.names = NA, sep = "\t", quote = FALSE)
 }
 
-determineGeneIDs <- function(object){
-    if (!is.null(rowData(object)$Symbol)){
+determine_geneids <- function(object) {
+    if (!is.null(rowData(object)$Symbol)) {
         return(rowData(object)$Symbol)
     }
     return(rownames(object))
 }
 
-getCounts <- function(object){
-    return(Matrix(counts(object), sparse=TRUE))
+get_counts <- function(object) {
+    return(Matrix(counts(object), sparse = TRUE))
 }
 
-writeOut <- function(object, fileout, typeout){
-    if (typeout == "tsv"){
-        writeTSV(fileout, getCounts(object))
+write_out <- function(object, fileout, typeout) {
+    if (typeout == "tsv") {
+        write_tsv(fileout, get_counts(object))
     }
-    else if (typeout == "h5"){
-        write10xCounts(fileout, getCounts(object),
-                       type="HDF5",
-                       gene.symbol=determineGeneIDs(object),
-                       overwrite=TRUE)
+    else if (typeout == "h5") {
+        write10xCounts(fileout, get_counts(object),
+                       type = "HDF5",
+                       gene.symbol = determine_geneids(object),
+                       overwrite = TRUE)
     }
-    else if (typeout == "directory"){
-        write10xCounts(fileout, getCounts(object),
-                       type="sparse",
-                       gene.symbol=determineGeneIDs(object),
-                       overwrite=TRUE)
+    else if (typeout == "directory") {
+        write10xCounts(fileout, get_counts(object),
+                       type = "sparse",
+                       gene.symbol = determine_geneids(object),
+                       overwrite = TRUE)
     }
 }
 
-read10xFiles <- function(filein, typein){
+read_10x_files <- function(filein, typein) {
     sce <- NULL
-    if (typein == "tsv"){
+    if (typein == "tsv") {
         ## Exploding memory problems occured here
         ## - solution is to use the readSparseCounts function from scater
-        sce <- SingleCellExperiment(assays = list(counts = readSparseCounts(filein)))
-    }
-    else if (typein == "h5"){
-        sce <- read10xCounts(filein, col.names=T, type="HDF5")   # use barcodes.tsv as column names
+        sce <- SingleCellExperiment(assays =
+                                        list(counts = readSparseCounts(filein)))
     }
-    else if (typein == "directory"){
-        sce <- read10xCounts(filein, col.names=T, type="sparse")
+    else if (typein == "h5") {
+         # use barcodes.tsv as column names
+        sce <- read10xCounts(filein, col.names = T, type = "HDF5")
     }
-    counts(sce) <- setSparse(counts(sce))
+    else if (typein == "directory") {
+        sce <- read10xCounts(filein, col.names = T, type = "sparse")
+    }
+    counts(sce) <- set_sparse(counts(sce))
     return(sce)
 }
 
@@ -69,97 +72,113 @@
 ## Methods
 
 
-doEmptyDrops <- function(files, eparams, in.type="directory", out.type="h5", fdr_threshold = 0.01){
-    sce <- read10xFiles(files$infile, in.type)
+do_empty_drops <- function(files, eparams, intype = "directory",
+                         outtype = "h5", fdr_threshold  =  0.01) {
+    sce <- read_10x_files(files$infile, intype)
+
+    eparams$... <- NULL ## hack to remove other parameters from being
+    eparams$m <- Matrix(counts(sce), sparse = TRUE)
+
+    ## Determine sensible lowerbound
+    m_stats <- summary(colSums(counts(sce)))
+    print("Cell Library Size Distribution:")
+    print(m_stats)
 
-    eparams$... <- NULL ## hack
-    eparams$m = Matrix(counts(sce), sparse=TRUE)
+    if (m_stats["Min."] > eparams$lower) {
+        print(paste0("CAUTION: Min. Lib. Size (", m_stats["Min."]
+                      , ") < requested lowerbound (", eparams$lower, ")"))
+        message(paste0("Setting lowerbound to Mean: ", m_stats["Mean"]))
+        eparams$lower <- m_stats["Mean"]
+    }
 
-    e.out <- do.call(emptyDrops, c(eparams))
+    e_out <- do.call(emptyDrops, c(eparams))
 
-    bar.names <- colnames(sce)
-    if (length(bar.names) != nrow(e.out)){
+    bar_names <- colnames(sce)
+    if (length(bar_names) != nrow(e_out)) {
         stop("Length of barcodes and output metrics don't match.")
     }
-    e.out <- cbind(bar.names, e.out)
-    e.out$is.Cell <- e.out$FDR <= fdr_threshold
-    e.out$is.CellAndLimited <- e.out$is.Cell & e.out$Limited
+    e_out <- cbind(bar_names, e_out)
+    e_out$is_cell <- e_out$FDR <= fdr_threshold
+    e_out$is_cellandlimited <- e_out$is_cell & e_out$Limited
 
     ## Write to Plot
-    e.out$is.Cell[is.na(e.out$is.Cell)] <- FALSE
-    xlim.dat <- e.out[complete.cases(e.out),]$Total
+    e_out$is_cell[is.na(e_out$is_cell)] <- FALSE
+    xlim_dat <- e_out[complete.cases(e_out), ]$Total
 
     ## Write to table
-    writeTSV(files$table, e.out[complete.cases(e.out),])
+    write_tsv(files$table, e_out[complete.cases(e_out), ])
 
     png(files$plot)
-    plot(e.out$Total, -e.out$LogProb, col=ifelse(e.out$is.Cell, "red", "black"),
-         xlab="Total UMI count", ylab="-Log Probability",
-         xlim=c(min(xlim.dat),max(xlim.dat)))
+    plot(e_out$Total, -e_out$LogProb, col = ifelse(e_out$is_cell,
+                                                   "red", "black"),
+         xlab = "Total UMI count", ylab = "-Log Probability",
+         xlim = c(min(xlim_dat), max(xlim_dat)))
     dev.off()
 
     ## Filtered
     called <- NULL
-    if (fdr_threshold != 0){
-        called <- e.out$is.CellAndLimited
+    if (fdr_threshold != 0) {
+        called <- e_out$is_cellandlimited
     } else {
-        called <- e.out$is.Cell
+        called <- e_out$is_cell
     }
     called[is.na(called)] <- FALSE    # replace NA's with FALSE
-    sce.filtered <- sce[,called]
+    sce_filtered <- sce[, called]
 
-    writeOut(sce.filtered, files$out, out.type)
+    write_out(sce_filtered, files$out, outtype)
 
-    message(paste("Cells:", sum(na.omit(e.out$is.Cell))))
-    message(paste("Cells and Limited:", sum(na.omit(e.out$is.CellAndLimited))))
+    message(paste("Cells:", sum(na.omit(e_out$is_cell))))
+    message(paste("Cells and Limited:", sum(na.omit(e_out$is_cellandlimited))))
 }
 
 
-doDefaultDrops <- function(files, dparams, in.type="directory", out.type="h5"){
-    sce <- read10xFiles(files$infile, in.type)
+do_default_drops <- function(files, dparams,
+                           intype = "directory", outtype = "h5") {
+    sce <- read_10x_files(files$infile, intype)
 
-    dparams$m = counts(sce)
+    dparams$m <- counts(sce)
     called <- do.call(defaultDrops, c(dparams))
 
     # Filtered
-    sce.filtered <- sce[,called]
+    sce_filtered <- sce[, called]
 
-    writeOut(sce.filtered, files$out, out.type)
+    write_out(sce_filtered, files$out, outtype)
 
     message(paste("Cells:", sum(called)))
 }
 
-
-doBarcodeRankings <- function(files, bparams, in.type="directory"){
-    sce <- read10xFiles(files$infile, in.type)
+do_barcode_rankings <- function(files, bparams, intype = "directory") {
+    sce <- read_10x_files(files$infile, intype)
 
     bparams$... <- NULL ## hack
-    bparams$m = counts(sce)
+    bparams$m <- counts(sce)
 
-    br.out <- do.call(barcodeRanks, c(bparams))
+    brout <- do.call(barcodeRanks, c(bparams))
 
     png(files$plot)
-    plot(br.out$rank, br.out$total, log="xy", xlab="(log) Rank", ylab="(log) Total Number of Barcodes")
-    o <- order(br.out$rank)
-    lines(br.out$rank[o], br.out$fitted[o], col="red")
+    plot(brout$rank, brout$total, log = "xy",
+         xlab = "(log) Rank", ylab = "(log) Total Number of Barcodes")
+    o <- order(brout$rank)
+    lines(brout$rank[o], brout$fitted[o], col = "red")
 
-    abline(h=br.out$knee, col="dodgerblue", lty=2)
-    abline(h=br.out$inflection, col="forestgreen", lty=2)
-    legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), legend=c("knee", "inflection"))
+    abline(h = brout$knee, col = "dodgerblue", lty = 2)
+    abline(h = brout$inflection, col = "forestgreen", lty = 2)
+    legend("bottomleft", lty = 2, col = c("dodgerblue", "forestgreen"),
+           legend = c("knee", "inflection"))
     dev.off()
 
-    print(paste("knee =", br.out$knee, ", inflection = ", br.out$inflection))
+    print(paste("knee =", brout$knee, ", inflection = ", brout$inflection))
 }
 
 ## Main
 set.seed(seed.val)
 
 if (do.method == "barcodeRankings") {
-    doBarcodeRankings(files, bparams, in.type)
+    do_barcode_rankings(files, bparams, intype)
 
 } else if (do.method == "defaultDrops") {
-    doDefaultDrops(files, dparams, in.type, out.type)
+    do_default_drops(files, dparams, intype, outtype)
 
 } else if (do.method == "emptyDrops") {
-    doEmptyDrops(files, eparams, in.type, out.type, empty.fdr_threshold)
+    do_empty_drops(files, eparams, intype, outtype, empty_fdr_threshold)
 }
Binary file test-data/defs_defaultdrops.h5 has changed
Binary file test-data/defs_emptydrops_150_0002.h5 has changed
Binary file test-data/defs_emptydrops_150_0002.png has changed
Binary file test-data/defs_emptydrops_150_0002a.h5 has changed
Binary file test-data/defs_emptydrops_150_0002a.png has changed