diff scripts/dropletutils.Rscript @ 6:8855361fcfc5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dropletutils/ commit ed0625fe59342d14a08745996e3e32c6f922a738"
author iuc
date Thu, 10 Dec 2020 13:50:06 +0000
parents cdf4443d5625
children 2c1200fba922
line wrap: on
line diff
--- a/scripts/dropletutils.Rscript	Wed Jan 29 15:07:38 2020 -0500
+++ b/scripts/dropletutils.Rscript	Thu Dec 10 13:50:06 2020 +0000
@@ -1,6 +1,6 @@
 ## Load in data
-args = commandArgs(trailingOnly = T)
-if (length(args) != 1){
+args <- commandArgs(trailingOnly = T)
+if (length(args) != 1) {
     stop("Please provide the config file")
 }
 
@@ -11,57 +11,60 @@
 source(args[1])
 
 ## Helper functions
-setSparse <- function(obj){
+set_sparse <- function(obj) {
     return(as(obj, "dgCMatrix"))
 }
 
-writeTSV <- function(fileout, obj){
-    write.table(as.matrix(obj), file=fileout, col.names=NA, sep='\t', quote=FALSE)
+write_tsv <- function(fileout, obj) {
+    write.table(as.matrix(obj), file = fileout,
+                col.names = NA, sep = "\t", quote = FALSE)
 }
 
-determineGeneIDs <- function(object){
-    if (!is.null(rowData(object)$Symbol)){
+determine_geneids <- function(object) {
+    if (!is.null(rowData(object)$Symbol)) {
         return(rowData(object)$Symbol)
     }
     return(rownames(object))
 }
 
-getCounts <- function(object){
-    return(Matrix(counts(object), sparse=TRUE))
+get_counts <- function(object) {
+    return(Matrix(counts(object), sparse = TRUE))
 }
 
-writeOut <- function(object, fileout, typeout){
-    if (typeout == "tsv"){
-        writeTSV(fileout, getCounts(object))
+write_out <- function(object, fileout, typeout) {
+    if (typeout == "tsv") {
+        write_tsv(fileout, get_counts(object))
     }
-    else if (typeout == "h5"){
-        write10xCounts(fileout, getCounts(object),
-                       type="HDF5",
-                       gene.symbol=determineGeneIDs(object),
-                       overwrite=TRUE)
+    else if (typeout == "h5") {
+        write10xCounts(fileout, get_counts(object),
+                       type = "HDF5",
+                       gene.symbol = determine_geneids(object),
+                       overwrite = TRUE)
     }
-    else if (typeout == "directory"){
-        write10xCounts(fileout, getCounts(object),
-                       type="sparse",
-                       gene.symbol=determineGeneIDs(object),
-                       overwrite=TRUE)
+    else if (typeout == "directory") {
+        write10xCounts(fileout, get_counts(object),
+                       type = "sparse",
+                       gene.symbol = determine_geneids(object),
+                       overwrite = TRUE)
     }
 }
 
-read10xFiles <- function(filein, typein){
+read_10x_files <- function(filein, typein) {
     sce <- NULL
-    if (typein == "tsv"){
+    if (typein == "tsv") {
         ## Exploding memory problems occured here
         ## - solution is to use the readSparseCounts function from scater
-        sce <- SingleCellExperiment(assays = list(counts = readSparseCounts(filein)))
-    }
-    else if (typein == "h5"){
-        sce <- read10xCounts(filein, col.names=T, type="HDF5")   # use barcodes.tsv as column names
+        sce <- SingleCellExperiment(assays =
+                                        list(counts = readSparseCounts(filein)))
     }
-    else if (typein == "directory"){
-        sce <- read10xCounts(filein, col.names=T, type="sparse")
+    else if (typein == "h5") {
+         # use barcodes.tsv as column names
+        sce <- read10xCounts(filein, col.names = T, type = "HDF5")
     }
-    counts(sce) <- setSparse(counts(sce))
+    else if (typein == "directory") {
+        sce <- read10xCounts(filein, col.names = T, type = "sparse")
+    }
+    counts(sce) <- set_sparse(counts(sce))
     return(sce)
 }
 
@@ -69,97 +72,113 @@
 ## Methods
 
 
-doEmptyDrops <- function(files, eparams, in.type="directory", out.type="h5", fdr_threshold = 0.01){
-    sce <- read10xFiles(files$infile, in.type)
+do_empty_drops <- function(files, eparams, intype = "directory",
+                         outtype = "h5", fdr_threshold  =  0.01) {
+    sce <- read_10x_files(files$infile, intype)
+
+    eparams$... <- NULL ## hack to remove other parameters from being
+    eparams$m <- Matrix(counts(sce), sparse = TRUE)
+
+    ## Determine sensible lowerbound
+    m_stats <- summary(colSums(counts(sce)))
+    print("Cell Library Size Distribution:")
+    print(m_stats)
 
-    eparams$... <- NULL ## hack
-    eparams$m = Matrix(counts(sce), sparse=TRUE)
+    if (m_stats["Min."] > eparams$lower) {
+        print(paste0("CAUTION: Min. Lib. Size (", m_stats["Min."]
+                      , ") < requested lowerbound (", eparams$lower, ")"))
+        message(paste0("Setting lowerbound to Mean: ", m_stats["Mean"]))
+        eparams$lower <- m_stats["Mean"]
+    }
 
-    e.out <- do.call(emptyDrops, c(eparams))
+    e_out <- do.call(emptyDrops, c(eparams))
 
-    bar.names <- colnames(sce)
-    if (length(bar.names) != nrow(e.out)){
+    bar_names <- colnames(sce)
+    if (length(bar_names) != nrow(e_out)) {
         stop("Length of barcodes and output metrics don't match.")
     }
-    e.out <- cbind(bar.names, e.out)
-    e.out$is.Cell <- e.out$FDR <= fdr_threshold
-    e.out$is.CellAndLimited <- e.out$is.Cell & e.out$Limited
+    e_out <- cbind(bar_names, e_out)
+    e_out$is_cell <- e_out$FDR <= fdr_threshold
+    e_out$is_cellandlimited <- e_out$is_cell & e_out$Limited
 
     ## Write to Plot
-    e.out$is.Cell[is.na(e.out$is.Cell)] <- FALSE
-    xlim.dat <- e.out[complete.cases(e.out),]$Total
+    e_out$is_cell[is.na(e_out$is_cell)] <- FALSE
+    xlim_dat <- e_out[complete.cases(e_out), ]$Total
 
     ## Write to table
-    writeTSV(files$table, e.out[complete.cases(e.out),])
+    write_tsv(files$table, e_out[complete.cases(e_out), ])
 
     png(files$plot)
-    plot(e.out$Total, -e.out$LogProb, col=ifelse(e.out$is.Cell, "red", "black"),
-         xlab="Total UMI count", ylab="-Log Probability",
-         xlim=c(min(xlim.dat),max(xlim.dat)))
+    plot(e_out$Total, -e_out$LogProb, col = ifelse(e_out$is_cell,
+                                                   "red", "black"),
+         xlab = "Total UMI count", ylab = "-Log Probability",
+         xlim = c(min(xlim_dat), max(xlim_dat)))
     dev.off()
 
     ## Filtered
     called <- NULL
-    if (fdr_threshold != 0){
-        called <- e.out$is.CellAndLimited
+    if (fdr_threshold != 0) {
+        called <- e_out$is_cellandlimited
     } else {
-        called <- e.out$is.Cell
+        called <- e_out$is_cell
     }
     called[is.na(called)] <- FALSE    # replace NA's with FALSE
-    sce.filtered <- sce[,called]
+    sce_filtered <- sce[, called]
 
-    writeOut(sce.filtered, files$out, out.type)
+    write_out(sce_filtered, files$out, outtype)
 
-    message(paste("Cells:", sum(na.omit(e.out$is.Cell))))
-    message(paste("Cells and Limited:", sum(na.omit(e.out$is.CellAndLimited))))
+    message(paste("Cells:", sum(na.omit(e_out$is_cell))))
+    message(paste("Cells and Limited:", sum(na.omit(e_out$is_cellandlimited))))
 }
 
 
-doDefaultDrops <- function(files, dparams, in.type="directory", out.type="h5"){
-    sce <- read10xFiles(files$infile, in.type)
+do_default_drops <- function(files, dparams,
+                           intype = "directory", outtype = "h5") {
+    sce <- read_10x_files(files$infile, intype)
 
-    dparams$m = counts(sce)
+    dparams$m <- counts(sce)
     called <- do.call(defaultDrops, c(dparams))
 
     # Filtered
-    sce.filtered <- sce[,called]
+    sce_filtered <- sce[, called]
 
-    writeOut(sce.filtered, files$out, out.type)
+    write_out(sce_filtered, files$out, outtype)
 
     message(paste("Cells:", sum(called)))
 }
 
-
-doBarcodeRankings <- function(files, bparams, in.type="directory"){
-    sce <- read10xFiles(files$infile, in.type)
+do_barcode_rankings <- function(files, bparams, intype = "directory") {
+    sce <- read_10x_files(files$infile, intype)
 
     bparams$... <- NULL ## hack
-    bparams$m = counts(sce)
+    bparams$m <- counts(sce)
 
-    br.out <- do.call(barcodeRanks, c(bparams))
+    brout <- do.call(barcodeRanks, c(bparams))
 
     png(files$plot)
-    plot(br.out$rank, br.out$total, log="xy", xlab="(log) Rank", ylab="(log) Total Number of Barcodes")
-    o <- order(br.out$rank)
-    lines(br.out$rank[o], br.out$fitted[o], col="red")
+    plot(brout$rank, brout$total, log = "xy",
+         xlab = "(log) Rank", ylab = "(log) Total Number of Barcodes")
+    o <- order(brout$rank)
+    lines(brout$rank[o], brout$fitted[o], col = "red")
 
-    abline(h=br.out$knee, col="dodgerblue", lty=2)
-    abline(h=br.out$inflection, col="forestgreen", lty=2)
-    legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), legend=c("knee", "inflection"))
+    abline(h = brout$knee, col = "dodgerblue", lty = 2)
+    abline(h = brout$inflection, col = "forestgreen", lty = 2)
+    legend("bottomleft", lty = 2, col = c("dodgerblue", "forestgreen"),
+           legend = c("knee", "inflection"))
     dev.off()
 
-    print(paste("knee =", br.out$knee, ", inflection = ", br.out$inflection))
+    print(paste("knee =", brout$knee, ", inflection = ", brout$inflection))
 }
 
 ## Main
 set.seed(seed.val)
 
 if (do.method == "barcodeRankings") {
-    doBarcodeRankings(files, bparams, in.type)
+    do_barcode_rankings(files, bparams, intype)
 
 } else if (do.method == "defaultDrops") {
-    doDefaultDrops(files, dparams, in.type, out.type)
+    do_default_drops(files, dparams, intype, outtype)
 
 } else if (do.method == "emptyDrops") {
-    doEmptyDrops(files, eparams, in.type, out.type, empty.fdr_threshold)
+    do_empty_drops(files, eparams, intype, outtype, empty_fdr_threshold)
 }