view scripts/dropletutils.Rscript @ 6:8855361fcfc5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dropletutils/ commit ed0625fe59342d14a08745996e3e32c6f922a738"
author iuc
date Thu, 10 Dec 2020 13:50:06 +0000
parents cdf4443d5625
children 2c1200fba922
line wrap: on
line source

## Load in data
args <- commandArgs(trailingOnly = T)
if (length(args) != 1) {
    stop("Please provide the config file")
}

suppressWarnings(suppressPackageStartupMessages(require(DropletUtils)))
suppressWarnings(suppressPackageStartupMessages(require(Matrix)))
suppressWarnings(suppressPackageStartupMessages(require(scater)))

source(args[1])

## Helper functions
set_sparse <- function(obj) {
    return(as(obj, "dgCMatrix"))
}

write_tsv <- function(fileout, obj) {
    write.table(as.matrix(obj), file = fileout,
                col.names = NA, sep = "\t", quote = FALSE)
}

determine_geneids <- function(object) {
    if (!is.null(rowData(object)$Symbol)) {
        return(rowData(object)$Symbol)
    }
    return(rownames(object))
}

get_counts <- function(object) {
    return(Matrix(counts(object), sparse = TRUE))
}

write_out <- function(object, fileout, typeout) {
    if (typeout == "tsv") {
        write_tsv(fileout, get_counts(object))
    }
    else if (typeout == "h5") {
        write10xCounts(fileout, get_counts(object),
                       type = "HDF5",
                       gene.symbol = determine_geneids(object),
                       overwrite = TRUE)
    }
    else if (typeout == "directory") {
        write10xCounts(fileout, get_counts(object),
                       type = "sparse",
                       gene.symbol = determine_geneids(object),
                       overwrite = TRUE)
    }
}

read_10x_files <- function(filein, typein) {
    sce <- NULL
    if (typein == "tsv") {
        ## Exploding memory problems occured here
        ## - solution is to use the readSparseCounts function from scater
        sce <- SingleCellExperiment(assays =
                                        list(counts = readSparseCounts(filein)))
    }
    else if (typein == "h5") {
         # use barcodes.tsv as column names
        sce <- read10xCounts(filein, col.names = T, type = "HDF5")
    }
    else if (typein == "directory") {
        sce <- read10xCounts(filein, col.names = T, type = "sparse")
    }
    counts(sce) <- set_sparse(counts(sce))
    return(sce)
}


## Methods


do_empty_drops <- function(files, eparams, intype = "directory",
                         outtype = "h5", fdr_threshold  =  0.01) {
    sce <- read_10x_files(files$infile, intype)

    eparams$... <- NULL ## hack to remove other parameters from being
    eparams$m <- Matrix(counts(sce), sparse = TRUE)

    ## Determine sensible lowerbound
    m_stats <- summary(colSums(counts(sce)))
    print("Cell Library Size Distribution:")
    print(m_stats)

    if (m_stats["Min."] > eparams$lower) {
        print(paste0("CAUTION: Min. Lib. Size (", m_stats["Min."]
                      , ") < requested lowerbound (", eparams$lower, ")"))
        message(paste0("Setting lowerbound to Mean: ", m_stats["Mean"]))
        eparams$lower <- m_stats["Mean"]
    }

    e_out <- do.call(emptyDrops, c(eparams))

    bar_names <- colnames(sce)
    if (length(bar_names) != nrow(e_out)) {
        stop("Length of barcodes and output metrics don't match.")
    }
    e_out <- cbind(bar_names, e_out)
    e_out$is_cell <- e_out$FDR <= fdr_threshold
    e_out$is_cellandlimited <- e_out$is_cell & e_out$Limited

    ## Write to Plot
    e_out$is_cell[is.na(e_out$is_cell)] <- FALSE
    xlim_dat <- e_out[complete.cases(e_out), ]$Total

    ## Write to table
    write_tsv(files$table, e_out[complete.cases(e_out), ])

    png(files$plot)
    plot(e_out$Total, -e_out$LogProb, col = ifelse(e_out$is_cell,
                                                   "red", "black"),
         xlab = "Total UMI count", ylab = "-Log Probability",
         xlim = c(min(xlim_dat), max(xlim_dat)))
    dev.off()

    ## Filtered
    called <- NULL
    if (fdr_threshold != 0) {
        called <- e_out$is_cellandlimited
    } else {
        called <- e_out$is_cell
    }
    called[is.na(called)] <- FALSE    # replace NA's with FALSE
    sce_filtered <- sce[, called]

    write_out(sce_filtered, files$out, outtype)

    message(paste("Cells:", sum(na.omit(e_out$is_cell))))
    message(paste("Cells and Limited:", sum(na.omit(e_out$is_cellandlimited))))
}


do_default_drops <- function(files, dparams,
                           intype = "directory", outtype = "h5") {
    sce <- read_10x_files(files$infile, intype)

    dparams$m <- counts(sce)
    called <- do.call(defaultDrops, c(dparams))

    # Filtered
    sce_filtered <- sce[, called]

    write_out(sce_filtered, files$out, outtype)

    message(paste("Cells:", sum(called)))
}

do_barcode_rankings <- function(files, bparams, intype = "directory") {
    sce <- read_10x_files(files$infile, intype)

    bparams$... <- NULL ## hack
    bparams$m <- counts(sce)

    brout <- do.call(barcodeRanks, c(bparams))

    png(files$plot)
    plot(brout$rank, brout$total, log = "xy",
         xlab = "(log) Rank", ylab = "(log) Total Number of Barcodes")
    o <- order(brout$rank)
    lines(brout$rank[o], brout$fitted[o], col = "red")

    abline(h = brout$knee, col = "dodgerblue", lty = 2)
    abline(h = brout$inflection, col = "forestgreen", lty = 2)
    legend("bottomleft", lty = 2, col = c("dodgerblue", "forestgreen"),
           legend = c("knee", "inflection"))
    dev.off()

    print(paste("knee =", brout$knee, ", inflection = ", brout$inflection))
}

## Main
set.seed(seed.val)

if (do.method == "barcodeRankings") {
    do_barcode_rankings(files, bparams, intype)

} else if (do.method == "defaultDrops") {
    do_default_drops(files, dparams, intype, outtype)

} else if (do.method == "emptyDrops") {
    do_empty_drops(files, eparams, intype, outtype, empty_fdr_threshold)
}