Mercurial > repos > iuc > ruvseq
view ruvseq.R @ 2:fed9d0350d72 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ruvseq commit 4daa375d022673d2437d609b1865b78c64b04415"
author | iuc |
---|---|
date | Fri, 15 Jan 2021 17:53:15 +0000 |
parents | c24765926774 |
children | 3a083c78896e |
line wrap: on
line source
# setup R error handling to go to stderr library("getopt") options(show.error.messages = F, error = function() { cat(geterrmessage(), file = stderr()); q("no", 1, F) }) options(stringAsFactors = FALSE, useFancyQuotes = FALSE) setup_cmdline_options <- function() { args <- commandArgs(trailingOnly = TRUE) spec <- matrix(c( "help", "h", 0, "logical", "alpha", "a", 1, "double", "min_mean_count", "min_c", 1, "double", "min_k", "min_k", 1, "double", "max_k", "max_k", 1, "double", "sample_json", "s", 1, "character", "plots", "p", 1, "character", "header", "H", 0, "logical", "txtype", "y", 1, "character", "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) byrow = TRUE, ncol = 4) opt <- getopt(spec) # if help was asked for print a friendly message # and exit with a non-zero error code if (!is.null(opt$help)) { cat(getopt(spec, usage = TRUE)) q(status = 1) } else { load_libraries() } return(opt) } load_libraries <- function() { # Allows displaying help without waiting for libraries to load library("tools") library("jsonlite") library("reshape2") library("RUVSeq") library("RColorBrewer") library("tximport") library("DESeq2") library("ggrepel") } source_local <- function(fname) { argv <- commandArgs(trailingOnly = FALSE) base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) source(paste(base_dir, fname, sep = "/")) } # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport source_local("get_deseq_dataset.R") # RUVseq function definitions plot_pca_rle <- function(set, title) { x <- pData(set)[, 1] colors <- brewer.pal(3, "Set2") label <- paste0(" for ", title) plotRLE(set, outline = FALSE, ylim = c(-4, 4), col = colors[x]) title(main = paste0("RLE", label)) plotPCA(set, col = colors[x], cex = 1.2) title(main = paste0("PCA", label)) } plot_factors_of_unwanted_var <- function(set, method, k) { pd <- pData(set) pd["sample"] <- row.names(pd) colnames(pd)[1] <- "condition" d <- melt(pd, id.vars = c("sample", "condition")) d["x"] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot print(ggplot(d, aes(x = x, y = value, color = condition, label = sample)) + geom_point() + ggtitle(paste0("Factors of unwanted variation for method: ", method, ", k=", k)) + facet_wrap(~ variable, scales = "free_x") + geom_text_repel() + theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), plot.title = element_text(hjust = 0.5)) ) } create_seq_expression_set <- function(dds, min_mean_count) { count_values <- counts(dds) print(paste0("feature count before filtering :", nrow(count_values), "\n")) print(paste0("Filtering features which mean expression is less or eq. than ", min_mean_count, " counts\n")) filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count) filtered <- count_values[filter, ] print(paste0("feature count after filtering :", nrow(filtered), "\n")) set <- newSeqExpressionSet(as.matrix(filtered), phenoData = data.frame(colData(dds)$condition, row.names = colnames(filtered))) plot_pca_rle(set = set, title = "raw data") set <- betweenLaneNormalization(set, which = "upper") plot_pca_rle(set = set, title = "upper quartile normalized") return(set) } get_empirical_control_genes <- function(set, cutoff_p) { x <- pData(set)[, 1] design <- model.matrix(~x, data = pData(set)) y <- DGEList(counts = counts(set), group = x) y <- calcNormFactors(y, method = "upperquartile") y <- estimateGLMCommonDisp(y, design) y <- estimateGLMTagwiseDisp(y, design) fit <- glmFit(y, design) lrt <- glmLRT(fit, coef = 2) top <- topTags(lrt, n = nrow(set))$table top_rows <- rownames(top)[which(top$PValue < cutoff_p)] empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))] return(empirical) } ruv_control_gene_method <- function(set, k, control_genes = "empirical", cutoff_p = 0.2) { if (control_genes == "empirical") { control_genes <- get_empirical_control_genes(set, cutoff_p = cutoff_p) } set <- RUVg(set, control_genes, k = k) plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k)) plot_factors_of_unwanted_var(set, method = "RUVg with empirical control genes", k = k) return(set) } ruv_residual_method <- function(set, k) { genes <- rownames(counts(set)) x <- pData(set)[, 1] # Initial edger residuals design <- model.matrix(~x, data = pData(set)) y <- DGEList(counts = counts(set), group = x) y <- calcNormFactors(y, method = "upperquartile") y <- estimateGLMCommonDisp(y, design) y <- estimateGLMTagwiseDisp(y, design) fit <- glmFit(y, design) res <- residuals(fit, type = "deviance") set <- RUVr(set, genes, k = k, res) plot_pca_rle(set = set, title = paste0("RUVr using residuals, k=", k)) plot_factors_of_unwanted_var(set, method = "RUVr using residuals", k = k) return(set) } ruv_replicate_method <- function(set, k) { genes <- rownames(counts(set)) x <- pData(set)[, 1] differences <- makeGroups(x) set <- RUVs(set, genes, k = k, differences) plot_pca_rle(set, paste0("RUVs with replicate samples, k=", k)) plot_factors_of_unwanted_var(set, method = "RUVs using replicates", k = k) return(set) } opt <- setup_cmdline_options() alpha <- opt$alpha min_k <- opt$min_k max_k <- opt$max_k min_c <- opt$min_mean_count sample_json <- fromJSON(opt$sample_json) sample_paths <- sample_json$path sample_names <- sample_json$label condition <- as.factor(sample_json$condition) sample_table <- data.frame(samplename = sample_names, filename = sample_paths, condition = condition) rownames(sample_table) <- sample_names dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = ~ condition, tximport = opt$txtype, txtype = opt$txtype, tx2gene = opt$tx2gene) if (!is.null(opt$plots)) { pdf(opt$plots) } # Run through the ruvseq variants set <- create_seq_expression_set(dds, min_mean_count = min_c) result <- list(no_correction = set) for (k in seq(min_k, max_k)) { result[[paste0("residual_method_k", k)]] <- ruv_residual_method(set, k = k) result[[paste0("replicate_method_k", k)]] <- ruv_replicate_method(set, k = k) result[[paste0("control_method_k", k)]] <- ruv_control_gene_method(set, k = k, cutoff_p = 0.5) } for (name in names(result)) { if (!startsWith(name, "no_correction")) { set <- result[[name]] unwanted_variation <- pData(set) df <- data.frame(identifier = rownames(unwanted_variation)) df <- cbind(df, unwanted_variation) colnames(df)[2] <- "condition" write.table(df, file = paste0("batch_effects_", name, ".tabular"), sep = "\t", quote = F, row.names = F) } } # close the plot device if (!is.null(opt$plots)) { cat("closing plot device\n") dev.off() } cat("Session information:\n\n") sessionInfo()