diff deseq2.R @ 25:de44f8eff84a draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 9eb6d07510ccf27d6499172d62c81661078ec57b"
author iuc
date Wed, 25 Nov 2020 18:36:55 +0000
parents 71bacea10eee
children d027d1f4984e
line wrap: on
line diff
--- a/deseq2.R	Sat Feb 29 17:08:08 2020 -0500
+++ b/deseq2.R	Wed Nov 25 18:36:55 2020 +0000
@@ -31,7 +31,9 @@
 #   3 "mean"
 
 # setup R error handling to go to stderr
-options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+options(show.error.messages = F, error = function() {
+  cat(geterrmessage(), file = stderr()); q("no", 1, F)
+})
 
 # we need that to not crash galaxy with an UTF8 error on German LC settings.
 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
@@ -55,50 +57,46 @@
   "header", "H", 0, "logical",
   "factors", "f", 1, "character",
   "files_to_labels", "l", 1, "character",
-  "plots" , "p", 1, "character",
+  "plots", "p", 1, "character",
   "tximport", "i", 0, "logical",
   "txtype", "y", 1, "character",
   "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file
   "esf", "e", 1, "character",
   "fit_type", "t", 1, "integer",
   "many_contrasts", "m", 0, "logical",
-  "outlier_replace_off" , "a", 0, "logical",
-  "outlier_filter_off" , "b", 0, "logical",
+  "outlier_replace_off", "a", 0, "logical",
+  "outlier_filter_off", "b", 0, "logical",
   "auto_mean_filter_off", "c", 0, "logical",
-  "beta_prior_off", "d", 0, "logical"),
-  byrow=TRUE, ncol=4)
+  "beta_prior_off", "d", 0, "logical"
+), byrow = TRUE, ncol = 4)
 opt <- getopt(spec)
 
 # if help was asked for print a friendly message
 # and exit with a non-zero error code
 if (!is.null(opt$help)) {
-  cat(getopt(spec, usage=TRUE))
-  q(status=1)
+  cat(getopt(spec, usage = TRUE))
+  q(status = 1)
 }
 
 # enforce the following required arguments
 if (is.null(opt$outfile)) {
   cat("'outfile' is required\n")
-  q(status=1)
+  q(status = 1)
 }
 if (is.null(opt$factors)) {
   cat("'factors' is required\n")
-  q(status=1)
+  q(status = 1)
 }
 
-verbose <- if (is.null(opt$quiet)) {
-  TRUE
-} else {
-  FALSE
+verbose <- is.null(opt$quiet)
+
+source_local <- function(fname) {
+    argv <- commandArgs(trailingOnly = FALSE)
+    base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
+    source(paste(base_dir, fname, sep = "/"))
 }
 
-source_local <- function(fname){
-    argv <- commandArgs(trailingOnly = FALSE)
-    base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
-    source(paste(base_dir, fname, sep="/"))
-}
-
-source_local('get_deseq_dataset.R')
+source_local("get_deseq_dataset.R")
 
 suppressPackageStartupMessages({
   library("DESeq2")
@@ -109,102 +107,116 @@
 if (opt$cores > 1) {
   library("BiocParallel")
   register(MulticoreParam(opt$cores))
-  parallel = TRUE
+  parallel <- TRUE
 } else {
-  parallel = FALSE
+  parallel <- FALSE
 }
 
 # build or read sample table
 
-trim <- function (x) gsub("^\\s+|\\s+$", "", x)
+trim <- function(x) gsub("^\\s+|\\s+$", "", x)
 
 # switch on if 'factors' was provided:
 library("rjson")
 parser <- newJSONParser()
 parser$addData(opt$factors)
-factorList <- parser$getObject()
+factor_list <- parser$getObject()
 filenames_to_labels <- fromJSON(opt$files_to_labels)
-factors <- sapply(factorList, function(x) x[[1]])
-primaryFactor <- factors[1]
-filenamesIn <- unname(unlist(factorList[[1]][[2]]))
-labs = unname(unlist(filenames_to_labels[basename(filenamesIn)]))
-sampleTable <- data.frame(sample=basename(filenamesIn),
-                          filename=filenamesIn,
-                          row.names=filenamesIn,
-                          stringsAsFactors=FALSE)
-for (factor in factorList) {
-  factorName <- trim(factor[[1]])
-  sampleTable[[factorName]] <- character(nrow(sampleTable))
+factors <- sapply(factor_list, function(x) x[[1]])
+primary_factor <- factors[1]
+filenames_in <- unname(unlist(factor_list[[1]][[2]]))
+labs <- unname(unlist(filenames_to_labels[basename(filenames_in)]))
+sample_table <- data.frame(
+  sample = basename(filenames_in),
+  filename = filenames_in,
+  row.names = filenames_in,
+  stringsAsFactors = FALSE
+)
+for (factor in factor_list) {
+  factor_name <- trim(factor[[1]])
+  sample_table[[factor_name]] <- character(nrow(sample_table))
   lvls <- sapply(factor[[2]], function(x) names(x))
   for (i in seq_along(factor[[2]])) {
     files <- factor[[2]][[i]][[1]]
-    sampleTable[files,factorName] <- trim(lvls[i])
+    sample_table[files, factor_name] <- trim(lvls[i])
   }
-  sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls)
+  sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls)
 }
-rownames(sampleTable) <- labs
+rownames(sample_table) <- labs
 
-primaryFactor <- factors[1]
-designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + ")))
+design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + ")))
 
 # these are plots which are made once for each analysis
-generateGenericPlots <- function(dds, factors) {
+generate_generic_plots <- function(dds, factors) {
   library("ggplot2")
   library("ggrepel")
   library("pheatmap")
 
   rld <- rlog(dds)
-  p <- plotPCA(rld, intgroup=rev(factors))
-  print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size=3)  + geom_point())
+  p <- plotPCA(rld, intgroup = rev(factors))
+  print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3)  + geom_point())
   dat <- assay(rld)
-  distsRL <- dist(t(dat))
-  mat <- as.matrix(distsRL)
-  colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
-  pheatmap(mat,
-           clustering_distance_rows=distsRL,
-           clustering_distance_cols=distsRL,
-           col=colors,
-           main="Sample-to-sample distances")
-  plotDispEsts(dds, main="Dispersion estimates")
+  dists_rl <- dist(t(dat))
+  mat <- as.matrix(dists_rl)
+  colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
+  pheatmap(
+    mat,
+    clustering_distance_rows = dists_rl,
+    clustering_distance_cols = dists_rl,
+    col = colors,
+    main = "Sample-to-sample distances"
+  )
+  plotDispEsts(dds, main = "Dispersion estimates")
 }
 
 # these are plots which can be made for each comparison, e.g.
 # once for C vs A and once for B vs A
-generateSpecificPlots <- function(res, threshold, title_suffix) {
+generate_specific_plots <- function(res, threshold, title_suffix) {
   use <- res$baseMean > threshold
   if (sum(!use) == 0) {
-    h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE)
-    barplot(height = h$counts,
-            col = "powderblue", space = 0, xlab="p-values", ylab="frequency",
-            main=paste("Histogram of p-values for",title_suffix))
-    text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
+    h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE)
+    barplot(
+      height = h$counts,
+      col = "powderblue",
+      space = 0,
+      xlab = "p-values",
+      ylab = "frequency",
+      main = paste("Histogram of p-values for", title_suffix)
+    )
+    text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
   } else {
-    h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
-    h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
-    colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue")
-    barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
-            col = colori, space = 0, xlab="p-values", ylab="frequency",
-            main=paste("Histogram of p-values for",title_suffix))
-    text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
-    legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white")
+    h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE)
+    h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE)
+    colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue")
+    barplot(
+      height = rbind(h1$counts, h2$counts),
+      beside = FALSE,
+      col = colori,
+      space = 0,
+      xlab = "p-values",
+      ylab = "frequency",
+      main = paste("Histogram of p-values for", title_suffix)
+    )
+    text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
+    legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white")
   }
-    plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE))
+    plotMA(res, main = paste("MA-plot for", title_suffix), ylim = range(res$log2FoldChange, na.rm = TRUE))
 }
 
 if (verbose) {
-  cat(paste("primary factor:",primaryFactor,"\n"))
+  cat(paste("primary factor:", primary_factor, "\n"))
   if (length(factors) > 1) {
-    cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n"))
+    cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n"))
   }
   cat("\n---------------------\n")
 }
 
-dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula=designFormula, tximport=opt$tximport, txtype=opt$txtype, tx2gene=opt$tx2gene)
+dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = design_formula, tximport = opt$tximport, txtype = opt$txtype, tx2gene = opt$tx2gene)
 # estimate size factors for the chosen method
-if(!is.null(opt$esf)){
-    dds <- estimateSizeFactors(dds, type=opt$esf)
+if (!is.null(opt$esf)) {
+    dds <- estimateSizeFactors(dds, type = opt$esf)
 }
-apply_batch_factors <- function (dds, batch_factors) {
+apply_batch_factors <- function(dds, batch_factors) {
   rownames(batch_factors) <- batch_factors$identifier
   batch_factors <- subset(batch_factors, select = -c(identifier, condition))
   dds_samples <- colnames(dds)
@@ -215,155 +227,161 @@
   dds_data <- colData(dds)
   # Merge dds_data with batch_factors using indexes, which are sample names
   # Set sort to False, which maintains the order in dds_data
-  reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort=F)
+  reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = F)
   batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)]
   for (factor in colnames(batch_factors)) {
     dds[[factor]] <- batch_factors[[factor]]
   }
-  colnames(dds) <- reordered_batch[,1]
+  colnames(dds) <- reordered_batch[, 1]
   return(dds)
 }
 
 if (!is.null(opt$batch_factors)) {
-  batch_factors <- read.table(opt$batch_factors, sep="\t", header=T)
+  batch_factors <- read.table(opt$batch_factors, sep = "\t", header = T)
   dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors)
-  batch_design <- colnames(batch_factors)[-c(1,2)]
-  designFormula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse=" + ")))
-  design(dds) <- designFormula
+  batch_design <- colnames(batch_factors)[-c(1, 2)]
+  design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + ")))
+  design(dds) <- design_formula
 }
 
 if (verbose) {
   cat("DESeq2 run information\n\n")
   cat("sample table:\n")
-  print(sampleTable[,-c(1:2),drop=FALSE])
+  print(sample_table[, -c(1:2), drop = FALSE])
   cat("\ndesign formula:\n")
-  print(designFormula)
+  print(design_formula)
   cat("\n\n")
   cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
 }
 
 # optional outlier behavior
 if (is.null(opt$outlier_replace_off)) {
-  minRep <- 7
+  min_rep <- 7
 } else {
-  minRep <- Inf
+  min_rep <- Inf
   if (verbose) cat("outlier replacement off\n")
 }
 if (is.null(opt$outlier_filter_off)) {
-  cooksCutoff <- TRUE
+  cooks_cutoff <- TRUE
 } else {
-  cooksCutoff <- FALSE
+  cooks_cutoff <- FALSE
   if (verbose) cat("outlier filtering off\n")
 }
 
 # optional automatic mean filtering
 if (is.null(opt$auto_mean_filter_off)) {
-  independentFiltering <- TRUE
+  independent_filtering <- TRUE
 } else {
-  independentFiltering <- FALSE
+  independent_filtering <- FALSE
   if (verbose) cat("automatic filtering on the mean off\n")
 }
 
 # shrinkage of LFCs
 if (is.null(opt$beta_prior_off)) {
-  betaPrior <- TRUE
+  beta_prior <- TRUE
 } else {
-  betaPrior <- FALSE
+  beta_prior <- FALSE
   if (verbose) cat("beta prior off\n")
 }
 
 # dispersion fit type
 if (is.null(opt$fit_type)) {
-  fitType <- "parametric"
+  fit_type <- "parametric"
 } else {
-  fitType <- c("parametric","local","mean")[opt$fit_type]
+  fit_type <- c("parametric", "local", "mean")[opt$fit_type]
 }
 
-if (verbose) cat(paste("using disperion fit type:",fitType,"\n"))
+if (verbose) cat(paste("using disperion fit type:", fit_type, "\n"))
 
 # run the analysis
-dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep, parallel=parallel)
+dds <- DESeq(dds, fitType = fit_type, betaPrior = beta_prior, minReplicatesForReplace = min_rep, parallel = parallel)
 
 # create the generic plots and leave the device open
 if (!is.null(opt$plots)) {
   if (verbose) cat("creating plots\n")
   pdf(opt$plots)
-  generateGenericPlots(dds, factors)
+  generate_generic_plots(dds, factors)
 }
 
-n <- nlevels(colData(dds)[[primaryFactor]])
-allLevels <- levels(colData(dds)[[primaryFactor]])
+n <- nlevels(colData(dds)[[primary_factor]])
+all_levels <- levels(colData(dds)[[primary_factor]])
 
 if (!is.null(opt$countsfile)) {
-    normalizedCounts<-counts(dds,normalized=TRUE)
-    write.table(normalizedCounts, file=opt$countsfile, sep="\t", col.names=NA, quote=FALSE)
+    normalized_counts <- counts(dds, normalized = TRUE)
+    write.table(normalized_counts, file = opt$countsfile, sep = "\t", col.names = NA, quote = FALSE)
 }
 
 if (!is.null(opt$rlogfile)) {
-    rLogNormalized <-rlogTransformation(dds)
-    rLogNormalizedMat <- assay(rLogNormalized)
-    write.table(rLogNormalizedMat, file=opt$rlogfile, sep="\t", col.names=NA, quote=FALSE)
+    rlog_normalized <- rlogTransformation(dds)
+    rlog_normalized_mat <- assay(rlog_normalized)
+    write.table(rlog_normalized_mat, file = opt$rlogfile, sep = "\t", col.names = NA, quote = FALSE)
 }
 
 if (!is.null(opt$vstfile)) {
-    vstNormalized<-varianceStabilizingTransformation(dds)
-    vstNormalizedMat <- assay(vstNormalized)
-    write.table(vstNormalizedMat, file=opt$vstfile, sep="\t", col.names=NA, quote=FALSE)
+    vst_normalized <- varianceStabilizingTransformation(dds)
+    vst_normalized_mat <- assay(vst_normalized)
+    write.table(vst_normalized_mat, file = opt$vstfile, sep = "\t", col.names = NA, quote = FALSE)
 }
 
 
 if (is.null(opt$many_contrasts)) {
   # only contrast the first and second level of the primary factor
-  ref <- allLevels[1]
-  lvl <- allLevels[2]
-  res <- results(dds, contrast=c(primaryFactor, lvl, ref),
-                 cooksCutoff=cooksCutoff,
-                 independentFiltering=independentFiltering)
+  ref <- all_levels[1]
+  lvl <- all_levels[2]
+  res <- results(
+    dds,
+    contrast = c(primary_factor, lvl, ref),
+    cooksCutoff = cooks_cutoff,
+    independentFiltering = independent_filtering
+  )
   if (verbose) {
     cat("summary of results\n")
-    cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n"))
+    cat(paste0(primary_factor, ": ", lvl, " vs ", ref, "\n"))
     print(summary(res))
   }
-  resSorted <- res[order(res$padj),]
-  outDF <- as.data.frame(resSorted)
-  outDF$geneID <- rownames(outDF)
-  outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
+  res_sorted <- res[order(res$padj), ]
+  out_df <- as.data.frame(res_sorted)
+  out_df$geneID <- rownames(out_df)  # nolint
+  out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
   filename <- opt$outfile
-  write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
-  if (independentFiltering) {
+  write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+  if (independent_filtering) {
     threshold <- unname(attr(res, "filterThreshold"))
   } else {
     threshold <- 0
   }
-  title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
+  title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
   if (!is.null(opt$plots)) {
-    generateSpecificPlots(res, threshold, title_suffix)
+    generate_specific_plots(res, threshold, title_suffix)
   }
 } else {
   # rotate through the possible contrasts of the primary factor
   # write out a sorted table of results with the contrast as a suffix
   # add contrast specific plots to the device
-  for (i in seq_len(n-1)) {
-    ref <- allLevels[i]
-    contrastLevels <- allLevels[(i+1):n]
-    for (lvl in contrastLevels) {
-      res <- results(dds, contrast=c(primaryFactor, lvl, ref),
-                     cooksCutoff=cooksCutoff,
-                     independentFiltering=independentFiltering)
-      resSorted <- res[order(res$padj),]
-      outDF <- as.data.frame(resSorted)
-      outDF$geneID <- rownames(outDF)
-      outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
-      filename <- paste0(primaryFactor,"_",lvl,"_vs_",ref)
-      write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
-      if (independentFiltering) {
+  for (i in seq_len(n - 1)) {
+    ref <- all_levels[i]
+    contrast_levels <- all_levels[(i + 1):n]
+    for (lvl in contrast_levels) {
+      res <- results(
+        dds,
+        contrast = c(primary_factor, lvl, ref),
+        cooksCutoff = cooks_cutoff,
+        independentFiltering = independent_filtering
+      )
+      res_sorted <- res[order(res$padj), ]
+      out_df <- as.data.frame(res_sorted)
+      out_df$geneID <- rownames(out_df)  # nolint
+      out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
+      filename <- paste0(primary_factor, "_", lvl, "_vs_", ref)
+      write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+      if (independent_filtering) {
         threshold <- unname(attr(res, "filterThreshold"))
       } else {
         threshold <- 0
       }
-      title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
+      title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
       if (!is.null(opt$plots)) {
-        generateSpecificPlots(res, threshold, title_suffix)
+        generate_specific_plots(res, threshold, title_suffix)
       }
     }
   }
@@ -378,4 +396,3 @@
 cat("Session information:\n\n")
 
 sessionInfo()
-