view deseq2.R @ 30:8fe98f7094de draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 6868b66f73ddbe947986d1a20b546873cbd515a9
author iuc
date Fri, 26 Aug 2022 11:16:15 +0000
parents cd9874cb9019
children 9a882d108833
line wrap: on
line source

#!/usr/bin/env Rscript

# A command-line interface to DESeq2 for use with Galaxy
# written by Bjoern Gruening and modified by Michael Love 2016.03.30
#
# This argument is required:
#
#   'factors' a JSON list object from Galaxy
#
# the output file has columns:
#
#   baseMean (mean normalized count)
#   log2FoldChange (by default a moderated LFC estimate)
#   lfcSE (the standard error)
#   stat (the Wald statistic)
#   pvalue (p-value from comparison of Wald statistic to a standard Normal)
#   padj (adjusted p-value, Benjamini Hochberg correction on genes which pass the mean count filter)
#
# the first variable in 'factors' will be the primary factor.
# the levels of the primary factor are used in the order of appearance in factors.
#
# by default, levels in the order A,B,C produces a single comparison of B vs A, to a single file 'outfile'
#
# for the 'many_contrasts' flag, levels in the order A,B,C produces comparisons C vs A, B vs A, C vs B,
# to a number of files using the 'outfile' prefix: 'outfile.condition_C_vs_A' etc.
# all plots will still be sent to a single PDF, named by the arg 'plots', with extra pages.
#
# fit_type is an integer valued argument, with the options from ?estimateDisperions
#   1 "parametric"
#   2 "local"
#   3 "mean"

# setup R error handling to go to stderr
options(show.error.messages = FALSE, error = function() {
  cat(geterrmessage(), file = stderr())
  q("no", 1, FALSE)
})

# we need that to not crash galaxy with an UTF8 error on German LC settings.
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

library("getopt")
library("tools")
options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
args <- commandArgs(trailingOnly = TRUE)

# get options, using the spec as defined by the enclosed list.
# we read the options from the default: commandArgs(TRUE).
spec <- matrix(c(
  "quiet", "q", 0, "logical",
  "help", "h", 0, "logical",
  "cores", "s", 0, "integer",
  "batch_factors", "w", 1, "character",
  "outfile", "o", 1, "character",
  "countsfile", "n", 1, "character",
  "sizefactorsfile", "F", 1, "character",
  "rlogfile", "r", 1, "character",
  "vstfile", "v", 1, "character",
  "header", "H", 0, "logical",
  "factors", "f", 1, "character",
  "files_to_labels", "l", 1, "character",
  "plots", "p", 1, "character",
  "tximport", "i", 0, "logical",
  "txtype", "y", 1, "character",
  "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file
  "esf", "e", 1, "character",
  "fit_type", "t", 1, "integer",
  "many_contrasts", "m", 0, "logical",
  "outlier_replace_off", "a", 0, "logical",
  "outlier_filter_off", "b", 0, "logical",
  "auto_mean_filter_off", "c", 0, "logical",
  "beta_prior_off", "d", 0, "logical",
  "alpha_ma", "A", 1, "numeric",
  "prefilter", "P", 0, "logical",
  "prefilter_value", "V", 1, "numeric"
), byrow = TRUE, ncol = 4)
opt <- getopt(spec)

# if help was asked for print a friendly message
# and exit with a non-zero error code
if (!is.null(opt$help)) {
  cat(getopt(spec, usage = TRUE))
  q(status = 1)
}

# enforce the following required arguments
if (is.null(opt$outfile)) {
  cat("'outfile' is required\n")
  q(status = 1)
}
if (is.null(opt$factors)) {
  cat("'factors' is required\n")
  q(status = 1)
}

verbose <- is.null(opt$quiet)

source_local <- function(fname) {
    argv <- commandArgs(trailingOnly = FALSE)
    base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
    source(paste(base_dir, fname, sep = "/"))
}

source_local("get_deseq_dataset.R")

suppressPackageStartupMessages({
  library("DESeq2")
  library("RColorBrewer")
  library("gplots")
})

if (opt$cores > 1) {
  library("BiocParallel")
  register(MulticoreParam(opt$cores))
  parallel <- TRUE
} else {
  parallel <- FALSE
}

# build or read sample table

trim <- function(x) gsub("^\\s+|\\s+$", "", x)

# switch on if 'factors' was provided:
library("rjson")
parser <- newJSONParser()
parser$addData(opt$factors)
factor_list <- parser$getObject()
filenames_to_labels <- fromJSON(opt$files_to_labels)
factors <- sapply(factor_list, function(x) x[[1]])
primary_factor <- factors[1]
filenames_in <- unname(unlist(factor_list[[1]][[2]]))
labs <- unname(unlist(filenames_to_labels[basename(filenames_in)]))
sample_table <- data.frame(
  sample = basename(filenames_in),
  filename = filenames_in,
  row.names = filenames_in,
  stringsAsFactors = FALSE
)
for (factor in factor_list) {
  factor_name <- trim(factor[[1]])
  sample_table[[factor_name]] <- character(nrow(sample_table))
  lvls <- sapply(factor[[2]], function(x) names(x))
  for (i in seq_along(factor[[2]])) {
    files <- factor[[2]][[i]][[1]]
    sample_table[files, factor_name] <- trim(lvls[i])
  }
  sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls)
}
rownames(sample_table) <- labs

design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + ")))

# these are plots which are made once for each analysis
generate_generic_plots <- function(dds, factors) {
  library("ggplot2")
  library("ggrepel")
  library("pheatmap")

  rld <- rlog(dds)
  p <- plotPCA(rld, intgroup = rev(factors))
  print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3)  + geom_point())
  dat <- assay(rld)
  dists_rl <- dist(t(dat))
  mat <- as.matrix(dists_rl)
  colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
  pheatmap(
    mat,
    clustering_distance_rows = dists_rl,
    clustering_distance_cols = dists_rl,
    col = colors,
    main = "Sample-to-sample distances"
  )
  plotDispEsts(dds, main = "Dispersion estimates")
}

# these are plots which can be made for each comparison, e.g.
# once for C vs A and once for B vs A
generate_specific_plots <- function(res, threshold, title_suffix) {
  use <- res$baseMean > threshold
  if (sum(!use) == 0) {
    h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE)
    barplot(
      height = h$counts,
      col = "powderblue",
      space = 0,
      xlab = "p-values",
      ylab = "frequency",
      main = paste("Histogram of p-values for", title_suffix)
    )
    text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
  } else {
    h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE)
    h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE)
    colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue")
    barplot(
      height = rbind(h1$counts, h2$counts),
      beside = FALSE,
      col = colori,
      space = 0,
      xlab = "p-values",
      ylab = "frequency",
      main = paste("Histogram of p-values for", title_suffix)
    )
    text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
    legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white")
  }
    plotMA(res, main = paste("MA-plot for", title_suffix), ylim = range(res$log2FoldChange, na.rm = TRUE), alpha = opt$alpha_ma)
}

if (verbose) {
  cat(paste("primary factor:", primary_factor, "\n"))
  if (length(factors) > 1) {
    cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n"))
  }
  cat("\n---------------------\n")
}

dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = design_formula, tximport = opt$tximport, txtype = opt$txtype, tx2gene = opt$tx2gene)
# estimate size factors for the chosen method
if (!is.null(opt$esf)) {
    dds <- estimateSizeFactors(dds, type = opt$esf)
}

# estimate size factors for each sample
# - https://support.bioconductor.org/p/97676/
if (!is.null(opt$sizefactorsfile)) {
    nm <- assays(dds)[["avgTxLength"]]
    if (!is.null(nm)) {
        ## Recommended: takes into account tximport data
        cat("\nsize factors for samples: taking tximport data into account\n")
        size_factors <- estimateSizeFactorsForMatrix(counts(dds) / nm)
    } else {
        norm_factors <- normalizationFactors(dds)
        if (!is.null(norm_factors)) {
            ## In practice, gives same results as above.
            cat("\nsize factors for samples: no tximport data, using derived normalization factors\n")
            size_factors <- estimateSizeFactorsForMatrix(norm_factors)
        } else {
            ## If we have no other information, estimate from raw.
            cat("\nsize factors for samples: no tximport data, no normalization factors, estimating from raw data\n")
            size_factors <- estimateSizeFactorsForMatrix(counts(dds))
        }
    }
    write.table(size_factors, file = opt$sizefactorsfile, sep = "\t", col.names = FALSE, quote = FALSE)
}

apply_batch_factors <- function(dds, batch_factors) {
  rownames(batch_factors) <- batch_factors$identifier
  batch_factors <- subset(batch_factors, select = -c(identifier, condition))
  dds_samples <- colnames(dds)
  batch_samples <- rownames(batch_factors)
  if (!setequal(batch_samples, dds_samples)) {
    stop("Batch factor names don't correspond to input sample names, check input files")
  }
  dds_data <- colData(dds)
  # Merge dds_data with batch_factors using indexes, which are sample names
  # Set sort to False, which maintains the order in dds_data
  reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = FALSE)
  batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)]
  for (factor in colnames(batch_factors)) {
    dds[[factor]] <- batch_factors[[factor]]
  }
  colnames(dds) <- reordered_batch[, 1]
  return(dds)
}

if (!is.null(opt$batch_factors)) {
  batch_factors <- read.table(opt$batch_factors, sep = "\t", header = TRUE)
  dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors)
  batch_design <- colnames(batch_factors)[-c(1, 2)]
  design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + ")))
  design(dds) <- design_formula
}

if (verbose) {
  cat("DESeq2 run information\n\n")
  cat("sample table:\n")
  print(sample_table[, -c(1:2), drop = FALSE])
  cat("\ndesign formula:\n")
  print(design_formula)
  cat("\n\n")
  cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
}

# minimal pre-filtering
if (!is.null(opt$prefilter)) {
    keep <- rowSums(counts(dds)) >= opt$prefilter_value
    dds <- dds[keep, ]
}

# optional outlier behavior
if (is.null(opt$outlier_replace_off)) {
  min_rep <- 7
} else {
  min_rep <- Inf
  if (verbose) cat("outlier replacement off\n")
}
if (is.null(opt$outlier_filter_off)) {
  cooks_cutoff <- TRUE
} else {
  cooks_cutoff <- FALSE
  if (verbose) cat("outlier filtering off\n")
}

# optional automatic mean filtering
if (is.null(opt$auto_mean_filter_off)) {
  independent_filtering <- TRUE
} else {
  independent_filtering <- FALSE
  if (verbose) cat("automatic filtering on the mean off\n")
}

# shrinkage of LFCs
if (is.null(opt$beta_prior_off)) {
  beta_prior <- TRUE
} else {
  beta_prior <- FALSE
  if (verbose) cat("beta prior off\n")
}

# dispersion fit type
if (is.null(opt$fit_type)) {
  fit_type <- "parametric"
} else {
  fit_type <- c("parametric", "local", "mean")[opt$fit_type]
}

if (verbose) cat(paste("using disperion fit type:", fit_type, "\n"))

# run the analysis
dds <- DESeq(dds, fitType = fit_type, betaPrior = beta_prior, minReplicatesForReplace = min_rep, parallel = parallel)

# create the generic plots and leave the device open
if (!is.null(opt$plots)) {
  if (verbose) cat("creating plots\n")
  pdf(opt$plots)
  generate_generic_plots(dds, factors)
}

n <- nlevels(colData(dds)[[primary_factor]])
all_levels <- levels(colData(dds)[[primary_factor]])

if (!is.null(opt$countsfile)) {
    normalized_counts <- counts(dds, normalized = TRUE)
    write.table(normalized_counts, file = opt$countsfile, sep = "\t", col.names = NA, quote = FALSE)
}

if (!is.null(opt$rlogfile)) {
    rlog_normalized <- rlogTransformation(dds)
    rlog_normalized_mat <- assay(rlog_normalized)
    write.table(rlog_normalized_mat, file = opt$rlogfile, sep = "\t", col.names = NA, quote = FALSE)
}

if (!is.null(opt$vstfile)) {
    vst_normalized <- varianceStabilizingTransformation(dds)
    vst_normalized_mat <- assay(vst_normalized)
    write.table(vst_normalized_mat, file = opt$vstfile, sep = "\t", col.names = NA, quote = FALSE)
}


if (is.null(opt$many_contrasts)) {
  # only contrast the first and second level of the primary factor
  ref <- all_levels[1]
  lvl <- all_levels[2]
  res <- results(
    dds,
    contrast = c(primary_factor, lvl, ref),
    cooksCutoff = cooks_cutoff,
    independentFiltering = independent_filtering
  )
  if (verbose) {
    cat("summary of results\n")
    cat(paste0(primary_factor, ": ", lvl, " vs ", ref, "\n"))
    print(summary(res))
  }
  res_sorted <- res[order(res$padj), ]
  out_df <- as.data.frame(res_sorted)
  out_df$geneID <- rownames(out_df)  # nolint
  out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
  filename <- opt$outfile
  write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
  if (independent_filtering) {
    threshold <- unname(attr(res, "filterThreshold"))
  } else {
    threshold <- 0
  }
  title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
  if (!is.null(opt$plots)) {
    generate_specific_plots(res, threshold, title_suffix)
  }
} else {
  # rotate through the possible contrasts of the primary factor
  # write out a sorted table of results with the contrast as a suffix
  # add contrast specific plots to the device
  for (i in seq_len(n - 1)) {
    ref <- all_levels[i]
    contrast_levels <- all_levels[(i + 1):n]
    for (lvl in contrast_levels) {
      res <- results(
        dds,
        contrast = c(primary_factor, lvl, ref),
        cooksCutoff = cooks_cutoff,
        independentFiltering = independent_filtering
      )
      res_sorted <- res[order(res$padj), ]
      out_df <- as.data.frame(res_sorted)
      out_df$geneID <- rownames(out_df)  # nolint
      out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
      filename <- paste0(primary_factor, "_", lvl, "_vs_", ref)
      write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
      if (independent_filtering) {
        threshold <- unname(attr(res, "filterThreshold"))
      } else {
        threshold <- 0
      }
      title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
      if (!is.null(opt$plots)) {
        generate_specific_plots(res, threshold, title_suffix)
      }
    }
  }
}

# close the plot device
if (!is.null(opt$plots)) {
  cat("closing plot device\n")
  dev.off()
}

cat("Session information:\n\n")

sessionInfo()