diff deseq2.R @ 6:4939397c4706 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 3bc8d91ee546682ef8e9303bd1044bb14cf21b07
author iuc
date Wed, 09 Nov 2016 17:00:31 -0500
parents
children 6f91b8ce6e07
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deseq2.R	Wed Nov 09 17:00:31 2016 -0500
@@ -0,0 +1,373 @@
+#!/usr/bin/env Rscript
+
+# A command-line interface to DESeq2 for use with Galaxy
+# written by Bjoern Gruening and modified by Michael Love 2016.03.30
+#
+# one of these arguments is required:
+#
+#   'factors' a JSON list object from Galaxy
+#
+#   'sample_table' is a sample table as described in ?DESeqDataSetFromHTSeq
+#   with columns: sample name, filename, then factors (variables)
+#
+# the output file has columns:
+# 
+#   baseMean (mean normalized count)
+#   log2FoldChange (by default a moderated LFC estimate)
+#   lfcSE (the standard error)
+#   stat (the Wald statistic)
+#   pvalue (p-value from comparison of Wald statistic to a standard Normal)
+#   padj (adjusted p-value, Benjamini Hochberg correction on genes which pass the mean count filter)
+# 
+# the first variable in 'factors' and first column in 'sample_table' will be the primary factor.
+# the levels of the primary factor are used in the order of appearance in factors or in sample_table.
+#
+# by default, levels in the order A,B,C produces a single comparison of B vs A, to a single file 'outfile'
+#
+# for the 'many_contrasts' flag, levels in the order A,B,C produces comparisons C vs A, B vs A, C vs B,
+# to a number of files using the 'outfile' prefix: 'outfile.condition_C_vs_A' etc.
+# all plots will still be sent to a single PDF, named by the arg 'plots', with extra pages.
+#
+# fit_type is an integer valued argument, with the options from ?estimateDisperions
+#   1 "parametric"
+#   2 "local"
+#   3 "mean"
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+library("getopt")
+library("tools")
+options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs(trailingOnly = TRUE)
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+  "quiet", "q", 0, "logical",
+  "help", "h", 0, "logical",
+  "outfile", "o", 1, "character",
+  "countsfile", "n", 1, "character",
+  "factors", "f", 1, "character",
+  "plots" , "p", 1, "character",
+  "sample_table", "s", 1, "character",
+  "tximport", "i", 0, "logical",
+  "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF)
+  "fit_type", "t", 1, "integer",
+  "many_contrasts", "m", 0, "logical",
+  "outlier_replace_off" , "a", 0, "logical",
+  "outlier_filter_off" , "b", 0, "logical",
+  "auto_mean_filter_off", "c", 0, "logical",
+  "beta_prior_off", "d", 0, "logical"),
+  byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+
+# if help was asked for print a friendly message
+# and exit with a non-zero error code
+if (!is.null(opt$help)) {
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+# enforce the following required arguments
+if (is.null(opt$outfile)) {
+  cat("'outfile' is required\n")
+  q(status=1)
+}
+if (is.null(opt$sample_table) & is.null(opt$factors)) {
+  cat("'factors' or 'sample_table' is required\n")
+  q(status=1)
+}
+
+verbose <- if (is.null(opt$quiet)) {
+  TRUE
+} else {
+  FALSE
+}
+
+if (!is.null(opt$tximport)) {
+  if (is.null(opt$tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport")
+  if (tolower(file_ext(opt$tx2gene)) == "gtf") {
+    gtfFile <- opt$tx2gene
+  } else {
+    gtfFile <- NULL
+    tx2gene <- read.table(opt$tx2gene, header=FALSE)
+  }
+  useTXI <- TRUE
+} else {
+  useTXI <- FALSE
+}
+
+suppressPackageStartupMessages({
+  library("DESeq2")
+  library("RColorBrewer")
+  library("gplots")
+})
+
+# build or read sample table
+
+trim <- function (x) gsub("^\\s+|\\s+$", "", x)
+
+# switch on if 'factors' was provided:
+if (!is.null(opt$factors)) {
+  library("rjson")
+  parser <- newJSONParser()
+  parser$addData(opt$factors)
+  factorList <- parser$getObject()
+  factors <- sapply(factorList, function(x) x[[1]])
+  primaryFactor <- factors[1]
+  filenamesIn <- unname(unlist(factorList[[1]][[2]]))
+  sampleTable <- data.frame(sample=basename(filenamesIn),
+                            filename=filenamesIn,
+                            row.names=filenamesIn,
+                            stringsAsFactors=FALSE)
+  for (factor in factorList) {
+    factorName <- trim(factor[[1]])
+    sampleTable[[factorName]] <- character(nrow(sampleTable))
+    lvls <- sapply(factor[[2]], function(x) names(x))
+    for (i in seq_along(factor[[2]])) {
+      files <- factor[[2]][[i]][[1]]
+      sampleTable[files,factorName] <- trim(lvls[i])
+    }
+    sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls)
+  }
+  rownames(sampleTable) <- sampleTable$sample
+} else {
+  # read the sample_table argument
+  # this table is described in ?DESeqDataSet
+  # one column for the sample name, one for the filename, and
+  # the remaining columns for factors in the analysis
+  sampleTable <- read.delim(opt$sample_table, stringsAsFactors=FALSE)
+  factors <- colnames(sampleTable)[-c(1:2)]
+  for (factor in factors) {
+    lvls <- unique(as.character(sampleTable[[factor]]))
+    sampleTable[[factor]] <- factor(sampleTable[[factor]], levels=lvls)
+  }
+}
+
+primaryFactor <- factors[1]
+designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + ")))
+
+if (verbose) {
+  cat("DESeq2 run information\n\n")
+  cat("sample table:\n")
+  print(sampleTable[,-c(1:2),drop=FALSE])
+  cat("\ndesign formula:\n")
+  print(designFormula)
+  cat("\n\n")
+}
+
+# these are plots which are made once for each analysis
+generateGenericPlots <- function(dds, factors) {
+  rld <- rlog(dds)
+  d=plotPCA(rld, intgroup=rev(factors), returnData=TRUE)
+  labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors])))
+  library(ggplot2)
+  print(ggplot(d, aes(x=PC1,y=PC2, col=group,label=factor(labs)), environment = environment()) + geom_point() + geom_text(size=3))  
+  dat <- assay(rld)
+  colnames(dat) <- labs
+  distsRL <- dist(t(dat))
+  mat <- as.matrix(distsRL)
+  hc <- hclust(distsRL)
+  hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
+  heatmap.2(mat, Rowv=as.dendrogram(hc), symm=TRUE, trace="none", col = rev(hmcol),
+            main="Sample-to-sample distances", margin=c(13,13))
+  plotDispEsts(dds, main="Dispersion estimates")
+}
+
+# these are plots which can be made for each comparison, e.g.
+# once for C vs A and once for B vs A
+generateSpecificPlots <- function(res, threshold, title_suffix) {
+  use <- res$baseMean > threshold
+  if (sum(!use) == 0) {
+    h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE)
+    barplot(height = h$counts,
+            col = "powderblue", space = 0, xlab="p-values", ylab="frequency",
+            main=paste("Histogram of p-values for",title_suffix))
+    text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
+  } else {
+    h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
+    h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
+    colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue")
+    barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
+            col = colori, space = 0, xlab="p-values", ylab="frequency",
+            main=paste("Histogram of p-values for",title_suffix))
+    text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
+    legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white")
+  }
+    plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE))
+}
+
+if (verbose) {
+  cat(paste("primary factor:",primaryFactor,"\n"))
+  if (length(factors) > 1) {
+    cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n"))
+  }
+  cat("\n---------------------\n")
+}
+
+# if JSON input from Galaxy, path is absolute
+# otherwise, from sample_table, assume it is relative
+dir <- if (is.null(opt$factors)) {
+  "."
+} else {
+  ""
+}
+
+if (!useTXI) {
+  # construct the object from HTSeq files
+  dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
+                                    directory = dir,
+                                    design =  designFormula)
+} else {
+    # construct the object using tximport
+    # first need to make the tx2gene table
+    # this takes ~2-3 minutes using Bioconductor functions
+    if (!is.null(gtfFile)) {
+      suppressPackageStartupMessages({
+        library("GenomicFeatures")
+      })
+      txdb <- makeTxDbFromGFF(gtfFile, format="gtf")
+      k <- keys(txdb, keytype = "GENEID")
+      df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME")
+      tx2gene <- df[, 2:1]  # tx ID, then gene ID
+    }
+    library("tximport")
+    txiFiles <- as.character(sampleTable[,2])
+    names(txiFiles) <- as.character(sampleTable[,1])
+    txi <- tximport(txiFiles, type="sailfish", tx2gene=tx2gene)
+    dds <- DESeqDataSetFromTximport(txi,
+                                    sampleTable[,3:ncol(sampleTable),drop=FALSE],
+                                    designFormula)
+}
+
+if (verbose) cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
+# optional outlier behavior
+if (is.null(opt$outlier_replace_off)) {
+  minRep <- 7
+} else {
+  minRep <- Inf
+  if (verbose) cat("outlier replacement off\n")
+}
+if (is.null(opt$outlier_filter_off)) {
+  cooksCutoff <- TRUE
+} else {  
+  cooksCutoff <- FALSE
+  if (verbose) cat("outlier filtering off\n")
+}
+
+# optional automatic mean filtering
+if (is.null(opt$auto_mean_filter_off)) {
+  independentFiltering <- TRUE
+} else {
+  independentFiltering <- FALSE
+  if (verbose) cat("automatic filtering on the mean off\n")
+}
+
+# shrinkage of LFCs
+if (is.null(opt$beta_prior_off)) {
+  betaPrior <- TRUE
+} else {
+  betaPrior <- FALSE
+  if (verbose) cat("beta prior off\n")
+}
+
+# dispersion fit type
+if (is.null(opt$fit_type)) {
+  fitType <- "parametric"
+} else {
+  fitType <- c("parametric","local","mean")[opt$fit_type]
+}
+
+if (verbose) cat(paste("using disperion fit type:",fitType,"\n"))
+
+# run the analysis
+dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep)
+
+# create the generic plots and leave the device open
+if (!is.null(opt$plots)) {
+  if (verbose) cat("creating plots\n")
+  pdf(opt$plots)
+  generateGenericPlots(dds, factors)
+}
+
+n <- nlevels(colData(dds)[[primaryFactor]])
+allLevels <- levels(colData(dds)[[primaryFactor]])
+
+if (!is.null(opt$countsfile)) {
+    labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors])))
+    normalizedCounts<-counts(dds,normalized=TRUE)
+    colnames(normalizedCounts)<-labs
+    write.table(normalizedCounts, file=opt$countsfile, sep="\t", col.names=NA, quote=FALSE)
+}
+
+if (is.null(opt$many_contrasts)) {
+  # only contrast the first and second level of the primary factor
+  ref <- allLevels[1]
+  lvl <- allLevels[2]
+  res <- results(dds, contrast=c(primaryFactor, lvl, ref),
+                 cooksCutoff=cooksCutoff,
+                 independentFiltering=independentFiltering)
+  if (verbose) {
+    cat("summary of results\n")
+    cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n"))
+    print(summary(res))
+  }
+  resSorted <- res[order(res$padj),]
+  outDF <- as.data.frame(resSorted)
+  outDF$geneID <- rownames(outDF)
+  outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
+  filename <- opt$outfile
+  write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
+  if (independentFiltering) {
+    threshold <- unname(attr(res, "filterThreshold"))
+  } else {
+    threshold <- 0
+  }
+  title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
+  if (!is.null(opt$plots)) {
+    generateSpecificPlots(res, threshold, title_suffix)
+  }
+} else {
+  # rotate through the possible contrasts of the primary factor
+  # write out a sorted table of results with the contrast as a suffix
+  # add contrast specific plots to the device
+  for (i in seq_len(n-1)) {
+    ref <- allLevels[i]
+    contrastLevels <- allLevels[(i+1):n]
+    for (lvl in contrastLevels) {
+      res <- results(dds, contrast=c(primaryFactor, lvl, ref),
+                     cooksCutoff=cooksCutoff,
+                     independentFiltering=independentFiltering)
+      resSorted <- res[order(res$padj),]
+      outDF <- as.data.frame(resSorted)
+      outDF$geneID <- rownames(outDF)
+      outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
+      filename <- paste0(opt$outfile,".",primaryFactor,"_",lvl,"_vs_",ref)
+      write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
+      if (independentFiltering) {
+        threshold <- unname(attr(res, "filterThreshold"))
+      } else {
+        threshold <- 0
+      }
+      title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
+      if (!is.null(opt$plots)) {
+        generateSpecificPlots(res, threshold, title_suffix)
+      }
+    }
+  }
+}
+
+# close the plot device
+if (!is.null(opt$plots)) {
+  cat("closing plot device\n")
+  dev.off()
+}
+
+cat("Session information:\n\n")
+
+sessionInfo()
+