view deseq2.R @ 16:a416957ee305 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 81fa5b04b9ba9d4b1b59192af0adb1e1e18ca22b
author iuc
date Fri, 03 Aug 2018 17:23:46 -0400
parents 9a616afdbda5
children d9e5cadc7f0b
line wrap: on
line source

#!/usr/bin/env Rscript

# A command-line interface to DESeq2 for use with Galaxy
# written by Bjoern Gruening and modified by Michael Love 2016.03.30
#
# This argument is required:
#
#   'factors' a JSON list object from Galaxy
#
# the output file has columns:
#
#   baseMean (mean normalized count)
#   log2FoldChange (by default a moderated LFC estimate)
#   lfcSE (the standard error)
#   stat (the Wald statistic)
#   pvalue (p-value from comparison of Wald statistic to a standard Normal)
#   padj (adjusted p-value, Benjamini Hochberg correction on genes which pass the mean count filter)
#
# the first variable in 'factors' will be the primary factor.
# the levels of the primary factor are used in the order of appearance in factors.
#
# by default, levels in the order A,B,C produces a single comparison of B vs A, to a single file 'outfile'
#
# for the 'many_contrasts' flag, levels in the order A,B,C produces comparisons C vs A, B vs A, C vs B,
# to a number of files using the 'outfile' prefix: 'outfile.condition_C_vs_A' etc.
# all plots will still be sent to a single PDF, named by the arg 'plots', with extra pages.
#
# fit_type is an integer valued argument, with the options from ?estimateDisperions
#   1 "parametric"
#   2 "local"
#   3 "mean"

# setup R error handling to go to stderr
options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )

# we need that to not crash galaxy with an UTF8 error on German LC settings.
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

library("getopt")
library("tools")
options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
args <- commandArgs(trailingOnly = TRUE)

# get options, using the spec as defined by the enclosed list.
# we read the options from the default: commandArgs(TRUE).
spec <- matrix(c(
  "quiet", "q", 0, "logical",
  "help", "h", 0, "logical",
  "outfile", "o", 1, "character",
  "countsfile", "n", 1, "character",
  "header", "H", 0, "logical",
  "factors", "f", 1, "character",
  "files_to_labels", "l", 1, "character",
  "plots" , "p", 1, "character",
  "tximport", "i", 0, "logical",
  "txtype", "y", 1, "character",
  "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF)
  "fit_type", "t", 1, "integer",
  "many_contrasts", "m", 0, "logical",
  "outlier_replace_off" , "a", 0, "logical",
  "outlier_filter_off" , "b", 0, "logical",
  "auto_mean_filter_off", "c", 0, "logical",
  "beta_prior_off", "d", 0, "logical"),
  byrow=TRUE, ncol=4)
opt <- getopt(spec)

# if help was asked for print a friendly message
# and exit with a non-zero error code
if (!is.null(opt$help)) {
  cat(getopt(spec, usage=TRUE))
  q(status=1)
}

# enforce the following required arguments
if (is.null(opt$outfile)) {
  cat("'outfile' is required\n")
  q(status=1)
}
if (is.null(opt$factors)) {
  cat("'factors' is required\n")
  q(status=1)
}

verbose <- if (is.null(opt$quiet)) {
  TRUE
} else {
  FALSE
}

if (!is.null(opt$header)) {
  hasHeader <- TRUE
} else {
  hasHeader <- FALSE
}

if (!is.null(opt$tximport)) {
  if (is.null(opt$tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport")
  if (tolower(file_ext(opt$tx2gene)) == "gtf") {
    gtfFile <- opt$tx2gene
  } else {
    gtfFile <- NULL
    tx2gene <- read.table(opt$tx2gene, header=FALSE)
  }
  useTXI <- TRUE
} else {
  useTXI <- FALSE
}

suppressPackageStartupMessages({
  library("DESeq2")
  library("RColorBrewer")
  library("gplots")
})

# build or read sample table

trim <- function (x) gsub("^\\s+|\\s+$", "", x)

# switch on if 'factors' was provided:
library("rjson")
parser <- newJSONParser()
parser$addData(opt$factors)
factorList <- parser$getObject()
filenames_to_labels <- fromJSON(opt$files_to_labels)
factors <- sapply(factorList, function(x) x[[1]])
primaryFactor <- factors[1]
filenamesIn <- unname(unlist(factorList[[1]][[2]]))
labs = unname(unlist(filenames_to_labels[basename(filenamesIn)]))
sampleTable <- data.frame(sample=basename(filenamesIn),
                          filename=filenamesIn,
                          row.names=filenamesIn,
                          stringsAsFactors=FALSE)
for (factor in factorList) {
  factorName <- trim(factor[[1]])
  sampleTable[[factorName]] <- character(nrow(sampleTable))
  lvls <- sapply(factor[[2]], function(x) names(x))
  for (i in seq_along(factor[[2]])) {
    files <- factor[[2]][[i]][[1]]
    sampleTable[files,factorName] <- trim(lvls[i])
  }
  sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls)
}
rownames(sampleTable) <- labs

primaryFactor <- factors[1]
designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + ")))

# these are plots which are made once for each analysis
generateGenericPlots <- function(dds, factors) {
  library("ggplot2")
  library("ggrepel")
  library("pheatmap")

  rld <- rlog(dds)
  p <- plotPCA(rld, intgroup=rev(factors))
  print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size=3)  + geom_point())
  dat <- assay(rld)
  distsRL <- dist(t(dat))
  mat <- as.matrix(distsRL)
  colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
  pheatmap(mat,
           clustering_distance_rows=distsRL,
           clustering_distance_cols=distsRL,
           col=colors,
           main="Sample-to-sample distances")
  plotDispEsts(dds, main="Dispersion estimates")
}

# these are plots which can be made for each comparison, e.g.
# once for C vs A and once for B vs A
generateSpecificPlots <- function(res, threshold, title_suffix) {
  use <- res$baseMean > threshold
  if (sum(!use) == 0) {
    h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE)
    barplot(height = h$counts,
            col = "powderblue", space = 0, xlab="p-values", ylab="frequency",
            main=paste("Histogram of p-values for",title_suffix))
    text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
  } else {
    h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
    h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
    colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue")
    barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
            col = colori, space = 0, xlab="p-values", ylab="frequency",
            main=paste("Histogram of p-values for",title_suffix))
    text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
    legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white")
  }
    plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE))
}

if (verbose) {
  cat(paste("primary factor:",primaryFactor,"\n"))
  if (length(factors) > 1) {
    cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n"))
  }
  cat("\n---------------------\n")
}

# For JSON input from Galaxy, path is absolute
dir <- ""

if (!useTXI & hasHeader) {
    countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)})
    tbl <- do.call("cbind", countfiles)
    colnames(tbl) <- rownames(sampleTable) # take sample ids from header

    # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
    oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual",
        "not_aligned", "alignment_not_unique")
    specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames
    tbl <- tbl[!specialRows, , drop = FALSE]

    dds <- DESeqDataSetFromMatrix(countData = tbl,
                                  colData = sampleTable[,-c(1:2), drop=FALSE],
                                  design = designFormula)
} else if (!useTXI & !hasHeader) {

  # construct the object from HTSeq files
  dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
                                    directory = dir,
                                    design =  designFormula)
  labs <- unname(unlist(filenames_to_labels[colnames(dds)]))
  colnames(dds) <- labs

} else {
    # construct the object using tximport
    # first need to make the tx2gene table
    # this takes ~2-3 minutes using Bioconductor functions
    if (!is.null(gtfFile)) {
      suppressPackageStartupMessages({
        library("GenomicFeatures")
      })
      txdb <- makeTxDbFromGFF(gtfFile, format="gtf")
      k <- keys(txdb, keytype = "GENEID")
      df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME")
      tx2gene <- df[, 2:1]  # tx ID, then gene ID
    }
    library("tximport")
    txiFiles <- as.character(sampleTable[,2])
    labs <- unname(unlist(filenames_to_labels[sampleTable[,1]]))
    names(txiFiles) <- labs
    txi <- tximport(txiFiles, type=opt$txtype, tx2gene=tx2gene)
    dds <- DESeqDataSetFromTximport(txi,
                                    sampleTable[,3:ncol(sampleTable),drop=FALSE],
                                    designFormula)
}

if (verbose) {
  cat("DESeq2 run information\n\n")
  cat("sample table:\n")
  print(sampleTable[,-c(1:2),drop=FALSE])
  cat("\ndesign formula:\n")
  print(designFormula)
  cat("\n\n")
  cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
}

# optional outlier behavior
if (is.null(opt$outlier_replace_off)) {
  minRep <- 7
} else {
  minRep <- Inf
  if (verbose) cat("outlier replacement off\n")
}
if (is.null(opt$outlier_filter_off)) {
  cooksCutoff <- TRUE
} else {
  cooksCutoff <- FALSE
  if (verbose) cat("outlier filtering off\n")
}

# optional automatic mean filtering
if (is.null(opt$auto_mean_filter_off)) {
  independentFiltering <- TRUE
} else {
  independentFiltering <- FALSE
  if (verbose) cat("automatic filtering on the mean off\n")
}

# shrinkage of LFCs
if (is.null(opt$beta_prior_off)) {
  betaPrior <- TRUE
} else {
  betaPrior <- FALSE
  if (verbose) cat("beta prior off\n")
}

# dispersion fit type
if (is.null(opt$fit_type)) {
  fitType <- "parametric"
} else {
  fitType <- c("parametric","local","mean")[opt$fit_type]
}

if (verbose) cat(paste("using disperion fit type:",fitType,"\n"))

# run the analysis
dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep)

# create the generic plots and leave the device open
if (!is.null(opt$plots)) {
  if (verbose) cat("creating plots\n")
  pdf(opt$plots)
  generateGenericPlots(dds, factors)
}

n <- nlevels(colData(dds)[[primaryFactor]])
allLevels <- levels(colData(dds)[[primaryFactor]])

if (!is.null(opt$countsfile)) {
    normalizedCounts<-counts(dds,normalized=TRUE)
    write.table(normalizedCounts, file=opt$countsfile, sep="\t", col.names=NA, quote=FALSE)
}

if (is.null(opt$many_contrasts)) {
  # only contrast the first and second level of the primary factor
  ref <- allLevels[1]
  lvl <- allLevels[2]
  res <- results(dds, contrast=c(primaryFactor, lvl, ref),
                 cooksCutoff=cooksCutoff,
                 independentFiltering=independentFiltering)
  if (verbose) {
    cat("summary of results\n")
    cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n"))
    print(summary(res))
  }
  resSorted <- res[order(res$padj),]
  outDF <- as.data.frame(resSorted)
  outDF$geneID <- rownames(outDF)
  outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
  filename <- opt$outfile
  write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
  if (independentFiltering) {
    threshold <- unname(attr(res, "filterThreshold"))
  } else {
    threshold <- 0
  }
  title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
  if (!is.null(opt$plots)) {
    generateSpecificPlots(res, threshold, title_suffix)
  }
} else {
  # rotate through the possible contrasts of the primary factor
  # write out a sorted table of results with the contrast as a suffix
  # add contrast specific plots to the device
  for (i in seq_len(n-1)) {
    ref <- allLevels[i]
    contrastLevels <- allLevels[(i+1):n]
    for (lvl in contrastLevels) {
      res <- results(dds, contrast=c(primaryFactor, lvl, ref),
                     cooksCutoff=cooksCutoff,
                     independentFiltering=independentFiltering)
      resSorted <- res[order(res$padj),]
      outDF <- as.data.frame(resSorted)
      outDF$geneID <- rownames(outDF)
      outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
      filename <- paste0(opt$outfile,".",primaryFactor,"_",lvl,"_vs_",ref)
      write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
      if (independentFiltering) {
        threshold <- unname(attr(res, "filterThreshold"))
      } else {
        threshold <- 0
      }
      title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
      if (!is.null(opt$plots)) {
        generateSpecificPlots(res, threshold, title_suffix)
      }
    }
  }
}

# close the plot device
if (!is.null(opt$plots)) {
  cat("closing plot device\n")
  dev.off()
}

cat("Session information:\n\n")

sessionInfo()