Mercurial > repos > iuc > deseq2
view deseq2.R @ 16:a416957ee305 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 81fa5b04b9ba9d4b1b59192af0adb1e1e18ca22b
author | iuc |
---|---|
date | Fri, 03 Aug 2018 17:23:46 -0400 |
parents | 9a616afdbda5 |
children | d9e5cadc7f0b |
line wrap: on
line source
#!/usr/bin/env Rscript # A command-line interface to DESeq2 for use with Galaxy # written by Bjoern Gruening and modified by Michael Love 2016.03.30 # # This argument is required: # # 'factors' a JSON list object from Galaxy # # the output file has columns: # # baseMean (mean normalized count) # log2FoldChange (by default a moderated LFC estimate) # lfcSE (the standard error) # stat (the Wald statistic) # pvalue (p-value from comparison of Wald statistic to a standard Normal) # padj (adjusted p-value, Benjamini Hochberg correction on genes which pass the mean count filter) # # the first variable in 'factors' will be the primary factor. # the levels of the primary factor are used in the order of appearance in factors. # # by default, levels in the order A,B,C produces a single comparison of B vs A, to a single file 'outfile' # # for the 'many_contrasts' flag, levels in the order A,B,C produces comparisons C vs A, B vs A, C vs B, # to a number of files using the 'outfile' prefix: 'outfile.condition_C_vs_A' etc. # all plots will still be sent to a single PDF, named by the arg 'plots', with extra pages. # # fit_type is an integer valued argument, with the options from ?estimateDisperions # 1 "parametric" # 2 "local" # 3 "mean" # setup R error handling to go to stderr options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) # we need that to not crash galaxy with an UTF8 error on German LC settings. loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") library("getopt") library("tools") options(stringAsFactors = FALSE, useFancyQuotes = FALSE) args <- commandArgs(trailingOnly = TRUE) # get options, using the spec as defined by the enclosed list. # we read the options from the default: commandArgs(TRUE). spec <- matrix(c( "quiet", "q", 0, "logical", "help", "h", 0, "logical", "outfile", "o", 1, "character", "countsfile", "n", 1, "character", "header", "H", 0, "logical", "factors", "f", 1, "character", "files_to_labels", "l", 1, "character", "plots" , "p", 1, "character", "tximport", "i", 0, "logical", "txtype", "y", 1, "character", "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) "fit_type", "t", 1, "integer", "many_contrasts", "m", 0, "logical", "outlier_replace_off" , "a", 0, "logical", "outlier_filter_off" , "b", 0, "logical", "auto_mean_filter_off", "c", 0, "logical", "beta_prior_off", "d", 0, "logical"), byrow=TRUE, ncol=4) opt <- getopt(spec) # if help was asked for print a friendly message # and exit with a non-zero error code if (!is.null(opt$help)) { cat(getopt(spec, usage=TRUE)) q(status=1) } # enforce the following required arguments if (is.null(opt$outfile)) { cat("'outfile' is required\n") q(status=1) } if (is.null(opt$factors)) { cat("'factors' is required\n") q(status=1) } verbose <- if (is.null(opt$quiet)) { TRUE } else { FALSE } if (!is.null(opt$header)) { hasHeader <- TRUE } else { hasHeader <- FALSE } if (!is.null(opt$tximport)) { if (is.null(opt$tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport") if (tolower(file_ext(opt$tx2gene)) == "gtf") { gtfFile <- opt$tx2gene } else { gtfFile <- NULL tx2gene <- read.table(opt$tx2gene, header=FALSE) } useTXI <- TRUE } else { useTXI <- FALSE } suppressPackageStartupMessages({ library("DESeq2") library("RColorBrewer") library("gplots") }) # build or read sample table trim <- function (x) gsub("^\\s+|\\s+$", "", x) # switch on if 'factors' was provided: library("rjson") parser <- newJSONParser() parser$addData(opt$factors) factorList <- parser$getObject() filenames_to_labels <- fromJSON(opt$files_to_labels) factors <- sapply(factorList, function(x) x[[1]]) primaryFactor <- factors[1] filenamesIn <- unname(unlist(factorList[[1]][[2]])) labs = unname(unlist(filenames_to_labels[basename(filenamesIn)])) sampleTable <- data.frame(sample=basename(filenamesIn), filename=filenamesIn, row.names=filenamesIn, stringsAsFactors=FALSE) for (factor in factorList) { factorName <- trim(factor[[1]]) sampleTable[[factorName]] <- character(nrow(sampleTable)) lvls <- sapply(factor[[2]], function(x) names(x)) for (i in seq_along(factor[[2]])) { files <- factor[[2]][[i]][[1]] sampleTable[files,factorName] <- trim(lvls[i]) } sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) } rownames(sampleTable) <- labs primaryFactor <- factors[1] designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + "))) # these are plots which are made once for each analysis generateGenericPlots <- function(dds, factors) { library("ggplot2") library("ggrepel") library("pheatmap") rld <- rlog(dds) p <- plotPCA(rld, intgroup=rev(factors)) print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size=3) + geom_point()) dat <- assay(rld) distsRL <- dist(t(dat)) mat <- as.matrix(distsRL) colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) pheatmap(mat, clustering_distance_rows=distsRL, clustering_distance_cols=distsRL, col=colors, main="Sample-to-sample distances") plotDispEsts(dds, main="Dispersion estimates") } # these are plots which can be made for each comparison, e.g. # once for C vs A and once for B vs A generateSpecificPlots <- function(res, threshold, title_suffix) { use <- res$baseMean > threshold if (sum(!use) == 0) { h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE) barplot(height = h$counts, col = "powderblue", space = 0, xlab="p-values", ylab="frequency", main=paste("Histogram of p-values for",title_suffix)) text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA) } else { h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE) h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE) colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue") barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, xlab="p-values", ylab="frequency", main=paste("Histogram of p-values for",title_suffix)) text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA) legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white") } plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE)) } if (verbose) { cat(paste("primary factor:",primaryFactor,"\n")) if (length(factors) > 1) { cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n")) } cat("\n---------------------\n") } # For JSON input from Galaxy, path is absolute dir <- "" if (!useTXI & hasHeader) { countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)}) tbl <- do.call("cbind", countfiles) colnames(tbl) <- rownames(sampleTable) # take sample ids from header # check for htseq report lines (from DESeqDataSetFromHTSeqCount function) oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual", "not_aligned", "alignment_not_unique") specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames tbl <- tbl[!specialRows, , drop = FALSE] dds <- DESeqDataSetFromMatrix(countData = tbl, colData = sampleTable[,-c(1:2), drop=FALSE], design = designFormula) } else if (!useTXI & !hasHeader) { # construct the object from HTSeq files dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = dir, design = designFormula) labs <- unname(unlist(filenames_to_labels[colnames(dds)])) colnames(dds) <- labs } else { # construct the object using tximport # first need to make the tx2gene table # this takes ~2-3 minutes using Bioconductor functions if (!is.null(gtfFile)) { suppressPackageStartupMessages({ library("GenomicFeatures") }) txdb <- makeTxDbFromGFF(gtfFile, format="gtf") k <- keys(txdb, keytype = "GENEID") df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME") tx2gene <- df[, 2:1] # tx ID, then gene ID } library("tximport") txiFiles <- as.character(sampleTable[,2]) labs <- unname(unlist(filenames_to_labels[sampleTable[,1]])) names(txiFiles) <- labs txi <- tximport(txiFiles, type=opt$txtype, tx2gene=tx2gene) dds <- DESeqDataSetFromTximport(txi, sampleTable[,3:ncol(sampleTable),drop=FALSE], designFormula) } if (verbose) { cat("DESeq2 run information\n\n") cat("sample table:\n") print(sampleTable[,-c(1:2),drop=FALSE]) cat("\ndesign formula:\n") print(designFormula) cat("\n\n") cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n")) } # optional outlier behavior if (is.null(opt$outlier_replace_off)) { minRep <- 7 } else { minRep <- Inf if (verbose) cat("outlier replacement off\n") } if (is.null(opt$outlier_filter_off)) { cooksCutoff <- TRUE } else { cooksCutoff <- FALSE if (verbose) cat("outlier filtering off\n") } # optional automatic mean filtering if (is.null(opt$auto_mean_filter_off)) { independentFiltering <- TRUE } else { independentFiltering <- FALSE if (verbose) cat("automatic filtering on the mean off\n") } # shrinkage of LFCs if (is.null(opt$beta_prior_off)) { betaPrior <- TRUE } else { betaPrior <- FALSE if (verbose) cat("beta prior off\n") } # dispersion fit type if (is.null(opt$fit_type)) { fitType <- "parametric" } else { fitType <- c("parametric","local","mean")[opt$fit_type] } if (verbose) cat(paste("using disperion fit type:",fitType,"\n")) # run the analysis dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep) # create the generic plots and leave the device open if (!is.null(opt$plots)) { if (verbose) cat("creating plots\n") pdf(opt$plots) generateGenericPlots(dds, factors) } n <- nlevels(colData(dds)[[primaryFactor]]) allLevels <- levels(colData(dds)[[primaryFactor]]) if (!is.null(opt$countsfile)) { normalizedCounts<-counts(dds,normalized=TRUE) write.table(normalizedCounts, file=opt$countsfile, sep="\t", col.names=NA, quote=FALSE) } if (is.null(opt$many_contrasts)) { # only contrast the first and second level of the primary factor ref <- allLevels[1] lvl <- allLevels[2] res <- results(dds, contrast=c(primaryFactor, lvl, ref), cooksCutoff=cooksCutoff, independentFiltering=independentFiltering) if (verbose) { cat("summary of results\n") cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n")) print(summary(res)) } resSorted <- res[order(res$padj),] outDF <- as.data.frame(resSorted) outDF$geneID <- rownames(outDF) outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] filename <- opt$outfile write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) if (independentFiltering) { threshold <- unname(attr(res, "filterThreshold")) } else { threshold <- 0 } title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref) if (!is.null(opt$plots)) { generateSpecificPlots(res, threshold, title_suffix) } } else { # rotate through the possible contrasts of the primary factor # write out a sorted table of results with the contrast as a suffix # add contrast specific plots to the device for (i in seq_len(n-1)) { ref <- allLevels[i] contrastLevels <- allLevels[(i+1):n] for (lvl in contrastLevels) { res <- results(dds, contrast=c(primaryFactor, lvl, ref), cooksCutoff=cooksCutoff, independentFiltering=independentFiltering) resSorted <- res[order(res$padj),] outDF <- as.data.frame(resSorted) outDF$geneID <- rownames(outDF) outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] filename <- paste0(opt$outfile,".",primaryFactor,"_",lvl,"_vs_",ref) write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) if (independentFiltering) { threshold <- unname(attr(res, "filterThreshold")) } else { threshold <- 0 } title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref) if (!is.null(opt$plots)) { generateSpecificPlots(res, threshold, title_suffix) } } } } # close the plot device if (!is.null(opt$plots)) { cat("closing plot device\n") dev.off() } cat("Session information:\n\n") sessionInfo()