changeset 17:d9e5cadc7f0b draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit b95582cea8320d5488056a9576474f79cec53be8
author iuc
date Wed, 05 Sep 2018 15:54:03 -0400
parents a416957ee305
children 3bf1b3ec1ddf
files deseq2.R deseq2.xml get_deseq_dataset.R test-data/batch_factors.tab test-data/out_deseq2_sailfish.tab
diffstat 5 files changed, 139 insertions(+), 62 deletions(-) [+]
line wrap: on
line diff
--- a/deseq2.R	Fri Aug 03 17:23:46 2018 -0400
+++ b/deseq2.R	Wed Sep 05 15:54:03 2018 -0400
@@ -46,6 +46,7 @@
 spec <- matrix(c(
   "quiet", "q", 0, "logical",
   "help", "h", 0, "logical",
+  "batch_factors", "", 1, "character",
   "outfile", "o", 1, "character",
   "countsfile", "n", 1, "character",
   "header", "H", 0, "logical",
@@ -87,24 +88,13 @@
   FALSE
 }
 
-if (!is.null(opt$header)) {
-  hasHeader <- TRUE
-} else {
-  hasHeader <- FALSE
+source_local <- function(fname){
+    argv <- commandArgs(trailingOnly = FALSE)
+    base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
+    source(paste(base_dir, fname, sep="/"))
 }
 
-if (!is.null(opt$tximport)) {
-  if (is.null(opt$tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport")
-  if (tolower(file_ext(opt$tx2gene)) == "gtf") {
-    gtfFile <- opt$tx2gene
-  } else {
-    gtfFile <- NULL
-    tx2gene <- read.table(opt$tx2gene, header=FALSE)
-  }
-  useTXI <- TRUE
-} else {
-  useTXI <- FALSE
-}
+source_local('get_deseq_dataset.R')
 
 suppressPackageStartupMessages({
   library("DESeq2")
@@ -197,53 +187,34 @@
   cat("\n---------------------\n")
 }
 
-# For JSON input from Galaxy, path is absolute
-dir <- ""
-
-if (!useTXI & hasHeader) {
-    countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)})
-    tbl <- do.call("cbind", countfiles)
-    colnames(tbl) <- rownames(sampleTable) # take sample ids from header
-
-    # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
-    oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual",
-        "not_aligned", "alignment_not_unique")
-    specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames
-    tbl <- tbl[!specialRows, , drop = FALSE]
-
-    dds <- DESeqDataSetFromMatrix(countData = tbl,
-                                  colData = sampleTable[,-c(1:2), drop=FALSE],
-                                  design = designFormula)
-} else if (!useTXI & !hasHeader) {
+dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula=designFormula, tximport=opt$tximport, txtype=opt$txtype, tx2gene=opt$tx2gene)
 
-  # construct the object from HTSeq files
-  dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
-                                    directory = dir,
-                                    design =  designFormula)
-  labs <- unname(unlist(filenames_to_labels[colnames(dds)]))
-  colnames(dds) <- labs
+apply_batch_factors <- function (dds, batch_factors) {
+  rownames(batch_factors) <- batch_factors$identifier
+  batch_factors <- subset(batch_factors, select = -c(identifier, condition))
+  dds_samples <- colnames(dds)
+  batch_samples <- rownames(batch_factors)
+  if (!setequal(batch_samples, dds_samples)) {
+    stop("Batch factor names don't correspond to input sample names, check input files")
+  }
+  dds_data <- colData(dds)
+  # Merge dds_data with batch_factors using indexes, which are sample names
+  # Set sort to False, which maintains the order in dds_data
+  reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort=F)
+  batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)]
+  for (factor in colnames(batch_factors)) {
+    dds[[factor]] <- batch_factors[[factor]]
+  }
+  colnames(dds) <- reordered_batch[,1]
+  return(dds)
+}
 
-} else {
-    # construct the object using tximport
-    # first need to make the tx2gene table
-    # this takes ~2-3 minutes using Bioconductor functions
-    if (!is.null(gtfFile)) {
-      suppressPackageStartupMessages({
-        library("GenomicFeatures")
-      })
-      txdb <- makeTxDbFromGFF(gtfFile, format="gtf")
-      k <- keys(txdb, keytype = "GENEID")
-      df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME")
-      tx2gene <- df[, 2:1]  # tx ID, then gene ID
-    }
-    library("tximport")
-    txiFiles <- as.character(sampleTable[,2])
-    labs <- unname(unlist(filenames_to_labels[sampleTable[,1]]))
-    names(txiFiles) <- labs
-    txi <- tximport(txiFiles, type=opt$txtype, tx2gene=tx2gene)
-    dds <- DESeqDataSetFromTximport(txi,
-                                    sampleTable[,3:ncol(sampleTable),drop=FALSE],
-                                    designFormula)
+if (!is.null(opt$batch_factors)) {
+  batch_factors <- read.table(opt$batch_factors, sep="\t", header=T)
+  dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors)
+  batch_design <- colnames(batch_factors)[-c(1,2)]
+  designFormula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse=" + ")))
+  design(dds) <- designFormula
 }
 
 if (verbose) {
--- a/deseq2.xml	Fri Aug 03 17:23:46 2018 -0400
+++ b/deseq2.xml	Wed Sep 05 15:54:03 2018 -0400
@@ -64,6 +64,9 @@
     -f '#echo json.dumps(temp_factor_names)#'
     -l '#echo json.dumps(filename_to_element_identifiers)#'
     -t $fit_type
+    #if $batch_factors
+        --batch_factors '$batch_factors'
+    #end if
     #if $outlier_replace_off:
         -a
     #end if
@@ -105,7 +108,7 @@
                 <param name="countsFile" type="data" format="tabular" multiple="true" label="Counts file(s)"/>
             </repeat>
         </repeat>
-
+        <param name="batch_factors" type="data" format="tabular" optional="true" label="(Optional) provide a tabular file with additional batch factors to include in the model." help="You can produce this file using RUVSeq or svaseq."/>
         <param name="header" type="boolean" truevalue="-H" falsevalue="" checked="true" label="Files have header?" help="If this option is set to Yes, the tool will assume that the count files have column headers in the first row. Default: Yes" />
 
         <conditional name="tximport">
@@ -206,6 +209,28 @@
                 </assert_contents>
             </output>
         </test>
+        <!--Ensure additional batch factor correction works -->
+        <test expect_num_outputs="2">
+            <repeat name="rep_factorName">
+                <param name="factorName" value="Treatment"/>
+                <repeat name="rep_factorLevel">
+                    <param name="factorLevel" value="Treated"/>
+                    <param name="countsFile" value="GSM461179_treat_single.counts,GSM461180_treat_paired.counts,GSM461181_treat_paired.counts"/>
+                </repeat>
+                <repeat name="rep_factorLevel">
+                    <param name="factorLevel" value="Untreated"/>
+                    <param name="countsFile" value="GSM461176_untreat_single.counts,GSM461177_untreat_paired.counts,GSM461178_untreat_paired.counts,GSM461182_untreat_single.counts"/>
+                </repeat>
+            </repeat>
+            <param name="batch_factors" value="batch_factors.tab"/>
+            <param name="pdf" value="False"/>
+            <param name="normCounts" value="True"/>
+            <output name="deseq_out">
+                <assert_contents>
+                    <has_text_matching expression="FBgn0003360\t1933.*\t-2.9.*\t0.1.*\t-26.*\t1.*-152\t4.*-149" />
+                </assert_contents>
+            </output>
+        </test>
         <!--Ensure counts files without header works -->
         <test expect_num_outputs="2">
             <repeat name="rep_factorName">
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_deseq_dataset.R	Wed Sep 05 15:54:03 2018 -0400
@@ -0,0 +1,69 @@
+get_deseq_dataset <- function(sampleTable, header, designFormula, tximport, txtype, tx2gene) {
+
+  dir <- ""
+
+  if (!is.null(header)) {
+    hasHeader <- TRUE
+  } else {
+    hasHeader <- FALSE
+  }
+
+  if (!is.null(tximport)) {
+    if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport")
+    if (tolower(file_ext(opt$tx2gene)) == "gtf") {
+      gtfFile <-tx2gene
+    } else {
+      gtfFile <- NULL
+      tx2gene <- read.table(tx2gene, header=FALSE)
+    }
+    useTXI <- TRUE
+  } else {
+    useTXI <- FALSE
+  }
+
+  if (!useTXI & hasHeader) {
+      countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)})
+      tbl <- do.call("cbind", countfiles)
+      colnames(tbl) <- rownames(sampleTable) # take sample ids from header
+
+      # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
+      oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual",
+          "not_aligned", "alignment_not_unique")
+      specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames
+      tbl <- tbl[!specialRows, , drop = FALSE]
+
+      dds <- DESeqDataSetFromMatrix(countData = tbl,
+                                    colData = subset(sampleTable, select=-(filename)),
+                                    design = designFormula)
+  } else if (!useTXI & !hasHeader) {
+
+    # construct the object from HTSeq files
+    dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
+                                      directory = dir,
+                                      design =  designFormula)
+    colnames(dds) <- row.names(sampleTable)
+
+  } else {
+      # construct the object using tximport
+      # first need to make the tx2gene table
+      # this takes ~2-3 minutes using Bioconductor functions
+      if (!is.null(gtfFile)) {
+        suppressPackageStartupMessages({
+          library("GenomicFeatures")
+        })
+        txdb <- makeTxDbFromGFF(gtfFile, format="gtf")
+        k <- keys(txdb, keytype = "GENEID")
+        df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME")
+        tx2gene <- df[, 2:1]  # tx ID, then gene ID
+      }
+      library("tximport")
+      txiFiles <- as.character(sampleTable$filename)
+      labs <- row.names(sampleTable)
+      names(txiFiles) <- labs
+      txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene)
+      dds <- DESeqDataSetFromTximport(txi,
+                                      subset(sampleTable, select=-c(filename)),
+                                      designFormula)
+  }
+  return(dds)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/batch_factors.tab	Wed Sep 05 15:54:03 2018 -0400
@@ -0,0 +1,8 @@
+identifier	condition	W_1
+GSM461179_treat_single.counts	Treatment	-0.607164236699804
+GSM461180_treat_paired.counts	Treatment	0.261662524312529
+GSM461181_treat_paired.counts	Treatment	0.341268033254441
+GSM461176_untreat_single.counts	Control	-0.297792561552867
+GSM461177_untreat_paired.counts	Control	0.322555604859173
+GSM461178_untreat_paired.counts	Control	0.345745982909988
+GSM461182_untreat_single.counts	Control	-0.366275347083457
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_deseq2_sailfish.tab	Wed Sep 05 15:54:03 2018 -0400
@@ -0,0 +1,4 @@
+MIR6859-2	1.1858265336986	-1.58325519934585	1.29566733255222	-1.22196119294536	0.221722302676964	0.886889210707857
+DDX11L1	1.91972630671259	-0.204757212641591	1.31524142129485	-0.155680325548148	0.876285006005361	0.999999999763967
+MIR6859-1	0.294068009390125	-0.712896459907293	1.28824575367762	-0.55338545294805	0.579999497913718	0.999999999763967
+WASH7P	114.545909292233	-3.75034693001422e-10	1.2677635335979	-2.95823852841924e-10	0.999999999763967	0.999999999763967