Galaxy |

Changeset 25:de44f8eff84a (2020-11-25)

Previous changeset 24:71bacea10eee (2020-02-29) Next changeset 26:6a3a025714d3 (2021-01-08)

Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 9eb6d07510ccf27d6499172d62c81661078ec57b"

modified:
deseq2.R
deseq2.xml
get_deseq_dataset.R

diff -r 71bacea10eee -r de44f8eff84a deseq2.R
--- a/deseq2.R Sat Feb 29 17:08:08 2020 -0500
+++ b/deseq2.R Wed Nov 25 18:36:55 2020 +0000

[

b'@@ -31,7 +31,9 @@\n # 3 "mean"\n \n # setup R error handling to go to stderr\n-options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )\n+options(show.error.messages = F, error = function() {\n+ cat(geterrmessage(), file = stderr()); q("no", 1, F)\n+})\n \n # we need that to not crash galaxy with an UTF8 error on German LC settings.\n loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")\n@@ -55,50 +57,46 @@\n "header", "H", 0, "logical",\n "factors", "f", 1, "character",\n "files_to_labels", "l", 1, "character",\n- "plots" , "p", 1, "character",\n+ "plots", "p", 1, "character",\n "tximport", "i", 0, "logical",\n "txtype", "y", 1, "character",\n "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file\n "esf", "e", 1, "character",\n "fit_type", "t", 1, "integer",\n "many_contrasts", "m", 0, "logical",\n- "outlier_replace_off" , "a", 0, "logical",\n- "outlier_filter_off" , "b", 0, "logical",\n+ "outlier_replace_off", "a", 0, "logical",\n+ "outlier_filter_off", "b", 0, "logical",\n "auto_mean_filter_off", "c", 0, "logical",\n- "beta_prior_off", "d", 0, "logical"),\n- byrow=TRUE, ncol=4)\n+ "beta_prior_off", "d", 0, "logical"\n+), byrow = TRUE, ncol = 4)\n opt <- getopt(spec)\n \n # if help was asked for print a friendly message\n # and exit with a non-zero error code\n if (!is.null(opt$help)) {\n- cat(getopt(spec, usage=TRUE))\n- q(status=1)\n+ cat(getopt(spec, usage = TRUE))\n+ q(status = 1)\n }\n \n # enforce the following required arguments\n if (is.null(opt$outfile)) {\n cat("\'outfile\' is required\\n")\n- q(status=1)\n+ q(status = 1)\n }\n if (is.null(opt$factors)) {\n cat("\'factors\' is required\\n")\n- q(status=1)\n+ q(status = 1)\n }\n \n-verbose <- if (is.null(opt$quiet)) {\n- TRUE\n-} else {\n- FALSE\n+verbose <- is.null(opt$quiet)\n+\n+source_local <- function(fname) {\n+ argv <- commandArgs(trailingOnly = FALSE)\n+ base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))\n+ source(paste(base_dir, fname, sep = "/"))\n }\n \n-source_local <- function(fname){\n- argv <- commandArgs(trailingOnly = FALSE)\n- base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))\n- source(paste(base_dir, fname, sep="/"))\n-}\n-\n-source_local(\'get_deseq_dataset.R\')\n+source_local("get_deseq_dataset.R")\n \n suppressPackageStartupMessages({\n library("DESeq2")\n@@ -109,102 +107,116 @@\n if (opt$cores > 1) {\n library("BiocParallel")\n register(MulticoreParam(opt$cores))\n- parallel = TRUE\n+ parallel <- TRUE\n } else {\n- parallel = FALSE\n+ parallel <- FALSE\n }\n \n # build or read sample table\n \n-trim <- function (x) gsub("^\\\\s+|\\\\s+$", "", x)\n+trim <- function(x) gsub("^\\\\s+|\\\\s+$", "", x)\n \n # switch on if \'factors\' was provided:\n library("rjson")\n parser <- newJSONParser()\n parser$addData(opt$factors)\n-factorList <- parser$getObject()\n+factor_list <- parser$getObject()\n filenames_to_labels <- fromJSON(opt$files_to_labels)\n-factors <- sapply(factorList, function(x) x[[1]])\n-primaryFactor <- factors[1]\n-filenamesIn <- unname(unlist(factorList[[1]][[2]]))\n-labs = unname(unlist(filenames_to_labels[basename(filenamesIn)]))\n-sampleTable <- data.frame(sample=basename(filenamesIn),\n- filename=filenamesIn,\n- row.names=filenamesIn,\n- stringsAsFactors=FALSE)\n-for (factor in factorList) {\n- factorName <- trim(factor[[1]])\n- sampleTable[[factorName]] <- character(nrow(sampleTable))\n+factors <- sapply(factor_list, function(x) x[[1]])\n+primary_factor <- factors[1]\n+filenames_in <- unname(unlist(factor_list[[1]][[2]]))\n+labs <- unname(unlist(filenames_to_labels[basename(filenames_in)]))\n+sample_table <- data.frame(\n+ sample = basename(filenames_in),\n+ filename = filenames_in,\n+ row.names = filenames_in,\n+ stringsAsFactors = FALSE\n+)\n+for (factor in factor_list) {\n+ factor_name <- trim(factor[[1]])\n+ sample_table[[factor_name]] <- character(nrow(sample_table))\n lvls <- sapply(factor[[2]], function('..b'is.null(opt$many_contrasts)) {\n # only contrast the first and second level of the primary factor\n- ref <- allLevels[1]\n- lvl <- allLevels[2]\n- res <- results(dds, contrast=c(primaryFactor, lvl, ref),\n- cooksCutoff=cooksCutoff,\n- independentFiltering=independentFiltering)\n+ ref <- all_levels[1]\n+ lvl <- all_levels[2]\n+ res <- results(\n+ dds,\n+ contrast = c(primary_factor, lvl, ref),\n+ cooksCutoff = cooks_cutoff,\n+ independentFiltering = independent_filtering\n+ )\n if (verbose) {\n cat("summary of results\\n")\n- cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\\n"))\n+ cat(paste0(primary_factor, ": ", lvl, " vs ", ref, "\\n"))\n print(summary(res))\n }\n- resSorted <- res[order(res$padj),]\n- outDF <- as.data.frame(resSorted)\n- outDF$geneID <- rownames(outDF)\n- outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]\n+ res_sorted <- res[order(res$padj), ]\n+ out_df <- as.data.frame(res_sorted)\n+ out_df$geneID <- rownames(out_df) # nolint\n+ out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]\n filename <- opt$outfile\n- write.table(outDF, file=filename, sep="\\t", quote=FALSE, row.names=FALSE, col.names=FALSE)\n- if (independentFiltering) {\n+ write.table(out_df, file = filename, sep = "\\t", quote = FALSE, row.names = FALSE, col.names = FALSE)\n+ if (independent_filtering) {\n threshold <- unname(attr(res, "filterThreshold"))\n } else {\n threshold <- 0\n }\n- title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)\n+ title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)\n if (!is.null(opt$plots)) {\n- generateSpecificPlots(res, threshold, title_suffix)\n+ generate_specific_plots(res, threshold, title_suffix)\n }\n } else {\n # rotate through the possible contrasts of the primary factor\n # write out a sorted table of results with the contrast as a suffix\n # add contrast specific plots to the device\n- for (i in seq_len(n-1)) {\n- ref <- allLevels[i]\n- contrastLevels <- allLevels[(i+1):n]\n- for (lvl in contrastLevels) {\n- res <- results(dds, contrast=c(primaryFactor, lvl, ref),\n- cooksCutoff=cooksCutoff,\n- independentFiltering=independentFiltering)\n- resSorted <- res[order(res$padj),]\n- outDF <- as.data.frame(resSorted)\n- outDF$geneID <- rownames(outDF)\n- outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]\n- filename <- paste0(primaryFactor,"_",lvl,"_vs_",ref)\n- write.table(outDF, file=filename, sep="\\t", quote=FALSE, row.names=FALSE, col.names=FALSE)\n- if (independentFiltering) {\n+ for (i in seq_len(n - 1)) {\n+ ref <- all_levels[i]\n+ contrast_levels <- all_levels[(i + 1):n]\n+ for (lvl in contrast_levels) {\n+ res <- results(\n+ dds,\n+ contrast = c(primary_factor, lvl, ref),\n+ cooksCutoff = cooks_cutoff,\n+ independentFiltering = independent_filtering\n+ )\n+ res_sorted <- res[order(res$padj), ]\n+ out_df <- as.data.frame(res_sorted)\n+ out_df$geneID <- rownames(out_df) # nolint\n+ out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]\n+ filename <- paste0(primary_factor, "_", lvl, "_vs_", ref)\n+ write.table(out_df, file = filename, sep = "\\t", quote = FALSE, row.names = FALSE, col.names = FALSE)\n+ if (independent_filtering) {\n threshold <- unname(attr(res, "filterThreshold"))\n } else {\n threshold <- 0\n }\n- title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)\n+ title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)\n if (!is.null(opt$plots)) {\n- generateSpecificPlots(res, threshold, title_suffix)\n+ generate_specific_plots(res, threshold, title_suffix)\n }\n }\n }\n@@ -378,4 +396,3 @@\n cat("Session information:\\n\\n")\n \n sessionInfo()\n-\n'

diff -r 71bacea10eee -r de44f8eff84a deseq2.xml
--- a/deseq2.xml Sat Feb 29 17:08:08 2020 -0500
+++ b/deseq2.xml Wed Nov 25 18:36:55 2020 +0000

@@ -176,7 +176,7 @@
             label="Output VST normalized table" />
         <param name="many_contrasts" type="boolean" truevalue="1" falsevalue="0" checked="false"
             label="Output all levels vs all levels of primary factor (use when you have >2 levels for primary factor)"
-            help=" DESeq2 performs independent ﬁltering by default using the mean of normalized counts as a ﬁlter statistic" />
+            help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic" />
         <param name="esf" type="select" label="(Optional) Method for estimateSizeFactors"
             help="Method for estimation: either 'ratio', 'poscounts', or 'iterate'. 'ratio' uses the standard median ratio method introduced in DESeq.
                 The size factor is the median ratio of the sample over a 'pseudosample': for each gene, the geometric mean of all samples.
@@ -203,10 +203,10 @@
         <param name="outlier_filter_off" type="boolean" truevalue="1" falsevalue="0" checked="false"
             label="Turn off outliers filtering (only affects with >2 replicates)"
             help="When there are more than 2 replicates for a given sample, the DESeq2 will automatically
-                filter genes which contain a Cook’s distance above a cutoﬀ" />
+                filter genes which contain a Cook’s distance above a cutoff" />
         <param name="auto_mean_filter_off" type="boolean" truevalue="1" falsevalue="0" checked="false"
             label="Turn off independent filtering"
-            help=" DESeq2 performs independent ﬁltering by default using the mean of normalized counts as a ﬁlter statistic" />
+            help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic" />
     </inputs>
     <outputs>
         <data name="deseq_out" format="tabular" label="DESeq2 result file on ${on_string}">

diff -r 71bacea10eee -r de44f8eff84a get_deseq_dataset.R
--- a/get_deseq_dataset.R Sat Feb 29 17:08:08 2020 -0500
+++ b/get_deseq_dataset.R Wed Nov 25 18:36:55 2020 +0000

[

@@ -1,76 +1,80 @@
-get_deseq_dataset <- function(sampleTable, header, designFormula, tximport, txtype, tx2gene) {
+get_deseq_dataset <- function(sample_table, header, design_formula, tximport, txtype, tx2gene) {

   dir <- ""

-  if (!is.null(header)) {
-    hasHeader <- TRUE
-  } else {
-    hasHeader <- FALSE
-  }
-
-  if (!is.null(tximport)) {
+  has_header <- !is.null(header)
+  use_txi <- !is.null(tximport)
+  if (use_txi) {
     if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF/GFF3 file is required for tximport")
     if (tolower(file_ext(tx2gene)) == "gff") {
-      gffFile <-tx2gene
+      gff_file <- tx2gene
     } else {
-      gffFile <- NULL
-      tx2gene <- read.table(tx2gene, header=hasHeader)
+      gff_file <- NULL
+      tx2gene <- read.table(tx2gene, header = has_header)
     }
-    useTXI <- TRUE
-  } else {
-    useTXI <- FALSE
   }

-  if (!useTXI & hasHeader) {
-      countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)})
+  if (!use_txi & has_header) {
+      countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1)
       tbl <- do.call("cbind", countfiles)
-      colnames(tbl) <- rownames(sampleTable) # take sample ids from header
+      colnames(tbl) <- rownames(sample_table) # take sample ids from header

       # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
-      oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual",
-          "not_aligned", "alignment_not_unique")
-      specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames
-      tbl <- tbl[!specialRows, , drop = FALSE]
+      old_special_names <- c(
+        "no_feature",
+        "ambiguous",
+        "too_low_aQual",
+        "not_aligned",
+        "alignment_not_unique"
+      )
+      special_rows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% old_special_names
+      tbl <- tbl[!special_rows, , drop = FALSE]

-      dds <- DESeqDataSetFromMatrix(countData = tbl,
-                                    colData = subset(sampleTable, select=-(filename)),
-                                    design = designFormula)
-  } else if (!useTXI & !hasHeader) {
+      dds <- DESeqDataSetFromMatrix(
+        countData = tbl,
+        colData = subset(sample_table, select = -filename),
+        design = design_formula
+      )
+  } else if (!use_txi & !has_header) {

     # construct the object from HTSeq files
-    dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
-                                      directory = dir,
-                                      design =  designFormula)
-    colnames(dds) <- row.names(sampleTable)
+    dds <- DESeqDataSetFromHTSeqCount(
+      sampleTable = sample_table,
+      directory = dir,
+      design = design_formula
+    )
+    colnames(dds) <- row.names(sample_table)

   } else {
       # construct the object using tximport
       library("tximport")
-      txiFiles <- as.character(sampleTable$filename)
-      labs <- row.names(sampleTable)
-      names(txiFiles) <- labs
-      if (!is.null(gffFile)) {
+      txi_files <- as.character(sample_table$filename)
+      labs <- row.names(sample_table)
+      names(txi_files) <- labs
+      if (!is.null(gff_file)) {
         # first need to make the tx2gene table
         # this takes ~2-3 minutes using Bioconductor functions
         suppressPackageStartupMessages({
           library("GenomicFeatures")
         })
-        txdb <- makeTxDbFromGFF(gffFile)
+        txdb <- makeTxDbFromGFF(gff_file)
         k <- keys(txdb, keytype = "TXNAME")
-        tx2gene <- select(txdb, keys=k, columns="GENEID", keytype="TXNAME")
-        # Remove 'transcript:' from transcript IDs (when gffFile is a GFF3 from Ensembl and the transcript does not have a Name)
-        tx2gene$TXNAME <- sub('^transcript:', '', tx2gene$TXNAME)
+        tx2gene <- select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME")
+        # Remove 'transcript:' from transcript IDs (when gff_file is a GFF3 from Ensembl and the transcript does not have a Name)
+        tx2gene$TXNAME <- sub("^transcript:", "", tx2gene$TXNAME)  # nolint
       }
-      try(txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene))
+      try(txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene))
       if (!exists("txi")) {
         # Remove version from transcript IDs in tx2gene...
-        tx2gene$TXNAME <- sub('\\.[0-9]+$', '', tx2gene$TXNAME)
-        # ...and in txiFiles
-        txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene, ignoreTxVersion=TRUE)
+        tx2gene$TXNAME <- sub("\\.[0-9]+$", "", tx2gene$TXNAME)  # nolint
+        # ...and in txi_files
+        txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene, ignoreTxVersion = TRUE)
       }
-      dds <- DESeqDataSetFromTximport(txi,
-                                      subset(sampleTable, select=-c(filename)),
-                                      designFormula)
+      dds <- DESeqDataSetFromTximport(
+        txi,
+        subset(sample_table, select = -c(filename)),
+        design_formula
+      )
   }
   return(dds)
}