Mercurial > repos > iuc > ruvseq

--- a/get_deseq_dataset.R	Tue Mar 26 06:25:38 2019 -0400
+++ b/get_deseq_dataset.R	Fri Jan 15 17:53:15 2021 +0000
@@ -1,76 +1,80 @@
-get_deseq_dataset <- function(sampleTable, header, designFormula, tximport, txtype, tx2gene) {
+get_deseq_dataset <- function(sample_table, header, design_formula, tximport, txtype, tx2gene) {

   dir <- ""

-  if (!is.null(header)) {
-    hasHeader <- TRUE
-  } else {
-    hasHeader <- FALSE
-  }
-
-  if (!is.null(tximport)) {
+  has_header <- !is.null(header)
+  use_txi <- !is.null(tximport)
+  if (use_txi) {
     if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF/GFF3 file is required for tximport")
     if (tolower(file_ext(tx2gene)) == "gff") {
-      gffFile <-tx2gene
+      gff_file <- tx2gene
     } else {
-      gffFile <- NULL
-      tx2gene <- read.table(tx2gene, header=hasHeader)
+      gff_file <- NULL
+      tx2gene <- read.table(tx2gene, header = has_header)
     }
-    useTXI <- TRUE
-  } else {
-    useTXI <- FALSE
   }

-  if (!useTXI & hasHeader) {
-      countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)})
+  if (!use_txi & has_header) {
+      countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1)
       tbl <- do.call("cbind", countfiles)
-      colnames(tbl) <- rownames(sampleTable) # take sample ids from header
+      colnames(tbl) <- rownames(sample_table) # take sample ids from header

       # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
-      oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual",
-          "not_aligned", "alignment_not_unique")
-      specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames
-      tbl <- tbl[!specialRows, , drop = FALSE]
+      old_special_names <- c(
+        "no_feature",
+        "ambiguous",
+        "too_low_aQual",
+        "not_aligned",
+        "alignment_not_unique"
+      )
+      special_rows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% old_special_names
+      tbl <- tbl[!special_rows, , drop = FALSE]

-      dds <- DESeqDataSetFromMatrix(countData = tbl,
-                                    colData = subset(sampleTable, select=-(filename)),
-                                    design = designFormula)
-  } else if (!useTXI & !hasHeader) {
+      dds <- DESeqDataSetFromMatrix(
+        countData = tbl,
+        colData = subset(sample_table, select = -filename),
+        design = design_formula
+      )
+  } else if (!use_txi & !has_header) {

     # construct the object from HTSeq files
-    dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
-                                      directory = dir,
-                                      design =  designFormula)
-    colnames(dds) <- row.names(sampleTable)
+    dds <- DESeqDataSetFromHTSeqCount(
+      sampleTable = sample_table,
+      directory = dir,
+      design = design_formula
+    )
+    colnames(dds) <- row.names(sample_table)

   } else {
       # construct the object using tximport
       library("tximport")
-      txiFiles <- as.character(sampleTable$filename)
-      labs <- row.names(sampleTable)
-      names(txiFiles) <- labs
-      if (!is.null(gffFile)) {
+      txi_files <- as.character(sample_table$filename)
+      labs <- row.names(sample_table)
+      names(txi_files) <- labs
+      if (!is.null(gff_file)) {
         # first need to make the tx2gene table
         # this takes ~2-3 minutes using Bioconductor functions
         suppressPackageStartupMessages({
           library("GenomicFeatures")
         })
-        txdb <- makeTxDbFromGFF(gffFile)
+        txdb <- makeTxDbFromGFF(gff_file)
         k <- keys(txdb, keytype = "TXNAME")
-        tx2gene <- select(txdb, keys=k, columns="GENEID", keytype="TXNAME")
-        # Remove 'transcript:' from transcript IDs (when gffFile is a GFF3 from Ensembl and the transcript does not have a Name)
-        tx2gene$TXNAME <- sub('^transcript:', '', tx2gene$TXNAME)
+        tx2gene <- select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME")
+        # Remove 'transcript:' from transcript IDs (when gff_file is a GFF3 from Ensembl and the transcript does not have a Name)
+        tx2gene$TXNAME <- sub("^transcript:", "", tx2gene$TXNAME)  # nolint
       }
-      try(txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene))
+      try(txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene))
       if (!exists("txi")) {
         # Remove version from transcript IDs in tx2gene...
-        tx2gene$TXNAME <- sub('\\.[0-9]+$', '', tx2gene$TXNAME)
-        # ...and in txiFiles
-        txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene, ignoreTxVersion=TRUE)
+        tx2gene$TXNAME <- sub("\\.[0-9]+$", "", tx2gene$TXNAME)  # nolint
+        # ...and in txi_files
+        txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene, ignoreTxVersion = TRUE)
       }
-      dds <- DESeqDataSetFromTximport(txi,
-                                      subset(sampleTable, select=-c(filename)),
-                                      designFormula)
+      dds <- DESeqDataSetFromTximport(
+        txi,
+        subset(sample_table, select = -c(filename)),
+        design_formula
+      )
   }
   return(dds)
 }
--- a/ruvseq.R	Tue Mar 26 06:25:38 2019 -0400
+++ b/ruvseq.R	Fri Jan 15 17:53:15 2021 +0000
@@ -1,6 +1,8 @@
 # setup R error handling to go to stderr
 library("getopt")
-options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+options(show.error.messages = F, error = function() {
+  cat(geterrmessage(), file = stderr()); q("no", 1, F)
+})
 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)

 setup_cmdline_options <- function() {
@@ -12,18 +14,18 @@
     "min_k", "min_k", 1, "double",
     "max_k", "max_k", 1, "double",
     "sample_json", "s", 1, "character",
-    "plots" , "p", 1, "character",
+    "plots", "p", 1, "character",
     "header", "H", 0, "logical",
     "txtype", "y", 1, "character",
     "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF)
-    byrow=TRUE, ncol=4)
+    byrow = TRUE, ncol = 4)

   opt <- getopt(spec)
   # if help was asked for print a friendly message
   # and exit with a non-zero error code
   if (!is.null(opt$help)) {
-    cat(getopt(spec, usage=TRUE))
-    q(status=1)
+    cat(getopt(spec, usage = TRUE))
+    q(status = 1)
   } else {
     load_libraries()
   }
@@ -42,117 +44,112 @@
   library("ggrepel")
 }

-source_local <- function(fname){
+source_local <- function(fname) {
     argv <- commandArgs(trailingOnly = FALSE)
     base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
-    source(paste(base_dir, fname, sep="/"))
+    source(paste(base_dir, fname, sep = "/"))
 }

 # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport
-source_local('get_deseq_dataset.R')
+source_local("get_deseq_dataset.R")

 # RUVseq function definitions

-plot_pca_rle <- function (set, title) {
-  x <- pData(set)[,1]
+plot_pca_rle <- function(set, title) {
+  x <- pData(set)[, 1]
   colors <- brewer.pal(3, "Set2")
-  label <- paste0(' for ', title)
-  plotRLE(set, outline=FALSE, ylim=c(-4, 4), col=colors[x])
-  title(main=paste0("RLE", label))
-  plotPCA(set, col=colors[x], cex=1.2)
-  title(main=paste0("PCA", label))
+  label <- paste0(" for ", title)
+  plotRLE(set, outline = FALSE, ylim = c(-4, 4), col = colors[x])
+  title(main = paste0("RLE", label))
+  plotPCA(set, col = colors[x], cex = 1.2)
+  title(main = paste0("PCA", label))
 }

-plot_factors_of_unwanted_variation <- function(set, method, k){
+plot_factors_of_unwanted_var <- function(set, method, k) {
   pd <- pData(set)
-  pd['sample'] <- row.names(pd)
-  colnames(pd)[1] <- 'condition'
-  d = melt(pd, id.vars = c('sample', 'condition'))
-  d['x'] <- 1  # There is no information on the X, so we just fake it to be able to do a scatterplot
-  print(ggplot(d, aes(x=x, y=value, color=condition, label=sample)) +
+  pd["sample"] <- row.names(pd)
+  colnames(pd)[1] <- "condition"
+  d <- melt(pd, id.vars = c("sample", "condition"))
+  d["x"] <- 1  # There is no information on the X, so we just fake it to be able to do a scatterplot
+  print(ggplot(d, aes(x = x, y = value, color = condition, label = sample)) +
   geom_point() +
-  ggtitle(paste0('Factors of unwanted variation for method: ', method, ", k=", k)) +
-  facet_wrap( ~ variable, scales = "free_x") +
+  ggtitle(paste0("Factors of unwanted variation for method: ", method, ", k=", k)) +
+  facet_wrap(~ variable, scales = "free_x") +
   geom_text_repel() +
-  theme(axis.title.x=element_blank(),
-        axis.text.x=element_blank(),
-        axis.ticks.x=element_blank(),
+  theme(axis.title.x = element_blank(),
+        axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
         plot.title = element_text(hjust = 0.5))
   )
 }

-create_seq_expression_set <- function (dds, min_mean_count) {
+create_seq_expression_set <- function(dds, min_mean_count) {
   count_values <- counts(dds)
-  print(paste0("feature count before filtering :",nrow(count_values),"\n"))
+  print(paste0("feature count before filtering :", nrow(count_values), "\n"))
   print(paste0("Filtering features which mean expression is less or eq. than ", min_mean_count, " counts\n"))
   filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count)
-  filtered <- count_values[filter,]
-  print(paste0("feature count after filtering :",nrow(filtered),"\n"))
-  set = newSeqExpressionSet(as.matrix(filtered),
-                            phenoData = data.frame(colData(dds)$condition, row.names=colnames(filtered)))
-  plot_pca_rle(set = set, title = 'raw data')
-  set <- betweenLaneNormalization(set, which="upper")
-  plot_pca_rle(set = set, title = 'upper quartile normalized')
+  filtered <- count_values[filter, ]
+  print(paste0("feature count after filtering :", nrow(filtered), "\n"))
+  set <- newSeqExpressionSet(as.matrix(filtered),
+                            phenoData = data.frame(colData(dds)$condition, row.names = colnames(filtered)))
+  plot_pca_rle(set = set, title = "raw data")
+  set <- betweenLaneNormalization(set, which = "upper")
+  plot_pca_rle(set = set, title = "upper quartile normalized")
   return(set)
 }

 get_empirical_control_genes <- function(set, cutoff_p) {
-  x <- pData(set)[,1]
-  design <- model.matrix(~x, data=pData(set))
-  y <- DGEList(counts=counts(set), group=x)
-  y <- calcNormFactors(y, method="upperquartile")
+  x <- pData(set)[, 1]
+  design <- model.matrix(~x, data = pData(set))
+  y <- DGEList(counts = counts(set), group = x)
+  y <- calcNormFactors(y, method = "upperquartile")
   y <- estimateGLMCommonDisp(y, design)
   y <- estimateGLMTagwiseDisp(y, design)
   fit <- glmFit(y, design)
-  lrt <- glmLRT(fit, coef=2)
-  top <- topTags(lrt, n=nrow(set))$table
+  lrt <- glmLRT(fit, coef = 2)
+  top <- topTags(lrt, n = nrow(set))$table
   top_rows <- rownames(top)[which(top$PValue < cutoff_p)]
   empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))]
   return(empirical)
 }

-ruv_control_gene_method <- function(set, k, control_genes='empirical', cutoff_p=0.2) {
-  if (control_genes == 'empirical') {
-    control_genes = get_empirical_control_genes(set, cutoff_p=cutoff_p)
+ruv_control_gene_method <- function(set, k, control_genes = "empirical", cutoff_p = 0.2) {
+  if (control_genes == "empirical") {
+    control_genes <- get_empirical_control_genes(set, cutoff_p = cutoff_p)
   }
-  set <- RUVg(set, control_genes, k=k)
+  set <- RUVg(set, control_genes, k = k)
   plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k))
-  plot_factors_of_unwanted_variation(set, method="RUVg with empirical control genes", k=k)
+  plot_factors_of_unwanted_var(set, method = "RUVg with empirical control genes", k = k)
   return(set)
 }

 ruv_residual_method <- function(set, k) {
   genes <- rownames(counts(set))
-  x <- pData(set)[,1]
+  x <- pData(set)[, 1]
   # Initial edger residuals
-  design <- model.matrix(~x, data=pData(set))
-  y <- DGEList(counts=counts(set), group=x)
-  y <- calcNormFactors(y, method="upperquartile")
+  design <- model.matrix(~x, data = pData(set))
+  y <- DGEList(counts = counts(set), group = x)
+  y <- calcNormFactors(y, method = "upperquartile")
   y <- estimateGLMCommonDisp(y, design)
   y <- estimateGLMTagwiseDisp(y, design)
   fit <- glmFit(y, design)
-  res <- residuals(fit, type="deviance")
-  set <- RUVr(set, genes, k=k, res)
-  plot_pca_rle(set = set, title = paste0('RUVr using residuals, k=', k))
-  plot_factors_of_unwanted_variation(set, method="RUVr using residuals", k=k)
+  res <- residuals(fit, type = "deviance")
+  set <- RUVr(set, genes, k = k, res)
+  plot_pca_rle(set = set, title = paste0("RUVr using residuals, k=", k))
+  plot_factors_of_unwanted_var(set, method = "RUVr using residuals", k = k)
   return(set)
 }

-ruv_replicate_method <- function (set, k) {
+ruv_replicate_method <- function(set, k) {
   genes <- rownames(counts(set))
-  x <- pData(set)[,1]
+  x <- pData(set)[, 1]
   differences <- makeGroups(x)
-  set <- RUVs(set, genes, k=k, differences)
-  plot_pca_rle(set, paste0('RUVs with replicate samples, k=', k))
-  plot_factors_of_unwanted_variation(set, method="RUVs using replicates", k=k)
+  set <- RUVs(set, genes, k = k, differences)
+  plot_pca_rle(set, paste0("RUVs with replicate samples, k=", k))
+  plot_factors_of_unwanted_var(set, method = "RUVs using replicates", k = k)
   return(set)
 }

-get_differentially_expressed_genes <- function(dds, contrast, alpha=0.01) {
-  r <- results(dds, contrast=contrast, alpha=alpha)
-  return(rownames(r[which(r$padj < alpha),]))
-}
-
 opt <- setup_cmdline_options()
 alpha <- opt$alpha
 min_k <- opt$min_k
@@ -162,12 +159,10 @@
 sample_paths <- sample_json$path
 sample_names <- sample_json$label
 condition <- as.factor(sample_json$condition)
-sampleTable <- data.frame(samplename=sample_names,
-                          filename = sample_paths,
-                          condition=condition)
-rownames(sampleTable) <- sample_names
+sample_table <- data.frame(samplename = sample_names, filename = sample_paths, condition = condition)
+rownames(sample_table) <- sample_names

-dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula= ~ condition, tximport=opt$txtype, txtype=opt$txtype, tx2gene=opt$tx2gene)
+dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = ~ condition, tximport = opt$txtype, txtype = opt$txtype, tx2gene = opt$tx2gene)
 if (!is.null(opt$plots)) {
   pdf(opt$plots)
 }
@@ -176,9 +171,9 @@
 set <- create_seq_expression_set(dds, min_mean_count = min_c)
 result <- list(no_correction = set)
 for (k in seq(min_k, max_k)) {
-  result[[paste0('residual_method_k', k)]] <- ruv_residual_method(set, k=k)
-  result[[paste0('replicate_method_k', k)]] <- ruv_replicate_method(set, k=k)
-  result[[paste0('control_method_k', k)]] <- ruv_control_gene_method(set, k=k, cutoff_p=0.5)
+  result[[paste0("residual_method_k", k)]] <- ruv_residual_method(set, k = k)
+  result[[paste0("replicate_method_k", k)]] <- ruv_replicate_method(set, k = k)
+  result[[paste0("control_method_k", k)]] <- ruv_control_gene_method(set, k = k, cutoff_p = 0.5)
 }

 for (name in names(result)) {
@@ -187,8 +182,8 @@
     unwanted_variation <- pData(set)
     df <- data.frame(identifier = rownames(unwanted_variation))
     df <- cbind(df, unwanted_variation)
-    colnames(df)[2] <- 'condition'
-    write.table(df, file=paste0("batch_effects_", name, ".tabular"),  sep="\t", quote=F, row.names=F)
+    colnames(df)[2] <- "condition"
+    write.table(df, file = paste0("batch_effects_", name, ".tabular"),  sep = "\t", quote = F, row.names = F)
   }
 }
--- a/ruvseq.xml	Tue Mar 26 06:25:38 2019 -0400
+++ b/ruvseq.xml	Fri Jan 15 17:53:15 2021 +0000
@@ -1,7 +1,11 @@
-<tool id="ruvseq" name="Remove Unwanted Variation" version="1.16.0">
+<tool id="ruvseq" name="Remove Unwanted Variation" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@">
     <description>from RNA-seq data</description>
+    <macros>
+        <token name="@TOOL_VERSION@">1.16.0</token>
+        <token name="@WRAPPER_VERSION@">1</token>
+    </macros>
     <requirements>
-        <requirement type="package" version="1.16.0">bioconductor-ruvseq</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">bioconductor-ruvseq</requirement>
         <requirement type="package" version="1.22.1">bioconductor-deseq2</requirement>
         <requirement type="package" version="1.10.0">bioconductor-tximport</requirement>
         <requirement type="package" version="1.34.1">bioconductor-genomicfeatures</requirement>