diff edgeR_repenrich2.R @ 1:6d59fbca2db4 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 4dd520dee5c3c0c526e8319a74c4890da032300f
author artbio
date Sat, 20 Apr 2024 14:46:12 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/edgeR_repenrich2.R	Sat Apr 20 14:46:12 2024 +0000
@@ -0,0 +1,176 @@
+# setup R error handling to go to stderr
+options(show.error.messages = FALSE, error = function() {
+    cat(geterrmessage(), file = stderr())
+    q("no", 1, FALSE)
+})
+
+# To not crash galaxy with an UTF8 error with not-US LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+# load libraries
+library("getopt")
+library("tools")
+library("rjson")
+suppressPackageStartupMessages({
+    library("edgeR")
+    library("limma")
+})
+
+options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
+
+# get options, using the spec as defined by the enclosed list.
+spec <- matrix(
+    c(
+        "quiet", "q", 0, "logical",
+        "outfile", "o", 1, "character",
+        "countsfile", "n", 1, "character",
+        "factorName", "N", 1, "character",
+        "levelNameA", "A", 1, "character",
+        "levelNameB", "B", 1, "character",
+        "levelAfiles", "a", 1, "character",
+        "levelBfiles", "b", 1, "character",
+        "plots", "p", 1, "character"
+    ),
+    byrow = TRUE, ncol = 4
+)
+opt <- getopt(spec)
+
+# build levels A and B file lists
+filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
+filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
+listA <- list()
+indice <- 0
+listA[["level"]] <- opt$levelNameA
+for (file in filesA) {
+    indice <- indice + 1
+    listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE)
+}
+listB <- list()
+indice <- 0
+listB[["level"]] <- opt$levelNameB
+for (file in filesB) {
+    indice <- indice + 1
+    listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE)
+}
+
+# build a counts table
+counts <- data.frame(row.names = listA[[2]][, 1])
+for (element in names(listA[-1])) {
+    counts <- cbind(counts, listA[[element]][, 4])
+}
+for (element in names(listB[-1])) {
+    counts <- cbind(counts, listB[[element]][, 4])
+}
+colnames(counts) <- c(names(listA[-1]), names(listB[-1]))
+sizes <- colSums(counts)
+
+# build a meta data object
+meta <- data.frame(
+    row.names = colnames(counts),
+    condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))),
+    libsize = sizes
+)
+
+
+# Define the library size and conditions for the GLM
+libsize <- meta$libsize
+condition <- factor(meta$condition)
+design <- model.matrix(~ 0 + condition)
+colnames(design) <- levels(condition)
+
+# Build a DGE object for the GLM
+y <- DGEList(counts = counts, lib.size = libsize)
+
+# Normalize the data
+y <- calcNormFactors(y)
+
+# Estimate the variance
+y <- estimateGLMCommonDisp(y, design)
+y <- estimateGLMTrendedDisp(y, design)
+y <- estimateGLMTagwiseDisp(y, design)
+
+# Builds and outputs an object to contain the normalized read abundance in counts per million of reads
+cpm <- cpm(y, log = FALSE, lib.size = libsize)
+cpm <- as.data.frame(cpm)
+colnames(cpm) <- colnames(counts)
+if (!is.null(opt$countsfile)) {
+    normalizedAbundance <- data.frame(Tag = rownames(cpm))
+    normalizedAbundance <- cbind(normalizedAbundance, cpm)
+    write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
+}
+
+# Conduct fitting of the GLM
+yfit <- glmFit(y, design)
+
+# Initialize result matrices to contain the results of the GLM
+results <- matrix(nrow = dim(counts)[1], ncol = 0)
+logfc <- matrix(nrow = dim(counts)[1], ncol = 0)
+
+# Make the comparisons for the GLM
+my.contrasts <- makeContrasts(
+    paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB),
+    levels = design
+)
+
+# Define the contrasts used in the comparisons
+allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB)
+
+# Conduct a for loop that will do the fitting of the GLM for each comparison
+# Put the results into the results objects
+lrt <- glmLRT(yfit, contrast = my.contrasts[, 1])
+res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table
+results <- cbind(results, res[, c(1, 5)])
+logfc <- cbind(logfc, res[c(1)])
+
+# Add the repeat types back into the results.
+# We should still have the same order as the input data
+results$class <- listA[[2]][, 2]
+results$type <- listA[[2]][, 3]
+# Sort the results table by the FDR
+results <- results[with(results, order(FDR)), ]
+
+# Plot Fold Changes for repeat classes and types
+
+# open the device and plots
+if (!is.null(opt$plots)) {
+    pdf(opt$plots)
+    plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples")
+    plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot")
+    logFC <- results[, "logFC"]
+    # Plot the repeat classes
+    classes <- with(results, reorder(class, -logFC, median))
+    classes
+    par(mar = c(6, 10, 4, 1))
+    boxplot(logFC ~ classes,
+        data = results, outline = FALSE, horizontal = TRUE,
+        las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class")
+    )
+    abline(v = 0)
+    # Plot the repeat types
+    types <- with(results, reorder(type, -logFC, median))
+    boxplot(logFC ~ types,
+        data = results, outline = FALSE, horizontal = TRUE,
+        las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type")
+    )
+    abline(v = 0)
+    # volcano plot
+    TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10))
+    colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score")
+    color <- ifelse(TEdata$FDR < 0.05, "red", "black")
+    s <- subset(TEdata, FDR < 0.01)
+    with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)"))
+    text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5)
+}
+
+# close the plot device
+if (!is.null(opt$plots)) {
+    cat("closing plot device\n")
+    dev.off()
+}
+
+# Save the results
+results <- cbind(TE_item = rownames(results), results)
+colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type")
+results$log2FC <- format(results$log2FC, digits = 5)
+results$FDR <- format(results$FDR, digits = 5)
+write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)