# HG changeset patch
# User artbio
# Date 1713624372 0
# Node ID 6d59fbca2db4604525ee8dcf33a81e60e9f877f8
# Parent 4905a332a094339a09be02094bac6595b1e39841
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 4dd520dee5c3c0c526e8319a74c4890da032300f
diff -r 4905a332a094 -r 6d59fbca2db4 edgeR_repenrich.R
--- a/edgeR_repenrich.R Sat Apr 20 11:56:53 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,176 +0,0 @@
-# setup R error handling to go to stderr
-options(show.error.messages = FALSE, error = function() {
- cat(geterrmessage(), file = stderr())
- q("no", 1, FALSE)
-})
-
-# To not crash galaxy with an UTF8 error with not-US LC settings.
-loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
-
-# load libraries
-library("getopt")
-library("tools")
-library("rjson")
-suppressPackageStartupMessages({
- library("edgeR")
- library("limma")
-})
-
-options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
-
-# get options, using the spec as defined by the enclosed list.
-spec <- matrix(
- c(
- "quiet", "q", 0, "logical",
- "outfile", "o", 1, "character",
- "countsfile", "n", 1, "character",
- "factorName", "N", 1, "character",
- "levelNameA", "A", 1, "character",
- "levelNameB", "B", 1, "character",
- "levelAfiles", "a", 1, "character",
- "levelBfiles", "b", 1, "character",
- "plots", "p", 1, "character"
- ),
- byrow = TRUE, ncol = 4
-)
-opt <- getopt(spec)
-
-# build levels A and B file lists
-filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
-filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
-listA <- list()
-indice <- 0
-listA[["level"]] <- opt$levelNameA
-for (file in filesA) {
- indice <- indice + 1
- listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE)
-}
-listB <- list()
-indice <- 0
-listB[["level"]] <- opt$levelNameB
-for (file in filesB) {
- indice <- indice + 1
- listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE)
-}
-
-# build a counts table
-counts <- data.frame(row.names = listA[[2]][, 1])
-for (element in names(listA[-1])) {
- counts <- cbind(counts, listA[[element]][, 4])
-}
-for (element in names(listB[-1])) {
- counts <- cbind(counts, listB[[element]][, 4])
-}
-colnames(counts) <- c(names(listA[-1]), names(listB[-1]))
-sizes <- colSums(counts)
-
-# build a meta data object
-meta <- data.frame(
- row.names = colnames(counts),
- condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))),
- libsize = sizes
-)
-
-
-# Define the library size and conditions for the GLM
-libsize <- meta$libsize
-condition <- factor(meta$condition)
-design <- model.matrix(~ 0 + condition)
-colnames(design) <- levels(condition)
-
-# Build a DGE object for the GLM
-y <- DGEList(counts = counts, lib.size = libsize)
-
-# Normalize the data
-y <- calcNormFactors(y)
-
-# Estimate the variance
-y <- estimateGLMCommonDisp(y, design)
-y <- estimateGLMTrendedDisp(y, design)
-y <- estimateGLMTagwiseDisp(y, design)
-
-# Builds and outputs an object to contain the normalized read abundance in counts per million of reads
-cpm <- cpm(y, log = FALSE, lib.size = libsize)
-cpm <- as.data.frame(cpm)
-colnames(cpm) <- colnames(counts)
-if (!is.null(opt$countsfile)) {
- normalizedAbundance <- data.frame(Tag = rownames(cpm))
- normalizedAbundance <- cbind(normalizedAbundance, cpm)
- write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
-}
-
-# Conduct fitting of the GLM
-yfit <- glmFit(y, design)
-
-# Initialize result matrices to contain the results of the GLM
-results <- matrix(nrow = dim(counts)[1], ncol = 0)
-logfc <- matrix(nrow = dim(counts)[1], ncol = 0)
-
-# Make the comparisons for the GLM
-my.contrasts <- makeContrasts(
- paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB),
- levels = design
-)
-
-# Define the contrasts used in the comparisons
-allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB)
-
-# Conduct a for loop that will do the fitting of the GLM for each comparison
-# Put the results into the results objects
-lrt <- glmLRT(yfit, contrast = my.contrasts[, 1])
-res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table
-results <- cbind(results, res[, c(1, 5)])
-logfc <- cbind(logfc, res[c(1)])
-
-# Add the repeat types back into the results.
-# We should still have the same order as the input data
-results$class <- listA[[2]][, 2]
-results$type <- listA[[2]][, 3]
-# Sort the results table by the FDR
-results <- results[with(results, order(FDR)), ]
-
-# Plot Fold Changes for repeat classes and types
-
-# open the device and plots
-if (!is.null(opt$plots)) {
- pdf(opt$plots)
- plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples")
- plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot")
- logFC <- results[, "logFC"]
- # Plot the repeat classes
- classes <- with(results, reorder(class, -logFC, median))
- classes
- par(mar = c(6, 10, 4, 1))
- boxplot(logFC ~ classes,
- data = results, outline = FALSE, horizontal = TRUE,
- las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class")
- )
- abline(v = 0)
- # Plot the repeat types
- types <- with(results, reorder(type, -logFC, median))
- boxplot(logFC ~ types,
- data = results, outline = FALSE, horizontal = TRUE,
- las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type")
- )
- abline(v = 0)
- # volcano plot
- TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10))
- colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score")
- color <- ifelse(TEdata$FDR < 0.05, "red", "black")
- s <- subset(TEdata, FDR < 0.01)
- with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)"))
- text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5)
-}
-
-# close the plot device
-if (!is.null(opt$plots)) {
- cat("closing plot device\n")
- dev.off()
-}
-
-# Save the results
-results <- cbind(TE_item = rownames(results), results)
-colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type")
-results$log2FC <- format(results$log2FC, digits = 5)
-results$FDR <- format(results$FDR, digits = 5)
-write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)
diff -r 4905a332a094 -r 6d59fbca2db4 edgeR_repenrich2.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/edgeR_repenrich2.R Sat Apr 20 14:46:12 2024 +0000
@@ -0,0 +1,176 @@
+# setup R error handling to go to stderr
+options(show.error.messages = FALSE, error = function() {
+ cat(geterrmessage(), file = stderr())
+ q("no", 1, FALSE)
+})
+
+# To not crash galaxy with an UTF8 error with not-US LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+# load libraries
+library("getopt")
+library("tools")
+library("rjson")
+suppressPackageStartupMessages({
+ library("edgeR")
+ library("limma")
+})
+
+options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
+
+# get options, using the spec as defined by the enclosed list.
+spec <- matrix(
+ c(
+ "quiet", "q", 0, "logical",
+ "outfile", "o", 1, "character",
+ "countsfile", "n", 1, "character",
+ "factorName", "N", 1, "character",
+ "levelNameA", "A", 1, "character",
+ "levelNameB", "B", 1, "character",
+ "levelAfiles", "a", 1, "character",
+ "levelBfiles", "b", 1, "character",
+ "plots", "p", 1, "character"
+ ),
+ byrow = TRUE, ncol = 4
+)
+opt <- getopt(spec)
+
+# build levels A and B file lists
+filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
+filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
+listA <- list()
+indice <- 0
+listA[["level"]] <- opt$levelNameA
+for (file in filesA) {
+ indice <- indice + 1
+ listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE)
+}
+listB <- list()
+indice <- 0
+listB[["level"]] <- opt$levelNameB
+for (file in filesB) {
+ indice <- indice + 1
+ listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE)
+}
+
+# build a counts table
+counts <- data.frame(row.names = listA[[2]][, 1])
+for (element in names(listA[-1])) {
+ counts <- cbind(counts, listA[[element]][, 4])
+}
+for (element in names(listB[-1])) {
+ counts <- cbind(counts, listB[[element]][, 4])
+}
+colnames(counts) <- c(names(listA[-1]), names(listB[-1]))
+sizes <- colSums(counts)
+
+# build a meta data object
+meta <- data.frame(
+ row.names = colnames(counts),
+ condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))),
+ libsize = sizes
+)
+
+
+# Define the library size and conditions for the GLM
+libsize <- meta$libsize
+condition <- factor(meta$condition)
+design <- model.matrix(~ 0 + condition)
+colnames(design) <- levels(condition)
+
+# Build a DGE object for the GLM
+y <- DGEList(counts = counts, lib.size = libsize)
+
+# Normalize the data
+y <- calcNormFactors(y)
+
+# Estimate the variance
+y <- estimateGLMCommonDisp(y, design)
+y <- estimateGLMTrendedDisp(y, design)
+y <- estimateGLMTagwiseDisp(y, design)
+
+# Builds and outputs an object to contain the normalized read abundance in counts per million of reads
+cpm <- cpm(y, log = FALSE, lib.size = libsize)
+cpm <- as.data.frame(cpm)
+colnames(cpm) <- colnames(counts)
+if (!is.null(opt$countsfile)) {
+ normalizedAbundance <- data.frame(Tag = rownames(cpm))
+ normalizedAbundance <- cbind(normalizedAbundance, cpm)
+ write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
+}
+
+# Conduct fitting of the GLM
+yfit <- glmFit(y, design)
+
+# Initialize result matrices to contain the results of the GLM
+results <- matrix(nrow = dim(counts)[1], ncol = 0)
+logfc <- matrix(nrow = dim(counts)[1], ncol = 0)
+
+# Make the comparisons for the GLM
+my.contrasts <- makeContrasts(
+ paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB),
+ levels = design
+)
+
+# Define the contrasts used in the comparisons
+allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB)
+
+# Conduct a for loop that will do the fitting of the GLM for each comparison
+# Put the results into the results objects
+lrt <- glmLRT(yfit, contrast = my.contrasts[, 1])
+res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table
+results <- cbind(results, res[, c(1, 5)])
+logfc <- cbind(logfc, res[c(1)])
+
+# Add the repeat types back into the results.
+# We should still have the same order as the input data
+results$class <- listA[[2]][, 2]
+results$type <- listA[[2]][, 3]
+# Sort the results table by the FDR
+results <- results[with(results, order(FDR)), ]
+
+# Plot Fold Changes for repeat classes and types
+
+# open the device and plots
+if (!is.null(opt$plots)) {
+ pdf(opt$plots)
+ plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples")
+ plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot")
+ logFC <- results[, "logFC"]
+ # Plot the repeat classes
+ classes <- with(results, reorder(class, -logFC, median))
+ classes
+ par(mar = c(6, 10, 4, 1))
+ boxplot(logFC ~ classes,
+ data = results, outline = FALSE, horizontal = TRUE,
+ las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class")
+ )
+ abline(v = 0)
+ # Plot the repeat types
+ types <- with(results, reorder(type, -logFC, median))
+ boxplot(logFC ~ types,
+ data = results, outline = FALSE, horizontal = TRUE,
+ las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type")
+ )
+ abline(v = 0)
+ # volcano plot
+ TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10))
+ colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score")
+ color <- ifelse(TEdata$FDR < 0.05, "red", "black")
+ s <- subset(TEdata, FDR < 0.01)
+ with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)"))
+ text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5)
+}
+
+# close the plot device
+if (!is.null(opt$plots)) {
+ cat("closing plot device\n")
+ dev.off()
+}
+
+# Save the results
+results <- cbind(TE_item = rownames(results), results)
+colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type")
+results$log2FC <- format(results$log2FC, digits = 5)
+results$FDR <- format(results$FDR, digits = 5)
+write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)
diff -r 4905a332a094 -r 6d59fbca2db4 edger-repenrich.xml
--- a/edger-repenrich.xml Sat Apr 20 11:56:53 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,199 +0,0 @@
-
- Determines differentially expressed features from RepEnrich2 counts
-
- macros.xml
-
-
-
-
-
-
-
-
- /dev/null | grep -v -i "WARNING: ")
- ]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- normCounts == True
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**Note**: This edgeR_ wrapper was adapted from code available at
-https://github.com/nskvir/RepEnrich
-
-
-
- 10.1093/bioinformatics/btp616
-
-
diff -r 4905a332a094 -r 6d59fbca2db4 edger-repenrich2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/edger-repenrich2.xml Sat Apr 20 14:46:12 2024 +0000
@@ -0,0 +1,199 @@
+
+ Determines differentially expressed features from RepEnrich2 counts
+
+ macros.xml
+
+
+
+
+
+
+
+
+ /dev/null | grep -v -i "WARNING: ")
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ normCounts == True
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Note**: This edgeR_ wrapper was adapted from code available at
+https://github.com/nskvir/RepEnrich
+
+
+
+ 10.1093/bioinformatics/btp616
+
+
diff -r 4905a332a094 -r 6d59fbca2db4 macros.xml
--- a/macros.xml Sat Apr 20 11:56:53 2024 +0000
+++ b/macros.xml Sat Apr 20 14:46:12 2024 +0000
@@ -1,6 +1,6 @@
2.31.1
- 0
+ 1
23.0
diff -r 4905a332a094 -r 6d59fbca2db4 repenrich2.xml
--- a/repenrich2.xml Sat Apr 20 11:56:53 2024 +0000
+++ b/repenrich2.xml Sat Apr 20 14:46:12 2024 +0000
@@ -1,4 +1,4 @@
-
+
Repeat Element Profiling
macros.xml