repenrich: edgeR_repenrich.R comparison

comparison edgeR_repenrich.R @ 11:6bba3e33c2e7 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 11df9f0b68d35d3a9424a17e4cefee6cfb9d4c19

author	artbio
date	Sat, 09 Mar 2024 22:32:46 +0000
parents	5dd4791c7b70
children	530626b0757c

comparison

equal deleted inserted replaced

-:6f4143893463
+:6bba3e33c2e7
-#!/usr/bin/env Rscript
-# A command-line interface to edgeR for use with Galaxy edger-repenrich
-# written by Christophe Antoniewski drosofff@gmail.com 2017.05.30
 # setup R error handling to go to stderr
-options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+options(show.error.messages = FALSE, error = function() {
+cat(geterrmessage(), file = stderr())
+q("no", 1, FALSE)
+})
 # To not crash galaxy with an UTF8 error with not-US LC settings.
 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+# load libraries
 library("getopt")
 library("tools")
+library("rjson")
+suppressPackageStartupMessages({
+library("edgeR")
+library("limma")
+})
 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
-args <- commandArgs(trailingOnly = TRUE)
 # get options, using the spec as defined by the enclosed list.
 # we read the options from the default: commandArgs(TRUE).
-spec <- matrix(c(
+spec <- matrix(
-"quiet", "q", 0, "logical",
+c(
-"help", "h", 0, "logical",
+"quiet", "q", 0, "logical",
 "outfile", "o", 1, "character",
 "countsfile", "n", 1, "character",
 "factorName", "N", 1, "character",
 "levelNameA", "A", 1, "character",
 "levelNameB", "B", 1, "character",
 "levelAfiles", "a", 1, "character",
 "levelBfiles", "b", 1, "character",
 "alignmentA", "i", 1, "character",
 "alignmentB", "j", 1, "character",
-"plots" , "p", 1, "character"),
+"plots", "p", 1, "character"
-byrow=TRUE, ncol=4)
+),
+byrow = TRUE, ncol = 4
+)
 opt <- getopt(spec)
-# if help was asked for print a friendly message
-# and exit with a non-zero error code
-if (!is.null(opt$help)) {
-cat(getopt(spec, usage=TRUE))
-q(status=1)
-}
-# enforce the following required arguments
-if (is.null(opt$outfile)) {
-cat("'outfile' is required\n")
-q(status=1)
-}
-if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) {
-cat("input count files are required for both levels\n")
-q(status=1)
-}
-if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) {
-cat("total aligned read files are required for both levels\n")
-q(status=1)
-}
-verbose <- if (is.null(opt$quiet)) {
-TRUE
-} else {
-FALSE
-}
-suppressPackageStartupMessages({
-library("edgeR")
-library("limma")
-})
 # build levels A and B file lists
-library("rjson")
 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
 listA <- list()
-indice = 0
+indice <- 0
 listA[["level"]] <- opt$levelNameA
 for (file in filesA) {
-indice = indice +1
+indice <- indice + 1
-listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE)
+listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE)
 }
 listB <- list()
-indice = 0
+indice <- 0
 listB[["level"]] <- opt$levelNameB
 for (file in filesB) {
-indice = indice +1
+indice <- indice + 1
-listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE)
+listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE)
 }
 # build a counts table
-counts <- data.frame(row.names=listA[[2]][,1])
+counts <- data.frame(row.names = listA[[2]][, 1])
 for (element in names(listA[-1])) {
-counts<-cbind(counts, listA[[element]][,4])
+counts <- cbind(counts, listA[[element]][, 4])
 }
 for (element in names(listB[-1])) {
-counts<-cbind(counts, listB[[element]][,4])
+counts <- cbind(counts, listB[[element]][, 4])
 }
-colnames(counts)=c(names(listA[-1]), names(listB[-1]))
+colnames(counts) <- c(names(listA[-1]), names(listB[-1]))
 # build aligned counts vector
 filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error")
 filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error")
 sizes <- c()
 for (file in filesi) {
-sizes <- c(sizes, read.delim(file, header=TRUE)[1,1])
+sizes <- c(sizes, read.delim(file, header = TRUE)[1, 1])
 }
 for (file in filesj) {
-sizes <- c(sizes, read.delim(file, header=TRUE)[1,1])
+sizes <- c(sizes, read.delim(file, header = TRUE)[1, 1])
 }
 # build a meta data object
 meta <- data.frame(
-row.names=colnames(counts),
+row.names = colnames(counts),
-condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ),
+condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))),
-libsize=sizes
+libsize = sizes
 )
 # Define the library size and conditions for the GLM
 libsize <- meta$libsize
 condition <- factor(meta$condition)
-design <- model.matrix(~0+condition)
+design <- model.matrix(~ 0 + condition)
-colnames(design) <- levels(meta$condition)
+colnames(design) <- levels(condition)
 # Build a DGE object for the GLM
-y <- DGEList(counts=counts, lib.size=libsize)
+y <- DGEList(counts = counts, lib.size = libsize)
 # Normalize the data
 y <- calcNormFactors(y)
-y$samples
-# plotMDS(y) latter
 # Estimate the variance
 y <- estimateGLMCommonDisp(y, design)
 y <- estimateGLMTrendedDisp(y, design)
 y <- estimateGLMTagwiseDisp(y, design)
-# plotBCV(y) latter
 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads
-cpm <- cpm(y, log=FALSE, lib.size=libsize)
+cpm <- cpm(y, log = FALSE, lib.size = libsize)
 cpm <- as.data.frame(cpm)
 colnames(cpm) <- colnames(counts)
 if (!is.null(opt$countsfile)) {
-normalizedAbundance <- data.frame(Tag=rownames(cpm))
+normalizedAbundance <- data.frame(Tag = rownames(cpm))
 normalizedAbundance <- cbind(normalizedAbundance, cpm)
-write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
+write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
 }
 # Conduct fitting of the GLM
 yfit <- glmFit(y, design)
 # Initialize result matrices to contain the results of the GLM
-results <- matrix(nrow=dim(counts)[1],ncol=0)
+results <- matrix(nrow = dim(counts)[1], ncol = 0)
-logfc <- matrix(nrow=dim(counts)[1],ncol=0)
+logfc <- matrix(nrow = dim(counts)[1], ncol = 0)
 # Make the comparisons for the GLM
 my.contrasts <- makeContrasts(
-paste0(opt$levelNameA,"_",opt$levelNameB," = ", opt$levelNameA, " - ", opt$levelNameB),
+paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB),
 levels = design
 )
 # Define the contrasts used in the comparisons
-allcontrasts =  paste0(opt$levelNameA," vs ",opt$levelNameB)
+allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB)
 # Conduct a for loop that will do the fitting of the GLM for each comparison
 # Put the results into the results objects
-lrt <- glmLRT(yfit, contrast=my.contrasts[,1])
+lrt <- glmLRT(yfit, contrast = my.contrasts[, 1])
-plotSmear(lrt, de.tags=rownames(y))
+res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table
-title(allcontrasts)
+results <- cbind(results, res[, c(1, 5)])
-res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table
+logfc <- cbind(logfc, res[c(1)])
-results <- cbind(results,res[,c(1,5)])
-logfc <- cbind(logfc,res[c(1)])
 # Add the repeat types back into the results.
 # We should still have the same order as the input data
-results$class <- listA[[2]][,2]
+results$class <- listA[[2]][, 2]
-results$type <- listA[[2]][,3]
+results$type <- listA[[2]][, 3]
 # Sort the results table by the FDR
 results <- results[with(results, order(FDR)), ]
-# Save the results
-write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE)
 # Plot Fold Changes for repeat classes and types
 # open the device and plots
 if (!is.null(opt$plots)) {
-if (verbose) cat("creating plots\n")
 pdf(opt$plots)
-plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples")
+plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples")
-plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot")
+plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot")
 logFC <- results[, "logFC"]
 # Plot the repeat classes
 classes <- with(results, reorder(class, -logFC, median))
 classes
-par(mar=c(6,10,4,1))
+par(mar = c(6, 10, 4, 1))
-boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE,
+boxplot(logFC ~ classes,
-las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Class"))
+data = results, outline = FALSE, horizontal = TRUE,
-abline(v=0)
+las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class")
+)
+abline(v = 0)
 # Plot the repeat types
 types <- with(results, reorder(type, -logFC, median))
-boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE,
+boxplot(logFC ~ types,
-las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Type"), yaxt="n")
+data = results, outline = FALSE, horizontal = TRUE,
-axis(2, cex.axis=(1*28)/(length(levels(types))),
+las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type")
-at=seq(from=1, to=length(levels(types))),
+)
-labels=levels(types), las=2)
+abline(v = 0)
-abline(v=0)
 # volcano plot
-TEdata = cbind(rownames(results), as.data.frame(results), score=-log(results$FDR, 10))
+TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10))
-colnames(TEdata) = c("Tag","log2FC", "FDR", "Class", "Type", "score")
+colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score")
-color = ifelse(TEdata$FDR<0.05, "red", "black")
+color <- ifelse(TEdata$FDR < 0.05, "red", "black")
-s <- subset(TEdata, FDR<0.01)
+s <- subset(TEdata, FDR < 0.01)
-with(TEdata, plot(log2FC, score,  pch=20, col=color, main="Volcano plot (all tag types)", ylab="-log10(FDR)"))
+with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)"))
-text(s[,2], s[,6], labels = s[,5], pos = seq(from=1, to=3), cex=0.5)
+text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5)
 }
 # close the plot device
 if (!is.null(opt$plots)) {
 cat("closing plot device\n")
 dev.off()
 }
+# Save the results
+results <- cbind(TE_item = rownames(results), results)
+colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type")
+results$log2FC <- format(results$log2FC, digits = 5)
+results$FDR <- format(results$FDR, digits = 5)
+write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)
 cat("Session information:\n\n")
 sessionInfo()

Mercurial > repos > artbio > repenrich

comparison edgeR_repenrich.R @ 11:6bba3e33c2e7 draft