comparison edgeR_repenrich.R @ 11:6bba3e33c2e7 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 11df9f0b68d35d3a9424a17e4cefee6cfb9d4c19
author artbio
date Sat, 09 Mar 2024 22:32:46 +0000
parents 5dd4791c7b70
children 530626b0757c
comparison
equal deleted inserted replaced
10:6f4143893463 11:6bba3e33c2e7
1 #!/usr/bin/env Rscript
2
3 # A command-line interface to edgeR for use with Galaxy edger-repenrich
4 # written by Christophe Antoniewski drosofff@gmail.com 2017.05.30
5
6
7 # setup R error handling to go to stderr 1 # setup R error handling to go to stderr
8 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 2 options(show.error.messages = FALSE, error = function() {
3 cat(geterrmessage(), file = stderr())
4 q("no", 1, FALSE)
5 })
9 6
10 # To not crash galaxy with an UTF8 error with not-US LC settings. 7 # To not crash galaxy with an UTF8 error with not-US LC settings.
11 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 8 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
12 9
10 # load libraries
13 library("getopt") 11 library("getopt")
14 library("tools") 12 library("tools")
13 library("rjson")
14 suppressPackageStartupMessages({
15 library("edgeR")
16 library("limma")
17 })
18
15 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) 19 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
16 args <- commandArgs(trailingOnly = TRUE)
17 20
18 # get options, using the spec as defined by the enclosed list. 21 # get options, using the spec as defined by the enclosed list.
19 # we read the options from the default: commandArgs(TRUE). 22 # we read the options from the default: commandArgs(TRUE).
20 spec <- matrix(c( 23 spec <- matrix(
21 "quiet", "q", 0, "logical", 24 c(
22 "help", "h", 0, "logical", 25 "quiet", "q", 0, "logical",
23 "outfile", "o", 1, "character", 26 "outfile", "o", 1, "character",
24 "countsfile", "n", 1, "character", 27 "countsfile", "n", 1, "character",
25 "factorName", "N", 1, "character", 28 "factorName", "N", 1, "character",
26 "levelNameA", "A", 1, "character", 29 "levelNameA", "A", 1, "character",
27 "levelNameB", "B", 1, "character", 30 "levelNameB", "B", 1, "character",
28 "levelAfiles", "a", 1, "character", 31 "levelAfiles", "a", 1, "character",
29 "levelBfiles", "b", 1, "character", 32 "levelBfiles", "b", 1, "character",
30 "alignmentA", "i", 1, "character", 33 "alignmentA", "i", 1, "character",
31 "alignmentB", "j", 1, "character", 34 "alignmentB", "j", 1, "character",
32 "plots" , "p", 1, "character"), 35 "plots", "p", 1, "character"
33 byrow=TRUE, ncol=4) 36 ),
37 byrow = TRUE, ncol = 4
38 )
34 opt <- getopt(spec) 39 opt <- getopt(spec)
35 40
36 # if help was asked for print a friendly message
37 # and exit with a non-zero error code
38 if (!is.null(opt$help)) {
39 cat(getopt(spec, usage=TRUE))
40 q(status=1)
41 }
42
43 # enforce the following required arguments
44 if (is.null(opt$outfile)) {
45 cat("'outfile' is required\n")
46 q(status=1)
47 }
48 if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) {
49 cat("input count files are required for both levels\n")
50 q(status=1)
51 }
52 if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) {
53 cat("total aligned read files are required for both levels\n")
54 q(status=1)
55 }
56
57 verbose <- if (is.null(opt$quiet)) {
58 TRUE
59 } else {
60 FALSE
61 }
62
63 suppressPackageStartupMessages({
64 library("edgeR")
65 library("limma")
66 })
67
68 # build levels A and B file lists 41 # build levels A and B file lists
69
70 library("rjson")
71 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error") 42 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
72 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error") 43 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
73 listA <- list() 44 listA <- list()
74 indice = 0 45 indice <- 0
75 listA[["level"]] <- opt$levelNameA 46 listA[["level"]] <- opt$levelNameA
76 for (file in filesA) { 47 for (file in filesA) {
77 indice = indice +1 48 indice <- indice + 1
78 listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE) 49 listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE)
79 } 50 }
80 listB <- list() 51 listB <- list()
81 indice = 0 52 indice <- 0
82 listB[["level"]] <- opt$levelNameB 53 listB[["level"]] <- opt$levelNameB
83 for (file in filesB) { 54 for (file in filesB) {
84 indice = indice +1 55 indice <- indice + 1
85 listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE) 56 listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE)
86 } 57 }
87 58
88 # build a counts table 59 # build a counts table
89 counts <- data.frame(row.names=listA[[2]][,1]) 60 counts <- data.frame(row.names = listA[[2]][, 1])
90 for (element in names(listA[-1])) { 61 for (element in names(listA[-1])) {
91 counts<-cbind(counts, listA[[element]][,4]) 62 counts <- cbind(counts, listA[[element]][, 4])
92 } 63 }
93 for (element in names(listB[-1])) { 64 for (element in names(listB[-1])) {
94 counts<-cbind(counts, listB[[element]][,4]) 65 counts <- cbind(counts, listB[[element]][, 4])
95 } 66 }
96 colnames(counts)=c(names(listA[-1]), names(listB[-1])) 67 colnames(counts) <- c(names(listA[-1]), names(listB[-1]))
97 68
98 # build aligned counts vector 69 # build aligned counts vector
99
100 filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error") 70 filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error")
101 filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error") 71 filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error")
102 sizes <- c() 72 sizes <- c()
103 for (file in filesi) { 73 for (file in filesi) {
104 sizes <- c(sizes, read.delim(file, header=TRUE)[1,1]) 74 sizes <- c(sizes, read.delim(file, header = TRUE)[1, 1])
105 } 75 }
106 for (file in filesj) { 76 for (file in filesj) {
107 sizes <- c(sizes, read.delim(file, header=TRUE)[1,1]) 77 sizes <- c(sizes, read.delim(file, header = TRUE)[1, 1])
108 } 78 }
109 79
110 # build a meta data object 80 # build a meta data object
111
112 meta <- data.frame( 81 meta <- data.frame(
113 row.names=colnames(counts), 82 row.names = colnames(counts),
114 condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ), 83 condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))),
115 libsize=sizes 84 libsize = sizes
116 ) 85 )
117 86
118 87
119 # Define the library size and conditions for the GLM 88 # Define the library size and conditions for the GLM
120 libsize <- meta$libsize 89 libsize <- meta$libsize
121 condition <- factor(meta$condition) 90 condition <- factor(meta$condition)
122 design <- model.matrix(~0+condition) 91 design <- model.matrix(~ 0 + condition)
123 colnames(design) <- levels(meta$condition) 92 colnames(design) <- levels(condition)
124
125 93
126 # Build a DGE object for the GLM 94 # Build a DGE object for the GLM
127 y <- DGEList(counts=counts, lib.size=libsize) 95 y <- DGEList(counts = counts, lib.size = libsize)
128 96
129 # Normalize the data 97 # Normalize the data
130 y <- calcNormFactors(y) 98 y <- calcNormFactors(y)
131 y$samples
132 # plotMDS(y) latter
133 99
134 # Estimate the variance 100 # Estimate the variance
135 y <- estimateGLMCommonDisp(y, design) 101 y <- estimateGLMCommonDisp(y, design)
136 y <- estimateGLMTrendedDisp(y, design) 102 y <- estimateGLMTrendedDisp(y, design)
137 y <- estimateGLMTagwiseDisp(y, design) 103 y <- estimateGLMTagwiseDisp(y, design)
138 # plotBCV(y) latter
139 104
140 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads 105 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads
141 cpm <- cpm(y, log=FALSE, lib.size=libsize) 106 cpm <- cpm(y, log = FALSE, lib.size = libsize)
142 cpm <- as.data.frame(cpm) 107 cpm <- as.data.frame(cpm)
143 colnames(cpm) <- colnames(counts) 108 colnames(cpm) <- colnames(counts)
144 if (!is.null(opt$countsfile)) { 109 if (!is.null(opt$countsfile)) {
145 normalizedAbundance <- data.frame(Tag=rownames(cpm)) 110 normalizedAbundance <- data.frame(Tag = rownames(cpm))
146 normalizedAbundance <- cbind(normalizedAbundance, cpm) 111 normalizedAbundance <- cbind(normalizedAbundance, cpm)
147 write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) 112 write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
148 } 113 }
149 114
150 # Conduct fitting of the GLM 115 # Conduct fitting of the GLM
151 yfit <- glmFit(y, design) 116 yfit <- glmFit(y, design)
152 117
153 # Initialize result matrices to contain the results of the GLM 118 # Initialize result matrices to contain the results of the GLM
154 results <- matrix(nrow=dim(counts)[1],ncol=0) 119 results <- matrix(nrow = dim(counts)[1], ncol = 0)
155 logfc <- matrix(nrow=dim(counts)[1],ncol=0) 120 logfc <- matrix(nrow = dim(counts)[1], ncol = 0)
156 121
157 # Make the comparisons for the GLM 122 # Make the comparisons for the GLM
158 my.contrasts <- makeContrasts( 123 my.contrasts <- makeContrasts(
159 paste0(opt$levelNameA,"_",opt$levelNameB," = ", opt$levelNameA, " - ", opt$levelNameB), 124 paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB),
160 levels = design 125 levels = design
161 ) 126 )
162 127
163 # Define the contrasts used in the comparisons 128 # Define the contrasts used in the comparisons
164 allcontrasts = paste0(opt$levelNameA," vs ",opt$levelNameB) 129 allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB)
165 130
166 # Conduct a for loop that will do the fitting of the GLM for each comparison 131 # Conduct a for loop that will do the fitting of the GLM for each comparison
167 # Put the results into the results objects 132 # Put the results into the results objects
168 lrt <- glmLRT(yfit, contrast=my.contrasts[,1]) 133 lrt <- glmLRT(yfit, contrast = my.contrasts[, 1])
169 plotSmear(lrt, de.tags=rownames(y)) 134 res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table
170 title(allcontrasts) 135 results <- cbind(results, res[, c(1, 5)])
171 res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table 136 logfc <- cbind(logfc, res[c(1)])
172 results <- cbind(results,res[,c(1,5)])
173 logfc <- cbind(logfc,res[c(1)])
174 137
175 # Add the repeat types back into the results. 138 # Add the repeat types back into the results.
176 # We should still have the same order as the input data 139 # We should still have the same order as the input data
177 results$class <- listA[[2]][,2] 140 results$class <- listA[[2]][, 2]
178 results$type <- listA[[2]][,3] 141 results$type <- listA[[2]][, 3]
179
180 # Sort the results table by the FDR 142 # Sort the results table by the FDR
181 results <- results[with(results, order(FDR)), ] 143 results <- results[with(results, order(FDR)), ]
182
183 # Save the results
184 write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE)
185 144
186 # Plot Fold Changes for repeat classes and types 145 # Plot Fold Changes for repeat classes and types
187 146
188 # open the device and plots 147 # open the device and plots
189 if (!is.null(opt$plots)) { 148 if (!is.null(opt$plots)) {
190 if (verbose) cat("creating plots\n")
191 pdf(opt$plots) 149 pdf(opt$plots)
192 plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples") 150 plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples")
193 plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot") 151 plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot")
194 logFC <- results[, "logFC"] 152 logFC <- results[, "logFC"]
195 # Plot the repeat classes 153 # Plot the repeat classes
196 classes <- with(results, reorder(class, -logFC, median)) 154 classes <- with(results, reorder(class, -logFC, median))
197 classes 155 classes
198 par(mar=c(6,10,4,1)) 156 par(mar = c(6, 10, 4, 1))
199 boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE, 157 boxplot(logFC ~ classes,
200 las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Class")) 158 data = results, outline = FALSE, horizontal = TRUE,
201 abline(v=0) 159 las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class")
160 )
161 abline(v = 0)
202 # Plot the repeat types 162 # Plot the repeat types
203 types <- with(results, reorder(type, -logFC, median)) 163 types <- with(results, reorder(type, -logFC, median))
204 boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE, 164 boxplot(logFC ~ types,
205 las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Type"), yaxt="n") 165 data = results, outline = FALSE, horizontal = TRUE,
206 axis(2, cex.axis=(1*28)/(length(levels(types))), 166 las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type")
207 at=seq(from=1, to=length(levels(types))), 167 )
208 labels=levels(types), las=2) 168 abline(v = 0)
209 abline(v=0)
210 # volcano plot 169 # volcano plot
211 TEdata = cbind(rownames(results), as.data.frame(results), score=-log(results$FDR, 10)) 170 TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10))
212 colnames(TEdata) = c("Tag","log2FC", "FDR", "Class", "Type", "score") 171 colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score")
213 color = ifelse(TEdata$FDR<0.05, "red", "black") 172 color <- ifelse(TEdata$FDR < 0.05, "red", "black")
214 s <- subset(TEdata, FDR<0.01) 173 s <- subset(TEdata, FDR < 0.01)
215 with(TEdata, plot(log2FC, score, pch=20, col=color, main="Volcano plot (all tag types)", ylab="-log10(FDR)")) 174 with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)"))
216 text(s[,2], s[,6], labels = s[,5], pos = seq(from=1, to=3), cex=0.5) 175 text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5)
217 } 176 }
218 177
219 # close the plot device 178 # close the plot device
220 if (!is.null(opt$plots)) { 179 if (!is.null(opt$plots)) {
221 cat("closing plot device\n") 180 cat("closing plot device\n")
222 dev.off() 181 dev.off()
223 } 182 }
224 183
184 # Save the results
185 results <- cbind(TE_item = rownames(results), results)
186 colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type")
187 results$log2FC <- format(results$log2FC, digits = 5)
188 results$FDR <- format(results$FDR, digits = 5)
189 write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)
190
225 cat("Session information:\n\n") 191 cat("Session information:\n\n")
226
227 sessionInfo() 192 sessionInfo()
228