Mercurial > repos > artbio > repenrich2
comparison edgeR_repenrich.R @ 0:4905a332a094 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
author | artbio |
---|---|
date | Sat, 20 Apr 2024 11:56:53 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4905a332a094 |
---|---|
1 # setup R error handling to go to stderr | |
2 options(show.error.messages = FALSE, error = function() { | |
3 cat(geterrmessage(), file = stderr()) | |
4 q("no", 1, FALSE) | |
5 }) | |
6 | |
7 # To not crash galaxy with an UTF8 error with not-US LC settings. | |
8 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
9 | |
10 # load libraries | |
11 library("getopt") | |
12 library("tools") | |
13 library("rjson") | |
14 suppressPackageStartupMessages({ | |
15 library("edgeR") | |
16 library("limma") | |
17 }) | |
18 | |
19 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) | |
20 | |
21 # get options, using the spec as defined by the enclosed list. | |
22 spec <- matrix( | |
23 c( | |
24 "quiet", "q", 0, "logical", | |
25 "outfile", "o", 1, "character", | |
26 "countsfile", "n", 1, "character", | |
27 "factorName", "N", 1, "character", | |
28 "levelNameA", "A", 1, "character", | |
29 "levelNameB", "B", 1, "character", | |
30 "levelAfiles", "a", 1, "character", | |
31 "levelBfiles", "b", 1, "character", | |
32 "plots", "p", 1, "character" | |
33 ), | |
34 byrow = TRUE, ncol = 4 | |
35 ) | |
36 opt <- getopt(spec) | |
37 | |
38 # build levels A and B file lists | |
39 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error") | |
40 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error") | |
41 listA <- list() | |
42 indice <- 0 | |
43 listA[["level"]] <- opt$levelNameA | |
44 for (file in filesA) { | |
45 indice <- indice + 1 | |
46 listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE) | |
47 } | |
48 listB <- list() | |
49 indice <- 0 | |
50 listB[["level"]] <- opt$levelNameB | |
51 for (file in filesB) { | |
52 indice <- indice + 1 | |
53 listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE) | |
54 } | |
55 | |
56 # build a counts table | |
57 counts <- data.frame(row.names = listA[[2]][, 1]) | |
58 for (element in names(listA[-1])) { | |
59 counts <- cbind(counts, listA[[element]][, 4]) | |
60 } | |
61 for (element in names(listB[-1])) { | |
62 counts <- cbind(counts, listB[[element]][, 4]) | |
63 } | |
64 colnames(counts) <- c(names(listA[-1]), names(listB[-1])) | |
65 sizes <- colSums(counts) | |
66 | |
67 # build a meta data object | |
68 meta <- data.frame( | |
69 row.names = colnames(counts), | |
70 condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))), | |
71 libsize = sizes | |
72 ) | |
73 | |
74 | |
75 # Define the library size and conditions for the GLM | |
76 libsize <- meta$libsize | |
77 condition <- factor(meta$condition) | |
78 design <- model.matrix(~ 0 + condition) | |
79 colnames(design) <- levels(condition) | |
80 | |
81 # Build a DGE object for the GLM | |
82 y <- DGEList(counts = counts, lib.size = libsize) | |
83 | |
84 # Normalize the data | |
85 y <- calcNormFactors(y) | |
86 | |
87 # Estimate the variance | |
88 y <- estimateGLMCommonDisp(y, design) | |
89 y <- estimateGLMTrendedDisp(y, design) | |
90 y <- estimateGLMTagwiseDisp(y, design) | |
91 | |
92 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads | |
93 cpm <- cpm(y, log = FALSE, lib.size = libsize) | |
94 cpm <- as.data.frame(cpm) | |
95 colnames(cpm) <- colnames(counts) | |
96 if (!is.null(opt$countsfile)) { | |
97 normalizedAbundance <- data.frame(Tag = rownames(cpm)) | |
98 normalizedAbundance <- cbind(normalizedAbundance, cpm) | |
99 write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) | |
100 } | |
101 | |
102 # Conduct fitting of the GLM | |
103 yfit <- glmFit(y, design) | |
104 | |
105 # Initialize result matrices to contain the results of the GLM | |
106 results <- matrix(nrow = dim(counts)[1], ncol = 0) | |
107 logfc <- matrix(nrow = dim(counts)[1], ncol = 0) | |
108 | |
109 # Make the comparisons for the GLM | |
110 my.contrasts <- makeContrasts( | |
111 paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB), | |
112 levels = design | |
113 ) | |
114 | |
115 # Define the contrasts used in the comparisons | |
116 allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB) | |
117 | |
118 # Conduct a for loop that will do the fitting of the GLM for each comparison | |
119 # Put the results into the results objects | |
120 lrt <- glmLRT(yfit, contrast = my.contrasts[, 1]) | |
121 res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table | |
122 results <- cbind(results, res[, c(1, 5)]) | |
123 logfc <- cbind(logfc, res[c(1)]) | |
124 | |
125 # Add the repeat types back into the results. | |
126 # We should still have the same order as the input data | |
127 results$class <- listA[[2]][, 2] | |
128 results$type <- listA[[2]][, 3] | |
129 # Sort the results table by the FDR | |
130 results <- results[with(results, order(FDR)), ] | |
131 | |
132 # Plot Fold Changes for repeat classes and types | |
133 | |
134 # open the device and plots | |
135 if (!is.null(opt$plots)) { | |
136 pdf(opt$plots) | |
137 plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples") | |
138 plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot") | |
139 logFC <- results[, "logFC"] | |
140 # Plot the repeat classes | |
141 classes <- with(results, reorder(class, -logFC, median)) | |
142 classes | |
143 par(mar = c(6, 10, 4, 1)) | |
144 boxplot(logFC ~ classes, | |
145 data = results, outline = FALSE, horizontal = TRUE, | |
146 las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class") | |
147 ) | |
148 abline(v = 0) | |
149 # Plot the repeat types | |
150 types <- with(results, reorder(type, -logFC, median)) | |
151 boxplot(logFC ~ types, | |
152 data = results, outline = FALSE, horizontal = TRUE, | |
153 las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type") | |
154 ) | |
155 abline(v = 0) | |
156 # volcano plot | |
157 TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10)) | |
158 colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score") | |
159 color <- ifelse(TEdata$FDR < 0.05, "red", "black") | |
160 s <- subset(TEdata, FDR < 0.01) | |
161 with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)")) | |
162 text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5) | |
163 } | |
164 | |
165 # close the plot device | |
166 if (!is.null(opt$plots)) { | |
167 cat("closing plot device\n") | |
168 dev.off() | |
169 } | |
170 | |
171 # Save the results | |
172 results <- cbind(TE_item = rownames(results), results) | |
173 colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type") | |
174 results$log2FC <- format(results$log2FC, digits = 5) | |
175 results$FDR <- format(results$FDR, digits = 5) | |
176 write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE) |