comparison edgeR_repenrich.R @ 0:f6f0f1e5e940 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author artbio
date Wed, 02 Aug 2017 05:17:29 -0400
parents
children 15e3e29f310e
comparison
equal deleted inserted replaced
-1:000000000000 0:f6f0f1e5e940
1 #!/usr/bin/env Rscript
2
3 # A command-line interface to edgeR for use with Galaxy edger-repenrich
4 # written by Christophe Antoniewski drosofff@gmail.com 2017.05.30
5
6
7 # setup R error handling to go to stderr
8 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
9
10 # To not crash galaxy with an UTF8 error with not-US LC settings.
11 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
12
13 library("getopt")
14 library("tools")
15 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
16 args <- commandArgs(trailingOnly = TRUE)
17
18 # get options, using the spec as defined by the enclosed list.
19 # we read the options from the default: commandArgs(TRUE).
20 spec <- matrix(c(
21 "quiet", "q", 0, "logical",
22 "help", "h", 0, "logical",
23 "outfile", "o", 1, "character",
24 "countsfile", "n", 1, "character",
25 "factorName", "N", 1, "character",
26 "levelNameA", "A", 1, "character",
27 "levelNameB", "B", 1, "character",
28 "levelAfiles", "a", 1, "character",
29 "levelBfiles", "b", 1, "character",
30 "alignmentA", "i", 1, "character",
31 "alignmentB", "j", 1, "character",
32 "plots" , "p", 1, "character"),
33 byrow=TRUE, ncol=4)
34 opt <- getopt(spec)
35
36 # if help was asked for print a friendly message
37 # and exit with a non-zero error code
38 if (!is.null(opt$help)) {
39 cat(getopt(spec, usage=TRUE))
40 q(status=1)
41 }
42
43 # enforce the following required arguments
44 if (is.null(opt$outfile)) {
45 cat("'outfile' is required\n")
46 q(status=1)
47 }
48 if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) {
49 cat("input count files are required for both levels\n")
50 q(status=1)
51 }
52 if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) {
53 cat("total aligned read files are required for both levels\n")
54 q(status=1)
55 }
56
57 verbose <- if (is.null(opt$quiet)) {
58 TRUE
59 } else {
60 FALSE
61 }
62
63 suppressPackageStartupMessages({
64 library("edgeR")
65 library("limma")
66 })
67
68 # build levels A and B file lists
69
70 library("rjson")
71 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
72 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
73 listA <- list()
74 indice = 0
75 listA[["level"]] <- opt$levelNameA
76 for (file in filesA) {
77 indice = indice +1
78 listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE)
79 }
80 listB <- list()
81 indice = 0
82 listB[["level"]] <- opt$levelNameB
83 for (file in filesB) {
84 indice = indice +1
85 listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE)
86 }
87
88 # build a counts table
89 counts <- data.frame(row.names=listA[[2]][,1])
90 for (element in names(listA[-1])) {
91 counts<-cbind(counts, listA[[element]][,4])
92 }
93 for (element in names(listB[-1])) {
94 counts<-cbind(counts, listB[[element]][,4])
95 }
96 colnames(counts)=c(names(listA[-1]), names(listB[-1]))
97
98 # build aligned counts vector
99
100 filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error")
101 filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error")
102 sizes <- c()
103 for (file in filesi) {
104 sizes <- c(sizes, read.delim(file, header=FALSE)[1,1])
105 }
106 for (file in filesj) {
107 sizes <- c(sizes, read.delim(file, header=FALSE)[1,1])
108 }
109
110 # build a meta data object
111
112 meta <- data.frame(
113 row.names=colnames(counts),
114 condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ),
115 libsize=sizes
116 )
117
118
119 # Define the library size and conditions for the GLM
120 libsize <- meta$libsize
121 condition <- factor(meta$condition)
122 design <- model.matrix(~0+condition)
123 colnames(design) <- levels(meta$condition)
124
125
126 # Build a DGE object for the GLM
127 y <- DGEList(counts=counts, lib.size=libsize)
128
129 # Normalize the data
130 y <- calcNormFactors(y)
131 y$samples
132 # plotMDS(y) latter
133
134 # Estimate the variance
135 y <- estimateGLMCommonDisp(y, design)
136 y <- estimateGLMTrendedDisp(y, design)
137 y <- estimateGLMTagwiseDisp(y, design)
138 # plotBCV(y) latter
139
140 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads
141 cpm <- cpm(y, log=FALSE, lib.size=libsize)
142 cpm <- as.data.frame(cpm)
143 colnames(cpm) <- colnames(counts)
144 if (!is.null(opt$countsfile)) {
145 normalizedAbundance <- data.frame(Tag=rownames(cpm))
146 normalizedAbundance <- cbind(normalizedAbundance, cpm)
147 write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
148 }
149
150 # Conduct fitting of the GLM
151 yfit <- glmFit(y, design)
152
153 # Initialize result matrices to contain the results of the GLM
154 results <- matrix(nrow=dim(counts)[1],ncol=0)
155 logfc <- matrix(nrow=dim(counts)[1],ncol=0)
156
157 # Make the comparisons for the GLM
158 my.contrasts <- makeContrasts(
159 paste0(opt$levelNameA,"_",opt$levelNameB," = ", opt$levelNameA, " - ", opt$levelNameB),
160 levels = design
161 )
162
163 # Define the contrasts used in the comparisons
164 allcontrasts = paste0(opt$levelNameA," vs ",opt$levelNameB)
165
166 # Conduct a for loop that will do the fitting of the GLM for each comparison
167 # Put the results into the results objects
168 lrt <- glmLRT(yfit, contrast=my.contrasts[,1])
169 plotSmear(lrt, de.tags=rownames(y))
170 title(allcontrasts)
171 res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table
172 results <- cbind(results,res[,c(1,5)])
173 logfc <- cbind(logfc,res[c(1)])
174
175 # Add the repeat types back into the results.
176 # We should still have the same order as the input data
177 results$class <- listA[[2]][,2]
178 results$type <- listA[[2]][,3]
179
180 # Sort the results table by the FDR
181 results <- results[with(results, order(FDR)), ]
182
183 # Save the results
184 write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE)
185
186 # Plot Fold Changes for repeat classes and types
187
188 # open the device and plots
189 if (!is.null(opt$plots)) {
190 if (verbose) cat("creating plots\n")
191 pdf(opt$plots)
192 plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples")
193 plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot")
194 logFC <- results[, "logFC"]
195 # Plot the repeat classes
196 classes <- with(results, reorder(class, -logFC, median))
197 par(mar=c(6,10,4,1))
198 boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE,
199 las=2, xlab="log(Fold Change)", main=paste0(allcontrasts, ", by Class"))
200 abline(v=0)
201 # Plot the repeat types
202 types <- with(results, reorder(type, -logFC, median))
203 boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE,
204 las=2, xlab="log(Fold Change)", main=paste0(allcontrasts, ", by Type"))
205 abline(v=0)
206 }
207
208 # close the plot device
209 if (!is.null(opt$plots)) {
210 cat("closing plot device\n")
211 dev.off()
212 }
213
214 cat("Session information:\n\n")
215
216 sessionInfo()
217