Mercurial > repos > drosofff > repenrich
comparison edgeR_repenrich.R @ 1:54a3f3a195d6 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 114b47cc624e39b4f485c8623458fc98494c564d
| author | drosofff |
|---|---|
| date | Mon, 29 May 2017 13:11:57 -0400 |
| parents | |
| children | 1805b262c12d |
comparison
equal
deleted
inserted
replaced
| 0:1435d142041b | 1:54a3f3a195d6 |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 | |
| 3 # A command-line interface to edgeR for use with Galaxy edger-repenrich | |
| 4 # written by Christophe Antoniewski drosofff@gmail.com 2017.05.30 | |
| 5 | |
| 6 | |
| 7 # setup R error handling to go to stderr | |
| 8 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | |
| 9 | |
| 10 # To not crash galaxy with an UTF8 error with not-US LC settings. | |
| 11 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
| 12 | |
| 13 library("getopt") | |
| 14 library("tools") | |
| 15 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) | |
| 16 args <- commandArgs(trailingOnly = TRUE) | |
| 17 | |
| 18 # get options, using the spec as defined by the enclosed list. | |
| 19 # we read the options from the default: commandArgs(TRUE). | |
| 20 spec <- matrix(c( | |
| 21 "quiet", "q", 0, "logical", | |
| 22 "help", "h", 0, "logical", | |
| 23 "outfile", "o", 1, "character", | |
| 24 "countsfile", "n", 1, "character", | |
| 25 "factorName", "N", 1, "character", | |
| 26 "levelNameA", "A", 1, "character", | |
| 27 "levelNameB", "B", 1, "character", | |
| 28 "levelAfiles", "a", 1, "character", | |
| 29 "levelBfiles", "b", 1, "character", | |
| 30 "alignmentA", "i", 1, "character", | |
| 31 "alignmentB", "j", 1, "character", | |
| 32 "plots" , "p", 1, "character"), | |
| 33 byrow=TRUE, ncol=4) | |
| 34 opt <- getopt(spec) | |
| 35 | |
| 36 # if help was asked for print a friendly message | |
| 37 # and exit with a non-zero error code | |
| 38 if (!is.null(opt$help)) { | |
| 39 cat(getopt(spec, usage=TRUE)) | |
| 40 q(status=1) | |
| 41 } | |
| 42 | |
| 43 # enforce the following required arguments | |
| 44 if (is.null(opt$outfile)) { | |
| 45 cat("'outfile' is required\n") | |
| 46 q(status=1) | |
| 47 } | |
| 48 if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) { | |
| 49 cat("input count files are required for both levels\n") | |
| 50 q(status=1) | |
| 51 } | |
| 52 if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) { | |
| 53 cat("total aligned read files are required for both levels\n") | |
| 54 q(status=1) | |
| 55 } | |
| 56 | |
| 57 verbose <- if (is.null(opt$quiet)) { | |
| 58 TRUE | |
| 59 } else { | |
| 60 FALSE | |
| 61 } | |
| 62 | |
| 63 suppressPackageStartupMessages({ | |
| 64 library("edgeR") | |
| 65 library("limma") | |
| 66 }) | |
| 67 | |
| 68 # build levels A and B file lists | |
| 69 | |
| 70 library("rjson") | |
| 71 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error") | |
| 72 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error") | |
| 73 listA <- list() | |
| 74 indice = 0 | |
| 75 listA[["level"]] <- opt$levelNameA | |
| 76 for (file in filesA) { | |
| 77 indice = indice +1 | |
| 78 listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE) | |
| 79 } | |
| 80 listB <- list() | |
| 81 indice = 0 | |
| 82 listB[["level"]] <- opt$levelNameB | |
| 83 for (file in filesB) { | |
| 84 indice = indice +1 | |
| 85 listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE) | |
| 86 } | |
| 87 | |
| 88 # build a counts table | |
| 89 counts <- data.frame(row.names=listA[[2]][,1]) | |
| 90 for (element in names(listA[-1])) { | |
| 91 counts<-cbind(counts, listA[[element]][,4]) | |
| 92 } | |
| 93 for (element in names(listB[-1])) { | |
| 94 counts<-cbind(counts, listB[[element]][,4]) | |
| 95 } | |
| 96 colnames(counts)=c(names(listA[-1]), names(listB[-1])) | |
| 97 | |
| 98 # build aligned counts vector | |
| 99 | |
| 100 filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error") | |
| 101 filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error") | |
| 102 sizes <- c() | |
| 103 for (file in filesi) { | |
| 104 sizes <- c(sizes, read.delim(file, header=FALSE)[1,1]) | |
| 105 } | |
| 106 for (file in filesj) { | |
| 107 sizes <- c(sizes, read.delim(file, header=FALSE)[1,1]) | |
| 108 } | |
| 109 | |
| 110 # build a meta data object | |
| 111 | |
| 112 meta <- data.frame( | |
| 113 row.names=colnames(counts), | |
| 114 condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ), | |
| 115 libsize=sizes | |
| 116 ) | |
| 117 | |
| 118 | |
| 119 # Define the library size and conditions for the GLM | |
| 120 libsize <- meta$libsize | |
| 121 condition <- factor(meta$condition) | |
| 122 design <- model.matrix(~0+condition) | |
| 123 colnames(design) <- levels(meta$condition) | |
| 124 | |
| 125 | |
| 126 # Build a DGE object for the GLM | |
| 127 y <- DGEList(counts=counts, lib.size=libsize) | |
| 128 | |
| 129 # Normalize the data | |
| 130 y <- calcNormFactors(y) | |
| 131 y$samples | |
| 132 # plotMDS(y) latter | |
| 133 | |
| 134 # Estimate the variance | |
| 135 y <- estimateGLMCommonDisp(y, design) | |
| 136 y <- estimateGLMTrendedDisp(y, design) | |
| 137 y <- estimateGLMTagwiseDisp(y, design) | |
| 138 # plotBCV(y) latter | |
| 139 | |
| 140 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads | |
| 141 cpm <- cpm(y, log=FALSE, lib.size=libsize) | |
| 142 cpm <- as.data.frame(cpm) | |
| 143 colnames(cpm) <- colnames(counts) | |
| 144 if (!is.null(opt$countsfile)) { | |
| 145 normalizedAbundance <- data.frame(Tag=rownames(cpm)) | |
| 146 normalizedAbundance <- cbind(normalizedAbundance, cpm) | |
| 147 write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) | |
| 148 } | |
| 149 | |
| 150 # test | |
| 151 print(counts) | |
| 152 print(cpm) | |
| 153 | |
| 154 # Conduct fitting of the GLM | |
| 155 yfit <- glmFit(y, design) | |
| 156 | |
| 157 # Initialize result matrices to contain the results of the GLM | |
| 158 results <- matrix(nrow=dim(counts)[1],ncol=0) | |
| 159 logfc <- matrix(nrow=dim(counts)[1],ncol=0) | |
| 160 | |
| 161 # Make the comparisons for the GLM | |
| 162 my.contrasts <- makeContrasts( | |
| 163 paste0(opt$levelNameB,"_",opt$levelNameA," = ", opt$levelNameB, " - ", opt$levelNameA), | |
| 164 levels = design | |
| 165 ) | |
| 166 | |
| 167 # Define the contrasts used in the comparisons | |
| 168 allcontrasts = paste0(opt$levelNameB," vs ",opt$levelNameA) | |
| 169 | |
| 170 # Conduct a for loop that will do the fitting of the GLM for each comparison | |
| 171 # Put the results into the results objects | |
| 172 lrt <- glmLRT(yfit, contrast=my.contrasts[,1]) | |
| 173 plotSmear(lrt, de.tags=rownames(y)) | |
| 174 title(allcontrasts) | |
| 175 res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table | |
| 176 results <- cbind(results,res[,c(1,5)]) | |
| 177 logfc <- cbind(logfc,res[c(1)]) | |
| 178 | |
| 179 # Add the repeat types back into the results. | |
| 180 # We should still have the same order as the input data | |
| 181 results$class <- listA[[2]][,2] | |
| 182 results$type <- listA[[2]][,3] | |
| 183 | |
| 184 # Sort the results table by the FDR | |
| 185 results <- results[with(results, order(FDR)), ] | |
| 186 | |
| 187 # Save the results | |
| 188 write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE) | |
| 189 | |
| 190 # Plot Fold Changes for repeat classes and types | |
| 191 | |
| 192 # open the device and plots | |
| 193 if (!is.null(opt$plots)) { | |
| 194 if (verbose) cat("creating plots\n") | |
| 195 pdf(opt$plots) | |
| 196 plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples") | |
| 197 plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot") | |
| 198 logFC <- results[, "logFC"] | |
| 199 # Plot the repeat classes | |
| 200 classes <- with(results, reorder(class, -logFC, median)) | |
| 201 par(mar=c(6,10,4,1)) | |
| 202 boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE, | |
| 203 las=2, xlab="log(Fold Change)", main=paste0(allcontrasts, ", by Class")) | |
| 204 abline(v=0) | |
| 205 # Plot the repeat types | |
| 206 types <- with(results, reorder(type, -logFC, median)) | |
| 207 boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE, | |
| 208 las=2, xlab="log(Fold Change)", main=paste0(allcontrasts, ", by Type")) | |
| 209 abline(v=0) | |
| 210 } | |
| 211 | |
| 212 # close the plot device | |
| 213 if (!is.null(opt$plots)) { | |
| 214 cat("closing plot device\n") | |
| 215 dev.off() | |
| 216 } | |
| 217 | |
| 218 cat("Session information:\n\n") | |
| 219 | |
| 220 sessionInfo() | |
| 221 |
