Mercurial > repos > mvdbeek > damidseq_polii_gene_call
comparison polii.gene.call @ 0:1b5bd3b955ed draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_polII_gene_call commit 2e97fdaeafdafe2c63588de17777cd32a798adb1-dirty
author | mvdbeek |
---|---|
date | Fri, 20 Apr 2018 10:27:20 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1b5bd3b955ed |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 # polii.gene.call.r | |
4 # Copyright © 2014-15, Owen Marshall | |
5 | |
6 # This program is free software; you can redistribute it and/or modify | |
7 # it under the terms of the GNU General Public License as published by | |
8 # the Free Software Foundation; either version 2 of the License, or (at | |
9 # your option) any later version. | |
10 # | |
11 # This program is distributed in the hope that it will be useful, but | |
12 # WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 # General Public License for more details. | |
15 # | |
16 # You should have received a copy of the GNU General Public License | |
17 # along with this program; if not, write to the Free Software | |
18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
19 # USA | |
20 | |
21 ### FDR calcs ### | |
22 # Method based on original perl scripts by Tony Southall (TDS) as published in | |
23 # Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020 | |
24 # | |
25 # Significant modifications to the original methodology include: | |
26 # * taking a linear regression of log data rather than trial-and-error curve fitting of non-log data | |
27 # * using a linear regression for the final intercept value rather than using the average intercept value for all conditions | |
28 # -- both of these should increase the accuracy of the final FDR value. | |
29 | |
30 version <- "1.0.2" | |
31 cat(paste("polii.gene.call v",version,"\n", sep="")) | |
32 | |
33 library(tools) | |
34 | |
35 ### Read CLI options | |
36 input.args <- commandArgs(trailingOnly = TRUE) | |
37 | |
38 in.files <- vector() | |
39 read.ops <- function (x) { | |
40 for (op in x) { | |
41 if (any(grepl("^--",op))) { | |
42 op <- gsub("^--","",op) | |
43 y <- unlist(strsplit(op,"=")) | |
44 | |
45 if (y[1] == "help") { | |
46 cat(paste("Usage: Rscript polii.gene.call.r --genes.file=[some_genes_file.gff] [list of .gatc.bedgraph or .gatc.gff ratio files to process]\n\n", sep="")) | |
47 | |
48 cat("Options:\n") | |
49 for (n in names(op.args)) { | |
50 cat(paste(" ",n,"=",op.args[[n]],"\n",sep="")) | |
51 } | |
52 cat("\n") | |
53 quit("no",1) | |
54 } | |
55 | |
56 if (!is.null(op.args[[ y[1] ]])) { | |
57 op.args[[ y[1] ]] <<- y[2] | |
58 } else { | |
59 cat("Error: Option",y[1],"not recognised ...\n") | |
60 quit("no",1) | |
61 } | |
62 } else { | |
63 in.files <<- c(in.files,op) | |
64 } | |
65 } | |
66 } | |
67 | |
68 write.ops <- function () { | |
69 out.df <- data.frame() | |
70 for (n in names(op.args)) { | |
71 v <<- as.character(op.args[[n]]) | |
72 df.line <- data.frame( | |
73 option=n, | |
74 value=v | |
75 ) | |
76 out.df <- rbind(out.df, df.line) | |
77 } | |
78 write.table(out.df,"input.args.single.txt",row.names=F) | |
79 } | |
80 | |
81 op.args <- list( | |
82 "genes.file" = "/mnt/data/Genomes/dmel_release/DmR6/DmR6.genes.gff", | |
83 "iter" = 50000, | |
84 "fdr" = 0.01 | |
85 ) | |
86 | |
87 read.ops(input.args) | |
88 | |
89 if (length(in.files) == 0) { | |
90 cat("Usage: Rscript polii.gene.call.r [list of .gatc.gff ratio files to process]\n\n") | |
91 quit("no",1) | |
92 } | |
93 | |
94 write.ops() | |
95 | |
96 ### save random seed for future reproducibility | |
97 dump.random <- runif(1) | |
98 my.seed <- .Random.seed | |
99 write.table(my.seed,".randomseed") | |
100 | |
101 ### read genes file | |
102 cat("Reading genes data file ...\n") | |
103 genes.file=op.args[["genes.file"]] | |
104 genes <- read.table(genes.file, comment.char="#", sep="\t", quote="", fill=T) | |
105 names(genes) <- c('chr','source','type','start','end','score','strand','c','details') | |
106 | |
107 # only subset if there is a type termed "gene" | |
108 if (any(genes$type == 'gene')) { | |
109 genes <- subset(genes, type=='gene') | |
110 } | |
111 | |
112 genes$name <- sapply(genes$details, FUN = function (x) {regmatches(x,gregexpr("(?<=Name=).*?(?=;)", x, perl=T))} ) | |
113 genes <- genes[,c('chr','start','end','strand','name')] | |
114 genes$chr <- gsub("^chr","",genes$chr,perl=T) | |
115 | |
116 if (nrow(genes) == 0) { | |
117 cat("Error: unable to extract gene information from genes file\n\n") | |
118 quit("no",1) | |
119 } | |
120 | |
121 ### functions | |
122 read.gff <- function (x,name="score") { | |
123 fn.ext <- file_ext(x) | |
124 | |
125 if (grepl("gff",ignore.case=T,fn.ext)) { | |
126 temp.data <- read.table(x,row.names=NULL) | |
127 if (ncol(temp.data) > 5) { | |
128 # GFF | |
129 trim.data <- temp.data[,c(1,4,5,6)] | |
130 } else { | |
131 cat("Error: file does not appear to be in GFF format\n\n") | |
132 quit("no",1) | |
133 } | |
134 } else if (grepl("bed",ignore.case=T,fn.ext)) { | |
135 temp.data <- read.table(x,row.names=NULL,skip=1) | |
136 if (ncol(temp.data) == 4) { | |
137 # bedgraph | |
138 trim.data <- temp.data | |
139 } else { | |
140 cat("Error: file does not appear to be in bedGraph format\n\n") | |
141 quit("no",1) | |
142 } | |
143 } else { | |
144 cat("Error: input file does not appear to be in bedGraph or GFF format ...\n\n") | |
145 quit("no",1) | |
146 } | |
147 | |
148 names(trim.data) <- c("chr","start","end",name) | |
149 | |
150 trim.data$chr <- gsub("^chr","",trim.data$chr,perl=T) | |
151 | |
152 return(trim.data) | |
153 } | |
154 | |
155 gene.exp <- function (input.df, buffer=0, iter=50000, debug=F) { | |
156 | |
157 avg.exp <- data.frame(input.df[1,c(4:(length(names(input.df))))]) | |
158 avg <- vector(length=(length(names(input.df)) - 4)) | |
159 avg.exp <- avg.exp[0,] | |
160 | |
161 ### FDR calcs ### | |
162 # Method based off perl scripts by Tony Southall (TDS) as published in | |
163 # Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020 | |
164 # | |
165 # Significant modifications to the original methodology include: | |
166 # * taking a linear regression of log data rather than trial-and-error curve fitting of non-log data | |
167 # * using a linear regression for the final intercept value rather than using the average intercept value for all conditions | |
168 # -- both of these should increase the accuracy of the final FDR value. | |
169 | |
170 input.len <- length(input.df[,1]) | |
171 frag.samp <- c(1,2,3,4,6,8,10,12,15) | |
172 thres.samp <- c(0.1,0.2,0.3,0.4,0.5,0.65,0.8,1.0,1.5,2.0) | |
173 | |
174 rand <- list() | |
175 | |
176 for (thres in thres.samp) { | |
177 | |
178 cat(paste(" Calculating FDR for threshold",thres,"\n",sep=" ")) | |
179 | |
180 # init vars | |
181 for (f in frag.samp) { | |
182 # List names are, e.g. frag 1, thres 0.2: rand[[thres.1.0.2]] | |
183 rand[[paste("thres.",f,".",thres,sep="")]] <- 0; | |
184 } | |
185 | |
186 for (i in 1:iter) { | |
187 if (i %% 200 == 0) {cat(paste(" iter",i,"\r"))} | |
188 # get random sample for different fragment lengths | |
189 | |
190 rand.samp <- list() | |
191 | |
192 for (f in frag.samp) { | |
193 # Using the fourth column as we're only calculating FDR for one sample ... | |
194 rand.samp[[paste("rand.",f,sep="")]] <- mean(input.df[runif(f,1,input.len),4]) | |
195 } | |
196 | |
197 # count number of times exp > thres | |
198 for (f in frag.samp) { | |
199 if (rand.samp[[paste("rand.",f,sep="")]] > thres) {rand[[paste("thres.",f,".",thres,sep="")]] <- rand[[paste("thres.",f,".",thres,sep="")]] + 1} | |
200 } | |
201 } | |
202 } | |
203 | |
204 rand.fdr <- list() | |
205 | |
206 for (thres in thres.samp) { | |
207 for (f in frag.samp) { | |
208 rand.fdr[[paste("thres.",f,".",thres,sep="")]] <- rand[[paste("thres.",f,".",thres,sep="")]]/iter | |
209 } | |
210 } | |
211 | |
212 cat("Fitting curves ...\n") | |
213 | |
214 # curve fit: fdr vs thresholds | |
215 var.thres <- list() | |
216 | |
217 for (thres in thres.samp) { | |
218 for (f in frag.samp) { | |
219 var.thres[[paste("frags.",f,sep="")]] <- append(var.thres[[paste("frags.",f,sep="")]], rand.fdr[[paste("thres.",f,".",thres,sep="")]]) | |
220 } | |
221 } | |
222 | |
223 inf.log.lm <- function (v) { | |
224 non.inf <- log(v) != -Inf | |
225 ret <- lm(log(v)[non.inf] ~ thres.samp[non.inf]) | |
226 return(ret) | |
227 } | |
228 | |
229 # The relationship is exponential, so we need log data for a linear regression | |
230 # (in R, linear regression is: y = lm$coefficients[[2]]x + lm$coefficients[[1]] ... ) | |
231 var.lm <- list() | |
232 for (f in frag.samp) { | |
233 var.lm[[paste("frags.",f,sep="")]] <- inf.log.lm(var.thres[[paste("frags.",f,sep="")]]) | |
234 } | |
235 | |
236 # ... and now we do a linear regression on the slopes and intercepts of our previous regressions | |
237 # (This is the clever bit, and it actually seems to work. The correlation of slope to fragment size is linear ... | |
238 # By doing this on the slope and intercept, we can now predict the FDR for any number of fragments with any expression.) | |
239 slope <- vector() | |
240 for (f in frag.samp) { | |
241 slope <- append(slope, var.lm[[paste("frags.",f,sep="")]]$coefficients[[2]]) | |
242 } | |
243 | |
244 # slope regression predicts the average slope | |
245 slope.lm <- lm(slope ~ frag.samp) | |
246 | |
247 # TDS used an average intercept value for the intercept, however ... | |
248 inter <- vector() | |
249 for (f in frag.samp) { | |
250 inter <- append(inter, var.lm[[paste("frags.",f,sep="")]]$coefficients[[1]]) | |
251 } | |
252 | |
253 # ... there's actually quite a bit of variation of the intercept with real data, | |
254 # so we're going to do a lin regression on the intercept instead. | |
255 # | |
256 # (I'm not convinced it's a true linear relationship. But it's close to linear, | |
257 # and will certainly perform better than taking the mean intercept ...) | |
258 # | |
259 # If you're interested, set the debug flag to TRUE and take a look at the plots generated below ... | |
260 inter.lm <- lm(inter ~ frag.samp) | |
261 | |
262 if (debug == T) { | |
263 # plots for debugging/checking | |
264 plot.debug <- function (y,x,l,name="Debug plot") { | |
265 plot(y ~ x) | |
266 abline(l) | |
267 | |
268 lsum <- summary(l) | |
269 r2 <- lsum$r.squared | |
270 | |
271 legend("topleft",legend=r2,bty='n') | |
272 title(name) | |
273 | |
274 dev.copy(png,paste(name,".png",sep=""));dev.off() | |
275 } | |
276 | |
277 plot.debug(slope,frag.samp,slope.lm,name="correlation of slope") | |
278 plot.debug(inter,frag.samp,inter.lm,name="correlation of intercepts") | |
279 } | |
280 | |
281 # ok, so putting that all together ... | |
282 fdr <- function (frags, expr) { | |
283 inter.test <- inter.lm$coefficients[[2]] * frags + inter.lm$coefficients[[1]] | |
284 slope.test <- slope.lm$coefficients[[2]] * frags + slope.lm$coefficients[[1]] | |
285 | |
286 fdr.out <- exp(slope.test * expr + inter.test) | |
287 return(fdr.out) | |
288 } | |
289 | |
290 ### Gene expression values ### | |
291 cat("Calculating gene values ...\n") | |
292 | |
293 count <- 0 | |
294 | |
295 # unroll chromosomes for speed: | |
296 for (chromo in unique(genes$chr)) { | |
297 input.chr <- subset(input.df, chr==chromo) | |
298 genes.chr <- subset(genes, chr==chromo) | |
299 for (i in 1:length(genes.chr$name)) { | |
300 # Roll through each gene | |
301 | |
302 # Note: the original script calculated expression values for a table of proteins, | |
303 # here we just limit ourselves to the FDR for the first column past "chr", "start" and "end" | |
304 # so if you're thinking of looking at chromatin, for example, put PolII as column 4 in your table! | |
305 | |
306 gene.start <- genes.chr[i,"start"] - buffer | |
307 gene.end <- genes.chr[i,"end"] + buffer | |
308 | |
309 gene.start <- ifelse(gene.start < 1, 1, gene.start) | |
310 | |
311 # Create data frames for all gatc fragments covering current gene | |
312 exp <- data.frame(input.chr[ (input.chr$start <= gene.end) | |
313 & (input.chr$end >= gene.start) | |
314 ,] ) | |
315 | |
316 gatc.num <- length(exp[,1]) | |
317 | |
318 # skip if no gatc fragments cover gene :( | |
319 if (gatc.num == 0) {next} | |
320 | |
321 # trim to gene boundaries ... | |
322 exp$start[1] <- gene.start | |
323 exp$end[length(exp[,1])] <- gene.end | |
324 | |
325 # gene length covered by gatc fragments | |
326 len <- sum(exp$end-exp$start) | |
327 | |
328 # calculate weighted score for each column (representing different proteins) | |
329 for (j in 4:length(names(input.chr))) { | |
330 avg[j] <- (sum((exp$end-exp$start)*exp[j]))/len | |
331 } | |
332 | |
333 # make data.frame of averages (to be appended to avg.exp) | |
334 df <- cbind(avg[1]) | |
335 for (k in 2:length(avg)) { | |
336 df <- cbind(df,avg[k]) | |
337 } | |
338 df <- cbind(df,gatc.num) | |
339 | |
340 # only fdr for first column for now ... | |
341 gene.fdr <- fdr(gatc.num,avg[4]) | |
342 df <- cbind(df, gene.fdr) | |
343 | |
344 # append current gene to list | |
345 avg.exp <- rbind(avg.exp,data.frame(name=as.character(genes.chr[i,"name"]), df)) | |
346 count <- count+1 | |
347 if (count %% 50 == 0) {cat(paste(count,"genes averaged ...\r"))} | |
348 } | |
349 } | |
350 | |
351 avg.exp <- avg.exp[,c(1,5:(length(names(avg.exp))))] | |
352 names(avg.exp) <- c("name",names(input.df)[c(4:(length(names(input.df))))],"gatc.num","FDR") | |
353 | |
354 return(avg.exp) | |
355 } | |
356 | |
357 for (name in in.files) { | |
358 cat(paste("\nNow working on",name,"...\n")) | |
359 bname <- basename(name) | |
360 fname <- sub("\\..*","",bname,perl=T) | |
361 | |
362 polii <- read.gff(name,"polii") | |
363 | |
364 polii.exp <- gene.exp(polii, iter=as.numeric(op.args[["iter"]])) | |
365 | |
366 out <- subset(polii.exp,FDR < op.args[["fdr"]]) | |
367 | |
368 write.table(polii.exp,paste(fname,"genes.details.csv",sep="."),row.names=F,col.names=T,quote=T,sep="\t") | |
369 write.table(out$name, paste(fname,"genes",sep="."),row.names=F,col.names=F,quote=F) | |
370 } | |
371 | |
372 cat("\nAll done.\n\n") |