comparison [APliBio]Nebula tools suite/Nebula/MakeTSSdist/makeTSSdist.R @ 0:2ec3ba0e9e70 draft

Uploaded
author alermine
date Thu, 25 Oct 2012 08:18:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2ec3ba0e9e70
1 #usage $0 STEP RIGHT chipPeaks outputFile.png output.txt [controlPeaks] [1 for pdf]
2 args <- commandArgs()
3 print (args)
4 myStep <- type.convert(args[4])
5 maxValue <- type.convert(args[5])
6
7 dataTable <-read.table(file=paste(args[6],".genes.ClosestPeakDist", sep=""), header=TRUE);
8 chip.genes.ClosestPeakDist<-data.frame(dataTable)
9 ifReg <- 0
10 if (length(unique(chip.genes.ClosestPeakDist$Reg))>1) {
11 ifReg <- 1
12 }
13 ifControl <- 0
14
15
16 ifPDF <- 0
17 if (length(args)>=10) {
18 ifPDF=args[10]
19 }
20 if (length(args)==9 & args[9]==1) {
21 ifPDF=1
22 }
23
24 library(Hmisc)
25
26 if (length(args)>=9 & args[9]!=1 & args[9]!=0) {
27 dataTable <-read.table(file=paste(args[9],".genes.ClosestPeakDist", sep=""), header=TRUE);
28 control.genes.ClosestPeakDist<-data.frame(dataTable)
29 ifControl <- 1
30 }
31 if (ifReg & ifControl) {
32 if (ifPDF==1) {
33 pdf(file = args[7], width = 19, height = 8, pointsize = 20, bg = "white")
34 } else {
35 png(filename = args[7], width = 1440 , height = 680, units = "px", pointsize = 20, bg = "white", res = NA)
36 plot(1:10)
37 }
38 op <- par(mfrow = c(2,3))
39 } else {
40 if (ifPDF==1) {
41 pdf(file = args[7], width = 10, height = 13, pointsize = 20, bg = "white")
42 } else {
43 png(filename = args[7], width = 680, height = 880, units = "px", pointsize = 20, bg = "white", res = NA)
44 plot(1:10)
45 }
46 # plot(1:10)
47 op <- par(mfrow = c(2,1))
48 }
49 myColor <- 1
50 myColor[1] <- colors()[131]
51 myColor[2] <- "darkolivegreen3"
52 myColor[3] <- "azure4"
53 myColor[4] <- "royalblue3"
54 myColor[5] <- colors()[17]
55
56 myColorControl <- 1
57
58 myColorControl[1] <- colors()[24]
59 myColorControl[2] <- colors()[278]
60 myColorControl[3] <- colors()[305]
61 myColorControl[4] <- colors()[219]
62 myColorControl[5] <- colors()[343]
63
64
65
66 #for cumulative:
67 dist_real_f <- chip.genes.ClosestPeakDist
68 if (ifControl) {
69 dist_control_f <- control.genes.ClosestPeakDist
70 }
71 step <- myStep
72 lim <- maxValue
73 x <- 0
74 count <- 1
75 countL <-1
76 n.types <- 1
77 myLevels <- 0
78 countTotalCont <- 0
79 countTotal <-0
80 countLCont <- 0
81 cumTotalCont <- 0
82 if (ifReg) {
83 n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
84 myLevels <- levels(chip.genes.ClosestPeakDist$Reg)
85 cum = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
86 for (i in c(1:n.types)) {
87 t <- which ((dist_real_f$Reg==myLevels[i]))
88 countL[i] <- length(t)
89 }
90 count <-1
91 for (i in seq(length=lim/step +1, from=0, by=step)) {
92 for (t in c(1:n.types)) {
93 tt <- which ((dist_real_f$Reg==myLevels[t])&(dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
94 cum[count,t] <- length(tt)
95 }
96 x[count] <- i
97 count <- count + 1
98 }
99 ymax <- max(cum[,1]/countL[1], na.rm=TRUE)
100 for (i in c(2:n.types)) {
101 ymax <- max(ymax,max(cum[,i]/countL[i], na.rm=TRUE))
102 }
103 myLocCol <- myColor[2]
104
105 par(mar=c(5.1, 7.1, 4.1, 2.1))
106
107 plot (x,cum[,1]/countL[1] ,col = myColor[2],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
108 for (i in c(2:n.types)) {
109 colorr <- i+1
110 myLocCol <- c(myLocCol,myColor[colorr])
111 lines (x,cum[,i]/countL[i] ,col = myColor[colorr],type="l", lwd = 2)
112 print (myColor[colorr])
113 }
114
115 gradi <- 1000
116 if (lim>10000) {
117 gradi <- 10000
118 }
119 if (lim<3000) {
120 gradi <- 500
121 }
122 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
123 axisxlab <- paste(axisx/1000,"", sep = "")
124 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
125 ymax <- max(cum[,i]/countL[i], na.rm=TRUE)
126
127 minor.tick(nx=5,tick.ratio=0.5)
128
129 legend(lim*0.45, ymax*0.45, myLevels, cex=1, lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
130
131 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
132
133 if (ifControl) {
134 count <-1
135 n.types <- length(levels(control.genes.ClosestPeakDist$Reg))
136 myLevels <- levels(control.genes.ClosestPeakDist$Reg)
137 cumCont = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
138 for (i in c(1:n.types)) {
139 t <- which ((dist_control_f$Reg==myLevels[i]))
140 countLCont[i] <- length(t)
141 }
142 for (i in seq(length=lim/step +1, from=0, by=step)) {
143 for (t in c(1:n.types)) {
144 tt <- which ((dist_control_f$Reg==myLevels[t])&(dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
145 cumCont[count,t] <- length(tt)
146 }
147 x[count] <- i
148 count <- count + 1
149 }
150 ymax <- max(cumCont[,1]/countLCont[1], na.rm=TRUE)
151 for (i in c(2:n.types)) {
152 ymax <- max(ymax,max(cumCont[,i]/countLCont[i], na.rm=TRUE))
153 }
154 myLocColCntr <- myColorControl[2]
155 plot (x,cumCont[,1]/countLCont[1] ,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
156 for (i in c(2:n.types)) {
157 colorr <- i+1
158 myLocColCntr <- c(myLocColCntr,myColorControl[colorr])
159 lines (x,cumCont[,i]/countLCont[i] ,col = myColorControl[colorr],type="l", lwd = 2)
160 }
161 if (lim>10000) {
162 gradi <- 10000
163 }
164 if (lim<3000) {
165 gradi <- 500
166 }
167 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
168 axisxlab <- paste(axisx/1000, sep = "")
169 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
170 minor.tick(nx=5,tick.ratio=0.5)
171 legend(lim*0.45, ymax*0.45, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
172 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
173 #real_vs_control_cumulative:
174 count <-1
175 countTotal <- length(dist_real_f$Reg)
176 cumTotal <- 0
177 for (i in seq(length=lim/step +1, from=0, by=step)) {
178 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
179 cumTotal[count] <- length(t)
180 x[count] <- i
181 count <- count + 1
182 }
183 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
184 gradi <- 1000
185 if (lim>10000) {
186 gradi <- 10000
187 }
188 if (lim<3000) {
189 gradi <- 500
190 }
191 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
192 axisxlab <- paste(axisx/1000, sep = "")
193 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
194 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
195 minor.tick(nx=5,tick.ratio=0.5)
196 countTotalCont <- length(dist_control_f$Reg)
197 cumTotalCont <- 0
198 count <- 1
199 for (i in seq(length=lim/step +1, from=0, by=step)) {
200 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
201 cumTotalCont[count] <- length(t)
202 x[count] <- i
203 count <- count + 1
204 }
205 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
206 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
207 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
208 }
209 } else {
210 countTotal <- length(dist_real_f$Reg)
211 cumTotal <- 0
212 count <-1
213
214 gradi <- 1000
215 if (lim>10000) {
216 gradi <- 10000
217 }
218 if (lim<3000) {
219 gradi <- 500
220 }
221
222 for (i in seq(length=lim/step +1, from=0, by=step)) {
223 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
224 cumTotal[count] <- length(t)
225 x[count] <- i
226 count <- count + 1
227 }
228 par(mar=c(5.1, 7.1, 4.1, 2.1))
229
230 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
231 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
232 axisxlab <- paste(axisx/1000, sep = "")
233 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
234 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
235 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
236 if (ifControl) {
237 countTotalCont <- length(dist_control_f$Reg)
238 cumTotalCont <- 0
239 count <- 1
240 for (i in seq(length=lim/step +1, from=0, by=step)) {
241 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
242 cumTotalCont[count] <- length(t)
243 x[count] <- i
244 count <- count + 1
245 }
246 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
247 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
248 } else {
249 legend(lim*0.45, ymax*0.45, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
250 }
251 }
252
253 sink(args[8], append=FALSE, split=FALSE)
254 if (ifReg) {
255 if (ifControl) {
256 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
257 cat("\t")
258 for (i in c(1:n.types)) {
259 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", sep=""))
260 cat("\t")
261 }
262
263 for (i in c(1:n.types)) {
264 cat(paste("% ", myLevels[i]," genes w/ a peak in Control", sep=""))
265 cat("\t")
266 }
267 cat("\n")
268 for (i in c(1:length(x))) {
269 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
270 cat("\t")
271 for (t in c(1:n.types)) {
272 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
273 }
274 for (t in c(1:n.types)) {
275 cat(paste(cumCont[i,t]/countLCont[t],"\t", sep=""))
276 }
277 cat("\n")
278 }
279 }else {
280 cat (paste("Dist_TSS","\t",sep=''))
281 for (i in c(1:n.types)) {
282 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", "\t", sep=""))
283 }
284 cat("\n")
285 for (i in c(1:length(x))) {
286 cat(paste(x[i],"\t",sep=""))
287 for (t in c(1:n.types)) {
288 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
289 }
290 cat("\n")
291 }
292 }
293 } else {
294 if (ifControl) {
295 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
296 cat("\n")
297 for (i in c(1:length(x))) {
298 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
299 cat("\n")
300 }
301 }else {
302 cat (paste("Dist_TSS","% genes w/ a peak in ChIP",sep='\t'))
303 cat("\n")
304 for (i in c(1:length(x))) {
305 cat(paste(x[i],cumTotal[i]/countTotal,sep="\t"))
306 cat("\n")
307 }
308
309 }
310 }
311
312
313 #stop sinking:
314 sink()
315
316 #around TSS:
317 lim <- maxValue
318 step <- myStep
319 my_breaks <- seq(length=lim/step*2 +1, from=-lim, by=step)
320 chip.genes <- read.table(file=paste(args[6],".genes", sep=""), header=TRUE) ;
321 dist_real_f <- chip.genes
322 if (ifControl) {
323 control.genes <- read.table(file=paste(args[9],".genes", sep=""), header=TRUE) ;
324 dist_control_f<-data.frame(control.genes)
325 }
326 if (ifReg) {
327 #n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
328 #myLevels <- levels(dist_real_f$Reg)
329
330 t<- which (dist_real_f$Reg==myLevels[1])
331 values_real <-dist_real_f$Dist[t]
332 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
333 ymax <- max(hTSSreal$density, na.rm=TRUE)
334 for (i in c(2:n.types)) {
335 t<- which (dist_real_f$Reg==myLevels[i])
336 values_real <-dist_real_f$Dist[t]
337 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
338 ymax <- max(ymax,max(hTSSreal$density, na.rm=TRUE))
339 }
340
341
342 t<- which (dist_real_f$Reg==myLevels[1])
343 values_real <-dist_real_f$Dist[t]
344 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
345 plot (hTSSreal$mids,hTSSreal$density,col = myLocCol[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax), xaxt="n" )
346
347 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
348
349 for (i in c(2:n.types)) {
350 t<- which (dist_real_f$Reg==myLevels[i])
351 values_real <-dist_real_f$Dist[t]
352 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
353 lines (hTSSreal$mids,hTSSreal$density,col = myLocCol[i],type="l", lwd = 2)
354 }
355 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
356
357 gradi <- 1000
358 if (lim>10000) {
359 gradi <- 10000
360 }
361 if (lim<3000) {
362 gradi <- 500
363 }
364
365 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
366 axisxlab <- paste(axisx/1000, sep = "")
367 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
368
369
370 #minor.tick(nx=10,tick.ratio=0.5)
371 if (ifControl) {
372 t<- which (dist_control_f$Reg==myLevels[1])
373 values_control <-dist_control_f$Dist[t]
374 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
375 ymax <- max(hTSScontrol$density, na.rm=TRUE)
376 for (i in c(2:n.types)) {
377 t<- which (dist_control_f$Reg==myLevels[i])
378 values_control <-dist_control_f$Dist[t]
379 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
380 ymax <- max(ymax,max(hTSScontrol$density, na.rm=TRUE))
381 }
382 t<- which (dist_control_f$Reg==myLevels[1])
383 values_control <-dist_control_f$Dist[t]
384 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
385 plot (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax),xaxt="n" )
386 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
387 for (i in c(2:n.types)) {
388 t<- which (dist_control_f$Reg==myLevels[i])
389 values_control <-dist_control_f$Dist[t]
390 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
391 lines (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[i],type="l", lwd = 2)
392 }
393
394 gradi <- 1000
395 if (lim>10000) {
396 gradi <- 10000
397 }
398 if (lim<3000) {
399 gradi <- 500
400 }
401
402 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
403 axisxlab <- paste(axisx/1000, sep = "")
404 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
405
406 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
407
408 # minor.tick(nx=10,tick.ratio=0.5)
409 #control vs real
410 values_real <-dist_real_f$Dist
411 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
412 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
413 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
414 ymax <- max(hTSSreal$density, na.rm=TRUE)
415 values_control <-dist_control_f$Dist
416 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
417 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
418 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
419
420 gradi <- 1000
421 if (lim>10000) {
422 gradi <- 10000
423 }
424 if (lim<3000) {
425 gradi <- 500
426 }
427
428 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
429 axisxlab <- paste(axisx/1000, sep = "")
430 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
431
432
433
434 # minor.tick(nx=10,tick.ratio=0.5)
435 }
436 } else {
437 values_real <-dist_real_f$Dist
438 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
439 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
440 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
441 ymax <- max(hTSSreal$density, na.rm=TRUE)
442 if (ifControl) {
443 values_control <-dist_control_f$Dist
444 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
445 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
446 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
447 } else {
448 legend(lim*0.2, ymax*0.9, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
449 }
450
451 gradi <- 1000
452 if (lim>10000) {
453 gradi <- 10000
454 }
455 if (lim<3000) {
456 gradi <- 500
457 }
458
459 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
460 axisxlab <- paste(axisx/1000, sep = "")
461 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
462
463
464 # minor.tick(nx=10,tick.ratio=0.5)
465 }
466 dev.off()
467 q();
468 cat (paste("peak height","# peaks in ChIP","# peaks in Control","#control/chip","\n",sep='\t'))
469 for (xval in c(minValue:maxValue)) {
470 for (i in (1:length(chipHist$mids))) {
471 if (xval==chipHist$mids[i]) {
472 ychip <- chipHist$counts[i]
473 }
474 }
475 for (i in (1:length(controlHist$mids))) {
476 if (xval==controlHist$mids[i]) {
477 ycontrol <- controlHist$counts[i]
478 }
479 }
480 cat (paste(xval,ychip,ycontrol,ycontrol/ychip,"\n",sep='\t'))
481 }