Mercurial > repos > alermine > nebula
diff [APliBio]Nebula tools suite/Nebula/MakeTSSdist/makeTSSdist.R @ 0:2ec3ba0e9e70 draft
Uploaded
author | alermine |
---|---|
date | Thu, 25 Oct 2012 08:18:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/[APliBio]Nebula tools suite/Nebula/MakeTSSdist/makeTSSdist.R Thu Oct 25 08:18:25 2012 -0400 @@ -0,0 +1,481 @@ +#usage $0 STEP RIGHT chipPeaks outputFile.png output.txt [controlPeaks] [1 for pdf] +args <- commandArgs() +print (args) +myStep <- type.convert(args[4]) +maxValue <- type.convert(args[5]) + +dataTable <-read.table(file=paste(args[6],".genes.ClosestPeakDist", sep=""), header=TRUE); +chip.genes.ClosestPeakDist<-data.frame(dataTable) +ifReg <- 0 +if (length(unique(chip.genes.ClosestPeakDist$Reg))>1) { + ifReg <- 1 +} +ifControl <- 0 + + +ifPDF <- 0 +if (length(args)>=10) { + ifPDF=args[10] +} +if (length(args)==9 & args[9]==1) { + ifPDF=1 +} + +library(Hmisc) + +if (length(args)>=9 & args[9]!=1 & args[9]!=0) { + dataTable <-read.table(file=paste(args[9],".genes.ClosestPeakDist", sep=""), header=TRUE); + control.genes.ClosestPeakDist<-data.frame(dataTable) + ifControl <- 1 +} +if (ifReg & ifControl) { + if (ifPDF==1) { + pdf(file = args[7], width = 19, height = 8, pointsize = 20, bg = "white") + } else { + png(filename = args[7], width = 1440 , height = 680, units = "px", pointsize = 20, bg = "white", res = NA) + plot(1:10) + } + op <- par(mfrow = c(2,3)) +} else { + if (ifPDF==1) { + pdf(file = args[7], width = 10, height = 13, pointsize = 20, bg = "white") + } else { + png(filename = args[7], width = 680, height = 880, units = "px", pointsize = 20, bg = "white", res = NA) + plot(1:10) + } + # plot(1:10) + op <- par(mfrow = c(2,1)) +} +myColor <- 1 +myColor[1] <- colors()[131] +myColor[2] <- "darkolivegreen3" +myColor[3] <- "azure4" +myColor[4] <- "royalblue3" +myColor[5] <- colors()[17] + +myColorControl <- 1 + +myColorControl[1] <- colors()[24] +myColorControl[2] <- colors()[278] +myColorControl[3] <- colors()[305] +myColorControl[4] <- colors()[219] +myColorControl[5] <- colors()[343] + + + +#for cumulative: +dist_real_f <- chip.genes.ClosestPeakDist +if (ifControl) { + dist_control_f <- control.genes.ClosestPeakDist +} +step <- myStep +lim <- maxValue +x <- 0 +count <- 1 +countL <-1 +n.types <- 1 +myLevels <- 0 +countTotalCont <- 0 +countTotal <-0 +countLCont <- 0 +cumTotalCont <- 0 +if (ifReg) { + n.types <- length(levels(chip.genes.ClosestPeakDist$Reg)) + myLevels <- levels(chip.genes.ClosestPeakDist$Reg) + cum = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE) + for (i in c(1:n.types)) { + t <- which ((dist_real_f$Reg==myLevels[i])) + countL[i] <- length(t) + } + count <-1 + for (i in seq(length=lim/step +1, from=0, by=step)) { + for (t in c(1:n.types)) { + tt <- which ((dist_real_f$Reg==myLevels[t])&(dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i)) + cum[count,t] <- length(tt) + } + x[count] <- i + count <- count + 1 + } + ymax <- max(cum[,1]/countL[1], na.rm=TRUE) + for (i in c(2:n.types)) { + ymax <- max(ymax,max(cum[,i]/countL[i], na.rm=TRUE)) + } + myLocCol <- myColor[2] + + par(mar=c(5.1, 7.1, 4.1, 2.1)) + + plot (x,cum[,1]/countL[1] ,col = myColor[2],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax)) + for (i in c(2:n.types)) { + colorr <- i+1 + myLocCol <- c(myLocCol,myColor[colorr]) + lines (x,cum[,i]/countL[i] ,col = myColor[colorr],type="l", lwd = 2) + print (myColor[colorr]) + } + + gradi <- 1000 + if (lim>10000) { + gradi <- 10000 + } + if (lim<3000) { + gradi <- 500 + } + axisx <- seq(length=lim/gradi+1, from=0, by=gradi) + axisxlab <- paste(axisx/1000,"", sep = "") + axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1) + ymax <- max(cum[,i]/countL[i], na.rm=TRUE) + + minor.tick(nx=5,tick.ratio=0.5) + + legend(lim*0.45, ymax*0.45, myLevels, cex=1, lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE) + + title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)") + + if (ifControl) { + count <-1 + n.types <- length(levels(control.genes.ClosestPeakDist$Reg)) + myLevels <- levels(control.genes.ClosestPeakDist$Reg) + cumCont = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE) + for (i in c(1:n.types)) { + t <- which ((dist_control_f$Reg==myLevels[i])) + countLCont[i] <- length(t) + } + for (i in seq(length=lim/step +1, from=0, by=step)) { + for (t in c(1:n.types)) { + tt <- which ((dist_control_f$Reg==myLevels[t])&(dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i)) + cumCont[count,t] <- length(tt) + } + x[count] <- i + count <- count + 1 + } + ymax <- max(cumCont[,1]/countLCont[1], na.rm=TRUE) + for (i in c(2:n.types)) { + ymax <- max(ymax,max(cumCont[,i]/countLCont[i], na.rm=TRUE)) + } + myLocColCntr <- myColorControl[2] + plot (x,cumCont[,1]/countLCont[1] ,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax)) + for (i in c(2:n.types)) { + colorr <- i+1 + myLocColCntr <- c(myLocColCntr,myColorControl[colorr]) + lines (x,cumCont[,i]/countLCont[i] ,col = myColorControl[colorr],type="l", lwd = 2) + } + if (lim>10000) { + gradi <- 10000 + } + if (lim<3000) { + gradi <- 500 + } + axisx <- seq(length=lim/gradi+1, from=0, by=gradi) + axisxlab <- paste(axisx/1000, sep = "") + axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1) + minor.tick(nx=5,tick.ratio=0.5) + legend(lim*0.45, ymax*0.45, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE) + title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)") + #real_vs_control_cumulative: + count <-1 + countTotal <- length(dist_real_f$Reg) + cumTotal <- 0 + for (i in seq(length=lim/step +1, from=0, by=step)) { + t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i)) + cumTotal[count] <- length(t) + x[count] <- i + count <- count + 1 + } + plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" ) + gradi <- 1000 + if (lim>10000) { + gradi <- 10000 + } + if (lim<3000) { + gradi <- 500 + } + axisx <- seq(length=lim/gradi+1, from=0, by=gradi) + axisxlab <- paste(axisx/1000, sep = "") + axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1) + ymax <- max(cumTotal/countTotal, na.rm=TRUE) + minor.tick(nx=5,tick.ratio=0.5) + countTotalCont <- length(dist_control_f$Reg) + cumTotalCont <- 0 + count <- 1 + for (i in seq(length=lim/step +1, from=0, by=step)) { + t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i)) + cumTotalCont[count] <- length(t) + x[count] <- i + count <- count + 1 + } + lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2) + legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE) + title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)") + } +} else { + countTotal <- length(dist_real_f$Reg) + cumTotal <- 0 + count <-1 + + gradi <- 1000 + if (lim>10000) { + gradi <- 10000 + } + if (lim<3000) { + gradi <- 500 + } + + for (i in seq(length=lim/step +1, from=0, by=step)) { + t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i)) + cumTotal[count] <- length(t) + x[count] <- i + count <- count + 1 + } + par(mar=c(5.1, 7.1, 4.1, 2.1)) + + plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" ) + axisx <- seq(length=lim/gradi+1, from=0, by=gradi) + axisxlab <- paste(axisx/1000, sep = "") + axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1) +title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)") + ymax <- max(cumTotal/countTotal, na.rm=TRUE) + if (ifControl) { + countTotalCont <- length(dist_control_f$Reg) + cumTotalCont <- 0 + count <- 1 + for (i in seq(length=lim/step +1, from=0, by=step)) { + t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i)) + cumTotalCont[count] <- length(t) + x[count] <- i + count <- count + 1 + } + lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2) + legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE) + } else { + legend(lim*0.45, ymax*0.45, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE) + } +} + +sink(args[8], append=FALSE, split=FALSE) +if (ifReg) { + if (ifControl) { + cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t')) + cat("\t") + for (i in c(1:n.types)) { + cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", sep="")) + cat("\t") + } + + for (i in c(1:n.types)) { + cat(paste("% ", myLevels[i]," genes w/ a peak in Control", sep="")) + cat("\t") + } + cat("\n") + for (i in c(1:length(x))) { + cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t")) + cat("\t") + for (t in c(1:n.types)) { + cat(paste(cum[i,t]/countL[t],"\t", sep="")) + } + for (t in c(1:n.types)) { + cat(paste(cumCont[i,t]/countLCont[t],"\t", sep="")) + } + cat("\n") + } + }else { + cat (paste("Dist_TSS","\t",sep='')) + for (i in c(1:n.types)) { + cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", "\t", sep="")) + } + cat("\n") + for (i in c(1:length(x))) { + cat(paste(x[i],"\t",sep="")) + for (t in c(1:n.types)) { + cat(paste(cum[i,t]/countL[t],"\t", sep="")) + } + cat("\n") + } + } +} else { + if (ifControl) { + cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t')) + cat("\n") + for (i in c(1:length(x))) { + cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t")) + cat("\n") + } + }else { + cat (paste("Dist_TSS","% genes w/ a peak in ChIP",sep='\t')) + cat("\n") + for (i in c(1:length(x))) { + cat(paste(x[i],cumTotal[i]/countTotal,sep="\t")) + cat("\n") + } + + } +} + + +#stop sinking: +sink() + +#around TSS: +lim <- maxValue +step <- myStep +my_breaks <- seq(length=lim/step*2 +1, from=-lim, by=step) +chip.genes <- read.table(file=paste(args[6],".genes", sep=""), header=TRUE) ; +dist_real_f <- chip.genes +if (ifControl) { + control.genes <- read.table(file=paste(args[9],".genes", sep=""), header=TRUE) ; + dist_control_f<-data.frame(control.genes) +} +if (ifReg) { + #n.types <- length(levels(chip.genes.ClosestPeakDist$Reg)) + #myLevels <- levels(dist_real_f$Reg) + + t<- which (dist_real_f$Reg==myLevels[1]) + values_real <-dist_real_f$Dist[t] + hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) ) + ymax <- max(hTSSreal$density, na.rm=TRUE) + for (i in c(2:n.types)) { + t<- which (dist_real_f$Reg==myLevels[i]) + values_real <-dist_real_f$Dist[t] + hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) ) + ymax <- max(ymax,max(hTSSreal$density, na.rm=TRUE)) + } + + + t<- which (dist_real_f$Reg==myLevels[1]) + values_real <-dist_real_f$Dist[t] + hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) ) + plot (hTSSreal$mids,hTSSreal$density,col = myLocCol[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax), xaxt="n" ) + + title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)") + + for (i in c(2:n.types)) { + t<- which (dist_real_f$Reg==myLevels[i]) + values_real <-dist_real_f$Dist[t] + hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) ) + lines (hTSSreal$mids,hTSSreal$density,col = myLocCol[i],type="l", lwd = 2) + } + legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE) + + gradi <- 1000 + if (lim>10000) { + gradi <- 10000 + } + if (lim<3000) { + gradi <- 500 + } + + axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi) + axisxlab <- paste(axisx/1000, sep = "") + axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1) + + + #minor.tick(nx=10,tick.ratio=0.5) + if (ifControl) { + t<- which (dist_control_f$Reg==myLevels[1]) + values_control <-dist_control_f$Dist[t] + hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) ) + ymax <- max(hTSScontrol$density, na.rm=TRUE) + for (i in c(2:n.types)) { + t<- which (dist_control_f$Reg==myLevels[i]) + values_control <-dist_control_f$Dist[t] + hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) ) + ymax <- max(ymax,max(hTSScontrol$density, na.rm=TRUE)) + } + t<- which (dist_control_f$Reg==myLevels[1]) + values_control <-dist_control_f$Dist[t] + hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) ) + plot (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax),xaxt="n" ) + title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)") + for (i in c(2:n.types)) { + t<- which (dist_control_f$Reg==myLevels[i]) + values_control <-dist_control_f$Dist[t] + hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) ) + lines (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[i],type="l", lwd = 2) + } + + gradi <- 1000 + if (lim>10000) { + gradi <- 10000 + } + if (lim<3000) { + gradi <- 500 + } + + axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi) + axisxlab <- paste(axisx/1000, sep = "") + axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1) + + legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE) + + # minor.tick(nx=10,tick.ratio=0.5) + #control vs real + values_real <-dist_real_f$Dist + hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) ) + plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n") + title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)") + ymax <- max(hTSSreal$density, na.rm=TRUE) + values_control <-dist_control_f$Dist + hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) ) + lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2) + legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE) + + gradi <- 1000 + if (lim>10000) { + gradi <- 10000 + } + if (lim<3000) { + gradi <- 500 + } + + axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi) + axisxlab <- paste(axisx/1000, sep = "") + axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1) + + + + # minor.tick(nx=10,tick.ratio=0.5) + } +} else { + values_real <-dist_real_f$Dist + hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) ) + plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n") + title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)") + ymax <- max(hTSSreal$density, na.rm=TRUE) + if (ifControl) { + values_control <-dist_control_f$Dist + hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) ) + lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2) + legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE) + } else { + legend(lim*0.2, ymax*0.9, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE) + } + + gradi <- 1000 + if (lim>10000) { + gradi <- 10000 + } + if (lim<3000) { + gradi <- 500 + } + + axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi) + axisxlab <- paste(axisx/1000, sep = "") + axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1) + + + # minor.tick(nx=10,tick.ratio=0.5) +} +dev.off() +q(); +cat (paste("peak height","# peaks in ChIP","# peaks in Control","#control/chip","\n",sep='\t')) +for (xval in c(minValue:maxValue)) { + for (i in (1:length(chipHist$mids))) { + if (xval==chipHist$mids[i]) { + ychip <- chipHist$counts[i] + } + } + for (i in (1:length(controlHist$mids))) { + if (xval==controlHist$mids[i]) { + ycontrol <- controlHist$counts[i] + } + } + cat (paste(xval,ychip,ycontrol,ycontrol/ychip,"\n",sep='\t')) +}