diff [APliBio]Nebula tools suite/Nebula/MakeTSSdist/makeTSSdist.R @ 0:2ec3ba0e9e70 draft

Uploaded
author alermine
date Thu, 25 Oct 2012 08:18:25 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/[APliBio]Nebula tools suite/Nebula/MakeTSSdist/makeTSSdist.R	Thu Oct 25 08:18:25 2012 -0400
@@ -0,0 +1,481 @@
+#usage $0 STEP RIGHT chipPeaks outputFile.png output.txt [controlPeaks] [1 for pdf]
+args <- commandArgs()
+print (args)
+myStep <- type.convert(args[4])
+maxValue <- type.convert(args[5])
+
+dataTable <-read.table(file=paste(args[6],".genes.ClosestPeakDist", sep=""), header=TRUE);
+chip.genes.ClosestPeakDist<-data.frame(dataTable)
+ifReg <- 0
+if (length(unique(chip.genes.ClosestPeakDist$Reg))>1) {
+ ifReg <- 1
+}
+ifControl <- 0
+
+
+ifPDF <- 0
+if (length(args)>=10) {
+	ifPDF=args[10]
+}
+if (length(args)==9 & args[9]==1) {
+	ifPDF=1
+}
+
+library(Hmisc)
+
+if (length(args)>=9 & args[9]!=1 & args[9]!=0) {
+  dataTable <-read.table(file=paste(args[9],".genes.ClosestPeakDist", sep=""), header=TRUE);
+  control.genes.ClosestPeakDist<-data.frame(dataTable)
+  ifControl <- 1
+}
+if (ifReg & ifControl) {
+  if (ifPDF==1) {
+	pdf(file = args[7], width = 19, height = 8, pointsize = 20, bg = "white")
+  } else {
+  	png(filename = args[7], width = 1440 , height = 680, units = "px", pointsize = 20, bg = "white", res = NA)
+	plot(1:10)
+  }
+  op <- par(mfrow = c(2,3))
+} else {
+  if (ifPDF==1) {
+	pdf(file = args[7], width = 10, height = 13, pointsize = 20, bg = "white")
+  } else {
+  	png(filename = args[7], width = 680, height = 880, units = "px", pointsize = 20, bg = "white", res = NA)
+	plot(1:10)
+  }
+ # plot(1:10)
+  op <- par(mfrow = c(2,1))
+}
+myColor <- 1
+myColor[1] <- colors()[131]
+myColor[2] <- "darkolivegreen3"
+myColor[3] <- "azure4"
+myColor[4] <- "royalblue3"
+myColor[5] <- colors()[17]
+
+myColorControl <- 1
+
+myColorControl[1] <- colors()[24]
+myColorControl[2] <- colors()[278]
+myColorControl[3] <- colors()[305]
+myColorControl[4] <- colors()[219]
+myColorControl[5] <- colors()[343]
+
+
+
+#for cumulative:
+dist_real_f <- chip.genes.ClosestPeakDist
+if (ifControl) {
+ dist_control_f <- control.genes.ClosestPeakDist
+}
+step <- myStep
+lim <- maxValue
+x <- 0
+count <- 1
+countL <-1
+n.types <- 1
+myLevels <- 0
+countTotalCont <- 0
+countTotal <-0
+countLCont <- 0
+cumTotalCont <- 0
+if (ifReg) {
+	 n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
+	 myLevels <- levels(chip.genes.ClosestPeakDist$Reg)
+	 cum = matrix( 0, nrow=lim/step +1,  ncol=n.types, byrow = TRUE) 
+	 for (i in c(1:n.types)) {
+	 	t <- which ((dist_real_f$Reg==myLevels[i]))
+	 	countL[i] <- length(t)
+         }	
+	count <-1 
+	for (i in seq(length=lim/step +1, from=0, by=step)) {
+		for (t in c(1:n.types)) {
+			tt <- which ((dist_real_f$Reg==myLevels[t])&(dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
+			cum[count,t] <- length(tt)	 		
+         	}		
+		x[count] <- i
+		count <- count + 1
+	}
+	ymax <- max(cum[,1]/countL[1], na.rm=TRUE)
+	for (i in c(2:n.types)) {
+	 	ymax <- max(ymax,max(cum[,i]/countL[i], na.rm=TRUE))
+         }
+	myLocCol <- myColor[2]	
+
+	par(mar=c(5.1, 7.1, 4.1, 2.1)) 
+
+	 plot (x,cum[,1]/countL[1] ,col = myColor[2],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
+	 for (i in c(2:n.types)) {
+	 	colorr <- i+1
+		myLocCol <- c(myLocCol,myColor[colorr])
+		lines (x,cum[,i]/countL[i] ,col = myColor[colorr],type="l", lwd = 2)
+		print (myColor[colorr])
+         }	
+	
+	gradi <- 1000
+	if (lim>10000) {
+		gradi <- 10000
+	}
+	if (lim<3000) {
+		gradi <- 500
+	}
+	 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
+  	 axisxlab <- paste(axisx/1000,"", sep = "")
+   	 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
+  	 ymax <- max(cum[,i]/countL[i], na.rm=TRUE)
+
+	  minor.tick(nx=5,tick.ratio=0.5)
+
+	 legend(lim*0.45, ymax*0.45, myLevels,  cex=1, lwd = 2, bty = "n", col = myLocCol, lty = c(1),  pt.bg= c(myLocCol) , merge = TRUE)
+
+	title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
+	
+	 if (ifControl) {
+                count <-1
+ 	 	n.types <- length(levels(control.genes.ClosestPeakDist$Reg))
+	 	myLevels <- levels(control.genes.ClosestPeakDist$Reg)
+		 cumCont = matrix( 0, nrow=lim/step +1,  ncol=n.types, byrow = TRUE) 
+		 for (i in c(1:n.types)) {
+		 	t <- which ((dist_control_f$Reg==myLevels[i]))
+		 	countLCont[i] <- length(t)
+		 }	 
+		for (i in seq(length=lim/step +1, from=0, by=step)) {
+			for (t in c(1:n.types)) {
+				tt <- which ((dist_control_f$Reg==myLevels[t])&(dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
+				cumCont[count,t] <- length(tt)	 		
+		 	}		
+			x[count] <- i
+			count <- count + 1
+		}
+		ymax <- max(cumCont[,1]/countLCont[1], na.rm=TRUE)
+		for (i in c(2:n.types)) {
+		 	ymax <- max(ymax,max(cumCont[,i]/countLCont[i], na.rm=TRUE))
+		 }
+		myLocColCntr <- myColorControl[2]	
+		plot (x,cumCont[,1]/countLCont[1] ,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
+		 for (i in c(2:n.types)) {
+		 	colorr <- i+1
+			myLocColCntr <- c(myLocColCntr,myColorControl[colorr])
+			lines (x,cumCont[,i]/countLCont[i] ,col = myColorControl[colorr],type="l", lwd = 2)
+		 }	
+	   	   if (lim>10000) {
+				gradi <- 10000
+		   }
+		   if (lim<3000) {
+				gradi <- 500
+		   }
+		   axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
+	  	 axisxlab <- paste(axisx/1000, sep = "")
+	   	 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
+  		minor.tick(nx=5,tick.ratio=0.5)
+		legend(lim*0.45, ymax*0.45, myLevels,  cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1),  pt.bg= c(myLocCol) , merge = TRUE)
+		title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
+		#real_vs_control_cumulative:
+		count <-1
+		countTotal <- length(dist_real_f$Reg)
+	   	cumTotal  <- 0
+		for (i in seq(length=lim/step +1, from=0, by=step)) {	
+			t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
+			cumTotal[count] <- length(t)
+			x[count] <- i
+			count <- count + 1
+		}
+		   plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
+		   gradi <- 1000
+		   if (lim>10000) {
+				gradi <- 10000
+		    }
+		    if (lim<3000) {
+				gradi <- 500
+		    }
+		    axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
+		   axisxlab <- paste(axisx/1000, sep = "")
+		   axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
+		   ymax <- max(cumTotal/countTotal, na.rm=TRUE)
+    		 minor.tick(nx=5,tick.ratio=0.5)
+		countTotalCont <- length(dist_control_f$Reg)
+	   	cumTotalCont  <- 0
+		count <- 1
+		for (i in seq(length=lim/step +1, from=0, by=step)) {	
+			t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
+			cumTotalCont[count] <- length(t)
+			x[count] <- i
+			count <- count + 1
+		}
+		lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
+		legend(lim*0.45, ymax*0.45, c("ChIP","Control"),  cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1),  pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
+		title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
+	}
+} else {
+   countTotal <- length(dist_real_f$Reg)
+   cumTotal  <- 0
+   count <-1
+
+   gradi <- 1000
+   if (lim>10000) {
+	gradi <- 10000
+   }
+   if (lim<3000) {
+	gradi <- 500
+   }
+
+   for (i in seq(length=lim/step +1, from=0, by=step)) {	
+	t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
+	cumTotal[count] <- length(t)
+	x[count] <- i
+	count <- count + 1
+   }
+   par(mar=c(5.1, 7.1, 4.1, 2.1)) 
+  
+   plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
+   axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
+   axisxlab <- paste(axisx/1000, sep = "")
+   axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
+title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
+   ymax <- max(cumTotal/countTotal, na.rm=TRUE)
+   if (ifControl) {
+	countTotalCont <- length(dist_control_f$Reg)
+   	cumTotalCont  <- 0
+	count <- 1
+	for (i in seq(length=lim/step +1, from=0, by=step)) {	
+		t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
+		cumTotalCont[count] <- length(t)
+		x[count] <- i
+		count <- count + 1
+	}
+	lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
+        legend(lim*0.45, ymax*0.45, c("ChIP","Control"),  cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1),  pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
+   } else {
+        legend(lim*0.45, ymax*0.45, c("ChIP"),  cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1),  pt.bg= c(myColor[1]) , merge = TRUE)
+   }
+}
+
+sink(args[8], append=FALSE, split=FALSE)
+if (ifReg) {
+	if (ifControl) {
+		cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
+		cat("\t")
+ 		for (i in c(1:n.types)) {
+		 	cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", sep=""))
+                	cat("\t")
+		}	
+
+ 		for (i in c(1:n.types)) {
+		 	cat(paste("% ", myLevels[i]," genes w/ a peak in Control", sep=""))
+			cat("\t")
+		}
+                cat("\n")
+		for (i in c(1:length(x))) {
+			cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
+                	cat("\t")
+	 		for (t in c(1:n.types)) {
+			 	cat(paste(cum[i,t]/countL[t],"\t", sep=""))
+			}
+	 		for (t in c(1:n.types)) {
+			 	cat(paste(cumCont[i,t]/countLCont[t],"\t", sep=""))
+			}	 		
+                	cat("\n")
+		}
+	}else {
+            cat (paste("Dist_TSS","\t",sep=''))
+ 	    for (i in c(1:n.types)) {
+		 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", "\t", sep=""))
+	    }
+ 	    cat("\n")
+	    for (i in c(1:length(x))) {
+			cat(paste(x[i],"\t",sep=""))	 		
+	 		for (t in c(1:n.types)) {
+			 	cat(paste(cum[i,t]/countL[t],"\t", sep=""))
+			}
+                	cat("\n")
+	    }
+	}
+} else {
+	if (ifControl) {
+		cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
+                cat("\n")
+		for (i in c(1:length(x))) {
+			cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
+                	cat("\n")
+		}
+	}else {
+		cat (paste("Dist_TSS","% genes w/ a peak in ChIP",sep='\t'))
+                cat("\n")
+		for (i in c(1:length(x))) {
+			cat(paste(x[i],cumTotal[i]/countTotal,sep="\t"))
+                	cat("\n")
+		}
+
+	}
+}
+
+
+#stop sinking:
+sink() 
+
+#around TSS:
+lim <- maxValue
+step <- myStep
+my_breaks <- seq(length=lim/step*2 +1, from=-lim, by=step)
+chip.genes <- read.table(file=paste(args[6],".genes", sep=""), header=TRUE) ;
+dist_real_f <- chip.genes
+if (ifControl) {
+   control.genes <- read.table(file=paste(args[9],".genes", sep=""), header=TRUE) ;   
+   dist_control_f<-data.frame(control.genes)
+}
+if (ifReg) {
+	#n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
+	 #myLevels <- levels(dist_real_f$Reg)
+
+	t<- which (dist_real_f$Reg==myLevels[1])
+	values_real <-dist_real_f$Dist[t]
+	hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real))  )
+        ymax <- max(hTSSreal$density, na.rm=TRUE)
+	for (i in c(2:n.types)) {
+		t<- which (dist_real_f$Reg==myLevels[i])
+		values_real <-dist_real_f$Dist[t]
+		hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real))  )
+        	ymax <- max(ymax,max(hTSSreal$density, na.rm=TRUE))
+        }
+
+
+	t<- which (dist_real_f$Reg==myLevels[1])
+	values_real <-dist_real_f$Dist[t]
+	hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real))  ) 
+        plot (hTSSreal$mids,hTSSreal$density,col = myLocCol[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax), xaxt="n" ) 
+	
+	title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
+
+	for (i in c(2:n.types)) {
+		t<- which (dist_real_f$Reg==myLevels[i])
+		values_real <-dist_real_f$Dist[t]
+		hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real))  )
+  		lines (hTSSreal$mids,hTSSreal$density,col = myLocCol[i],type="l", lwd = 2)
+        }
+	legend(lim*0.1, ymax*0.9, myLevels,  cex=1 , lwd = 2, bty = "n", col = myLocCol, lty = c(1),  pt.bg= c(myLocCol) , merge = TRUE)
+		
+	   gradi <- 1000
+	   if (lim>10000) {
+		gradi <- 10000
+	   }
+	   if (lim<3000) {
+		gradi <- 500
+	   }
+
+	   axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
+      	   axisxlab <- paste(axisx/1000, sep = "")
+	   axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
+
+
+  	#minor.tick(nx=10,tick.ratio=0.5)
+       if (ifControl) {
+		t<- which (dist_control_f$Reg==myLevels[1])
+		values_control <-dist_control_f$Dist[t]
+		hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control))  )
+		ymax <- max(hTSScontrol$density, na.rm=TRUE)
+		for (i in c(2:n.types)) {
+			t<- which (dist_control_f$Reg==myLevels[i])
+			values_control <-dist_control_f$Dist[t]
+			hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control))  )
+			ymax <- max(ymax,max(hTSScontrol$density, na.rm=TRUE))
+		}
+		t<- which (dist_control_f$Reg==myLevels[1])
+		values_control <-dist_control_f$Dist[t]
+		hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control))  )
+		plot (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax),xaxt="n" ) 
+		title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")		
+		for (i in c(2:n.types)) {
+			t<- which (dist_control_f$Reg==myLevels[i])
+			values_control <-dist_control_f$Dist[t]
+			hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control))  )
+	  		lines (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[i],type="l", lwd = 2)
+		}
+
+		gradi <- 1000
+		   if (lim>10000) {
+			gradi <- 10000
+		   }
+		   if (lim<3000) {
+			gradi <- 500
+		   }
+
+		   axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
+	      	   axisxlab <- paste(axisx/1000, sep = "")
+		   axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
+
+		legend(lim*0.1, ymax*0.9, myLevels,  cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1),  pt.bg= c(myLocCol) , merge = TRUE)
+		
+	  	# minor.tick(nx=10,tick.ratio=0.5)
+		#control vs real
+		 values_real <-dist_real_f$Dist
+		 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real))  )
+		 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n") 
+		title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")		
+		 ymax <- max(hTSSreal$density, na.rm=TRUE)
+		  values_control <-dist_control_f$Dist
+		  hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control))  )
+		  lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
+		  legend(lim*0.2, ymax*0.9, c("ChIP","Control"),  cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1),  pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)		 
+		
+		   gradi <- 1000
+		   if (lim>10000) {
+			gradi <- 10000
+		   }
+		   if (lim<3000) {
+			gradi <- 500
+		   }
+
+		   axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
+	      	   axisxlab <- paste(axisx/1000, sep = "")
+		   axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
+
+
+
+		 # minor.tick(nx=10,tick.ratio=0.5)
+       }
+} else {
+ values_real <-dist_real_f$Dist
+ hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real))  )
+ plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n") 
+		title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")		
+ ymax <- max(hTSSreal$density, na.rm=TRUE)
+ if (ifControl) {
+  values_control <-dist_control_f$Dist
+  hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control))  )
+  lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
+  legend(lim*0.2, ymax*0.9, c("ChIP","Control"),  cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1),  pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
+ } else {
+  legend(lim*0.2, ymax*0.9, c("ChIP"),  cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1),  pt.bg= c(myColor[1]) , merge = TRUE)
+ }
+
+ gradi <- 1000
+ if (lim>10000) {
+    gradi <- 10000
+  }
+  if (lim<3000) {
+	gradi <- 500
+  }
+
+  axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
+  axisxlab <- paste(axisx/1000, sep = "")
+  axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
+
+
+ # minor.tick(nx=10,tick.ratio=0.5)
+}
+dev.off()
+q();
+cat (paste("peak height","# peaks in ChIP","# peaks in Control","#control/chip","\n",sep='\t'))
+for (xval in c(minValue:maxValue)) {
+  for (i in (1:length(chipHist$mids))) {     
+     if (xval==chipHist$mids[i]) {
+      ychip <- chipHist$counts[i]
+     }
+  }
+  for (i in (1:length(controlHist$mids))) {     
+     if (xval==controlHist$mids[i]) {
+      ycontrol <- controlHist$counts[i]
+     }
+  }
+  cat (paste(xval,ychip,ycontrol,ycontrol/ychip,"\n",sep='\t'))
+}