diff [APliBio]Nebula tools suite/Nebula/AnnotateGenes/geneDist.R @ 0:2ec3ba0e9e70 draft

Uploaded
author alermine
date Thu, 25 Oct 2012 08:18:25 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/[APliBio]Nebula tools suite/Nebula/AnnotateGenes/geneDist.R	Thu Oct 25 08:18:25 2012 -0400
@@ -0,0 +1,348 @@
+args <- commandArgs()
+input <- args[4]
+pngFile <- args[5]
+dataTable <-read.table(file=input, header=TRUE);
+chip.data<-data.frame(dataTable)
+ifReg <- 0
+if (length(unique(chip.data$Reg))>1) {
+ ifReg <- 1
+}
+
+ifPDF <- 0
+bootstrap <- 1
+
+if (length(args)>=8) {
+	ifPDF=args[8]
+	bootstrap=args[9]
+}
+if (length(args)==7 & args[7]==1) {
+	ifPDF=1
+}
+
+ifControl <- 0
+if (length(args)>=7 & args[7]!=1 & args[7]!=0) {
+  dataTable <-read.table(file=args[7], header=TRUE);
+  control.data<-data.frame(dataTable)
+  ifControl <- 1
+
+
+}
+
+error.bar <- function(x, y, upper, lower=upper, length=0.1,...){
+      if(length(x) != length(y) | length(y) !=length(lower) | length(lower) != length(upper))
+      stop("vectors must be same length")
+      arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...)
+}
+
+
+
+logFile <- args[6]
+sink(logFile, append=FALSE, split=FALSE)
+
+if (ifReg & ifControl) {
+
+  if (ifPDF==1) {
+       pdf(file = pngFile, width = 14, height = 13, pointsize = 20, bg = "white")
+  } else {
+       png(filename = pngFile, width =  1140, height =  840, units = "px", pointsize = 20, bg = "white", res = NA)
+       plot(1:10)
+  }
+  op <- par(mfrow = c(3,1))
+} else {
+	if (ifReg | ifControl) {
+
+	  if (ifPDF==1) {
+		pdf(file = pngFile, width = 20, height = 17, pointsize = 20, bg = "white")
+	  } else {
+		png(filename = pngFile, width = 1580, height = 1230, units = "px", pointsize = 20, bg = "white", res = NA)
+		plot(1:10)
+	  }
+	  op <- par(mfrow = c(2,1))
+	} else {
+	  if (ifPDF==1) {
+		pdf(file = pngFile, width = 22, height = 8, pointsize = 20, bg = "white")
+	  } else {
+		png(filename = pngFile, width = 1580, height = 530, units = "px", pointsize = 20, bg = "white", res = NA)
+		plot(1:10)
+	  }
+	  op <- par(mfrow = c(1,1))
+	}
+}
+myColor <- 1
+myColor[1] <- colors()[131]
+myColor[2] <- colors()[59]
+myColor[3] <- colors()[76]
+myColor[4] <- colors()[88]
+myColor[5] <- colors()[17]
+myColor[6] <- colors()[565]
+myColor[7] <- colors()[454]
+myColor[8] <- colors()[401]
+myColor[9] <- colors()[99]
+myColorControl <- 1
+myColorControl[1] <- colors()[24]
+myColorControl[2] <- colors()[278]
+myColorControl[3] <- colors()[305]
+myColorControl[4] <- colors()[219]
+myColorControl[5] <- colors()[343]
+myColorControl[6] <- colors()[245]
+myLevels <- 0
+nn <- colnames(chip.data)
+cc <- c(1:41)
+colnames(chip.data) <- cc
+countTotal <- length(unique(chip.data$"1"))
+tt <- which(chip.data$"16">0)
+countTotalEnh <- length(unique(chip.data$"1"[tt]))
+tt <- which(chip.data$"10">0)
+countTotalProm <- length(unique(chip.data$"1"[tt]))
+tt <- which(chip.data$"12">0)
+countTotalImDown <- length(unique(chip.data$"1"[tt]))
+tt <- which(chip.data$"18">0)
+countTotalIntra <- length(unique(chip.data$"1"[tt]))
+tt <- which(chip.data$"20">0)
+countTotal5kbD <- length(unique(chip.data$"1"[tt]))
+tt <- which(chip.data$"28">0)
+countTotal1IntronI <- length(unique(chip.data$"1"[tt]))
+tt <- which(chip.data$"32">0)
+countTotalExonsI <- length(unique(chip.data$"1"[tt]))
+tt <- which(chip.data$"36">0)
+countTotalIntronsI <- length(unique(chip.data$"1"[tt]))
+tt <- which(chip.data$"40">0)
+countTotalJonctionsI <- length(unique(chip.data$"1"[tt]))
+
+names <- c("Enh.","Prom.","Imm.Down.","Intrag.","GeneDown.","F.Intron","Exons","2,3,etc.Introns","E.I.Junctions")
+yChIPTotal <- c (countTotalEnh/countTotal,countTotalProm/countTotal, countTotalImDown/countTotal,countTotalIntra/countTotal,countTotal5kbD/countTotal,countTotal1IntronI/countTotal,countTotalExonsI/countTotal,countTotalIntronsI/countTotal,countTotalJonctionsI/countTotal)
+if(!ifReg&!ifControl) {
+        par(mar=c(5.1, 7.1, 4.1, 2.1)) 
+	barplot(yChIPTotal,xlab="",beside=TRUE, col=c(myColor), names.arg=c(names),ylab="Proportion of genes with a peak")
+	cat ("Proportion of genes with a peak in a given genomic region:\n")
+	cat (paste(c(names),sep='\t'))
+	cat("\n")
+	cat (paste(c(yChIPTotal),sep='\t'))
+	cat("\n\n")
+}
+if (ifControl) {
+	if (bootstrap>1) {
+		yControlTotalMatrix <- NULL	
+	}
+	for (fileNumber in 1:bootstrap) {
+
+		if (fileNumber>=2) {
+			dataTable <-read.table(file=paste(args[7],fileNumber,sep=""), header=TRUE);
+ 			control.data<-data.frame(dataTable)
+		}
+
+                colnames(control.data) <- cc
+		countTotalCntr <- length(unique(control.data$"1"))
+		tt <- which(control.data$"16">0)
+		countTotalEnhCntr <- length(unique(control.data$"1"[tt]))
+		tt <- which(control.data$"10">0)
+		countTotalPromCntr <- length(unique(control.data$"1"[tt]))
+		tt <- which(control.data$"12">0)
+		countTotalImDownCntr <- length(unique(control.data$"1"[tt]))
+		tt <- which(control.data$"18">0)
+		countTotalIntraCntr <- length(unique(control.data$"1"[tt]))
+		tt <- which(control.data$"20">0)
+		countTotal5kbDCntr <- length(unique(control.data$"1"[tt]))
+		tt <- which(control.data$"28">0)
+		countTotal1IntronICntr <- length(unique(control.data$"1"[tt]))
+		tt <- which(control.data$"32">0)
+		countTotalExonsICntr <- length(unique(control.data$"1"[tt]))
+		tt <- which(control.data$"36">0)
+		countTotalIntronsICntr <- length(unique(control.data$"1"[tt]))
+		tt <- which(control.data$"40">0)
+		countTotalJonctionsICntr <- length(unique(control.data$"1"[tt]))
+		yControlTotal <- c (countTotalEnhCntr/countTotalCntr,countTotalPromCntr/countTotalCntr, countTotalImDownCntr/countTotalCntr,countTotalIntraCntr/countTotalCntr,countTotal5kbDCntr/countTotalCntr,countTotal1IntronICntr/countTotalCntr,countTotalExonsICntr/countTotalCntr,countTotalIntronsICntr/countTotalCntr,countTotalJonctionsICntr/countTotalCntr)	
+		if (bootstrap>1) {
+			yControlTotalMatrix <- rbind(yControlTotalMatrix,yControlTotal)	
+		}
+	}
+	if (bootstrap>1) {
+		yControlTotal <- colMeans(yControlTotalMatrix)
+		Nrows <- nrow(yControlTotalMatrix)
+		colVars <-  Nrows/(Nrows-1) * (colMeans(yControlTotalMatrix*yControlTotalMatrix)-colMeans(yControlTotalMatrix)^2)
+	} 
+	if (!ifReg) {
+	 	cum = matrix( 0, nrow=2,  ncol=length(names), byrow = TRUE) 
+		for (i in c(1:length(names))) {	
+		  cum[1,i] <- yChIPTotal[i]
+		  cum[2,i] <- yControlTotal[i]
+		}
+		if (bootstrap>1) {
+			wiskers <- matrix(c(colVars-colVars,sqrt(colVars)),2,length(names),byrow=TRUE)*1.96
+		} 
+		par(mar=c(5.1, 7.1, 4.1, 2.1)) 
+		barx <- barplot(cum,xlab="",beside=TRUE, col=c(myColor[6],myColor[5]),  legend = c("ChIP","Control"), names.arg=c(names),ylab="Proportion of genes with a peak")
+		cat ("Proportion of genes with a peak from the ChIP dataset in a given genomic region:\n")
+		cat (paste(c(names),sep='\t'))
+		cat("\n")
+		cat (paste(c(yChIPTotal),sep='\t'))
+		cat("\n\n")
+		cat ("Proportion of genes with a peak from the Control dataset in a given genomic region:\n")
+		cat (paste(c(names),sep='\t'))
+		cat("\n")
+		cat (paste(c(yControlTotal),sep='\t'))
+		cat("\n\n")
+		if (bootstrap>1) {
+			error.bar(barx,cum,wiskers)
+			cat ("Standard deviation for the Control dataset in a given genomic region:\n")
+			cat (paste(c(names),sep='\t'))
+			cat("\n")
+			cat (paste(c(sqrt(colVars)),sep='\t'))
+			cat("\n\n")
+		} 
+		enrich <- NULL
+		for (i in c(1:length(names))) {	
+		  enrich[i] <- yChIPTotal[i]/yControlTotal[i];	  
+		}
+		barplot(enrich-1,xlab="",beside=TRUE, col=c(myColor), names.arg=c(names),ylab="Enrichment in comparison\nwith the control dataset",  yaxt="n")
+		minX <- min(enrich-1)
+		maxX <- max(enrich-1)
+		x = seq(length=11, from=round(minX*10)/10, by=round((maxX-minX)*10)/100)
+		axis(2, at=x,labels=x+1, las=2)
+
+		cat ("Enrichment of genomic regions, ChIP peaks vs Control Peaks:\n")
+		cat (paste(c(names),sep='\t'))
+		cat("\n")
+		cat (paste(c(yChIPTotal/yControlTotal),sep='\t'))
+		cat("\n\n")
+		if (bootstrap>1) {
+			z <- (yChIPTotal-yControlTotal)/sqrt(colVars)
+			pvalues <- 2*pnorm(-abs(z))
+			cat ("Two-side P-values for each genomic category:\n")
+			cat (paste(c(names),sep='\t'))
+			cat("\n")
+			cat (paste(c(yChIPTotal/yControlTotal),sep='\t'))
+			cat("\n\n")
+		}
+	}
+}
+if (ifReg) { 	
+	 n.types <- length(levels(chip.data$"6"))
+	 myLevels <- levels(chip.data$"6")
+	 nlev <- length(names)
+
+	if (ifControl) {
+		cum = matrix( 0, nrow=length(myLevels)+1,  ncol=nlev, byrow = TRUE) 
+         	cumEnrichTotal = matrix( 0, nrow=length(myLevels),  ncol=nlev, byrow = TRUE) 
+         	cumEnrichControl = matrix( 0, nrow=length(myLevels),  ncol=nlev, byrow = TRUE) 
+	}else  {
+		cum = matrix( 0, nrow=length(myLevels),  ncol=nlev, byrow = TRUE) 
+         	cumEnrichTotal = matrix( 0, nrow=length(myLevels),  ncol=nlev, byrow = TRUE) 
+	}
+	colReg <-NULL
+	for (r in c(1:length(myLevels))) {
+		      tt <- which(chip.data$"6"==myLevels[r])
+		      subset.data <- (chip.data[tt,])
+		      countTotalSubset <- length(unique(subset.data$"1"))
+			tt <- which(subset.data$"16">0)
+			countTotalEnhSubset <- length(unique(subset.data$"1"[tt]))
+			tt <- which(subset.data$"10">0)
+			countTotalPromSubset <- length(unique(subset.data$"1"[tt]))
+			tt <- which(subset.data$"12">0)
+			countTotalImDownSubset <- length(unique(subset.data$"1"[tt]))
+			tt <- which(subset.data$"18">0)
+			countTotalIntraSubset <- length(unique(subset.data$"1"[tt]))
+			tt <- which(subset.data$"20">0)
+			countTotal5kbDSubset <- length(unique(subset.data$"1"[tt]))
+			tt <- which(subset.data$"28">0)
+			countTotal1IntronISubset <- length(unique(subset.data$"1"[tt]))
+			tt <- which(subset.data$"32">0)
+			countTotalExonsISubset <- length(unique(subset.data$"1"[tt]))
+			tt <- which(subset.data$"36">0)
+			countTotalIntronsISubset <- length(unique(subset.data$"1"[tt]))
+			tt <- which(subset.data$"40">0)
+			countTotalJonctionsISubset <- length(unique(subset.data$"1"[tt]))
+		      ySubsetTotal <- c (countTotalEnhSubset/countTotalSubset,countTotalPromSubset/countTotalSubset, countTotalImDownSubset/countTotalSubset,countTotalIntraSubset/countTotalSubset,countTotal5kbDSubset/countTotalSubset,countTotal1IntronISubset/countTotalSubset,countTotalExonsISubset/countTotalSubset,countTotalIntronsISubset/countTotalSubset,countTotalJonctionsISubset/countTotalSubset)
+		      for (i in c(1:nlev)) {	
+			cum[r,i] <- ySubsetTotal[i]  
+			cumEnrichTotal[r,i] <- ySubsetTotal[i]/yChIPTotal[i]   
+			if (ifControl) {
+				cumEnrichControl[r,i] <- ySubsetTotal[i]/yControlTotal[i]   
+			}      
+		      }
+		      colReg[r]<-myColor[r+3]
+	}
+	if (ifControl) {
+	 	for (i in c(1:nlev)) {	
+			cum[4,i] <- yControlTotal[i]                
+		}		
+	}
+	
+	cat ("Proportion of genes with a peak from the ChIP dataset in a given genomic region:\n")
+	for (r in c(1:length(myLevels))) {
+		cat (paste(myLevels[r],":\n",sep=""))
+		cat (paste(c(names),sep='\t'))
+		cat("\n")
+		cat (paste(c(cum[r,] ),sep='\t'))
+		cat("\n")
+	}
+	cat("\n")
+	par(mar=c(5.1, 7.1, 4.1, 2.1)) 	
+	if (ifControl) {
+		barx <- barplot(cum,xlab="",beside=TRUE, col=c(colReg, colors()[334]),  legend = c(myLevels,"Control"), names.arg=c(names),ylab="Proportion of genes with a peak")
+		cat ("Proportion of genes with a peak from the Control dataset in a given genomic region:\n")
+		cat (paste(c(names),sep='\t'))	
+		cat("\n")
+		cat (paste(c(yControlTotal),sep='\t'))
+		cat("\n\n")				
+		if (bootstrap>1) {
+			wiskers <- cum-cum
+			wiskers[nrow(wiskers),] <- sqrt(colVars)*1.96
+			error.bar(barx,cum,wiskers)	
+			cat ("Standard deviation for the Control dataset in a given genomic region:\n")
+			cat (paste(c(names),sep='\t'))
+			cat("\n")
+			cat (paste(c(sqrt(colVars)),sep='\t'))
+			cat("\n\n")	
+		} 
+	} else {
+		barplot(cum,xlab="",beside=TRUE, col=c(colReg),  legend = c(myLevels), names.arg=c(names),ylab="Proportion of genes with a peak")
+	}		
+	barplot(cumEnrichTotal-1,xlab="",beside=TRUE, col=c(colReg), names.arg=c(names),ylab="Enrichment in comparison\nwith the total gene set",  yaxt="n")
+	minX <- min(cumEnrichTotal-1)
+	maxX <- max(cumEnrichTotal-1)
+	x = seq(length=11, from=round(minX*10)/10, by=round((maxX-minX)*10)/100)
+	axis(2, at=x,labels=x+1, las=2)
+
+	cat ("Enrichment of genomic regions, Transcriptional categories vs All Genes:\n")
+	for (r in c(1:length(myLevels))) {
+			cat (paste(myLevels[r],":\n",sep=""))
+			cat (paste(c(names),sep='\t'))	
+			cat("\n")
+			cat (paste(c(cumEnrichTotal[r,]),sep='\t'))			
+			cat("\n")
+	}	
+	cat("\n")
+
+	if (ifControl) {
+		barplot(cumEnrichControl-1,xlab="",beside=TRUE, col=c(colReg), names.arg=c(names),ylab="Enrichment in comparison\nwith control",  yaxt="n")
+		minX <- min(cumEnrichControl-1)
+		maxX <- max(cumEnrichControl-1)
+		x = seq(length=11, from=round(minX*10)/10, by=round((maxX-minX)*10)/100)
+		axis(2, at=x,labels=x+1, las=2)
+		cat ("Enrichment of genomic regions, ChIP peaks vs Control Peaks:\n")
+		for (r in c(1:length(myLevels))) {
+			cat (paste(myLevels[r],":\n",sep=""))
+			cat (paste(c(names),sep='\t'))	
+			cat("\n")
+			cat (paste(c(cumEnrichControl[r,]),sep='\t'))			
+			cat("\n")
+		}	
+		cat("\n")
+		if (bootstrap>1) {
+			cat ("Two-side P-values for each genomic category:\n")
+			for (r in c(1:length(myLevels))) {
+				z <- (cum[r,]-yControlTotal)/sqrt(colVars)
+				pvalues <- 2*pnorm(-abs(z))
+				cat (paste(myLevels[r],":\n",sep=""))
+				cat (paste(c(names),sep='\t'))	
+				cat("\n")
+				cat (paste(c(pvalues),sep='\t'))			
+				cat("\n")
+			}	
+		}
+	}	
+} 
+sink() #stop sinking :)
+dev.off()
+