diff mtls_analyze/map_ChIP_peaks_to_genes.R @ 0:a903c48f05b4

Added non-integrated scripts to galaxy
author kmace
date Thu, 22 Mar 2012 15:21:00 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/map_ChIP_peaks_to_genes.R	Thu Mar 22 15:21:00 2012 -0400
@@ -0,0 +1,304 @@
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+## Nov 2011 th17
+## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, 
+## NYU - Center for Genomics and Systems Biology
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+
+rm(list=ls())
+debug=F
+script.version=0.1
+print.error <- function(){
+  cat("
+NAME
+       map_ChIP_peaks_to_genes.R
+
+DESCRIPTION
+       This script takes a MACS tab delimited file as input and
+       produces two files:
+
+       1) genes.xls
+          A gene centered table that stores the peaks that are
+          proximal (within x[kb] of TSS) and distal (within the
+          gene's region + y[kb] upstream or downstream of TSS and
+          TES respectively).
+
+       2) peaks.xls
+          A peak centered table that equals the MACS input table plus
+          colums for proximal and distal targets of each peak and
+          their distance from TSS (if exist). For gene.xls script also
+          gives a poisson model based -log10(pval) score for proximal
+          and genewide enrichment of peaks for each gene with at least
+          one prox/dist peak associated with it.
+
+ARGUMENTS
+       input_file=path
+         Path to MACS file.
+
+       refseq_table=path
+         Path to refseq table (gives TSS/TES locations for all genes).
+
+       path_output=path
+         Path to save genes.xls and peaks.xls output files.
+
+       tss.dist=N
+         The absolute distance from TSS where we connect a peak to
+         gene (for proximal peaks).
+
+       gene.wide.dist=N
+         The absolute distance from TSS or TES where we connect a peak
+         to gene (for peaks hitting anywhere in gene).
+
+       effective.genome.size=N
+         The effective mappable genome size (for mm9 the value is
+         1.87e9. For hg19 the value is 2.7e9 (from MACS manual)).
+
+       macs.skip.lines=N
+         Number of lines to skip in input_file.
+
+EXAMPLE 
+       Rscript map_ChIP_peaks_to_genes_v.1.R \\
+         input_file=SL971_SL970_peaks.xls \\
+         refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\
+         path_output=./ \\
+         tss.dist=5000 \\
+         gene.wide.dist=10000 \\
+         effective.genome.size=1.87e9 \\
+         macs.skip.lines=23
+
+CITATION
+       Please cite us if you used this script: The transcription
+       factor network regulating Th17 lineage specification and
+       function.  Maria Ciofani, Aviv Madar, Carolina Galan, Kieran
+       Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard
+       Bonneau and Dan R. Littman et. al. (in preperation).
+
+")
+}
+
+# retrieve args
+if(debug==T){
+	cmd.args <- c(
+		# "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls",
+		"input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls",		
+		"refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt",
+		"path_output=/Users/aviv/Desktop/test_script/",
+		"tss.dist=5000",
+		"tes.dist=10000",
+		"effective.genome.size=1.87e9",
+		"gene.wide.dist=10000",
+		"macs.skip.lines=23"
+	)
+} else {
+	cmd.args <- commandArgs();      
+}
+
+if(length(grep("--version",cmd.args))){
+	cat("version",script.version,"\n")
+	q()
+}
+
+arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
+args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
+
+arg.nms <- c("input_file","refseq_table","path_output","tss.dist",
+			 "gene.wide.dist","effective.genome.size","macs.skip.lines")
+arg.val <- character(length=length(arg.nms))
+for(i in 1:length(arg.nms)){
+	ix <- which(arg.names.cmd.line==arg.nms[i])
+	if(length(ix)==1){
+		arg.val[i] <- args.val.cmd.line[ix]
+	} else {
+		stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
+		
+	} 
+}
+if(debug==T){
+	print(paste(arg.nms,"=",arg.val))
+}
+# the files here adhere to tab delim format
+path.input.macs <- arg.val[1]
+path.input.refseq <- arg.val[2]
+path.output <- arg.val[3]
+tss.dist <- as.numeric(arg.val[4])
+gene.wide.dist <- as.numeric(arg.val[5])
+effective.genome.size <- as.numeric(arg.val[6])
+macs.skip.lines=as.numeric(arg.val[7])
+
+if(debug==T){
+# if(1){
+	load("/Users/aviv/Desktop/r.u.RData")
+	gn.nms <- rownames(r.u)
+} else {
+	##################
+	### step 1 handle refseq table file (read, make unique)
+	cat("reading refseq table",path.input.refseq,"\n")
+	refseq <- read.delim(file=path.input.refseq)
+	# refseq can have many transcripts for each gene
+	# here i make it have only one transcript for each gene (the longest one)
+	cat("for transcripts matching more than one gene keeping only the longest transcript\n")
+	refseq$name2 <- as.character(refseq$name2)
+	refseq$chrom <- as.character(refseq$chrom)
+	refseq$strand <- as.character(refseq$strand)
+	gn.nms <- unique(refseq$name2)
+	#x <- sort(table(refseq$name2),decreasing=T)
+	#gn.nms.non.unique <- names(x)[which(x>1)]
+	#gn.nms.unique <- names(x)[which(x==1)]
+	# create refseq unique r.u
+	n <- length(gn.nms)
+	r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms)
+	for(i in 1:n){
+	  ix <- which(refseq$name2==gn.nms[i])
+	  if(length(ix)==0) {
+	    error("could not find gene", ng.nms[i], "in refseq table.  Bailing out...\n")
+	  } else if (length(ix)>1){
+	    l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) )
+	    l.max <- max(l)
+	    ix <- ix[which(l==l.max)[1]]
+	  }
+	  r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")]
+	}
+	r.u[,"txStart"] <- as.numeric(r.u[,"txStart"])
+	r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"])
+
+	# switch TSS and TES if we have chr "-"
+	cat("correcting TSS, TES assiments in refseq table based on strand\n")
+	ix <- which(r.u$strand=="-")
+	tmp.tes <- r.u$txStart[ix]
+	tmp.tss <- r.u$txEnd[ix]
+	r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes)
+	# ############
+}
+
+
+#### step 2 read macs file
+cat("reading Macs file",path.input.macs,"\n")
+
+# read macs file
+M <- read.delim(file=path.input.macs,skip=macs.skip.lines)
+trgt.prox <- NA
+trgt.dist <- NA
+dtss.prox <- NA
+dtss.dist <- NA
+M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist)
+M.annot[is.na(M.annot)] <- ""
+# get the number of genes
+m <- length(gn.nms)
+if(debug==T){
+	# m <- 100
+}
+f.nm.peak <- paste(sep="",path.output,"peaks.xls")
+f.nm.gene <- paste(sep="",path.output,"genes.xls")
+# for each genes in the expt (j goes over genes)
+cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval",
+			"prox_max_pval","genewide_max_pval",
+			"prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n")
+peaks.summit <- (M[,"start"]+M[,"summit"])
+cat(sep="","mapping MACS peaks to genes\n")
+for (j in 1:m){
+  if(j%%20==0){cat(".")}
+  tss <- r.u[j,"txStart"]
+  tes <- r.u[j,"txEnd"]
+  gn.lngth <- abs(tss-tes)
+  strand <- r.u[j,"strand"]
+  chr <- r.u[j,"chrom"]
+  if(strand=="+"){
+    d.tss.all <- peaks.summit-tss
+   	d.tes.all <- peaks.summit-tes
+  } else {
+    d.tss.all <- tss-peaks.summit
+   	d.tes.all <- tes-peaks.summit
+  }
+  ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )),
+               which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE)
+  ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE)	
+  ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)]    
+  ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)]
+  n.peaks.prox <- length(ix.prox)
+  n.peaks.distal <- length(ix.distal)
+	# if there is at least one peak hitting gene j
+  if(n.peaks.distal > 0){
+    # for each peak (l goes over peaks)
+    peaks.line <- paste(sep="","(")
+    for (k in 1:n.peaks.distal){
+      if(k>1){
+        peaks.line <- paste(sep="",peaks.line,";(")
+      }
+      d.tss <- d.tss.all[ ix.distal[k] ]
+      d.tes <- d.tes.all[ ix.distal[k] ]
+      if(sign(d.tss)!=sign(d.tes)){
+        class="intra"
+      } else if(d.tss<=0){
+        class="upstream"
+      } else if(d.tss>0){
+        class="downstream"
+      }
+      peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ], 
+					d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")")
+	  # handle M.annot
+	  dtss <- d.tss.all[ ix.distal[k] ]
+	  prev.trgt <- M.annot[ix.distal[k],"trgt.dist"]
+	  if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
+		if(abs(dtss)<abs(as.numeric(M.annot[ix.distal[k],"dtss.dist"]))){
+			M.annot[ix.distal[k],"dtss.dist"] <- dtss
+			M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
+		}
+	  } else { # if peak does not have trgt add current trgt
+		M.annot[ix.distal[k],"dtss.dist"] <- dtss
+		M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
+	  }
+	# calc the pval for these many peaks in proximal region and in gene-wide region
+	}
+	if(n.peaks.prox > 0){
+		mean.pval.prox <- mean(M[ ix.prox ,7])
+		max.pval.prox <- max(M[ ix.prox ,7])
+		expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox))
+		# lambda = num peaks / genome size * searched region
+		lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2)		
+		pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE))
+		# handle M.annot
+		for(k in 1:n.peaks.prox){
+			dtss <- d.tss.all[ ix.prox[k] ]
+			prev.trgt <- M.annot[ix.prox[k],"trgt.prox"]
+			if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
+				if(abs(dtss)<abs(as.numeric(M.annot[ix.prox[k],"dtss.prox"]))){
+					M.annot[ix.prox[k],"dtss.prox"] <- dtss
+					M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
+				}
+			} else { # if peak does not have trgt add current trgt
+				M.annot[ix.prox[k],"dtss.prox"] <- dtss
+				M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
+			}
+		}
+	} else {
+		mean.pval.prox <- "NA"
+		max.pval.prox <- "NA"
+		pval.pois.prox <- "NA"
+	}
+	mean.pval.distal <- mean(M[ ix.distal ,7])
+	max.pval.distal <- max(M[ ix.distal ,7])
+	expected.num.peaks.genome.wide.distal <- length(which(M[,7]>=mean.pval.distal))
+	# lambda = num peaks / genome size * searched region
+	lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2)
+    pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE))
+    # pring peaks for gene j    
+    core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand,
+							mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal,
+							pval.pois.prox,pval.pois.distal)
+    cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n")
+  }
+}
+cat("Done!\n")
+
+cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t")
+write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t")
+
+
+
+
+
+
+