diff mtls_analyze/cluster_peaks.R @ 4:b465306d00ba draft default tip

Uploaded
author kmace
date Mon, 23 Jul 2012 13:00:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/cluster_peaks.R	Mon Jul 23 13:00:15 2012 -0400
@@ -0,0 +1,431 @@
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+## Feb 2012 th17
+## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, 
+## NYU - Center for Genomics and Systems Biology
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+
+rm(list=ls())
+debug=F
+verbose=F
+script.version=0.1
+print.error <- function(){
+	cat("
+DESCRIPTIION:	
+	cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where 
+	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
+	fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits.
+	If you use .bed files, the files must contain no header, and at least the five columns: 
+	1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared
+	interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits.
+
+INPUT:
+	1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
+	2.path_output: path to save generated MTL cluster file (where to save mtls.xls)
+	3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
+	4.input_type: the type of input file used (MACS or BED; defaults to MACS)
+	5.mtl_type: interval or summit (defaults to summit)
+	6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit)
+		
+EXAMPLE RUN: 
+	cluster_peaks.R
+	--input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
+	--input_type MACS
+	--path_output results/
+	--expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
+	--dist_summits 100
+	--mtl_type summit
+	
+Please cite us if you used this script: 
+	The transcription factor network regulating Th17 lineage specification and function.
+	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, 
+	Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
+}
+
+############# helper functions:
+## split.macs.list.to.chrmosomes
+# input:
+#      - macs.list: a list of macs expts: here are a few lines of one expt
+## chr	start	end	length	summit	tags	#NAME?	fold_enrichment	FDR(%)
+## chr1	4322210	4323069	860	494	55	158.95	6.03	0.05
+## chr1	4797749	4798368	620	211	29	119.82	3.47	0.09
+## chr1	4848182	4849113	932	494	46	105.42	2.9	0.09
+#      - expts: a list of the expts names from macs.list that you want to process
+#      - chr: chrmosomes names
+# output:
+#      - x: a list with as many elements as chr specified by input.
+#      -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
+split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
+  x <- list()
+  n <- length(expts)
+  for(i in 1:n){
+    e <- expts[i] #experiment name
+    cat("wroking on expt", e,"\n")
+    x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
+    names(x[[e]]) <- chr
+  }
+  return(x)
+}
+# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
+# input:
+#     - r is chromosome
+#     - m is macs matrix for expt e from above function
+split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
+  ix.chr.i <- which(m[,"chr"]==r)
+  # cat("working on",r,"\n")
+  o <- list()
+  o[[r]] <- m[ix.chr.i,]
+  o[[r]]$peak.id <- ix.chr.i
+  return(o[[r]])
+}
+
+## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+# also match these summit locations with corresponding:
+# pvals, tfs, peak start and peak end
+# input:
+#     - chr: a list of chromosome names
+#     - macs.list.per.chrom: a list of macs peaks for each chromosome
+# output:
+#     - o: a list each chormosome ruler as an element
+make.ruler.no.pml <- function(chr,macs.list.per.chrom){
+  x <- macs.list.per.chrom
+  o <- list()
+  for(j in 1:length(chr)){
+    r <- chr[j] # chrmosome we go over
+    s <- numeric()
+    pval <- numeric()
+    tf.to.s <- character()
+    start <- numeric()
+    end <- numeric()
+    trtmnts <- names(x) # the treatments name (expt names)
+    ## debug parameters ###
+    ## which experiment peaks come from
+    expt <- character()
+    ## what was the peak id in that experiment
+    peak.ids <- numeric()
+    ## this will allow us to always back track from a cluster to the actual peaks in it
+    ## debug params end ###
+    for(i in 1:length(trtmnts)){
+      o[[r]] <- list()
+      e <- trtmnts[i] #experiment name
+      tf <- strsplit(e,"_")[[1]][1]
+      s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
+      pval <- c(pval,x[[e]][[r]][,"score"]) 
+      start <- c(start,x[[e]][[r]][,"start"])
+      end <- c(end,x[[e]][[r]][,"end"])
+      expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
+      peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])    
+    }
+    ix <- sort(s,index.return=TRUE)$ix
+    o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix])				
+  }
+  return(o)
+}
+
+## add cluster memberships based on ruler
+## require no more than d.cut distance between tf summits
+## cur.l is the current loci number (or cluster number)
+assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){
+ cur.l <- 0
+ for(j in 1:length(ruler)){
+   s <- ruler[[j]][["summit"]]
+   l <- numeric(length=length(s))
+   if(length(l)>0){
+	   cur.l <- cur.l+1
+   }
+   if(length(l)==1){
+   	l[1] <- cur.l
+   } else if(length(l)>1) {
+	l[1] <- cur.l
+	   for(i in 2:length(l)){
+	     d <- s[i]-s[i-1] # assumes s is sorted increasingly
+	     if(d>d.cut){
+	       cur.l <- cur.l+1
+	     }
+	     l[i] <- cur.l    
+	   }
+	}
+   ruler[[j]][["mtl.id"]] <- l
+ }
+ return(ruler)
+}
+
+## add cluster memberships based on ruler
+## require that peaks span will have a non-empty intersection 
+## cur.mtl is the current loci number (or cluster number)
+assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){
+cur.mtl <- 0
+for(j in 1:length(ruler)){
+	s <- ruler[[j]][["start"]]
+	e <- ruler[[j]][["end"]]
+	l <- numeric(length=length(s))
+	if(length(l)>0){
+	   cur.mtl <- cur.mtl+1
+	}
+	if(length(l)==1){
+	  	l[1] <- cur.mtl
+	} else if(length(l)>1) {
+		l[1] <- cur.mtl
+		cur.e <- e[1] # the right-most bp belonging to the current mtl
+	   	for(i in 2:length(l)){
+	     	if(cur.e<s[i]){
+	       		cur.mtl <- cur.mtl+1
+	     	}
+	     	l[i] <- cur.mtl 
+			cur.e <- max(cur.e,e[i])
+	   	}
+	}
+	ruler[[j]][["mtl.id"]] <- l
+}
+return(ruler)
+}
+
+## here we create a list of TF enriched loci (clusters)
+## input:
+##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+## output:
+##     -ll: a list of clusters ll where each cluster (elem in ll holds):
+##        - mtl.id: the current loci (elem in ll)
+##        - s: summits vector as before
+##        - pval: pvals vector matching the summits
+##        - tfs: a vector of tfs matching s, the summits in loci l
+##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
+create.cluster.list.no.pml <- function(ruler){
+  tmp <- list()
+  ll <- list()
+  for(j in 1:length(ruler)){
+    r <- names(ruler)[j]
+    # cat("working on ",r,"\n")
+    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
+    l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique)
+    n <- length(l.vec) # iterate over n clusters
+    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
+  }
+  ## concatenate tmp into one list
+  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
+  ll=eval(parse(text=command))
+  return(ll)
+}
+get.cluster.params.no.pml <- function(i,x,l.vec,r){
+  # ix <- which(x[["l"]]==l.vec[i])
+  # l <- l.vec[i]
+  ix <- which(x[["mtl.id"]]==l.vec[i])
+  l <- l.vec[i]
+  start <- min(x[["start"]][ix])
+  end <- max(x[["end"]][ix])
+  # s <- x[["s"]][ix]
+  summit <- x[["summit"]][ix]
+  # pval <- x[["pval"]][ix]
+  score <- x[["score"]][ix]
+  # pval.mean <- mean(pval)
+  score.mean <- mean(score)
+  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
+  span.l <- end-start
+  peak.ids <- x[["peak.ids"]][ix]
+  expt <- x[["expt"]][ix]
+  expt.alphanum.sorted <- sort(x[["expt"]][ix])
+
+  chr <- rep(r,length(ix))
+  return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
+			  end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs,
+			  span.l=span.l,peak.ids=peak.ids))
+}
+
+## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
+## input:
+##     - ll: a cluster list
+##     - f.nm: a file name (include path) to where you want files to print
+##     - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
+## output
+##     - a tab delim file with clusers as rows and elems tab delim for each cluster
+print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
+  options(digits=5)
+  cat(file=f.nm,names(ll[[1]]),sep="\t")
+  cat(file=f.nm,"\n",append=TRUE)
+  for(i in 1:length(ll)){
+	line <- ll[[i]][[1]] # put MTL number
+	line <- paste(line,ll[[i]][[2]][1],sep="\t")
+	for(j in 3:length(ll[[i]])){
+		val <- ll[[i]][[j]]
+		if(length(val) == 1){
+			line <- paste(line,val,sep="\t")
+		} else {
+			line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
+		}
+	}
+  	cat(file=f.nm,line,"\n",append=TRUE)	
+  }
+}
+# read command line paramters that are not optional
+read.cmd.line.params.must <- function(args.nms, cmd.line.args){
+	if(length(grep("--version",cmd.line.args))){
+		cat("version",script.version,"\n")
+		q()
+	}
+	args <- sapply(strsplit(cmd.line.args," "),function(i) i)
+	vals <- character(length(args.nms))
+	# split cmd.line to key and value pairs
+	for(i in 1:length(args.nms)){
+		ix <- grep(args.nms[i],args)
+		if(length(ix)>1){
+			stop("arg ",args.nms[i]," used more than once.  Bailing out...\n",print.error())
+		} else if (length(ix)==0){
+			stop("could not find ",args.nms[i],". Bailing out...\n",print.error())
+		} else {
+			vals[i] <- args[ix+1]
+		}
+	}
+	return(vals)
+}
+# read command line paramters that are optional, if an optional param is not give functions return na for this param
+read.cmd.line.params.optional <- function(args.nms, cmd.line.args){
+	args <- sapply(strsplit(cmd.line.args," "),function(i) i)
+	vals <- character(length(args.nms))
+	# split cmd.line to key and value pairs
+	for(i in 1:length(args.nms)){
+		ix <- grep(args.nms[i],args)
+		if(length(ix)>1){
+			stop("arg ",args.nms[i]," used more than once.  Bailing out...\n",print.error())
+		} else if (length(ix)==0){ # if --param was not written in cmd line
+			vals[i] <- NA
+		} else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value
+		 	vals[i] <- args[ix+1]
+		} else { # otherwise
+			vals[i] <- NA
+		}
+	}
+	return(vals)
+}
+
+############# Code:
+# retrieve args
+
+if(debug==T){
+	cmd.args <- c(
+		"--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls",
+		#"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed",
+		"--input_type MACS", #BED
+		"--path_output ./results/",
+		"--expt_names macs1::macs2::macs3",
+		#"--expt_names bed1::bed2::bed3",
+		"--mtl_type interval", #interval summit
+		"--dist_summits 100"
+	)
+} else {
+	cmd.args <- commandArgs(trailingOnly = T);      
+}
+
+# if(length(grep("--version",cmd.args))){
+# 	cat("version",script.version,"\n")
+# 	q()
+# }
+args.nms.must <- c(	
+		"--input_files",   #1
+		"--path_output",  	 #2
+		"--expt_names" #3
+)
+
+# all numeric params must come at the end of args.nms.optional
+n.start.numeric <- 3
+args.nms.optional <- c(	
+		"--input_type",	#1
+		"--mtl_type", 	 #2
+		"--dist_summits" #3
+)
+
+# get must parameters
+vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args)
+if(verbose){
+	cat("\nfinshed reading vals: \n")
+	cat("\nvals.must", unlist(vals.must), "\n")
+}
+input.files <- strsplit(vals.must[1],"::")[[1]]
+if(length(input.files)==1){
+	cat("only provided one MACS file to cluster.")
+	print.error()
+}
+path.output <- vals.must[2]
+expt.names <- strsplit(vals.must[3],"::")[[1]]
+# get optional parameters
+vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args)
+if(verbose){
+	cat("\nvals.optional:", unlist(vals.optional),"\n")
+}
+if(is.na(vals.optional[1])){
+	input.type <- "MACS"
+} else {
+	input.type <- vals.optional[1]
+}
+if(is.na(vals.optional[2])){
+	mtl.type <- "interval"
+} else {
+	mtl.type <- vals.optional[2]
+}
+if(is.na(vals.optional[2])){
+	dist.summits <- 100
+} else {
+	dist.summits <- as.numeric(vals.optional[3])
+}
+		
+# read MACS files
+unique.chr <- character()
+infile.list <- list()
+col.nms.keepers <- c("chr","start","end","summit","score")
+for(i in 1:length(input.files)){
+	e <- expt.names[i]
+	if(toupper(input.type)=="MACS"){
+		tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F)		
+		columns <- c("chr","start","end","summit","pvalue")
+		ix.cols <- numeric()
+		for(j in 1:length(columns)){
+			ix.j <- grep(columns[j],names(tmp))
+			if(length(ix.j)==0){
+				stop("can't find (grep) column ",columns[j]," in MACS input file ", e)
+			}
+			ix.cols <- c(ix.cols,ix.j)
+		}
+		infile.list[[e]] <- tmp[,ix.cols]
+		colnames(infile.list[[e]]) <- col.nms.keepers
+	}
+	if(toupper(input.type)=="BED"){
+		tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE)
+		tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column
+		colnames(tmp)  <- col.nms.keepers
+		infile.list[[e]] <- tmp[,1:5]
+	}	
+	unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"]))
+}
+
+unique.chr <- sort(unique.chr)
+
+# take all macs files and put peaks together on each chromosome 
+# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
+cat("splitting input files per chromosome\n")
+x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr)
+cat("adding peaks from all input files into chromosome rulers\n")
+ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x)
+cat("add MTL membership to the ruler\n")
+
+if(mtl.type=="interval"){
+	ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler)
+} else {
+	ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits)
+}
+
+cat("creating MTL list\n")
+ll <- create.cluster.list.no.pml(ruler=ruler)
+cat("writing MTL table\n")
+f.nm <- paste(sep="",path.output,"mtls",".xls")
+print.ll.verbose.all(ll=ll,f.nm=f.nm)
+
+
+
+
+
+
+
+
+