diff mtls_analyze/cluster_peaks.R @ 0:a903c48f05b4

Added non-integrated scripts to galaxy
author kmace
date Thu, 22 Mar 2012 15:21:00 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/cluster_peaks.R	Thu Mar 22 15:21:00 2012 -0400
@@ -0,0 +1,376 @@
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+## Feb 2012 th17
+## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, 
+## NYU - Center for Genomics and Systems Biology
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+
+rm(list=ls())
+debug=F
+script.version=0.1
+print.error <- function(){
+	cat("
+DESCRIPTIION:	
+	cluster_peaks.R takes MACS tab delimited files as input and produces one tab delimeted file (named mtls.xls) where 
+	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS files)
+	fall within a certain distance between summits from eachother.
+
+INPUT:
+	1.path_input=path to MACS files '::' delim [path_input=f1::f2::f3::...::fk]
+	2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
+	3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
+	4.dist.summits=maximum distance between summits belonging to the same MTL
+	5.n.autosome.chr=19 for mouse, 22 for human
+	
+EXAMPLE RUN: 
+	cluster_peaks.R
+	input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls
+	path_output=~/Desktop/
+	expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
+	dist_summits=100
+	n_autosome_chr=19
+	
+Please cite us if you used this script: 
+	The transcription factor network regulating Th17 lineage specification and function.
+	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, 
+	Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
+}
+
+############# helper functions:
+## split.macs.list.to.chrmosomes
+# input:
+#      - macs.list: a list of macs expts: here are a few lines of one expt
+## chr	start	end	length	summit	tags	#NAME?	fold_enrichment	FDR(%)
+## chr1	4322210	4323069	860	494	55	158.95	6.03	0.05
+## chr1	4797749	4798368	620	211	29	119.82	3.47	0.09
+## chr1	4848182	4849113	932	494	46	105.42	2.9	0.09
+#      - expts: a list of the expts names from macs.list that you want to process
+#      - chr: chrmosomes names
+# output:
+#      - x: a list with as many elements as chr specified by input.
+#      -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
+split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
+  x <- list()
+  n <- length(expts)
+  for(i in 1:n){
+    e <- expts[i] #experiment name
+    cat("wroking on expt", e,"\n")
+    x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
+    names(x[[e]]) <- chr
+  }
+  return(x)
+}
+# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
+# input:
+#     - r is chromosome
+#     - m is macs matrix for expt e from above function
+split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
+  ix.chr.i <- which(m[,"chr"]==r)
+  # cat("working on",r,"\n")
+  o <- list()
+  o[[r]] <- m[ix.chr.i,]
+  o[[r]]$peak.id <- ix.chr.i
+  return(o[[r]])
+}
+
+## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+# also match these summit locations with corresponding:
+# pvals, tfs, peak start and peak end trgts
+# input:
+#     - chr: a list of chromosome names
+#     - macs.list.per.chrom: a list of macs peaks for each chromosome
+# output:
+#     - o: a list each chormosome ruler as an element
+make.ruler.no.pml <- function(chr,macs.list.per.chrom){
+  x <- macs.list.per.chrom
+  o <- list()
+  for(j in 1:length(chr)){
+    r <- chr[j] # chrmosome we go over
+    s <- numeric()
+    pval <- numeric()
+    tf.to.s <- character()
+    trgt.prox <- character()
+    trgt.dist <- character()
+    dtss.prox <- character() 
+    dtss.dist <- character()
+    start <- numeric()
+    end <- numeric()
+    trtmnts <- names(x) # the treatments name (expt names)
+    ## debug parameters ###
+    ## which experiment peaks come from
+    expt <- character()
+    ## what was the peak id in that experiment
+    peak.ids <- numeric()
+    ## this will allow us to always back track from a cluster to the actual peaks in it
+    ## debug params end ###
+    for(i in 1:length(trtmnts)){
+      o[[r]] <- list()
+      e <- trtmnts[i] #experiment name
+      tf <- strsplit(e,"_")[[1]][1]
+      s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
+      pval <- c(pval,x[[e]][[r]][,7]) # the name of 7th column is X.10.log10.pvalue. this is pval
+      # tf.to.s <- c(tf.to.s,rep(tf,length(x[[e]][[r]][,7]))) # all summits belong to tf
+      start <- c(start,x[[e]][[r]][,"start"])
+      end <- c(end,x[[e]][[r]][,"end"])
+      expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
+      peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])    
+      trgt.prox <- c(trgt.prox,x[[e]][[r]][,"trgt.prox"])
+	  trgt.dist <- c(trgt.dist,x[[e]][[r]][,"trgt.dist"])
+      dtss.prox <- c(dtss.prox,x[[e]][[r]][,"dtss.prox"])
+      dtss.dist <- c(dtss.dist,x[[e]][[r]][,"dtss.dist"])
+    }
+    ix <- sort(s,index.return=TRUE)$ix
+    o[[r]] <- list(s=s[ix],pval=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix],
+				   trgt.prox=trgt.prox[ix],trgt.dist=trgt.dist[ix],dtss.prox=dtss.prox[ix],dtss.dist=dtss.dist[ix])
+  }
+  return(o)
+}
+
+## add cluster memberships based on ruler
+## require no more than d.cut distance between tf summits
+## cur.l is the current loci number (or cluster number)
+assign.clusters.ids.to.peaks <- function(ruler,d.cut){
+ cur.l <- 0
+ for(j in 1:length(ruler)){
+   s <- ruler[[j]][["s"]]
+   l <- numeric(length=length(s))
+   if(length(l)>0){
+	   cur.l <- cur.l+1
+   }
+   if(length(l)==1){
+   	l[1] <- cur.l
+   } else if(length(l)>1) {
+	l[1] <- cur.l
+	   for(i in 2:length(l)){
+	     d <- s[i]-s[i-1] # assumes s is sorted increasingly
+	     if(d>d.cut){
+	       cur.l <- cur.l+1
+	     }
+	     l[i] <- cur.l    
+	   }
+	}
+   ruler[[j]][["l"]] <- l
+ }
+ return(ruler)
+}
+
+## here we create a list of TF enriched loci (clusters)
+## input:
+##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+## output:
+##     -ll: a list of clusters ll where each cluster (elem in ll holds):
+##        - l: the current loci (elem in ll)
+##        - s: summits vector as before
+##        - pval: pvals vector matching the summits
+##        - tfs: a vector of tfs matching s, the summits in loci l
+##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
+##        - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream)
+create.cluster.list.no.pml <- function(ruler){
+  tmp <- list()
+  ll <- list()
+  for(j in 1:length(ruler)){
+    r <- names(ruler)[j]
+    # cat("working on ",r,"\n")
+    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
+    l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique)
+    n <- length(l.vec) # iterate over n clusters
+    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
+  }
+  ## concatenate tmp into one list
+  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
+  ll=eval(parse(text=command))
+  return(ll)
+}
+get.cluster.params.no.pml <- function(i,x,l.vec,r){
+  ix <- which(x[["l"]]==l.vec[i])
+  l <- l.vec[i]
+  start <- min(x[["start"]][ix])
+  end <- max(x[["end"]][ix])
+  s <- x[["s"]][ix]
+  # tf.to.s <- x[["tf.to.s"]][ix]
+  pval <- x[["pval"]][ix]
+  pval.mean <- mean(pval)
+  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
+  span.l <- end-start
+  peak.ids <- x[["peak.ids"]][ix]
+  expt <- x[["expt"]][ix]
+  expt.alphanum.sorted <- sort(x[["expt"]][ix])
+  trgt.prox <- unique(x[["trgt.prox"]][ix])
+  trgt.dist <- unique(x[["trgt.dist"]][ix])
+  dtss.prox <- unique(x[["dtss.prox"]][ix])
+  dtss.dist <- unique(x[["dtss.dist"]][ix])
+  chr <- rep(r,length(ix))
+  return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
+			  end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs,
+			  span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist,
+			  dtss.prox=dtss.prox,dtss.dist=dtss.dist))
+}
+
+## here we create a list of TF enriched loci (clusters)
+## input:
+##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+## output:
+##     -ll: a list of clusters ll where each cluster (elem in ll holds):
+##        - l: the current loci (elem in ll)
+##        - s: summits vector as before
+##        - pval: pvals vector matching the summits
+##        - tfs: a vector of tfs matching s, the summits in loci l
+##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
+##        - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream)
+create.cluster.list.no.pml <- function(ruler){
+  tmp <- list()
+  ll <- list()
+  for(j in 1:length(ruler)){
+    r <- names(ruler)[j]
+    # cat("working on ",r,"\n")
+    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
+    l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique)
+    n <- length(l.vec) # iterate over n clusters
+    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
+  }
+  ## concatenate tmp into one list
+  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
+  ll=eval(parse(text=command))
+  return(ll)
+}
+get.cluster.params.no.pml <- function(i,x,l.vec,r){
+  ix <- which(x[["l"]]==l.vec[i])
+  l <- l.vec[i]
+  start <- min(x[["start"]][ix])
+  end <- max(x[["end"]][ix])
+  s <- x[["s"]][ix]
+  # tf.to.s <- x[["tf.to.s"]][ix]
+  pval <- x[["pval"]][ix]
+  pval.mean <- mean(pval)
+  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
+  span.l <- end-start
+  peak.ids <- x[["peak.ids"]][ix]
+  expt <- x[["expt"]][ix]
+  expt.alphanum.sorted <- sort(x[["expt"]][ix])
+  trgt.prox <- unique(x[["trgt.prox"]][ix])
+  trgt.dist <- unique(x[["trgt.dist"]][ix])
+  dtss.prox <- unique(x[["dtss.prox"]][ix])
+  dtss.dist <- unique(x[["dtss.dist"]][ix])
+  chr <- rep(r,length(ix))
+  return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
+			  end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs,
+			  span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist,
+			  dtss.prox=dtss.prox,dtss.dist=dtss.dist))
+}
+## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
+## input:
+##     - ll: a cluster list
+##     - f.nm: a file name (include path) to where you want files to print
+##     - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
+## output
+##     - a tab delim file with clusers as rows and elems tab delim for each cluster
+print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
+  options(digits=5)
+  cat(file=f.nm,names(ll[[1]]),sep="\t")
+  cat(file=f.nm,"\n",append=TRUE)
+  for(i in 1:length(ll)){
+	line <- ll[[i]][[1]] # put MTL number
+	line <- paste(line,ll[[i]][[2]][1],sep="\t")
+	for(j in 3:length(ll[[i]])){
+		val <- ll[[i]][[j]]
+		if(length(val) == 1){
+			line <- paste(line,val,sep="\t")
+		} else {
+			line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
+		}
+	}
+  	cat(file=f.nm,line,"\n",append=TRUE)	
+  }
+}
+############# Code:
+# retrieve args
+if(debug==T){
+	cmd.args <- c(
+		"input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls",
+		"path_output=~/Desktop/",
+		"expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17",
+		"dist_summits=100",
+		"n_autosome_chr=19"
+	)
+} else {
+	cmd.args <- commandArgs();      
+}
+
+if(length(grep("--version",cmd.args))){
+	cat("version",script.version,"\n")
+	q()
+}
+
+arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
+args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
+
+arg.nms <- c("input_macs_files","path_output","expt_names","dist_summits","n_autosome_chr")
+arg.val <- character(length=length(arg.nms))
+for(i in 1:length(arg.nms)){
+	ix <- which(arg.names.cmd.line==arg.nms[i])
+	if(length(ix)==1){
+		arg.val[i] <- args.val.cmd.line[ix]
+	} else {
+		stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
+		
+	} 
+}
+if(debug==T){
+	print(paste(arg.nms,"=",arg.val))
+}
+# the files here adhere to tab delim format
+input.macs.files <- strsplit(arg.val[1],"::")[[1]]
+if(length(input.macs.files)==1){
+	cat("only provided one MACS file to cluster.")
+	print.error()
+}
+path.output <- arg.val[2]
+expt.names <- strsplit(arg.val[3],"::")[[1]]
+dist.summits <- as.numeric(arg.val[4])
+n.autosome.chr <- as.numeric(arg.val[5])
+
+# source("~/Documents/nyu/littmanLab/th17_used_for_paper/r_scripts/th17/used_for_paper/cluster_peaks_util.R")
+chr <- c(paste(sep="","chr",1:n.autosome.chr),paste(sep="","chr",c("X","Y")))
+
+# read MACS files
+macs.list <- list()
+for(i in 1:length(input.macs.files)){
+	e <- expt.names[i]
+	macs.list[[ e ]] <- read.delim(file=input.macs.files[i])
+	macs.list[[ e ]][,"chr"] <- as.character(macs.list[[ e ]][,"chr"])
+	macs.list[[ e ]][,"trgt.prox"] <- as.character(macs.list[[ e ]][,"trgt.prox"])
+	macs.list[[ e ]][,"trgt.dist"] <- as.character(macs.list[[ e ]][,"trgt.dist"])
+}
+
+# take all macs files and put peaks together on each chromosome 
+# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
+cat("splitting macs files per chromosome\n")
+x <- split.macs.list.to.chrmosomes.no.pml(macs.list=macs.list,expts=expt.names,chr=chr)
+cat("adding peaks from all macs files into chromosome rulers\n")
+ruler <- make.ruler.no.pml(chr,macs.list.per.chrom=x)
+cat("add MTL membership to the ruler\n")
+ruler <- assign.clusters.ids.to.peaks(ruler,d.cut=dist.summits)
+for(i in 1:length(ruler)){
+	ix <- which(is.na(ruler[[i]][["dtss.prox"]]))
+	ruler[[i]][["dtss.prox"]][ix] <- ""
+	ix <- which(is.na(ruler[[i]][["dtss.dist"]]))
+	ruler[[i]][["dtss.dist"]][ix] <- ""
+}
+cat("creating MTL list\n")
+ll <- create.cluster.list.no.pml(ruler=ruler)
+cat("writing MTL table\n")
+# f.nm <- paste(sep="",path.output,paste(expt.names,collapse="_"),"_MTLs.xls")
+f.nm <- paste(sep="",path.output,"mtls",".xls")
+print.ll.verbose.all(ll=ll,f.nm=f.nm)
+
+
+
+
+
+
+
+
+