view mtls_analyze/cluster_peaks.R @ 0:a903c48f05b4

Added non-integrated scripts to galaxy
author kmace
date Thu, 22 Mar 2012 15:21:00 -0400
parents
children
line wrap: on
line source

##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
## Feb 2012 th17
## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, 
## NYU - Center for Genomics and Systems Biology
##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '

rm(list=ls())
debug=F
script.version=0.1
print.error <- function(){
	cat("
DESCRIPTIION:	
	cluster_peaks.R takes MACS tab delimited files as input and produces one tab delimeted file (named mtls.xls) where 
	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS files)
	fall within a certain distance between summits from eachother.

INPUT:
	1.path_input=path to MACS files '::' delim [path_input=f1::f2::f3::...::fk]
	2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
	3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
	4.dist.summits=maximum distance between summits belonging to the same MTL
	5.n.autosome.chr=19 for mouse, 22 for human
	
EXAMPLE RUN: 
	cluster_peaks.R
	input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls
	path_output=~/Desktop/
	expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
	dist_summits=100
	n_autosome_chr=19
	
Please cite us if you used this script: 
	The transcription factor network regulating Th17 lineage specification and function.
	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, 
	Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
}

############# helper functions:
## split.macs.list.to.chrmosomes
# input:
#      - macs.list: a list of macs expts: here are a few lines of one expt
## chr	start	end	length	summit	tags	#NAME?	fold_enrichment	FDR(%)
## chr1	4322210	4323069	860	494	55	158.95	6.03	0.05
## chr1	4797749	4798368	620	211	29	119.82	3.47	0.09
## chr1	4848182	4849113	932	494	46	105.42	2.9	0.09
#      - expts: a list of the expts names from macs.list that you want to process
#      - chr: chrmosomes names
# output:
#      - x: a list with as many elements as chr specified by input.
#      -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
  x <- list()
  n <- length(expts)
  for(i in 1:n){
    e <- expts[i] #experiment name
    cat("wroking on expt", e,"\n")
    x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
    names(x[[e]]) <- chr
  }
  return(x)
}
# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
# input:
#     - r is chromosome
#     - m is macs matrix for expt e from above function
split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
  ix.chr.i <- which(m[,"chr"]==r)
  # cat("working on",r,"\n")
  o <- list()
  o[[r]] <- m[ix.chr.i,]
  o[[r]]$peak.id <- ix.chr.i
  return(o[[r]])
}

## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
# also match these summit locations with corresponding:
# pvals, tfs, peak start and peak end trgts
# input:
#     - chr: a list of chromosome names
#     - macs.list.per.chrom: a list of macs peaks for each chromosome
# output:
#     - o: a list each chormosome ruler as an element
make.ruler.no.pml <- function(chr,macs.list.per.chrom){
  x <- macs.list.per.chrom
  o <- list()
  for(j in 1:length(chr)){
    r <- chr[j] # chrmosome we go over
    s <- numeric()
    pval <- numeric()
    tf.to.s <- character()
    trgt.prox <- character()
    trgt.dist <- character()
    dtss.prox <- character() 
    dtss.dist <- character()
    start <- numeric()
    end <- numeric()
    trtmnts <- names(x) # the treatments name (expt names)
    ## debug parameters ###
    ## which experiment peaks come from
    expt <- character()
    ## what was the peak id in that experiment
    peak.ids <- numeric()
    ## this will allow us to always back track from a cluster to the actual peaks in it
    ## debug params end ###
    for(i in 1:length(trtmnts)){
      o[[r]] <- list()
      e <- trtmnts[i] #experiment name
      tf <- strsplit(e,"_")[[1]][1]
      s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
      pval <- c(pval,x[[e]][[r]][,7]) # the name of 7th column is X.10.log10.pvalue. this is pval
      # tf.to.s <- c(tf.to.s,rep(tf,length(x[[e]][[r]][,7]))) # all summits belong to tf
      start <- c(start,x[[e]][[r]][,"start"])
      end <- c(end,x[[e]][[r]][,"end"])
      expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
      peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])    
      trgt.prox <- c(trgt.prox,x[[e]][[r]][,"trgt.prox"])
	  trgt.dist <- c(trgt.dist,x[[e]][[r]][,"trgt.dist"])
      dtss.prox <- c(dtss.prox,x[[e]][[r]][,"dtss.prox"])
      dtss.dist <- c(dtss.dist,x[[e]][[r]][,"dtss.dist"])
    }
    ix <- sort(s,index.return=TRUE)$ix
    o[[r]] <- list(s=s[ix],pval=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix],
				   trgt.prox=trgt.prox[ix],trgt.dist=trgt.dist[ix],dtss.prox=dtss.prox[ix],dtss.dist=dtss.dist[ix])
  }
  return(o)
}

## add cluster memberships based on ruler
## require no more than d.cut distance between tf summits
## cur.l is the current loci number (or cluster number)
assign.clusters.ids.to.peaks <- function(ruler,d.cut){
 cur.l <- 0
 for(j in 1:length(ruler)){
   s <- ruler[[j]][["s"]]
   l <- numeric(length=length(s))
   if(length(l)>0){
	   cur.l <- cur.l+1
   }
   if(length(l)==1){
   	l[1] <- cur.l
   } else if(length(l)>1) {
	l[1] <- cur.l
	   for(i in 2:length(l)){
	     d <- s[i]-s[i-1] # assumes s is sorted increasingly
	     if(d>d.cut){
	       cur.l <- cur.l+1
	     }
	     l[i] <- cur.l    
	   }
	}
   ruler[[j]][["l"]] <- l
 }
 return(ruler)
}

## here we create a list of TF enriched loci (clusters)
## input:
##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
## output:
##     -ll: a list of clusters ll where each cluster (elem in ll holds):
##        - l: the current loci (elem in ll)
##        - s: summits vector as before
##        - pval: pvals vector matching the summits
##        - tfs: a vector of tfs matching s, the summits in loci l
##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
##        - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream)
create.cluster.list.no.pml <- function(ruler){
  tmp <- list()
  ll <- list()
  for(j in 1:length(ruler)){
    r <- names(ruler)[j]
    # cat("working on ",r,"\n")
    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
    l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique)
    n <- length(l.vec) # iterate over n clusters
    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
  }
  ## concatenate tmp into one list
  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
  ll=eval(parse(text=command))
  return(ll)
}
get.cluster.params.no.pml <- function(i,x,l.vec,r){
  ix <- which(x[["l"]]==l.vec[i])
  l <- l.vec[i]
  start <- min(x[["start"]][ix])
  end <- max(x[["end"]][ix])
  s <- x[["s"]][ix]
  # tf.to.s <- x[["tf.to.s"]][ix]
  pval <- x[["pval"]][ix]
  pval.mean <- mean(pval)
  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
  span.l <- end-start
  peak.ids <- x[["peak.ids"]][ix]
  expt <- x[["expt"]][ix]
  expt.alphanum.sorted <- sort(x[["expt"]][ix])
  trgt.prox <- unique(x[["trgt.prox"]][ix])
  trgt.dist <- unique(x[["trgt.dist"]][ix])
  dtss.prox <- unique(x[["dtss.prox"]][ix])
  dtss.dist <- unique(x[["dtss.dist"]][ix])
  chr <- rep(r,length(ix))
  return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
			  end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs,
			  span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist,
			  dtss.prox=dtss.prox,dtss.dist=dtss.dist))
}

## here we create a list of TF enriched loci (clusters)
## input:
##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
## output:
##     -ll: a list of clusters ll where each cluster (elem in ll holds):
##        - l: the current loci (elem in ll)
##        - s: summits vector as before
##        - pval: pvals vector matching the summits
##        - tfs: a vector of tfs matching s, the summits in loci l
##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
##        - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream)
create.cluster.list.no.pml <- function(ruler){
  tmp <- list()
  ll <- list()
  for(j in 1:length(ruler)){
    r <- names(ruler)[j]
    # cat("working on ",r,"\n")
    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
    l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique)
    n <- length(l.vec) # iterate over n clusters
    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
  }
  ## concatenate tmp into one list
  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
  ll=eval(parse(text=command))
  return(ll)
}
get.cluster.params.no.pml <- function(i,x,l.vec,r){
  ix <- which(x[["l"]]==l.vec[i])
  l <- l.vec[i]
  start <- min(x[["start"]][ix])
  end <- max(x[["end"]][ix])
  s <- x[["s"]][ix]
  # tf.to.s <- x[["tf.to.s"]][ix]
  pval <- x[["pval"]][ix]
  pval.mean <- mean(pval)
  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
  span.l <- end-start
  peak.ids <- x[["peak.ids"]][ix]
  expt <- x[["expt"]][ix]
  expt.alphanum.sorted <- sort(x[["expt"]][ix])
  trgt.prox <- unique(x[["trgt.prox"]][ix])
  trgt.dist <- unique(x[["trgt.dist"]][ix])
  dtss.prox <- unique(x[["dtss.prox"]][ix])
  dtss.dist <- unique(x[["dtss.dist"]][ix])
  chr <- rep(r,length(ix))
  return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
			  end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs,
			  span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist,
			  dtss.prox=dtss.prox,dtss.dist=dtss.dist))
}
## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
## input:
##     - ll: a cluster list
##     - f.nm: a file name (include path) to where you want files to print
##     - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
## output
##     - a tab delim file with clusers as rows and elems tab delim for each cluster
print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
  options(digits=5)
  cat(file=f.nm,names(ll[[1]]),sep="\t")
  cat(file=f.nm,"\n",append=TRUE)
  for(i in 1:length(ll)){
	line <- ll[[i]][[1]] # put MTL number
	line <- paste(line,ll[[i]][[2]][1],sep="\t")
	for(j in 3:length(ll[[i]])){
		val <- ll[[i]][[j]]
		if(length(val) == 1){
			line <- paste(line,val,sep="\t")
		} else {
			line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
		}
	}
  	cat(file=f.nm,line,"\n",append=TRUE)	
  }
}
############# Code:
# retrieve args
if(debug==T){
	cmd.args <- c(
		"input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls",
		"path_output=~/Desktop/",
		"expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17",
		"dist_summits=100",
		"n_autosome_chr=19"
	)
} else {
	cmd.args <- commandArgs();      
}

if(length(grep("--version",cmd.args))){
	cat("version",script.version,"\n")
	q()
}

arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])

arg.nms <- c("input_macs_files","path_output","expt_names","dist_summits","n_autosome_chr")
arg.val <- character(length=length(arg.nms))
for(i in 1:length(arg.nms)){
	ix <- which(arg.names.cmd.line==arg.nms[i])
	if(length(ix)==1){
		arg.val[i] <- args.val.cmd.line[ix]
	} else {
		stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
		
	} 
}
if(debug==T){
	print(paste(arg.nms,"=",arg.val))
}
# the files here adhere to tab delim format
input.macs.files <- strsplit(arg.val[1],"::")[[1]]
if(length(input.macs.files)==1){
	cat("only provided one MACS file to cluster.")
	print.error()
}
path.output <- arg.val[2]
expt.names <- strsplit(arg.val[3],"::")[[1]]
dist.summits <- as.numeric(arg.val[4])
n.autosome.chr <- as.numeric(arg.val[5])

# source("~/Documents/nyu/littmanLab/th17_used_for_paper/r_scripts/th17/used_for_paper/cluster_peaks_util.R")
chr <- c(paste(sep="","chr",1:n.autosome.chr),paste(sep="","chr",c("X","Y")))

# read MACS files
macs.list <- list()
for(i in 1:length(input.macs.files)){
	e <- expt.names[i]
	macs.list[[ e ]] <- read.delim(file=input.macs.files[i])
	macs.list[[ e ]][,"chr"] <- as.character(macs.list[[ e ]][,"chr"])
	macs.list[[ e ]][,"trgt.prox"] <- as.character(macs.list[[ e ]][,"trgt.prox"])
	macs.list[[ e ]][,"trgt.dist"] <- as.character(macs.list[[ e ]][,"trgt.dist"])
}

# take all macs files and put peaks together on each chromosome 
# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
cat("splitting macs files per chromosome\n")
x <- split.macs.list.to.chrmosomes.no.pml(macs.list=macs.list,expts=expt.names,chr=chr)
cat("adding peaks from all macs files into chromosome rulers\n")
ruler <- make.ruler.no.pml(chr,macs.list.per.chrom=x)
cat("add MTL membership to the ruler\n")
ruler <- assign.clusters.ids.to.peaks(ruler,d.cut=dist.summits)
for(i in 1:length(ruler)){
	ix <- which(is.na(ruler[[i]][["dtss.prox"]]))
	ruler[[i]][["dtss.prox"]][ix] <- ""
	ix <- which(is.na(ruler[[i]][["dtss.dist"]]))
	ruler[[i]][["dtss.dist"]][ix] <- ""
}
cat("creating MTL list\n")
ll <- create.cluster.list.no.pml(ruler=ruler)
cat("writing MTL table\n")
# f.nm <- paste(sep="",path.output,paste(expt.names,collapse="_"),"_MTLs.xls")
f.nm <- paste(sep="",path.output,"mtls",".xls")
print.ll.verbose.all(ll=ll,f.nm=f.nm)