Mercurial > repos > kmace > mtls_analysis

##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
## Feb 2012 th17
## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>,
## NYU - Center for Genomics and Systems Biology
##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '

rm(list=ls())
debug=F
verbose=F
script.version=0.1
print.error <- function(){
	cat("
DESCRIPTIION:
	cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where
	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
	fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits.
	If you use .bed files, the files must contain no header, and at least the five columns:
	1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared
	interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits.

INPUT:
	1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
	2.path_output: path to save generated MTL cluster file (where to save mtls.xls)
	3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
	4.input_type: the type of input file used (MACS or BED; defaults to MACS)
	5.mtl_type: interval or summit (defaults to summit)
	6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit)

EXAMPLE RUN:
	cluster_peaks.R
	--input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
	--input_type MACS
	--path_output results/
	--expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
	--dist_summits 100
	--mtl_type summit

Please cite us if you used this script:
	The transcription factor network regulating Th17 lineage specification and function.
	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
	Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
}

############# helper functions:
## split.macs.list.to.chrmosomes
# input:
#      - macs.list: a list of macs expts: here are a few lines of one expt
## chr	start	end	length	summit	tags	#NAME?	fold_enrichment	FDR(%)
## chr1	4322210	4323069	860	494	55	158.95	6.03	0.05
## chr1	4797749	4798368	620	211	29	119.82	3.47	0.09
## chr1	4848182	4849113	932	494	46	105.42	2.9	0.09
#      - expts: a list of the expts names from macs.list that you want to process
#      - chr: chrmosomes names
# output:
#      - x: a list with as many elements as chr specified by input.
#      -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
  x <- list()
  n <- length(expts)
  for(i in 1:n){
    e <- expts[i] #experiment name
    cat("wroking on expt", e,"\n")
    x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
    names(x[[e]]) <- chr
  }
  return(x)
}
# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
# input:
#     - r is chromosome
#     - m is macs matrix for expt e from above function
split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
  ix.chr.i <- which(m[,"chr"]==r)
  # cat("working on",r,"\n")
  o <- list()
  o[[r]] <- m[ix.chr.i,]
  o[[r]]$peak.id <- ix.chr.i
  return(o[[r]])
}

## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
# also match these summit locations with corresponding:
# pvals, tfs, peak start and peak end
# input:
#     - chr: a list of chromosome names
#     - macs.list.per.chrom: a list of macs peaks for each chromosome
# output:
#     - o: a list each chormosome ruler as an element
make.ruler.no.pml <- function(chr,macs.list.per.chrom){
  x <- macs.list.per.chrom
  o <- list()
  for(j in 1:length(chr)){
    r <- chr[j] # chrmosome we go over
    s <- numeric()
    pval <- numeric()
    tf.to.s <- character()
    start <- numeric()
    end <- numeric()
    trtmnts <- names(x) # the treatments name (expt names)
    ## debug parameters ###
    ## which experiment peaks come from
    expt <- character()
    ## what was the peak id in that experiment
    peak.ids <- numeric()
    ## this will allow us to always back track from a cluster to the actual peaks in it
    ## debug params end ###
    for(i in 1:length(trtmnts)){
      o[[r]] <- list()
      e <- trtmnts[i] #experiment name
      tf <- strsplit(e,"_")[[1]][1]
      s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
      pval <- c(pval,x[[e]][[r]][,"score"])
      start <- c(start,x[[e]][[r]][,"start"])
      end <- c(end,x[[e]][[r]][,"end"])
      expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
      peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])
    }
    ix <- sort(s,index.return=TRUE)$ix
    o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix])
  }
  return(o)
}

## add cluster memberships based on ruler
## require no more than d.cut distance between tf summits
## cur.l is the current loci number (or cluster number)
assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){
 cur.l <- 0
 for(j in 1:length(ruler)){
   s <- ruler[[j]][["summit"]]
   l <- numeric(length=length(s))
   if(length(l)>0){
	   cur.l <- cur.l+1
   }
   if(length(l)==1){
   	l[1] <- cur.l
   } else if(length(l)>1) {
	l[1] <- cur.l
	   for(i in 2:length(l)){
	     d <- s[i]-s[i-1] # assumes s is sorted increasingly
	     if(d>d.cut){
	       cur.l <- cur.l+1
	     }
	     l[i] <- cur.l
	   }
	}
   ruler[[j]][["mtl.id"]] <- l
 }
 return(ruler)
}

## add cluster memberships based on ruler
## require that peaks span will have a non-empty intersection
## cur.mtl is the current loci number (or cluster number)
assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){
cur.mtl <- 0
for(j in 1:length(ruler)){
	s <- ruler[[j]][["start"]]
	e <- ruler[[j]][["end"]]
	l <- numeric(length=length(s))
	if(length(l)>0){
	   cur.mtl <- cur.mtl+1
	}
	if(length(l)==1){
	  	l[1] <- cur.mtl
	} else if(length(l)>1) {
		l[1] <- cur.mtl
		cur.e <- e[1] # the right-most bp belonging to the current mtl
	   	for(i in 2:length(l)){
	     	if(cur.e<s[i]){
	       		cur.mtl <- cur.mtl+1
	     	}
	     	l[i] <- cur.mtl
			cur.e <- max(cur.e,e[i])
	   	}
	}
	ruler[[j]][["mtl.id"]] <- l
}
return(ruler)
}

## here we create a list of TF enriched loci (clusters)
## input:
##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
## output:
##     -ll: a list of clusters ll where each cluster (elem in ll holds):
##        - mtl.id: the current loci (elem in ll)
##        - s: summits vector as before
##        - pval: pvals vector matching the summits
##        - tfs: a vector of tfs matching s, the summits in loci l
##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
create.cluster.list.no.pml <- function(ruler){
  tmp <- list()
  ll <- list()
  for(j in 1:length(ruler)){
    r <- names(ruler)[j]
    # cat("working on ",r,"\n")
    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
    l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique)
    n <- length(l.vec) # iterate over n clusters
    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
  }
  ## concatenate tmp into one list
  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
  ll=eval(parse(text=command))
  return(ll)
}
get.cluster.params.no.pml <- function(i,x,l.vec,r){
  # ix <- which(x[["l"]]==l.vec[i])
  # l <- l.vec[i]
  ix <- which(x[["mtl.id"]]==l.vec[i])
  l <- l.vec[i]
  start <- min(x[["start"]][ix])
  end <- max(x[["end"]][ix])
  # s <- x[["s"]][ix]
  summit <- x[["summit"]][ix]
  # pval <- x[["pval"]][ix]
  score <- x[["score"]][ix]
  # pval.mean <- mean(pval)
  score.mean <- mean(score)
  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
  span.l <- end-start
  peak.ids <- x[["peak.ids"]][ix]
  expt <- x[["expt"]][ix]
  expt.alphanum.sorted <- sort(x[["expt"]][ix])

  chr <- rep(r,length(ix))
  return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
			  end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs,
			  span.l=span.l,peak.ids=peak.ids))
}

## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
## input:
##     - ll: a cluster list
##     - f.nm: a file name (include path) to where you want files to print
##     - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
## output
##     - a tab delim file with clusers as rows and elems tab delim for each cluster
print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
  options(digits=5)
  cat(file=f.nm,names(ll[[1]]),sep="\t")
  cat(file=f.nm,"\n",append=TRUE)
  for(i in 1:length(ll)){
	line <- ll[[i]][[1]] # put MTL number
	line <- paste(line,ll[[i]][[2]][1],sep="\t")
	for(j in 3:length(ll[[i]])){
		val <- ll[[i]][[j]]
		if(length(val) == 1){
			line <- paste(line,val,sep="\t")
		} else {
			line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
		}
	}
  	cat(file=f.nm,line,"\n",append=TRUE)
  }
}
# read command line paramters that are not optional
read.cmd.line.params.must <- function(args.nms, cmd.line.args){
	if(length(grep("--version",cmd.line.args))){
		cat("version",script.version,"\n")
		q()
	}
	args <- sapply(strsplit(cmd.line.args," "),function(i) i)
	vals <- character(length(args.nms))
	# split cmd.line to key and value pairs
	for(i in 1:length(args.nms)){
		ix <- grep(args.nms[i],args)
		if(length(ix)>1){
			stop("arg ",args.nms[i]," used more than once.  Bailing out...\n",print.error())
		} else if (length(ix)==0){
			stop("could not find ",args.nms[i],". Bailing out...\n",print.error())
		} else {
			vals[i] <- args[ix+1]
		}
	}
	return(vals)
}
# read command line paramters that are optional, if an optional param is not give functions return na for this param
read.cmd.line.params.optional <- function(args.nms, cmd.line.args){
	args <- sapply(strsplit(cmd.line.args," "),function(i) i)
	vals <- character(length(args.nms))
	# split cmd.line to key and value pairs
	for(i in 1:length(args.nms)){
		ix <- grep(args.nms[i],args)
		if(length(ix)>1){
			stop("arg ",args.nms[i]," used more than once.  Bailing out...\n",print.error())
		} else if (length(ix)==0){ # if --param was not written in cmd line
			vals[i] <- NA
		} else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value
		 	vals[i] <- args[ix+1]
		} else { # otherwise
			vals[i] <- NA
		}
	}
	return(vals)
}

############# Code:
# retrieve args

if(debug==T){
	cmd.args <- c(
		"--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls",
		#"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed",
		"--input_type MACS", #BED
		"--path_output ./results/",
		"--expt_names macs1::macs2::macs3",
		#"--expt_names bed1::bed2::bed3",
		"--mtl_type interval", #interval summit
		"--dist_summits 100"
	)
} else {
	cmd.args <- commandArgs(trailingOnly = T);
}

# if(length(grep("--version",cmd.args))){
# 	cat("version",script.version,"\n")
# 	q()
# }
args.nms.must <- c(
		"--input_files",   #1
		"--path_output",  	 #2
		"--expt_names" #3
)

# all numeric params must come at the end of args.nms.optional
n.start.numeric <- 3
args.nms.optional <- c(
		"--input_type",	#1
		"--mtl_type", 	 #2
		"--dist_summits" #3
)

# get must parameters
vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args)
if(verbose){
	cat("\nfinshed reading vals: \n")
	cat("\nvals.must", unlist(vals.must), "\n")
}
input.files <- strsplit(vals.must[1],"::")[[1]]
if(length(input.files)==1){
	cat("only provided one MACS file to cluster.")
	print.error()
}
path.output <- vals.must[2]
expt.names <- strsplit(vals.must[3],"::")[[1]]
# get optional parameters
vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args)
if(verbose){
	cat("\nvals.optional:", unlist(vals.optional),"\n")
}
if(is.na(vals.optional[1])){
	input.type <- "MACS"
} else {
	input.type <- vals.optional[1]
}
if(is.na(vals.optional[2])){
	mtl.type <- "interval"
} else {
	mtl.type <- vals.optional[2]
}
if(is.na(vals.optional[2])){
	dist.summits <- 100
} else {
	dist.summits <- as.numeric(vals.optional[3])
}

# read MACS files
unique.chr <- character()
infile.list <- list()
col.nms.keepers <- c("chr","start","end","summit","score")
for(i in 1:length(input.files)){
	e <- expt.names[i]
	if(toupper(input.type)=="MACS"){
		tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F)
		columns <- c("chr","start","end","summit","pvalue")
		ix.cols <- numeric()
		for(j in 1:length(columns)){
			ix.j <- grep(columns[j],names(tmp))
			if(length(ix.j)==0){
				stop("can't find (grep) column ",columns[j]," in MACS input file ", e)
			}
			ix.cols <- c(ix.cols,ix.j)
		}
		infile.list[[e]] <- tmp[,ix.cols]
		colnames(infile.list[[e]]) <- col.nms.keepers
	}
	if(toupper(input.type)=="BED"){
		tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE)
		tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column
		colnames(tmp)  <- col.nms.keepers
		infile.list[[e]] <- tmp[,1:5]
	}
	unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"]))
}

unique.chr <- sort(unique.chr)

# take all macs files and put peaks together on each chromosome
# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
cat("splitting input files per chromosome\n")
x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr)
cat("adding peaks from all input files into chromosome rulers\n")
ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x)
cat("add MTL membership to the ruler\n")

if(mtl.type=="interval"){
	ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler)
} else {
	ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits)
}

cat("creating MTL list\n")
ll <- create.cluster.list.no.pml(ruler=ruler)
cat("writing MTL table\n")
f.nm <- paste(sep="",path.output,"mtls",".xls")
print.ll.verbose.all(ll=ll,f.nm=f.nm)
author	kmace
date	Mon, 23 Jul 2012 13:00:15 -0400
parents
children