Mercurial > repos > kmace > mtls_analysis
diff mtls_analyze/cluster_peaks.R @ 4:b465306d00ba draft default tip
Uploaded
| author | kmace |
|---|---|
| date | Mon, 23 Jul 2012 13:00:15 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/cluster_peaks.R Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,431 @@ +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' +## Feb 2012 th17 +## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, +## NYU - Center for Genomics and Systems Biology +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' + +rm(list=ls()) +debug=F +verbose=F +script.version=0.1 +print.error <- function(){ + cat(" +DESCRIPTIION: + cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where + each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) + fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits. + If you use .bed files, the files must contain no header, and at least the five columns: + 1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared + interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits. + +INPUT: + 1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] + 2.path_output: path to save generated MTL cluster file (where to save mtls.xls) + 3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] + 4.input_type: the type of input file used (MACS or BED; defaults to MACS) + 5.mtl_type: interval or summit (defaults to summit) + 6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit) + +EXAMPLE RUN: + cluster_peaks.R + --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls + --input_type MACS + --path_output results/ + --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 + --dist_summits 100 + --mtl_type summit + +Please cite us if you used this script: + The transcription factor network regulating Th17 lineage specification and function. + Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, + Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n") +} + +############# helper functions: +## split.macs.list.to.chrmosomes +# input: +# - macs.list: a list of macs expts: here are a few lines of one expt +## chr start end length summit tags #NAME? fold_enrichment FDR(%) +## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05 +## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09 +## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09 +# - expts: a list of the expts names from macs.list that you want to process +# - chr: chrmosomes names +# output: +# - x: a list with as many elements as chr specified by input. +# -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS) +split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){ + x <- list() + n <- length(expts) + for(i in 1:n){ + e <- expts[i] #experiment name + cat("wroking on expt", e,"\n") + x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]]) + names(x[[e]]) <- chr + } + return(x) +} +# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m +# input: +# - r is chromosome +# - m is macs matrix for expt e from above function +split.one.macs.expt.by.chromosome.no.pml <- function(r,m){ + ix.chr.i <- which(m[,"chr"]==r) + # cat("working on",r,"\n") + o <- list() + o[[r]] <- m[ix.chr.i,] + o[[r]]$peak.id <- ix.chr.i + return(o[[r]]) +} + +## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) +# also match these summit locations with corresponding: +# pvals, tfs, peak start and peak end +# input: +# - chr: a list of chromosome names +# - macs.list.per.chrom: a list of macs peaks for each chromosome +# output: +# - o: a list each chormosome ruler as an element +make.ruler.no.pml <- function(chr,macs.list.per.chrom){ + x <- macs.list.per.chrom + o <- list() + for(j in 1:length(chr)){ + r <- chr[j] # chrmosome we go over + s <- numeric() + pval <- numeric() + tf.to.s <- character() + start <- numeric() + end <- numeric() + trtmnts <- names(x) # the treatments name (expt names) + ## debug parameters ### + ## which experiment peaks come from + expt <- character() + ## what was the peak id in that experiment + peak.ids <- numeric() + ## this will allow us to always back track from a cluster to the actual peaks in it + ## debug params end ### + for(i in 1:length(trtmnts)){ + o[[r]] <- list() + e <- trtmnts[i] #experiment name + tf <- strsplit(e,"_")[[1]][1] + s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"]) + pval <- c(pval,x[[e]][[r]][,"score"]) + start <- c(start,x[[e]][[r]][,"start"]) + end <- c(end,x[[e]][[r]][,"end"]) + expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"]))) + peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"]) + } + ix <- sort(s,index.return=TRUE)$ix + o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix]) + } + return(o) +} + +## add cluster memberships based on ruler +## require no more than d.cut distance between tf summits +## cur.l is the current loci number (or cluster number) +assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){ + cur.l <- 0 + for(j in 1:length(ruler)){ + s <- ruler[[j]][["summit"]] + l <- numeric(length=length(s)) + if(length(l)>0){ + cur.l <- cur.l+1 + } + if(length(l)==1){ + l[1] <- cur.l + } else if(length(l)>1) { + l[1] <- cur.l + for(i in 2:length(l)){ + d <- s[i]-s[i-1] # assumes s is sorted increasingly + if(d>d.cut){ + cur.l <- cur.l+1 + } + l[i] <- cur.l + } + } + ruler[[j]][["mtl.id"]] <- l + } + return(ruler) +} + +## add cluster memberships based on ruler +## require that peaks span will have a non-empty intersection +## cur.mtl is the current loci number (or cluster number) +assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){ +cur.mtl <- 0 +for(j in 1:length(ruler)){ + s <- ruler[[j]][["start"]] + e <- ruler[[j]][["end"]] + l <- numeric(length=length(s)) + if(length(l)>0){ + cur.mtl <- cur.mtl+1 + } + if(length(l)==1){ + l[1] <- cur.mtl + } else if(length(l)>1) { + l[1] <- cur.mtl + cur.e <- e[1] # the right-most bp belonging to the current mtl + for(i in 2:length(l)){ + if(cur.e<s[i]){ + cur.mtl <- cur.mtl+1 + } + l[i] <- cur.mtl + cur.e <- max(cur.e,e[i]) + } + } + ruler[[j]][["mtl.id"]] <- l +} +return(ruler) +} + +## here we create a list of TF enriched loci (clusters) +## input: +## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) +## output: +## -ll: a list of clusters ll where each cluster (elem in ll holds): +## - mtl.id: the current loci (elem in ll) +## - s: summits vector as before +## - pval: pvals vector matching the summits +## - tfs: a vector of tfs matching s, the summits in loci l +## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak +create.cluster.list.no.pml <- function(ruler){ + tmp <- list() + ll <- list() + for(j in 1:length(ruler)){ + r <- names(ruler)[j] + # cat("working on ",r,"\n") + x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] + l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique) + n <- length(l.vec) # iterate over n clusters + tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) + } + ## concatenate tmp into one list + command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") + ll=eval(parse(text=command)) + return(ll) +} +get.cluster.params.no.pml <- function(i,x,l.vec,r){ + # ix <- which(x[["l"]]==l.vec[i]) + # l <- l.vec[i] + ix <- which(x[["mtl.id"]]==l.vec[i]) + l <- l.vec[i] + start <- min(x[["start"]][ix]) + end <- max(x[["end"]][ix]) + # s <- x[["s"]][ix] + summit <- x[["summit"]][ix] + # pval <- x[["pval"]][ix] + score <- x[["score"]][ix] + # pval.mean <- mean(pval) + score.mean <- mean(score) + span.tfs <- x[["end"]][ix]-x[["start"]][ix] + span.l <- end-start + peak.ids <- x[["peak.ids"]][ix] + expt <- x[["expt"]][ix] + expt.alphanum.sorted <- sort(x[["expt"]][ix]) + + chr <- rep(r,length(ix)) + return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, + end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs, + span.l=span.l,peak.ids=peak.ids)) +} + +## pretty print (tab deim) to file requested elements out of chip cluster list, ll. +## input: +## - ll: a cluster list +## - f.nm: a file name (include path) to where you want files to print +## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering) +## output +## - a tab delim file with clusers as rows and elems tab delim for each cluster +print.ll.verbose.all <- function(ll,f.nm="ll.xls"){ + options(digits=5) + cat(file=f.nm,names(ll[[1]]),sep="\t") + cat(file=f.nm,"\n",append=TRUE) + for(i in 1:length(ll)){ + line <- ll[[i]][[1]] # put MTL number + line <- paste(line,ll[[i]][[2]][1],sep="\t") + for(j in 3:length(ll[[i]])){ + val <- ll[[i]][[j]] + if(length(val) == 1){ + line <- paste(line,val,sep="\t") + } else { + line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t") + } + } + cat(file=f.nm,line,"\n",append=TRUE) + } +} +# read command line paramters that are not optional +read.cmd.line.params.must <- function(args.nms, cmd.line.args){ + if(length(grep("--version",cmd.line.args))){ + cat("version",script.version,"\n") + q() + } + args <- sapply(strsplit(cmd.line.args," "),function(i) i) + vals <- character(length(args.nms)) + # split cmd.line to key and value pairs + for(i in 1:length(args.nms)){ + ix <- grep(args.nms[i],args) + if(length(ix)>1){ + stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) + } else if (length(ix)==0){ + stop("could not find ",args.nms[i],". Bailing out...\n",print.error()) + } else { + vals[i] <- args[ix+1] + } + } + return(vals) +} +# read command line paramters that are optional, if an optional param is not give functions return na for this param +read.cmd.line.params.optional <- function(args.nms, cmd.line.args){ + args <- sapply(strsplit(cmd.line.args," "),function(i) i) + vals <- character(length(args.nms)) + # split cmd.line to key and value pairs + for(i in 1:length(args.nms)){ + ix <- grep(args.nms[i],args) + if(length(ix)>1){ + stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) + } else if (length(ix)==0){ # if --param was not written in cmd line + vals[i] <- NA + } else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value + vals[i] <- args[ix+1] + } else { # otherwise + vals[i] <- NA + } + } + return(vals) +} + +############# Code: +# retrieve args + +if(debug==T){ + cmd.args <- c( + "--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls", + #"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed", + "--input_type MACS", #BED + "--path_output ./results/", + "--expt_names macs1::macs2::macs3", + #"--expt_names bed1::bed2::bed3", + "--mtl_type interval", #interval summit + "--dist_summits 100" + ) +} else { + cmd.args <- commandArgs(trailingOnly = T); +} + +# if(length(grep("--version",cmd.args))){ +# cat("version",script.version,"\n") +# q() +# } +args.nms.must <- c( + "--input_files", #1 + "--path_output", #2 + "--expt_names" #3 +) + +# all numeric params must come at the end of args.nms.optional +n.start.numeric <- 3 +args.nms.optional <- c( + "--input_type", #1 + "--mtl_type", #2 + "--dist_summits" #3 +) + +# get must parameters +vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args) +if(verbose){ + cat("\nfinshed reading vals: \n") + cat("\nvals.must", unlist(vals.must), "\n") +} +input.files <- strsplit(vals.must[1],"::")[[1]] +if(length(input.files)==1){ + cat("only provided one MACS file to cluster.") + print.error() +} +path.output <- vals.must[2] +expt.names <- strsplit(vals.must[3],"::")[[1]] +# get optional parameters +vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args) +if(verbose){ + cat("\nvals.optional:", unlist(vals.optional),"\n") +} +if(is.na(vals.optional[1])){ + input.type <- "MACS" +} else { + input.type <- vals.optional[1] +} +if(is.na(vals.optional[2])){ + mtl.type <- "interval" +} else { + mtl.type <- vals.optional[2] +} +if(is.na(vals.optional[2])){ + dist.summits <- 100 +} else { + dist.summits <- as.numeric(vals.optional[3]) +} + +# read MACS files +unique.chr <- character() +infile.list <- list() +col.nms.keepers <- c("chr","start","end","summit","score") +for(i in 1:length(input.files)){ + e <- expt.names[i] + if(toupper(input.type)=="MACS"){ + tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F) + columns <- c("chr","start","end","summit","pvalue") + ix.cols <- numeric() + for(j in 1:length(columns)){ + ix.j <- grep(columns[j],names(tmp)) + if(length(ix.j)==0){ + stop("can't find (grep) column ",columns[j]," in MACS input file ", e) + } + ix.cols <- c(ix.cols,ix.j) + } + infile.list[[e]] <- tmp[,ix.cols] + colnames(infile.list[[e]]) <- col.nms.keepers + } + if(toupper(input.type)=="BED"){ + tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE) + tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column + colnames(tmp) <- col.nms.keepers + infile.list[[e]] <- tmp[,1:5] + } + unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"])) +} + +unique.chr <- sort(unique.chr) + +# take all macs files and put peaks together on each chromosome +# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls) +cat("splitting input files per chromosome\n") +x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr) +cat("adding peaks from all input files into chromosome rulers\n") +ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x) +cat("add MTL membership to the ruler\n") + +if(mtl.type=="interval"){ + ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler) +} else { + ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits) +} + +cat("creating MTL list\n") +ll <- create.cluster.list.no.pml(ruler=ruler) +cat("writing MTL table\n") +f.nm <- paste(sep="",path.output,"mtls",".xls") +print.ll.verbose.all(ll=ll,f.nm=f.nm) + + + + + + + + +
