Mercurial > repos > kmace > mtls_analysis
view mtls_analyze/cluster_peaks.R @ 4:b465306d00ba draft default tip
Uploaded
| author | kmace |
|---|---|
| date | Mon, 23 Jul 2012 13:00:15 -0400 |
| parents | |
| children |
line wrap: on
line source
## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' ## Feb 2012 th17 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, ## NYU - Center for Genomics and Systems Biology ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' rm(list=ls()) debug=F verbose=F script.version=0.1 print.error <- function(){ cat(" DESCRIPTIION: cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits. If you use .bed files, the files must contain no header, and at least the five columns: 1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits. INPUT: 1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] 2.path_output: path to save generated MTL cluster file (where to save mtls.xls) 3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] 4.input_type: the type of input file used (MACS or BED; defaults to MACS) 5.mtl_type: interval or summit (defaults to summit) 6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit) EXAMPLE RUN: cluster_peaks.R --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls --input_type MACS --path_output results/ --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 --dist_summits 100 --mtl_type summit Please cite us if you used this script: The transcription factor network regulating Th17 lineage specification and function. Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n") } ############# helper functions: ## split.macs.list.to.chrmosomes # input: # - macs.list: a list of macs expts: here are a few lines of one expt ## chr start end length summit tags #NAME? fold_enrichment FDR(%) ## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05 ## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09 ## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09 # - expts: a list of the expts names from macs.list that you want to process # - chr: chrmosomes names # output: # - x: a list with as many elements as chr specified by input. # -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS) split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){ x <- list() n <- length(expts) for(i in 1:n){ e <- expts[i] #experiment name cat("wroking on expt", e,"\n") x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]]) names(x[[e]]) <- chr } return(x) } # a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m # input: # - r is chromosome # - m is macs matrix for expt e from above function split.one.macs.expt.by.chromosome.no.pml <- function(r,m){ ix.chr.i <- which(m[,"chr"]==r) # cat("working on",r,"\n") o <- list() o[[r]] <- m[ix.chr.i,] o[[r]]$peak.id <- ix.chr.i return(o[[r]]) } ## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) # also match these summit locations with corresponding: # pvals, tfs, peak start and peak end # input: # - chr: a list of chromosome names # - macs.list.per.chrom: a list of macs peaks for each chromosome # output: # - o: a list each chormosome ruler as an element make.ruler.no.pml <- function(chr,macs.list.per.chrom){ x <- macs.list.per.chrom o <- list() for(j in 1:length(chr)){ r <- chr[j] # chrmosome we go over s <- numeric() pval <- numeric() tf.to.s <- character() start <- numeric() end <- numeric() trtmnts <- names(x) # the treatments name (expt names) ## debug parameters ### ## which experiment peaks come from expt <- character() ## what was the peak id in that experiment peak.ids <- numeric() ## this will allow us to always back track from a cluster to the actual peaks in it ## debug params end ### for(i in 1:length(trtmnts)){ o[[r]] <- list() e <- trtmnts[i] #experiment name tf <- strsplit(e,"_")[[1]][1] s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"]) pval <- c(pval,x[[e]][[r]][,"score"]) start <- c(start,x[[e]][[r]][,"start"]) end <- c(end,x[[e]][[r]][,"end"]) expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"]))) peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"]) } ix <- sort(s,index.return=TRUE)$ix o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix]) } return(o) } ## add cluster memberships based on ruler ## require no more than d.cut distance between tf summits ## cur.l is the current loci number (or cluster number) assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){ cur.l <- 0 for(j in 1:length(ruler)){ s <- ruler[[j]][["summit"]] l <- numeric(length=length(s)) if(length(l)>0){ cur.l <- cur.l+1 } if(length(l)==1){ l[1] <- cur.l } else if(length(l)>1) { l[1] <- cur.l for(i in 2:length(l)){ d <- s[i]-s[i-1] # assumes s is sorted increasingly if(d>d.cut){ cur.l <- cur.l+1 } l[i] <- cur.l } } ruler[[j]][["mtl.id"]] <- l } return(ruler) } ## add cluster memberships based on ruler ## require that peaks span will have a non-empty intersection ## cur.mtl is the current loci number (or cluster number) assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){ cur.mtl <- 0 for(j in 1:length(ruler)){ s <- ruler[[j]][["start"]] e <- ruler[[j]][["end"]] l <- numeric(length=length(s)) if(length(l)>0){ cur.mtl <- cur.mtl+1 } if(length(l)==1){ l[1] <- cur.mtl } else if(length(l)>1) { l[1] <- cur.mtl cur.e <- e[1] # the right-most bp belonging to the current mtl for(i in 2:length(l)){ if(cur.e<s[i]){ cur.mtl <- cur.mtl+1 } l[i] <- cur.mtl cur.e <- max(cur.e,e[i]) } } ruler[[j]][["mtl.id"]] <- l } return(ruler) } ## here we create a list of TF enriched loci (clusters) ## input: ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) ## output: ## -ll: a list of clusters ll where each cluster (elem in ll holds): ## - mtl.id: the current loci (elem in ll) ## - s: summits vector as before ## - pval: pvals vector matching the summits ## - tfs: a vector of tfs matching s, the summits in loci l ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak create.cluster.list.no.pml <- function(ruler){ tmp <- list() ll <- list() for(j in 1:length(ruler)){ r <- names(ruler)[j] # cat("working on ",r,"\n") x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique) n <- length(l.vec) # iterate over n clusters tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) } ## concatenate tmp into one list command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") ll=eval(parse(text=command)) return(ll) } get.cluster.params.no.pml <- function(i,x,l.vec,r){ # ix <- which(x[["l"]]==l.vec[i]) # l <- l.vec[i] ix <- which(x[["mtl.id"]]==l.vec[i]) l <- l.vec[i] start <- min(x[["start"]][ix]) end <- max(x[["end"]][ix]) # s <- x[["s"]][ix] summit <- x[["summit"]][ix] # pval <- x[["pval"]][ix] score <- x[["score"]][ix] # pval.mean <- mean(pval) score.mean <- mean(score) span.tfs <- x[["end"]][ix]-x[["start"]][ix] span.l <- end-start peak.ids <- x[["peak.ids"]][ix] expt <- x[["expt"]][ix] expt.alphanum.sorted <- sort(x[["expt"]][ix]) chr <- rep(r,length(ix)) return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs, span.l=span.l,peak.ids=peak.ids)) } ## pretty print (tab deim) to file requested elements out of chip cluster list, ll. ## input: ## - ll: a cluster list ## - f.nm: a file name (include path) to where you want files to print ## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering) ## output ## - a tab delim file with clusers as rows and elems tab delim for each cluster print.ll.verbose.all <- function(ll,f.nm="ll.xls"){ options(digits=5) cat(file=f.nm,names(ll[[1]]),sep="\t") cat(file=f.nm,"\n",append=TRUE) for(i in 1:length(ll)){ line <- ll[[i]][[1]] # put MTL number line <- paste(line,ll[[i]][[2]][1],sep="\t") for(j in 3:length(ll[[i]])){ val <- ll[[i]][[j]] if(length(val) == 1){ line <- paste(line,val,sep="\t") } else { line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t") } } cat(file=f.nm,line,"\n",append=TRUE) } } # read command line paramters that are not optional read.cmd.line.params.must <- function(args.nms, cmd.line.args){ if(length(grep("--version",cmd.line.args))){ cat("version",script.version,"\n") q() } args <- sapply(strsplit(cmd.line.args," "),function(i) i) vals <- character(length(args.nms)) # split cmd.line to key and value pairs for(i in 1:length(args.nms)){ ix <- grep(args.nms[i],args) if(length(ix)>1){ stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) } else if (length(ix)==0){ stop("could not find ",args.nms[i],". Bailing out...\n",print.error()) } else { vals[i] <- args[ix+1] } } return(vals) } # read command line paramters that are optional, if an optional param is not give functions return na for this param read.cmd.line.params.optional <- function(args.nms, cmd.line.args){ args <- sapply(strsplit(cmd.line.args," "),function(i) i) vals <- character(length(args.nms)) # split cmd.line to key and value pairs for(i in 1:length(args.nms)){ ix <- grep(args.nms[i],args) if(length(ix)>1){ stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) } else if (length(ix)==0){ # if --param was not written in cmd line vals[i] <- NA } else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value vals[i] <- args[ix+1] } else { # otherwise vals[i] <- NA } } return(vals) } ############# Code: # retrieve args if(debug==T){ cmd.args <- c( "--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls", #"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed", "--input_type MACS", #BED "--path_output ./results/", "--expt_names macs1::macs2::macs3", #"--expt_names bed1::bed2::bed3", "--mtl_type interval", #interval summit "--dist_summits 100" ) } else { cmd.args <- commandArgs(trailingOnly = T); } # if(length(grep("--version",cmd.args))){ # cat("version",script.version,"\n") # q() # } args.nms.must <- c( "--input_files", #1 "--path_output", #2 "--expt_names" #3 ) # all numeric params must come at the end of args.nms.optional n.start.numeric <- 3 args.nms.optional <- c( "--input_type", #1 "--mtl_type", #2 "--dist_summits" #3 ) # get must parameters vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args) if(verbose){ cat("\nfinshed reading vals: \n") cat("\nvals.must", unlist(vals.must), "\n") } input.files <- strsplit(vals.must[1],"::")[[1]] if(length(input.files)==1){ cat("only provided one MACS file to cluster.") print.error() } path.output <- vals.must[2] expt.names <- strsplit(vals.must[3],"::")[[1]] # get optional parameters vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args) if(verbose){ cat("\nvals.optional:", unlist(vals.optional),"\n") } if(is.na(vals.optional[1])){ input.type <- "MACS" } else { input.type <- vals.optional[1] } if(is.na(vals.optional[2])){ mtl.type <- "interval" } else { mtl.type <- vals.optional[2] } if(is.na(vals.optional[2])){ dist.summits <- 100 } else { dist.summits <- as.numeric(vals.optional[3]) } # read MACS files unique.chr <- character() infile.list <- list() col.nms.keepers <- c("chr","start","end","summit","score") for(i in 1:length(input.files)){ e <- expt.names[i] if(toupper(input.type)=="MACS"){ tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F) columns <- c("chr","start","end","summit","pvalue") ix.cols <- numeric() for(j in 1:length(columns)){ ix.j <- grep(columns[j],names(tmp)) if(length(ix.j)==0){ stop("can't find (grep) column ",columns[j]," in MACS input file ", e) } ix.cols <- c(ix.cols,ix.j) } infile.list[[e]] <- tmp[,ix.cols] colnames(infile.list[[e]]) <- col.nms.keepers } if(toupper(input.type)=="BED"){ tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE) tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column colnames(tmp) <- col.nms.keepers infile.list[[e]] <- tmp[,1:5] } unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"])) } unique.chr <- sort(unique.chr) # take all macs files and put peaks together on each chromosome # (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls) cat("splitting input files per chromosome\n") x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr) cat("adding peaks from all input files into chromosome rulers\n") ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x) cat("add MTL membership to the ruler\n") if(mtl.type=="interval"){ ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler) } else { ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits) } cat("creating MTL list\n") ll <- create.cluster.list.no.pml(ruler=ruler) cat("writing MTL table\n") f.nm <- paste(sep="",path.output,"mtls",".xls") print.ll.verbose.all(ll=ll,f.nm=f.nm)
