Mercurial > repos > kmace > mtls_analysis
view mtls_analyze/cluster_peaks.R @ 0:a903c48f05b4
Added non-integrated scripts to galaxy
author | kmace |
---|---|
date | Thu, 22 Mar 2012 15:21:00 -0400 |
parents | |
children |
line wrap: on
line source
## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' ## Feb 2012 th17 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, ## NYU - Center for Genomics and Systems Biology ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' rm(list=ls()) debug=F script.version=0.1 print.error <- function(){ cat(" DESCRIPTIION: cluster_peaks.R takes MACS tab delimited files as input and produces one tab delimeted file (named mtls.xls) where each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS files) fall within a certain distance between summits from eachother. INPUT: 1.path_input=path to MACS files '::' delim [path_input=f1::f2::f3::...::fk] 2.path_output=path to save generated MTL cluster file (where to save mtls.xls) 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] 4.dist.summits=maximum distance between summits belonging to the same MTL 5.n.autosome.chr=19 for mouse, 22 for human EXAMPLE RUN: cluster_peaks.R input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls path_output=~/Desktop/ expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 dist_summits=100 n_autosome_chr=19 Please cite us if you used this script: The transcription factor network regulating Th17 lineage specification and function. Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n") } ############# helper functions: ## split.macs.list.to.chrmosomes # input: # - macs.list: a list of macs expts: here are a few lines of one expt ## chr start end length summit tags #NAME? fold_enrichment FDR(%) ## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05 ## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09 ## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09 # - expts: a list of the expts names from macs.list that you want to process # - chr: chrmosomes names # output: # - x: a list with as many elements as chr specified by input. # -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS) split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){ x <- list() n <- length(expts) for(i in 1:n){ e <- expts[i] #experiment name cat("wroking on expt", e,"\n") x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]]) names(x[[e]]) <- chr } return(x) } # a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m # input: # - r is chromosome # - m is macs matrix for expt e from above function split.one.macs.expt.by.chromosome.no.pml <- function(r,m){ ix.chr.i <- which(m[,"chr"]==r) # cat("working on",r,"\n") o <- list() o[[r]] <- m[ix.chr.i,] o[[r]]$peak.id <- ix.chr.i return(o[[r]]) } ## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) # also match these summit locations with corresponding: # pvals, tfs, peak start and peak end trgts # input: # - chr: a list of chromosome names # - macs.list.per.chrom: a list of macs peaks for each chromosome # output: # - o: a list each chormosome ruler as an element make.ruler.no.pml <- function(chr,macs.list.per.chrom){ x <- macs.list.per.chrom o <- list() for(j in 1:length(chr)){ r <- chr[j] # chrmosome we go over s <- numeric() pval <- numeric() tf.to.s <- character() trgt.prox <- character() trgt.dist <- character() dtss.prox <- character() dtss.dist <- character() start <- numeric() end <- numeric() trtmnts <- names(x) # the treatments name (expt names) ## debug parameters ### ## which experiment peaks come from expt <- character() ## what was the peak id in that experiment peak.ids <- numeric() ## this will allow us to always back track from a cluster to the actual peaks in it ## debug params end ### for(i in 1:length(trtmnts)){ o[[r]] <- list() e <- trtmnts[i] #experiment name tf <- strsplit(e,"_")[[1]][1] s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"]) pval <- c(pval,x[[e]][[r]][,7]) # the name of 7th column is X.10.log10.pvalue. this is pval # tf.to.s <- c(tf.to.s,rep(tf,length(x[[e]][[r]][,7]))) # all summits belong to tf start <- c(start,x[[e]][[r]][,"start"]) end <- c(end,x[[e]][[r]][,"end"]) expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"]))) peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"]) trgt.prox <- c(trgt.prox,x[[e]][[r]][,"trgt.prox"]) trgt.dist <- c(trgt.dist,x[[e]][[r]][,"trgt.dist"]) dtss.prox <- c(dtss.prox,x[[e]][[r]][,"dtss.prox"]) dtss.dist <- c(dtss.dist,x[[e]][[r]][,"dtss.dist"]) } ix <- sort(s,index.return=TRUE)$ix o[[r]] <- list(s=s[ix],pval=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix], trgt.prox=trgt.prox[ix],trgt.dist=trgt.dist[ix],dtss.prox=dtss.prox[ix],dtss.dist=dtss.dist[ix]) } return(o) } ## add cluster memberships based on ruler ## require no more than d.cut distance between tf summits ## cur.l is the current loci number (or cluster number) assign.clusters.ids.to.peaks <- function(ruler,d.cut){ cur.l <- 0 for(j in 1:length(ruler)){ s <- ruler[[j]][["s"]] l <- numeric(length=length(s)) if(length(l)>0){ cur.l <- cur.l+1 } if(length(l)==1){ l[1] <- cur.l } else if(length(l)>1) { l[1] <- cur.l for(i in 2:length(l)){ d <- s[i]-s[i-1] # assumes s is sorted increasingly if(d>d.cut){ cur.l <- cur.l+1 } l[i] <- cur.l } } ruler[[j]][["l"]] <- l } return(ruler) } ## here we create a list of TF enriched loci (clusters) ## input: ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) ## output: ## -ll: a list of clusters ll where each cluster (elem in ll holds): ## - l: the current loci (elem in ll) ## - s: summits vector as before ## - pval: pvals vector matching the summits ## - tfs: a vector of tfs matching s, the summits in loci l ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak ## - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream) create.cluster.list.no.pml <- function(ruler){ tmp <- list() ll <- list() for(j in 1:length(ruler)){ r <- names(ruler)[j] # cat("working on ",r,"\n") x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique) n <- length(l.vec) # iterate over n clusters tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) } ## concatenate tmp into one list command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") ll=eval(parse(text=command)) return(ll) } get.cluster.params.no.pml <- function(i,x,l.vec,r){ ix <- which(x[["l"]]==l.vec[i]) l <- l.vec[i] start <- min(x[["start"]][ix]) end <- max(x[["end"]][ix]) s <- x[["s"]][ix] # tf.to.s <- x[["tf.to.s"]][ix] pval <- x[["pval"]][ix] pval.mean <- mean(pval) span.tfs <- x[["end"]][ix]-x[["start"]][ix] span.l <- end-start peak.ids <- x[["peak.ids"]][ix] expt <- x[["expt"]][ix] expt.alphanum.sorted <- sort(x[["expt"]][ix]) trgt.prox <- unique(x[["trgt.prox"]][ix]) trgt.dist <- unique(x[["trgt.dist"]][ix]) dtss.prox <- unique(x[["dtss.prox"]][ix]) dtss.dist <- unique(x[["dtss.dist"]][ix]) chr <- rep(r,length(ix)) return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs, span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist, dtss.prox=dtss.prox,dtss.dist=dtss.dist)) } ## here we create a list of TF enriched loci (clusters) ## input: ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) ## output: ## -ll: a list of clusters ll where each cluster (elem in ll holds): ## - l: the current loci (elem in ll) ## - s: summits vector as before ## - pval: pvals vector matching the summits ## - tfs: a vector of tfs matching s, the summits in loci l ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak ## - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream) create.cluster.list.no.pml <- function(ruler){ tmp <- list() ll <- list() for(j in 1:length(ruler)){ r <- names(ruler)[j] # cat("working on ",r,"\n") x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique) n <- length(l.vec) # iterate over n clusters tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) } ## concatenate tmp into one list command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") ll=eval(parse(text=command)) return(ll) } get.cluster.params.no.pml <- function(i,x,l.vec,r){ ix <- which(x[["l"]]==l.vec[i]) l <- l.vec[i] start <- min(x[["start"]][ix]) end <- max(x[["end"]][ix]) s <- x[["s"]][ix] # tf.to.s <- x[["tf.to.s"]][ix] pval <- x[["pval"]][ix] pval.mean <- mean(pval) span.tfs <- x[["end"]][ix]-x[["start"]][ix] span.l <- end-start peak.ids <- x[["peak.ids"]][ix] expt <- x[["expt"]][ix] expt.alphanum.sorted <- sort(x[["expt"]][ix]) trgt.prox <- unique(x[["trgt.prox"]][ix]) trgt.dist <- unique(x[["trgt.dist"]][ix]) dtss.prox <- unique(x[["dtss.prox"]][ix]) dtss.dist <- unique(x[["dtss.dist"]][ix]) chr <- rep(r,length(ix)) return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs, span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist, dtss.prox=dtss.prox,dtss.dist=dtss.dist)) } ## pretty print (tab deim) to file requested elements out of chip cluster list, ll. ## input: ## - ll: a cluster list ## - f.nm: a file name (include path) to where you want files to print ## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering) ## output ## - a tab delim file with clusers as rows and elems tab delim for each cluster print.ll.verbose.all <- function(ll,f.nm="ll.xls"){ options(digits=5) cat(file=f.nm,names(ll[[1]]),sep="\t") cat(file=f.nm,"\n",append=TRUE) for(i in 1:length(ll)){ line <- ll[[i]][[1]] # put MTL number line <- paste(line,ll[[i]][[2]][1],sep="\t") for(j in 3:length(ll[[i]])){ val <- ll[[i]][[j]] if(length(val) == 1){ line <- paste(line,val,sep="\t") } else { line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t") } } cat(file=f.nm,line,"\n",append=TRUE) } } ############# Code: # retrieve args if(debug==T){ cmd.args <- c( "input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls", "path_output=~/Desktop/", "expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17", "dist_summits=100", "n_autosome_chr=19" ) } else { cmd.args <- commandArgs(); } if(length(grep("--version",cmd.args))){ cat("version",script.version,"\n") q() } arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1]) args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2]) arg.nms <- c("input_macs_files","path_output","expt_names","dist_summits","n_autosome_chr") arg.val <- character(length=length(arg.nms)) for(i in 1:length(arg.nms)){ ix <- which(arg.names.cmd.line==arg.nms[i]) if(length(ix)==1){ arg.val[i] <- args.val.cmd.line[ix] } else { stop("######could not find ",arg.nms[i]," arg######\n\n",print.error()) } } if(debug==T){ print(paste(arg.nms,"=",arg.val)) } # the files here adhere to tab delim format input.macs.files <- strsplit(arg.val[1],"::")[[1]] if(length(input.macs.files)==1){ cat("only provided one MACS file to cluster.") print.error() } path.output <- arg.val[2] expt.names <- strsplit(arg.val[3],"::")[[1]] dist.summits <- as.numeric(arg.val[4]) n.autosome.chr <- as.numeric(arg.val[5]) # source("~/Documents/nyu/littmanLab/th17_used_for_paper/r_scripts/th17/used_for_paper/cluster_peaks_util.R") chr <- c(paste(sep="","chr",1:n.autosome.chr),paste(sep="","chr",c("X","Y"))) # read MACS files macs.list <- list() for(i in 1:length(input.macs.files)){ e <- expt.names[i] macs.list[[ e ]] <- read.delim(file=input.macs.files[i]) macs.list[[ e ]][,"chr"] <- as.character(macs.list[[ e ]][,"chr"]) macs.list[[ e ]][,"trgt.prox"] <- as.character(macs.list[[ e ]][,"trgt.prox"]) macs.list[[ e ]][,"trgt.dist"] <- as.character(macs.list[[ e ]][,"trgt.dist"]) } # take all macs files and put peaks together on each chromosome # (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls) cat("splitting macs files per chromosome\n") x <- split.macs.list.to.chrmosomes.no.pml(macs.list=macs.list,expts=expt.names,chr=chr) cat("adding peaks from all macs files into chromosome rulers\n") ruler <- make.ruler.no.pml(chr,macs.list.per.chrom=x) cat("add MTL membership to the ruler\n") ruler <- assign.clusters.ids.to.peaks(ruler,d.cut=dist.summits) for(i in 1:length(ruler)){ ix <- which(is.na(ruler[[i]][["dtss.prox"]])) ruler[[i]][["dtss.prox"]][ix] <- "" ix <- which(is.na(ruler[[i]][["dtss.dist"]])) ruler[[i]][["dtss.dist"]][ix] <- "" } cat("creating MTL list\n") ll <- create.cluster.list.no.pml(ruler=ruler) cat("writing MTL table\n") # f.nm <- paste(sep="",path.output,paste(expt.names,collapse="_"),"_MTLs.xls") f.nm <- paste(sep="",path.output,"mtls",".xls") print.ll.verbose.all(ll=ll,f.nm=f.nm)