comparison mtls_analyze/cluster_peaks.R @ 4:b465306d00ba draft default tip

Uploaded
author kmace
date Mon, 23 Jul 2012 13:00:15 -0400
parents
children
comparison
equal deleted inserted replaced
3:a0306edbf2f8 4:b465306d00ba
1 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
2 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
3 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
4 ## Feb 2012 th17
5 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>,
6 ## NYU - Center for Genomics and Systems Biology
7 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
8 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
9 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
10
11 rm(list=ls())
12 debug=F
13 verbose=F
14 script.version=0.1
15 print.error <- function(){
16 cat("
17 DESCRIPTIION:
18 cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where
19 each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
20 fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits.
21 If you use .bed files, the files must contain no header, and at least the five columns:
22 1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared
23 interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits.
24
25 INPUT:
26 1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
27 2.path_output: path to save generated MTL cluster file (where to save mtls.xls)
28 3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
29 4.input_type: the type of input file used (MACS or BED; defaults to MACS)
30 5.mtl_type: interval or summit (defaults to summit)
31 6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit)
32
33 EXAMPLE RUN:
34 cluster_peaks.R
35 --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
36 --input_type MACS
37 --path_output results/
38 --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
39 --dist_summits 100
40 --mtl_type summit
41
42 Please cite us if you used this script:
43 The transcription factor network regulating Th17 lineage specification and function.
44 Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
45 Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
46 }
47
48 ############# helper functions:
49 ## split.macs.list.to.chrmosomes
50 # input:
51 # - macs.list: a list of macs expts: here are a few lines of one expt
52 ## chr start end length summit tags #NAME? fold_enrichment FDR(%)
53 ## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05
54 ## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09
55 ## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09
56 # - expts: a list of the expts names from macs.list that you want to process
57 # - chr: chrmosomes names
58 # output:
59 # - x: a list with as many elements as chr specified by input.
60 # -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
61 split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
62 x <- list()
63 n <- length(expts)
64 for(i in 1:n){
65 e <- expts[i] #experiment name
66 cat("wroking on expt", e,"\n")
67 x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
68 names(x[[e]]) <- chr
69 }
70 return(x)
71 }
72 # a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
73 # input:
74 # - r is chromosome
75 # - m is macs matrix for expt e from above function
76 split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
77 ix.chr.i <- which(m[,"chr"]==r)
78 # cat("working on",r,"\n")
79 o <- list()
80 o[[r]] <- m[ix.chr.i,]
81 o[[r]]$peak.id <- ix.chr.i
82 return(o[[r]])
83 }
84
85 ## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
86 # also match these summit locations with corresponding:
87 # pvals, tfs, peak start and peak end
88 # input:
89 # - chr: a list of chromosome names
90 # - macs.list.per.chrom: a list of macs peaks for each chromosome
91 # output:
92 # - o: a list each chormosome ruler as an element
93 make.ruler.no.pml <- function(chr,macs.list.per.chrom){
94 x <- macs.list.per.chrom
95 o <- list()
96 for(j in 1:length(chr)){
97 r <- chr[j] # chrmosome we go over
98 s <- numeric()
99 pval <- numeric()
100 tf.to.s <- character()
101 start <- numeric()
102 end <- numeric()
103 trtmnts <- names(x) # the treatments name (expt names)
104 ## debug parameters ###
105 ## which experiment peaks come from
106 expt <- character()
107 ## what was the peak id in that experiment
108 peak.ids <- numeric()
109 ## this will allow us to always back track from a cluster to the actual peaks in it
110 ## debug params end ###
111 for(i in 1:length(trtmnts)){
112 o[[r]] <- list()
113 e <- trtmnts[i] #experiment name
114 tf <- strsplit(e,"_")[[1]][1]
115 s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
116 pval <- c(pval,x[[e]][[r]][,"score"])
117 start <- c(start,x[[e]][[r]][,"start"])
118 end <- c(end,x[[e]][[r]][,"end"])
119 expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
120 peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])
121 }
122 ix <- sort(s,index.return=TRUE)$ix
123 o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix])
124 }
125 return(o)
126 }
127
128 ## add cluster memberships based on ruler
129 ## require no more than d.cut distance between tf summits
130 ## cur.l is the current loci number (or cluster number)
131 assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){
132 cur.l <- 0
133 for(j in 1:length(ruler)){
134 s <- ruler[[j]][["summit"]]
135 l <- numeric(length=length(s))
136 if(length(l)>0){
137 cur.l <- cur.l+1
138 }
139 if(length(l)==1){
140 l[1] <- cur.l
141 } else if(length(l)>1) {
142 l[1] <- cur.l
143 for(i in 2:length(l)){
144 d <- s[i]-s[i-1] # assumes s is sorted increasingly
145 if(d>d.cut){
146 cur.l <- cur.l+1
147 }
148 l[i] <- cur.l
149 }
150 }
151 ruler[[j]][["mtl.id"]] <- l
152 }
153 return(ruler)
154 }
155
156 ## add cluster memberships based on ruler
157 ## require that peaks span will have a non-empty intersection
158 ## cur.mtl is the current loci number (or cluster number)
159 assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){
160 cur.mtl <- 0
161 for(j in 1:length(ruler)){
162 s <- ruler[[j]][["start"]]
163 e <- ruler[[j]][["end"]]
164 l <- numeric(length=length(s))
165 if(length(l)>0){
166 cur.mtl <- cur.mtl+1
167 }
168 if(length(l)==1){
169 l[1] <- cur.mtl
170 } else if(length(l)>1) {
171 l[1] <- cur.mtl
172 cur.e <- e[1] # the right-most bp belonging to the current mtl
173 for(i in 2:length(l)){
174 if(cur.e<s[i]){
175 cur.mtl <- cur.mtl+1
176 }
177 l[i] <- cur.mtl
178 cur.e <- max(cur.e,e[i])
179 }
180 }
181 ruler[[j]][["mtl.id"]] <- l
182 }
183 return(ruler)
184 }
185
186 ## here we create a list of TF enriched loci (clusters)
187 ## input:
188 ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
189 ## output:
190 ## -ll: a list of clusters ll where each cluster (elem in ll holds):
191 ## - mtl.id: the current loci (elem in ll)
192 ## - s: summits vector as before
193 ## - pval: pvals vector matching the summits
194 ## - tfs: a vector of tfs matching s, the summits in loci l
195 ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
196 create.cluster.list.no.pml <- function(ruler){
197 tmp <- list()
198 ll <- list()
199 for(j in 1:length(ruler)){
200 r <- names(ruler)[j]
201 # cat("working on ",r,"\n")
202 x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
203 l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique)
204 n <- length(l.vec) # iterate over n clusters
205 tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
206 }
207 ## concatenate tmp into one list
208 command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
209 ll=eval(parse(text=command))
210 return(ll)
211 }
212 get.cluster.params.no.pml <- function(i,x,l.vec,r){
213 # ix <- which(x[["l"]]==l.vec[i])
214 # l <- l.vec[i]
215 ix <- which(x[["mtl.id"]]==l.vec[i])
216 l <- l.vec[i]
217 start <- min(x[["start"]][ix])
218 end <- max(x[["end"]][ix])
219 # s <- x[["s"]][ix]
220 summit <- x[["summit"]][ix]
221 # pval <- x[["pval"]][ix]
222 score <- x[["score"]][ix]
223 # pval.mean <- mean(pval)
224 score.mean <- mean(score)
225 span.tfs <- x[["end"]][ix]-x[["start"]][ix]
226 span.l <- end-start
227 peak.ids <- x[["peak.ids"]][ix]
228 expt <- x[["expt"]][ix]
229 expt.alphanum.sorted <- sort(x[["expt"]][ix])
230
231 chr <- rep(r,length(ix))
232 return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
233 end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs,
234 span.l=span.l,peak.ids=peak.ids))
235 }
236
237 ## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
238 ## input:
239 ## - ll: a cluster list
240 ## - f.nm: a file name (include path) to where you want files to print
241 ## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
242 ## output
243 ## - a tab delim file with clusers as rows and elems tab delim for each cluster
244 print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
245 options(digits=5)
246 cat(file=f.nm,names(ll[[1]]),sep="\t")
247 cat(file=f.nm,"\n",append=TRUE)
248 for(i in 1:length(ll)){
249 line <- ll[[i]][[1]] # put MTL number
250 line <- paste(line,ll[[i]][[2]][1],sep="\t")
251 for(j in 3:length(ll[[i]])){
252 val <- ll[[i]][[j]]
253 if(length(val) == 1){
254 line <- paste(line,val,sep="\t")
255 } else {
256 line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
257 }
258 }
259 cat(file=f.nm,line,"\n",append=TRUE)
260 }
261 }
262 # read command line paramters that are not optional
263 read.cmd.line.params.must <- function(args.nms, cmd.line.args){
264 if(length(grep("--version",cmd.line.args))){
265 cat("version",script.version,"\n")
266 q()
267 }
268 args <- sapply(strsplit(cmd.line.args," "),function(i) i)
269 vals <- character(length(args.nms))
270 # split cmd.line to key and value pairs
271 for(i in 1:length(args.nms)){
272 ix <- grep(args.nms[i],args)
273 if(length(ix)>1){
274 stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error())
275 } else if (length(ix)==0){
276 stop("could not find ",args.nms[i],". Bailing out...\n",print.error())
277 } else {
278 vals[i] <- args[ix+1]
279 }
280 }
281 return(vals)
282 }
283 # read command line paramters that are optional, if an optional param is not give functions return na for this param
284 read.cmd.line.params.optional <- function(args.nms, cmd.line.args){
285 args <- sapply(strsplit(cmd.line.args," "),function(i) i)
286 vals <- character(length(args.nms))
287 # split cmd.line to key and value pairs
288 for(i in 1:length(args.nms)){
289 ix <- grep(args.nms[i],args)
290 if(length(ix)>1){
291 stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error())
292 } else if (length(ix)==0){ # if --param was not written in cmd line
293 vals[i] <- NA
294 } else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value
295 vals[i] <- args[ix+1]
296 } else { # otherwise
297 vals[i] <- NA
298 }
299 }
300 return(vals)
301 }
302
303 ############# Code:
304 # retrieve args
305
306 if(debug==T){
307 cmd.args <- c(
308 "--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls",
309 #"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed",
310 "--input_type MACS", #BED
311 "--path_output ./results/",
312 "--expt_names macs1::macs2::macs3",
313 #"--expt_names bed1::bed2::bed3",
314 "--mtl_type interval", #interval summit
315 "--dist_summits 100"
316 )
317 } else {
318 cmd.args <- commandArgs(trailingOnly = T);
319 }
320
321 # if(length(grep("--version",cmd.args))){
322 # cat("version",script.version,"\n")
323 # q()
324 # }
325 args.nms.must <- c(
326 "--input_files", #1
327 "--path_output", #2
328 "--expt_names" #3
329 )
330
331 # all numeric params must come at the end of args.nms.optional
332 n.start.numeric <- 3
333 args.nms.optional <- c(
334 "--input_type", #1
335 "--mtl_type", #2
336 "--dist_summits" #3
337 )
338
339 # get must parameters
340 vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args)
341 if(verbose){
342 cat("\nfinshed reading vals: \n")
343 cat("\nvals.must", unlist(vals.must), "\n")
344 }
345 input.files <- strsplit(vals.must[1],"::")[[1]]
346 if(length(input.files)==1){
347 cat("only provided one MACS file to cluster.")
348 print.error()
349 }
350 path.output <- vals.must[2]
351 expt.names <- strsplit(vals.must[3],"::")[[1]]
352 # get optional parameters
353 vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args)
354 if(verbose){
355 cat("\nvals.optional:", unlist(vals.optional),"\n")
356 }
357 if(is.na(vals.optional[1])){
358 input.type <- "MACS"
359 } else {
360 input.type <- vals.optional[1]
361 }
362 if(is.na(vals.optional[2])){
363 mtl.type <- "interval"
364 } else {
365 mtl.type <- vals.optional[2]
366 }
367 if(is.na(vals.optional[2])){
368 dist.summits <- 100
369 } else {
370 dist.summits <- as.numeric(vals.optional[3])
371 }
372
373 # read MACS files
374 unique.chr <- character()
375 infile.list <- list()
376 col.nms.keepers <- c("chr","start","end","summit","score")
377 for(i in 1:length(input.files)){
378 e <- expt.names[i]
379 if(toupper(input.type)=="MACS"){
380 tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F)
381 columns <- c("chr","start","end","summit","pvalue")
382 ix.cols <- numeric()
383 for(j in 1:length(columns)){
384 ix.j <- grep(columns[j],names(tmp))
385 if(length(ix.j)==0){
386 stop("can't find (grep) column ",columns[j]," in MACS input file ", e)
387 }
388 ix.cols <- c(ix.cols,ix.j)
389 }
390 infile.list[[e]] <- tmp[,ix.cols]
391 colnames(infile.list[[e]]) <- col.nms.keepers
392 }
393 if(toupper(input.type)=="BED"){
394 tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE)
395 tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column
396 colnames(tmp) <- col.nms.keepers
397 infile.list[[e]] <- tmp[,1:5]
398 }
399 unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"]))
400 }
401
402 unique.chr <- sort(unique.chr)
403
404 # take all macs files and put peaks together on each chromosome
405 # (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
406 cat("splitting input files per chromosome\n")
407 x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr)
408 cat("adding peaks from all input files into chromosome rulers\n")
409 ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x)
410 cat("add MTL membership to the ruler\n")
411
412 if(mtl.type=="interval"){
413 ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler)
414 } else {
415 ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits)
416 }
417
418 cat("creating MTL list\n")
419 ll <- create.cluster.list.no.pml(ruler=ruler)
420 cat("writing MTL table\n")
421 f.nm <- paste(sep="",path.output,"mtls",".xls")
422 print.ll.verbose.all(ll=ll,f.nm=f.nm)
423
424
425
426
427
428
429
430
431