comparison mtls_analyze/cluster_peaks.R @ 0:a903c48f05b4

Added non-integrated scripts to galaxy
author kmace
date Thu, 22 Mar 2012 15:21:00 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a903c48f05b4
1 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
2 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
3 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
4 ## Feb 2012 th17
5 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>,
6 ## NYU - Center for Genomics and Systems Biology
7 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
8 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
9 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
10
11 rm(list=ls())
12 debug=F
13 script.version=0.1
14 print.error <- function(){
15 cat("
16 DESCRIPTIION:
17 cluster_peaks.R takes MACS tab delimited files as input and produces one tab delimeted file (named mtls.xls) where
18 each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS files)
19 fall within a certain distance between summits from eachother.
20
21 INPUT:
22 1.path_input=path to MACS files '::' delim [path_input=f1::f2::f3::...::fk]
23 2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
24 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
25 4.dist.summits=maximum distance between summits belonging to the same MTL
26 5.n.autosome.chr=19 for mouse, 22 for human
27
28 EXAMPLE RUN:
29 cluster_peaks.R
30 input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls
31 path_output=~/Desktop/
32 expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
33 dist_summits=100
34 n_autosome_chr=19
35
36 Please cite us if you used this script:
37 The transcription factor network regulating Th17 lineage specification and function.
38 Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
39 Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
40 }
41
42 ############# helper functions:
43 ## split.macs.list.to.chrmosomes
44 # input:
45 # - macs.list: a list of macs expts: here are a few lines of one expt
46 ## chr start end length summit tags #NAME? fold_enrichment FDR(%)
47 ## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05
48 ## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09
49 ## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09
50 # - expts: a list of the expts names from macs.list that you want to process
51 # - chr: chrmosomes names
52 # output:
53 # - x: a list with as many elements as chr specified by input.
54 # -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
55 split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
56 x <- list()
57 n <- length(expts)
58 for(i in 1:n){
59 e <- expts[i] #experiment name
60 cat("wroking on expt", e,"\n")
61 x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
62 names(x[[e]]) <- chr
63 }
64 return(x)
65 }
66 # a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
67 # input:
68 # - r is chromosome
69 # - m is macs matrix for expt e from above function
70 split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
71 ix.chr.i <- which(m[,"chr"]==r)
72 # cat("working on",r,"\n")
73 o <- list()
74 o[[r]] <- m[ix.chr.i,]
75 o[[r]]$peak.id <- ix.chr.i
76 return(o[[r]])
77 }
78
79 ## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
80 # also match these summit locations with corresponding:
81 # pvals, tfs, peak start and peak end trgts
82 # input:
83 # - chr: a list of chromosome names
84 # - macs.list.per.chrom: a list of macs peaks for each chromosome
85 # output:
86 # - o: a list each chormosome ruler as an element
87 make.ruler.no.pml <- function(chr,macs.list.per.chrom){
88 x <- macs.list.per.chrom
89 o <- list()
90 for(j in 1:length(chr)){
91 r <- chr[j] # chrmosome we go over
92 s <- numeric()
93 pval <- numeric()
94 tf.to.s <- character()
95 trgt.prox <- character()
96 trgt.dist <- character()
97 dtss.prox <- character()
98 dtss.dist <- character()
99 start <- numeric()
100 end <- numeric()
101 trtmnts <- names(x) # the treatments name (expt names)
102 ## debug parameters ###
103 ## which experiment peaks come from
104 expt <- character()
105 ## what was the peak id in that experiment
106 peak.ids <- numeric()
107 ## this will allow us to always back track from a cluster to the actual peaks in it
108 ## debug params end ###
109 for(i in 1:length(trtmnts)){
110 o[[r]] <- list()
111 e <- trtmnts[i] #experiment name
112 tf <- strsplit(e,"_")[[1]][1]
113 s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
114 pval <- c(pval,x[[e]][[r]][,7]) # the name of 7th column is X.10.log10.pvalue. this is pval
115 # tf.to.s <- c(tf.to.s,rep(tf,length(x[[e]][[r]][,7]))) # all summits belong to tf
116 start <- c(start,x[[e]][[r]][,"start"])
117 end <- c(end,x[[e]][[r]][,"end"])
118 expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
119 peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])
120 trgt.prox <- c(trgt.prox,x[[e]][[r]][,"trgt.prox"])
121 trgt.dist <- c(trgt.dist,x[[e]][[r]][,"trgt.dist"])
122 dtss.prox <- c(dtss.prox,x[[e]][[r]][,"dtss.prox"])
123 dtss.dist <- c(dtss.dist,x[[e]][[r]][,"dtss.dist"])
124 }
125 ix <- sort(s,index.return=TRUE)$ix
126 o[[r]] <- list(s=s[ix],pval=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix],
127 trgt.prox=trgt.prox[ix],trgt.dist=trgt.dist[ix],dtss.prox=dtss.prox[ix],dtss.dist=dtss.dist[ix])
128 }
129 return(o)
130 }
131
132 ## add cluster memberships based on ruler
133 ## require no more than d.cut distance between tf summits
134 ## cur.l is the current loci number (or cluster number)
135 assign.clusters.ids.to.peaks <- function(ruler,d.cut){
136 cur.l <- 0
137 for(j in 1:length(ruler)){
138 s <- ruler[[j]][["s"]]
139 l <- numeric(length=length(s))
140 if(length(l)>0){
141 cur.l <- cur.l+1
142 }
143 if(length(l)==1){
144 l[1] <- cur.l
145 } else if(length(l)>1) {
146 l[1] <- cur.l
147 for(i in 2:length(l)){
148 d <- s[i]-s[i-1] # assumes s is sorted increasingly
149 if(d>d.cut){
150 cur.l <- cur.l+1
151 }
152 l[i] <- cur.l
153 }
154 }
155 ruler[[j]][["l"]] <- l
156 }
157 return(ruler)
158 }
159
160 ## here we create a list of TF enriched loci (clusters)
161 ## input:
162 ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
163 ## output:
164 ## -ll: a list of clusters ll where each cluster (elem in ll holds):
165 ## - l: the current loci (elem in ll)
166 ## - s: summits vector as before
167 ## - pval: pvals vector matching the summits
168 ## - tfs: a vector of tfs matching s, the summits in loci l
169 ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
170 ## - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream)
171 create.cluster.list.no.pml <- function(ruler){
172 tmp <- list()
173 ll <- list()
174 for(j in 1:length(ruler)){
175 r <- names(ruler)[j]
176 # cat("working on ",r,"\n")
177 x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
178 l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique)
179 n <- length(l.vec) # iterate over n clusters
180 tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
181 }
182 ## concatenate tmp into one list
183 command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
184 ll=eval(parse(text=command))
185 return(ll)
186 }
187 get.cluster.params.no.pml <- function(i,x,l.vec,r){
188 ix <- which(x[["l"]]==l.vec[i])
189 l <- l.vec[i]
190 start <- min(x[["start"]][ix])
191 end <- max(x[["end"]][ix])
192 s <- x[["s"]][ix]
193 # tf.to.s <- x[["tf.to.s"]][ix]
194 pval <- x[["pval"]][ix]
195 pval.mean <- mean(pval)
196 span.tfs <- x[["end"]][ix]-x[["start"]][ix]
197 span.l <- end-start
198 peak.ids <- x[["peak.ids"]][ix]
199 expt <- x[["expt"]][ix]
200 expt.alphanum.sorted <- sort(x[["expt"]][ix])
201 trgt.prox <- unique(x[["trgt.prox"]][ix])
202 trgt.dist <- unique(x[["trgt.dist"]][ix])
203 dtss.prox <- unique(x[["dtss.prox"]][ix])
204 dtss.dist <- unique(x[["dtss.dist"]][ix])
205 chr <- rep(r,length(ix))
206 return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
207 end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs,
208 span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist,
209 dtss.prox=dtss.prox,dtss.dist=dtss.dist))
210 }
211
212 ## here we create a list of TF enriched loci (clusters)
213 ## input:
214 ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
215 ## output:
216 ## -ll: a list of clusters ll where each cluster (elem in ll holds):
217 ## - l: the current loci (elem in ll)
218 ## - s: summits vector as before
219 ## - pval: pvals vector matching the summits
220 ## - tfs: a vector of tfs matching s, the summits in loci l
221 ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
222 ## - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream)
223 create.cluster.list.no.pml <- function(ruler){
224 tmp <- list()
225 ll <- list()
226 for(j in 1:length(ruler)){
227 r <- names(ruler)[j]
228 # cat("working on ",r,"\n")
229 x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
230 l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique)
231 n <- length(l.vec) # iterate over n clusters
232 tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
233 }
234 ## concatenate tmp into one list
235 command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
236 ll=eval(parse(text=command))
237 return(ll)
238 }
239 get.cluster.params.no.pml <- function(i,x,l.vec,r){
240 ix <- which(x[["l"]]==l.vec[i])
241 l <- l.vec[i]
242 start <- min(x[["start"]][ix])
243 end <- max(x[["end"]][ix])
244 s <- x[["s"]][ix]
245 # tf.to.s <- x[["tf.to.s"]][ix]
246 pval <- x[["pval"]][ix]
247 pval.mean <- mean(pval)
248 span.tfs <- x[["end"]][ix]-x[["start"]][ix]
249 span.l <- end-start
250 peak.ids <- x[["peak.ids"]][ix]
251 expt <- x[["expt"]][ix]
252 expt.alphanum.sorted <- sort(x[["expt"]][ix])
253 trgt.prox <- unique(x[["trgt.prox"]][ix])
254 trgt.dist <- unique(x[["trgt.dist"]][ix])
255 dtss.prox <- unique(x[["dtss.prox"]][ix])
256 dtss.dist <- unique(x[["dtss.dist"]][ix])
257 chr <- rep(r,length(ix))
258 return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
259 end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs,
260 span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist,
261 dtss.prox=dtss.prox,dtss.dist=dtss.dist))
262 }
263 ## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
264 ## input:
265 ## - ll: a cluster list
266 ## - f.nm: a file name (include path) to where you want files to print
267 ## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
268 ## output
269 ## - a tab delim file with clusers as rows and elems tab delim for each cluster
270 print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
271 options(digits=5)
272 cat(file=f.nm,names(ll[[1]]),sep="\t")
273 cat(file=f.nm,"\n",append=TRUE)
274 for(i in 1:length(ll)){
275 line <- ll[[i]][[1]] # put MTL number
276 line <- paste(line,ll[[i]][[2]][1],sep="\t")
277 for(j in 3:length(ll[[i]])){
278 val <- ll[[i]][[j]]
279 if(length(val) == 1){
280 line <- paste(line,val,sep="\t")
281 } else {
282 line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
283 }
284 }
285 cat(file=f.nm,line,"\n",append=TRUE)
286 }
287 }
288 ############# Code:
289 # retrieve args
290 if(debug==T){
291 cmd.args <- c(
292 "input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls",
293 "path_output=~/Desktop/",
294 "expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17",
295 "dist_summits=100",
296 "n_autosome_chr=19"
297 )
298 } else {
299 cmd.args <- commandArgs();
300 }
301
302 if(length(grep("--version",cmd.args))){
303 cat("version",script.version,"\n")
304 q()
305 }
306
307 arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
308 args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
309
310 arg.nms <- c("input_macs_files","path_output","expt_names","dist_summits","n_autosome_chr")
311 arg.val <- character(length=length(arg.nms))
312 for(i in 1:length(arg.nms)){
313 ix <- which(arg.names.cmd.line==arg.nms[i])
314 if(length(ix)==1){
315 arg.val[i] <- args.val.cmd.line[ix]
316 } else {
317 stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
318
319 }
320 }
321 if(debug==T){
322 print(paste(arg.nms,"=",arg.val))
323 }
324 # the files here adhere to tab delim format
325 input.macs.files <- strsplit(arg.val[1],"::")[[1]]
326 if(length(input.macs.files)==1){
327 cat("only provided one MACS file to cluster.")
328 print.error()
329 }
330 path.output <- arg.val[2]
331 expt.names <- strsplit(arg.val[3],"::")[[1]]
332 dist.summits <- as.numeric(arg.val[4])
333 n.autosome.chr <- as.numeric(arg.val[5])
334
335 # source("~/Documents/nyu/littmanLab/th17_used_for_paper/r_scripts/th17/used_for_paper/cluster_peaks_util.R")
336 chr <- c(paste(sep="","chr",1:n.autosome.chr),paste(sep="","chr",c("X","Y")))
337
338 # read MACS files
339 macs.list <- list()
340 for(i in 1:length(input.macs.files)){
341 e <- expt.names[i]
342 macs.list[[ e ]] <- read.delim(file=input.macs.files[i])
343 macs.list[[ e ]][,"chr"] <- as.character(macs.list[[ e ]][,"chr"])
344 macs.list[[ e ]][,"trgt.prox"] <- as.character(macs.list[[ e ]][,"trgt.prox"])
345 macs.list[[ e ]][,"trgt.dist"] <- as.character(macs.list[[ e ]][,"trgt.dist"])
346 }
347
348 # take all macs files and put peaks together on each chromosome
349 # (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
350 cat("splitting macs files per chromosome\n")
351 x <- split.macs.list.to.chrmosomes.no.pml(macs.list=macs.list,expts=expt.names,chr=chr)
352 cat("adding peaks from all macs files into chromosome rulers\n")
353 ruler <- make.ruler.no.pml(chr,macs.list.per.chrom=x)
354 cat("add MTL membership to the ruler\n")
355 ruler <- assign.clusters.ids.to.peaks(ruler,d.cut=dist.summits)
356 for(i in 1:length(ruler)){
357 ix <- which(is.na(ruler[[i]][["dtss.prox"]]))
358 ruler[[i]][["dtss.prox"]][ix] <- ""
359 ix <- which(is.na(ruler[[i]][["dtss.dist"]]))
360 ruler[[i]][["dtss.dist"]][ix] <- ""
361 }
362 cat("creating MTL list\n")
363 ll <- create.cluster.list.no.pml(ruler=ruler)
364 cat("writing MTL table\n")
365 # f.nm <- paste(sep="",path.output,paste(expt.names,collapse="_"),"_MTLs.xls")
366 f.nm <- paste(sep="",path.output,"mtls",".xls")
367 print.ll.verbose.all(ll=ll,f.nm=f.nm)
368
369
370
371
372
373
374
375
376