Mercurial > repos > kmace > mtls_analysis
comparison mtls_analyze/cluster_peaks.R @ 0:a903c48f05b4
Added non-integrated scripts to galaxy
author | kmace |
---|---|
date | Thu, 22 Mar 2012 15:21:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a903c48f05b4 |
---|---|
1 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. | |
2 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ | |
3 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' | |
4 ## Feb 2012 th17 | |
5 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, | |
6 ## NYU - Center for Genomics and Systems Biology | |
7 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. | |
8 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ | |
9 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' | |
10 | |
11 rm(list=ls()) | |
12 debug=F | |
13 script.version=0.1 | |
14 print.error <- function(){ | |
15 cat(" | |
16 DESCRIPTIION: | |
17 cluster_peaks.R takes MACS tab delimited files as input and produces one tab delimeted file (named mtls.xls) where | |
18 each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS files) | |
19 fall within a certain distance between summits from eachother. | |
20 | |
21 INPUT: | |
22 1.path_input=path to MACS files '::' delim [path_input=f1::f2::f3::...::fk] | |
23 2.path_output=path to save generated MTL cluster file (where to save mtls.xls) | |
24 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] | |
25 4.dist.summits=maximum distance between summits belonging to the same MTL | |
26 5.n.autosome.chr=19 for mouse, 22 for human | |
27 | |
28 EXAMPLE RUN: | |
29 cluster_peaks.R | |
30 input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls | |
31 path_output=~/Desktop/ | |
32 expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 | |
33 dist_summits=100 | |
34 n_autosome_chr=19 | |
35 | |
36 Please cite us if you used this script: | |
37 The transcription factor network regulating Th17 lineage specification and function. | |
38 Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, | |
39 Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n") | |
40 } | |
41 | |
42 ############# helper functions: | |
43 ## split.macs.list.to.chrmosomes | |
44 # input: | |
45 # - macs.list: a list of macs expts: here are a few lines of one expt | |
46 ## chr start end length summit tags #NAME? fold_enrichment FDR(%) | |
47 ## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05 | |
48 ## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09 | |
49 ## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09 | |
50 # - expts: a list of the expts names from macs.list that you want to process | |
51 # - chr: chrmosomes names | |
52 # output: | |
53 # - x: a list with as many elements as chr specified by input. | |
54 # -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS) | |
55 split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){ | |
56 x <- list() | |
57 n <- length(expts) | |
58 for(i in 1:n){ | |
59 e <- expts[i] #experiment name | |
60 cat("wroking on expt", e,"\n") | |
61 x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]]) | |
62 names(x[[e]]) <- chr | |
63 } | |
64 return(x) | |
65 } | |
66 # a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m | |
67 # input: | |
68 # - r is chromosome | |
69 # - m is macs matrix for expt e from above function | |
70 split.one.macs.expt.by.chromosome.no.pml <- function(r,m){ | |
71 ix.chr.i <- which(m[,"chr"]==r) | |
72 # cat("working on",r,"\n") | |
73 o <- list() | |
74 o[[r]] <- m[ix.chr.i,] | |
75 o[[r]]$peak.id <- ix.chr.i | |
76 return(o[[r]]) | |
77 } | |
78 | |
79 ## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) | |
80 # also match these summit locations with corresponding: | |
81 # pvals, tfs, peak start and peak end trgts | |
82 # input: | |
83 # - chr: a list of chromosome names | |
84 # - macs.list.per.chrom: a list of macs peaks for each chromosome | |
85 # output: | |
86 # - o: a list each chormosome ruler as an element | |
87 make.ruler.no.pml <- function(chr,macs.list.per.chrom){ | |
88 x <- macs.list.per.chrom | |
89 o <- list() | |
90 for(j in 1:length(chr)){ | |
91 r <- chr[j] # chrmosome we go over | |
92 s <- numeric() | |
93 pval <- numeric() | |
94 tf.to.s <- character() | |
95 trgt.prox <- character() | |
96 trgt.dist <- character() | |
97 dtss.prox <- character() | |
98 dtss.dist <- character() | |
99 start <- numeric() | |
100 end <- numeric() | |
101 trtmnts <- names(x) # the treatments name (expt names) | |
102 ## debug parameters ### | |
103 ## which experiment peaks come from | |
104 expt <- character() | |
105 ## what was the peak id in that experiment | |
106 peak.ids <- numeric() | |
107 ## this will allow us to always back track from a cluster to the actual peaks in it | |
108 ## debug params end ### | |
109 for(i in 1:length(trtmnts)){ | |
110 o[[r]] <- list() | |
111 e <- trtmnts[i] #experiment name | |
112 tf <- strsplit(e,"_")[[1]][1] | |
113 s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"]) | |
114 pval <- c(pval,x[[e]][[r]][,7]) # the name of 7th column is X.10.log10.pvalue. this is pval | |
115 # tf.to.s <- c(tf.to.s,rep(tf,length(x[[e]][[r]][,7]))) # all summits belong to tf | |
116 start <- c(start,x[[e]][[r]][,"start"]) | |
117 end <- c(end,x[[e]][[r]][,"end"]) | |
118 expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"]))) | |
119 peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"]) | |
120 trgt.prox <- c(trgt.prox,x[[e]][[r]][,"trgt.prox"]) | |
121 trgt.dist <- c(trgt.dist,x[[e]][[r]][,"trgt.dist"]) | |
122 dtss.prox <- c(dtss.prox,x[[e]][[r]][,"dtss.prox"]) | |
123 dtss.dist <- c(dtss.dist,x[[e]][[r]][,"dtss.dist"]) | |
124 } | |
125 ix <- sort(s,index.return=TRUE)$ix | |
126 o[[r]] <- list(s=s[ix],pval=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix], | |
127 trgt.prox=trgt.prox[ix],trgt.dist=trgt.dist[ix],dtss.prox=dtss.prox[ix],dtss.dist=dtss.dist[ix]) | |
128 } | |
129 return(o) | |
130 } | |
131 | |
132 ## add cluster memberships based on ruler | |
133 ## require no more than d.cut distance between tf summits | |
134 ## cur.l is the current loci number (or cluster number) | |
135 assign.clusters.ids.to.peaks <- function(ruler,d.cut){ | |
136 cur.l <- 0 | |
137 for(j in 1:length(ruler)){ | |
138 s <- ruler[[j]][["s"]] | |
139 l <- numeric(length=length(s)) | |
140 if(length(l)>0){ | |
141 cur.l <- cur.l+1 | |
142 } | |
143 if(length(l)==1){ | |
144 l[1] <- cur.l | |
145 } else if(length(l)>1) { | |
146 l[1] <- cur.l | |
147 for(i in 2:length(l)){ | |
148 d <- s[i]-s[i-1] # assumes s is sorted increasingly | |
149 if(d>d.cut){ | |
150 cur.l <- cur.l+1 | |
151 } | |
152 l[i] <- cur.l | |
153 } | |
154 } | |
155 ruler[[j]][["l"]] <- l | |
156 } | |
157 return(ruler) | |
158 } | |
159 | |
160 ## here we create a list of TF enriched loci (clusters) | |
161 ## input: | |
162 ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) | |
163 ## output: | |
164 ## -ll: a list of clusters ll where each cluster (elem in ll holds): | |
165 ## - l: the current loci (elem in ll) | |
166 ## - s: summits vector as before | |
167 ## - pval: pvals vector matching the summits | |
168 ## - tfs: a vector of tfs matching s, the summits in loci l | |
169 ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak | |
170 ## - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream) | |
171 create.cluster.list.no.pml <- function(ruler){ | |
172 tmp <- list() | |
173 ll <- list() | |
174 for(j in 1:length(ruler)){ | |
175 r <- names(ruler)[j] | |
176 # cat("working on ",r,"\n") | |
177 x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] | |
178 l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique) | |
179 n <- length(l.vec) # iterate over n clusters | |
180 tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) | |
181 } | |
182 ## concatenate tmp into one list | |
183 command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") | |
184 ll=eval(parse(text=command)) | |
185 return(ll) | |
186 } | |
187 get.cluster.params.no.pml <- function(i,x,l.vec,r){ | |
188 ix <- which(x[["l"]]==l.vec[i]) | |
189 l <- l.vec[i] | |
190 start <- min(x[["start"]][ix]) | |
191 end <- max(x[["end"]][ix]) | |
192 s <- x[["s"]][ix] | |
193 # tf.to.s <- x[["tf.to.s"]][ix] | |
194 pval <- x[["pval"]][ix] | |
195 pval.mean <- mean(pval) | |
196 span.tfs <- x[["end"]][ix]-x[["start"]][ix] | |
197 span.l <- end-start | |
198 peak.ids <- x[["peak.ids"]][ix] | |
199 expt <- x[["expt"]][ix] | |
200 expt.alphanum.sorted <- sort(x[["expt"]][ix]) | |
201 trgt.prox <- unique(x[["trgt.prox"]][ix]) | |
202 trgt.dist <- unique(x[["trgt.dist"]][ix]) | |
203 dtss.prox <- unique(x[["dtss.prox"]][ix]) | |
204 dtss.dist <- unique(x[["dtss.dist"]][ix]) | |
205 chr <- rep(r,length(ix)) | |
206 return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, | |
207 end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs, | |
208 span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist, | |
209 dtss.prox=dtss.prox,dtss.dist=dtss.dist)) | |
210 } | |
211 | |
212 ## here we create a list of TF enriched loci (clusters) | |
213 ## input: | |
214 ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) | |
215 ## output: | |
216 ## -ll: a list of clusters ll where each cluster (elem in ll holds): | |
217 ## - l: the current loci (elem in ll) | |
218 ## - s: summits vector as before | |
219 ## - pval: pvals vector matching the summits | |
220 ## - tfs: a vector of tfs matching s, the summits in loci l | |
221 ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak | |
222 ## - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream) | |
223 create.cluster.list.no.pml <- function(ruler){ | |
224 tmp <- list() | |
225 ll <- list() | |
226 for(j in 1:length(ruler)){ | |
227 r <- names(ruler)[j] | |
228 # cat("working on ",r,"\n") | |
229 x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] | |
230 l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique) | |
231 n <- length(l.vec) # iterate over n clusters | |
232 tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) | |
233 } | |
234 ## concatenate tmp into one list | |
235 command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") | |
236 ll=eval(parse(text=command)) | |
237 return(ll) | |
238 } | |
239 get.cluster.params.no.pml <- function(i,x,l.vec,r){ | |
240 ix <- which(x[["l"]]==l.vec[i]) | |
241 l <- l.vec[i] | |
242 start <- min(x[["start"]][ix]) | |
243 end <- max(x[["end"]][ix]) | |
244 s <- x[["s"]][ix] | |
245 # tf.to.s <- x[["tf.to.s"]][ix] | |
246 pval <- x[["pval"]][ix] | |
247 pval.mean <- mean(pval) | |
248 span.tfs <- x[["end"]][ix]-x[["start"]][ix] | |
249 span.l <- end-start | |
250 peak.ids <- x[["peak.ids"]][ix] | |
251 expt <- x[["expt"]][ix] | |
252 expt.alphanum.sorted <- sort(x[["expt"]][ix]) | |
253 trgt.prox <- unique(x[["trgt.prox"]][ix]) | |
254 trgt.dist <- unique(x[["trgt.dist"]][ix]) | |
255 dtss.prox <- unique(x[["dtss.prox"]][ix]) | |
256 dtss.dist <- unique(x[["dtss.dist"]][ix]) | |
257 chr <- rep(r,length(ix)) | |
258 return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, | |
259 end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs, | |
260 span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist, | |
261 dtss.prox=dtss.prox,dtss.dist=dtss.dist)) | |
262 } | |
263 ## pretty print (tab deim) to file requested elements out of chip cluster list, ll. | |
264 ## input: | |
265 ## - ll: a cluster list | |
266 ## - f.nm: a file name (include path) to where you want files to print | |
267 ## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering) | |
268 ## output | |
269 ## - a tab delim file with clusers as rows and elems tab delim for each cluster | |
270 print.ll.verbose.all <- function(ll,f.nm="ll.xls"){ | |
271 options(digits=5) | |
272 cat(file=f.nm,names(ll[[1]]),sep="\t") | |
273 cat(file=f.nm,"\n",append=TRUE) | |
274 for(i in 1:length(ll)){ | |
275 line <- ll[[i]][[1]] # put MTL number | |
276 line <- paste(line,ll[[i]][[2]][1],sep="\t") | |
277 for(j in 3:length(ll[[i]])){ | |
278 val <- ll[[i]][[j]] | |
279 if(length(val) == 1){ | |
280 line <- paste(line,val,sep="\t") | |
281 } else { | |
282 line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t") | |
283 } | |
284 } | |
285 cat(file=f.nm,line,"\n",append=TRUE) | |
286 } | |
287 } | |
288 ############# Code: | |
289 # retrieve args | |
290 if(debug==T){ | |
291 cmd.args <- c( | |
292 "input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls", | |
293 "path_output=~/Desktop/", | |
294 "expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17", | |
295 "dist_summits=100", | |
296 "n_autosome_chr=19" | |
297 ) | |
298 } else { | |
299 cmd.args <- commandArgs(); | |
300 } | |
301 | |
302 if(length(grep("--version",cmd.args))){ | |
303 cat("version",script.version,"\n") | |
304 q() | |
305 } | |
306 | |
307 arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1]) | |
308 args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2]) | |
309 | |
310 arg.nms <- c("input_macs_files","path_output","expt_names","dist_summits","n_autosome_chr") | |
311 arg.val <- character(length=length(arg.nms)) | |
312 for(i in 1:length(arg.nms)){ | |
313 ix <- which(arg.names.cmd.line==arg.nms[i]) | |
314 if(length(ix)==1){ | |
315 arg.val[i] <- args.val.cmd.line[ix] | |
316 } else { | |
317 stop("######could not find ",arg.nms[i]," arg######\n\n",print.error()) | |
318 | |
319 } | |
320 } | |
321 if(debug==T){ | |
322 print(paste(arg.nms,"=",arg.val)) | |
323 } | |
324 # the files here adhere to tab delim format | |
325 input.macs.files <- strsplit(arg.val[1],"::")[[1]] | |
326 if(length(input.macs.files)==1){ | |
327 cat("only provided one MACS file to cluster.") | |
328 print.error() | |
329 } | |
330 path.output <- arg.val[2] | |
331 expt.names <- strsplit(arg.val[3],"::")[[1]] | |
332 dist.summits <- as.numeric(arg.val[4]) | |
333 n.autosome.chr <- as.numeric(arg.val[5]) | |
334 | |
335 # source("~/Documents/nyu/littmanLab/th17_used_for_paper/r_scripts/th17/used_for_paper/cluster_peaks_util.R") | |
336 chr <- c(paste(sep="","chr",1:n.autosome.chr),paste(sep="","chr",c("X","Y"))) | |
337 | |
338 # read MACS files | |
339 macs.list <- list() | |
340 for(i in 1:length(input.macs.files)){ | |
341 e <- expt.names[i] | |
342 macs.list[[ e ]] <- read.delim(file=input.macs.files[i]) | |
343 macs.list[[ e ]][,"chr"] <- as.character(macs.list[[ e ]][,"chr"]) | |
344 macs.list[[ e ]][,"trgt.prox"] <- as.character(macs.list[[ e ]][,"trgt.prox"]) | |
345 macs.list[[ e ]][,"trgt.dist"] <- as.character(macs.list[[ e ]][,"trgt.dist"]) | |
346 } | |
347 | |
348 # take all macs files and put peaks together on each chromosome | |
349 # (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls) | |
350 cat("splitting macs files per chromosome\n") | |
351 x <- split.macs.list.to.chrmosomes.no.pml(macs.list=macs.list,expts=expt.names,chr=chr) | |
352 cat("adding peaks from all macs files into chromosome rulers\n") | |
353 ruler <- make.ruler.no.pml(chr,macs.list.per.chrom=x) | |
354 cat("add MTL membership to the ruler\n") | |
355 ruler <- assign.clusters.ids.to.peaks(ruler,d.cut=dist.summits) | |
356 for(i in 1:length(ruler)){ | |
357 ix <- which(is.na(ruler[[i]][["dtss.prox"]])) | |
358 ruler[[i]][["dtss.prox"]][ix] <- "" | |
359 ix <- which(is.na(ruler[[i]][["dtss.dist"]])) | |
360 ruler[[i]][["dtss.dist"]][ix] <- "" | |
361 } | |
362 cat("creating MTL list\n") | |
363 ll <- create.cluster.list.no.pml(ruler=ruler) | |
364 cat("writing MTL table\n") | |
365 # f.nm <- paste(sep="",path.output,paste(expt.names,collapse="_"),"_MTLs.xls") | |
366 f.nm <- paste(sep="",path.output,"mtls",".xls") | |
367 print.ll.verbose.all(ll=ll,f.nm=f.nm) | |
368 | |
369 | |
370 | |
371 | |
372 | |
373 | |
374 | |
375 | |
376 |