Mercurial > repos > kmace > mtls_analysis
comparison mtls_analyze/cluster_peaks.R @ 4:b465306d00ba draft default tip
Uploaded
| author | kmace |
|---|---|
| date | Mon, 23 Jul 2012 13:00:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:a0306edbf2f8 | 4:b465306d00ba |
|---|---|
| 1 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. | |
| 2 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ | |
| 3 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' | |
| 4 ## Feb 2012 th17 | |
| 5 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, | |
| 6 ## NYU - Center for Genomics and Systems Biology | |
| 7 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. | |
| 8 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ | |
| 9 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' | |
| 10 | |
| 11 rm(list=ls()) | |
| 12 debug=F | |
| 13 verbose=F | |
| 14 script.version=0.1 | |
| 15 print.error <- function(){ | |
| 16 cat(" | |
| 17 DESCRIPTIION: | |
| 18 cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where | |
| 19 each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) | |
| 20 fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits. | |
| 21 If you use .bed files, the files must contain no header, and at least the five columns: | |
| 22 1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared | |
| 23 interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits. | |
| 24 | |
| 25 INPUT: | |
| 26 1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] | |
| 27 2.path_output: path to save generated MTL cluster file (where to save mtls.xls) | |
| 28 3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] | |
| 29 4.input_type: the type of input file used (MACS or BED; defaults to MACS) | |
| 30 5.mtl_type: interval or summit (defaults to summit) | |
| 31 6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit) | |
| 32 | |
| 33 EXAMPLE RUN: | |
| 34 cluster_peaks.R | |
| 35 --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls | |
| 36 --input_type MACS | |
| 37 --path_output results/ | |
| 38 --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 | |
| 39 --dist_summits 100 | |
| 40 --mtl_type summit | |
| 41 | |
| 42 Please cite us if you used this script: | |
| 43 The transcription factor network regulating Th17 lineage specification and function. | |
| 44 Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, | |
| 45 Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n") | |
| 46 } | |
| 47 | |
| 48 ############# helper functions: | |
| 49 ## split.macs.list.to.chrmosomes | |
| 50 # input: | |
| 51 # - macs.list: a list of macs expts: here are a few lines of one expt | |
| 52 ## chr start end length summit tags #NAME? fold_enrichment FDR(%) | |
| 53 ## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05 | |
| 54 ## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09 | |
| 55 ## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09 | |
| 56 # - expts: a list of the expts names from macs.list that you want to process | |
| 57 # - chr: chrmosomes names | |
| 58 # output: | |
| 59 # - x: a list with as many elements as chr specified by input. | |
| 60 # -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS) | |
| 61 split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){ | |
| 62 x <- list() | |
| 63 n <- length(expts) | |
| 64 for(i in 1:n){ | |
| 65 e <- expts[i] #experiment name | |
| 66 cat("wroking on expt", e,"\n") | |
| 67 x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]]) | |
| 68 names(x[[e]]) <- chr | |
| 69 } | |
| 70 return(x) | |
| 71 } | |
| 72 # a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m | |
| 73 # input: | |
| 74 # - r is chromosome | |
| 75 # - m is macs matrix for expt e from above function | |
| 76 split.one.macs.expt.by.chromosome.no.pml <- function(r,m){ | |
| 77 ix.chr.i <- which(m[,"chr"]==r) | |
| 78 # cat("working on",r,"\n") | |
| 79 o <- list() | |
| 80 o[[r]] <- m[ix.chr.i,] | |
| 81 o[[r]]$peak.id <- ix.chr.i | |
| 82 return(o[[r]]) | |
| 83 } | |
| 84 | |
| 85 ## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) | |
| 86 # also match these summit locations with corresponding: | |
| 87 # pvals, tfs, peak start and peak end | |
| 88 # input: | |
| 89 # - chr: a list of chromosome names | |
| 90 # - macs.list.per.chrom: a list of macs peaks for each chromosome | |
| 91 # output: | |
| 92 # - o: a list each chormosome ruler as an element | |
| 93 make.ruler.no.pml <- function(chr,macs.list.per.chrom){ | |
| 94 x <- macs.list.per.chrom | |
| 95 o <- list() | |
| 96 for(j in 1:length(chr)){ | |
| 97 r <- chr[j] # chrmosome we go over | |
| 98 s <- numeric() | |
| 99 pval <- numeric() | |
| 100 tf.to.s <- character() | |
| 101 start <- numeric() | |
| 102 end <- numeric() | |
| 103 trtmnts <- names(x) # the treatments name (expt names) | |
| 104 ## debug parameters ### | |
| 105 ## which experiment peaks come from | |
| 106 expt <- character() | |
| 107 ## what was the peak id in that experiment | |
| 108 peak.ids <- numeric() | |
| 109 ## this will allow us to always back track from a cluster to the actual peaks in it | |
| 110 ## debug params end ### | |
| 111 for(i in 1:length(trtmnts)){ | |
| 112 o[[r]] <- list() | |
| 113 e <- trtmnts[i] #experiment name | |
| 114 tf <- strsplit(e,"_")[[1]][1] | |
| 115 s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"]) | |
| 116 pval <- c(pval,x[[e]][[r]][,"score"]) | |
| 117 start <- c(start,x[[e]][[r]][,"start"]) | |
| 118 end <- c(end,x[[e]][[r]][,"end"]) | |
| 119 expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"]))) | |
| 120 peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"]) | |
| 121 } | |
| 122 ix <- sort(s,index.return=TRUE)$ix | |
| 123 o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix]) | |
| 124 } | |
| 125 return(o) | |
| 126 } | |
| 127 | |
| 128 ## add cluster memberships based on ruler | |
| 129 ## require no more than d.cut distance between tf summits | |
| 130 ## cur.l is the current loci number (or cluster number) | |
| 131 assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){ | |
| 132 cur.l <- 0 | |
| 133 for(j in 1:length(ruler)){ | |
| 134 s <- ruler[[j]][["summit"]] | |
| 135 l <- numeric(length=length(s)) | |
| 136 if(length(l)>0){ | |
| 137 cur.l <- cur.l+1 | |
| 138 } | |
| 139 if(length(l)==1){ | |
| 140 l[1] <- cur.l | |
| 141 } else if(length(l)>1) { | |
| 142 l[1] <- cur.l | |
| 143 for(i in 2:length(l)){ | |
| 144 d <- s[i]-s[i-1] # assumes s is sorted increasingly | |
| 145 if(d>d.cut){ | |
| 146 cur.l <- cur.l+1 | |
| 147 } | |
| 148 l[i] <- cur.l | |
| 149 } | |
| 150 } | |
| 151 ruler[[j]][["mtl.id"]] <- l | |
| 152 } | |
| 153 return(ruler) | |
| 154 } | |
| 155 | |
| 156 ## add cluster memberships based on ruler | |
| 157 ## require that peaks span will have a non-empty intersection | |
| 158 ## cur.mtl is the current loci number (or cluster number) | |
| 159 assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){ | |
| 160 cur.mtl <- 0 | |
| 161 for(j in 1:length(ruler)){ | |
| 162 s <- ruler[[j]][["start"]] | |
| 163 e <- ruler[[j]][["end"]] | |
| 164 l <- numeric(length=length(s)) | |
| 165 if(length(l)>0){ | |
| 166 cur.mtl <- cur.mtl+1 | |
| 167 } | |
| 168 if(length(l)==1){ | |
| 169 l[1] <- cur.mtl | |
| 170 } else if(length(l)>1) { | |
| 171 l[1] <- cur.mtl | |
| 172 cur.e <- e[1] # the right-most bp belonging to the current mtl | |
| 173 for(i in 2:length(l)){ | |
| 174 if(cur.e<s[i]){ | |
| 175 cur.mtl <- cur.mtl+1 | |
| 176 } | |
| 177 l[i] <- cur.mtl | |
| 178 cur.e <- max(cur.e,e[i]) | |
| 179 } | |
| 180 } | |
| 181 ruler[[j]][["mtl.id"]] <- l | |
| 182 } | |
| 183 return(ruler) | |
| 184 } | |
| 185 | |
| 186 ## here we create a list of TF enriched loci (clusters) | |
| 187 ## input: | |
| 188 ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) | |
| 189 ## output: | |
| 190 ## -ll: a list of clusters ll where each cluster (elem in ll holds): | |
| 191 ## - mtl.id: the current loci (elem in ll) | |
| 192 ## - s: summits vector as before | |
| 193 ## - pval: pvals vector matching the summits | |
| 194 ## - tfs: a vector of tfs matching s, the summits in loci l | |
| 195 ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak | |
| 196 create.cluster.list.no.pml <- function(ruler){ | |
| 197 tmp <- list() | |
| 198 ll <- list() | |
| 199 for(j in 1:length(ruler)){ | |
| 200 r <- names(ruler)[j] | |
| 201 # cat("working on ",r,"\n") | |
| 202 x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] | |
| 203 l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique) | |
| 204 n <- length(l.vec) # iterate over n clusters | |
| 205 tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) | |
| 206 } | |
| 207 ## concatenate tmp into one list | |
| 208 command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") | |
| 209 ll=eval(parse(text=command)) | |
| 210 return(ll) | |
| 211 } | |
| 212 get.cluster.params.no.pml <- function(i,x,l.vec,r){ | |
| 213 # ix <- which(x[["l"]]==l.vec[i]) | |
| 214 # l <- l.vec[i] | |
| 215 ix <- which(x[["mtl.id"]]==l.vec[i]) | |
| 216 l <- l.vec[i] | |
| 217 start <- min(x[["start"]][ix]) | |
| 218 end <- max(x[["end"]][ix]) | |
| 219 # s <- x[["s"]][ix] | |
| 220 summit <- x[["summit"]][ix] | |
| 221 # pval <- x[["pval"]][ix] | |
| 222 score <- x[["score"]][ix] | |
| 223 # pval.mean <- mean(pval) | |
| 224 score.mean <- mean(score) | |
| 225 span.tfs <- x[["end"]][ix]-x[["start"]][ix] | |
| 226 span.l <- end-start | |
| 227 peak.ids <- x[["peak.ids"]][ix] | |
| 228 expt <- x[["expt"]][ix] | |
| 229 expt.alphanum.sorted <- sort(x[["expt"]][ix]) | |
| 230 | |
| 231 chr <- rep(r,length(ix)) | |
| 232 return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, | |
| 233 end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs, | |
| 234 span.l=span.l,peak.ids=peak.ids)) | |
| 235 } | |
| 236 | |
| 237 ## pretty print (tab deim) to file requested elements out of chip cluster list, ll. | |
| 238 ## input: | |
| 239 ## - ll: a cluster list | |
| 240 ## - f.nm: a file name (include path) to where you want files to print | |
| 241 ## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering) | |
| 242 ## output | |
| 243 ## - a tab delim file with clusers as rows and elems tab delim for each cluster | |
| 244 print.ll.verbose.all <- function(ll,f.nm="ll.xls"){ | |
| 245 options(digits=5) | |
| 246 cat(file=f.nm,names(ll[[1]]),sep="\t") | |
| 247 cat(file=f.nm,"\n",append=TRUE) | |
| 248 for(i in 1:length(ll)){ | |
| 249 line <- ll[[i]][[1]] # put MTL number | |
| 250 line <- paste(line,ll[[i]][[2]][1],sep="\t") | |
| 251 for(j in 3:length(ll[[i]])){ | |
| 252 val <- ll[[i]][[j]] | |
| 253 if(length(val) == 1){ | |
| 254 line <- paste(line,val,sep="\t") | |
| 255 } else { | |
| 256 line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t") | |
| 257 } | |
| 258 } | |
| 259 cat(file=f.nm,line,"\n",append=TRUE) | |
| 260 } | |
| 261 } | |
| 262 # read command line paramters that are not optional | |
| 263 read.cmd.line.params.must <- function(args.nms, cmd.line.args){ | |
| 264 if(length(grep("--version",cmd.line.args))){ | |
| 265 cat("version",script.version,"\n") | |
| 266 q() | |
| 267 } | |
| 268 args <- sapply(strsplit(cmd.line.args," "),function(i) i) | |
| 269 vals <- character(length(args.nms)) | |
| 270 # split cmd.line to key and value pairs | |
| 271 for(i in 1:length(args.nms)){ | |
| 272 ix <- grep(args.nms[i],args) | |
| 273 if(length(ix)>1){ | |
| 274 stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) | |
| 275 } else if (length(ix)==0){ | |
| 276 stop("could not find ",args.nms[i],". Bailing out...\n",print.error()) | |
| 277 } else { | |
| 278 vals[i] <- args[ix+1] | |
| 279 } | |
| 280 } | |
| 281 return(vals) | |
| 282 } | |
| 283 # read command line paramters that are optional, if an optional param is not give functions return na for this param | |
| 284 read.cmd.line.params.optional <- function(args.nms, cmd.line.args){ | |
| 285 args <- sapply(strsplit(cmd.line.args," "),function(i) i) | |
| 286 vals <- character(length(args.nms)) | |
| 287 # split cmd.line to key and value pairs | |
| 288 for(i in 1:length(args.nms)){ | |
| 289 ix <- grep(args.nms[i],args) | |
| 290 if(length(ix)>1){ | |
| 291 stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) | |
| 292 } else if (length(ix)==0){ # if --param was not written in cmd line | |
| 293 vals[i] <- NA | |
| 294 } else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value | |
| 295 vals[i] <- args[ix+1] | |
| 296 } else { # otherwise | |
| 297 vals[i] <- NA | |
| 298 } | |
| 299 } | |
| 300 return(vals) | |
| 301 } | |
| 302 | |
| 303 ############# Code: | |
| 304 # retrieve args | |
| 305 | |
| 306 if(debug==T){ | |
| 307 cmd.args <- c( | |
| 308 "--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls", | |
| 309 #"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed", | |
| 310 "--input_type MACS", #BED | |
| 311 "--path_output ./results/", | |
| 312 "--expt_names macs1::macs2::macs3", | |
| 313 #"--expt_names bed1::bed2::bed3", | |
| 314 "--mtl_type interval", #interval summit | |
| 315 "--dist_summits 100" | |
| 316 ) | |
| 317 } else { | |
| 318 cmd.args <- commandArgs(trailingOnly = T); | |
| 319 } | |
| 320 | |
| 321 # if(length(grep("--version",cmd.args))){ | |
| 322 # cat("version",script.version,"\n") | |
| 323 # q() | |
| 324 # } | |
| 325 args.nms.must <- c( | |
| 326 "--input_files", #1 | |
| 327 "--path_output", #2 | |
| 328 "--expt_names" #3 | |
| 329 ) | |
| 330 | |
| 331 # all numeric params must come at the end of args.nms.optional | |
| 332 n.start.numeric <- 3 | |
| 333 args.nms.optional <- c( | |
| 334 "--input_type", #1 | |
| 335 "--mtl_type", #2 | |
| 336 "--dist_summits" #3 | |
| 337 ) | |
| 338 | |
| 339 # get must parameters | |
| 340 vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args) | |
| 341 if(verbose){ | |
| 342 cat("\nfinshed reading vals: \n") | |
| 343 cat("\nvals.must", unlist(vals.must), "\n") | |
| 344 } | |
| 345 input.files <- strsplit(vals.must[1],"::")[[1]] | |
| 346 if(length(input.files)==1){ | |
| 347 cat("only provided one MACS file to cluster.") | |
| 348 print.error() | |
| 349 } | |
| 350 path.output <- vals.must[2] | |
| 351 expt.names <- strsplit(vals.must[3],"::")[[1]] | |
| 352 # get optional parameters | |
| 353 vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args) | |
| 354 if(verbose){ | |
| 355 cat("\nvals.optional:", unlist(vals.optional),"\n") | |
| 356 } | |
| 357 if(is.na(vals.optional[1])){ | |
| 358 input.type <- "MACS" | |
| 359 } else { | |
| 360 input.type <- vals.optional[1] | |
| 361 } | |
| 362 if(is.na(vals.optional[2])){ | |
| 363 mtl.type <- "interval" | |
| 364 } else { | |
| 365 mtl.type <- vals.optional[2] | |
| 366 } | |
| 367 if(is.na(vals.optional[2])){ | |
| 368 dist.summits <- 100 | |
| 369 } else { | |
| 370 dist.summits <- as.numeric(vals.optional[3]) | |
| 371 } | |
| 372 | |
| 373 # read MACS files | |
| 374 unique.chr <- character() | |
| 375 infile.list <- list() | |
| 376 col.nms.keepers <- c("chr","start","end","summit","score") | |
| 377 for(i in 1:length(input.files)){ | |
| 378 e <- expt.names[i] | |
| 379 if(toupper(input.type)=="MACS"){ | |
| 380 tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F) | |
| 381 columns <- c("chr","start","end","summit","pvalue") | |
| 382 ix.cols <- numeric() | |
| 383 for(j in 1:length(columns)){ | |
| 384 ix.j <- grep(columns[j],names(tmp)) | |
| 385 if(length(ix.j)==0){ | |
| 386 stop("can't find (grep) column ",columns[j]," in MACS input file ", e) | |
| 387 } | |
| 388 ix.cols <- c(ix.cols,ix.j) | |
| 389 } | |
| 390 infile.list[[e]] <- tmp[,ix.cols] | |
| 391 colnames(infile.list[[e]]) <- col.nms.keepers | |
| 392 } | |
| 393 if(toupper(input.type)=="BED"){ | |
| 394 tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE) | |
| 395 tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column | |
| 396 colnames(tmp) <- col.nms.keepers | |
| 397 infile.list[[e]] <- tmp[,1:5] | |
| 398 } | |
| 399 unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"])) | |
| 400 } | |
| 401 | |
| 402 unique.chr <- sort(unique.chr) | |
| 403 | |
| 404 # take all macs files and put peaks together on each chromosome | |
| 405 # (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls) | |
| 406 cat("splitting input files per chromosome\n") | |
| 407 x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr) | |
| 408 cat("adding peaks from all input files into chromosome rulers\n") | |
| 409 ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x) | |
| 410 cat("add MTL membership to the ruler\n") | |
| 411 | |
| 412 if(mtl.type=="interval"){ | |
| 413 ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler) | |
| 414 } else { | |
| 415 ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits) | |
| 416 } | |
| 417 | |
| 418 cat("creating MTL list\n") | |
| 419 ll <- create.cluster.list.no.pml(ruler=ruler) | |
| 420 cat("writing MTL table\n") | |
| 421 f.nm <- paste(sep="",path.output,"mtls",".xls") | |
| 422 print.ll.verbose.all(ll=ll,f.nm=f.nm) | |
| 423 | |
| 424 | |
| 425 | |
| 426 | |
| 427 | |
| 428 | |
| 429 | |
| 430 | |
| 431 |
