# HG changeset patch # User kmace # Date 1332444060 14400 # Node ID a903c48f05b4a306e39150e3be374f5f5422b1f0 Added non-integrated scripts to galaxy diff -r 000000000000 -r a903c48f05b4 mtls_analyze/cluster_peaks.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/cluster_peaks.R Thu Mar 22 15:21:00 2012 -0400 @@ -0,0 +1,376 @@ +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' +## Feb 2012 th17 +## Bonneau lab - "Aviv Madar" , +## NYU - Center for Genomics and Systems Biology +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' + +rm(list=ls()) +debug=F +script.version=0.1 +print.error <- function(){ + cat(" +DESCRIPTIION: + cluster_peaks.R takes MACS tab delimited files as input and produces one tab delimeted file (named mtls.xls) where + each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS files) + fall within a certain distance between summits from eachother. + +INPUT: + 1.path_input=path to MACS files '::' delim [path_input=f1::f2::f3::...::fk] + 2.path_output=path to save generated MTL cluster file (where to save mtls.xls) + 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] + 4.dist.summits=maximum distance between summits belonging to the same MTL + 5.n.autosome.chr=19 for mouse, 22 for human + +EXAMPLE RUN: + cluster_peaks.R + input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls + path_output=~/Desktop/ + expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 + dist_summits=100 + n_autosome_chr=19 + +Please cite us if you used this script: + The transcription factor network regulating Th17 lineage specification and function. + Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, + Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n") +} + +############# helper functions: +## split.macs.list.to.chrmosomes +# input: +# - macs.list: a list of macs expts: here are a few lines of one expt +## chr start end length summit tags #NAME? fold_enrichment FDR(%) +## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05 +## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09 +## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09 +# - expts: a list of the expts names from macs.list that you want to process +# - chr: chrmosomes names +# output: +# - x: a list with as many elements as chr specified by input. +# -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS) +split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){ + x <- list() + n <- length(expts) + for(i in 1:n){ + e <- expts[i] #experiment name + cat("wroking on expt", e,"\n") + x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]]) + names(x[[e]]) <- chr + } + return(x) +} +# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m +# input: +# - r is chromosome +# - m is macs matrix for expt e from above function +split.one.macs.expt.by.chromosome.no.pml <- function(r,m){ + ix.chr.i <- which(m[,"chr"]==r) + # cat("working on",r,"\n") + o <- list() + o[[r]] <- m[ix.chr.i,] + o[[r]]$peak.id <- ix.chr.i + return(o[[r]]) +} + +## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) +# also match these summit locations with corresponding: +# pvals, tfs, peak start and peak end trgts +# input: +# - chr: a list of chromosome names +# - macs.list.per.chrom: a list of macs peaks for each chromosome +# output: +# - o: a list each chormosome ruler as an element +make.ruler.no.pml <- function(chr,macs.list.per.chrom){ + x <- macs.list.per.chrom + o <- list() + for(j in 1:length(chr)){ + r <- chr[j] # chrmosome we go over + s <- numeric() + pval <- numeric() + tf.to.s <- character() + trgt.prox <- character() + trgt.dist <- character() + dtss.prox <- character() + dtss.dist <- character() + start <- numeric() + end <- numeric() + trtmnts <- names(x) # the treatments name (expt names) + ## debug parameters ### + ## which experiment peaks come from + expt <- character() + ## what was the peak id in that experiment + peak.ids <- numeric() + ## this will allow us to always back track from a cluster to the actual peaks in it + ## debug params end ### + for(i in 1:length(trtmnts)){ + o[[r]] <- list() + e <- trtmnts[i] #experiment name + tf <- strsplit(e,"_")[[1]][1] + s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"]) + pval <- c(pval,x[[e]][[r]][,7]) # the name of 7th column is X.10.log10.pvalue. this is pval + # tf.to.s <- c(tf.to.s,rep(tf,length(x[[e]][[r]][,7]))) # all summits belong to tf + start <- c(start,x[[e]][[r]][,"start"]) + end <- c(end,x[[e]][[r]][,"end"]) + expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"]))) + peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"]) + trgt.prox <- c(trgt.prox,x[[e]][[r]][,"trgt.prox"]) + trgt.dist <- c(trgt.dist,x[[e]][[r]][,"trgt.dist"]) + dtss.prox <- c(dtss.prox,x[[e]][[r]][,"dtss.prox"]) + dtss.dist <- c(dtss.dist,x[[e]][[r]][,"dtss.dist"]) + } + ix <- sort(s,index.return=TRUE)$ix + o[[r]] <- list(s=s[ix],pval=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix], + trgt.prox=trgt.prox[ix],trgt.dist=trgt.dist[ix],dtss.prox=dtss.prox[ix],dtss.dist=dtss.dist[ix]) + } + return(o) +} + +## add cluster memberships based on ruler +## require no more than d.cut distance between tf summits +## cur.l is the current loci number (or cluster number) +assign.clusters.ids.to.peaks <- function(ruler,d.cut){ + cur.l <- 0 + for(j in 1:length(ruler)){ + s <- ruler[[j]][["s"]] + l <- numeric(length=length(s)) + if(length(l)>0){ + cur.l <- cur.l+1 + } + if(length(l)==1){ + l[1] <- cur.l + } else if(length(l)>1) { + l[1] <- cur.l + for(i in 2:length(l)){ + d <- s[i]-s[i-1] # assumes s is sorted increasingly + if(d>d.cut){ + cur.l <- cur.l+1 + } + l[i] <- cur.l + } + } + ruler[[j]][["l"]] <- l + } + return(ruler) +} + +## here we create a list of TF enriched loci (clusters) +## input: +## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) +## output: +## -ll: a list of clusters ll where each cluster (elem in ll holds): +## - l: the current loci (elem in ll) +## - s: summits vector as before +## - pval: pvals vector matching the summits +## - tfs: a vector of tfs matching s, the summits in loci l +## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak +## - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream) +create.cluster.list.no.pml <- function(ruler){ + tmp <- list() + ll <- list() + for(j in 1:length(ruler)){ + r <- names(ruler)[j] + # cat("working on ",r,"\n") + x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] + l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique) + n <- length(l.vec) # iterate over n clusters + tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) + } + ## concatenate tmp into one list + command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") + ll=eval(parse(text=command)) + return(ll) +} +get.cluster.params.no.pml <- function(i,x,l.vec,r){ + ix <- which(x[["l"]]==l.vec[i]) + l <- l.vec[i] + start <- min(x[["start"]][ix]) + end <- max(x[["end"]][ix]) + s <- x[["s"]][ix] + # tf.to.s <- x[["tf.to.s"]][ix] + pval <- x[["pval"]][ix] + pval.mean <- mean(pval) + span.tfs <- x[["end"]][ix]-x[["start"]][ix] + span.l <- end-start + peak.ids <- x[["peak.ids"]][ix] + expt <- x[["expt"]][ix] + expt.alphanum.sorted <- sort(x[["expt"]][ix]) + trgt.prox <- unique(x[["trgt.prox"]][ix]) + trgt.dist <- unique(x[["trgt.dist"]][ix]) + dtss.prox <- unique(x[["dtss.prox"]][ix]) + dtss.dist <- unique(x[["dtss.dist"]][ix]) + chr <- rep(r,length(ix)) + return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, + end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs, + span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist, + dtss.prox=dtss.prox,dtss.dist=dtss.dist)) +} + +## here we create a list of TF enriched loci (clusters) +## input: +## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) +## output: +## -ll: a list of clusters ll where each cluster (elem in ll holds): +## - l: the current loci (elem in ll) +## - s: summits vector as before +## - pval: pvals vector matching the summits +## - tfs: a vector of tfs matching s, the summits in loci l +## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak +## - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream) +create.cluster.list.no.pml <- function(ruler){ + tmp <- list() + ll <- list() + for(j in 1:length(ruler)){ + r <- names(ruler)[j] + # cat("working on ",r,"\n") + x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]] + l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique) + n <- length(l.vec) # iterate over n clusters + tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r) + } + ## concatenate tmp into one list + command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")") + ll=eval(parse(text=command)) + return(ll) +} +get.cluster.params.no.pml <- function(i,x,l.vec,r){ + ix <- which(x[["l"]]==l.vec[i]) + l <- l.vec[i] + start <- min(x[["start"]][ix]) + end <- max(x[["end"]][ix]) + s <- x[["s"]][ix] + # tf.to.s <- x[["tf.to.s"]][ix] + pval <- x[["pval"]][ix] + pval.mean <- mean(pval) + span.tfs <- x[["end"]][ix]-x[["start"]][ix] + span.l <- end-start + peak.ids <- x[["peak.ids"]][ix] + expt <- x[["expt"]][ix] + expt.alphanum.sorted <- sort(x[["expt"]][ix]) + trgt.prox <- unique(x[["trgt.prox"]][ix]) + trgt.dist <- unique(x[["trgt.dist"]][ix]) + dtss.prox <- unique(x[["dtss.prox"]][ix]) + dtss.dist <- unique(x[["dtss.dist"]][ix]) + chr <- rep(r,length(ix)) + return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start, + end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs, + span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist, + dtss.prox=dtss.prox,dtss.dist=dtss.dist)) +} +## pretty print (tab deim) to file requested elements out of chip cluster list, ll. +## input: +## - ll: a cluster list +## - f.nm: a file name (include path) to where you want files to print +## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering) +## output +## - a tab delim file with clusers as rows and elems tab delim for each cluster +print.ll.verbose.all <- function(ll,f.nm="ll.xls"){ + options(digits=5) + cat(file=f.nm,names(ll[[1]]),sep="\t") + cat(file=f.nm,"\n",append=TRUE) + for(i in 1:length(ll)){ + line <- ll[[i]][[1]] # put MTL number + line <- paste(line,ll[[i]][[2]][1],sep="\t") + for(j in 3:length(ll[[i]])){ + val <- ll[[i]][[j]] + if(length(val) == 1){ + line <- paste(line,val,sep="\t") + } else { + line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t") + } + } + cat(file=f.nm,line,"\n",append=TRUE) + } +} +############# Code: +# retrieve args +if(debug==T){ + cmd.args <- c( + "input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls", + "path_output=~/Desktop/", + "expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17", + "dist_summits=100", + "n_autosome_chr=19" + ) +} else { + cmd.args <- commandArgs(); +} + +if(length(grep("--version",cmd.args))){ + cat("version",script.version,"\n") + q() +} + +arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1]) +args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2]) + +arg.nms <- c("input_macs_files","path_output","expt_names","dist_summits","n_autosome_chr") +arg.val <- character(length=length(arg.nms)) +for(i in 1:length(arg.nms)){ + ix <- which(arg.names.cmd.line==arg.nms[i]) + if(length(ix)==1){ + arg.val[i] <- args.val.cmd.line[ix] + } else { + stop("######could not find ",arg.nms[i]," arg######\n\n",print.error()) + + } +} +if(debug==T){ + print(paste(arg.nms,"=",arg.val)) +} +# the files here adhere to tab delim format +input.macs.files <- strsplit(arg.val[1],"::")[[1]] +if(length(input.macs.files)==1){ + cat("only provided one MACS file to cluster.") + print.error() +} +path.output <- arg.val[2] +expt.names <- strsplit(arg.val[3],"::")[[1]] +dist.summits <- as.numeric(arg.val[4]) +n.autosome.chr <- as.numeric(arg.val[5]) + +# source("~/Documents/nyu/littmanLab/th17_used_for_paper/r_scripts/th17/used_for_paper/cluster_peaks_util.R") +chr <- c(paste(sep="","chr",1:n.autosome.chr),paste(sep="","chr",c("X","Y"))) + +# read MACS files +macs.list <- list() +for(i in 1:length(input.macs.files)){ + e <- expt.names[i] + macs.list[[ e ]] <- read.delim(file=input.macs.files[i]) + macs.list[[ e ]][,"chr"] <- as.character(macs.list[[ e ]][,"chr"]) + macs.list[[ e ]][,"trgt.prox"] <- as.character(macs.list[[ e ]][,"trgt.prox"]) + macs.list[[ e ]][,"trgt.dist"] <- as.character(macs.list[[ e ]][,"trgt.dist"]) +} + +# take all macs files and put peaks together on each chromosome +# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls) +cat("splitting macs files per chromosome\n") +x <- split.macs.list.to.chrmosomes.no.pml(macs.list=macs.list,expts=expt.names,chr=chr) +cat("adding peaks from all macs files into chromosome rulers\n") +ruler <- make.ruler.no.pml(chr,macs.list.per.chrom=x) +cat("add MTL membership to the ruler\n") +ruler <- assign.clusters.ids.to.peaks(ruler,d.cut=dist.summits) +for(i in 1:length(ruler)){ + ix <- which(is.na(ruler[[i]][["dtss.prox"]])) + ruler[[i]][["dtss.prox"]][ix] <- "" + ix <- which(is.na(ruler[[i]][["dtss.dist"]])) + ruler[[i]][["dtss.dist"]][ix] <- "" +} +cat("creating MTL list\n") +ll <- create.cluster.list.no.pml(ruler=ruler) +cat("writing MTL table\n") +# f.nm <- paste(sep="",path.output,paste(expt.names,collapse="_"),"_MTLs.xls") +f.nm <- paste(sep="",path.output,"mtls",".xls") +print.ll.verbose.all(ll=ll,f.nm=f.nm) + + + + + + + + + diff -r 000000000000 -r a903c48f05b4 mtls_analyze/heatmap.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/heatmap.R Thu Mar 22 15:21:00 2012 -0400 @@ -0,0 +1,555 @@ +rm(list = ls()) + +GeneratePeakMatrix <- function(experiments, scores) { + # Generates a score matrix for all mtls. + # + # Args: + # experiments: A list of underscore deliminated experiments for each mtl. + # There should never be a completely empty item in this list. + # for eg. experiments[1] = expA_expD_expB. + # scores: A list of underscore deliminated scores for each mtl, the length + # of these scores should be identicle to the length of the + # experiments. eg. scores[1] = 55_33_245. + # + # Returns: + # The peak score matrix for all mtls. + experiments <- sapply(experiments, function(x) strsplit(x, split = "_")) + scores <- sapply(scores, function(x) strsplit(x, split = "_")) + unique.experiments <- unique(unlist(experiments)) + + peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments)) + colnames(peaks) <- unique.experiments + + for(i in 1:length(experiments)){ + for(j in 1:length(experiments[[i]])){ + peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j]) + } + } + return(peaks) +################################################################## +} +GetChipData <- function(file.name, + proximity = "distal", + include.targetless = TRUE) { + # Reads in, filters, and organizes mtls data + # + # Args: + # file.name: The path to the mtls file (including the file name). + # proximity: Either "distal" or "proximal", defines the gene target distance + # from the mtl. Default is "distal". + # include.targetless: If TRUE, includes mtls with no targets (applied after + # the proximity filter); if not, mtls with no target + # will be exclided. Default is TRUE. + # + # Returns: + # An organized list of mtls data with the following elements: + # $peaks - a matrix of peak p-values + # $targets - a list of underscore deliminated gene targets for each mtl + if(param$debug) { + print("In GetChipData") + } + # Set Constants for the mtls file type: + MTLNUM <- "l" + CHROMOSOME <- "chr" + EXPERIMENTS <- "expt" + EXPERIMENTS.SORTED <- "expt.alphanum.sorted" + START <- "start" + END <- "end" + SUMMIT <- "s" + PVALS <- "pval" + PVAL.MEAN <- "pval.mean" + SPANS <- "span.tfs" + SPAN.TOTAL <- "span.l" + PEAK.IDS <- "peak.ids" # TODO(kieran): Find out what this is + PROXIMAL.TARGETS <- "trgt.prox" + DISTAL.TARGETS <- "trgt.dist" + PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox" + DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist" + + + #Get chip data in from files: + TARGETS <- switch(proximity, + distal = DISTAL.TARGETS, + proximal = PROXIMAL.TARGETS, + stop(" Bad proximity argument supplied.")) # Default + + keep.columns <- c(EXPERIMENTS, PVALS, TARGETS) + + file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] + + if (!include.targetless) { + ix.has.trgt <- which(as.character(file[, TARGETS])!="") + file <- file[ix.has.trgt, ] + } + + chip = list() + + chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS]) + chip$targets = file[ , TARGETS] + return (chip) +} + + + +GenerateKMOrder <- function(data, km) { + # generates the order of clusters from high to low mean values + # + # Args: + # data: A matrix of scores that have already been clustered. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. Must + # have the same length as the number of rows in data + # + # Returns: + # cluster.order: The order in which the clusters should be displayed. + + km.cluster = km$cluster + clusters = unique(km.cluster) + clusters.avg = numeric() + for(i in clusters) { + clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ])) + } + if(param$debug) { + print ("straight clusters") + print (clusters) + print ("straigth average") + print (clusters.avg) + print ("ordered clusters") + print (clusters[order(clusters.avg)]) + print("ordered average") + print (clusters.avg[order(clusters.avg)]) + } + return(clusters[rev(order(clusters.avg))]) +} + +OrderMTL <- function(data, km, cluster.order) { + # Orders a matrix of data according to a clustering algorithm + # + # Args: + # data: A matrix of scores that have already been clustered. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. Must + # have the same length as the number of rows in data + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # + # Returns: + # a list that contains 3 objects: + # list$data: the ordered version of the data. + # list$color.vector: a list of colors that should be assigned to each row. + # list$start.row: the starting row of each cluster in data. + number.clusters <- length(cluster.order) + cluster.colors <- sample(rainbow(number.clusters)) + + # Set up return objects + sorted.data <- matrix(,nr=nrow(data), nc=ncol(data)) + colnames(sorted.data) <- colnames(data) + cluster.color.vector = vector(length=length(km$cluster)) + cluster.start.row = numeric(number.clusters) + cluster.start.row[1]=1 + + for (i in 1:number.clusters) + { + current.cluster = cluster.order[i] + ix = which(km$cluster == current.cluster) + current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1) + sorted.data[current.cluster.range, ] = data[ix, ] + cluster.color.vector[current.cluster.range] = cluster.colors[i] + cluster.start.row[i+1] = (cluster.start.row[i]+length(ix)) + } + ret.list = list() + ret.list$data = sorted.data + ret.list$color.vector = cluster.color.vector + ret.list$cluster.index = cluster.start.row + return(ret.list) +} + + +CreateHeatMap <- function(data, + km, + cluster.order, + document.name, + document.type = "png", + number.colors = 30) { + # Generates a heatmap image based for a matrix based on a clustering + # algorithm. + # + # Args: + # data: A matrix of scores that have already been clustered, the column + # names of this matrix will become the column titels of the heatmap. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # document.name: A name for the produced file. there is no need to + # supply the .png/.pdf in your argument. + # document.type: The type of file you want to produce. current options are + # png and pdf. Default is pdf. + # + # Returns: + # Nothing to the script that calls it, however it creates an image at the + # path specified. + if(param$debug) { + print("In CreateHeatMap") + } + + data.ordered <- OrderMTL(data, km, cluster.order) + + + #Load Lib + library(gplots) + + #Set Color gradient + color.ramp = colorRampPalette(c("black", + "darkblue", + "blue", + "yellow", + "orange", + "red"))(number.colors) #7 + + if(document.type == "png") { + png(paste(document.name,".png", sep = ""), + width=1920, + height=2560, + res=100, + antialias="gray") + } + else { + pdf(paste(document.name,".pdf", sep = "")) + } + + heatmap.2( + data.ordered$data[,sort(colnames(data.ordered$data))], + rowsep = data.ordered$cluster.index[-1], + sepwidth = c(0.5, 60), + dendrogram = "none", + Rowv = F, + Colv = F, + trace = "none", + labRow = F, + labCol = sort(colnames(data.ordered$data)), + RowSideColors = data.ordered$color.vector, + col = color.ramp, + cexCol = 1.8, + cexRow = 0.8, + useRaster=F) + + dev.off() + +} + + + + +ReadCommadLineParameters <- function(argument.names, command.line.arguments) { + # Reads the parameters from the command line arguments. + # + # Args: + # argument.names: A list of expected argument names. + # command.line.arguments: The list of recieved command line arguments + # + # Returns: + # The arguments for argument.names. As strings In that order. + + if(length(grep("--version",command.line.arguments))){ + cat("version",script.version,"\n") + q() + } + # Split command line arguments + args <- sapply(strsplit(command.line.arguments, " "),function(i) i) + vals <- character(length(argument.names)) + # split cmd.line to key and value pairs + for(i in 1:length(argument.names)) { + ix <- grep(argument.names[i], args) + if(length(ix)>1) { + stop("arg ", + argument.names[i], + " used more than once. Bailing out...\n", + PrintParamError()) + } + else if (length(ix)==0) { + stop("could not find --", + argument.names[i], + ". Bailing out...\n", + PrintParamError()) + } + vals[i] <- args[ix+1] + } + return(vals) +} + +PrintParamError <- function(){ + # Prints the usage of the function, shows users what arguments they can use + # + # Args: + # param: A list that contains all the paramaters. + # + # Returns: + # A modified version of the param list, with the default values loaded. + cat(" +DESCRIPTIION: + heatmap.R takes a ... + +INPUT: + 1.--mtls_file path to mtls file.\n + 2.--cluster_file the destination path for the cluster file.\n + 3.--heatmap_file the destination path for heatmap image (no extension).\n + 4.--heatmap_type choice of image type, currently support png and pdf.\n + 5.--n_clusters number of clusters in the heatmap\n + 6.--filter_percentage percentage of mtls that will be analysed. for eg. if + we make filter_percentage 30, we will take the union of the top mtls in + mean, non-zero mean and variance.\n + +EXAMPLE RUN: + Rscript heatmap.R + --mtls_file mtls.xls + --cluster_file output/cluster + --heatmap_file output/heatmap + --heatmap_type png + --n_clusters 13 + --filter_percentage 60 + + ") +} + +LoadDefaultParams <- function(param) { + # Loads default paramaters for the heatmap application + # + # Args: + # param: A list that contains all the previous paramaters. + # + # Returns: + # A modified version of the param list, with the default values loaded. + +script.version=0.1 +param$debug = F + +# ChIP data: +param$annotated.macs.file = "data/mtls/mtls.xls" + +# Filter: +param$filter.percentage <- 100 + +# Analyze Clustering: +param$analyze.number.of.clusters = F +param$analyze.number.of.clusters.sample.size = 10000 +param$analyze.number.of.clusters.min = 4 +param$analyze.number.of.clusters.max = 9 +param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis" +param$analyze.number.of.clusters.itterations = 25 + +# Clustering: +param$clustering.number.of.clusters <- 13 + +# Heatmap: +param$heatmap.document.name <- "data/heatmap/heatmap" +param$heatmap.document.type <- "png" +#Cluster Groups: +param$cluster.groups.document.name <- "data/clusters/clusters" + +return(param) + +} + + + +LoadDefinedParams <- function(param) { + # Loads user defined paramaters for the heatmap application + # + # Args: + # param: A list that contains all the previous paramaters. + # + # Returns: + # A modified version of the param list, with the user defined values loaded. + +cmd.args <- commandArgs(trailingOnly = T); + + +# put the numeric params at the end! +n.start.numeric <- 5 +args.nms <- c( "--mtls_file", #1 + "--cluster_file", #2 + "--heatmap_file", #3 + "--heatmap_type", #4 + "--n_clusters", #5 + "--filter_percentage") #6 + + # get parameters +vals <- ReadCommadLineParameters(argument.names = args.nms, + command.line.arguments = cmd.args) + +# check if numeric params are indeed numeric +for(i in n.start.numeric:length(vals)){ + if(is.na(as.numeric(vals[i]))){ + stop("arg ",args.nms[i]," is not numeric. Bailing out...\n",print.error()) + } +} + +# vals has the same order as args.nms + +# ChIP data: +param$annotated.macs.file <- vals[1] + +# Filter: +param$filter.percentage <- as.numeric(vals[6]) + +# Clustering: +param$clustering.number.of.clusters <- as.numeric(vals[5]) + +# Heatmap: +param$heatmap.document.name <- vals[3] +param$heatmap.document.type <- vals[4] +#Cluster Groups: +param$cluster.groups.document.name <- vals[2] + +return(param) +} + + + + +# here we output the fasta file of the targets of each kmeans cluster +CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ + f.nm = paste(f.nm, ".fasta", sep="") + clusters = km.order + for(i in 1:length(clusters)){ + v=trgts[which(k.ix==clusters[i])] + v.split = unlist(sapply(v,strsplit, "_")) + if(i == 1){ + cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n") + } else { + cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) + } + cat(sep="\n",file=f.nm,v.split,append=TRUE) + } +} + + +GetTopRowsFromMatrix = function(mtrx, percentage = 10) +{ + if (param$debug) { + print("In GetTopRowsFromMatrix") + } + #Store the stats for the mtrx + stats = list() + stats$mean=apply(mtrx,1,mean) + stats$sd=apply(mtrx,1,sd) + stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)])) + + + #Store the indexes for the mtrx + index = list() + index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix + index$sd = sort(stats$sd, decreasing=T, index.return=T)$ix + index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix + + #Calculate how many data points we want to take + cutoff = floor(length(mtrx[ ,1])*(percentage/100)) + + #Extract Indexes and return to caller + index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff])) + + return(index) +} + +#GetUniq = function(targets.cluster) +#{ +# #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order +# +# return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))) +# #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))) +# +#} + +bindata = function(d,qunts=seq(.4,.9,.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d)) + for(j in 1:ncol(d)) + { + bins=quantile(d[,j],qunts) + for(i in 1:length(bins)) + { + tmp[which(d[,j]>bins[i]),j] = i + } + } + return(tmp) +} + +bindata.non.zero = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d)) + for(j in 1:ncol(d)) + { + ix.non.zero=which(d[,j]!=0) + bins=quantile(d[ix.non.zero,j],qunts) + for(i in 1:length(bins)) + { + tmp[which(d[,j]>bins[i]),j] = i + } + } + + return(tmp) +} + +bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) + ix.non.zero=which(d!=0) + bins=quantile(d[ix.non.zero],qunts) + for(i in 1:length(bins)) + { + tmp[which(d>bins[i])] = i + } + return(tmp) +} + +print("Finished loading functions") + +param <- list() +param <- LoadDefaultParams(param) +param <- LoadDefinedParams(param) + +print (param) + +#Load Chip data, alread in pval format +chip <- GetChipData(param$annotated.macs.file) + +#Get the top percentages on different criteria +ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage) +chip$peaks <- chip$peaks[ix.top$union, ] +chip$targets <- chip$targets[ix.top$union] + +#Bin data: +chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30)) + + +#Analyze-cluster +if(param$analyze.number.of.clusters) +{ + AnalyzeCluster(data, + sample.size = param$analyze.number.of.clusters.sample.size, + clusters.min = param$analyze.number.of.clusters.min, + clusters.max = param$analyze.number.of.clusters.max, + document.name = param$analyze.number.of.clusters.document.name, + itterations = param$analyze.number.of.clusters.itterations) + +} + +#Cluster +set.seed(1234) +km <- kmeans(chip$peaks, param$clustering.number.of.clusters) +km.order <- GenerateKMOrder(chip$peaks, km) + +CreateHeatMap(chip$peaks, + km, + km.order, + document.name = param$heatmap.document.name, + document.type = param$heatmap.document.type) + +CreateClusterGroups(trgts=chip$targets, + k.ix=km$cluster, + f.nm = param$cluster.groups.document.name, + km.order = km.order) + diff -r 000000000000 -r a903c48f05b4 mtls_analyze/map_ChIP_peaks_to_genes.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/map_ChIP_peaks_to_genes.R Thu Mar 22 15:21:00 2012 -0400 @@ -0,0 +1,304 @@ +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' +## Nov 2011 th17 +## Bonneau lab - "Aviv Madar" , +## NYU - Center for Genomics and Systems Biology +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' + +rm(list=ls()) +debug=F +script.version=0.1 +print.error <- function(){ + cat(" +NAME + map_ChIP_peaks_to_genes.R + +DESCRIPTION + This script takes a MACS tab delimited file as input and + produces two files: + + 1) genes.xls + A gene centered table that stores the peaks that are + proximal (within x[kb] of TSS) and distal (within the + gene's region + y[kb] upstream or downstream of TSS and + TES respectively). + + 2) peaks.xls + A peak centered table that equals the MACS input table plus + colums for proximal and distal targets of each peak and + their distance from TSS (if exist). For gene.xls script also + gives a poisson model based -log10(pval) score for proximal + and genewide enrichment of peaks for each gene with at least + one prox/dist peak associated with it. + +ARGUMENTS + input_file=path + Path to MACS file. + + refseq_table=path + Path to refseq table (gives TSS/TES locations for all genes). + + path_output=path + Path to save genes.xls and peaks.xls output files. + + tss.dist=N + The absolute distance from TSS where we connect a peak to + gene (for proximal peaks). + + gene.wide.dist=N + The absolute distance from TSS or TES where we connect a peak + to gene (for peaks hitting anywhere in gene). + + effective.genome.size=N + The effective mappable genome size (for mm9 the value is + 1.87e9. For hg19 the value is 2.7e9 (from MACS manual)). + + macs.skip.lines=N + Number of lines to skip in input_file. + +EXAMPLE + Rscript map_ChIP_peaks_to_genes_v.1.R \\ + input_file=SL971_SL970_peaks.xls \\ + refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\ + path_output=./ \\ + tss.dist=5000 \\ + gene.wide.dist=10000 \\ + effective.genome.size=1.87e9 \\ + macs.skip.lines=23 + +CITATION + Please cite us if you used this script: The transcription + factor network regulating Th17 lineage specification and + function. Maria Ciofani, Aviv Madar, Carolina Galan, Kieran + Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard + Bonneau and Dan R. Littman et. al. (in preperation). + +") +} + +# retrieve args +if(debug==T){ + cmd.args <- c( + # "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls", + "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls", + "refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt", + "path_output=/Users/aviv/Desktop/test_script/", + "tss.dist=5000", + "tes.dist=10000", + "effective.genome.size=1.87e9", + "gene.wide.dist=10000", + "macs.skip.lines=23" + ) +} else { + cmd.args <- commandArgs(); +} + +if(length(grep("--version",cmd.args))){ + cat("version",script.version,"\n") + q() +} + +arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1]) +args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2]) + +arg.nms <- c("input_file","refseq_table","path_output","tss.dist", + "gene.wide.dist","effective.genome.size","macs.skip.lines") +arg.val <- character(length=length(arg.nms)) +for(i in 1:length(arg.nms)){ + ix <- which(arg.names.cmd.line==arg.nms[i]) + if(length(ix)==1){ + arg.val[i] <- args.val.cmd.line[ix] + } else { + stop("######could not find ",arg.nms[i]," arg######\n\n",print.error()) + + } +} +if(debug==T){ + print(paste(arg.nms,"=",arg.val)) +} +# the files here adhere to tab delim format +path.input.macs <- arg.val[1] +path.input.refseq <- arg.val[2] +path.output <- arg.val[3] +tss.dist <- as.numeric(arg.val[4]) +gene.wide.dist <- as.numeric(arg.val[5]) +effective.genome.size <- as.numeric(arg.val[6]) +macs.skip.lines=as.numeric(arg.val[7]) + +if(debug==T){ +# if(1){ + load("/Users/aviv/Desktop/r.u.RData") + gn.nms <- rownames(r.u) +} else { + ################## + ### step 1 handle refseq table file (read, make unique) + cat("reading refseq table",path.input.refseq,"\n") + refseq <- read.delim(file=path.input.refseq) + # refseq can have many transcripts for each gene + # here i make it have only one transcript for each gene (the longest one) + cat("for transcripts matching more than one gene keeping only the longest transcript\n") + refseq$name2 <- as.character(refseq$name2) + refseq$chrom <- as.character(refseq$chrom) + refseq$strand <- as.character(refseq$strand) + gn.nms <- unique(refseq$name2) + #x <- sort(table(refseq$name2),decreasing=T) + #gn.nms.non.unique <- names(x)[which(x>1)] + #gn.nms.unique <- names(x)[which(x==1)] + # create refseq unique r.u + n <- length(gn.nms) + r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms) + for(i in 1:n){ + ix <- which(refseq$name2==gn.nms[i]) + if(length(ix)==0) { + error("could not find gene", ng.nms[i], "in refseq table. Bailing out...\n") + } else if (length(ix)>1){ + l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) ) + l.max <- max(l) + ix <- ix[which(l==l.max)[1]] + } + r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")] + } + r.u[,"txStart"] <- as.numeric(r.u[,"txStart"]) + r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"]) + + # switch TSS and TES if we have chr "-" + cat("correcting TSS, TES assiments in refseq table based on strand\n") + ix <- which(r.u$strand=="-") + tmp.tes <- r.u$txStart[ix] + tmp.tss <- r.u$txEnd[ix] + r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes) + # ############ +} + + +#### step 2 read macs file +cat("reading Macs file",path.input.macs,"\n") + +# read macs file +M <- read.delim(file=path.input.macs,skip=macs.skip.lines) +trgt.prox <- NA +trgt.dist <- NA +dtss.prox <- NA +dtss.dist <- NA +M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist) +M.annot[is.na(M.annot)] <- "" +# get the number of genes +m <- length(gn.nms) +if(debug==T){ + # m <- 100 +} +f.nm.peak <- paste(sep="",path.output,"peaks.xls") +f.nm.gene <- paste(sep="",path.output,"genes.xls") +# for each genes in the expt (j goes over genes) +cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval", + "prox_max_pval","genewide_max_pval", + "prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n") +peaks.summit <- (M[,"start"]+M[,"summit"]) +cat(sep="","mapping MACS peaks to genes\n") +for (j in 1:m){ + if(j%%20==0){cat(".")} + tss <- r.u[j,"txStart"] + tes <- r.u[j,"txEnd"] + gn.lngth <- abs(tss-tes) + strand <- r.u[j,"strand"] + chr <- r.u[j,"chrom"] + if(strand=="+"){ + d.tss.all <- peaks.summit-tss + d.tes.all <- peaks.summit-tes + } else { + d.tss.all <- tss-peaks.summit + d.tes.all <- tes-peaks.summit + } + ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )), + which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE) + ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE) + ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)] + ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)] + n.peaks.prox <- length(ix.prox) + n.peaks.distal <- length(ix.distal) + # if there is at least one peak hitting gene j + if(n.peaks.distal > 0){ + # for each peak (l goes over peaks) + peaks.line <- paste(sep="","(") + for (k in 1:n.peaks.distal){ + if(k>1){ + peaks.line <- paste(sep="",peaks.line,";(") + } + d.tss <- d.tss.all[ ix.distal[k] ] + d.tes <- d.tes.all[ ix.distal[k] ] + if(sign(d.tss)!=sign(d.tes)){ + class="intra" + } else if(d.tss<=0){ + class="upstream" + } else if(d.tss>0){ + class="downstream" + } + peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ], + d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")") + # handle M.annot + dtss <- d.tss.all[ ix.distal[k] ] + prev.trgt <- M.annot[ix.distal[k],"trgt.dist"] + if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss + if(abs(dtss) 0){ + mean.pval.prox <- mean(M[ ix.prox ,7]) + max.pval.prox <- max(M[ ix.prox ,7]) + expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox)) + # lambda = num peaks / genome size * searched region + lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2) + pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE)) + # handle M.annot + for(k in 1:n.peaks.prox){ + dtss <- d.tss.all[ ix.prox[k] ] + prev.trgt <- M.annot[ix.prox[k],"trgt.prox"] + if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss + if(abs(dtss)=mean.pval.distal)) + # lambda = num peaks / genome size * searched region + lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2) + pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE)) + # pring peaks for gene j + core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand, + mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal, + pval.pois.prox,pval.pois.distal) + cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n") + } +} +cat("Done!\n") + +cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t") +write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t") + + + + + + +