Mercurial > repos > kmace > mtls_analysis
diff mtls_analyze/heatmap.R @ 0:a903c48f05b4
Added non-integrated scripts to galaxy
author | kmace |
---|---|
date | Thu, 22 Mar 2012 15:21:00 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/heatmap.R Thu Mar 22 15:21:00 2012 -0400 @@ -0,0 +1,555 @@ +rm(list = ls()) + +GeneratePeakMatrix <- function(experiments, scores) { + # Generates a score matrix for all mtls. + # + # Args: + # experiments: A list of underscore deliminated experiments for each mtl. + # There should never be a completely empty item in this list. + # for eg. experiments[1] = expA_expD_expB. + # scores: A list of underscore deliminated scores for each mtl, the length + # of these scores should be identicle to the length of the + # experiments. eg. scores[1] = 55_33_245. + # + # Returns: + # The peak score matrix for all mtls. + experiments <- sapply(experiments, function(x) strsplit(x, split = "_")) + scores <- sapply(scores, function(x) strsplit(x, split = "_")) + unique.experiments <- unique(unlist(experiments)) + + peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments)) + colnames(peaks) <- unique.experiments + + for(i in 1:length(experiments)){ + for(j in 1:length(experiments[[i]])){ + peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j]) + } + } + return(peaks) +################################################################## +} +GetChipData <- function(file.name, + proximity = "distal", + include.targetless = TRUE) { + # Reads in, filters, and organizes mtls data + # + # Args: + # file.name: The path to the mtls file (including the file name). + # proximity: Either "distal" or "proximal", defines the gene target distance + # from the mtl. Default is "distal". + # include.targetless: If TRUE, includes mtls with no targets (applied after + # the proximity filter); if not, mtls with no target + # will be exclided. Default is TRUE. + # + # Returns: + # An organized list of mtls data with the following elements: + # $peaks - a matrix of peak p-values + # $targets - a list of underscore deliminated gene targets for each mtl + if(param$debug) { + print("In GetChipData") + } + # Set Constants for the mtls file type: + MTLNUM <- "l" + CHROMOSOME <- "chr" + EXPERIMENTS <- "expt" + EXPERIMENTS.SORTED <- "expt.alphanum.sorted" + START <- "start" + END <- "end" + SUMMIT <- "s" + PVALS <- "pval" + PVAL.MEAN <- "pval.mean" + SPANS <- "span.tfs" + SPAN.TOTAL <- "span.l" + PEAK.IDS <- "peak.ids" # TODO(kieran): Find out what this is + PROXIMAL.TARGETS <- "trgt.prox" + DISTAL.TARGETS <- "trgt.dist" + PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox" + DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist" + + + #Get chip data in from files: + TARGETS <- switch(proximity, + distal = DISTAL.TARGETS, + proximal = PROXIMAL.TARGETS, + stop(" Bad proximity argument supplied.")) # Default + + keep.columns <- c(EXPERIMENTS, PVALS, TARGETS) + + file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] + + if (!include.targetless) { + ix.has.trgt <- which(as.character(file[, TARGETS])!="") + file <- file[ix.has.trgt, ] + } + + chip = list() + + chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS]) + chip$targets = file[ , TARGETS] + return (chip) +} + + + +GenerateKMOrder <- function(data, km) { + # generates the order of clusters from high to low mean values + # + # Args: + # data: A matrix of scores that have already been clustered. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. Must + # have the same length as the number of rows in data + # + # Returns: + # cluster.order: The order in which the clusters should be displayed. + + km.cluster = km$cluster + clusters = unique(km.cluster) + clusters.avg = numeric() + for(i in clusters) { + clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ])) + } + if(param$debug) { + print ("straight clusters") + print (clusters) + print ("straigth average") + print (clusters.avg) + print ("ordered clusters") + print (clusters[order(clusters.avg)]) + print("ordered average") + print (clusters.avg[order(clusters.avg)]) + } + return(clusters[rev(order(clusters.avg))]) +} + +OrderMTL <- function(data, km, cluster.order) { + # Orders a matrix of data according to a clustering algorithm + # + # Args: + # data: A matrix of scores that have already been clustered. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. Must + # have the same length as the number of rows in data + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # + # Returns: + # a list that contains 3 objects: + # list$data: the ordered version of the data. + # list$color.vector: a list of colors that should be assigned to each row. + # list$start.row: the starting row of each cluster in data. + number.clusters <- length(cluster.order) + cluster.colors <- sample(rainbow(number.clusters)) + + # Set up return objects + sorted.data <- matrix(,nr=nrow(data), nc=ncol(data)) + colnames(sorted.data) <- colnames(data) + cluster.color.vector = vector(length=length(km$cluster)) + cluster.start.row = numeric(number.clusters) + cluster.start.row[1]=1 + + for (i in 1:number.clusters) + { + current.cluster = cluster.order[i] + ix = which(km$cluster == current.cluster) + current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1) + sorted.data[current.cluster.range, ] = data[ix, ] + cluster.color.vector[current.cluster.range] = cluster.colors[i] + cluster.start.row[i+1] = (cluster.start.row[i]+length(ix)) + } + ret.list = list() + ret.list$data = sorted.data + ret.list$color.vector = cluster.color.vector + ret.list$cluster.index = cluster.start.row + return(ret.list) +} + + +CreateHeatMap <- function(data, + km, + cluster.order, + document.name, + document.type = "png", + number.colors = 30) { + # Generates a heatmap image based for a matrix based on a clustering + # algorithm. + # + # Args: + # data: A matrix of scores that have already been clustered, the column + # names of this matrix will become the column titels of the heatmap. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # document.name: A name for the produced file. there is no need to + # supply the .png/.pdf in your argument. + # document.type: The type of file you want to produce. current options are + # png and pdf. Default is pdf. + # + # Returns: + # Nothing to the script that calls it, however it creates an image at the + # path specified. + if(param$debug) { + print("In CreateHeatMap") + } + + data.ordered <- OrderMTL(data, km, cluster.order) + + + #Load Lib + library(gplots) + + #Set Color gradient + color.ramp = colorRampPalette(c("black", + "darkblue", + "blue", + "yellow", + "orange", + "red"))(number.colors) #7 + + if(document.type == "png") { + png(paste(document.name,".png", sep = ""), + width=1920, + height=2560, + res=100, + antialias="gray") + } + else { + pdf(paste(document.name,".pdf", sep = "")) + } + + heatmap.2( + data.ordered$data[,sort(colnames(data.ordered$data))], + rowsep = data.ordered$cluster.index[-1], + sepwidth = c(0.5, 60), + dendrogram = "none", + Rowv = F, + Colv = F, + trace = "none", + labRow = F, + labCol = sort(colnames(data.ordered$data)), + RowSideColors = data.ordered$color.vector, + col = color.ramp, + cexCol = 1.8, + cexRow = 0.8, + useRaster=F) + + dev.off() + +} + + + + +ReadCommadLineParameters <- function(argument.names, command.line.arguments) { + # Reads the parameters from the command line arguments. + # + # Args: + # argument.names: A list of expected argument names. + # command.line.arguments: The list of recieved command line arguments + # + # Returns: + # The arguments for argument.names. As strings In that order. + + if(length(grep("--version",command.line.arguments))){ + cat("version",script.version,"\n") + q() + } + # Split command line arguments + args <- sapply(strsplit(command.line.arguments, " "),function(i) i) + vals <- character(length(argument.names)) + # split cmd.line to key and value pairs + for(i in 1:length(argument.names)) { + ix <- grep(argument.names[i], args) + if(length(ix)>1) { + stop("arg ", + argument.names[i], + " used more than once. Bailing out...\n", + PrintParamError()) + } + else if (length(ix)==0) { + stop("could not find --", + argument.names[i], + ". Bailing out...\n", + PrintParamError()) + } + vals[i] <- args[ix+1] + } + return(vals) +} + +PrintParamError <- function(){ + # Prints the usage of the function, shows users what arguments they can use + # + # Args: + # param: A list that contains all the paramaters. + # + # Returns: + # A modified version of the param list, with the default values loaded. + cat(" +DESCRIPTIION: + heatmap.R takes a ... + +INPUT: + 1.--mtls_file path to mtls file.\n + 2.--cluster_file the destination path for the cluster file.\n + 3.--heatmap_file the destination path for heatmap image (no extension).\n + 4.--heatmap_type choice of image type, currently support png and pdf.\n + 5.--n_clusters number of clusters in the heatmap\n + 6.--filter_percentage percentage of mtls that will be analysed. for eg. if + we make filter_percentage 30, we will take the union of the top mtls in + mean, non-zero mean and variance.\n + +EXAMPLE RUN: + Rscript heatmap.R + --mtls_file mtls.xls + --cluster_file output/cluster + --heatmap_file output/heatmap + --heatmap_type png + --n_clusters 13 + --filter_percentage 60 + + ") +} + +LoadDefaultParams <- function(param) { + # Loads default paramaters for the heatmap application + # + # Args: + # param: A list that contains all the previous paramaters. + # + # Returns: + # A modified version of the param list, with the default values loaded. + +script.version=0.1 +param$debug = F + +# ChIP data: +param$annotated.macs.file = "data/mtls/mtls.xls" + +# Filter: +param$filter.percentage <- 100 + +# Analyze Clustering: +param$analyze.number.of.clusters = F +param$analyze.number.of.clusters.sample.size = 10000 +param$analyze.number.of.clusters.min = 4 +param$analyze.number.of.clusters.max = 9 +param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis" +param$analyze.number.of.clusters.itterations = 25 + +# Clustering: +param$clustering.number.of.clusters <- 13 + +# Heatmap: +param$heatmap.document.name <- "data/heatmap/heatmap" +param$heatmap.document.type <- "png" +#Cluster Groups: +param$cluster.groups.document.name <- "data/clusters/clusters" + +return(param) + +} + + + +LoadDefinedParams <- function(param) { + # Loads user defined paramaters for the heatmap application + # + # Args: + # param: A list that contains all the previous paramaters. + # + # Returns: + # A modified version of the param list, with the user defined values loaded. + +cmd.args <- commandArgs(trailingOnly = T); + + +# put the numeric params at the end! +n.start.numeric <- 5 +args.nms <- c( "--mtls_file", #1 + "--cluster_file", #2 + "--heatmap_file", #3 + "--heatmap_type", #4 + "--n_clusters", #5 + "--filter_percentage") #6 + + # get parameters +vals <- ReadCommadLineParameters(argument.names = args.nms, + command.line.arguments = cmd.args) + +# check if numeric params are indeed numeric +for(i in n.start.numeric:length(vals)){ + if(is.na(as.numeric(vals[i]))){ + stop("arg ",args.nms[i]," is not numeric. Bailing out...\n",print.error()) + } +} + +# vals has the same order as args.nms + +# ChIP data: +param$annotated.macs.file <- vals[1] + +# Filter: +param$filter.percentage <- as.numeric(vals[6]) + +# Clustering: +param$clustering.number.of.clusters <- as.numeric(vals[5]) + +# Heatmap: +param$heatmap.document.name <- vals[3] +param$heatmap.document.type <- vals[4] +#Cluster Groups: +param$cluster.groups.document.name <- vals[2] + +return(param) +} + + + + +# here we output the fasta file of the targets of each kmeans cluster +CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ + f.nm = paste(f.nm, ".fasta", sep="") + clusters = km.order + for(i in 1:length(clusters)){ + v=trgts[which(k.ix==clusters[i])] + v.split = unlist(sapply(v,strsplit, "_")) + if(i == 1){ + cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n") + } else { + cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) + } + cat(sep="\n",file=f.nm,v.split,append=TRUE) + } +} + + +GetTopRowsFromMatrix = function(mtrx, percentage = 10) +{ + if (param$debug) { + print("In GetTopRowsFromMatrix") + } + #Store the stats for the mtrx + stats = list() + stats$mean=apply(mtrx,1,mean) + stats$sd=apply(mtrx,1,sd) + stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)])) + + + #Store the indexes for the mtrx + index = list() + index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix + index$sd = sort(stats$sd, decreasing=T, index.return=T)$ix + index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix + + #Calculate how many data points we want to take + cutoff = floor(length(mtrx[ ,1])*(percentage/100)) + + #Extract Indexes and return to caller + index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff])) + + return(index) +} + +#GetUniq = function(targets.cluster) +#{ +# #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order +# +# return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))) +# #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))) +# +#} + +bindata = function(d,qunts=seq(.4,.9,.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d)) + for(j in 1:ncol(d)) + { + bins=quantile(d[,j],qunts) + for(i in 1:length(bins)) + { + tmp[which(d[,j]>bins[i]),j] = i + } + } + return(tmp) +} + +bindata.non.zero = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d)) + for(j in 1:ncol(d)) + { + ix.non.zero=which(d[,j]!=0) + bins=quantile(d[ix.non.zero,j],qunts) + for(i in 1:length(bins)) + { + tmp[which(d[,j]>bins[i]),j] = i + } + } + + return(tmp) +} + +bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) + ix.non.zero=which(d!=0) + bins=quantile(d[ix.non.zero],qunts) + for(i in 1:length(bins)) + { + tmp[which(d>bins[i])] = i + } + return(tmp) +} + +print("Finished loading functions") + +param <- list() +param <- LoadDefaultParams(param) +param <- LoadDefinedParams(param) + +print (param) + +#Load Chip data, alread in pval format +chip <- GetChipData(param$annotated.macs.file) + +#Get the top percentages on different criteria +ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage) +chip$peaks <- chip$peaks[ix.top$union, ] +chip$targets <- chip$targets[ix.top$union] + +#Bin data: +chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30)) + + +#Analyze-cluster +if(param$analyze.number.of.clusters) +{ + AnalyzeCluster(data, + sample.size = param$analyze.number.of.clusters.sample.size, + clusters.min = param$analyze.number.of.clusters.min, + clusters.max = param$analyze.number.of.clusters.max, + document.name = param$analyze.number.of.clusters.document.name, + itterations = param$analyze.number.of.clusters.itterations) + +} + +#Cluster +set.seed(1234) +km <- kmeans(chip$peaks, param$clustering.number.of.clusters) +km.order <- GenerateKMOrder(chip$peaks, km) + +CreateHeatMap(chip$peaks, + km, + km.order, + document.name = param$heatmap.document.name, + document.type = param$heatmap.document.type) + +CreateClusterGroups(trgts=chip$targets, + k.ix=km$cluster, + f.nm = param$cluster.groups.document.name, + km.order = km.order) +