Mercurial > repos > kmace > mtls_analysis
view mtls_analyze/heatmap.R @ 0:a903c48f05b4
Added non-integrated scripts to galaxy
author | kmace |
---|---|
date | Thu, 22 Mar 2012 15:21:00 -0400 |
parents | |
children |
line wrap: on
line source
rm(list = ls()) GeneratePeakMatrix <- function(experiments, scores) { # Generates a score matrix for all mtls. # # Args: # experiments: A list of underscore deliminated experiments for each mtl. # There should never be a completely empty item in this list. # for eg. experiments[1] = expA_expD_expB. # scores: A list of underscore deliminated scores for each mtl, the length # of these scores should be identicle to the length of the # experiments. eg. scores[1] = 55_33_245. # # Returns: # The peak score matrix for all mtls. experiments <- sapply(experiments, function(x) strsplit(x, split = "_")) scores <- sapply(scores, function(x) strsplit(x, split = "_")) unique.experiments <- unique(unlist(experiments)) peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments)) colnames(peaks) <- unique.experiments for(i in 1:length(experiments)){ for(j in 1:length(experiments[[i]])){ peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j]) } } return(peaks) ################################################################## } GetChipData <- function(file.name, proximity = "distal", include.targetless = TRUE) { # Reads in, filters, and organizes mtls data # # Args: # file.name: The path to the mtls file (including the file name). # proximity: Either "distal" or "proximal", defines the gene target distance # from the mtl. Default is "distal". # include.targetless: If TRUE, includes mtls with no targets (applied after # the proximity filter); if not, mtls with no target # will be exclided. Default is TRUE. # # Returns: # An organized list of mtls data with the following elements: # $peaks - a matrix of peak p-values # $targets - a list of underscore deliminated gene targets for each mtl if(param$debug) { print("In GetChipData") } # Set Constants for the mtls file type: MTLNUM <- "l" CHROMOSOME <- "chr" EXPERIMENTS <- "expt" EXPERIMENTS.SORTED <- "expt.alphanum.sorted" START <- "start" END <- "end" SUMMIT <- "s" PVALS <- "pval" PVAL.MEAN <- "pval.mean" SPANS <- "span.tfs" SPAN.TOTAL <- "span.l" PEAK.IDS <- "peak.ids" # TODO(kieran): Find out what this is PROXIMAL.TARGETS <- "trgt.prox" DISTAL.TARGETS <- "trgt.dist" PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox" DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist" #Get chip data in from files: TARGETS <- switch(proximity, distal = DISTAL.TARGETS, proximal = PROXIMAL.TARGETS, stop(" Bad proximity argument supplied.")) # Default keep.columns <- c(EXPERIMENTS, PVALS, TARGETS) file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] if (!include.targetless) { ix.has.trgt <- which(as.character(file[, TARGETS])!="") file <- file[ix.has.trgt, ] } chip = list() chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS]) chip$targets = file[ , TARGETS] return (chip) } GenerateKMOrder <- function(data, km) { # generates the order of clusters from high to low mean values # # Args: # data: A matrix of scores that have already been clustered. # km: The K-means object generated from running kmeans on data. Another # method could be used so long as it supplies a (km)$cluser list. Must # have the same length as the number of rows in data # # Returns: # cluster.order: The order in which the clusters should be displayed. km.cluster = km$cluster clusters = unique(km.cluster) clusters.avg = numeric() for(i in clusters) { clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ])) } if(param$debug) { print ("straight clusters") print (clusters) print ("straigth average") print (clusters.avg) print ("ordered clusters") print (clusters[order(clusters.avg)]) print("ordered average") print (clusters.avg[order(clusters.avg)]) } return(clusters[rev(order(clusters.avg))]) } OrderMTL <- function(data, km, cluster.order) { # Orders a matrix of data according to a clustering algorithm # # Args: # data: A matrix of scores that have already been clustered. # km: The K-means object generated from running kmeans on data. Another # method could be used so long as it supplies a (km)$cluser list. Must # have the same length as the number of rows in data # cluster.order: The order in which the clusters should be displayed. # for eg. km.order = c(2, 3, 1) would result in cluster 2 # being on top, then cluster 3 and lastly cluster 1. # # Returns: # a list that contains 3 objects: # list$data: the ordered version of the data. # list$color.vector: a list of colors that should be assigned to each row. # list$start.row: the starting row of each cluster in data. number.clusters <- length(cluster.order) cluster.colors <- sample(rainbow(number.clusters)) # Set up return objects sorted.data <- matrix(,nr=nrow(data), nc=ncol(data)) colnames(sorted.data) <- colnames(data) cluster.color.vector = vector(length=length(km$cluster)) cluster.start.row = numeric(number.clusters) cluster.start.row[1]=1 for (i in 1:number.clusters) { current.cluster = cluster.order[i] ix = which(km$cluster == current.cluster) current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1) sorted.data[current.cluster.range, ] = data[ix, ] cluster.color.vector[current.cluster.range] = cluster.colors[i] cluster.start.row[i+1] = (cluster.start.row[i]+length(ix)) } ret.list = list() ret.list$data = sorted.data ret.list$color.vector = cluster.color.vector ret.list$cluster.index = cluster.start.row return(ret.list) } CreateHeatMap <- function(data, km, cluster.order, document.name, document.type = "png", number.colors = 30) { # Generates a heatmap image based for a matrix based on a clustering # algorithm. # # Args: # data: A matrix of scores that have already been clustered, the column # names of this matrix will become the column titels of the heatmap. # km: The K-means object generated from running kmeans on data. Another # method could be used so long as it supplies a (km)$cluser list. # cluster.order: The order in which the clusters should be displayed. # for eg. km.order = c(2, 3, 1) would result in cluster 2 # being on top, then cluster 3 and lastly cluster 1. # document.name: A name for the produced file. there is no need to # supply the .png/.pdf in your argument. # document.type: The type of file you want to produce. current options are # png and pdf. Default is pdf. # # Returns: # Nothing to the script that calls it, however it creates an image at the # path specified. if(param$debug) { print("In CreateHeatMap") } data.ordered <- OrderMTL(data, km, cluster.order) #Load Lib library(gplots) #Set Color gradient color.ramp = colorRampPalette(c("black", "darkblue", "blue", "yellow", "orange", "red"))(number.colors) #7 if(document.type == "png") { png(paste(document.name,".png", sep = ""), width=1920, height=2560, res=100, antialias="gray") } else { pdf(paste(document.name,".pdf", sep = "")) } heatmap.2( data.ordered$data[,sort(colnames(data.ordered$data))], rowsep = data.ordered$cluster.index[-1], sepwidth = c(0.5, 60), dendrogram = "none", Rowv = F, Colv = F, trace = "none", labRow = F, labCol = sort(colnames(data.ordered$data)), RowSideColors = data.ordered$color.vector, col = color.ramp, cexCol = 1.8, cexRow = 0.8, useRaster=F) dev.off() } ReadCommadLineParameters <- function(argument.names, command.line.arguments) { # Reads the parameters from the command line arguments. # # Args: # argument.names: A list of expected argument names. # command.line.arguments: The list of recieved command line arguments # # Returns: # The arguments for argument.names. As strings In that order. if(length(grep("--version",command.line.arguments))){ cat("version",script.version,"\n") q() } # Split command line arguments args <- sapply(strsplit(command.line.arguments, " "),function(i) i) vals <- character(length(argument.names)) # split cmd.line to key and value pairs for(i in 1:length(argument.names)) { ix <- grep(argument.names[i], args) if(length(ix)>1) { stop("arg ", argument.names[i], " used more than once. Bailing out...\n", PrintParamError()) } else if (length(ix)==0) { stop("could not find --", argument.names[i], ". Bailing out...\n", PrintParamError()) } vals[i] <- args[ix+1] } return(vals) } PrintParamError <- function(){ # Prints the usage of the function, shows users what arguments they can use # # Args: # param: A list that contains all the paramaters. # # Returns: # A modified version of the param list, with the default values loaded. cat(" DESCRIPTIION: heatmap.R takes a ... INPUT: 1.--mtls_file path to mtls file.\n 2.--cluster_file the destination path for the cluster file.\n 3.--heatmap_file the destination path for heatmap image (no extension).\n 4.--heatmap_type choice of image type, currently support png and pdf.\n 5.--n_clusters number of clusters in the heatmap\n 6.--filter_percentage percentage of mtls that will be analysed. for eg. if we make filter_percentage 30, we will take the union of the top mtls in mean, non-zero mean and variance.\n EXAMPLE RUN: Rscript heatmap.R --mtls_file mtls.xls --cluster_file output/cluster --heatmap_file output/heatmap --heatmap_type png --n_clusters 13 --filter_percentage 60 ") } LoadDefaultParams <- function(param) { # Loads default paramaters for the heatmap application # # Args: # param: A list that contains all the previous paramaters. # # Returns: # A modified version of the param list, with the default values loaded. script.version=0.1 param$debug = F # ChIP data: param$annotated.macs.file = "data/mtls/mtls.xls" # Filter: param$filter.percentage <- 100 # Analyze Clustering: param$analyze.number.of.clusters = F param$analyze.number.of.clusters.sample.size = 10000 param$analyze.number.of.clusters.min = 4 param$analyze.number.of.clusters.max = 9 param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis" param$analyze.number.of.clusters.itterations = 25 # Clustering: param$clustering.number.of.clusters <- 13 # Heatmap: param$heatmap.document.name <- "data/heatmap/heatmap" param$heatmap.document.type <- "png" #Cluster Groups: param$cluster.groups.document.name <- "data/clusters/clusters" return(param) } LoadDefinedParams <- function(param) { # Loads user defined paramaters for the heatmap application # # Args: # param: A list that contains all the previous paramaters. # # Returns: # A modified version of the param list, with the user defined values loaded. cmd.args <- commandArgs(trailingOnly = T); # put the numeric params at the end! n.start.numeric <- 5 args.nms <- c( "--mtls_file", #1 "--cluster_file", #2 "--heatmap_file", #3 "--heatmap_type", #4 "--n_clusters", #5 "--filter_percentage") #6 # get parameters vals <- ReadCommadLineParameters(argument.names = args.nms, command.line.arguments = cmd.args) # check if numeric params are indeed numeric for(i in n.start.numeric:length(vals)){ if(is.na(as.numeric(vals[i]))){ stop("arg ",args.nms[i]," is not numeric. Bailing out...\n",print.error()) } } # vals has the same order as args.nms # ChIP data: param$annotated.macs.file <- vals[1] # Filter: param$filter.percentage <- as.numeric(vals[6]) # Clustering: param$clustering.number.of.clusters <- as.numeric(vals[5]) # Heatmap: param$heatmap.document.name <- vals[3] param$heatmap.document.type <- vals[4] #Cluster Groups: param$cluster.groups.document.name <- vals[2] return(param) } # here we output the fasta file of the targets of each kmeans cluster CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ f.nm = paste(f.nm, ".fasta", sep="") clusters = km.order for(i in 1:length(clusters)){ v=trgts[which(k.ix==clusters[i])] v.split = unlist(sapply(v,strsplit, "_")) if(i == 1){ cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n") } else { cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) } cat(sep="\n",file=f.nm,v.split,append=TRUE) } } GetTopRowsFromMatrix = function(mtrx, percentage = 10) { if (param$debug) { print("In GetTopRowsFromMatrix") } #Store the stats for the mtrx stats = list() stats$mean=apply(mtrx,1,mean) stats$sd=apply(mtrx,1,sd) stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)])) #Store the indexes for the mtrx index = list() index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix index$sd = sort(stats$sd, decreasing=T, index.return=T)$ix index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix #Calculate how many data points we want to take cutoff = floor(length(mtrx[ ,1])*(percentage/100)) #Extract Indexes and return to caller index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff])) return(index) } #GetUniq = function(targets.cluster) #{ # #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order # # return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))) # #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))) # #} bindata = function(d,qunts=seq(.4,.9,.1)) { tmp=matrix(0,nr=nrow(d),nc=ncol(d)) for(j in 1:ncol(d)) { bins=quantile(d[,j],qunts) for(i in 1:length(bins)) { tmp[which(d[,j]>bins[i]),j] = i } } return(tmp) } bindata.non.zero = function(d,qunts=seq(0,0.9,0.1)) { tmp=matrix(0,nr=nrow(d),nc=ncol(d)) for(j in 1:ncol(d)) { ix.non.zero=which(d[,j]!=0) bins=quantile(d[ix.non.zero,j],qunts) for(i in 1:length(bins)) { tmp[which(d[,j]>bins[i]),j] = i } } return(tmp) } bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1)) { tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) ix.non.zero=which(d!=0) bins=quantile(d[ix.non.zero],qunts) for(i in 1:length(bins)) { tmp[which(d>bins[i])] = i } return(tmp) } print("Finished loading functions") param <- list() param <- LoadDefaultParams(param) param <- LoadDefinedParams(param) print (param) #Load Chip data, alread in pval format chip <- GetChipData(param$annotated.macs.file) #Get the top percentages on different criteria ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage) chip$peaks <- chip$peaks[ix.top$union, ] chip$targets <- chip$targets[ix.top$union] #Bin data: chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30)) #Analyze-cluster if(param$analyze.number.of.clusters) { AnalyzeCluster(data, sample.size = param$analyze.number.of.clusters.sample.size, clusters.min = param$analyze.number.of.clusters.min, clusters.max = param$analyze.number.of.clusters.max, document.name = param$analyze.number.of.clusters.document.name, itterations = param$analyze.number.of.clusters.itterations) } #Cluster set.seed(1234) km <- kmeans(chip$peaks, param$clustering.number.of.clusters) km.order <- GenerateKMOrder(chip$peaks, km) CreateHeatMap(chip$peaks, km, km.order, document.name = param$heatmap.document.name, document.type = param$heatmap.document.type) CreateClusterGroups(trgts=chip$targets, k.ix=km$cluster, f.nm = param$cluster.groups.document.name, km.order = km.order)