mtls_analysis: mtls_analyze/heatmap.R comparison

comparison mtls_analyze/heatmap.R @ 0:a903c48f05b4

Added non-integrated scripts to galaxy

author	kmace
date	Thu, 22 Mar 2012 15:21:00 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:a903c48f05b4
+rm(list = ls())
+GeneratePeakMatrix <- function(experiments, scores) {
+# Generates a score matrix for all mtls.
+#
+# Args:
+#   experiments: A list of underscore deliminated experiments for each mtl.
+#                There should never be a completely empty item in this list.
+#                for eg. experiments[1] = expA_expD_expB.
+#   scores: A list of underscore deliminated scores for each mtl, the length
+#           of these scores should be identicle to the length of the
+#           experiments. eg. scores[1] = 55_33_245.
+#
+# Returns:
+#   The peak score matrix for all mtls.
+experiments <- sapply(experiments, function(x) strsplit(x, split = "_"))
+scores <- sapply(scores, function(x) strsplit(x, split = "_"))
+unique.experiments <- unique(unlist(experiments))
+peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments))
+colnames(peaks) <- unique.experiments
+for(i in 1:length(experiments)){
+for(j in 1:length(experiments[[i]])){
+peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j])
+}
+}
+return(peaks)
+##################################################################
+}
+GetChipData <- function(file.name,
+proximity = "distal",
+include.targetless = TRUE) {
+# Reads in, filters, and organizes mtls data
+#
+# Args:
+#   file.name: The path to the mtls file (including the file name).
+#   proximity: Either "distal" or "proximal", defines the gene target distance
+#              from the mtl. Default is "distal".
+#   include.targetless: If TRUE, includes mtls with no targets (applied after
+#                       the proximity filter); if not, mtls with no target
+#                       will be exclided. Default is TRUE.
+#
+# Returns:
+#   An organized list of mtls data with the following elements:
+#     $peaks - a matrix of peak p-values
+#     $targets - a list of underscore deliminated gene targets for each mtl
+if(param$debug) {
+print("In GetChipData")
+}
+# Set Constants for the mtls file type:
+MTLNUM <- "l"
+CHROMOSOME <- "chr"
+EXPERIMENTS <- "expt"
+EXPERIMENTS.SORTED <- "expt.alphanum.sorted"
+START <- "start"
+END <- "end"
+SUMMIT <- "s"
+PVALS <- "pval"
+PVAL.MEAN <- "pval.mean"
+SPANS <- "span.tfs"
+SPAN.TOTAL <- "span.l"
+PEAK.IDS <- "peak.ids"  # TODO(kieran): Find out what this is
+PROXIMAL.TARGETS <- "trgt.prox"
+DISTAL.TARGETS <- "trgt.dist"
+PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox"
+DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist"
+#Get chip data in from files:
+TARGETS <- switch(proximity,
+distal = DISTAL.TARGETS,
+proximal = PROXIMAL.TARGETS,
+stop(" Bad proximity argument supplied."))  # Default
+keep.columns <- c(EXPERIMENTS, PVALS, TARGETS)
+file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
+if (!include.targetless) {
+ix.has.trgt <- which(as.character(file[, TARGETS])!="")
+file <- file[ix.has.trgt, ]
+}
+chip = list()
+chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS])
+chip$targets = file[ , TARGETS]
+return (chip)
+}
+GenerateKMOrder <- function(data, km) {
+# generates the order of clusters from high to low mean values
+#
+# Args:
+#   data: A matrix of scores that have already been clustered.
+#   km: The K-means object generated from running kmeans on data. Another
+#       method could be used so long as it supplies a (km)$cluser list. Must
+#       have the same length as the number of rows in data
+#
+# Returns:
+#   cluster.order: The order in which the clusters should be displayed.
+km.cluster = km$cluster
+clusters = unique(km.cluster)
+clusters.avg = numeric()
+for(i in clusters) {
+clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ]))
+}
+if(param$debug) {
+print ("straight clusters")
+print (clusters)
+print ("straigth average")
+print (clusters.avg)
+print ("ordered clusters")
+print (clusters[order(clusters.avg)])
+print("ordered average")
+print (clusters.avg[order(clusters.avg)])
+}
+return(clusters[rev(order(clusters.avg))])
+}
+OrderMTL <- function(data, km, cluster.order) {
+# Orders a matrix of data according to a clustering algorithm
+#
+# Args:
+#   data: A matrix of scores that have already been clustered.
+#   km: The K-means object generated from running kmeans on data. Another
+#       method could be used so long as it supplies a (km)$cluser list. Must
+#       have the same length as the number of rows in data
+#   cluster.order: The order in which the clusters should be displayed.
+#                  for eg. km.order = c(2, 3, 1) would result in cluster 2
+#                  being on top, then cluster 3 and lastly cluster 1.
+#
+# Returns:
+#   a list that contains 3 objects:
+#     list$data: the ordered version of the data.
+#     list$color.vector: a list of colors that should be assigned to each row.
+#     list$start.row: the starting row of each cluster in data.
+number.clusters <- length(cluster.order)
+cluster.colors <- sample(rainbow(number.clusters))
+# Set up return objects
+sorted.data <- matrix(,nr=nrow(data), nc=ncol(data))
+colnames(sorted.data) <- colnames(data)
+cluster.color.vector = vector(length=length(km$cluster))
+cluster.start.row = numeric(number.clusters)
+cluster.start.row[1]=1
+for (i in 1:number.clusters)
+{
+current.cluster = cluster.order[i]
+ix = which(km$cluster == current.cluster)
+current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1)
+sorted.data[current.cluster.range, ] = data[ix, ]
+cluster.color.vector[current.cluster.range] = cluster.colors[i]
+cluster.start.row[i+1] = (cluster.start.row[i]+length(ix))
+}
+ret.list = list()
+ret.list$data = sorted.data
+ret.list$color.vector = cluster.color.vector
+ret.list$cluster.index = cluster.start.row
+return(ret.list)
+}
+CreateHeatMap <- function(data,
+km,
+cluster.order,
+document.name,
+document.type = "png",
+number.colors = 30) {
+# Generates a heatmap image based for a matrix based on a clustering
+# algorithm.
+#
+# Args:
+#   data: A matrix of scores that have already been clustered, the column
+#         names of this matrix will become the column titels of the heatmap.
+#   km: The K-means object generated from running kmeans on data. Another
+#       method could be used so long as it supplies a (km)$cluser list.
+#   cluster.order: The order in which the clusters should be displayed.
+#                  for eg. km.order = c(2, 3, 1) would result in cluster 2
+#                  being on top, then cluster 3 and lastly cluster 1.
+#   document.name: A name for the produced file. there is no need to
+#                  supply the .png/.pdf in your argument.
+#   document.type: The type of file you want to produce. current options are
+#                  png and pdf. Default is pdf.
+#
+# Returns:
+#   Nothing to the script that calls it, however it creates an image at the
+#   path specified.
+if(param$debug) {
+print("In CreateHeatMap")
+}
+data.ordered <- OrderMTL(data, km, cluster.order)
+	#Load Lib
+	library(gplots)
+	#Set Color gradient
+	color.ramp = colorRampPalette(c("black",
+	                                "darkblue",
+	                                "blue",
+	                                "yellow",
+	                                "orange",
+	                                "red"))(number.colors) #7
+	if(document.type == "png") {
+	  png(paste(document.name,".png", sep = ""),
+	      width=1920,
+	      height=2560,
+	      res=100,
+	      antialias="gray")
+	}
+	else {
+	pdf(paste(document.name,".pdf", sep = ""))
+	}
+	heatmap.2(
+		data.ordered$data[,sort(colnames(data.ordered$data))],
+		rowsep = data.ordered$cluster.index[-1],
+		sepwidth = c(0.5, 60),
+		dendrogram = "none",
+		Rowv = F,
+		Colv = F,
+		trace = "none",
+		labRow = F,
+		labCol = sort(colnames(data.ordered$data)),
+		RowSideColors = data.ordered$color.vector,
+		col = color.ramp,
+		cexCol = 1.8,
+		cexRow = 0.8,
+		useRaster=F)
+	dev.off()
+}
+ReadCommadLineParameters <- function(argument.names, command.line.arguments) {
+# Reads the parameters from the command line arguments.
+#
+# Args:
+#   argument.names: A list of expected argument names.
+#   command.line.arguments: The list of recieved command line arguments
+#
+# Returns:
+#   The arguments for argument.names. As strings In that order.
+	if(length(grep("--version",command.line.arguments))){
+		cat("version",script.version,"\n")
+		q()
+	}
+	# Split command line arguments
+	args <- sapply(strsplit(command.line.arguments, " "),function(i) i)
+	vals <- character(length(argument.names))
+	# split cmd.line to key and value pairs
+	for(i in 1:length(argument.names)) {
+		ix <- grep(argument.names[i], args)
+		if(length(ix)>1) {
+			stop("arg ",
+			     argument.names[i],
+			     " used more than once.  Bailing out...\n",
+			     PrintParamError())
+		}
+		else if (length(ix)==0) {
+			stop("could not find --",
+			     argument.names[i],
+			     ". Bailing out...\n",
+			     PrintParamError())
+		}
+		vals[i] <- args[ix+1]
+	}
+	return(vals)
+}
+PrintParamError <- function(){
+# Prints the usage of the function, shows users what arguments they can use
+#
+# Args:
+#   param: A list that contains all the paramaters.
+#
+# Returns:
+#   A modified version of the param list, with the default values loaded.
+	cat("
+DESCRIPTIION:
+	heatmap.R takes a ...
+INPUT:
+	1.--mtls_file path to mtls file.\n
+	2.--cluster_file the destination path for the cluster file.\n
+	3.--heatmap_file the destination path for heatmap image (no extension).\n
+	4.--heatmap_type choice of image type, currently support png and pdf.\n
+	5.--n_clusters number of clusters in the heatmap\n
+	6.--filter_percentage percentage of mtls that will be analysed. for eg. if
+	we make filter_percentage 30, we will take the union of the top mtls in
+	mean, non-zero mean and variance.\n
+EXAMPLE RUN:
+	Rscript heatmap.R
+				--mtls_file mtls.xls
+				--cluster_file output/cluster
+				--heatmap_file output/heatmap
+				--heatmap_type png
+				--n_clusters 13
+				--filter_percentage 60
+	")
+}
+LoadDefaultParams <- function(param) {
+# Loads default paramaters for the heatmap application
+#
+# Args:
+#   param: A list that contains all the previous paramaters.
+#
+# Returns:
+#   A modified version of the param list, with the default values loaded.
+script.version=0.1
+param$debug = F
+# ChIP data:
+param$annotated.macs.file = "data/mtls/mtls.xls"
+# Filter:
+param$filter.percentage <- 100
+# Analyze Clustering:
+param$analyze.number.of.clusters = F
+param$analyze.number.of.clusters.sample.size = 10000
+param$analyze.number.of.clusters.min = 4
+param$analyze.number.of.clusters.max = 9
+param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis"
+param$analyze.number.of.clusters.itterations = 25
+# Clustering:
+param$clustering.number.of.clusters <- 13
+# Heatmap:
+param$heatmap.document.name <- "data/heatmap/heatmap"
+param$heatmap.document.type <- "png"
+#Cluster Groups:
+param$cluster.groups.document.name <- "data/clusters/clusters"
+return(param)
+}
+LoadDefinedParams <- function(param) {
+# Loads user defined paramaters for the heatmap application
+#
+# Args:
+#   param: A list that contains all the previous paramaters.
+#
+# Returns:
+#   A modified version of the param list, with the user defined values loaded.
+cmd.args <- commandArgs(trailingOnly = T);
+# put the numeric params at the end!
+n.start.numeric <- 5
+args.nms <- c(	"--mtls_file", 	#1
+				"--cluster_file", 	    #2
+				"--heatmap_file", 	    #3
+				"--heatmap_type",     	#4
+			 	"--n_clusters", 	    	#5
+				"--filter_percentage")	#6
+				# get parameters
+vals <- ReadCommadLineParameters(argument.names = args.nms,
+command.line.arguments = cmd.args)
+# check if numeric params are indeed numeric
+for(i in n.start.numeric:length(vals)){
+	if(is.na(as.numeric(vals[i]))){
+		stop("arg ",args.nms[i]," is not numeric.  Bailing out...\n",print.error())
+	}
+}
+# vals has the same order as args.nms
+# ChIP data:
+param$annotated.macs.file <- vals[1]
+# Filter:
+param$filter.percentage <- as.numeric(vals[6])
+# Clustering:
+param$clustering.number.of.clusters <- as.numeric(vals[5])
+# Heatmap:
+param$heatmap.document.name <- vals[3]
+param$heatmap.document.type <- vals[4]
+#Cluster Groups:
+param$cluster.groups.document.name <- vals[2]
+return(param)
+}
+# here we output the fasta file of the targets of each kmeans cluster
+CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
+f.nm = paste(f.nm, ".fasta", sep="")
+clusters = km.order
+for(i in 1:length(clusters)){
+v=trgts[which(k.ix==clusters[i])]
+v.split = unlist(sapply(v,strsplit, "_"))
+if(i == 1){
+cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n")
+} else {
+cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
+}
+cat(sep="\n",file=f.nm,v.split,append=TRUE)
+}
+}
+GetTopRowsFromMatrix = function(mtrx, percentage = 10)
+{
+if (param$debug) {
+print("In GetTopRowsFromMatrix")
+	}
+	#Store the stats for the mtrx
+	stats = list()
+	stats$mean=apply(mtrx,1,mean)
+	stats$sd=apply(mtrx,1,sd)
+	stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)]))
+	#Store the indexes for the mtrx
+	index = list()
+	index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix
+	index$sd   = sort(stats$sd, decreasing=T, index.return=T)$ix
+	index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix
+	#Calculate how many data points we want to take
+	cutoff = floor(length(mtrx[ ,1])*(percentage/100))
+	#Extract Indexes and return to caller
+	index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff]))
+	return(index)
+}
+#GetUniq = function(targets.cluster)
+#{
+#    #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order
+#
+#    return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))))
+#    #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))
+#
+#}
+bindata = function(d,qunts=seq(.4,.9,.1))
+{
+tmp=matrix(0,nr=nrow(d),nc=ncol(d))
+for(j in 1:ncol(d))
+{
+bins=quantile(d[,j],qunts)
+for(i in 1:length(bins))
+{
+tmp[which(d[,j]>bins[i]),j] = i
+}
+}
+return(tmp)
+}
+bindata.non.zero = function(d,qunts=seq(0,0.9,0.1))
+{
+tmp=matrix(0,nr=nrow(d),nc=ncol(d))
+for(j in 1:ncol(d))
+{
+ix.non.zero=which(d[,j]!=0)
+bins=quantile(d[ix.non.zero,j],qunts)
+for(i in 1:length(bins))
+{
+tmp[which(d[,j]>bins[i]),j] = i
+}
+}
+return(tmp)
+}
+bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1))
+{
+tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
+ix.non.zero=which(d!=0)
+bins=quantile(d[ix.non.zero],qunts)
+for(i in 1:length(bins))
+{
+tmp[which(d>bins[i])] = i
+}
+return(tmp)
+}
+print("Finished loading functions")
+param <- list()
+param <- LoadDefaultParams(param)
+param <- LoadDefinedParams(param)
+print (param)
+#Load Chip data, alread in pval format
+chip <- GetChipData(param$annotated.macs.file)
+#Get the top percentages on different criteria
+ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage)
+chip$peaks <- chip$peaks[ix.top$union, ]
+chip$targets <- chip$targets[ix.top$union]
+#Bin data:
+chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30))
+#Analyze-cluster
+if(param$analyze.number.of.clusters)
+{
+AnalyzeCluster(data,
+sample.size = param$analyze.number.of.clusters.sample.size,
+clusters.min = param$analyze.number.of.clusters.min,
+clusters.max = param$analyze.number.of.clusters.max,
+document.name = param$analyze.number.of.clusters.document.name,
+itterations = param$analyze.number.of.clusters.itterations)
+}
+#Cluster
+set.seed(1234)
+km <- kmeans(chip$peaks, param$clustering.number.of.clusters)
+km.order <- GenerateKMOrder(chip$peaks, km)
+CreateHeatMap(chip$peaks,
+km,
+km.order,
+document.name = param$heatmap.document.name,
+document.type = param$heatmap.document.type)
+CreateClusterGroups(trgts=chip$targets,
+k.ix=km$cluster,
+f.nm = param$cluster.groups.document.name,
+km.order = km.order)

Mercurial > repos > kmace > mtls_analysis

comparison mtls_analyze/heatmap.R @ 0:a903c48f05b4