diff mtls_analyze/heatmap.R @ 0:a903c48f05b4

Added non-integrated scripts to galaxy
author kmace
date Thu, 22 Mar 2012 15:21:00 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/heatmap.R	Thu Mar 22 15:21:00 2012 -0400
@@ -0,0 +1,555 @@
+rm(list = ls())
+
+GeneratePeakMatrix <- function(experiments, scores) {
+  # Generates a score matrix for all mtls.
+  #
+  # Args:
+  #   experiments: A list of underscore deliminated experiments for each mtl.
+  #                There should never be a completely empty item in this list.
+  #                for eg. experiments[1] = expA_expD_expB.
+  #   scores: A list of underscore deliminated scores for each mtl, the length
+  #           of these scores should be identicle to the length of the
+  #           experiments. eg. scores[1] = 55_33_245.
+  #
+  # Returns:
+  #   The peak score matrix for all mtls.
+  experiments <- sapply(experiments, function(x) strsplit(x, split = "_"))
+  scores <- sapply(scores, function(x) strsplit(x, split = "_"))
+  unique.experiments <- unique(unlist(experiments))
+  
+  peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments))
+  colnames(peaks) <- unique.experiments
+  
+  for(i in 1:length(experiments)){
+    for(j in 1:length(experiments[[i]])){
+      peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j])
+    }
+  }
+  return(peaks)
+##################################################################
+}
+GetChipData <- function(file.name, 
+                        proximity = "distal", 
+                        include.targetless = TRUE) {
+  # Reads in, filters, and organizes mtls data
+  #
+  # Args:
+  #   file.name: The path to the mtls file (including the file name).
+  #   proximity: Either "distal" or "proximal", defines the gene target distance
+  #              from the mtl. Default is "distal".
+  #   include.targetless: If TRUE, includes mtls with no targets (applied after 
+  #                       the proximity filter); if not, mtls with no target
+  #                       will be exclided. Default is TRUE. 
+  #
+  # Returns:
+  #   An organized list of mtls data with the following elements:
+  #     $peaks - a matrix of peak p-values
+  #     $targets - a list of underscore deliminated gene targets for each mtl 
+  if(param$debug) {
+    print("In GetChipData")
+  }
+  # Set Constants for the mtls file type:
+  MTLNUM <- "l"
+  CHROMOSOME <- "chr"
+  EXPERIMENTS <- "expt"
+  EXPERIMENTS.SORTED <- "expt.alphanum.sorted"
+  START <- "start"
+  END <- "end"
+  SUMMIT <- "s"
+  PVALS <- "pval"
+  PVAL.MEAN <- "pval.mean"
+  SPANS <- "span.tfs"
+  SPAN.TOTAL <- "span.l"
+  PEAK.IDS <- "peak.ids"  # TODO(kieran): Find out what this is
+  PROXIMAL.TARGETS <- "trgt.prox"
+  DISTAL.TARGETS <- "trgt.dist"
+  PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox"
+  DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist"
+  
+  
+  #Get chip data in from files:
+  TARGETS <- switch(proximity,
+                     distal = DISTAL.TARGETS,
+                     proximal = PROXIMAL.TARGETS,
+                     stop(" Bad proximity argument supplied."))  # Default
+  
+  keep.columns <- c(EXPERIMENTS, PVALS, TARGETS) 
+  
+  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
+  
+  if (!include.targetless) {
+    ix.has.trgt <- which(as.character(file[, TARGETS])!="")  
+    file <- file[ix.has.trgt, ]
+  }
+  
+  chip = list()
+  
+  chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS])
+  chip$targets = file[ , TARGETS] 
+  return (chip)
+}
+
+
+
+GenerateKMOrder <- function(data, km) {
+  # generates the order of clusters from high to low mean values
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list. Must
+  #       have the same length as the number of rows in data
+  #
+  # Returns:
+  #   cluster.order: The order in which the clusters should be displayed.
+
+    km.cluster = km$cluster
+    clusters = unique(km.cluster)
+    clusters.avg = numeric()
+    for(i in clusters) {
+        clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ]))
+    }
+    if(param$debug) {
+      print ("straight clusters")
+      print (clusters)
+      print ("straigth average")
+      print (clusters.avg)
+      print ("ordered clusters")
+      print (clusters[order(clusters.avg)])
+      print("ordered average")
+      print (clusters.avg[order(clusters.avg)])
+    }
+    return(clusters[rev(order(clusters.avg))])
+}
+
+OrderMTL <- function(data, km, cluster.order) {
+  # Orders a matrix of data according to a clustering algorithm
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list. Must
+  #       have the same length as the number of rows in data
+  #   cluster.order: The order in which the clusters should be displayed. 
+  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
+  #                  being on top, then cluster 3 and lastly cluster 1.
+  #
+  # Returns:
+  #   a list that contains 3 objects:
+  #     list$data: the ordered version of the data.
+  #     list$color.vector: a list of colors that should be assigned to each row.
+  #     list$start.row: the starting row of each cluster in data.
+  number.clusters <- length(cluster.order)
+  cluster.colors <- sample(rainbow(number.clusters))
+  
+  # Set up return objects
+  sorted.data <- matrix(,nr=nrow(data), nc=ncol(data))
+  colnames(sorted.data) <- colnames(data)
+  cluster.color.vector = vector(length=length(km$cluster))
+  cluster.start.row = numeric(number.clusters)
+  cluster.start.row[1]=1
+  
+  for (i in 1:number.clusters)
+  {
+      current.cluster = cluster.order[i]
+      ix = which(km$cluster == current.cluster)
+      current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1)
+      sorted.data[current.cluster.range, ] = data[ix, ]
+      cluster.color.vector[current.cluster.range] = cluster.colors[i]
+      cluster.start.row[i+1] = (cluster.start.row[i]+length(ix))  
+  }
+  ret.list = list()
+  ret.list$data = sorted.data
+  ret.list$color.vector = cluster.color.vector
+  ret.list$cluster.index = cluster.start.row
+  return(ret.list)
+}    
+
+
+CreateHeatMap <- function(data,
+                         km,
+                         cluster.order, 
+                         document.name,
+                         document.type = "png",
+                         number.colors = 30) {
+  # Generates a heatmap image based for a matrix based on a clustering 
+  # algorithm.
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered, the column
+  #         names of this matrix will become the column titels of the heatmap.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list.
+  #   cluster.order: The order in which the clusters should be displayed. 
+  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
+  #                  being on top, then cluster 3 and lastly cluster 1.
+  #   document.name: A name for the produced file. there is no need to
+  #                  supply the .png/.pdf in your argument.
+  #   document.type: The type of file you want to produce. current options are
+  #                  png and pdf. Default is pdf.
+  #
+  # Returns:
+  #   Nothing to the script that calls it, however it creates an image at the 
+  #   path specified.
+  if(param$debug) {
+    print("In CreateHeatMap")    
+  }
+  
+  data.ordered <- OrderMTL(data, km, cluster.order)
+    
+	
+	#Load Lib
+	library(gplots)
+	
+	#Set Color gradient
+	color.ramp = colorRampPalette(c("black",
+	                                "darkblue",
+	                                "blue",
+	                                "yellow",
+	                                "orange",
+	                                "red"))(number.colors) #7
+	
+	if(document.type == "png") {
+	  png(paste(document.name,".png", sep = ""), 
+	      width=1920, 
+	      height=2560, 
+	      res=100, 
+	      antialias="gray")
+	}
+	else {
+	pdf(paste(document.name,".pdf", sep = ""))
+	}	
+	
+	heatmap.2(
+		data.ordered$data[,sort(colnames(data.ordered$data))],
+		rowsep = data.ordered$cluster.index[-1],
+		sepwidth = c(0.5, 60), 
+		dendrogram = "none",
+		Rowv = F,
+		Colv = F,
+		trace = "none",
+		labRow = F,
+		labCol = sort(colnames(data.ordered$data)), 
+		RowSideColors = data.ordered$color.vector, 
+		col = color.ramp,
+		cexCol = 1.8,
+		cexRow = 0.8,
+		useRaster=F)
+  
+	dev.off()  
+
+}
+
+
+
+
+ReadCommadLineParameters <- function(argument.names, command.line.arguments) {
+  # Reads the parameters from the command line arguments.
+  #
+  # Args:
+  #   argument.names: A list of expected argument names.
+  #   command.line.arguments: The list of recieved command line arguments
+  #
+  # Returns:
+  #   The arguments for argument.names. As strings In that order.
+
+	if(length(grep("--version",command.line.arguments))){
+		cat("version",script.version,"\n")
+		q()
+	}
+	# Split command line arguments
+	args <- sapply(strsplit(command.line.arguments, " "),function(i) i)
+	vals <- character(length(argument.names))
+	# split cmd.line to key and value pairs
+	for(i in 1:length(argument.names)) {
+		ix <- grep(argument.names[i], args)
+		if(length(ix)>1) {
+			stop("arg ",
+			     argument.names[i],
+			     " used more than once.  Bailing out...\n",
+			     PrintParamError())
+		} 
+		else if (length(ix)==0) {
+			stop("could not find --",
+			     argument.names[i],
+			     ". Bailing out...\n",
+			     PrintParamError())
+		}
+		vals[i] <- args[ix+1]
+	}
+	return(vals)
+}
+
+PrintParamError <- function(){
+  # Prints the usage of the function, shows users what arguments they can use
+  #
+  # Args:
+  #   param: A list that contains all the paramaters.
+  #
+  # Returns:
+  #   A modified version of the param list, with the default values loaded.
+	cat("
+DESCRIPTIION:	
+	heatmap.R takes a ...
+	
+INPUT:
+	1.--mtls_file path to mtls file.\n
+	2.--cluster_file the destination path for the cluster file.\n
+	3.--heatmap_file the destination path for heatmap image (no extension).\n
+	4.--heatmap_type choice of image type, currently support png and pdf.\n
+	5.--n_clusters number of clusters in the heatmap\n
+	6.--filter_percentage percentage of mtls that will be analysed. for eg. if 
+	we make filter_percentage 30, we will take the union of the top mtls in
+	mean, non-zero mean and variance.\n
+	
+EXAMPLE RUN:
+	Rscript heatmap.R
+				--mtls_file mtls.xls
+				--cluster_file output/cluster
+				--heatmap_file output/heatmap
+				--heatmap_type png
+				--n_clusters 13
+				--filter_percentage 60
+	
+	")
+}
+
+LoadDefaultParams <- function(param) {
+  # Loads default paramaters for the heatmap application
+  #
+  # Args:
+  #   param: A list that contains all the previous paramaters.
+  #
+  # Returns:
+  #   A modified version of the param list, with the default values loaded.
+
+script.version=0.1
+param$debug = F
+
+# ChIP data:
+param$annotated.macs.file = "data/mtls/mtls.xls"
+
+# Filter:
+param$filter.percentage <- 100
+
+# Analyze Clustering:
+param$analyze.number.of.clusters = F
+param$analyze.number.of.clusters.sample.size = 10000 
+param$analyze.number.of.clusters.min = 4
+param$analyze.number.of.clusters.max = 9
+param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis" 
+param$analyze.number.of.clusters.itterations = 25
+
+# Clustering:
+param$clustering.number.of.clusters <- 13
+
+# Heatmap:
+param$heatmap.document.name <- "data/heatmap/heatmap"
+param$heatmap.document.type <- "png"
+#Cluster Groups:
+param$cluster.groups.document.name <- "data/clusters/clusters"
+
+return(param)
+
+}
+
+
+
+LoadDefinedParams <- function(param) {
+  # Loads user defined paramaters for the heatmap application
+  #
+  # Args:
+  #   param: A list that contains all the previous paramaters.
+  #
+  # Returns:
+  #   A modified version of the param list, with the user defined values loaded.
+
+cmd.args <- commandArgs(trailingOnly = T);
+
+
+# put the numeric params at the end!
+n.start.numeric <- 5
+args.nms <- c(	"--mtls_file", 	#1
+				"--cluster_file", 	    #2
+				"--heatmap_file", 	    #3
+				"--heatmap_type",     	#4
+			 	"--n_clusters", 	    	#5
+				"--filter_percentage")	#6
+				
+				# get parameters
+vals <- ReadCommadLineParameters(argument.names = args.nms, 
+                                 command.line.arguments = cmd.args)
+
+# check if numeric params are indeed numeric
+for(i in n.start.numeric:length(vals)){
+	if(is.na(as.numeric(vals[i]))){
+		stop("arg ",args.nms[i]," is not numeric.  Bailing out...\n",print.error())
+	}
+}
+
+# vals has the same order as args.nms
+
+# ChIP data:
+param$annotated.macs.file <- vals[1]
+
+# Filter:
+param$filter.percentage <- as.numeric(vals[6])
+
+# Clustering:
+param$clustering.number.of.clusters <- as.numeric(vals[5])
+
+# Heatmap:
+param$heatmap.document.name <- vals[3]
+param$heatmap.document.type <- vals[4]
+#Cluster Groups:
+param$cluster.groups.document.name <- vals[2]
+
+return(param)
+}
+
+
+
+
+# here we output the fasta file of the targets of each kmeans cluster
+CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
+  f.nm = paste(f.nm, ".fasta", sep="")
+    clusters = km.order
+    for(i in 1:length(clusters)){
+        v=trgts[which(k.ix==clusters[i])]
+        v.split = unlist(sapply(v,strsplit, "_"))
+        if(i == 1){
+            cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n")    
+        } else {
+            cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
+        }
+        cat(sep="\n",file=f.nm,v.split,append=TRUE)
+    }
+}
+
+
+GetTopRowsFromMatrix = function(mtrx, percentage = 10)
+{
+  if (param$debug) {
+    print("In GetTopRowsFromMatrix")
+	}
+	#Store the stats for the mtrx
+	stats = list()
+	stats$mean=apply(mtrx,1,mean)
+	stats$sd=apply(mtrx,1,sd)
+	stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)]))
+    
+	
+	#Store the indexes for the mtrx
+	index = list()
+	index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix
+	index$sd   = sort(stats$sd, decreasing=T, index.return=T)$ix
+	index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix
+	
+	#Calculate how many data points we want to take
+	cutoff = floor(length(mtrx[ ,1])*(percentage/100))
+	
+	#Extract Indexes and return to caller
+	index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff]))
+	
+	return(index)
+}
+
+#GetUniq = function(targets.cluster)
+#{
+#    #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order
+#    
+#    return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))))
+#    #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))
+#    
+#}
+
+bindata = function(d,qunts=seq(.4,.9,.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
+    for(j in 1:ncol(d))
+    {
+        bins=quantile(d[,j],qunts)
+            for(i in 1:length(bins))
+            {
+                tmp[which(d[,j]>bins[i]),j] = i
+         }
+    }
+    return(tmp)
+}
+
+bindata.non.zero = function(d,qunts=seq(0,0.9,0.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
+    for(j in 1:ncol(d))
+    {
+        ix.non.zero=which(d[,j]!=0)
+        bins=quantile(d[ix.non.zero,j],qunts)
+            for(i in 1:length(bins))
+            {
+                tmp[which(d[,j]>bins[i]),j] = i
+         }
+    }
+    
+    return(tmp)
+}
+
+bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
+    ix.non.zero=which(d!=0)
+    bins=quantile(d[ix.non.zero],qunts)
+    for(i in 1:length(bins))
+    {
+        tmp[which(d>bins[i])] = i
+    }
+    return(tmp)
+}
+
+print("Finished loading functions")
+
+param <- list()
+param <- LoadDefaultParams(param)
+param <- LoadDefinedParams(param)
+
+print (param)
+
+#Load Chip data, alread in pval format
+chip <- GetChipData(param$annotated.macs.file)
+
+#Get the top percentages on different criteria
+ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage)
+chip$peaks <- chip$peaks[ix.top$union, ]
+chip$targets <- chip$targets[ix.top$union]
+
+#Bin data:
+chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30))
+
+
+#Analyze-cluster
+if(param$analyze.number.of.clusters)
+{
+    AnalyzeCluster(data, 
+    sample.size = param$analyze.number.of.clusters.sample.size, 
+    clusters.min = param$analyze.number.of.clusters.min, 
+    clusters.max = param$analyze.number.of.clusters.max, 
+    document.name = param$analyze.number.of.clusters.document.name,
+    itterations = param$analyze.number.of.clusters.itterations)
+    
+}
+
+#Cluster
+set.seed(1234)
+km <- kmeans(chip$peaks, param$clustering.number.of.clusters)
+km.order <- GenerateKMOrder(chip$peaks, km)
+
+CreateHeatMap(chip$peaks, 
+              km,
+              km.order, 
+              document.name = param$heatmap.document.name,
+              document.type = param$heatmap.document.type)
+
+CreateClusterGroups(trgts=chip$targets,
+                    k.ix=km$cluster,
+                    f.nm = param$cluster.groups.document.name, 
+                    km.order = km.order)
+