view mtls_analyze/heatmap.R @ 0:a903c48f05b4

Added non-integrated scripts to galaxy
author kmace
date Thu, 22 Mar 2012 15:21:00 -0400
parents
children
line wrap: on
line source

rm(list = ls())

GeneratePeakMatrix <- function(experiments, scores) {
  # Generates a score matrix for all mtls.
  #
  # Args:
  #   experiments: A list of underscore deliminated experiments for each mtl.
  #                There should never be a completely empty item in this list.
  #                for eg. experiments[1] = expA_expD_expB.
  #   scores: A list of underscore deliminated scores for each mtl, the length
  #           of these scores should be identicle to the length of the
  #           experiments. eg. scores[1] = 55_33_245.
  #
  # Returns:
  #   The peak score matrix for all mtls.
  experiments <- sapply(experiments, function(x) strsplit(x, split = "_"))
  scores <- sapply(scores, function(x) strsplit(x, split = "_"))
  unique.experiments <- unique(unlist(experiments))
  
  peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments))
  colnames(peaks) <- unique.experiments
  
  for(i in 1:length(experiments)){
    for(j in 1:length(experiments[[i]])){
      peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j])
    }
  }
  return(peaks)
##################################################################
}
GetChipData <- function(file.name, 
                        proximity = "distal", 
                        include.targetless = TRUE) {
  # Reads in, filters, and organizes mtls data
  #
  # Args:
  #   file.name: The path to the mtls file (including the file name).
  #   proximity: Either "distal" or "proximal", defines the gene target distance
  #              from the mtl. Default is "distal".
  #   include.targetless: If TRUE, includes mtls with no targets (applied after 
  #                       the proximity filter); if not, mtls with no target
  #                       will be exclided. Default is TRUE. 
  #
  # Returns:
  #   An organized list of mtls data with the following elements:
  #     $peaks - a matrix of peak p-values
  #     $targets - a list of underscore deliminated gene targets for each mtl 
  if(param$debug) {
    print("In GetChipData")
  }
  # Set Constants for the mtls file type:
  MTLNUM <- "l"
  CHROMOSOME <- "chr"
  EXPERIMENTS <- "expt"
  EXPERIMENTS.SORTED <- "expt.alphanum.sorted"
  START <- "start"
  END <- "end"
  SUMMIT <- "s"
  PVALS <- "pval"
  PVAL.MEAN <- "pval.mean"
  SPANS <- "span.tfs"
  SPAN.TOTAL <- "span.l"
  PEAK.IDS <- "peak.ids"  # TODO(kieran): Find out what this is
  PROXIMAL.TARGETS <- "trgt.prox"
  DISTAL.TARGETS <- "trgt.dist"
  PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox"
  DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist"
  
  
  #Get chip data in from files:
  TARGETS <- switch(proximity,
                     distal = DISTAL.TARGETS,
                     proximal = PROXIMAL.TARGETS,
                     stop(" Bad proximity argument supplied."))  # Default
  
  keep.columns <- c(EXPERIMENTS, PVALS, TARGETS) 
  
  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
  
  if (!include.targetless) {
    ix.has.trgt <- which(as.character(file[, TARGETS])!="")  
    file <- file[ix.has.trgt, ]
  }
  
  chip = list()
  
  chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS])
  chip$targets = file[ , TARGETS] 
  return (chip)
}



GenerateKMOrder <- function(data, km) {
  # generates the order of clusters from high to low mean values
  #
  # Args:
  #   data: A matrix of scores that have already been clustered.
  #   km: The K-means object generated from running kmeans on data. Another
  #       method could be used so long as it supplies a (km)$cluser list. Must
  #       have the same length as the number of rows in data
  #
  # Returns:
  #   cluster.order: The order in which the clusters should be displayed.

    km.cluster = km$cluster
    clusters = unique(km.cluster)
    clusters.avg = numeric()
    for(i in clusters) {
        clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ]))
    }
    if(param$debug) {
      print ("straight clusters")
      print (clusters)
      print ("straigth average")
      print (clusters.avg)
      print ("ordered clusters")
      print (clusters[order(clusters.avg)])
      print("ordered average")
      print (clusters.avg[order(clusters.avg)])
    }
    return(clusters[rev(order(clusters.avg))])
}

OrderMTL <- function(data, km, cluster.order) {
  # Orders a matrix of data according to a clustering algorithm
  #
  # Args:
  #   data: A matrix of scores that have already been clustered.
  #   km: The K-means object generated from running kmeans on data. Another
  #       method could be used so long as it supplies a (km)$cluser list. Must
  #       have the same length as the number of rows in data
  #   cluster.order: The order in which the clusters should be displayed. 
  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
  #                  being on top, then cluster 3 and lastly cluster 1.
  #
  # Returns:
  #   a list that contains 3 objects:
  #     list$data: the ordered version of the data.
  #     list$color.vector: a list of colors that should be assigned to each row.
  #     list$start.row: the starting row of each cluster in data.
  number.clusters <- length(cluster.order)
  cluster.colors <- sample(rainbow(number.clusters))
  
  # Set up return objects
  sorted.data <- matrix(,nr=nrow(data), nc=ncol(data))
  colnames(sorted.data) <- colnames(data)
  cluster.color.vector = vector(length=length(km$cluster))
  cluster.start.row = numeric(number.clusters)
  cluster.start.row[1]=1
  
  for (i in 1:number.clusters)
  {
      current.cluster = cluster.order[i]
      ix = which(km$cluster == current.cluster)
      current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1)
      sorted.data[current.cluster.range, ] = data[ix, ]
      cluster.color.vector[current.cluster.range] = cluster.colors[i]
      cluster.start.row[i+1] = (cluster.start.row[i]+length(ix))  
  }
  ret.list = list()
  ret.list$data = sorted.data
  ret.list$color.vector = cluster.color.vector
  ret.list$cluster.index = cluster.start.row
  return(ret.list)
}    


CreateHeatMap <- function(data,
                         km,
                         cluster.order, 
                         document.name,
                         document.type = "png",
                         number.colors = 30) {
  # Generates a heatmap image based for a matrix based on a clustering 
  # algorithm.
  #
  # Args:
  #   data: A matrix of scores that have already been clustered, the column
  #         names of this matrix will become the column titels of the heatmap.
  #   km: The K-means object generated from running kmeans on data. Another
  #       method could be used so long as it supplies a (km)$cluser list.
  #   cluster.order: The order in which the clusters should be displayed. 
  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
  #                  being on top, then cluster 3 and lastly cluster 1.
  #   document.name: A name for the produced file. there is no need to
  #                  supply the .png/.pdf in your argument.
  #   document.type: The type of file you want to produce. current options are
  #                  png and pdf. Default is pdf.
  #
  # Returns:
  #   Nothing to the script that calls it, however it creates an image at the 
  #   path specified.
  if(param$debug) {
    print("In CreateHeatMap")    
  }
  
  data.ordered <- OrderMTL(data, km, cluster.order)
    
	
	#Load Lib
	library(gplots)
	
	#Set Color gradient
	color.ramp = colorRampPalette(c("black",
	                                "darkblue",
	                                "blue",
	                                "yellow",
	                                "orange",
	                                "red"))(number.colors) #7
	
	if(document.type == "png") {
	  png(paste(document.name,".png", sep = ""), 
	      width=1920, 
	      height=2560, 
	      res=100, 
	      antialias="gray")
	}
	else {
	pdf(paste(document.name,".pdf", sep = ""))
	}	
	
	heatmap.2(
		data.ordered$data[,sort(colnames(data.ordered$data))],
		rowsep = data.ordered$cluster.index[-1],
		sepwidth = c(0.5, 60), 
		dendrogram = "none",
		Rowv = F,
		Colv = F,
		trace = "none",
		labRow = F,
		labCol = sort(colnames(data.ordered$data)), 
		RowSideColors = data.ordered$color.vector, 
		col = color.ramp,
		cexCol = 1.8,
		cexRow = 0.8,
		useRaster=F)
  
	dev.off()  

}




ReadCommadLineParameters <- function(argument.names, command.line.arguments) {
  # Reads the parameters from the command line arguments.
  #
  # Args:
  #   argument.names: A list of expected argument names.
  #   command.line.arguments: The list of recieved command line arguments
  #
  # Returns:
  #   The arguments for argument.names. As strings In that order.

	if(length(grep("--version",command.line.arguments))){
		cat("version",script.version,"\n")
		q()
	}
	# Split command line arguments
	args <- sapply(strsplit(command.line.arguments, " "),function(i) i)
	vals <- character(length(argument.names))
	# split cmd.line to key and value pairs
	for(i in 1:length(argument.names)) {
		ix <- grep(argument.names[i], args)
		if(length(ix)>1) {
			stop("arg ",
			     argument.names[i],
			     " used more than once.  Bailing out...\n",
			     PrintParamError())
		} 
		else if (length(ix)==0) {
			stop("could not find --",
			     argument.names[i],
			     ". Bailing out...\n",
			     PrintParamError())
		}
		vals[i] <- args[ix+1]
	}
	return(vals)
}

PrintParamError <- function(){
  # Prints the usage of the function, shows users what arguments they can use
  #
  # Args:
  #   param: A list that contains all the paramaters.
  #
  # Returns:
  #   A modified version of the param list, with the default values loaded.
	cat("
DESCRIPTIION:	
	heatmap.R takes a ...
	
INPUT:
	1.--mtls_file path to mtls file.\n
	2.--cluster_file the destination path for the cluster file.\n
	3.--heatmap_file the destination path for heatmap image (no extension).\n
	4.--heatmap_type choice of image type, currently support png and pdf.\n
	5.--n_clusters number of clusters in the heatmap\n
	6.--filter_percentage percentage of mtls that will be analysed. for eg. if 
	we make filter_percentage 30, we will take the union of the top mtls in
	mean, non-zero mean and variance.\n
	
EXAMPLE RUN:
	Rscript heatmap.R
				--mtls_file mtls.xls
				--cluster_file output/cluster
				--heatmap_file output/heatmap
				--heatmap_type png
				--n_clusters 13
				--filter_percentage 60
	
	")
}

LoadDefaultParams <- function(param) {
  # Loads default paramaters for the heatmap application
  #
  # Args:
  #   param: A list that contains all the previous paramaters.
  #
  # Returns:
  #   A modified version of the param list, with the default values loaded.

script.version=0.1
param$debug = F

# ChIP data:
param$annotated.macs.file = "data/mtls/mtls.xls"

# Filter:
param$filter.percentage <- 100

# Analyze Clustering:
param$analyze.number.of.clusters = F
param$analyze.number.of.clusters.sample.size = 10000 
param$analyze.number.of.clusters.min = 4
param$analyze.number.of.clusters.max = 9
param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis" 
param$analyze.number.of.clusters.itterations = 25

# Clustering:
param$clustering.number.of.clusters <- 13

# Heatmap:
param$heatmap.document.name <- "data/heatmap/heatmap"
param$heatmap.document.type <- "png"
#Cluster Groups:
param$cluster.groups.document.name <- "data/clusters/clusters"

return(param)

}



LoadDefinedParams <- function(param) {
  # Loads user defined paramaters for the heatmap application
  #
  # Args:
  #   param: A list that contains all the previous paramaters.
  #
  # Returns:
  #   A modified version of the param list, with the user defined values loaded.

cmd.args <- commandArgs(trailingOnly = T);


# put the numeric params at the end!
n.start.numeric <- 5
args.nms <- c(	"--mtls_file", 	#1
				"--cluster_file", 	    #2
				"--heatmap_file", 	    #3
				"--heatmap_type",     	#4
			 	"--n_clusters", 	    	#5
				"--filter_percentage")	#6
				
				# get parameters
vals <- ReadCommadLineParameters(argument.names = args.nms, 
                                 command.line.arguments = cmd.args)

# check if numeric params are indeed numeric
for(i in n.start.numeric:length(vals)){
	if(is.na(as.numeric(vals[i]))){
		stop("arg ",args.nms[i]," is not numeric.  Bailing out...\n",print.error())
	}
}

# vals has the same order as args.nms

# ChIP data:
param$annotated.macs.file <- vals[1]

# Filter:
param$filter.percentage <- as.numeric(vals[6])

# Clustering:
param$clustering.number.of.clusters <- as.numeric(vals[5])

# Heatmap:
param$heatmap.document.name <- vals[3]
param$heatmap.document.type <- vals[4]
#Cluster Groups:
param$cluster.groups.document.name <- vals[2]

return(param)
}




# here we output the fasta file of the targets of each kmeans cluster
CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
  f.nm = paste(f.nm, ".fasta", sep="")
    clusters = km.order
    for(i in 1:length(clusters)){
        v=trgts[which(k.ix==clusters[i])]
        v.split = unlist(sapply(v,strsplit, "_"))
        if(i == 1){
            cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n")    
        } else {
            cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
        }
        cat(sep="\n",file=f.nm,v.split,append=TRUE)
    }
}


GetTopRowsFromMatrix = function(mtrx, percentage = 10)
{
  if (param$debug) {
    print("In GetTopRowsFromMatrix")
	}
	#Store the stats for the mtrx
	stats = list()
	stats$mean=apply(mtrx,1,mean)
	stats$sd=apply(mtrx,1,sd)
	stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)]))
    
	
	#Store the indexes for the mtrx
	index = list()
	index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix
	index$sd   = sort(stats$sd, decreasing=T, index.return=T)$ix
	index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix
	
	#Calculate how many data points we want to take
	cutoff = floor(length(mtrx[ ,1])*(percentage/100))
	
	#Extract Indexes and return to caller
	index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff]))
	
	return(index)
}

#GetUniq = function(targets.cluster)
#{
#    #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order
#    
#    return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))))
#    #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))
#    
#}

bindata = function(d,qunts=seq(.4,.9,.1))
{
    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
    for(j in 1:ncol(d))
    {
        bins=quantile(d[,j],qunts)
            for(i in 1:length(bins))
            {
                tmp[which(d[,j]>bins[i]),j] = i
         }
    }
    return(tmp)
}

bindata.non.zero = function(d,qunts=seq(0,0.9,0.1))
{
    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
    for(j in 1:ncol(d))
    {
        ix.non.zero=which(d[,j]!=0)
        bins=quantile(d[ix.non.zero,j],qunts)
            for(i in 1:length(bins))
            {
                tmp[which(d[,j]>bins[i]),j] = i
         }
    }
    
    return(tmp)
}

bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1))
{
    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
    ix.non.zero=which(d!=0)
    bins=quantile(d[ix.non.zero],qunts)
    for(i in 1:length(bins))
    {
        tmp[which(d>bins[i])] = i
    }
    return(tmp)
}

print("Finished loading functions")

param <- list()
param <- LoadDefaultParams(param)
param <- LoadDefinedParams(param)

print (param)

#Load Chip data, alread in pval format
chip <- GetChipData(param$annotated.macs.file)

#Get the top percentages on different criteria
ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage)
chip$peaks <- chip$peaks[ix.top$union, ]
chip$targets <- chip$targets[ix.top$union]

#Bin data:
chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30))


#Analyze-cluster
if(param$analyze.number.of.clusters)
{
    AnalyzeCluster(data, 
    sample.size = param$analyze.number.of.clusters.sample.size, 
    clusters.min = param$analyze.number.of.clusters.min, 
    clusters.max = param$analyze.number.of.clusters.max, 
    document.name = param$analyze.number.of.clusters.document.name,
    itterations = param$analyze.number.of.clusters.itterations)
    
}

#Cluster
set.seed(1234)
km <- kmeans(chip$peaks, param$clustering.number.of.clusters)
km.order <- GenerateKMOrder(chip$peaks, km)

CreateHeatMap(chip$peaks, 
              km,
              km.order, 
              document.name = param$heatmap.document.name,
              document.type = param$heatmap.document.type)

CreateClusterGroups(trgts=chip$targets,
                    k.ix=km$cluster,
                    f.nm = param$cluster.groups.document.name, 
                    km.order = km.order)