Mercurial > repos > kmace > mtls_analysis
diff mtls_analyze/heatmap.R @ 4:b465306d00ba draft default tip
Uploaded
author | kmace |
---|---|
date | Mon, 23 Jul 2012 13:00:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/heatmap.R Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,1524 @@ +GeneratePeakMatrix <- function(experiments, scores) { + # Generates a score matrix for all mtls. + # + # Args: + # experiments: A list of underscore deliminated experiments for each mtl. + # There should never be a completely empty item in this list. + # for eg. experiments[1] = expA_expD_expB. + # scores: A list of underscore deliminated scores for each mtl, the length + # of these scores should be identicle to the length of the + # experiments. eg. scores[1] = 55_33_245. + # + # Returns: + # The peak score matrix for all mtls. + experiments <- sapply(experiments, function(x) strsplit(x, split = "_")) + scores <- sapply(scores, function(x) strsplit(x, split = "_")) + unique.experiments <- unique(unlist(experiments)) + + peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments)) + colnames(peaks) <- unique.experiments + + for(i in 1:length(experiments)){ + for(j in 1:length(experiments[[i]])){ + peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j]) + } + } + return(peaks) +################################################################## +} +GetChipData <- function(file.name, + proximity = "distal", + include.targetless = TRUE, + column.order = NA) { + # Reads in, filters, and organizes mtls data + # + # Args: + # file.name: The path to the mtls file (including the file name). + # proximity: Either "distal" or "proximal", defines the gene target distance + # from the mtl. Default is "distal". + # include.targetless: If TRUE, includes mtls with no targets (applied after + # the proximity filter); if not, mtls with no target + # will be exclided. Default is TRUE. + # column.order: An optional vector of column names in the order in which + # they will be presented. If this is left to default (NA) the + # presented order of the chip columns will be the order in + # which they are seen. + # + # Returns: + # An organized list of mtls data with the following elements: + # $peaks - a matrix of peak p-values + # $targets - a list of underscore deliminated gene targets for each mtl + if(param$debug) { + print("In GetChipData") + } + # Set Constants for the mtls file type: + MTLNUM <- "mtl.id" + CHROMOSOME <- "chr" + EXPERIMENTS <- "expt" + EXPERIMENTS.SORTED <- "expt.alphanum.sorted" + START <- "start" + END <- "end" + SUMMIT <- "summit" + SCORES <- "score" + SCORE.MEAN <- "score.mean" + SPANS <- "span.tfs" + SPAN.TOTAL <- "span.l" + PEAK.IDS <- "peak.ids" + TARGETS <- "trgt" +# PROXIMAL.TARGETS <- "trgt.prox" +# DISTAL.TARGETS <- "trgt.dist" +# PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox" +# DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist" + + + #Get chip data in from files: + #TARGETS <- switch(proximity, + # distal = DISTAL.TARGETS, + # proximal = PROXIMAL.TARGETS, + # stop(" Bad proximity argument supplied.")) # Default +cat("param$rna.files = ", param$rna.files, "\n") +if(!is.na(param$rna.files) && param$rna.files != "none") { + keep.columns <- c(EXPERIMENTS, SCORES, TARGETS) +} else { + keep.columns <- c(EXPERIMENTS, SCORES) +} + file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] +if(!is.na(param$rna.files) && param$rna.files != "none") { + if (!include.targetless) { + ix.has.trgt <- which(as.character(file[, TARGETS])!="") + file <- file[ix.has.trgt, ] + } +} + chip = list() + + chip$peaks <- GeneratePeakMatrix(file[, EXPERIMENTS], file[, SCORES]) + if(!is.na(column.order)) { + #If you specify an order, or a subset of experiments to include, this is where + #that gets done. + order <- unlist(strsplit(column.order, split="::")) + chip$peaks <- chip$peaks[,order] + } + if(!is.na(param$rna.files) && param$rna.files != "none") { + chip$targets <- file[ , TARGETS] +} + + return (chip) +} +GetRNADataFromOneFile <- function(file.name, fpkm = "avg") { + # Reads in rna expression data from cufflinks + # + # Args: + # file.name: The path to the cufflinks file (including the file name). + # fpkm: What fpkm is chosen, options are hi, low and avg + # Returns: + # An organized vector of rnaseq data with the following elements: + # names(data) - the genes from the file. eg names(data[1]) = JUND + # data - the fpkm score for that gene data[1] = 31 + if(param$debug) { + cat("In GetRNADataFromOneFile for ",file.name) + } + + # Set Constants for the cufflinks file type: + GENE <- "gene_id" + BUNDLE <- "bundle_id" + CHROMOSOME <- "chr" + LEFT <- "left" + RIGHT <- "right" + FPKM_AVG <- "FPKM" + FPKM_LOW <- "FPKM_conf_lo" + FPKM_HIGH <- "FPKM_conf_hi" + STATUS <- "status" + + + #Get chip data in from files: + FPKM <- switch(fpkm, + hi = FPKM_HIGH, + avg = FPKM_AVG, + low = FPKM_LOW, + stop(" Bad fpkm quality argument supplied.")) # Default + + keep.columns <- c(GENE, FPKM) + + file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] + + rna = vector(mode = "numeric", length = nrow(file)) + + rna <- file[, FPKM] + names(rna) <- file[ , GENE] + return (rna) +} + +GetRNAData <- function(file.names, file.lables = file.names, fpkm = "avg") { + # Reads in rna expression data from cufflinks + # + # Args: + # file.names: The list of paths to the cufflinks files (including the file name). + # fpkm: What fpkm is chosen, options are hi, low and avg + # Returns: + # A matrix of rnaseq data with the following description: + # each col corresponds to an rnaseq run + # eacg row corresponds to a gene + if(param$debug) { + print("In GetRNAData") + } + files <- list() + for(i in 1:length(file.names)) { + files[[i]] <-GetRNADataFromOneFile(file.names[i]) + } + genes <- unique(names(unlist(files))) + scores <- matrix(0,nrow=length(genes),ncol=length(file.names)) + rownames(scores) <- genes + colnames(scores) <- file.names + + for(j in 1:length(file.names)) { + scores[names(files[[j]]),j] <- files[[j]] + } +print("# of cols for scores is ") +print(ncol(scores)) +print ("file lables are ") +print (file.lables) + colnames(scores) <- file.lables + return(scores) + #scores[genes,file.names] <- files[[]] +} + +NormalizeRNA <- function(scores) { + +#add psudocount +scores <- scores+1 + +numerator <- scores[,1:(ncol(scores)/2)] +denominator <- scores[,(ncol(scores)/2 + 1):ncol(scores)] + +#new.scores <- scores[,-which(colnames(scores) == norm.exp)] +#norm.score <- scores[,which(colnames(scores) == norm.exp)] +#return(log2(new.scores/norm.score)) + +return(log2(numerator/denominator)) + +} + +PrepareRNAforHeatmap <- function(new.scores.foldchange){ +new.scores.split <- matrix(0, nr=nrow(new.scores.foldchange), nc=2*ncol(new.scores.foldchange)) +is.even <- function(x){ x %% 2 == 0 } +corresponding.col <- function(x){ceiling(x/2)} +new.col.names <- vector(length = ncol(new.scores.split)) +rownames(new.scores.split) <- rownames(new.scores.foldchange) + for(i in 1:ncol(new.scores.split)) { + if(is.even(i)) {sign <- "down"} else {sign <- "up"} + new.col.names[i] <- paste(colnames(new.scores.foldchange)[corresponding.col(i)], + sign, sep =".") + new.scores.split[,i] <- new.scores.foldchange[,corresponding.col(i)] + if(is.even(i)) { + new.scores.split[which(new.scores.split[,i]>0),i] <- 0 + new.scores.split[,i] <- -1*new.scores.split[,i] + } else { + new.scores.split[which(new.scores.split[,i]<0),i] <- 0 + } + colnames(new.scores.split) <- new.col.names +} +return(new.scores.split)} + +#ConvertExpressionToPval = function(expression, threshold = 1) +#{ +# print("In ConvertExpressionToPval") +# #Generate Normal Stats +# # expression.mean = apply(expression, 2, mean) +# # expression.sd = apply(expression, 2, sd) +# +# ix.above.threshold = which(expression > threshold) +# expression.mean = apply(expression, 2, function(i) mean(i[which(i > threshold)])) +# expression.sd = apply(expression, 2, function(i) sd(i[which(i > threshold)])) +# + +# # CDF from zero to point +# # expression.pval = matrix(,nr=nrow(expression),nc=ncol(expression),dimnames=dimnames(expression)) +# # ix.low = which(expression < expression.mean) +# # ix.upper = which(expression >= expression.mean) +# # expression.pval[ix.low] = (-10 * pnorm(expression[ix.low], mean = expression.mean, sd = expression.sd, log=T) *log10(exp(1))) +# +# expression.pval = (-10 * log10(exp(1)) * pnorm(expression, +# mean = expression.mean, +# sd = expression.sd, +# log=T, +# lower.tail=F +# ) +# ) +# #squash those under threshold to zero +# expression.pval[-ix.above.threshold] = 0 + +# # correct for upper tail (ie from point to inf) +# #Still need to do +# return (expression.pval) +#} +MapExpressiontoChip2 = function(chip.targets, expression) +{ +mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression)) +rownames(mymatrix) <- chip.targets +colnames(mymatrix) <- colnames(expression) + + +j <- 1 #expression counter +i <- 1 #mymatrix counter +expression.genes <- rownames(expression) +previous.expression.gene <- expression.genes[j] + +# Progress bar: +total <- length(chip.targets) +# create progress bar +pb <- txtProgressBar(min = 0, max = total, style = 3) + +for ( i in 1:length(chip.targets)) { + current.target <- chip.targets[i] + setTxtProgressBar(pb, i) + while (current.target > previous.expression.gene && j<=length(expression.genes)) { + j <- j + 1 + previous.expression.gene <- expression.genes[j] + } + if (current.target == previous.expression.gene){ + mymatrix[i,] <- expression[j,] + } +} + +close(pb) + + print("Leaving MapRNAtoChip") + return(mymatrix) +} + + +MapExpressiontoChip = function(chip.targets, expression) +{ + print("In MapRNAtoChip") + # targets = unlist(strsplit(chip$targets[i], "_")) + # exp = matrix(rna$all.pval[targets, ]) + # return (apply(exp, 2, mean)) + + #rna col 1 = th0 overexpressed + #rna col 2 = th17 overexpressed + #rna col 3 = true zscores + + + + + + + #################################################################### + chip.targets.notnull.unique <- unique(chip.targets[which(chip.targets!="")]) + gene.intersect <- intersect(rownames(expression), chip.targets.notnull.unique) + chip.targets.notnull.unique.with.data <- chip.targets.notnull.unique[which(chip.targets.notnull.unique %in% gene.intersect)] + expression.useful.data <- expression[which(rownames(expression) %in% gene.intersect),] + chip.targets.notnull.unique.with.data.sorted <- chip.targets.notnull.unique.with.data[order(chip.targets.notnull.unique.with.data)] + expression.useful.data.sorted <- expression.useful.data[order(rownames(expression.useful.data)),] + #################################################################### + + mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression)) + rownames(mymatrix) <- chip.targets + colnames(mymatrix) <- colnames(expression) + head(chip.targets.notnull.unique.with.data.sorted) + head(rownames(expression.useful.data.sorted)) + if(!identical(chip.targets.notnull.unique.with.data.sorted, rownames(expression.useful.data.sorted))){ + stop("We have a serious problem, chip is not alligned to expression") + } + for(i in 1:length(chip.targets.notnull.unique.with.data.sorted)) + { + mtls.ix <- which(chip.targets == chip.targets.notnull.unique.with.data.sorted[i]) + for(j in 1:length(mtls.ix)){ + mymatrix[mtls.ix[j],] <- expression.useful.data.sorted[i,] + } + } + print("Leaving MapRNAtoChip") + return(mymatrix) + + #return( sapply(chip$targets, function(x) apply(rna$all.pval[unlist(strsplit(chip$targets[3], "_")), ], 2, mean)) ) +} + + + + + +GenerateKMOrder <- function(data, km) { + # generates the order of clusters from high to low mean values + # + # Args: + # data: A matrix of scores that have already been clustered. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. Must + # have the same length as the number of rows in data + # + # Returns: + # cluster.order: The order in which the clusters should be displayed. + + km.cluster = km$cluster + clusters = unique(km.cluster) + clusters.avg = numeric() + for(i in clusters) { + clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ])) + } + if(param$debug) { + print ("straight clusters") + print (clusters) + print ("straigth average") + print (clusters.avg) + print ("ordered clusters") + print (clusters[order(clusters.avg)]) + print("ordered average") + print (clusters.avg[order(clusters.avg)]) + } + return(clusters[rev(order(clusters.avg))]) +} + +OrderMTL <- function(data, km, cluster.order) { + # Orders a matrix of data according to a clustering algorithm + # + # Args: + # data: A matrix of scores that have already been clustered. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. Must + # have the same length as the number of rows in data + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # + # Returns: + # a list that contains 3 objects: + # list$data: the ordered version of the data. + # list$color.vector: a list of colors that should be assigned to each row. + # list$start.row: the starting row of each cluster in data. + number.clusters <- length(cluster.order) + cluster.colors <- sample(rainbow(number.clusters)) + + # Set up return objects + sorted.data <- matrix(,nr=nrow(data), nc=ncol(data)) + colnames(sorted.data) <- colnames(data) + cluster.color.vector = vector(length=length(km$cluster)) + cluster.start.row = numeric(number.clusters) + cluster.start.row[1]=1 + + for (i in 1:number.clusters) + { + current.cluster = cluster.order[i] + ix = which(km$cluster == current.cluster) + current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1) + sorted.data[current.cluster.range, ] = data[ix, ] + cluster.color.vector[current.cluster.range] = cluster.colors[i] + cluster.start.row[i+1] = (cluster.start.row[i]+length(ix)) + } + ret.list = list() + ret.list$data = sorted.data + ret.list$color.vector = cluster.color.vector + ret.list$cluster.index = cluster.start.row + return(ret.list) +} + + +CreateHeatMap <- function(data, + km, + cluster.order, + document.name, + document.type = "png", + number.colors = 30) { + # Generates a heatmap image based for a matrix based on a clustering + # algorithm. + # + # Args: + # data: A matrix of scores that have already been clustered, the column + # names of this matrix will become the column titels of the heatmap. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # document.name: A name for the produced file. there is no need to + # supply the .png/.pdf in your argument. + # document.type: The type of file you want to produce. current options are + # png and pdf. Default is pdf. + # + # Returns: + # Nothing to the script that calls it, however it creates an image at the + # path specified. + if(param$debug) { + print("In CreateHeatMap") + } + + data.ordered <- OrderMTL(data, km, cluster.order) + + + #Load Lib + library(gplots) + + #Set Color gradient + color.ramp = colorRampPalette(c("black", + "darkblue", + "blue", + "yellow", + "orange", + "red"))(number.colors) #7 + + if(document.type == "png") { + png(paste(document.name,".png", sep = ""), + #width=15360,#1920, + #height=204080,#2560, + #res=500, + antialias="gray") + } + else if(document.type == "pdf") { + pdf(paste(document.name,".pdf", sep = "")) + } + else if(document.type == "tiff") { + tiff(paste(document.name,".tiff", sep = ""), + res=800, + pointsize=2, + width=1920, + height=1920) + } + else { + bitmap(paste(document.name,".bmp", sep = ""), + height = 5, + width = 5, + res = 500) + } +#op <- par(mar = rep(0, 4)) + + heatmap.2( + data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))], + rowsep = data.ordered$cluster.index[-1], + sepwidth = c(0.5, ncol(data)/100), + dendrogram = "none", + Rowv = F, + Colv = F, + trace = "none", + labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString), + labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)), + RowSideColors = data.ordered$color.vector, + keysize=0.6, + key=F, + col = color.ramp, + cexCol = 0.8, + cexRow = 0.8) + + dev.off() + +} + +CreateIndividualHeatMap <- function(data, + km, + cluster.order, + color.ramp = colorRampPalette(c("black", + "darkblue", + "blue", + "yellow", + "orange", + "red"))(30)) { + # Generates a heatmap image based for a matrix based on a clustering + # algorithm. + # + # Args: + # data: A matrix of scores that have already been clustered, the column + # names of this matrix will become the column titels of the heatmap. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # document.name: A name for the produced file. there is no need to + # supply the .png/.pdf in your argument. + # document.type: The type of file you want to produce. current options are + # png and pdf. Default is pdf. + # + # Returns: + # Nothing to the script that calls it, however it creates an image at the + # path specified. + if(param$debug) { + print("In CreateHeatMap") + } + + data.ordered <- OrderMTL(data, km, cluster.order) + + + #Load Lib + library(gplots) + + + + + heatmap.3( + data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))], + rowsep = data.ordered$cluster.index[-1], + sepwidth = c(0.5, nrow(data.ordered$data)/100), + dendrogram = "none", + Rowv = F, + Colv = F, + trace = "none", + labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString), + labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)), + #sourcRowSideColors = data.ordered$color.vector, + keysize=0.6, + key=F, + col = color.ramp, + cexCol = 0.8, + cexRow = 0.8) + + +} + +ReadCommadLineParameters <- function(argument.names, command.line.arguments, optional = F) { + # Reads the parameters from the command line arguments. + # + # Args: + # argument.names: A list of expected argument names. + # command.line.arguments: The list of recieved command line arguments + # optional: Are the areguments optional, or are they required, default is required + # + # Returns: + # The arguments for argument.names. As strings In that order. + + if(length(grep("--version",command.line.arguments))) { + cat("version",script.version,"\n") + q() + } + # Split command line arguments + args <- sapply(strsplit(command.line.arguments, " "),function(i) i) + vals <- character(length(argument.names)) + # split cmd.line to key and value pairs + for(i in 1:length(argument.names)) { + ix <- grep(argument.names[i], args) + if(length(ix)>1) { + stop("arg ", + argument.names[i], + " used more than once. Bailing out...\n", + PrintParamError()) + } + else if (length(ix)==0 && !optional) { + stop("could not find ", + argument.names[i], + ". Bailing out...\n", + PrintParamError()) + } + else if (length(ix)==0 && optional) { + vals[i] <- NA + } + else { + vals[i] <- args[ix+1] + } + } + return(vals) +} + +PrintParamError <- function(){ + # Prints the usage of the function, shows users what arguments they can use + # + # Args: + # param: A list that contains all the paramaters. + # + # Returns: + # A modified version of the param list, with the default values loaded. + cat(" +DESCRIPTIION: + heatmap.R takes a ... + +INPUT: + 1.--mtls_file: path to mtls file.\n + 2.--cluster_file: the destination path for the output cluster file.\n + 3.--chip_experiment_order: The order of desired chip experiments (optional).\n + 4.--heatmap_file: path for output heatmap image (no extension).\n + 5.--heatmap_type: choice of image format, currently support png, pdf, tiff and bmp (optional)\n + 6.--expression_file: list of expression files to be included in analysis (optional).\n + 7.--expression_name: lables for the expression files (optional).\n + 8.--normalization_file: a list of files to be used for normalization, + they can be the same file, however the number of expression nominated + normalization files must match the number of expression files (optional)\n + 9.--n_clusters: number of clusters\n + 10.--filter_percentage: percentage of mtls that will be analysed. Eg: if + we make filter_percentage 30, we will take the union of the top mtls in + mean, non-zero mean and variance (optional).\n + +EXAMPLE RUN: + Rscript heatmap.R + --mtls_file path/to/mtls.xls + --cluster_file path/to/output/cluster + --chip_experiment_order tf1::tf2::tf5::tf3 + --heatmap_file path/to/output/heatmap + --heatmap_type png + --expression_file path/to/exp1::path/to/exp2 + --expression_name myexp1::myexp2 + --normalization_file path/to/exp3::path/to/exp3 + --n_clusters 13 + --filter_percentage 100 + --include_targetless yes + --number_bins 30 + + ") +} + +#LoadDefaultParams <- function(param) { +# # Loads default paramaters for the heatmap application +# # +# # Args: +# # param: A list that contains all the previous paramaters. +# # +# # Returns: +# # A modified version of the param list, with the default values loaded. + +#script.version=0.1 +#param$debug = F + +## RNA data: +#param$rna.files = "" +#param$rna.normalization = "none" + +## Filter: +#param$filter.percentage <- 100 + +## Clustering: +#param$clustering.number.of.clusters <- 13 + +## Heatmap: +#param$heatmap.document.name <- "heatmap" +#param$heatmap.document.type <- "png" +##Cluster Groups: +#param$cluster.groups.document.name <- "clusters" + +#return(param) + +#} + + + +LoadParams <- function(cmd.args, args.nms, n.start.numeric, optional = F) { + # Loads user defined paramaters for the heatmap application + # + # Args: + # cmd.args: The command line arguments given. + # arg.nms: A list of possible command arguments. + # n.start.numeric: the first argument that should be numeric (alway put + # these last). + # optional: This specifies if the params in cmd.args are optional or + # required. + # Returns: + # A list of values assigned to each argument. + + + vals <- ReadCommadLineParameters(argument.names = args.nms, + command.line.arguments = cmd.args, + optional = optional) + + #check if numeric params are indeed numeric + if(!optional) { + for(i in n.start.numeric:length(vals)){ + if(is.na(as.numeric(vals[i]))){ + stop("arg ",args.nms[i]," is not numeric. Bailing out...\n",print.error()) + } + } +} + return (vals) + +} + +#ValidateParams <- function(params) { +# return(T) +#} + + +########## + +LoadDebugParams <- function(param) { + + cmd.args <- c( + "--mtls_file data/test/mtls.xls", + "--cluster_file data/test/cluster", + "--heatmap_file data/test/heatmap", + "--heatmap_type bmp", + "--n_clusters 13", + "--filter_percentage 100", + "--expression_file /home/kieran/code/scorchR-heatmap/data/expression/rna1.tabular::/home/kieran/code/scorchR-heatmap/data/expression/rna2.tabular", + "--expression_name batf.ko.0::batf.ko.17", + "--normalization_file mean", + "--chip_experiment_order ac::bc::cc::dc::ec", + "--include_targetless yes", + "--number_bins 20" + ) + +args.nms <- c( "--mtls_file", #1 + "--cluster_file", #2 + "--chip_experiment_order", #3 + "--heatmap_file", #4 + "--heatmap_type", #5 + "--expression_file", #6 + "--expression_name", #7 + "--normalization_file", #8 + "--include_targetless", #9 + "--n_clusters", #10 + "--filter_percentage", #11 + "--number_bins") #12 + + +# vals has the same order as args.nms +vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 10, optional = F) + + +# ChIP data: +param$annotated.macs.file <- vals[1] +param$chip.order <- vals[3] +# RNA data: +param$rna.files = vals[6] +param$rna.names = vals[7] +param$rna.normalization.file = vals[8] +param$include.targetless = vals[9] + +# Filter: +param$filter.percentage <- as.numeric(vals[11]) + +# Clustering: +param$number.bins <- as.numeric(vals[12]) +param$clustering.number.of.clusters <- as.numeric(vals[10]) + +# Heatmap: +param$heatmap.document.name <- vals[4] +param$heatmap.document.type <- vals[5] +#Cluster Groups: +param$cluster.groups.document.name <- vals[2] + +return(param) + +} + + + +LoadOptionalParams <- function(param) { + +cmd.args <- commandArgs(trailingOnly = T) + +args.nms <- c( "--chip_experiment_order", #1 + "--expression_file", #2 + "--expression_name", #3 + "--normalization_file", #4 + "--heatmap_type", #5 + "--include_targetless", #6 + "--filter_percentage", #7 + "--number_bins") #8 + + +# vals has the same order as args.nms +vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 7, optional = T) + +# ChIP data: +param$chip.order <- if(!is.na(vals[1])){vals[1]}else{NA} + +# RNA data: +param$rna.files <- if(!is.na(vals[2])){vals[2]}else{"none"} +param$rna.names <- if(!is.na(vals[3])){vals[3]}else{"none"} +param$rna.normalization.file <- if(!is.na(vals[4])){vals[4]}else{"no"} +param$include.targetless <- if(!is.na(vals[6])){vals[6]}else{"yes"} +# Filter: +param$filter.percentage <- if(!is.na(vals[7])){as.numeric(vals[7])}else{100} +param$number.bins <- if(!is.na(vals[8])){as.numeric(vals[8])}else{30} +# Heatmap file (output) +param$heatmap.document.type <- if(!is.na(vals[5])){vals[5]}else{"none"} + +return(param) +} + + +LoadReqiredParams <- function(param){ + +cmd.args <- commandArgs(trailingOnly = T) + +args.nms <- c( "--mtls_file", #1 + "--cluster_file", #2 + "--heatmap_file", #3 + "--n_clusters") #4 + + +# vals has the same order as args.nms +vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 4, optional = F) + +# ChIP data: +param$annotated.macs.file <- vals[1] + +# Clustering +param$clustering.number.of.clusters <- as.numeric(vals[4]) + +# Cluster file (output): +param$cluster.groups.document.name <- vals[2] + + +# Heatmap file (output): +param$heatmap.document.name <- vals[3] + +return(param) + +} + +########### +# here we output the fasta file of the targets of each kmeans cluster +CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ + f.nm = paste(f.nm, ".fasta", sep="") + clusters = km.order + for(i in 1:length(clusters)){ + v=trgts[which(k.ix==clusters[i])] + v.split = unlist(sapply(v,strsplit, "_")) + if(i == 1){ + cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n") + } else { + cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) + } + cat(sep="\n",file=f.nm,v.split,append=TRUE) + } +} + + +PrintClusters <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ + f.nm = paste(f.nm, ".tsv", sep="") + cat(sep="\t",file=f.nm,"row_number/target","cluster","\n") + trgts[which(trgts=="")] <- "no_target" + for(i in 1:length(trgts)){ + cat(sep="",file=f.nm,trgts[i],"\t","cluster_",k.ix[i],"\n",append=TRUE) + } +# if(i == 1){ +# cat(sep="",file=f.nm,"cluster_",i,"\n")#clusters[i],"\n") +# } else { +# cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) +# } + #cat(sep="\n",file=f.nm,v.split,append=TRUE) + } +GetTopRowsFromMatrix = function(mtrx, percentage = 10) +{ + if (param$debug) { + print("In GetTopRowsFromMatrix") + } + #Store the stats for the mtrx + stats = list() + stats$mean=apply(mtrx,1,mean) + stats$sd=apply(mtrx,1,sd) + stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)])) + + + #Store the indexes for the mtrx + index = list() + index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix + index$sd = sort(stats$sd, decreasing=T, index.return=T)$ix + index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix + + #Calculate how many data points we want to take + cutoff = floor(length(mtrx[ ,1])*(percentage/100)) + + #Extract Indexes and return to caller + index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff])) + + return(index) +} + +#GetUniq = function(targets.cluster) +#{ +# #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order +# +# return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))) +# #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))) +# +#} + +bindata = function(d,qunts=seq(.4,.9,.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) + for(j in 1:ncol(d)) + { + bins=quantile(d[,j],qunts) + for(i in 1:length(bins)) + { + tmp[which(d[,j]>bins[i]),j] = i + } + } + return(tmp) +} + +bindata.non.zero = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d)) + for(j in 1:ncol(d)) + { + ix.non.zero=which(d[,j]!=0) + bins=quantile(d[ix.non.zero,j],qunts) + for(i in 1:length(bins)) + { + tmp[which(d[,j]>bins[i]),j] = i + } + } + + return(tmp) +} +bindata.matrix = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) + #ix.non.zero=which(d!=0) + bins=quantile(d[],qunts) + for(i in 1:length(bins)) + { + tmp[which(d>bins[i])] = i + } + return(tmp) +} +bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) + ix.non.zero=which(d!=0) + bins=quantile(d[ix.non.zero],qunts) + for(i in 1:length(bins)) + { + tmp[which(d>bins[i])] = i + } + return(tmp) +} +heatmap.3 <- function (x, Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE, + distfun = dist, hclustfun = hclust, dendrogram = c("both", + "row", "column", "none"), symm = FALSE, scale = c("none", + "row", "column"), na.rm = TRUE, revC = identical(Colv, + "Rowv"), add.expr, breaks, symbreaks = min(x < 0, na.rm = TRUE) || + scale != "none", col = "heat.colors", colsep, rowsep, + sepcolor = "white", sepwidth = c(0.05, 0.05), cellnote, notecex = 1, + notecol = "cyan", na.color = par("bg"), trace = c("column", + "row", "both", "none"), tracecol = "cyan", hline = median(breaks), + vline = median(breaks), linecol = tracecol, margins = c(5, + 5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(nr), + cexCol = 0.2 + 1/log10(nc), labRow = NULL, labCol = NULL, + key = TRUE, keysize = 1.5, density.info = c("histogram", + "density", "none"), denscol = tracecol, symkey = min(x < + 0, na.rm = TRUE) || symbreaks, densadj = 0.25, main = NULL, + xlab = NULL, ylab = NULL, lmat = NULL, lhei = NULL, lwid = NULL, + ...) +{ + scale01 <- function(x, low = min(x), high = max(x)) { + x <- (x - low)/(high - low) + x + } + retval <- list() + scale <- if (symm && missing(scale)) + "none" + else match.arg(scale) + dendrogram <- match.arg(dendrogram) + trace <- match.arg(trace) + density.info <- match.arg(density.info) + if (length(col) == 1 && is.character(col)) + col <- get(col, mode = "function") + if (!missing(breaks) && (scale != "none")) + warning("Using scale=\"row\" or scale=\"column\" when breaks are", + "specified can produce unpredictable results.", "Please consider using only one or the other.") + if (is.null(Rowv) || is.na(Rowv)) + Rowv <- FALSE + if (is.null(Colv) || is.na(Colv)) + Colv <- FALSE + else if (Colv == "Rowv" && !isTRUE(Rowv)) + Colv <- FALSE + if (length(di <- dim(x)) != 2 || !is.numeric(x)) + stop("`x' must be a numeric matrix") + nr <- di[1] + nc <- di[2] + if (nr <= 1 || nc <= 1) + stop("`x' must have at least 2 rows and 2 columns") + if (!is.numeric(margins) || length(margins) != 2) + stop("`margins' must be a numeric vector of length 2") + if (missing(cellnote)) + cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x)) + if (!inherits(Rowv, "dendrogram")) { + if (((!isTRUE(Rowv)) || (is.null(Rowv))) && (dendrogram %in% + c("both", "row"))) { + if (is.logical(Colv) && (Colv)) + dendrogram <- "column" + else dedrogram <- "none" + warning("Discrepancy: Rowv is FALSE, while dendrogram is `", + dendrogram, "'. Omitting row dendogram.") + } + } + if (!inherits(Colv, "dendrogram")) { + if (((!isTRUE(Colv)) || (is.null(Colv))) && (dendrogram %in% + c("both", "column"))) { + if (is.logical(Rowv) && (Rowv)) + dendrogram <- "row" + else dendrogram <- "none" + warning("Discrepancy: Colv is FALSE, while dendrogram is `", + dendrogram, "'. Omitting column dendogram.") + } + } + if (inherits(Rowv, "dendrogram")) { + ddr <- Rowv + rowInd <- order.dendrogram(ddr) + } + else if (is.integer(Rowv)) { + hcr <- hclustfun(distfun(x)) + ddr <- as.dendrogram(hcr) + ddr <- reorder(ddr, Rowv) + rowInd <- order.dendrogram(ddr) + if (nr != length(rowInd)) + stop("row dendrogram ordering gave index of wrong length") + } + else if (isTRUE(Rowv)) { + Rowv <- rowMeans(x, na.rm = na.rm) + hcr <- hclustfun(distfun(x)) + ddr <- as.dendrogram(hcr) + ddr <- reorder(ddr, Rowv) + rowInd <- order.dendrogram(ddr) + if (nr != length(rowInd)) + stop("row dendrogram ordering gave index of wrong length") + } + else { + rowInd <- nr:1 + } + if (inherits(Colv, "dendrogram")) { + ddc <- Colv + colInd <- order.dendrogram(ddc) + } + else if (identical(Colv, "Rowv")) { + if (nr != nc) + stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") + if (exists("ddr")) { + ddc <- ddr + colInd <- order.dendrogram(ddc) + } + else colInd <- rowInd + } + else if (is.integer(Colv)) { + hcc <- hclustfun(distfun(if (symm) + x + else t(x))) + ddc <- as.dendrogram(hcc) + ddc <- reorder(ddc, Colv) + colInd <- order.dendrogram(ddc) + if (nc != length(colInd)) + stop("column dendrogram ordering gave index of wrong length") + } + else if (isTRUE(Colv)) { + Colv <- colMeans(x, na.rm = na.rm) + hcc <- hclustfun(distfun(if (symm) + x + else t(x))) + ddc <- as.dendrogram(hcc) + ddc <- reorder(ddc, Colv) + colInd <- order.dendrogram(ddc) + if (nc != length(colInd)) + stop("column dendrogram ordering gave index of wrong length") + } + else { + colInd <- 1:nc + } + retval$rowInd <- rowInd + retval$colInd <- colInd + retval$call <- match.call() + x <- x[rowInd, colInd] + x.unscaled <- x + cellnote <- cellnote[rowInd, colInd] + if (is.null(labRow)) + labRow <- if (is.null(rownames(x))) + (1:nr)[rowInd] + else rownames(x) + else labRow <- labRow[rowInd] + if (is.null(labCol)) + labCol <- if (is.null(colnames(x))) + (1:nc)[colInd] + else colnames(x) + else labCol <- labCol[colInd] + if (scale == "row") { + retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm) + x <- sweep(x, 1, rm) + retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm) + x <- sweep(x, 1, sx, "/") + } + else if (scale == "column") { + retval$colMeans <- rm <- colMeans(x, na.rm = na.rm) + x <- sweep(x, 2, rm) + retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm) + x <- sweep(x, 2, sx, "/") + } + if (missing(breaks) || is.null(breaks) || length(breaks) < + 1) { + if (missing(col) || is.function(col)) + breaks <- 16 + else breaks <- length(col) + 1 + } + if (length(breaks) == 1) { + if (!symbreaks) + breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), + length = breaks) + else { + extreme <- max(abs(x), na.rm = TRUE) + breaks <- seq(-extreme, extreme, length = breaks) + } + } + nbr <- length(breaks) + ncol <- length(breaks) - 1 + if (class(col) == "function") + col <- col(ncol) + min.breaks <- min(breaks) + max.breaks <- max(breaks) + x[x < min.breaks] <- min.breaks + x[x > max.breaks] <- max.breaks + # if (missing(lhei) || is.null(lhei)) + # lhei <- c(keysize, 4) + # if (missing(lwid) || is.null(lwid)) + # lwid <- c(keysize, 4) + # if (missing(lmat) || is.null(lmat)) { + # lmat <- rbind(4:3, 2:1) + # if (!missing(ColSideColors)) { + # if (!is.character(ColSideColors) || length(ColSideColors) != + # nc) + # stop("'ColSideColors' must be a character vector of length ncol(x)") + # lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + + # 1) + # lhei <- c(lhei[1], 0.2, lhei[2]) + # } + # if (!missing(RowSideColors)) { + # if (!is.character(RowSideColors) || length(RowSideColors) != + # nr) + # stop("'RowSideColors' must be a character vector of length nrow(x)") + # lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - + # 1), 1), lmat[, 2] + 1) + # lwid <- c(lwid[1], 0.2, lwid[2]) + # } + # lmat[is.na(lmat)] <- 0 + # } + # if (length(lhei) != nrow(lmat)) + # stop("lhei must have length = nrow(lmat) = ", nrow(lmat)) + # if (length(lwid) != ncol(lmat)) + # stop("lwid must have length = ncol(lmat) =", ncol(lmat)) + # op <- par(no.readonly = TRUE) + # on.exit(par(op)) + # layout(lmat, widths = lwid, heights = lhei, respect = FALSE) + if (!missing(RowSideColors)) { + par(mar = c(margins[1], 0, 0, 0.5)) + image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE) + } + if (!missing(ColSideColors)) { + par(mar = c(0.5, 0, 0, margins[2])) + image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE) + } + par(mar = c(margins[1], 0, 0, margins[2])) + x <- t(x) + cellnote <- t(cellnote) + if (revC) { + iy <- nr:1 + if (exists("ddr")) + ddr <- rev(ddr) + x <- x[, iy] + cellnote <- cellnote[, iy] + } + else iy <- 1:nr + image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + + c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col, + breaks = breaks, ...) + retval$carpet <- x + if (exists("ddr")) + retval$rowDendrogram <- ddr + if (exists("ddc")) + retval$colDendrogram <- ddc + retval$breaks <- breaks + retval$col <- col + if (!invalid(na.color) & any(is.na(x))) { + mmat <- ifelse(is.na(x), 1, NA) + image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "", + col = na.color, add = TRUE) + } + axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, + cex.axis = cexCol) + if (!is.null(xlab)) + mtext(xlab, side = 1, line = margins[1] - 1.25) + axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, + cex.axis = cexRow) + if (!is.null(ylab)) + mtext(ylab, side = 4, line = margins[2] - 1.25) + if (!missing(add.expr)) + eval(substitute(add.expr)) + if (!missing(colsep)) + for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0, + length(csep)), xright = csep + 0.5 + sepwidth[1], + ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1, + col = sepcolor, border = sepcolor) + if (!missing(rowsep)) + for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) + + 1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) + + 1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1, + col = sepcolor, border = sepcolor) + min.scale <- min(breaks) + max.scale <- max(breaks) + x.scaled <- scale01(t(x), min.scale, max.scale) + if (trace %in% c("both", "column")) { + retval$vline <- vline + vline.vals <- scale01(vline, min.scale, max.scale) + for (i in colInd) { + if (!is.null(vline)) { + abline(v = i - 0.5 + vline.vals, col = linecol, + lty = 2) + } + xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5 + xv <- c(xv[1], xv) + yv <- 1:length(xv) - 0.5 + lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") + } + } + if (trace %in% c("both", "row")) { + retval$hline <- hline + hline.vals <- scale01(hline, min.scale, max.scale) + for (i in rowInd) { + if (!is.null(hline)) { + abline(h = i + hline, col = linecol, lty = 2) + } + yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5 + yv <- rev(c(yv[1], yv)) + xv <- length(yv):1 - 0.5 + lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") + } + } + if (!missing(cellnote)) + text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote), + col = notecol, cex = notecex) + par(mar = c(margins[1], 0, 0, 0)) + # if (dendrogram %in% c("both", "row")) { + # plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") + # } + # else plot.new() + par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2])) + # if (dendrogram %in% c("both", "column")) { + # plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") + # } + # else plot.new() + if (!is.null(main)) + title(main, cex.main = 1.5 * op[["cex.main"]]) + # if (key) { + # par(mar = c(5, 4, 2, 1), cex = 0.75) + # tmpbreaks <- breaks + # if (symkey) { + # max.raw <- max(abs(c(x, breaks)), na.rm = TRUE) + # min.raw <- -max.raw + # tmpbreaks[1] <- -max(abs(x), na.rm = TRUE) + # tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE) + # } + # else { + # min.raw <- min(x, na.rm = TRUE) + # max.raw <- max(x, na.rm = TRUE) + # } + # z <- seq(min.raw, max.raw, length = length(col)) + # image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, + # xaxt = "n", yaxt = "n") + # par(usr = c(0, 1, 0, 1)) + # lv <- pretty(breaks) + # xv <- scale01(as.numeric(lv), min.raw, max.raw) + # axis(1, at = xv, labels = lv) + # if (scale == "row") + # mtext(side = 1, "Row Z-Score", line = 2) + # else if (scale == "column") + # mtext(side = 1, "Column Z-Score", line = 2) + # else mtext(side = 1, "Value", line = 2) + # if (density.info == "density") { + # dens <- density(x, adjust = densadj, na.rm = TRUE) + # omit <- dens$x < min(breaks) | dens$x > max(breaks) + # dens$x <- dens$x[-omit] + # dens$y <- dens$y[-omit] + # dens$x <- scale01(dens$x, min.raw, max.raw) + # lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol, + # lwd = 1) + # axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y)) + # title("Color Key\nand Density Plot") + # par(cex = 0.5) + # mtext(side = 2, "Density", line = 2) + # } + # else if (density.info == "histogram") { + # h <- hist(x, plot = FALSE, breaks = breaks) + # hx <- scale01(breaks, min.raw, max.raw) + # hy <- c(h$counts, h$counts[length(h$counts)]) + # lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s", + # col = denscol) + # axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy)) + # title("Color Key\nand Histogram") + # par(cex = 0.5) + # mtext(side = 2, "Count", line = 2) + # } + # else title("Color Key") + # } + # else plot.new() + retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)], + high = retval$breaks[-1], color = retval$col) + invisible(retval) +} + +#rm(list=ls()) +param <- list() +param$debug <- FALSE #T + +print("Finished loading functions") + + +if(param$debug) { + param <- LoadDebugParams(param) +} else { + param <- LoadReqiredParams(param) + param <- LoadOptionalParams(param) +} +print (param) +Rprof() #Remove me +# Load Chip data, alread in pval format +# Column.order need to be a vecor of strings, that correspond to the order (and +# inclustion) of chip experiments. +chip <- GetChipData(param$annotated.macs.file, column.order = param$chip.order) + +#Get the top percentages on different criteria +#ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage) +#chip$peaks <- chip$peaks[ix.top$union, ] +#chip$targets <- chip$targets[ix.top$union] +#Bin data: +chip$peaks <- bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=(param$number.bins+1))) + +file.names <- unlist(strsplit(param$rna.files, split="::")) +print(file.names) +file.lables <- unlist(strsplit(param$rna.names, split="::")) +print(file.lables) + +library("RColorBrewer") +color.2 = rev(colorRampPalette(c("darkblue", "steelblue", "lightgrey"))(param$number.bins)) +if(!is.na(file.names) && file.names != "none") { + #rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") + #do differential expression if the user specifies such: + if(param$rna.normalization != "no") { + if(param$rna.normalization != "mean"){ + norm.file.names <- unlist(strsplit(param$rna.normalization.file, split="::")) + all.file.names <- c(file.names, norm.file.names) + all.file.lables <- c(file.lables, norm.file.names) + print (all.file.names) + print (all.file.lables) + rna.scores <- GetRNAData(all.file.names, file.lables = all.file.lables, fpkm = "avg") + rna.scores <- NormalizeRNA(scores = rna.scores) + rna.scores.sign <- rna.scores/abs(rna.scores) + rna.scores.sign[which(is.nan(rna.scores.sign))] <- 0 + rna.scores <- bindata.non.zero.matrix(abs(rna.scores), qunts = seq(0, 0.9, length=(param$number.bins+1))) + rna.scores <- rna.scores.sign * rna.scores + color.2 = colorRampPalette(c("darkblue", "steelblue", "lightgrey", "pink", "darkred"))(param$number.bins) + } else { + print("we are normalizing by average") + print("file lables are:") + print(file.lables) + rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") + rna.scores <- t(apply(rna.scores,1, function(x) { if(mean(x)!=0){ return(x/mean(x)) } else { return(x) } })) + rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1))) + } + } else { + rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") + rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1))) + } + + # chip$peaks <- chip$peaks[order(chip$targets), ] + # rna.scores <- rna.scores[order(rownames(rna.scores)), ] + # chip$targets <- chip$targets[order(chip$targets)] + + rna.scores.mapped <- MapExpressiontoChip(chip.targets = chip$targets, expression=rna.scores) + all.data <- cbind(chip$peaks, rna.scores.mapped) + if(param$include.targetless!="yes"){ + all.data <- all.data[-which(chip$targets==""),] + chip$peaks <- chip$peaks[-which(chip$targets==""),] + rna.scores.mapped <- rna.scores.mapped[-which(chip$targets==""),] + chip$targets <- chip$targets[-which(chip$targets=="")] + } +} else { + all.data <- chip$peaks +} + +print("Clustering") +set.seed(1234) +km <- kmeans(all.data, param$clustering.number.of.clusters, iter.max = 50) +print("Ordering") +km.order <- GenerateKMOrder(all.data, km) + +##AM edits## +km.new.order <- numeric(length(km$cluster)) +for(i in 1:length(km.order)){ + cur.clst <- km.order[i] + ix <- which(km$cluster==cur.clst) + km.new.order[ix] <- i +} +km.order <- 1:length(km.order) +km$cluster <- km.new.order +## Done AM ## +print("Creating image") +#bmp(param$heatmap.document.name, 640, 480)#1280, 960) +bmp(paste(param$heatmap.document.name,".bmp", sep = ""), 5120, 3840)#2560, 1920) +#pdf(paste(param$heatmap.document.name,".pdf", sep = "")) +color.1 <- c("#000000", colorRampPalette(c("blue", "yellow","orange","red"))(param$number.bins)) +if(!is.na(file.names) && file.names != "none") { + PrintClusters(trgts=chip$targets, + k.ix=km$cluster, + f.nm = param$cluster.groups.document.name, + km.order = NA) + + #if(param$rna.normalization != "no") { + # data.split <- cbind(chip$peaks, PrepareRNAforHeatmap(rna.scores.mapped)) + #} else { + # data.split <- cbind(chip$peaks, rna.scores.mapped) + #} + expression.width.multiplier <- 2 + if(ncol(rna.scores.mapped)==1) { + rna.scores.mapped <- cbind(rna.scores.mapped, rna.scores.mapped) + expression.width.multiplier <- 1 + } + + layout(matrix(c(1,2,2,5, + 1,3,4,6, + 1,3,4,7, + 1,3,4,8, + 1,3,4,9), + 5,4, + byrow=T), + widths=c(1,2*ncol(chip$peaks),expression.width.multiplier*ncol(rna.scores.mapped),1), + heights=c(1,10,2,10,2)) + #1 + plot.new() + #2 + plot.new() + #3 + print("Creating peak image") + CreateIndividualHeatMap(chip$peaks, + km, + km.order, color.ramp=color.1) + #4 + print("Creating rna image") + CreateIndividualHeatMap(rna.scores.mapped, + km, + km.order, color.ramp=color.2) + #5 + plot.new() + #6 + image(t(matrix(1:param$number.bins,nc=1)), col=color.1) + #7 + plot.new() + #8 + image(t(matrix(1:param$number.bins,nc=1)), col=color.2) + #9 + plot.new() +} else { + PrintClusters(trgts=1:nrow(chip$peaks), + k.ix=km$cluster, + f.nm = param$cluster.groups.document.name, + km.order = NA) + + + layout(matrix(c(1,2,2, + 1,3,4, + 1,3,5),3,3,byrow=T), + widths=c(1,3*ncol(chip$peaks),1), + heights=c(1,8,1)) + #1 + plot.new() + #2 + plot.new() + #3 + CreateIndividualHeatMap(chip$peaks, + km, + km.order, color.ramp=color.1) + #4 + image(t(matrix(1:param$number.bins,nc=1)), col=color.1) + #5 + plot.new() + +} + +dev.off() +Rprof(NULL) +profile <- summaryRprof() +print(str(profile)) +print(profile) +#save.image("test/testData.RData")