Mercurial > repos > kmace > mtls_analysis

GeneratePeakMatrix <- function(experiments, scores) {
  # Generates a score matrix for all mtls.
  #
  # Args:
  #   experiments: A list of underscore deliminated experiments for each mtl.
  #                There should never be a completely empty item in this list.
  #                for eg. experiments[1] = expA_expD_expB.
  #   scores: A list of underscore deliminated scores for each mtl, the length
  #           of these scores should be identicle to the length of the
  #           experiments. eg. scores[1] = 55_33_245.
  #
  # Returns:
  #   The peak score matrix for all mtls.
  experiments <- sapply(experiments, function(x) strsplit(x, split = "_"))
  scores <- sapply(scores, function(x) strsplit(x, split = "_"))
  unique.experiments <- unique(unlist(experiments))

  peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments))
  colnames(peaks) <- unique.experiments

  for(i in 1:length(experiments)){
    for(j in 1:length(experiments[[i]])){
      peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j])
    }
  }
  return(peaks)
##################################################################
}
GetChipData <- function(file.name,
                        proximity = "distal",
                        include.targetless = TRUE,
                        column.order = NA) {
  # Reads in, filters, and organizes mtls data
  #
  # Args:
  #   file.name: The path to the mtls file (including the file name).
  #   proximity: Either "distal" or "proximal", defines the gene target distance
  #              from the mtl. Default is "distal".
  #   include.targetless: If TRUE, includes mtls with no targets (applied after
  #                       the proximity filter); if not, mtls with no target
  #                       will be exclided. Default is TRUE.
  #   column.order: An optional vector of column names in the order in which
  #                 they will be presented. If this is left to default (NA) the
  #                 presented order of the chip columns will be the order in
  #                 which they are seen.
  #
  # Returns:
  #   An organized list of mtls data with the following elements:
  #     $peaks - a matrix of peak p-values
  #     $targets - a list of underscore deliminated gene targets for each mtl
  if(param$debug) {
    print("In GetChipData")
  }
  # Set Constants for the mtls file type:
  MTLNUM <- "mtl.id"
  CHROMOSOME <- "chr"
  EXPERIMENTS <- "expt"
  EXPERIMENTS.SORTED <- "expt.alphanum.sorted"
  START <- "start"
  END <- "end"
  SUMMIT <- "summit"
  SCORES <- "score"
  SCORE.MEAN <- "score.mean"
  SPANS <- "span.tfs"
  SPAN.TOTAL <- "span.l"
  PEAK.IDS <- "peak.ids"
  TARGETS <- "trgt"
#  PROXIMAL.TARGETS <- "trgt.prox"
#  DISTAL.TARGETS <- "trgt.dist"
#  PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox"
#  DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist"


  #Get chip data in from files:
  #TARGETS <- switch(proximity,
  #                   distal = DISTAL.TARGETS,
  #                   proximal = PROXIMAL.TARGETS,
  #                   stop(" Bad proximity argument supplied."))  # Default
cat("param$rna.files = ", param$rna.files, "\n")
if(!is.na(param$rna.files) && param$rna.files != "none") {
  keep.columns <- c(EXPERIMENTS, SCORES, TARGETS)
} else {
  keep.columns <- c(EXPERIMENTS, SCORES)
}
  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
if(!is.na(param$rna.files) && param$rna.files != "none") {
  if (!include.targetless) {
    ix.has.trgt <- which(as.character(file[, TARGETS])!="")
    file <- file[ix.has.trgt, ]
  }
}
  chip = list()

  chip$peaks <- GeneratePeakMatrix(file[, EXPERIMENTS], file[, SCORES])
  if(!is.na(column.order)) {
  #If you specify an order, or a subset of experiments to include, this is where
  #that gets done.
  order <- unlist(strsplit(column.order, split="::"))
  chip$peaks <- chip$peaks[,order]
  }
  if(!is.na(param$rna.files) && param$rna.files != "none") {
  chip$targets <- file[ , TARGETS]
}

  return (chip)
}
GetRNADataFromOneFile <- function(file.name, fpkm = "avg") {
  # Reads in rna expression data from cufflinks
  #
  # Args:
  #   file.name: The path to the cufflinks file (including the file name).
  #   fpkm: What fpkm is chosen, options are hi, low and avg
  # Returns:
  #   An organized vector of rnaseq data with the following elements:
  #     names(data) - the genes from the file. eg names(data[1]) = JUND
  #     data - the fpkm score for that gene data[1] = 31
  if(param$debug) {
    cat("In GetRNADataFromOneFile for ",file.name)
  }

 # Set Constants for the cufflinks file type:
  GENE <- "gene_id"
  BUNDLE <- "bundle_id"
  CHROMOSOME <- "chr"
  LEFT <- "left"
  RIGHT <- "right"
  FPKM_AVG <- "FPKM"
  FPKM_LOW <- "FPKM_conf_lo"
  FPKM_HIGH <- "FPKM_conf_hi"
  STATUS <- "status"


  #Get chip data in from files:
   FPKM <- switch(fpkm,
                  hi = FPKM_HIGH,
                  avg = FPKM_AVG,
                  low = FPKM_LOW,
                  stop(" Bad fpkm quality argument supplied."))  # Default

  keep.columns <- c(GENE, FPKM)

  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]

  rna = vector(mode = "numeric", length = nrow(file))

  rna <- file[, FPKM]
  names(rna) <- file[ , GENE]
  return (rna)
}

GetRNAData <- function(file.names, file.lables = file.names, fpkm = "avg") {
  # Reads in rna expression data from cufflinks
  #
  # Args:
  #   file.names: The list of paths to the cufflinks files (including the file name).
  #   fpkm: What fpkm is chosen, options are hi, low and avg
  # Returns:
  #   A matrix of rnaseq data with the following description:
  #     each col corresponds to an rnaseq run
  #     eacg row corresponds to a gene
  if(param$debug) {
    print("In GetRNAData")
  }
  files <- list()
  for(i in 1:length(file.names)) {
  files[[i]] <-GetRNADataFromOneFile(file.names[i])
  }
  genes <- unique(names(unlist(files)))
  scores <- matrix(0,nrow=length(genes),ncol=length(file.names))
  rownames(scores) <- genes
  colnames(scores) <- file.names

  for(j in 1:length(file.names)) {
    scores[names(files[[j]]),j] <- files[[j]]
  }
print("# of cols for scores is ")
print(ncol(scores))
print ("file lables are ")
print (file.lables)
  colnames(scores) <- file.lables
  return(scores)
  #scores[genes,file.names] <- files[[]]
}

NormalizeRNA <- function(scores) {

#add psudocount
scores <- scores+1

numerator <- scores[,1:(ncol(scores)/2)]
denominator <- scores[,(ncol(scores)/2 + 1):ncol(scores)]

#new.scores <- scores[,-which(colnames(scores) == norm.exp)]
#norm.score <- scores[,which(colnames(scores) == norm.exp)]
#return(log2(new.scores/norm.score))

return(log2(numerator/denominator))

}

PrepareRNAforHeatmap <- function(new.scores.foldchange){
new.scores.split <- matrix(0, nr=nrow(new.scores.foldchange), nc=2*ncol(new.scores.foldchange))
is.even <- function(x){ x %% 2 == 0 }
corresponding.col <- function(x){ceiling(x/2)}
new.col.names <- vector(length = ncol(new.scores.split))
rownames(new.scores.split) <- rownames(new.scores.foldchange)
  for(i in 1:ncol(new.scores.split)) {
	if(is.even(i)) {sign <- "down"} else {sign <- "up"}
  new.col.names[i] <- paste(colnames(new.scores.foldchange)[corresponding.col(i)],
					sign, sep =".")
	new.scores.split[,i] <- new.scores.foldchange[,corresponding.col(i)]
	if(is.even(i)) {
		new.scores.split[which(new.scores.split[,i]>0),i] <- 0
		new.scores.split[,i] <- -1*new.scores.split[,i]
	} else {
		new.scores.split[which(new.scores.split[,i]<0),i] <- 0
	}
	colnames(new.scores.split) <- new.col.names
}
return(new.scores.split)}

#ConvertExpressionToPval = function(expression, threshold = 1)
#{
#    print("In ConvertExpressionToPval")
#    #Generate Normal Stats
#    # expression.mean = apply(expression, 2, mean)
#    # expression.sd = apply(expression, 2, sd)
#
#    ix.above.threshold = which(expression > threshold)
#    expression.mean = apply(expression, 2, function(i) mean(i[which(i > threshold)]))
#    expression.sd = apply(expression, 2, function(i)     sd(i[which(i > threshold)]))
#

#    # CDF from zero to point
#    # expression.pval = matrix(,nr=nrow(expression),nc=ncol(expression),dimnames=dimnames(expression))
#    # ix.low = which(expression < expression.mean)
#    # ix.upper = which(expression >= expression.mean)
#    # expression.pval[ix.low] = (-10 * pnorm(expression[ix.low], mean = expression.mean, sd = expression.sd, log=T) *log10(exp(1)))
#
#    expression.pval = (-10 * log10(exp(1)) * pnorm(expression,
#                                                    mean = expression.mean,
#                                                    sd = expression.sd,
#                                                    log=T,
#                                                    lower.tail=F
#    )
#    )
#    #squash those under threshold to zero
#    expression.pval[-ix.above.threshold] = 0

#    # correct for upper tail (ie from point to inf)
#    #Still need to do
#    return (expression.pval)
#}
MapExpressiontoChip2 = function(chip.targets, expression)
{
mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression))
rownames(mymatrix) <- chip.targets
colnames(mymatrix) <- colnames(expression)


j <- 1 #expression counter
i <- 1 #mymatrix counter
expression.genes <- rownames(expression)
previous.expression.gene <- expression.genes[j]

# Progress bar:
total <- length(chip.targets)
# create progress bar
pb <- txtProgressBar(min = 0, max = total, style = 3)

for ( i in 1:length(chip.targets)) {
  current.target <- chip.targets[i]
  setTxtProgressBar(pb, i)
  while (current.target > previous.expression.gene && j<=length(expression.genes)) {
    j <- j + 1
    previous.expression.gene <- expression.genes[j]
  }
  if (current.target == previous.expression.gene){
    mymatrix[i,] <- expression[j,]
  }
}

close(pb)

           print("Leaving MapRNAtoChip")
           return(mymatrix)
}


MapExpressiontoChip = function(chip.targets, expression)
{
    print("In MapRNAtoChip")
    # targets = unlist(strsplit(chip$targets[i], "_"))
    #   exp = matrix(rna$all.pval[targets, ])
    #   return (apply(exp, 2, mean))

    #rna col 1 = th0 overexpressed
    #rna col 2 = th17 overexpressed
    #rna col 3 = true zscores


    ####################################################################
    chip.targets.notnull.unique <- unique(chip.targets[which(chip.targets!="")])
    gene.intersect <- intersect(rownames(expression), chip.targets.notnull.unique)
    chip.targets.notnull.unique.with.data <- chip.targets.notnull.unique[which(chip.targets.notnull.unique %in% gene.intersect)]
    expression.useful.data <- expression[which(rownames(expression) %in% gene.intersect),]
    chip.targets.notnull.unique.with.data.sorted <- chip.targets.notnull.unique.with.data[order(chip.targets.notnull.unique.with.data)]
    expression.useful.data.sorted <- expression.useful.data[order(rownames(expression.useful.data)),]
    ####################################################################

    mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression))
       rownames(mymatrix) <- chip.targets
       colnames(mymatrix) <- colnames(expression)
       head(chip.targets.notnull.unique.with.data.sorted)
       head(rownames(expression.useful.data.sorted))
       if(!identical(chip.targets.notnull.unique.with.data.sorted, rownames(expression.useful.data.sorted))){
        stop("We have a serious problem, chip is not alligned to expression")
      }
       for(i in 1:length(chip.targets.notnull.unique.with.data.sorted))
          {
            mtls.ix <- which(chip.targets == chip.targets.notnull.unique.with.data.sorted[i])
            for(j in 1:length(mtls.ix)){
                mymatrix[mtls.ix[j],] <- expression.useful.data.sorted[i,]
              }
          }
           print("Leaving MapRNAtoChip")
           return(mymatrix)

   #return( sapply(chip$targets, function(x) apply(rna$all.pval[unlist(strsplit(chip$targets[3], "_")), ], 2, mean)) )
}


GenerateKMOrder <- function(data, km) {
  # generates the order of clusters from high to low mean values
  #
  # Args:
  #   data: A matrix of scores that have already been clustered.
  #   km: The K-means object generated from running kmeans on data. Another
  #       method could be used so long as it supplies a (km)$cluser list. Must
  #       have the same length as the number of rows in data
  #
  # Returns:
  #   cluster.order: The order in which the clusters should be displayed.

    km.cluster = km$cluster
    clusters = unique(km.cluster)
    clusters.avg = numeric()
    for(i in clusters) {
        clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ]))
    }
    if(param$debug) {
      print ("straight clusters")
      print (clusters)
      print ("straigth average")
      print (clusters.avg)
      print ("ordered clusters")
      print (clusters[order(clusters.avg)])
      print("ordered average")
      print (clusters.avg[order(clusters.avg)])
    }
    return(clusters[rev(order(clusters.avg))])
}

OrderMTL <- function(data, km, cluster.order) {
  # Orders a matrix of data according to a clustering algorithm
  #
  # Args:
  #   data: A matrix of scores that have already been clustered.
  #   km: The K-means object generated from running kmeans on data. Another
  #       method could be used so long as it supplies a (km)$cluser list. Must
  #       have the same length as the number of rows in data
  #   cluster.order: The order in which the clusters should be displayed.
  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2
  #                  being on top, then cluster 3 and lastly cluster 1.
  #
  # Returns:
  #   a list that contains 3 objects:
  #     list$data: the ordered version of the data.
  #     list$color.vector: a list of colors that should be assigned to each row.
  #     list$start.row: the starting row of each cluster in data.
  number.clusters <- length(cluster.order)
  cluster.colors <- sample(rainbow(number.clusters))

  # Set up return objects
  sorted.data <- matrix(,nr=nrow(data), nc=ncol(data))
  colnames(sorted.data) <- colnames(data)
  cluster.color.vector = vector(length=length(km$cluster))
  cluster.start.row = numeric(number.clusters)
  cluster.start.row[1]=1

  for (i in 1:number.clusters)
  {
      current.cluster = cluster.order[i]
      ix = which(km$cluster == current.cluster)
      current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1)
      sorted.data[current.cluster.range, ] = data[ix, ]
      cluster.color.vector[current.cluster.range] = cluster.colors[i]
      cluster.start.row[i+1] = (cluster.start.row[i]+length(ix))
  }
  ret.list = list()
  ret.list$data = sorted.data
  ret.list$color.vector = cluster.color.vector
  ret.list$cluster.index = cluster.start.row
  return(ret.list)
}


CreateHeatMap <- function(data,
                         km,
                         cluster.order,
                         document.name,
                         document.type = "png",
                         number.colors = 30) {
  # Generates a heatmap image based for a matrix based on a clustering
  # algorithm.
  #
  # Args:
  #   data: A matrix of scores that have already been clustered, the column
  #         names of this matrix will become the column titels of the heatmap.
  #   km: The K-means object generated from running kmeans on data. Another
  #       method could be used so long as it supplies a (km)$cluser list.
  #   cluster.order: The order in which the clusters should be displayed.
  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2
  #                  being on top, then cluster 3 and lastly cluster 1.
  #   document.name: A name for the produced file. there is no need to
  #                  supply the .png/.pdf in your argument.
  #   document.type: The type of file you want to produce. current options are
  #                  png and pdf. Default is pdf.
  #
  # Returns:
  #   Nothing to the script that calls it, however it creates an image at the
  #   path specified.
  if(param$debug) {
    print("In CreateHeatMap")
  }

  data.ordered <- OrderMTL(data, km, cluster.order)


	#Load Lib
	library(gplots)

	#Set Color gradient
	color.ramp = colorRampPalette(c("black",
	                                "darkblue",
	                                "blue",
	                                "yellow",
	                                "orange",
	                                "red"))(number.colors) #7

	if(document.type == "png") {
          png(paste(document.name,".png", sep = ""),
              #width=15360,#1920,
              #height=204080,#2560,
              #res=500,
              antialias="gray")
        }
        else if(document.type == "pdf") {
        pdf(paste(document.name,".pdf", sep = ""))
        }
        else if(document.type == "tiff") {
        tiff(paste(document.name,".tiff", sep = ""),
        res=800,
        pointsize=2,
        width=1920,
        height=1920)
        }
        else {
        bitmap(paste(document.name,".bmp", sep = ""),
        height = 5,
        width = 5,
        res = 500)
        }
#op <- par(mar = rep(0, 4))

	heatmap.2(
		data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))],
		rowsep = data.ordered$cluster.index[-1],
		sepwidth = c(0.5, ncol(data)/100),
		dendrogram = "none",
		Rowv = F,
		Colv = F,
		trace = "none",
		labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString),
		labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)),
		RowSideColors = data.ordered$color.vector,
		keysize=0.6,
		key=F,
		col = color.ramp,
		cexCol = 0.8,
		cexRow = 0.8)

	dev.off()

}

CreateIndividualHeatMap <- function(data,
                         km,
                         cluster.order,
                         color.ramp = colorRampPalette(c("black",
                                  "darkblue",
                                  "blue",
                                  "yellow",
                                  "orange",
                                  "red"))(30)) {
  # Generates a heatmap image based for a matrix based on a clustering
  # algorithm.
  #
  # Args:
  #   data: A matrix of scores that have already been clustered, the column
  #         names of this matrix will become the column titels of the heatmap.
  #   km: The K-means object generated from running kmeans on data. Another
  #       method could be used so long as it supplies a (km)$cluser list.
  #   cluster.order: The order in which the clusters should be displayed.
  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2
  #                  being on top, then cluster 3 and lastly cluster 1.
  #   document.name: A name for the produced file. there is no need to
  #                  supply the .png/.pdf in your argument.
  #   document.type: The type of file you want to produce. current options are
  #                  png and pdf. Default is pdf.
  #
  # Returns:
  #   Nothing to the script that calls it, however it creates an image at the
  #   path specified.
  if(param$debug) {
    print("In CreateHeatMap")
  }

  data.ordered <- OrderMTL(data, km, cluster.order)


  #Load Lib
  library(gplots)


  heatmap.3(
    data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))],
    rowsep = data.ordered$cluster.index[-1],
    sepwidth = c(0.5, nrow(data.ordered$data)/100),
    dendrogram = "none",
    Rowv = F,
    Colv = F,
    trace = "none",
    labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString),
    labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)),
    #sourcRowSideColors = data.ordered$color.vector,
    keysize=0.6,
    key=F,
    col = color.ramp,
    cexCol = 0.8,
    cexRow = 0.8)


}

ReadCommadLineParameters <- function(argument.names, command.line.arguments, optional = F) {
  # Reads the parameters from the command line arguments.
  #
  # Args:
  #   argument.names: A list of expected argument names.
  #   command.line.arguments: The list of recieved command line arguments
  #   optional: Are the areguments optional, or are they required, default is required
  #
  # Returns:
  #   The arguments for argument.names. As strings In that order.

    if(length(grep("--version",command.line.arguments))) {
        cat("version",script.version,"\n")
        q()
    }
    # Split command line arguments
    args <- sapply(strsplit(command.line.arguments, " "),function(i) i)
    vals <- character(length(argument.names))
    # split cmd.line to key and value pairs
    for(i in 1:length(argument.names)) {
        ix <- grep(argument.names[i], args)
        if(length(ix)>1) {
            stop("arg ",
                 argument.names[i],
                 " used more than once.  Bailing out...\n",
                 PrintParamError())
        }
        else if (length(ix)==0 && !optional) {
            stop("could not find ",
                 argument.names[i],
                 ". Bailing out...\n",
                 PrintParamError())
        }
        else if (length(ix)==0 && optional) {
        vals[i] <- NA
        }
        else {
        vals[i] <- args[ix+1]
        }
    }
    return(vals)
}

PrintParamError <- function(){
  # Prints the usage of the function, shows users what arguments they can use
  #
  # Args:
  #   param: A list that contains all the paramaters.
  #
  # Returns:
  #   A modified version of the param list, with the default values loaded.
    cat("
DESCRIPTIION:
    heatmap.R takes a ...

INPUT:
    1.--mtls_file: path to mtls file.\n
    2.--cluster_file: the destination path for the output cluster file.\n
    3.--chip_experiment_order: The order of desired chip experiments (optional).\n
    4.--heatmap_file: path for output heatmap image (no extension).\n
    5.--heatmap_type: choice of image format, currently support png, pdf, tiff and bmp (optional)\n
    6.--expression_file: list of expression files to be included in analysis (optional).\n
    7.--expression_name: lables for the expression files (optional).\n
    8.--normalization_file: a list of files to be used for normalization,
    they can be the same file, however the number of expression nominated
    normalization files must match the number of expression files (optional)\n
    9.--n_clusters: number of clusters\n
    10.--filter_percentage: percentage of mtls that will be analysed. Eg: if
    we make filter_percentage 30, we will take the union of the top mtls in
    mean, non-zero mean and variance (optional).\n

EXAMPLE RUN:
    Rscript heatmap.R
        --mtls_file path/to/mtls.xls
        --cluster_file path/to/output/cluster
        --chip_experiment_order tf1::tf2::tf5::tf3
        --heatmap_file path/to/output/heatmap
        --heatmap_type png
        --expression_file path/to/exp1::path/to/exp2
        --expression_name myexp1::myexp2
        --normalization_file path/to/exp3::path/to/exp3
        --n_clusters 13
        --filter_percentage 100
        --include_targetless yes
        --number_bins 30

    ")
}

#LoadDefaultParams <- function(param) {
#  # Loads default paramaters for the heatmap application
#  #
#  # Args:
#  #   param: A list that contains all the previous paramaters.
#  #
#  # Returns:
#  #   A modified version of the param list, with the default values loaded.

#script.version=0.1
#param$debug = F

## RNA data:
#param$rna.files = ""
#param$rna.normalization = "none"

## Filter:
#param$filter.percentage <- 100

## Clustering:
#param$clustering.number.of.clusters <- 13

## Heatmap:
#param$heatmap.document.name <- "heatmap"
#param$heatmap.document.type <- "png"
##Cluster Groups:
#param$cluster.groups.document.name <- "clusters"

#return(param)

#}


LoadParams <- function(cmd.args, args.nms, n.start.numeric, optional = F) {
  # Loads user defined paramaters for the heatmap application
  #
  # Args:
  #   cmd.args: The command line arguments given.
  #   arg.nms: A list of possible command arguments.
  #   n.start.numeric: the first argument that should be numeric (alway put
  #                        these last).
  #   optional: This specifies if the params in cmd.args are optional or
  #             required.
  # Returns:
  #   A list of values assigned to each argument.


  vals <- ReadCommadLineParameters(argument.names = args.nms,
                                 command.line.arguments = cmd.args,
                                 optional = optional)

   #check if numeric params are indeed numeric
  if(!optional) {
  for(i in n.start.numeric:length(vals)){
    if(is.na(as.numeric(vals[i]))){
      stop("arg ",args.nms[i]," is not numeric.  Bailing out...\n",print.error())
    }
  }
}
  return (vals)

}

#ValidateParams <- function(params) {
#    return(T)
#}


##########

LoadDebugParams <- function(param) {

    cmd.args <- c(
      "--mtls_file data/test/mtls.xls",
      "--cluster_file data/test/cluster",
      "--heatmap_file data/test/heatmap",
      "--heatmap_type bmp",
      "--n_clusters 13",
      "--filter_percentage 100",
      "--expression_file /home/kieran/code/scorchR-heatmap/data/expression/rna1.tabular::/home/kieran/code/scorchR-heatmap/data/expression/rna2.tabular",
      "--expression_name batf.ko.0::batf.ko.17",
      "--normalization_file mean",
      "--chip_experiment_order ac::bc::cc::dc::ec",
      "--include_targetless yes",
      "--number_bins 20"
    )

args.nms <- c(            "--mtls_file",        #1
                "--cluster_file",     #2
                "--chip_experiment_order",    #3
                "--heatmap_file",     #4
                "--heatmap_type",         #5
                "--expression_file",    #6
                "--expression_name",    #7
                "--normalization_file",    #8
                "--include_targetless",    #9
                 "--n_clusters",     #10
                "--filter_percentage", #11
                "--number_bins")    #12


# vals has the same order as args.nms
vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 10, optional = F)


# ChIP data:
param$annotated.macs.file <- vals[1]
param$chip.order <- vals[3]
# RNA data:
param$rna.files = vals[6]
param$rna.names = vals[7]
param$rna.normalization.file = vals[8]
param$include.targetless = vals[9]

# Filter:
param$filter.percentage <- as.numeric(vals[11])

# Clustering:
param$number.bins <- as.numeric(vals[12])
param$clustering.number.of.clusters <- as.numeric(vals[10])

# Heatmap:
param$heatmap.document.name <- vals[4]
param$heatmap.document.type <- vals[5]
#Cluster Groups:
param$cluster.groups.document.name <- vals[2]

return(param)

}


LoadOptionalParams <- function(param) {

cmd.args <- commandArgs(trailingOnly = T)

args.nms <- c(  "--chip_experiment_order",  #1
                "--expression_file",        #2
                "--expression_name",        #3
                "--normalization_file",     #4
                "--heatmap_type",           #5
                "--include_targetless",     #6
                "--filter_percentage",      #7
                "--number_bins")            #8


# vals has the same order as args.nms
vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 7, optional = T)

# ChIP data:
param$chip.order <- if(!is.na(vals[1])){vals[1]}else{NA}

# RNA data:
param$rna.files <- if(!is.na(vals[2])){vals[2]}else{"none"}
param$rna.names <- if(!is.na(vals[3])){vals[3]}else{"none"}
param$rna.normalization.file <- if(!is.na(vals[4])){vals[4]}else{"no"}
param$include.targetless <- if(!is.na(vals[6])){vals[6]}else{"yes"}
# Filter:
param$filter.percentage <- if(!is.na(vals[7])){as.numeric(vals[7])}else{100}
param$number.bins <- if(!is.na(vals[8])){as.numeric(vals[8])}else{30}
# Heatmap file (output)
param$heatmap.document.type <- if(!is.na(vals[5])){vals[5]}else{"none"}

return(param)
}


LoadReqiredParams <- function(param){

cmd.args <- commandArgs(trailingOnly = T)

args.nms <- c(  "--mtls_file",          #1
                "--cluster_file",       #2
                "--heatmap_file",       #3
                 "--n_clusters")        #4


# vals has the same order as args.nms
vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 4, optional = F)

# ChIP data:
param$annotated.macs.file <- vals[1]

# Clustering
param$clustering.number.of.clusters <- as.numeric(vals[4])

# Cluster file (output):
param$cluster.groups.document.name <- vals[2]


# Heatmap file (output):
param$heatmap.document.name <- vals[3]

return(param)

}

###########
# here we output the fasta file of the targets of each kmeans cluster
CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
  f.nm = paste(f.nm, ".fasta", sep="")
    clusters = km.order
    for(i in 1:length(clusters)){
        v=trgts[which(k.ix==clusters[i])]
        v.split = unlist(sapply(v,strsplit, "_"))
        if(i == 1){
            cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n")
        } else {
            cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
        }
        cat(sep="\n",file=f.nm,v.split,append=TRUE)
    }
}


PrintClusters <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
	f.nm = paste(f.nm, ".tsv", sep="")
	cat(sep="\t",file=f.nm,"row_number/target","cluster","\n")
	trgts[which(trgts=="")] <- "no_target"
    for(i in 1:length(trgts)){
		cat(sep="",file=f.nm,trgts[i],"\t","cluster_",k.ix[i],"\n",append=TRUE)
			}
#  	  if(i == 1){
#  	    cat(sep="",file=f.nm,"cluster_",i,"\n")#clusters[i],"\n")
#  	  } else {
#  	    cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
#  	  }
      #cat(sep="\n",file=f.nm,v.split,append=TRUE)
  	}
GetTopRowsFromMatrix = function(mtrx, percentage = 10)
{
  if (param$debug) {
    print("In GetTopRowsFromMatrix")
	}
	#Store the stats for the mtrx
	stats = list()
	stats$mean=apply(mtrx,1,mean)
	stats$sd=apply(mtrx,1,sd)
	stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)]))


	#Store the indexes for the mtrx
	index = list()
	index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix
	index$sd   = sort(stats$sd, decreasing=T, index.return=T)$ix
	index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix

	#Calculate how many data points we want to take
	cutoff = floor(length(mtrx[ ,1])*(percentage/100))

	#Extract Indexes and return to caller
	index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff]))

	return(index)
}

#GetUniq = function(targets.cluster)
#{
#    #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order
#
#    return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))))
#    #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))
#
#}

bindata = function(d,qunts=seq(.4,.9,.1))
{
    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
    for(j in 1:ncol(d))
    {
        bins=quantile(d[,j],qunts)
            for(i in 1:length(bins))
            {
                tmp[which(d[,j]>bins[i]),j] = i
         }
    }
    return(tmp)
}

bindata.non.zero = function(d,qunts=seq(0,0.9,0.1))
{
    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
    for(j in 1:ncol(d))
    {
        ix.non.zero=which(d[,j]!=0)
        bins=quantile(d[ix.non.zero,j],qunts)
            for(i in 1:length(bins))
            {
                tmp[which(d[,j]>bins[i]),j] = i
         }
    }

    return(tmp)
}
bindata.matrix = function(d,qunts=seq(0,0.9,0.1))
{
    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
    #ix.non.zero=which(d!=0)
    bins=quantile(d[],qunts)
    for(i in 1:length(bins))
    {
        tmp[which(d>bins[i])] = i
    }
    return(tmp)
}
bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1))
{
    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
    ix.non.zero=which(d!=0)
    bins=quantile(d[ix.non.zero],qunts)
    for(i in 1:length(bins))
    {
        tmp[which(d>bins[i])] = i
    }
    return(tmp)
}
heatmap.3 <- function (x, Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE,
    distfun = dist, hclustfun = hclust, dendrogram = c("both",
        "row", "column", "none"), symm = FALSE, scale = c("none",
        "row", "column"), na.rm = TRUE, revC = identical(Colv,
        "Rowv"), add.expr, breaks, symbreaks = min(x < 0, na.rm = TRUE) ||
        scale != "none", col = "heat.colors", colsep, rowsep,
    sepcolor = "white", sepwidth = c(0.05, 0.05), cellnote, notecex = 1,
    notecol = "cyan", na.color = par("bg"), trace = c("column",
        "row", "both", "none"), tracecol = "cyan", hline = median(breaks),
    vline = median(breaks), linecol = tracecol, margins = c(5,
        5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(nr),
    cexCol = 0.2 + 1/log10(nc), labRow = NULL, labCol = NULL,
    key = TRUE, keysize = 1.5, density.info = c("histogram",
        "density", "none"), denscol = tracecol, symkey = min(x <
        0, na.rm = TRUE) || symbreaks, densadj = 0.25, main = NULL,
    xlab = NULL, ylab = NULL, lmat = NULL, lhei = NULL, lwid = NULL,
    ...)
{
    scale01 <- function(x, low = min(x), high = max(x)) {
        x <- (x - low)/(high - low)
        x
    }
    retval <- list()
    scale <- if (symm && missing(scale))
        "none"
    else match.arg(scale)
    dendrogram <- match.arg(dendrogram)
    trace <- match.arg(trace)
    density.info <- match.arg(density.info)
    if (length(col) == 1 && is.character(col))
        col <- get(col, mode = "function")
    if (!missing(breaks) && (scale != "none"))
        warning("Using scale=\"row\" or scale=\"column\" when breaks are",
            "specified can produce unpredictable results.", "Please consider using only one or the other.")
    if (is.null(Rowv) || is.na(Rowv))
        Rowv <- FALSE
    if (is.null(Colv) || is.na(Colv))
        Colv <- FALSE
    else if (Colv == "Rowv" && !isTRUE(Rowv))
        Colv <- FALSE
    if (length(di <- dim(x)) != 2 || !is.numeric(x))
        stop("`x' must be a numeric matrix")
    nr <- di[1]
    nc <- di[2]
    if (nr <= 1 || nc <= 1)
        stop("`x' must have at least 2 rows and 2 columns")
    if (!is.numeric(margins) || length(margins) != 2)
        stop("`margins' must be a numeric vector of length 2")
    if (missing(cellnote))
        cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x))
    if (!inherits(Rowv, "dendrogram")) {
        if (((!isTRUE(Rowv)) || (is.null(Rowv))) && (dendrogram %in%
            c("both", "row"))) {
            if (is.logical(Colv) && (Colv))
                dendrogram <- "column"
            else dedrogram <- "none"
            warning("Discrepancy: Rowv is FALSE, while dendrogram is `",
                dendrogram, "'. Omitting row dendogram.")
        }
    }
    if (!inherits(Colv, "dendrogram")) {
        if (((!isTRUE(Colv)) || (is.null(Colv))) && (dendrogram %in%
            c("both", "column"))) {
            if (is.logical(Rowv) && (Rowv))
                dendrogram <- "row"
            else dendrogram <- "none"
            warning("Discrepancy: Colv is FALSE, while dendrogram is `",
                dendrogram, "'. Omitting column dendogram.")
        }
    }
    if (inherits(Rowv, "dendrogram")) {
        ddr <- Rowv
        rowInd <- order.dendrogram(ddr)
    }
    else if (is.integer(Rowv)) {
        hcr <- hclustfun(distfun(x))
        ddr <- as.dendrogram(hcr)
        ddr <- reorder(ddr, Rowv)
        rowInd <- order.dendrogram(ddr)
        if (nr != length(rowInd))
            stop("row dendrogram ordering gave index of wrong length")
    }
    else if (isTRUE(Rowv)) {
        Rowv <- rowMeans(x, na.rm = na.rm)
        hcr <- hclustfun(distfun(x))
        ddr <- as.dendrogram(hcr)
        ddr <- reorder(ddr, Rowv)
        rowInd <- order.dendrogram(ddr)
        if (nr != length(rowInd))
            stop("row dendrogram ordering gave index of wrong length")
    }
    else {
        rowInd <- nr:1
    }
    if (inherits(Colv, "dendrogram")) {
        ddc <- Colv
        colInd <- order.dendrogram(ddc)
    }
    else if (identical(Colv, "Rowv")) {
        if (nr != nc)
            stop("Colv = \"Rowv\" but nrow(x) != ncol(x)")
        if (exists("ddr")) {
            ddc <- ddr
            colInd <- order.dendrogram(ddc)
        }
        else colInd <- rowInd
    }
    else if (is.integer(Colv)) {
        hcc <- hclustfun(distfun(if (symm)
            x
        else t(x)))
        ddc <- as.dendrogram(hcc)
        ddc <- reorder(ddc, Colv)
        colInd <- order.dendrogram(ddc)
        if (nc != length(colInd))
            stop("column dendrogram ordering gave index of wrong length")
    }
    else if (isTRUE(Colv)) {
        Colv <- colMeans(x, na.rm = na.rm)
        hcc <- hclustfun(distfun(if (symm)
            x
        else t(x)))
        ddc <- as.dendrogram(hcc)
        ddc <- reorder(ddc, Colv)
        colInd <- order.dendrogram(ddc)
        if (nc != length(colInd))
            stop("column dendrogram ordering gave index of wrong length")
    }
    else {
        colInd <- 1:nc
    }
    retval$rowInd <- rowInd
    retval$colInd <- colInd
    retval$call <- match.call()
    x <- x[rowInd, colInd]
    x.unscaled <- x
    cellnote <- cellnote[rowInd, colInd]
    if (is.null(labRow))
        labRow <- if (is.null(rownames(x)))
            (1:nr)[rowInd]
        else rownames(x)
    else labRow <- labRow[rowInd]
    if (is.null(labCol))
        labCol <- if (is.null(colnames(x)))
            (1:nc)[colInd]
        else colnames(x)
    else labCol <- labCol[colInd]
    if (scale == "row") {
        retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm)
        x <- sweep(x, 1, rm)
        retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm)
        x <- sweep(x, 1, sx, "/")
    }
    else if (scale == "column") {
        retval$colMeans <- rm <- colMeans(x, na.rm = na.rm)
        x <- sweep(x, 2, rm)
        retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm)
        x <- sweep(x, 2, sx, "/")
    }
    if (missing(breaks) || is.null(breaks) || length(breaks) <
        1) {
        if (missing(col) || is.function(col))
            breaks <- 16
        else breaks <- length(col) + 1
    }
    if (length(breaks) == 1) {
        if (!symbreaks)
            breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm),
                length = breaks)
        else {
            extreme <- max(abs(x), na.rm = TRUE)
            breaks <- seq(-extreme, extreme, length = breaks)
        }
    }
    nbr <- length(breaks)
    ncol <- length(breaks) - 1
    if (class(col) == "function")
        col <- col(ncol)
    min.breaks <- min(breaks)
    max.breaks <- max(breaks)
    x[x < min.breaks] <- min.breaks
    x[x > max.breaks] <- max.breaks
    # if (missing(lhei) || is.null(lhei))
    #     lhei <- c(keysize, 4)
    # if (missing(lwid) || is.null(lwid))
    #     lwid <- c(keysize, 4)
    # if (missing(lmat) || is.null(lmat)) {
    #     lmat <- rbind(4:3, 2:1)
    #     if (!missing(ColSideColors)) {
    #         if (!is.character(ColSideColors) || length(ColSideColors) !=
    #             nc)
    #             stop("'ColSideColors' must be a character vector of length ncol(x)")
    #         lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] +
    #             1)
    #         lhei <- c(lhei[1], 0.2, lhei[2])
    #     }
    #     if (!missing(RowSideColors)) {
    #         if (!is.character(RowSideColors) || length(RowSideColors) !=
    #             nr)
    #             stop("'RowSideColors' must be a character vector of length nrow(x)")
    #         lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) -
    #             1), 1), lmat[, 2] + 1)
    #         lwid <- c(lwid[1], 0.2, lwid[2])
    #     }
    #     lmat[is.na(lmat)] <- 0
    # }
    # if (length(lhei) != nrow(lmat))
    #     stop("lhei must have length = nrow(lmat) = ", nrow(lmat))
    # if (length(lwid) != ncol(lmat))
    #     stop("lwid must have length = ncol(lmat) =", ncol(lmat))
    # op <- par(no.readonly = TRUE)
    # on.exit(par(op))
    # layout(lmat, widths = lwid, heights = lhei, respect = FALSE)
    if (!missing(RowSideColors)) {
        par(mar = c(margins[1], 0, 0, 0.5))
        image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE)
    }
    if (!missing(ColSideColors)) {
        par(mar = c(0.5, 0, 0, margins[2]))
        image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE)
    }
    par(mar = c(margins[1], 0, 0, margins[2]))
    x <- t(x)
    cellnote <- t(cellnote)
    if (revC) {
        iy <- nr:1
        if (exists("ddr"))
            ddr <- rev(ddr)
        x <- x[, iy]
        cellnote <- cellnote[, iy]
    }
    else iy <- 1:nr
    image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 +
        c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col,
        breaks = breaks, ...)
    retval$carpet <- x
    if (exists("ddr"))
        retval$rowDendrogram <- ddr
    if (exists("ddc"))
        retval$colDendrogram <- ddc
    retval$breaks <- breaks
    retval$col <- col
    if (!invalid(na.color) & any(is.na(x))) {
        mmat <- ifelse(is.na(x), 1, NA)
        image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "",
            col = na.color, add = TRUE)
    }
    axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0,
        cex.axis = cexCol)
    if (!is.null(xlab))
        mtext(xlab, side = 1, line = margins[1] - 1.25)
    axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0,
        cex.axis = cexRow)
    if (!is.null(ylab))
        mtext(ylab, side = 4, line = margins[2] - 1.25)
    if (!missing(add.expr))
        eval(substitute(add.expr))
    if (!missing(colsep))
        for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0,
            length(csep)), xright = csep + 0.5 + sepwidth[1],
            ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1,
            col = sepcolor, border = sepcolor)
    if (!missing(rowsep))
        for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) +
            1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) +
            1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1,
            col = sepcolor, border = sepcolor)
    min.scale <- min(breaks)
    max.scale <- max(breaks)
    x.scaled <- scale01(t(x), min.scale, max.scale)
    if (trace %in% c("both", "column")) {
        retval$vline <- vline
        vline.vals <- scale01(vline, min.scale, max.scale)
        for (i in colInd) {
            if (!is.null(vline)) {
                abline(v = i - 0.5 + vline.vals, col = linecol,
                  lty = 2)
            }
            xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5
            xv <- c(xv[1], xv)
            yv <- 1:length(xv) - 0.5
            lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s")
        }
    }
    if (trace %in% c("both", "row")) {
        retval$hline <- hline
        hline.vals <- scale01(hline, min.scale, max.scale)
        for (i in rowInd) {
            if (!is.null(hline)) {
                abline(h = i + hline, col = linecol, lty = 2)
            }
            yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5
            yv <- rev(c(yv[1], yv))
            xv <- length(yv):1 - 0.5
            lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s")
        }
    }
    if (!missing(cellnote))
        text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote),
            col = notecol, cex = notecex)
    par(mar = c(margins[1], 0, 0, 0))
    # if (dendrogram %in% c("both", "row")) {
    #     plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none")
    # }
    # else plot.new()
    par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2]))
    # if (dendrogram %in% c("both", "column")) {
    #     plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none")
    # }
    # else plot.new()
    if (!is.null(main))
        title(main, cex.main = 1.5 * op[["cex.main"]])
    # if (key) {
    #     par(mar = c(5, 4, 2, 1), cex = 0.75)
    #     tmpbreaks <- breaks
    #     if (symkey) {
    #         max.raw <- max(abs(c(x, breaks)), na.rm = TRUE)
    #         min.raw <- -max.raw
    #         tmpbreaks[1] <- -max(abs(x), na.rm = TRUE)
    #         tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE)
    #     }
    #     else {
    #         min.raw <- min(x, na.rm = TRUE)
    #         max.raw <- max(x, na.rm = TRUE)
    #     }
    #     z <- seq(min.raw, max.raw, length = length(col))
    #     image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks,
    #         xaxt = "n", yaxt = "n")
    #     par(usr = c(0, 1, 0, 1))
    #     lv <- pretty(breaks)
    #     xv <- scale01(as.numeric(lv), min.raw, max.raw)
    #     axis(1, at = xv, labels = lv)
    #     if (scale == "row")
    #         mtext(side = 1, "Row Z-Score", line = 2)
    #     else if (scale == "column")
    #         mtext(side = 1, "Column Z-Score", line = 2)
    #     else mtext(side = 1, "Value", line = 2)
    #     if (density.info == "density") {
    #         dens <- density(x, adjust = densadj, na.rm = TRUE)
    #         omit <- dens$x < min(breaks) | dens$x > max(breaks)
    #         dens$x <- dens$x[-omit]
    #         dens$y <- dens$y[-omit]
    #         dens$x <- scale01(dens$x, min.raw, max.raw)
    #         lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol,
    #             lwd = 1)
    #         axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y))
    #         title("Color Key\nand Density Plot")
    #         par(cex = 0.5)
    #         mtext(side = 2, "Density", line = 2)
    #     }
    #     else if (density.info == "histogram") {
    #         h <- hist(x, plot = FALSE, breaks = breaks)
    #         hx <- scale01(breaks, min.raw, max.raw)
    #         hy <- c(h$counts, h$counts[length(h$counts)])
    #         lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s",
    #             col = denscol)
    #         axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy))
    #         title("Color Key\nand Histogram")
    #         par(cex = 0.5)
    #         mtext(side = 2, "Count", line = 2)
    #     }
    #     else title("Color Key")
    # }
    # else plot.new()
    retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)],
        high = retval$breaks[-1], color = retval$col)
    invisible(retval)
}

#rm(list=ls())
param <- list()
param$debug <- FALSE #T

print("Finished loading functions")


if(param$debug) {
    param <- LoadDebugParams(param)
} else {
    param <- LoadReqiredParams(param)
    param <- LoadOptionalParams(param)
}
print (param)
Rprof() #Remove me
# Load Chip data, alread in pval format
# Column.order need to be a vecor of strings, that correspond to the order (and
# inclustion) of chip experiments.
chip <- GetChipData(param$annotated.macs.file, column.order = param$chip.order)

#Get the top percentages on different criteria
#ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage)
#chip$peaks <- chip$peaks[ix.top$union, ]
#chip$targets <- chip$targets[ix.top$union]
#Bin data:
chip$peaks <- bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=(param$number.bins+1)))

file.names <- unlist(strsplit(param$rna.files, split="::"))
print(file.names)
file.lables <- unlist(strsplit(param$rna.names, split="::"))
print(file.lables)

library("RColorBrewer")
color.2 =  rev(colorRampPalette(c("darkblue", "steelblue", "lightgrey"))(param$number.bins))
if(!is.na(file.names) && file.names != "none") {
    #rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
    #do differential expression if the user specifies such:
    if(param$rna.normalization != "no") {
        if(param$rna.normalization != "mean"){
            norm.file.names <- unlist(strsplit(param$rna.normalization.file, split="::"))
            all.file.names <- c(file.names, norm.file.names)
            all.file.lables <- c(file.lables, norm.file.names)
            print (all.file.names)
            print (all.file.lables)
            rna.scores <- GetRNAData(all.file.names, file.lables = all.file.lables, fpkm = "avg")
            rna.scores <- NormalizeRNA(scores = rna.scores)
            rna.scores.sign <- rna.scores/abs(rna.scores)
            rna.scores.sign[which(is.nan(rna.scores.sign))] <- 0
            rna.scores <- bindata.non.zero.matrix(abs(rna.scores), qunts = seq(0, 0.9, length=(param$number.bins+1)))
            rna.scores <- rna.scores.sign * rna.scores
            color.2 = colorRampPalette(c("darkblue", "steelblue", "lightgrey", "pink", "darkred"))(param$number.bins)
        } else {
            print("we are normalizing by average")
            print("file lables are:")
            print(file.lables)
            rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
            rna.scores <- t(apply(rna.scores,1, function(x) { if(mean(x)!=0){ return(x/mean(x)) } else { return(x) }  }))
            rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1)))
        }
    } else {
        rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
        rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1)))
    }

    # chip$peaks <- chip$peaks[order(chip$targets), ]
    # rna.scores <- rna.scores[order(rownames(rna.scores)), ]
    # chip$targets <- chip$targets[order(chip$targets)]

    rna.scores.mapped <- MapExpressiontoChip(chip.targets = chip$targets, expression=rna.scores)
    all.data <- cbind(chip$peaks, rna.scores.mapped)
    if(param$include.targetless!="yes"){
        all.data <- all.data[-which(chip$targets==""),]
        chip$peaks <- chip$peaks[-which(chip$targets==""),]
        rna.scores.mapped <- rna.scores.mapped[-which(chip$targets==""),]
        chip$targets <- chip$targets[-which(chip$targets=="")]
    }
} else {
    all.data <- chip$peaks
}

print("Clustering")
set.seed(1234)
km <- kmeans(all.data, param$clustering.number.of.clusters, iter.max = 50)
print("Ordering")
km.order <- GenerateKMOrder(all.data, km)

##AM edits##
km.new.order <- numeric(length(km$cluster))
for(i in 1:length(km.order)){
    cur.clst <- km.order[i]
    ix <- which(km$cluster==cur.clst)
    km.new.order[ix] <- i
}
km.order <- 1:length(km.order)
km$cluster <- km.new.order
## Done AM ##
print("Creating image")
#bmp(param$heatmap.document.name, 640, 480)#1280, 960)
bmp(paste(param$heatmap.document.name,".bmp", sep = ""), 5120, 3840)#2560, 1920)
#pdf(paste(param$heatmap.document.name,".pdf", sep = ""))
color.1 <- c("#000000", colorRampPalette(c("blue", "yellow","orange","red"))(param$number.bins))
if(!is.na(file.names) && file.names != "none") {
    PrintClusters(trgts=chip$targets,
        k.ix=km$cluster,
        f.nm = param$cluster.groups.document.name,
        km.order = NA)

    #if(param$rna.normalization != "no") {
    #    data.split <- cbind(chip$peaks, PrepareRNAforHeatmap(rna.scores.mapped))
    #} else {
    #    data.split <- cbind(chip$peaks, rna.scores.mapped)
    #}
    expression.width.multiplier <- 2
    if(ncol(rna.scores.mapped)==1) {
        rna.scores.mapped <- cbind(rna.scores.mapped, rna.scores.mapped)
        expression.width.multiplier <- 1
    }

    layout(matrix(c(1,2,2,5,
        1,3,4,6,
        1,3,4,7,
        1,3,4,8,
        1,3,4,9),
    5,4,
    byrow=T),
    widths=c(1,2*ncol(chip$peaks),expression.width.multiplier*ncol(rna.scores.mapped),1),
    heights=c(1,10,2,10,2))
    #1
    plot.new()
    #2
    plot.new()
    #3
    print("Creating peak image")
    CreateIndividualHeatMap(chip$peaks,
        km,
        km.order, color.ramp=color.1)
    #4
    print("Creating rna image")
    CreateIndividualHeatMap(rna.scores.mapped,
        km,
        km.order, color.ramp=color.2)
    #5
    plot.new()
    #6
    image(t(matrix(1:param$number.bins,nc=1)), col=color.1)
    #7
    plot.new()
    #8
    image(t(matrix(1:param$number.bins,nc=1)), col=color.2)
    #9
    plot.new()
} else {
    PrintClusters(trgts=1:nrow(chip$peaks),
        k.ix=km$cluster,
        f.nm = param$cluster.groups.document.name,
        km.order = NA)


    layout(matrix(c(1,2,2,
        1,3,4,
        1,3,5),3,3,byrow=T),
        widths=c(1,3*ncol(chip$peaks),1),
        heights=c(1,8,1))
    #1
    plot.new()
    #2
    plot.new()
    #3
    CreateIndividualHeatMap(chip$peaks,
        km,
        km.order, color.ramp=color.1)
    #4
    image(t(matrix(1:param$number.bins,nc=1)), col=color.1)
    #5
    plot.new()

}

dev.off()
Rprof(NULL)
profile <- summaryRprof()
print(str(profile))
print(profile)
#save.image("test/testData.RData")
author	kmace
date	Mon, 23 Jul 2012 13:00:15 -0400
parents
children