# HG changeset patch
# User kmace
# Date 1343062815 14400
# Node ID b465306d00bad605eefeedee4529cee4ab5db487
# Parent  a0306edbf2f8dd0d001605507a786731dad9f90a
Uploaded

diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/annotate_mtls.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/annotate_mtls.R	Mon Jul 23 13:00:15 2012 -0400
@@ -0,0 +1,79 @@
+debug = F
+# Read in Data
+if (debug == T) {
+  mtls <- read.delim("~/code/scorchR-annotate_mtls/mtls.xls", quote="", stringsAsFactors=F)
+  annotation <- read.delim("~/code/scorchR-annotate_mtls/gene_annotation.txt", header=T, stringsAsFactors=F)
+  threshold <- 100000  
+} else {
+  cmd <- commandArgs(trailingOnly = T)
+  mtls <- read.delim(cmd[1], quote="", stringsAsFactors=F)
+  annotation <- read.delim(cmd[2], header=T, stringsAsFactors=F)
+  threshold <- as.numeric(cmd[3])
+}
+
+
+
+# Get the median summit
+summits <- sapply(strsplit(mtls[ ,"summit"], split="_"), function(x) {median(as.numeric(x))})
+# Extract every tss
+tss <- apply(annotation,1,function(x) {
+  if (x["strand"]=="+"){
+    return(as.numeric(x["txStart"]))
+    }else{
+      return(as.numeric(x["txEnd"]))
+      }
+  })
+
+
+# Get each Chromosome
+chromosomes <- unique( mtls[,"chr"])
+
+# Set up output:
+trgt <- character(length = length(summits))
+
+# Loop through Chromosomes
+for (chromosome in chromosomes){
+  cat(chromosome, "\n")
+  chr.ix.tss <- which(annotation[ ,"chrom"]==chromosome)
+  chr.ix.summits <- which(mtls[ ,"chr"]==chromosome)
+  
+
+  cat(length(chr.ix.summits), " -summits\n",
+      length(chr.ix.tss), " -tss\n")
+  if (length(chr.ix.summits)>0 && length(chr.ix.tss)>0) {
+  # Loop through Summits
+  for(summit.ix in chr.ix.summits) {
+    #cat("Summit = ", summit.ix, class(summit.ix),"\n")
+    distances <- abs(tss[chr.ix.tss] - summits[summit.ix])
+    closest.ix.tss <- which(distances == min(distances))[1]
+    if(!is.na(closest.ix.tss) && !is.na(distances[closest.ix.tss])) {
+    #print(closest.ix.tss)
+    if (distances[closest.ix.tss] < threshold) {
+      
+      trgt[summit.ix] <- annotation[chr.ix.tss, "name2"][closest.ix.tss]
+   
+#       cat("A distance of ",
+#           distances[closest.ix.tss],
+#           " was found for the mtl summit located at: ",
+#           chromosome,
+#           ": ",
+#           summits[summit.ix],
+#           " and TSS located at: ",
+#           chromosome,
+#           ": ",
+#           tss[chr.ix.tss][closest.ix.tss],
+#           "\n")
+    }
+    }
+  }
+  } else {
+    cat("chromosome", chromosome, " either has no transcripts or no peaks in this experiment. \n")
+  }
+  output <- cbind(mtls, trgt)
+}
+
+write.table(output, file="annotated_mtls.xls", sep="\t",row.names=F, quote=F)
+
+
+
+
diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/cluster_peaks.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/cluster_peaks.R	Mon Jul 23 13:00:15 2012 -0400
@@ -0,0 +1,431 @@
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+## Feb 2012 th17
+## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, 
+## NYU - Center for Genomics and Systems Biology
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+
+rm(list=ls())
+debug=F
+verbose=F
+script.version=0.1
+print.error <- function(){
+	cat("
+DESCRIPTIION:	
+	cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where 
+	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
+	fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits.
+	If you use .bed files, the files must contain no header, and at least the five columns: 
+	1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared
+	interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits.
+
+INPUT:
+	1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
+	2.path_output: path to save generated MTL cluster file (where to save mtls.xls)
+	3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
+	4.input_type: the type of input file used (MACS or BED; defaults to MACS)
+	5.mtl_type: interval or summit (defaults to summit)
+	6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit)
+		
+EXAMPLE RUN: 
+	cluster_peaks.R
+	--input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
+	--input_type MACS
+	--path_output results/
+	--expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
+	--dist_summits 100
+	--mtl_type summit
+	
+Please cite us if you used this script: 
+	The transcription factor network regulating Th17 lineage specification and function.
+	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, 
+	Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
+}
+
+############# helper functions:
+## split.macs.list.to.chrmosomes
+# input:
+#      - macs.list: a list of macs expts: here are a few lines of one expt
+## chr	start	end	length	summit	tags	#NAME?	fold_enrichment	FDR(%)
+## chr1	4322210	4323069	860	494	55	158.95	6.03	0.05
+## chr1	4797749	4798368	620	211	29	119.82	3.47	0.09
+## chr1	4848182	4849113	932	494	46	105.42	2.9	0.09
+#      - expts: a list of the expts names from macs.list that you want to process
+#      - chr: chrmosomes names
+# output:
+#      - x: a list with as many elements as chr specified by input.
+#      -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
+split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
+  x <- list()
+  n <- length(expts)
+  for(i in 1:n){
+    e <- expts[i] #experiment name
+    cat("wroking on expt", e,"\n")
+    x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
+    names(x[[e]]) <- chr
+  }
+  return(x)
+}
+# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
+# input:
+#     - r is chromosome
+#     - m is macs matrix for expt e from above function
+split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
+  ix.chr.i <- which(m[,"chr"]==r)
+  # cat("working on",r,"\n")
+  o <- list()
+  o[[r]] <- m[ix.chr.i,]
+  o[[r]]$peak.id <- ix.chr.i
+  return(o[[r]])
+}
+
+## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+# also match these summit locations with corresponding:
+# pvals, tfs, peak start and peak end
+# input:
+#     - chr: a list of chromosome names
+#     - macs.list.per.chrom: a list of macs peaks for each chromosome
+# output:
+#     - o: a list each chormosome ruler as an element
+make.ruler.no.pml <- function(chr,macs.list.per.chrom){
+  x <- macs.list.per.chrom
+  o <- list()
+  for(j in 1:length(chr)){
+    r <- chr[j] # chrmosome we go over
+    s <- numeric()
+    pval <- numeric()
+    tf.to.s <- character()
+    start <- numeric()
+    end <- numeric()
+    trtmnts <- names(x) # the treatments name (expt names)
+    ## debug parameters ###
+    ## which experiment peaks come from
+    expt <- character()
+    ## what was the peak id in that experiment
+    peak.ids <- numeric()
+    ## this will allow us to always back track from a cluster to the actual peaks in it
+    ## debug params end ###
+    for(i in 1:length(trtmnts)){
+      o[[r]] <- list()
+      e <- trtmnts[i] #experiment name
+      tf <- strsplit(e,"_")[[1]][1]
+      s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
+      pval <- c(pval,x[[e]][[r]][,"score"]) 
+      start <- c(start,x[[e]][[r]][,"start"])
+      end <- c(end,x[[e]][[r]][,"end"])
+      expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
+      peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])    
+    }
+    ix <- sort(s,index.return=TRUE)$ix
+    o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix])				
+  }
+  return(o)
+}
+
+## add cluster memberships based on ruler
+## require no more than d.cut distance between tf summits
+## cur.l is the current loci number (or cluster number)
+assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){
+ cur.l <- 0
+ for(j in 1:length(ruler)){
+   s <- ruler[[j]][["summit"]]
+   l <- numeric(length=length(s))
+   if(length(l)>0){
+	   cur.l <- cur.l+1
+   }
+   if(length(l)==1){
+   	l[1] <- cur.l
+   } else if(length(l)>1) {
+	l[1] <- cur.l
+	   for(i in 2:length(l)){
+	     d <- s[i]-s[i-1] # assumes s is sorted increasingly
+	     if(d>d.cut){
+	       cur.l <- cur.l+1
+	     }
+	     l[i] <- cur.l    
+	   }
+	}
+   ruler[[j]][["mtl.id"]] <- l
+ }
+ return(ruler)
+}
+
+## add cluster memberships based on ruler
+## require that peaks span will have a non-empty intersection 
+## cur.mtl is the current loci number (or cluster number)
+assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){
+cur.mtl <- 0
+for(j in 1:length(ruler)){
+	s <- ruler[[j]][["start"]]
+	e <- ruler[[j]][["end"]]
+	l <- numeric(length=length(s))
+	if(length(l)>0){
+	   cur.mtl <- cur.mtl+1
+	}
+	if(length(l)==1){
+	  	l[1] <- cur.mtl
+	} else if(length(l)>1) {
+		l[1] <- cur.mtl
+		cur.e <- e[1] # the right-most bp belonging to the current mtl
+	   	for(i in 2:length(l)){
+	     	if(cur.e<s[i]){
+	       		cur.mtl <- cur.mtl+1
+	     	}
+	     	l[i] <- cur.mtl 
+			cur.e <- max(cur.e,e[i])
+	   	}
+	}
+	ruler[[j]][["mtl.id"]] <- l
+}
+return(ruler)
+}
+
+## here we create a list of TF enriched loci (clusters)
+## input:
+##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+## output:
+##     -ll: a list of clusters ll where each cluster (elem in ll holds):
+##        - mtl.id: the current loci (elem in ll)
+##        - s: summits vector as before
+##        - pval: pvals vector matching the summits
+##        - tfs: a vector of tfs matching s, the summits in loci l
+##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
+create.cluster.list.no.pml <- function(ruler){
+  tmp <- list()
+  ll <- list()
+  for(j in 1:length(ruler)){
+    r <- names(ruler)[j]
+    # cat("working on ",r,"\n")
+    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
+    l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique)
+    n <- length(l.vec) # iterate over n clusters
+    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
+  }
+  ## concatenate tmp into one list
+  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
+  ll=eval(parse(text=command))
+  return(ll)
+}
+get.cluster.params.no.pml <- function(i,x,l.vec,r){
+  # ix <- which(x[["l"]]==l.vec[i])
+  # l <- l.vec[i]
+  ix <- which(x[["mtl.id"]]==l.vec[i])
+  l <- l.vec[i]
+  start <- min(x[["start"]][ix])
+  end <- max(x[["end"]][ix])
+  # s <- x[["s"]][ix]
+  summit <- x[["summit"]][ix]
+  # pval <- x[["pval"]][ix]
+  score <- x[["score"]][ix]
+  # pval.mean <- mean(pval)
+  score.mean <- mean(score)
+  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
+  span.l <- end-start
+  peak.ids <- x[["peak.ids"]][ix]
+  expt <- x[["expt"]][ix]
+  expt.alphanum.sorted <- sort(x[["expt"]][ix])
+
+  chr <- rep(r,length(ix))
+  return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
+			  end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs,
+			  span.l=span.l,peak.ids=peak.ids))
+}
+
+## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
+## input:
+##     - ll: a cluster list
+##     - f.nm: a file name (include path) to where you want files to print
+##     - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
+## output
+##     - a tab delim file with clusers as rows and elems tab delim for each cluster
+print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
+  options(digits=5)
+  cat(file=f.nm,names(ll[[1]]),sep="\t")
+  cat(file=f.nm,"\n",append=TRUE)
+  for(i in 1:length(ll)){
+	line <- ll[[i]][[1]] # put MTL number
+	line <- paste(line,ll[[i]][[2]][1],sep="\t")
+	for(j in 3:length(ll[[i]])){
+		val <- ll[[i]][[j]]
+		if(length(val) == 1){
+			line <- paste(line,val,sep="\t")
+		} else {
+			line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
+		}
+	}
+  	cat(file=f.nm,line,"\n",append=TRUE)	
+  }
+}
+# read command line paramters that are not optional
+read.cmd.line.params.must <- function(args.nms, cmd.line.args){
+	if(length(grep("--version",cmd.line.args))){
+		cat("version",script.version,"\n")
+		q()
+	}
+	args <- sapply(strsplit(cmd.line.args," "),function(i) i)
+	vals <- character(length(args.nms))
+	# split cmd.line to key and value pairs
+	for(i in 1:length(args.nms)){
+		ix <- grep(args.nms[i],args)
+		if(length(ix)>1){
+			stop("arg ",args.nms[i]," used more than once.  Bailing out...\n",print.error())
+		} else if (length(ix)==0){
+			stop("could not find ",args.nms[i],". Bailing out...\n",print.error())
+		} else {
+			vals[i] <- args[ix+1]
+		}
+	}
+	return(vals)
+}
+# read command line paramters that are optional, if an optional param is not give functions return na for this param
+read.cmd.line.params.optional <- function(args.nms, cmd.line.args){
+	args <- sapply(strsplit(cmd.line.args," "),function(i) i)
+	vals <- character(length(args.nms))
+	# split cmd.line to key and value pairs
+	for(i in 1:length(args.nms)){
+		ix <- grep(args.nms[i],args)
+		if(length(ix)>1){
+			stop("arg ",args.nms[i]," used more than once.  Bailing out...\n",print.error())
+		} else if (length(ix)==0){ # if --param was not written in cmd line
+			vals[i] <- NA
+		} else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value
+		 	vals[i] <- args[ix+1]
+		} else { # otherwise
+			vals[i] <- NA
+		}
+	}
+	return(vals)
+}
+
+############# Code:
+# retrieve args
+
+if(debug==T){
+	cmd.args <- c(
+		"--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls",
+		#"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed",
+		"--input_type MACS", #BED
+		"--path_output ./results/",
+		"--expt_names macs1::macs2::macs3",
+		#"--expt_names bed1::bed2::bed3",
+		"--mtl_type interval", #interval summit
+		"--dist_summits 100"
+	)
+} else {
+	cmd.args <- commandArgs(trailingOnly = T);      
+}
+
+# if(length(grep("--version",cmd.args))){
+# 	cat("version",script.version,"\n")
+# 	q()
+# }
+args.nms.must <- c(	
+		"--input_files",   #1
+		"--path_output",  	 #2
+		"--expt_names" #3
+)
+
+# all numeric params must come at the end of args.nms.optional
+n.start.numeric <- 3
+args.nms.optional <- c(	
+		"--input_type",	#1
+		"--mtl_type", 	 #2
+		"--dist_summits" #3
+)
+
+# get must parameters
+vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args)
+if(verbose){
+	cat("\nfinshed reading vals: \n")
+	cat("\nvals.must", unlist(vals.must), "\n")
+}
+input.files <- strsplit(vals.must[1],"::")[[1]]
+if(length(input.files)==1){
+	cat("only provided one MACS file to cluster.")
+	print.error()
+}
+path.output <- vals.must[2]
+expt.names <- strsplit(vals.must[3],"::")[[1]]
+# get optional parameters
+vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args)
+if(verbose){
+	cat("\nvals.optional:", unlist(vals.optional),"\n")
+}
+if(is.na(vals.optional[1])){
+	input.type <- "MACS"
+} else {
+	input.type <- vals.optional[1]
+}
+if(is.na(vals.optional[2])){
+	mtl.type <- "interval"
+} else {
+	mtl.type <- vals.optional[2]
+}
+if(is.na(vals.optional[2])){
+	dist.summits <- 100
+} else {
+	dist.summits <- as.numeric(vals.optional[3])
+}
+		
+# read MACS files
+unique.chr <- character()
+infile.list <- list()
+col.nms.keepers <- c("chr","start","end","summit","score")
+for(i in 1:length(input.files)){
+	e <- expt.names[i]
+	if(toupper(input.type)=="MACS"){
+		tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F)		
+		columns <- c("chr","start","end","summit","pvalue")
+		ix.cols <- numeric()
+		for(j in 1:length(columns)){
+			ix.j <- grep(columns[j],names(tmp))
+			if(length(ix.j)==0){
+				stop("can't find (grep) column ",columns[j]," in MACS input file ", e)
+			}
+			ix.cols <- c(ix.cols,ix.j)
+		}
+		infile.list[[e]] <- tmp[,ix.cols]
+		colnames(infile.list[[e]]) <- col.nms.keepers
+	}
+	if(toupper(input.type)=="BED"){
+		tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE)
+		tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column
+		colnames(tmp)  <- col.nms.keepers
+		infile.list[[e]] <- tmp[,1:5]
+	}	
+	unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"]))
+}
+
+unique.chr <- sort(unique.chr)
+
+# take all macs files and put peaks together on each chromosome 
+# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
+cat("splitting input files per chromosome\n")
+x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr)
+cat("adding peaks from all input files into chromosome rulers\n")
+ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x)
+cat("add MTL membership to the ruler\n")
+
+if(mtl.type=="interval"){
+	ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler)
+} else {
+	ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits)
+}
+
+cat("creating MTL list\n")
+ll <- create.cluster.list.no.pml(ruler=ruler)
+cat("writing MTL table\n")
+f.nm <- paste(sep="",path.output,"mtls",".xls")
+print.ll.verbose.all(ll=ll,f.nm=f.nm)
+
+
+
+
+
+
+
+
+
diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/gtfToMapFriendlyAnnotation.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/gtfToMapFriendlyAnnotation.R	Mon Jul 23 13:00:15 2012 -0400
@@ -0,0 +1,102 @@
+rm (list = ls())
+args <- commandArgs()
+print(args)
+gtf <- read.table(args[6],as.is=T)
+colnames(gtf) <- c("chrom", 
+                   "source", 
+                   "type", 
+                   "five_prime", 
+                   "three_prime", 
+                   "score", 
+                   "strand", 
+                   "frame", 
+                   "nothinggene", 
+                   "gene", 
+                   "nothingSemi1",
+                   "nothingtranscript",
+                   "transcript",
+                   "nothingSemi2")
+gtf.exon <- gtf[which(gtf[,"type"]=="exon"),]
+gtf.exon.slim <-gtf.exon[,c("chrom", "five_prime", "three_prime", "strand", "transcript")]
+gtf.exon.slim.sort <- gtf.exon.slim[sort.list(gtf.exon.slim[,"transcript"]),]
+
+
+
+transcripts <- as.vector(unique(gtf.exon.slim[,"transcript"]))
+#transcripts <- as.vector(unique(gtf.exon.slim[,"transcript"]))
+
+
+
+output <- matrix(0,nr=length(transcripts),nc=ncol(gtf.exon.slim.sort))
+
+all.chrom <- gtf.exon.slim.sort[,"chrom"]
+all.five_prime <- gtf.exon.slim.sort[,"five_prime"]
+all.three_prime <- gtf.exon.slim.sort[,"three_prime"]
+all.strand <- gtf.exon.slim.sort[,"strand"]
+all.transcript <- gtf.exon.slim.sort[,"transcript"]
+
+colnames(output) <- colnames(gtf.exon.slim.sort)
+
+j <- 0
+i <- 1
+
+previous.strand <- gtf.exon.slim.sort[i,"strand"]
+previous.chrom <- gtf.exon.slim.sort[i,"chrom"]
+previous.transcript <- gtf.exon.slim.sort[i,"transcript"]
+previous.five_prime.min <- gtf.exon.slim.sort[i,"five_prime"]
+previous.three_prime.max <- gtf.exon.slim.sort[i,"three_prime"]
+
+# Progress bar:
+total <- dim(gtf.exon.slim.sort)[1]
+# create progress bar
+pb <- txtProgressBar(min = 0, max = total, style = 3)
+
+
+
+
+for ( i in 1:length(all.transcript)) { 
+  current.transcript <- all.transcript[i]
+  setTxtProgressBar(pb, i)
+  if (previous.transcript != current.transcript) {
+    j <- j + 1
+    # Write out the current transcript info
+    output[j,"chrom"] <- previous.chrom
+    output[j,"five_prime"] <- previous.five_prime.min
+    output[j,"three_prime"] <- previous.three_prime.max
+    output[j,"strand"] <- previous.strand
+    output[j,"transcript"] <- previous.transcript
+    
+    # Save the new transcript info (convert the current to previous)
+    
+    previous.strand <- all.strand[i]
+    previous.chrom <- all.chrom[i]
+    previous.transcript <- all.transcript[i] # current.transcript
+    previous.five_prime.min <- all.five_prime[i]
+    previous.three_prime.max <- all.three_prime[i]
+    }
+  else {
+    previous.five_prime.min <- min(previous.five_prime.min, all.five_prime[i])
+    previous.three_prime.max <- max(previous.three_prime.max, all.three_prime[i])
+    previous.transcript <- all.transcript[i] # current.transcript
+  }
+}
+# Write the last item
+j <- j + 1
+# Write out the current transcript info
+output[j,"chrom"] <- previous.chrom
+output[j,"five_prime"] <- previous.five_prime.min
+output[j,"three_prime"] <- previous.three_prime.max
+output[j,"strand"] <- previous.strand
+output[j,"transcript"] <- previous.transcript
+
+close(pb)
+
+colnames(output) <- c("chrom", "txStart", "txEnd", "strand", "name2")
+write.table(output, file="gene_annotation.txt", sep="\t", row.names=F)
+
+  #ix <- which(gtf.exon.slim[,"transcript"] == transcripts[i])
+  #output[i,"transcript"] <- transcripts[i]
+  #output[i,"five_prime"] <- min(gtf.exon.slim[ix,"five_prime"])
+  #output[i,"three_prime"] <- max(gtf.exon.slim[ix,"three_prime"])
+  #output[i,"strand"] <- gtf.exon.slim[ix,"strand"][1]
+  #output[i,"chrom"] <- gtf.exon.slim[ix,"chrom"][1]
diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/heatmap.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/heatmap.R	Mon Jul 23 13:00:15 2012 -0400
@@ -0,0 +1,1524 @@
+GeneratePeakMatrix <- function(experiments, scores) {
+  # Generates a score matrix for all mtls.
+  #
+  # Args:
+  #   experiments: A list of underscore deliminated experiments for each mtl.
+  #                There should never be a completely empty item in this list.
+  #                for eg. experiments[1] = expA_expD_expB.
+  #   scores: A list of underscore deliminated scores for each mtl, the length
+  #           of these scores should be identicle to the length of the
+  #           experiments. eg. scores[1] = 55_33_245.
+  #
+  # Returns:
+  #   The peak score matrix for all mtls.
+  experiments <- sapply(experiments, function(x) strsplit(x, split = "_"))
+  scores <- sapply(scores, function(x) strsplit(x, split = "_"))
+  unique.experiments <- unique(unlist(experiments))
+
+  peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments))
+  colnames(peaks) <- unique.experiments
+
+  for(i in 1:length(experiments)){
+    for(j in 1:length(experiments[[i]])){
+      peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j])
+    }
+  }
+  return(peaks)
+##################################################################
+}
+GetChipData <- function(file.name,
+                        proximity = "distal",
+                        include.targetless = TRUE,
+                        column.order = NA) {
+  # Reads in, filters, and organizes mtls data
+  #
+  # Args:
+  #   file.name: The path to the mtls file (including the file name).
+  #   proximity: Either "distal" or "proximal", defines the gene target distance
+  #              from the mtl. Default is "distal".
+  #   include.targetless: If TRUE, includes mtls with no targets (applied after
+  #                       the proximity filter); if not, mtls with no target
+  #                       will be exclided. Default is TRUE.
+  #   column.order: An optional vector of column names in the order in which
+  #                 they will be presented. If this is left to default (NA) the
+  #                 presented order of the chip columns will be the order in
+  #                 which they are seen.
+  #
+  # Returns:
+  #   An organized list of mtls data with the following elements:
+  #     $peaks - a matrix of peak p-values
+  #     $targets - a list of underscore deliminated gene targets for each mtl
+  if(param$debug) {
+    print("In GetChipData")
+  }
+  # Set Constants for the mtls file type:
+  MTLNUM <- "mtl.id"
+  CHROMOSOME <- "chr"
+  EXPERIMENTS <- "expt"
+  EXPERIMENTS.SORTED <- "expt.alphanum.sorted"
+  START <- "start"
+  END <- "end"
+  SUMMIT <- "summit"
+  SCORES <- "score"
+  SCORE.MEAN <- "score.mean"
+  SPANS <- "span.tfs"
+  SPAN.TOTAL <- "span.l"
+  PEAK.IDS <- "peak.ids"
+  TARGETS <- "trgt"
+#  PROXIMAL.TARGETS <- "trgt.prox"
+#  DISTAL.TARGETS <- "trgt.dist"
+#  PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox"
+#  DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist"
+
+
+  #Get chip data in from files:
+  #TARGETS <- switch(proximity,
+  #                   distal = DISTAL.TARGETS,
+  #                   proximal = PROXIMAL.TARGETS,
+  #                   stop(" Bad proximity argument supplied."))  # Default
+cat("param$rna.files = ", param$rna.files, "\n")
+if(!is.na(param$rna.files) && param$rna.files != "none") {
+  keep.columns <- c(EXPERIMENTS, SCORES, TARGETS)
+} else {
+  keep.columns <- c(EXPERIMENTS, SCORES)
+}
+  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
+if(!is.na(param$rna.files) && param$rna.files != "none") {
+  if (!include.targetless) {
+    ix.has.trgt <- which(as.character(file[, TARGETS])!="")
+    file <- file[ix.has.trgt, ]
+  }
+}
+  chip = list()
+
+  chip$peaks <- GeneratePeakMatrix(file[, EXPERIMENTS], file[, SCORES])
+  if(!is.na(column.order)) {
+  #If you specify an order, or a subset of experiments to include, this is where
+  #that gets done.
+  order <- unlist(strsplit(column.order, split="::"))
+  chip$peaks <- chip$peaks[,order]
+  }
+  if(!is.na(param$rna.files) && param$rna.files != "none") {
+  chip$targets <- file[ , TARGETS]
+}
+  
+  return (chip)
+}
+GetRNADataFromOneFile <- function(file.name, fpkm = "avg") {
+  # Reads in rna expression data from cufflinks
+  #
+  # Args:
+  #   file.name: The path to the cufflinks file (including the file name).
+  #   fpkm: What fpkm is chosen, options are hi, low and avg
+  # Returns:
+  #   An organized vector of rnaseq data with the following elements:
+  #     names(data) - the genes from the file. eg names(data[1]) = JUND
+  #     data - the fpkm score for that gene data[1] = 31 
+  if(param$debug) {
+    cat("In GetRNADataFromOneFile for ",file.name)
+  }
+
+ # Set Constants for the cufflinks file type:
+  GENE <- "gene_id"
+  BUNDLE <- "bundle_id"
+  CHROMOSOME <- "chr"
+  LEFT <- "left"
+  RIGHT <- "right"
+  FPKM_AVG <- "FPKM"
+  FPKM_LOW <- "FPKM_conf_lo"
+  FPKM_HIGH <- "FPKM_conf_hi"
+  STATUS <- "status"
+
+  
+  #Get chip data in from files:
+   FPKM <- switch(fpkm,
+                  hi = FPKM_HIGH,
+                  avg = FPKM_AVG,
+                  low = FPKM_LOW,
+                  stop(" Bad fpkm quality argument supplied."))  # Default
+  
+  keep.columns <- c(GENE, FPKM) 
+  
+  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
+  
+  rna = vector(mode = "numeric", length = nrow(file))
+  
+  rna <- file[, FPKM]
+  names(rna) <- file[ , GENE] 
+  return (rna)
+}
+
+GetRNAData <- function(file.names, file.lables = file.names, fpkm = "avg") {
+  # Reads in rna expression data from cufflinks
+  #
+  # Args:
+  #   file.names: The list of paths to the cufflinks files (including the file name).
+  #   fpkm: What fpkm is chosen, options are hi, low and avg
+  # Returns:
+  #   A matrix of rnaseq data with the following description:
+  #     each col corresponds to an rnaseq run
+  #     eacg row corresponds to a gene 
+  if(param$debug) {
+    print("In GetRNAData")
+  }
+  files <- list()
+  for(i in 1:length(file.names)) {
+  files[[i]] <-GetRNADataFromOneFile(file.names[i])
+  }
+  genes <- unique(names(unlist(files)))
+  scores <- matrix(0,nrow=length(genes),ncol=length(file.names))
+  rownames(scores) <- genes
+  colnames(scores) <- file.names
+
+  for(j in 1:length(file.names)) {
+    scores[names(files[[j]]),j] <- files[[j]]
+  }
+print("# of cols for scores is ")
+print(ncol(scores))
+print ("file lables are ")
+print (file.lables)
+  colnames(scores) <- file.lables
+  return(scores)
+  #scores[genes,file.names] <- files[[]]
+}
+
+NormalizeRNA <- function(scores) {
+
+#add psudocount
+scores <- scores+1
+
+numerator <- scores[,1:(ncol(scores)/2)]
+denominator <- scores[,(ncol(scores)/2 + 1):ncol(scores)]
+
+#new.scores <- scores[,-which(colnames(scores) == norm.exp)]
+#norm.score <- scores[,which(colnames(scores) == norm.exp)]
+#return(log2(new.scores/norm.score))
+
+return(log2(numerator/denominator))
+  
+}
+
+PrepareRNAforHeatmap <- function(new.scores.foldchange){
+new.scores.split <- matrix(0, nr=nrow(new.scores.foldchange), nc=2*ncol(new.scores.foldchange))
+is.even <- function(x){ x %% 2 == 0 }
+corresponding.col <- function(x){ceiling(x/2)}
+new.col.names <- vector(length = ncol(new.scores.split))
+rownames(new.scores.split) <- rownames(new.scores.foldchange)
+  for(i in 1:ncol(new.scores.split)) {
+	if(is.even(i)) {sign <- "down"} else {sign <- "up"}
+  new.col.names[i] <- paste(colnames(new.scores.foldchange)[corresponding.col(i)], 
+					sign, sep =".")
+	new.scores.split[,i] <- new.scores.foldchange[,corresponding.col(i)]
+	if(is.even(i)) {
+		new.scores.split[which(new.scores.split[,i]>0),i] <- 0
+		new.scores.split[,i] <- -1*new.scores.split[,i]
+	} else {
+		new.scores.split[which(new.scores.split[,i]<0),i] <- 0
+	}
+	colnames(new.scores.split) <- new.col.names
+}
+return(new.scores.split)}
+
+#ConvertExpressionToPval = function(expression, threshold = 1)
+#{
+#    print("In ConvertExpressionToPval")
+#    #Generate Normal Stats
+#    # expression.mean = apply(expression, 2, mean)
+#    # expression.sd = apply(expression, 2, sd)
+#    
+#    ix.above.threshold = which(expression > threshold)
+#    expression.mean = apply(expression, 2, function(i) mean(i[which(i > threshold)]))
+#    expression.sd = apply(expression, 2, function(i)     sd(i[which(i > threshold)]))
+#    
+
+#    # CDF from zero to point
+#    # expression.pval = matrix(,nr=nrow(expression),nc=ncol(expression),dimnames=dimnames(expression))
+#    # ix.low = which(expression < expression.mean)
+#    # ix.upper = which(expression >= expression.mean)
+#    # expression.pval[ix.low] = (-10 * pnorm(expression[ix.low], mean = expression.mean, sd = expression.sd, log=T) *log10(exp(1)))
+#    
+#    expression.pval = (-10 * log10(exp(1)) * pnorm(expression, 
+#                                                    mean = expression.mean, 
+#                                                    sd = expression.sd, 
+#                                                    log=T, 
+#                                                    lower.tail=F
+#    ) 
+#    )
+#    #squash those under threshold to zero
+#    expression.pval[-ix.above.threshold] = 0
+
+#    # correct for upper tail (ie from point to inf)
+#    #Still need to do
+#    return (expression.pval)
+#}
+MapExpressiontoChip2 = function(chip.targets, expression)
+{
+mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression))
+rownames(mymatrix) <- chip.targets
+colnames(mymatrix) <- colnames(expression)
+
+
+j <- 1 #expression counter
+i <- 1 #mymatrix counter
+expression.genes <- rownames(expression)
+previous.expression.gene <- expression.genes[j]
+
+# Progress bar:
+total <- length(chip.targets)
+# create progress bar
+pb <- txtProgressBar(min = 0, max = total, style = 3)
+
+for ( i in 1:length(chip.targets)) { 
+  current.target <- chip.targets[i]
+  setTxtProgressBar(pb, i)
+  while (current.target > previous.expression.gene && j<=length(expression.genes)) {
+    j <- j + 1
+    previous.expression.gene <- expression.genes[j]
+  }
+  if (current.target == previous.expression.gene){
+    mymatrix[i,] <- expression[j,]
+  }
+}
+
+close(pb)
+
+           print("Leaving MapRNAtoChip")
+           return(mymatrix)
+}
+
+
+MapExpressiontoChip = function(chip.targets, expression)
+{
+    print("In MapRNAtoChip")
+    # targets = unlist(strsplit(chip$targets[i], "_"))
+    #   exp = matrix(rna$all.pval[targets, ])   
+    #   return (apply(exp, 2, mean))
+    
+    #rna col 1 = th0 overexpressed  
+    #rna col 2 = th17 overexpressed
+    #rna col 3 = true zscores
+
+
+
+
+
+
+    ####################################################################
+    chip.targets.notnull.unique <- unique(chip.targets[which(chip.targets!="")])
+    gene.intersect <- intersect(rownames(expression), chip.targets.notnull.unique)
+    chip.targets.notnull.unique.with.data <- chip.targets.notnull.unique[which(chip.targets.notnull.unique %in% gene.intersect)]
+    expression.useful.data <- expression[which(rownames(expression) %in% gene.intersect),]
+    chip.targets.notnull.unique.with.data.sorted <- chip.targets.notnull.unique.with.data[order(chip.targets.notnull.unique.with.data)]
+    expression.useful.data.sorted <- expression.useful.data[order(rownames(expression.useful.data)),]
+    ####################################################################
+    
+    mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression))
+       rownames(mymatrix) <- chip.targets
+       colnames(mymatrix) <- colnames(expression)
+       head(chip.targets.notnull.unique.with.data.sorted)
+       head(rownames(expression.useful.data.sorted))
+       if(!identical(chip.targets.notnull.unique.with.data.sorted, rownames(expression.useful.data.sorted))){
+        stop("We have a serious problem, chip is not alligned to expression")
+      }
+       for(i in 1:length(chip.targets.notnull.unique.with.data.sorted))
+          {
+            mtls.ix <- which(chip.targets == chip.targets.notnull.unique.with.data.sorted[i])
+            for(j in 1:length(mtls.ix)){
+                mymatrix[mtls.ix[j],] <- expression.useful.data.sorted[i,]
+              }
+          }
+           print("Leaving MapRNAtoChip")
+           return(mymatrix)
+    
+   #return( sapply(chip$targets, function(x) apply(rna$all.pval[unlist(strsplit(chip$targets[3], "_")), ], 2, mean)) )
+}
+
+
+
+
+
+GenerateKMOrder <- function(data, km) {
+  # generates the order of clusters from high to low mean values
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list. Must
+  #       have the same length as the number of rows in data
+  #
+  # Returns:
+  #   cluster.order: The order in which the clusters should be displayed.
+
+    km.cluster = km$cluster
+    clusters = unique(km.cluster)
+    clusters.avg = numeric()
+    for(i in clusters) {
+        clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ]))
+    }
+    if(param$debug) {
+      print ("straight clusters")
+      print (clusters)
+      print ("straigth average")
+      print (clusters.avg)
+      print ("ordered clusters")
+      print (clusters[order(clusters.avg)])
+      print("ordered average")
+      print (clusters.avg[order(clusters.avg)])
+    }
+    return(clusters[rev(order(clusters.avg))])
+}
+
+OrderMTL <- function(data, km, cluster.order) {
+  # Orders a matrix of data according to a clustering algorithm
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list. Must
+  #       have the same length as the number of rows in data
+  #   cluster.order: The order in which the clusters should be displayed. 
+  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
+  #                  being on top, then cluster 3 and lastly cluster 1.
+  #
+  # Returns:
+  #   a list that contains 3 objects:
+  #     list$data: the ordered version of the data.
+  #     list$color.vector: a list of colors that should be assigned to each row.
+  #     list$start.row: the starting row of each cluster in data.
+  number.clusters <- length(cluster.order)
+  cluster.colors <- sample(rainbow(number.clusters))
+  
+  # Set up return objects
+  sorted.data <- matrix(,nr=nrow(data), nc=ncol(data))
+  colnames(sorted.data) <- colnames(data)
+  cluster.color.vector = vector(length=length(km$cluster))
+  cluster.start.row = numeric(number.clusters)
+  cluster.start.row[1]=1
+  
+  for (i in 1:number.clusters)
+  {
+      current.cluster = cluster.order[i]
+      ix = which(km$cluster == current.cluster)
+      current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1)
+      sorted.data[current.cluster.range, ] = data[ix, ]
+      cluster.color.vector[current.cluster.range] = cluster.colors[i]
+      cluster.start.row[i+1] = (cluster.start.row[i]+length(ix))  
+  }
+  ret.list = list()
+  ret.list$data = sorted.data
+  ret.list$color.vector = cluster.color.vector
+  ret.list$cluster.index = cluster.start.row
+  return(ret.list)
+}    
+
+
+CreateHeatMap <- function(data,
+                         km,
+                         cluster.order, 
+                         document.name,
+                         document.type = "png",
+                         number.colors = 30) {
+  # Generates a heatmap image based for a matrix based on a clustering 
+  # algorithm.
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered, the column
+  #         names of this matrix will become the column titels of the heatmap.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list.
+  #   cluster.order: The order in which the clusters should be displayed. 
+  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
+  #                  being on top, then cluster 3 and lastly cluster 1.
+  #   document.name: A name for the produced file. there is no need to
+  #                  supply the .png/.pdf in your argument.
+  #   document.type: The type of file you want to produce. current options are
+  #                  png and pdf. Default is pdf.
+  #
+  # Returns:
+  #   Nothing to the script that calls it, however it creates an image at the 
+  #   path specified.
+  if(param$debug) {
+    print("In CreateHeatMap")    
+  }
+  
+  data.ordered <- OrderMTL(data, km, cluster.order)
+    
+	
+	#Load Lib
+	library(gplots)
+	
+	#Set Color gradient
+	color.ramp = colorRampPalette(c("black",
+	                                "darkblue",
+	                                "blue",
+	                                "yellow",
+	                                "orange",
+	                                "red"))(number.colors) #7
+	
+	if(document.type == "png") {
+          png(paste(document.name,".png", sep = ""),
+              #width=15360,#1920,
+              #height=204080,#2560,
+              #res=500,
+              antialias="gray")
+        }
+        else if(document.type == "pdf") {
+        pdf(paste(document.name,".pdf", sep = ""))
+        }
+        else if(document.type == "tiff") {
+        tiff(paste(document.name,".tiff", sep = ""),
+        res=800,
+        pointsize=2,
+        width=1920,
+        height=1920)
+        }
+        else {
+        bitmap(paste(document.name,".bmp", sep = ""),
+        height = 5,
+        width = 5,
+        res = 500)
+        }
+#op <- par(mar = rep(0, 4))
+	
+	heatmap.2(
+		data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))],
+		rowsep = data.ordered$cluster.index[-1],
+		sepwidth = c(0.5, ncol(data)/100), 
+		dendrogram = "none",
+		Rowv = F,
+		Colv = F,
+		trace = "none",
+		labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString),
+		labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)), 
+		RowSideColors = data.ordered$color.vector, 
+		keysize=0.6,
+		key=F,
+		col = color.ramp,
+		cexCol = 0.8,
+		cexRow = 0.8)
+  
+	dev.off()  
+
+}
+
+CreateIndividualHeatMap <- function(data,
+                         km,
+                         cluster.order,
+                         color.ramp = colorRampPalette(c("black",
+                                  "darkblue",
+                                  "blue",
+                                  "yellow",
+                                  "orange",
+                                  "red"))(30)) {
+  # Generates a heatmap image based for a matrix based on a clustering 
+  # algorithm.
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered, the column
+  #         names of this matrix will become the column titels of the heatmap.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list.
+  #   cluster.order: The order in which the clusters should be displayed. 
+  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
+  #                  being on top, then cluster 3 and lastly cluster 1.
+  #   document.name: A name for the produced file. there is no need to
+  #                  supply the .png/.pdf in your argument.
+  #   document.type: The type of file you want to produce. current options are
+  #                  png and pdf. Default is pdf.
+  #
+  # Returns:
+  #   Nothing to the script that calls it, however it creates an image at the 
+  #   path specified.
+  if(param$debug) {
+    print("In CreateHeatMap")    
+  }
+  
+  data.ordered <- OrderMTL(data, km, cluster.order)
+    
+  
+  #Load Lib
+  library(gplots)
+  
+
+  
+
+  heatmap.3(
+    data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))],
+    rowsep = data.ordered$cluster.index[-1],
+    sepwidth = c(0.5, nrow(data.ordered$data)/100), 
+    dendrogram = "none",
+    Rowv = F,
+    Colv = F,
+    trace = "none",
+    labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString),
+    labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)), 
+    #sourcRowSideColors = data.ordered$color.vector, 
+    keysize=0.6,
+    key=F,
+    col = color.ramp,
+    cexCol = 0.8,
+    cexRow = 0.8)
+   
+
+}
+
+ReadCommadLineParameters <- function(argument.names, command.line.arguments, optional = F) {
+  # Reads the parameters from the command line arguments.
+  #
+  # Args:
+  #   argument.names: A list of expected argument names.
+  #   command.line.arguments: The list of recieved command line arguments
+  #   optional: Are the areguments optional, or are they required, default is required
+  #
+  # Returns:
+  #   The arguments for argument.names. As strings In that order.
+
+    if(length(grep("--version",command.line.arguments))) {
+        cat("version",script.version,"\n")
+        q()
+    }
+    # Split command line arguments
+    args <- sapply(strsplit(command.line.arguments, " "),function(i) i)
+    vals <- character(length(argument.names))
+    # split cmd.line to key and value pairs
+    for(i in 1:length(argument.names)) {
+        ix <- grep(argument.names[i], args)
+        if(length(ix)>1) {
+            stop("arg ",
+                 argument.names[i],
+                 " used more than once.  Bailing out...\n",
+                 PrintParamError())
+        }
+        else if (length(ix)==0 && !optional) {
+            stop("could not find ",
+                 argument.names[i],
+                 ". Bailing out...\n",
+                 PrintParamError())
+        }
+        else if (length(ix)==0 && optional) {
+        vals[i] <- NA
+        }
+        else {
+        vals[i] <- args[ix+1]
+        }
+    }
+    return(vals)
+}
+
+PrintParamError <- function(){
+  # Prints the usage of the function, shows users what arguments they can use
+  #
+  # Args:
+  #   param: A list that contains all the paramaters.
+  #
+  # Returns:
+  #   A modified version of the param list, with the default values loaded.
+    cat("
+DESCRIPTIION:
+    heatmap.R takes a ...
+
+INPUT:
+    1.--mtls_file: path to mtls file.\n
+    2.--cluster_file: the destination path for the output cluster file.\n
+    3.--chip_experiment_order: The order of desired chip experiments (optional).\n
+    4.--heatmap_file: path for output heatmap image (no extension).\n
+    5.--heatmap_type: choice of image format, currently support png, pdf, tiff and bmp (optional)\n
+    6.--expression_file: list of expression files to be included in analysis (optional).\n
+    7.--expression_name: lables for the expression files (optional).\n
+    8.--normalization_file: a list of files to be used for normalization,
+    they can be the same file, however the number of expression nominated
+    normalization files must match the number of expression files (optional)\n
+    9.--n_clusters: number of clusters\n
+    10.--filter_percentage: percentage of mtls that will be analysed. Eg: if
+    we make filter_percentage 30, we will take the union of the top mtls in
+    mean, non-zero mean and variance (optional).\n
+
+EXAMPLE RUN:
+    Rscript heatmap.R
+        --mtls_file path/to/mtls.xls
+        --cluster_file path/to/output/cluster
+        --chip_experiment_order tf1::tf2::tf5::tf3
+        --heatmap_file path/to/output/heatmap
+        --heatmap_type png
+        --expression_file path/to/exp1::path/to/exp2
+        --expression_name myexp1::myexp2
+        --normalization_file path/to/exp3::path/to/exp3
+        --n_clusters 13
+        --filter_percentage 100
+        --include_targetless yes
+        --number_bins 30
+
+    ")
+}
+
+#LoadDefaultParams <- function(param) {
+#  # Loads default paramaters for the heatmap application
+#  #
+#  # Args:
+#  #   param: A list that contains all the previous paramaters.
+#  #
+#  # Returns:
+#  #   A modified version of the param list, with the default values loaded.
+
+#script.version=0.1
+#param$debug = F
+
+## RNA data:
+#param$rna.files = ""
+#param$rna.normalization = "none"
+
+## Filter:
+#param$filter.percentage <- 100
+
+## Clustering:
+#param$clustering.number.of.clusters <- 13
+
+## Heatmap:
+#param$heatmap.document.name <- "heatmap"
+#param$heatmap.document.type <- "png"
+##Cluster Groups:
+#param$cluster.groups.document.name <- "clusters"
+
+#return(param)
+
+#}
+
+
+
+LoadParams <- function(cmd.args, args.nms, n.start.numeric, optional = F) {
+  # Loads user defined paramaters for the heatmap application
+  #
+  # Args:
+  #   cmd.args: The command line arguments given.
+  #   arg.nms: A list of possible command arguments.
+  #   n.start.numeric: the first argument that should be numeric (alway put
+  #                        these last).
+  #   optional: This specifies if the params in cmd.args are optional or
+  #             required.
+  # Returns:
+  #   A list of values assigned to each argument.
+
+
+  vals <- ReadCommadLineParameters(argument.names = args.nms,
+                                 command.line.arguments = cmd.args,
+                                 optional = optional)
+
+   #check if numeric params are indeed numeric
+  if(!optional) {
+  for(i in n.start.numeric:length(vals)){
+    if(is.na(as.numeric(vals[i]))){
+      stop("arg ",args.nms[i]," is not numeric.  Bailing out...\n",print.error())
+    }
+  }
+}
+  return (vals)
+
+}
+
+#ValidateParams <- function(params) {
+#    return(T)
+#}
+
+
+##########
+
+LoadDebugParams <- function(param) {
+
+    cmd.args <- c(
+      "--mtls_file data/test/mtls.xls",
+      "--cluster_file data/test/cluster",
+      "--heatmap_file data/test/heatmap",
+      "--heatmap_type bmp",
+      "--n_clusters 13",
+      "--filter_percentage 100",
+      "--expression_file /home/kieran/code/scorchR-heatmap/data/expression/rna1.tabular::/home/kieran/code/scorchR-heatmap/data/expression/rna2.tabular",
+      "--expression_name batf.ko.0::batf.ko.17", 
+      "--normalization_file mean",
+      "--chip_experiment_order ac::bc::cc::dc::ec",
+      "--include_targetless yes",
+      "--number_bins 20"
+    )
+
+args.nms <- c(            "--mtls_file",        #1
+                "--cluster_file",     #2
+                "--chip_experiment_order",    #3
+                "--heatmap_file",     #4
+                "--heatmap_type",         #5
+                "--expression_file",    #6
+                "--expression_name",    #7
+                "--normalization_file",    #8
+                "--include_targetless",    #9
+                 "--n_clusters",     #10
+                "--filter_percentage", #11
+                "--number_bins")    #12
+
+
+# vals has the same order as args.nms
+vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 10, optional = F)
+
+
+# ChIP data:
+param$annotated.macs.file <- vals[1]
+param$chip.order <- vals[3]
+# RNA data:
+param$rna.files = vals[6]
+param$rna.names = vals[7]
+param$rna.normalization.file = vals[8]
+param$include.targetless = vals[9]
+
+# Filter:
+param$filter.percentage <- as.numeric(vals[11])
+
+# Clustering:
+param$number.bins <- as.numeric(vals[12])
+param$clustering.number.of.clusters <- as.numeric(vals[10])
+
+# Heatmap:
+param$heatmap.document.name <- vals[4]
+param$heatmap.document.type <- vals[5]
+#Cluster Groups:
+param$cluster.groups.document.name <- vals[2]
+
+return(param)
+
+}
+
+
+
+LoadOptionalParams <- function(param) {
+
+cmd.args <- commandArgs(trailingOnly = T)
+
+args.nms <- c(  "--chip_experiment_order",  #1
+                "--expression_file",        #2
+                "--expression_name",        #3
+                "--normalization_file",     #4
+                "--heatmap_type",           #5
+                "--include_targetless",     #6
+                "--filter_percentage",      #7
+                "--number_bins")            #8
+
+
+# vals has the same order as args.nms
+vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 7, optional = T)
+
+# ChIP data:
+param$chip.order <- if(!is.na(vals[1])){vals[1]}else{NA}
+
+# RNA data:
+param$rna.files <- if(!is.na(vals[2])){vals[2]}else{"none"}
+param$rna.names <- if(!is.na(vals[3])){vals[3]}else{"none"}
+param$rna.normalization.file <- if(!is.na(vals[4])){vals[4]}else{"no"}
+param$include.targetless <- if(!is.na(vals[6])){vals[6]}else{"yes"}
+# Filter:
+param$filter.percentage <- if(!is.na(vals[7])){as.numeric(vals[7])}else{100}
+param$number.bins <- if(!is.na(vals[8])){as.numeric(vals[8])}else{30}
+# Heatmap file (output)
+param$heatmap.document.type <- if(!is.na(vals[5])){vals[5]}else{"none"}
+
+return(param)
+}
+
+
+LoadReqiredParams <- function(param){
+
+cmd.args <- commandArgs(trailingOnly = T)
+
+args.nms <- c(  "--mtls_file",          #1
+                "--cluster_file",       #2
+                "--heatmap_file",       #3
+                 "--n_clusters")        #4
+
+
+# vals has the same order as args.nms
+vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 4, optional = F)
+
+# ChIP data:
+param$annotated.macs.file <- vals[1]
+
+# Clustering
+param$clustering.number.of.clusters <- as.numeric(vals[4])
+
+# Cluster file (output):
+param$cluster.groups.document.name <- vals[2]
+
+
+# Heatmap file (output):
+param$heatmap.document.name <- vals[3]
+
+return(param)
+
+}
+
+###########
+# here we output the fasta file of the targets of each kmeans cluster
+CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
+  f.nm = paste(f.nm, ".fasta", sep="")
+    clusters = km.order
+    for(i in 1:length(clusters)){
+        v=trgts[which(k.ix==clusters[i])]
+        v.split = unlist(sapply(v,strsplit, "_"))
+        if(i == 1){
+            cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n")    
+        } else {
+            cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
+        }
+        cat(sep="\n",file=f.nm,v.split,append=TRUE)
+    }
+}
+
+
+PrintClusters <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
+	f.nm = paste(f.nm, ".tsv", sep="")
+	cat(sep="\t",file=f.nm,"row_number/target","cluster","\n")
+	trgts[which(trgts=="")] <- "no_target"
+    for(i in 1:length(trgts)){
+		cat(sep="",file=f.nm,trgts[i],"\t","cluster_",k.ix[i],"\n",append=TRUE)
+			}
+#  	  if(i == 1){
+#  	    cat(sep="",file=f.nm,"cluster_",i,"\n")#clusters[i],"\n")    
+#  	  } else {
+#  	    cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
+#  	  }
+      #cat(sep="\n",file=f.nm,v.split,append=TRUE)
+  	}
+GetTopRowsFromMatrix = function(mtrx, percentage = 10)
+{
+  if (param$debug) {
+    print("In GetTopRowsFromMatrix")
+	}
+	#Store the stats for the mtrx
+	stats = list()
+	stats$mean=apply(mtrx,1,mean)
+	stats$sd=apply(mtrx,1,sd)
+	stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)]))
+    
+	
+	#Store the indexes for the mtrx
+	index = list()
+	index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix
+	index$sd   = sort(stats$sd, decreasing=T, index.return=T)$ix
+	index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix
+	
+	#Calculate how many data points we want to take
+	cutoff = floor(length(mtrx[ ,1])*(percentage/100))
+	
+	#Extract Indexes and return to caller
+	index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff]))
+	
+	return(index)
+}
+
+#GetUniq = function(targets.cluster)
+#{
+#    #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order
+#    
+#    return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))))
+#    #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))
+#    
+#}
+
+bindata = function(d,qunts=seq(.4,.9,.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
+    for(j in 1:ncol(d))
+    {
+        bins=quantile(d[,j],qunts)
+            for(i in 1:length(bins))
+            {
+                tmp[which(d[,j]>bins[i]),j] = i
+         }
+    }
+    return(tmp)
+}
+
+bindata.non.zero = function(d,qunts=seq(0,0.9,0.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
+    for(j in 1:ncol(d))
+    {
+        ix.non.zero=which(d[,j]!=0)
+        bins=quantile(d[ix.non.zero,j],qunts)
+            for(i in 1:length(bins))
+            {
+                tmp[which(d[,j]>bins[i]),j] = i
+         }
+    }
+    
+    return(tmp)
+}
+bindata.matrix = function(d,qunts=seq(0,0.9,0.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
+    #ix.non.zero=which(d!=0)
+    bins=quantile(d[],qunts)
+    for(i in 1:length(bins))
+    {
+        tmp[which(d>bins[i])] = i
+    }
+    return(tmp)
+}
+bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
+    ix.non.zero=which(d!=0)
+    bins=quantile(d[ix.non.zero],qunts)
+    for(i in 1:length(bins))
+    {
+        tmp[which(d>bins[i])] = i
+    }
+    return(tmp)
+}
+heatmap.3 <- function (x, Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE, 
+    distfun = dist, hclustfun = hclust, dendrogram = c("both", 
+        "row", "column", "none"), symm = FALSE, scale = c("none", 
+        "row", "column"), na.rm = TRUE, revC = identical(Colv, 
+        "Rowv"), add.expr, breaks, symbreaks = min(x < 0, na.rm = TRUE) || 
+        scale != "none", col = "heat.colors", colsep, rowsep, 
+    sepcolor = "white", sepwidth = c(0.05, 0.05), cellnote, notecex = 1, 
+    notecol = "cyan", na.color = par("bg"), trace = c("column", 
+        "row", "both", "none"), tracecol = "cyan", hline = median(breaks), 
+    vline = median(breaks), linecol = tracecol, margins = c(5, 
+        5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(nr), 
+    cexCol = 0.2 + 1/log10(nc), labRow = NULL, labCol = NULL, 
+    key = TRUE, keysize = 1.5, density.info = c("histogram", 
+        "density", "none"), denscol = tracecol, symkey = min(x < 
+        0, na.rm = TRUE) || symbreaks, densadj = 0.25, main = NULL, 
+    xlab = NULL, ylab = NULL, lmat = NULL, lhei = NULL, lwid = NULL, 
+    ...) 
+{
+    scale01 <- function(x, low = min(x), high = max(x)) {
+        x <- (x - low)/(high - low)
+        x
+    }
+    retval <- list()
+    scale <- if (symm && missing(scale)) 
+        "none"
+    else match.arg(scale)
+    dendrogram <- match.arg(dendrogram)
+    trace <- match.arg(trace)
+    density.info <- match.arg(density.info)
+    if (length(col) == 1 && is.character(col)) 
+        col <- get(col, mode = "function")
+    if (!missing(breaks) && (scale != "none")) 
+        warning("Using scale=\"row\" or scale=\"column\" when breaks are", 
+            "specified can produce unpredictable results.", "Please consider using only one or the other.")
+    if (is.null(Rowv) || is.na(Rowv)) 
+        Rowv <- FALSE
+    if (is.null(Colv) || is.na(Colv)) 
+        Colv <- FALSE
+    else if (Colv == "Rowv" && !isTRUE(Rowv)) 
+        Colv <- FALSE
+    if (length(di <- dim(x)) != 2 || !is.numeric(x)) 
+        stop("`x' must be a numeric matrix")
+    nr <- di[1]
+    nc <- di[2]
+    if (nr <= 1 || nc <= 1) 
+        stop("`x' must have at least 2 rows and 2 columns")
+    if (!is.numeric(margins) || length(margins) != 2) 
+        stop("`margins' must be a numeric vector of length 2")
+    if (missing(cellnote)) 
+        cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x))
+    if (!inherits(Rowv, "dendrogram")) {
+        if (((!isTRUE(Rowv)) || (is.null(Rowv))) && (dendrogram %in% 
+            c("both", "row"))) {
+            if (is.logical(Colv) && (Colv)) 
+                dendrogram <- "column"
+            else dedrogram <- "none"
+            warning("Discrepancy: Rowv is FALSE, while dendrogram is `", 
+                dendrogram, "'. Omitting row dendogram.")
+        }
+    }
+    if (!inherits(Colv, "dendrogram")) {
+        if (((!isTRUE(Colv)) || (is.null(Colv))) && (dendrogram %in% 
+            c("both", "column"))) {
+            if (is.logical(Rowv) && (Rowv)) 
+                dendrogram <- "row"
+            else dendrogram <- "none"
+            warning("Discrepancy: Colv is FALSE, while dendrogram is `", 
+                dendrogram, "'. Omitting column dendogram.")
+        }
+    }
+    if (inherits(Rowv, "dendrogram")) {
+        ddr <- Rowv
+        rowInd <- order.dendrogram(ddr)
+    }
+    else if (is.integer(Rowv)) {
+        hcr <- hclustfun(distfun(x))
+        ddr <- as.dendrogram(hcr)
+        ddr <- reorder(ddr, Rowv)
+        rowInd <- order.dendrogram(ddr)
+        if (nr != length(rowInd)) 
+            stop("row dendrogram ordering gave index of wrong length")
+    }
+    else if (isTRUE(Rowv)) {
+        Rowv <- rowMeans(x, na.rm = na.rm)
+        hcr <- hclustfun(distfun(x))
+        ddr <- as.dendrogram(hcr)
+        ddr <- reorder(ddr, Rowv)
+        rowInd <- order.dendrogram(ddr)
+        if (nr != length(rowInd)) 
+            stop("row dendrogram ordering gave index of wrong length")
+    }
+    else {
+        rowInd <- nr:1
+    }
+    if (inherits(Colv, "dendrogram")) {
+        ddc <- Colv
+        colInd <- order.dendrogram(ddc)
+    }
+    else if (identical(Colv, "Rowv")) {
+        if (nr != nc) 
+            stop("Colv = \"Rowv\" but nrow(x) != ncol(x)")
+        if (exists("ddr")) {
+            ddc <- ddr
+            colInd <- order.dendrogram(ddc)
+        }
+        else colInd <- rowInd
+    }
+    else if (is.integer(Colv)) {
+        hcc <- hclustfun(distfun(if (symm) 
+            x
+        else t(x)))
+        ddc <- as.dendrogram(hcc)
+        ddc <- reorder(ddc, Colv)
+        colInd <- order.dendrogram(ddc)
+        if (nc != length(colInd)) 
+            stop("column dendrogram ordering gave index of wrong length")
+    }
+    else if (isTRUE(Colv)) {
+        Colv <- colMeans(x, na.rm = na.rm)
+        hcc <- hclustfun(distfun(if (symm) 
+            x
+        else t(x)))
+        ddc <- as.dendrogram(hcc)
+        ddc <- reorder(ddc, Colv)
+        colInd <- order.dendrogram(ddc)
+        if (nc != length(colInd)) 
+            stop("column dendrogram ordering gave index of wrong length")
+    }
+    else {
+        colInd <- 1:nc
+    }
+    retval$rowInd <- rowInd
+    retval$colInd <- colInd
+    retval$call <- match.call()
+    x <- x[rowInd, colInd]
+    x.unscaled <- x
+    cellnote <- cellnote[rowInd, colInd]
+    if (is.null(labRow)) 
+        labRow <- if (is.null(rownames(x))) 
+            (1:nr)[rowInd]
+        else rownames(x)
+    else labRow <- labRow[rowInd]
+    if (is.null(labCol)) 
+        labCol <- if (is.null(colnames(x))) 
+            (1:nc)[colInd]
+        else colnames(x)
+    else labCol <- labCol[colInd]
+    if (scale == "row") {
+        retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm)
+        x <- sweep(x, 1, rm)
+        retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm)
+        x <- sweep(x, 1, sx, "/")
+    }
+    else if (scale == "column") {
+        retval$colMeans <- rm <- colMeans(x, na.rm = na.rm)
+        x <- sweep(x, 2, rm)
+        retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm)
+        x <- sweep(x, 2, sx, "/")
+    }
+    if (missing(breaks) || is.null(breaks) || length(breaks) < 
+        1) {
+        if (missing(col) || is.function(col)) 
+            breaks <- 16
+        else breaks <- length(col) + 1
+    }
+    if (length(breaks) == 1) {
+        if (!symbreaks) 
+            breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), 
+                length = breaks)
+        else {
+            extreme <- max(abs(x), na.rm = TRUE)
+            breaks <- seq(-extreme, extreme, length = breaks)
+        }
+    }
+    nbr <- length(breaks)
+    ncol <- length(breaks) - 1
+    if (class(col) == "function") 
+        col <- col(ncol)
+    min.breaks <- min(breaks)
+    max.breaks <- max(breaks)
+    x[x < min.breaks] <- min.breaks
+    x[x > max.breaks] <- max.breaks
+    # if (missing(lhei) || is.null(lhei)) 
+    #     lhei <- c(keysize, 4)
+    # if (missing(lwid) || is.null(lwid)) 
+    #     lwid <- c(keysize, 4)
+    # if (missing(lmat) || is.null(lmat)) {
+    #     lmat <- rbind(4:3, 2:1)
+    #     if (!missing(ColSideColors)) {
+    #         if (!is.character(ColSideColors) || length(ColSideColors) != 
+    #             nc) 
+    #             stop("'ColSideColors' must be a character vector of length ncol(x)")
+    #         lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 
+    #             1)
+    #         lhei <- c(lhei[1], 0.2, lhei[2])
+    #     }
+    #     if (!missing(RowSideColors)) {
+    #         if (!is.character(RowSideColors) || length(RowSideColors) != 
+    #             nr) 
+    #             stop("'RowSideColors' must be a character vector of length nrow(x)")
+    #         lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 
+    #             1), 1), lmat[, 2] + 1)
+    #         lwid <- c(lwid[1], 0.2, lwid[2])
+    #     }
+    #     lmat[is.na(lmat)] <- 0
+    # }
+    # if (length(lhei) != nrow(lmat)) 
+    #     stop("lhei must have length = nrow(lmat) = ", nrow(lmat))
+    # if (length(lwid) != ncol(lmat)) 
+    #     stop("lwid must have length = ncol(lmat) =", ncol(lmat))
+    # op <- par(no.readonly = TRUE)
+    # on.exit(par(op))
+    # layout(lmat, widths = lwid, heights = lhei, respect = FALSE)
+    if (!missing(RowSideColors)) {
+        par(mar = c(margins[1], 0, 0, 0.5))
+        image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE)
+    }
+    if (!missing(ColSideColors)) {
+        par(mar = c(0.5, 0, 0, margins[2]))
+        image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE)
+    }
+    par(mar = c(margins[1], 0, 0, margins[2]))
+    x <- t(x)
+    cellnote <- t(cellnote)
+    if (revC) {
+        iy <- nr:1
+        if (exists("ddr")) 
+            ddr <- rev(ddr)
+        x <- x[, iy]
+        cellnote <- cellnote[, iy]
+    }
+    else iy <- 1:nr
+    image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + 
+        c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col, 
+        breaks = breaks, ...)
+    retval$carpet <- x
+    if (exists("ddr")) 
+        retval$rowDendrogram <- ddr
+    if (exists("ddc")) 
+        retval$colDendrogram <- ddc
+    retval$breaks <- breaks
+    retval$col <- col
+    if (!invalid(na.color) & any(is.na(x))) {
+        mmat <- ifelse(is.na(x), 1, NA)
+        image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "", 
+            col = na.color, add = TRUE)
+    }
+    axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, 
+        cex.axis = cexCol)
+    if (!is.null(xlab)) 
+        mtext(xlab, side = 1, line = margins[1] - 1.25)
+    axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, 
+        cex.axis = cexRow)
+    if (!is.null(ylab)) 
+        mtext(ylab, side = 4, line = margins[2] - 1.25)
+    if (!missing(add.expr)) 
+        eval(substitute(add.expr))
+    if (!missing(colsep)) 
+        for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0, 
+            length(csep)), xright = csep + 0.5 + sepwidth[1], 
+            ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1, 
+            col = sepcolor, border = sepcolor)
+    if (!missing(rowsep)) 
+        for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) + 
+            1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) + 
+            1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1, 
+            col = sepcolor, border = sepcolor)
+    min.scale <- min(breaks)
+    max.scale <- max(breaks)
+    x.scaled <- scale01(t(x), min.scale, max.scale)
+    if (trace %in% c("both", "column")) {
+        retval$vline <- vline
+        vline.vals <- scale01(vline, min.scale, max.scale)
+        for (i in colInd) {
+            if (!is.null(vline)) {
+                abline(v = i - 0.5 + vline.vals, col = linecol, 
+                  lty = 2)
+            }
+            xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5
+            xv <- c(xv[1], xv)
+            yv <- 1:length(xv) - 0.5
+            lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s")
+        }
+    }
+    if (trace %in% c("both", "row")) {
+        retval$hline <- hline
+        hline.vals <- scale01(hline, min.scale, max.scale)
+        for (i in rowInd) {
+            if (!is.null(hline)) {
+                abline(h = i + hline, col = linecol, lty = 2)
+            }
+            yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5
+            yv <- rev(c(yv[1], yv))
+            xv <- length(yv):1 - 0.5
+            lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s")
+        }
+    }
+    if (!missing(cellnote)) 
+        text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote), 
+            col = notecol, cex = notecex)
+    par(mar = c(margins[1], 0, 0, 0))
+    # if (dendrogram %in% c("both", "row")) {
+    #     plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none")
+    # }
+    # else plot.new()
+    par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2]))
+    # if (dendrogram %in% c("both", "column")) {
+    #     plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none")
+    # }
+    # else plot.new()
+    if (!is.null(main)) 
+        title(main, cex.main = 1.5 * op[["cex.main"]])
+    # if (key) {
+    #     par(mar = c(5, 4, 2, 1), cex = 0.75)
+    #     tmpbreaks <- breaks
+    #     if (symkey) {
+    #         max.raw <- max(abs(c(x, breaks)), na.rm = TRUE)
+    #         min.raw <- -max.raw
+    #         tmpbreaks[1] <- -max(abs(x), na.rm = TRUE)
+    #         tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE)
+    #     }
+    #     else {
+    #         min.raw <- min(x, na.rm = TRUE)
+    #         max.raw <- max(x, na.rm = TRUE)
+    #     }
+    #     z <- seq(min.raw, max.raw, length = length(col))
+    #     image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, 
+    #         xaxt = "n", yaxt = "n")
+    #     par(usr = c(0, 1, 0, 1))
+    #     lv <- pretty(breaks)
+    #     xv <- scale01(as.numeric(lv), min.raw, max.raw)
+    #     axis(1, at = xv, labels = lv)
+    #     if (scale == "row") 
+    #         mtext(side = 1, "Row Z-Score", line = 2)
+    #     else if (scale == "column") 
+    #         mtext(side = 1, "Column Z-Score", line = 2)
+    #     else mtext(side = 1, "Value", line = 2)
+    #     if (density.info == "density") {
+    #         dens <- density(x, adjust = densadj, na.rm = TRUE)
+    #         omit <- dens$x < min(breaks) | dens$x > max(breaks)
+    #         dens$x <- dens$x[-omit]
+    #         dens$y <- dens$y[-omit]
+    #         dens$x <- scale01(dens$x, min.raw, max.raw)
+    #         lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol, 
+    #             lwd = 1)
+    #         axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y))
+    #         title("Color Key\nand Density Plot")
+    #         par(cex = 0.5)
+    #         mtext(side = 2, "Density", line = 2)
+    #     }
+    #     else if (density.info == "histogram") {
+    #         h <- hist(x, plot = FALSE, breaks = breaks)
+    #         hx <- scale01(breaks, min.raw, max.raw)
+    #         hy <- c(h$counts, h$counts[length(h$counts)])
+    #         lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s", 
+    #             col = denscol)
+    #         axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy))
+    #         title("Color Key\nand Histogram")
+    #         par(cex = 0.5)
+    #         mtext(side = 2, "Count", line = 2)
+    #     }
+    #     else title("Color Key")
+    # }
+    # else plot.new()
+    retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)], 
+        high = retval$breaks[-1], color = retval$col)
+    invisible(retval)
+}
+
+#rm(list=ls())
+param <- list()
+param$debug <- FALSE #T
+
+print("Finished loading functions")
+
+
+if(param$debug) {
+    param <- LoadDebugParams(param)
+} else {
+    param <- LoadReqiredParams(param)
+    param <- LoadOptionalParams(param)
+}
+print (param)
+Rprof() #Remove me
+# Load Chip data, alread in pval format
+# Column.order need to be a vecor of strings, that correspond to the order (and
+# inclustion) of chip experiments.
+chip <- GetChipData(param$annotated.macs.file, column.order = param$chip.order)
+
+#Get the top percentages on different criteria
+#ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage)
+#chip$peaks <- chip$peaks[ix.top$union, ]
+#chip$targets <- chip$targets[ix.top$union]
+#Bin data:
+chip$peaks <- bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=(param$number.bins+1)))
+
+file.names <- unlist(strsplit(param$rna.files, split="::"))
+print(file.names)
+file.lables <- unlist(strsplit(param$rna.names, split="::"))
+print(file.lables)
+
+library("RColorBrewer")
+color.2 =  rev(colorRampPalette(c("darkblue", "steelblue", "lightgrey"))(param$number.bins))
+if(!is.na(file.names) && file.names != "none") {
+    #rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
+    #do differential expression if the user specifies such:
+    if(param$rna.normalization != "no") {
+        if(param$rna.normalization != "mean"){
+            norm.file.names <- unlist(strsplit(param$rna.normalization.file, split="::"))
+            all.file.names <- c(file.names, norm.file.names)
+            all.file.lables <- c(file.lables, norm.file.names)
+            print (all.file.names)
+            print (all.file.lables)
+            rna.scores <- GetRNAData(all.file.names, file.lables = all.file.lables, fpkm = "avg")
+            rna.scores <- NormalizeRNA(scores = rna.scores)
+            rna.scores.sign <- rna.scores/abs(rna.scores)
+            rna.scores.sign[which(is.nan(rna.scores.sign))] <- 0
+            rna.scores <- bindata.non.zero.matrix(abs(rna.scores), qunts = seq(0, 0.9, length=(param$number.bins+1)))
+            rna.scores <- rna.scores.sign * rna.scores
+            color.2 = colorRampPalette(c("darkblue", "steelblue", "lightgrey", "pink", "darkred"))(param$number.bins)
+        } else {
+            print("we are normalizing by average")
+            print("file lables are:")
+            print(file.lables)
+            rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
+            rna.scores <- t(apply(rna.scores,1, function(x) { if(mean(x)!=0){ return(x/mean(x)) } else { return(x) }  }))
+            rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1)))
+        }
+    } else {
+        rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
+        rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1)))
+    }
+    
+    # chip$peaks <- chip$peaks[order(chip$targets), ]
+    # rna.scores <- rna.scores[order(rownames(rna.scores)), ]
+    # chip$targets <- chip$targets[order(chip$targets)]
+
+    rna.scores.mapped <- MapExpressiontoChip(chip.targets = chip$targets, expression=rna.scores)
+    all.data <- cbind(chip$peaks, rna.scores.mapped)
+    if(param$include.targetless!="yes"){
+        all.data <- all.data[-which(chip$targets==""),]
+        chip$peaks <- chip$peaks[-which(chip$targets==""),]
+        rna.scores.mapped <- rna.scores.mapped[-which(chip$targets==""),]
+        chip$targets <- chip$targets[-which(chip$targets=="")]
+    }
+} else {
+    all.data <- chip$peaks
+}
+
+print("Clustering")
+set.seed(1234)
+km <- kmeans(all.data, param$clustering.number.of.clusters, iter.max = 50)
+print("Ordering")
+km.order <- GenerateKMOrder(all.data, km)
+
+##AM edits##
+km.new.order <- numeric(length(km$cluster))
+for(i in 1:length(km.order)){
+    cur.clst <- km.order[i]
+    ix <- which(km$cluster==cur.clst)
+    km.new.order[ix] <- i
+}
+km.order <- 1:length(km.order)
+km$cluster <- km.new.order
+## Done AM ##
+print("Creating image")
+#bmp(param$heatmap.document.name, 640, 480)#1280, 960)
+bmp(paste(param$heatmap.document.name,".bmp", sep = ""), 5120, 3840)#2560, 1920)
+#pdf(paste(param$heatmap.document.name,".pdf", sep = ""))
+color.1 <- c("#000000", colorRampPalette(c("blue", "yellow","orange","red"))(param$number.bins))
+if(!is.na(file.names) && file.names != "none") {
+    PrintClusters(trgts=chip$targets,
+        k.ix=km$cluster,
+        f.nm = param$cluster.groups.document.name,
+        km.order = NA)
+
+    #if(param$rna.normalization != "no") {
+    #    data.split <- cbind(chip$peaks, PrepareRNAforHeatmap(rna.scores.mapped))
+    #} else {
+    #    data.split <- cbind(chip$peaks, rna.scores.mapped)
+    #}
+    expression.width.multiplier <- 2
+    if(ncol(rna.scores.mapped)==1) {
+        rna.scores.mapped <- cbind(rna.scores.mapped, rna.scores.mapped)
+        expression.width.multiplier <- 1
+    }
+
+    layout(matrix(c(1,2,2,5,
+        1,3,4,6,
+        1,3,4,7,
+        1,3,4,8,
+        1,3,4,9),
+    5,4,
+    byrow=T),
+    widths=c(1,2*ncol(chip$peaks),expression.width.multiplier*ncol(rna.scores.mapped),1),
+    heights=c(1,10,2,10,2))
+    #1
+    plot.new()
+    #2
+    plot.new()
+    #3
+    print("Creating peak image")
+    CreateIndividualHeatMap(chip$peaks,
+        km,
+        km.order, color.ramp=color.1)
+    #4
+    print("Creating rna image")
+    CreateIndividualHeatMap(rna.scores.mapped,
+        km,
+        km.order, color.ramp=color.2)
+    #5
+    plot.new()
+    #6
+    image(t(matrix(1:param$number.bins,nc=1)), col=color.1)
+    #7
+    plot.new()
+    #8
+    image(t(matrix(1:param$number.bins,nc=1)), col=color.2)
+    #9
+    plot.new()
+} else {
+    PrintClusters(trgts=1:nrow(chip$peaks),
+        k.ix=km$cluster,
+        f.nm = param$cluster.groups.document.name,
+        km.order = NA)
+
+
+    layout(matrix(c(1,2,2,
+        1,3,4,
+        1,3,5),3,3,byrow=T),
+        widths=c(1,3*ncol(chip$peaks),1),
+        heights=c(1,8,1))
+    #1
+    plot.new()
+    #2
+    plot.new()
+    #3
+    CreateIndividualHeatMap(chip$peaks,
+        km,
+        km.order, color.ramp=color.1)
+    #4
+    image(t(matrix(1:param$number.bins,nc=1)), col=color.1)
+    #5
+    plot.new()
+
+}
+
+dev.off()
+Rprof(NULL)
+profile <- summaryRprof()
+print(str(profile))
+print(profile)
+#save.image("test/testData.RData")
diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/map_ChIP_peaks_to_genes.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/map_ChIP_peaks_to_genes.R	Mon Jul 23 13:00:15 2012 -0400
@@ -0,0 +1,304 @@
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+## Nov 2011 th17
+## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, 
+## NYU - Center for Genomics and Systems Biology
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+
+rm(list=ls())
+debug=F
+script.version=0.1
+print.error <- function(){
+  cat("
+NAME
+       map_ChIP_peaks_to_genes.R
+
+DESCRIPTION
+       This script takes a MACS tab delimited file as input and
+       produces two files:
+
+       1) genes.xls
+          A gene centered table that stores the peaks that are
+          proximal (within x[kb] of TSS) and distal (within the
+          gene's region + y[kb] upstream or downstream of TSS and
+          TES respectively).
+
+       2) peaks.xls
+          A peak centered table that equals the MACS input table plus
+          colums for proximal and distal targets of each peak and
+          their distance from TSS (if exist). For gene.xls script also
+          gives a poisson model based -log10(pval) score for proximal
+          and genewide enrichment of peaks for each gene with at least
+          one prox/dist peak associated with it.
+
+ARGUMENTS
+       input_file=path
+         Path to MACS file.
+
+       refseq_table=path
+         Path to refseq table (gives TSS/TES locations for all genes).
+
+       path_output=path
+         Path to save genes.xls and peaks.xls output files.
+
+       tss.dist=N
+         The absolute distance from TSS where we connect a peak to
+         gene (for proximal peaks).
+
+       gene.wide.dist=N
+         The absolute distance from TSS or TES where we connect a peak
+         to gene (for peaks hitting anywhere in gene).
+
+       effective.genome.size=N
+         The effective mappable genome size (for mm9 the value is
+         1.87e9. For hg19 the value is 2.7e9 (from MACS manual)).
+
+       macs.skip.lines=N
+         Number of lines to skip in input_file.
+
+EXAMPLE 
+       Rscript map_ChIP_peaks_to_genes_v.1.R \\
+         input_file=SL971_SL970_peaks.xls \\
+         refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\
+         path_output=./ \\
+         tss.dist=5000 \\
+         gene.wide.dist=10000 \\
+         effective.genome.size=1.87e9 \\
+         macs.skip.lines=23
+
+CITATION
+       Please cite us if you used this script: The transcription
+       factor network regulating Th17 lineage specification and
+       function.  Maria Ciofani, Aviv Madar, Carolina Galan, Kieran
+       Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard
+       Bonneau and Dan R. Littman et. al. (in preperation).
+
+")
+}
+
+# retrieve args
+if(debug==T){
+	cmd.args <- c(
+		# "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls",
+		"input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls",		
+		"refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt",
+		"path_output=/Users/aviv/Desktop/test_script/",
+		"tss.dist=5000",
+		"tes.dist=10000",
+		"effective.genome.size=1.87e9",
+		"gene.wide.dist=10000",
+		"macs.skip.lines=23"
+	)
+} else {
+	cmd.args <- commandArgs();      
+}
+
+if(length(grep("--version",cmd.args))){
+	cat("version",script.version,"\n")
+	q()
+}
+
+arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
+args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
+
+arg.nms <- c("input_file","refseq_table","path_output","tss.dist",
+			 "gene.wide.dist","effective.genome.size","macs.skip.lines")
+arg.val <- character(length=length(arg.nms))
+for(i in 1:length(arg.nms)){
+	ix <- which(arg.names.cmd.line==arg.nms[i])
+	if(length(ix)==1){
+		arg.val[i] <- args.val.cmd.line[ix]
+	} else {
+		stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
+		
+	} 
+}
+if(debug==T){
+	print(paste(arg.nms,"=",arg.val))
+}
+# the files here adhere to tab delim format
+path.input.macs <- arg.val[1]
+path.input.refseq <- arg.val[2]
+path.output <- arg.val[3]
+tss.dist <- as.numeric(arg.val[4])
+gene.wide.dist <- as.numeric(arg.val[5])
+effective.genome.size <- as.numeric(arg.val[6])
+macs.skip.lines=as.numeric(arg.val[7])
+
+if(debug==T){
+# if(1){
+	load("/Users/aviv/Desktop/r.u.RData")
+	gn.nms <- rownames(r.u)
+} else {
+	##################
+	### step 1 handle refseq table file (read, make unique)
+	cat("reading refseq table",path.input.refseq,"\n")
+	refseq <- read.delim(file=path.input.refseq)
+	# refseq can have many transcripts for each gene
+	# here i make it have only one transcript for each gene (the longest one)
+	cat("for transcripts matching more than one gene keeping only the longest transcript\n")
+	refseq$name2 <- as.character(refseq$name2)
+	refseq$chrom <- as.character(refseq$chrom)
+	refseq$strand <- as.character(refseq$strand)
+	gn.nms <- unique(refseq$name2)
+	#x <- sort(table(refseq$name2),decreasing=T)
+	#gn.nms.non.unique <- names(x)[which(x>1)]
+	#gn.nms.unique <- names(x)[which(x==1)]
+	# create refseq unique r.u
+	n <- length(gn.nms)
+	r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms)
+	for(i in 1:n){
+	  ix <- which(refseq$name2==gn.nms[i])
+	  if(length(ix)==0) {
+	    error("could not find gene", ng.nms[i], "in refseq table.  Bailing out...\n")
+	  } else if (length(ix)>1){
+	    l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) )
+	    l.max <- max(l)
+	    ix <- ix[which(l==l.max)[1]]
+	  }
+	  r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")]
+	}
+	r.u[,"txStart"] <- as.numeric(r.u[,"txStart"])
+	r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"])
+
+	# switch TSS and TES if we have chr "-"
+	cat("correcting TSS, TES assiments in refseq table based on strand\n")
+	ix <- which(r.u$strand=="-")
+	tmp.tes <- r.u$txStart[ix]
+	tmp.tss <- r.u$txEnd[ix]
+	r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes)
+	# ############
+}
+
+
+#### step 2 read macs file
+cat("reading Macs file",path.input.macs,"\n")
+
+# read macs file
+M <- read.delim(file=path.input.macs,skip=macs.skip.lines)
+trgt.prox <- NA
+trgt.dist <- NA
+dtss.prox <- NA
+dtss.dist <- NA
+M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist)
+M.annot[is.na(M.annot)] <- ""
+# get the number of genes
+m <- length(gn.nms)
+if(debug==T){
+	# m <- 100
+}
+f.nm.peak <- paste(sep="",path.output,"peaks.xls")
+f.nm.gene <- paste(sep="",path.output,"genes.xls")
+# for each genes in the expt (j goes over genes)
+cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval",
+			"prox_max_pval","genewide_max_pval",
+			"prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n")
+peaks.summit <- (M[,"start"]+M[,"summit"])
+cat(sep="","mapping MACS peaks to genes\n")
+for (j in 1:m){
+  if(j%%20==0){cat(".")}
+  tss <- r.u[j,"txStart"]
+  tes <- r.u[j,"txEnd"]
+  gn.lngth <- abs(tss-tes)
+  strand <- r.u[j,"strand"]
+  chr <- r.u[j,"chrom"]
+  if(strand=="+"){
+    d.tss.all <- peaks.summit-tss
+   	d.tes.all <- peaks.summit-tes
+  } else {
+    d.tss.all <- tss-peaks.summit
+   	d.tes.all <- tes-peaks.summit
+  }
+  ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )),
+               which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE)
+  ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE)	
+  ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)]    
+  ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)]
+  n.peaks.prox <- length(ix.prox)
+  n.peaks.distal <- length(ix.distal)
+	# if there is at least one peak hitting gene j
+  if(n.peaks.distal > 0){
+    # for each peak (l goes over peaks)
+    peaks.line <- paste(sep="","(")
+    for (k in 1:n.peaks.distal){
+      if(k>1){
+        peaks.line <- paste(sep="",peaks.line,";(")
+      }
+      d.tss <- d.tss.all[ ix.distal[k] ]
+      d.tes <- d.tes.all[ ix.distal[k] ]
+      if(sign(d.tss)!=sign(d.tes)){
+        class="intra"
+      } else if(d.tss<=0){
+        class="upstream"
+      } else if(d.tss>0){
+        class="downstream"
+      }
+      peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ], 
+					d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")")
+	  # handle M.annot
+	  dtss <- d.tss.all[ ix.distal[k] ]
+	  prev.trgt <- M.annot[ix.distal[k],"trgt.dist"]
+	  if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
+		if(abs(dtss)<abs(as.numeric(M.annot[ix.distal[k],"dtss.dist"]))){
+			M.annot[ix.distal[k],"dtss.dist"] <- dtss
+			M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
+		}
+	  } else { # if peak does not have trgt add current trgt
+		M.annot[ix.distal[k],"dtss.dist"] <- dtss
+		M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
+	  }
+	# calc the pval for these many peaks in proximal region and in gene-wide region
+	}
+	if(n.peaks.prox > 0){
+		mean.pval.prox <- mean(M[ ix.prox ,7])
+		max.pval.prox <- max(M[ ix.prox ,7])
+		expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox))
+		# lambda = num peaks / genome size * searched region
+		lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2)		
+		pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE))
+		# handle M.annot
+		for(k in 1:n.peaks.prox){
+			dtss <- d.tss.all[ ix.prox[k] ]
+			prev.trgt <- M.annot[ix.prox[k],"trgt.prox"]
+			if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
+				if(abs(dtss)<abs(as.numeric(M.annot[ix.prox[k],"dtss.prox"]))){
+					M.annot[ix.prox[k],"dtss.prox"] <- dtss
+					M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
+				}
+			} else { # if peak does not have trgt add current trgt
+				M.annot[ix.prox[k],"dtss.prox"] <- dtss
+				M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
+			}
+		}
+	} else {
+		mean.pval.prox <- "NA"
+		max.pval.prox <- "NA"
+		pval.pois.prox <- "NA"
+	}
+	mean.pval.distal <- mean(M[ ix.distal ,7])
+	max.pval.distal <- max(M[ ix.distal ,7])
+	expected.num.peaks.genome.wide.distal <- length(which(M[,7]>=mean.pval.distal))
+	# lambda = num peaks / genome size * searched region
+	lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2)
+    pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE))
+    # pring peaks for gene j    
+    core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand,
+							mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal,
+							pval.pois.prox,pval.pois.distal)
+    cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n")
+  }
+}
+cat("Done!\n")
+
+cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t")
+write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t")
+
+
+
+
+
+
+
diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/mtls_analyze.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/mtls_analyze.xml	Mon Jul 23 13:00:15 2012 -0400
@@ -0,0 +1,333 @@
+<tool name="Chip-Cluster: Cluster ChIP-seq peaks and create a heatmap" id="chip-cluster">
+  <description>
+    Merge multiple ChIP-seq experiments, alligning their peaks to MTLs (Multi
+    Transcription Factor Loci(us)) and optionally incorperate expression
+  </description>
+  <command interpreter="command">/bin/bash $shscript </command>
+  <inputs>
+    <param name="chipInputFormat" type="select" display="radio" label="ChIP Input Format">
+      <option name="macs" value="MACS">MACS</option>
+      <option name="bed" value="BED">BED</option>
+    </param>
+    <param name="mtlType" type="select" display="radio" label="Cluster by: ">
+      <option name="summit" value="summit">Summit</option>
+      <option name="interval" value="interval">Interval</option>
+    </param>
+    <param name="summitDistance" type="text" label="Summit Distance (BP) - Summit only" value="100">
+    </param>
+    <param name="numberBins" type="text" label="Number of Bins" value="30">
+    </param>
+    <repeat name="chip_tracks" title="MACS/BED Files">
+      <param name="file" type="data" format="tabular" label="Dataset"/>
+      <param name="name" type="text" label="Dataset Name"/>
+    </repeat>
+    <param name="map_rna" type="boolean" truevalue="yes" falsevalue="no" label="Incorperate RNA?"/>
+    <param name="includeTargetless" checked="true" type="boolean" truevalue="yes" falsevalue="no" label="Include Targetless MTLs?"/>
+    <param name="reference_file" type="data" format="tabular" label="Reference Genome File"/>
+
+    <param name="normalize_rna" type="boolean" truevalue="yes" falsevalue="no" label="Normalize Expression?"/>
+    <param name="use_mean" type="boolean" truevalue="yes" falsevalue="no" label="Use mean expression across exp. to normalize?"/>
+    <param name="rnaInputFormat" type="select" display="radio" label="RNA Input Format">
+      <option name="cufflinks" value="cufflinks">Cufflinks</option>
+      <option name="bed" value="bed">BED</option>
+    </param>
+    <param name="numClusters" type="text" label="Number of Clusters (kmeans)" value="8">
+    </param>
+    <param name="trgtDistance" type="text" label="Transcript threshold distance" value="5000">
+    </param>
+    <repeat name="rna_tracks" title="Cufflinks/BED Files">
+      <param name="file" type="data" format="tabular" label="Dataset"/>
+      <param name="name" type="text" label="Dataset Name"/>
+      <param name="norm" type="data" label="Normalization Dataset"/>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="xls" name="cluster_assignments" label="Cluster Assignments"/>
+    <data format="xls" name="mtls" label="MTLS File"/>
+    <data format="txt" name="log" label="Log file" />
+    <data format="bmp" name="heatmap_image" label="Heatmap Image" />
+<!--    <data format="png" name="heatmap_image" label="Heatmap Image" >-->
+<!--      <filter>imageFormat=="png"</filter>-->
+<!--    </data>-->
+<!--    <data format="pdf" name="heatmap_image" label="Heatmap Image" >-->
+<!--      <filter>imageFormat=="pdf"</filter>-->
+<!--    </data>-->
+
+  </outputs>
+  <configfiles>
+    <configfile name="shscript">
+<!-- This is the script that runs (Chettah/bash code)-->
+#!/bin/bash
+
+#import os
+#set $path = $os.path.abspath($__app__.config.tool_path)
+
+
+## Set symbols so that they are not incorrectly interpreted:
+#set $dollar = chr(36)
+#set $gt = chr(62)
+#set $lt = chr(60)
+#set $ad = chr(38)
+#set $bs = chr(92)
+
+echo $map_rna ${ad}${gt}${gt} $log
+echo "This is the Bash log file: " ${ad}${gt}${gt} $log
+###############################################################################
+## Convert the gtf file to a file that aviv's script can hadel
+#if str($map_rna)=='yes'
+  echo "Converting gtf file" ${ad}${gt}${gt} $log
+  Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log
+  echo "done converting gtf file" ${ad}${gt}${gt} $log
+#end if
+###############################################################################
+## Get ChIP data in correctly formated strings and annotate if nessisary.
+#set $sep = '::'
+#for $i, $chip in enumerate( $chip_tracks )
+  #if $i==0
+    echo "Chip Files:" ${ad}${gt}${gt} $log
+    echo "The first file label is:  ${chip.name}" ${ad}${gt}${gt} $log
+    echo "The first file path is:  ${chip.file}" ${ad}${gt}${gt} $log
+    chip_labels=${chip.name}
+    chip_paths=${chip.file}
+  #else
+    echo "The next file label is:  ${chip.name}" ${ad}${gt}${gt} $log
+    echo "The next file path is:  ${chip.file}" ${ad}${gt}${gt} $log
+    chip_labels=${dollar}chip_labels${sep}${chip.name}
+    chip_paths=${dollar}chip_paths${sep}${chip.file}
+  #end if
+#end for
+
+echo chip paths are - ${dollar}chip_paths ${ad}${gt}${gt} $log
+echo chip labels are - ${dollar}chip_labels ${ad}${gt}${gt} $log
+
+###############################################################################
+## Cluster peaks
+
+Rscript $path/visualization/cluster_peaks.R \
+--input_files ${dollar}chip_paths \
+--input_type $chipInputFormat \
+--path_output ./ \
+--expt_names ${dollar}chip_labels \
+--dist_summits $summitDistance \
+--mtl_type $mtlType ${ad}${gt}${gt} $log
+
+###############################################################################
+## Annotate mtls.xls if nessisary
+#if str($map_rna)=="yes"
+  echo "annotating mtls.xls..." ${ad}${gt}${gt} $log
+  Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log
+#end if
+###############################################################################
+## If rna is specified, then get RNA data in correctly formated strings:
+#if str($map_rna)=='yes'
+  #set $sep = '::'
+  #for $i, $rna in enumerate( $rna_tracks )
+    #if $i==0
+      echo "The first file label is:  ${rna.name}" ${ad}${gt}${gt} $log
+      echo "The first file path is:  ${rna.file}" ${ad}${gt}${gt} $log
+      rna_labels=${rna.name}
+      rna_paths=${rna.file}
+      rna_norm_paths=${rna.norm}
+    #else
+      echo "The next file label is:  ${rna.name}" ${ad}${gt}${gt} $log
+      echo "The next file path is:  ${rna.file}" ${ad}${gt}${gt} $log
+      rna_labels=${dollar}rna_labels${sep}${rna.name}
+      rna_paths=${dollar}rna_paths${sep}${rna.file}
+      rna_norm_paths=${dollar}rna_norm_paths${sep}${rna.norm}
+    #end if
+  #end for
+  echo rna paths are - ${dollar}rna_paths ${ad}${gt}${gt} $log
+  echo rna labels are - ${dollar}rna_labels ${ad}${gt}${gt} $log
+  echo rna norm files are - ${dollar}rna_norm_paths ${ad}${gt}${gt} $log
+#end if
+###############################################################################
+
+#if str($normalize_rna)=='no'
+  echo "Normalization by file is set to no" ${ad}${gt}${gt} $log
+  rna_norm_paths=no
+#end if
+
+#if str($use_mean)=='yes'
+  echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log
+  rna_norm_paths=mean
+#end if
+
+#if str($map_rna)=='no'
+  mtls_file=mtls.xls
+  rna_paths=none
+  rna_labels=none
+#else
+  mtls_file=annotated_mtls.xls
+#end if
+
+echo "
+Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
+--cluster_file ./cluster \
+--chip_experiment_order ${dollar}chip_labels \
+--heatmap_file ./heatmap \
+--heatmap_type bmp \
+--n_clusters $numClusters \
+--filter_percentage 100 \
+--expression_file ${dollar}rna_paths \
+--expression_name ${dollar}rna_labels \
+--normalization_file ${dollar}rna_norm_paths \
+${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log  
+
+Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
+--cluster_file ./cluster \
+--chip_experiment_order ${dollar}chip_labels \
+--heatmap_file ./heatmap \
+--heatmap_type bmp \
+--n_clusters $numClusters \
+--filter_percentage 100 \
+--number_bins $numberBins \
+--include_targetless $includeTargetless \
+--expression_file ${dollar}rna_paths \
+--expression_name ${dollar}rna_labels \
+--normalization_file ${dollar}rna_norm_paths \
+${ad}${gt}${gt} $log
+
+ls ${ad}${gt}${gt} $log
+
+
+
+
+##################################################################
+#if str($map_rna)=='yes'
+    mv ./annotated_mtls.xls $mtls
+#else
+    mv ./mtls.xls $mtls
+#end if
+mv ./heatmap.* $heatmap_image
+mv ./cluster.tsv $cluster_assignments
+
+    </configfile>
+  </configfiles>
+<!--<tests>-->
+<!--  <test maxseconds="3600" name="GCA_1">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="3000" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_1/gca_1.xls" />-->
+<!--    <output name="output" file="gca_1/gca_1.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--  <test maxseconds="3600" name="GCA_2">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="100" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_2/gca_2.xls" />-->
+<!--    <output name="output" file="gca_2/gca_2.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--  <test maxseconds="3600" name="GCA_3">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="500" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_3/gca_3.xls" />-->
+<!--    <output name="output" file="gca_3/gca_3.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--  <test maxseconds="3600" name="GCA_4">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="1000" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_4/gca_4.xls" />-->
+<!--    <output name="output" file="gca_4/gca_4.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--  <test maxseconds="3600" name="GCA_5">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="10000" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_5/gca_5.xls" />-->
+<!--    <output name="output" file="gca_5/gca_5.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--</tests>-->
+  <help>
+This tool will merge peaks form multiple chip-seq experiments, creating MTLs for
+each overlapping region. It will then cluster each MTL based on the score of
+each peak within each MTL (using K-means clustering, with k set by user). A
+heatmap is then generated from the resulting cluster along with the MTLs
+generated. This module in writin in R and is will be made available on github
+and bioconductor. This work was done by Kieran Mace and Aviv Madar.
+
+**NEED IMPROVEMENT**
+
+-----
+
+**Parameters**
+
+- **Input files** contains either macs or BED files to be merged. This list of files must be two or larger.
+- **Experiment names** contains the name given to each track.
+- **Summit distance** is the cuttoff distance (in BP) to be included in an MTL. This option is not used with the summit option below
+- **Input Format** Either bed of MACS file format, all files must be of one type. Defaults to MACS
+- **MTL Type** Either interval or summit (defaults to summit).
+- **Number clusters** the value of k for kmeans clustering.
+- **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores).
+-----
+
+**Output**
+
+- **XLS file** is the tab-delimited file containing the MTL data.
+- **PNG file** is the heatmap image generated after clustering the MTL data.
+
+-----
+
+**script parameter list of Chip-Cluster**
+
+Options:
+DESCRIPTIION:
+	cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where
+	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
+	fall within a certain distance between summits from eachother.
+
+INPUT:
+	1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
+	2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
+	3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
+	4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100)
+	5.input_type=the type of input file used (MACS or .bed; defaults to MACS)
+	6.mtl_type=interval or summit (defaults to summit)
+
+EXAMPLE RUN:
+	cluster_peaks.R
+	--input_macs_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
+	--input_type MACS
+	--path_output results/
+	--expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
+	--dist_summits 100
+	--mtl_type summit
+
+	DESCRIPTIION:
+	heatmap.R takes a ...
+
+INPUT:
+	1.--mtls_file path to mtls file.
+
+	2.--cluster_file the destination path for the cluster file.
+
+	3.--heatmap_file the destination path for heatmap image (no extension).
+
+	4.--heatmap_type choice of image type, currently support png and pdf.
+
+	5.--n_clusters number of clusters in the heatmap
+
+	6.--filter_percentage percentage of mtls that will be analysed. for eg. if
+	we make filter_percentage 30, we will take the union of the top mtls in
+	mean, non-zero mean and variance.
+
+
+EXAMPLE RUN:
+	Rscript heatmap.R
+				--mtls_file mtls.xls
+				--cluster_file output/cluster
+				--heatmap_file output/heatmap
+				--heatmap_type png
+				--n_clusters 13
+				--filter_percentage 60
+
+Please cite us if you used this script:
+	The transcription factor network regulating Th17 lineage specification and function.
+	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
+	Richard Bonneau and Dan R. Littman et. al. (in preperation)
+
+  </help>
+
+</tool>