Mercurial > repos > kmace > mtls_analysis

--- a/mtls_analyze/mtls_analyze/annotate_mtls.R	Mon Jul 23 12:58:55 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,79 +0,0 @@
-debug = F
-# Read in Data
-if (debug == T) {
-  mtls <- read.delim("~/code/scorchR-annotate_mtls/mtls.xls", quote="", stringsAsFactors=F)
-  annotation <- read.delim("~/code/scorchR-annotate_mtls/gene_annotation.txt", header=T, stringsAsFactors=F)
-  threshold <- 100000
-} else {
-  cmd <- commandArgs(trailingOnly = T)
-  mtls <- read.delim(cmd[1], quote="", stringsAsFactors=F)
-  annotation <- read.delim(cmd[2], header=T, stringsAsFactors=F)
-  threshold <- as.numeric(cmd[3])
-}
-
-
-
-# Get the median summit
-summits <- sapply(strsplit(mtls[ ,"summit"], split="_"), function(x) {median(as.numeric(x))})
-# Extract every tss
-tss <- apply(annotation,1,function(x) {
-  if (x["strand"]=="+"){
-    return(as.numeric(x["txStart"]))
-    }else{
-      return(as.numeric(x["txEnd"]))
-      }
-  })
-
-
-# Get each Chromosome
-chromosomes <- unique( mtls[,"chr"])
-
-# Set up output:
-trgt <- character(length = length(summits))
-
-# Loop through Chromosomes
-for (chromosome in chromosomes){
-  cat(chromosome, "\n")
-  chr.ix.tss <- which(annotation[ ,"chrom"]==chromosome)
-  chr.ix.summits <- which(mtls[ ,"chr"]==chromosome)
-
-
-  cat(length(chr.ix.summits), " -summits\n",
-      length(chr.ix.tss), " -tss\n")
-  if (length(chr.ix.summits)>0 && length(chr.ix.tss)>0) {
-  # Loop through Summits
-  for(summit.ix in chr.ix.summits) {
-    #cat("Summit = ", summit.ix, class(summit.ix),"\n")
-    distances <- abs(tss[chr.ix.tss] - summits[summit.ix])
-    closest.ix.tss <- which(distances == min(distances))[1]
-    if(!is.na(closest.ix.tss) && !is.na(distances[closest.ix.tss])) {
-    #print(closest.ix.tss)
-    if (distances[closest.ix.tss] < threshold) {
-
-      trgt[summit.ix] <- annotation[chr.ix.tss, "name2"][closest.ix.tss]
-
-#       cat("A distance of ",
-#           distances[closest.ix.tss],
-#           " was found for the mtl summit located at: ",
-#           chromosome,
-#           ": ",
-#           summits[summit.ix],
-#           " and TSS located at: ",
-#           chromosome,
-#           ": ",
-#           tss[chr.ix.tss][closest.ix.tss],
-#           "\n")
-    }
-    }
-  }
-  } else {
-    cat("chromosome", chromosome, " either has no transcripts or no peaks in this experiment. \n")
-  }
-  output <- cbind(mtls, trgt)
-}
-
-write.table(output, file="annotated_mtls.xls", sep="\t",row.names=F, quote=F)
-
-
-
-
--- a/mtls_analyze/mtls_analyze/cluster_peaks.R	Mon Jul 23 12:58:55 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,431 +0,0 @@
-##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
-## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
-##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
-## Feb 2012 th17
-## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>,
-## NYU - Center for Genomics and Systems Biology
-##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
-## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
-##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
-
-rm(list=ls())
-debug=F
-verbose=F
-script.version=0.1
-print.error <- function(){
-	cat("
-DESCRIPTIION:
-	cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where
-	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
-	fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits.
-	If you use .bed files, the files must contain no header, and at least the five columns:
-	1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared
-	interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits.
-
-INPUT:
-	1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
-	2.path_output: path to save generated MTL cluster file (where to save mtls.xls)
-	3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
-	4.input_type: the type of input file used (MACS or BED; defaults to MACS)
-	5.mtl_type: interval or summit (defaults to summit)
-	6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit)
-
-EXAMPLE RUN:
-	cluster_peaks.R
-	--input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
-	--input_type MACS
-	--path_output results/
-	--expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
-	--dist_summits 100
-	--mtl_type summit
-
-Please cite us if you used this script:
-	The transcription factor network regulating Th17 lineage specification and function.
-	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
-	Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
-}
-
-############# helper functions:
-## split.macs.list.to.chrmosomes
-# input:
-#      - macs.list: a list of macs expts: here are a few lines of one expt
-## chr	start	end	length	summit	tags	#NAME?	fold_enrichment	FDR(%)
-## chr1	4322210	4323069	860	494	55	158.95	6.03	0.05
-## chr1	4797749	4798368	620	211	29	119.82	3.47	0.09
-## chr1	4848182	4849113	932	494	46	105.42	2.9	0.09
-#      - expts: a list of the expts names from macs.list that you want to process
-#      - chr: chrmosomes names
-# output:
-#      - x: a list with as many elements as chr specified by input.
-#      -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
-split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
-  x <- list()
-  n <- length(expts)
-  for(i in 1:n){
-    e <- expts[i] #experiment name
-    cat("wroking on expt", e,"\n")
-    x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
-    names(x[[e]]) <- chr
-  }
-  return(x)
-}
-# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
-# input:
-#     - r is chromosome
-#     - m is macs matrix for expt e from above function
-split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
-  ix.chr.i <- which(m[,"chr"]==r)
-  # cat("working on",r,"\n")
-  o <- list()
-  o[[r]] <- m[ix.chr.i,]
-  o[[r]]$peak.id <- ix.chr.i
-  return(o[[r]])
-}
-
-## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
-# also match these summit locations with corresponding:
-# pvals, tfs, peak start and peak end
-# input:
-#     - chr: a list of chromosome names
-#     - macs.list.per.chrom: a list of macs peaks for each chromosome
-# output:
-#     - o: a list each chormosome ruler as an element
-make.ruler.no.pml <- function(chr,macs.list.per.chrom){
-  x <- macs.list.per.chrom
-  o <- list()
-  for(j in 1:length(chr)){
-    r <- chr[j] # chrmosome we go over
-    s <- numeric()
-    pval <- numeric()
-    tf.to.s <- character()
-    start <- numeric()
-    end <- numeric()
-    trtmnts <- names(x) # the treatments name (expt names)
-    ## debug parameters ###
-    ## which experiment peaks come from
-    expt <- character()
-    ## what was the peak id in that experiment
-    peak.ids <- numeric()
-    ## this will allow us to always back track from a cluster to the actual peaks in it
-    ## debug params end ###
-    for(i in 1:length(trtmnts)){
-      o[[r]] <- list()
-      e <- trtmnts[i] #experiment name
-      tf <- strsplit(e,"_")[[1]][1]
-      s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
-      pval <- c(pval,x[[e]][[r]][,"score"])
-      start <- c(start,x[[e]][[r]][,"start"])
-      end <- c(end,x[[e]][[r]][,"end"])
-      expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
-      peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])
-    }
-    ix <- sort(s,index.return=TRUE)$ix
-    o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix])
-  }
-  return(o)
-}
-
-## add cluster memberships based on ruler
-## require no more than d.cut distance between tf summits
-## cur.l is the current loci number (or cluster number)
-assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){
- cur.l <- 0
- for(j in 1:length(ruler)){
-   s <- ruler[[j]][["summit"]]
-   l <- numeric(length=length(s))
-   if(length(l)>0){
-	   cur.l <- cur.l+1
-   }
-   if(length(l)==1){
-   	l[1] <- cur.l
-   } else if(length(l)>1) {
-	l[1] <- cur.l
-	   for(i in 2:length(l)){
-	     d <- s[i]-s[i-1] # assumes s is sorted increasingly
-	     if(d>d.cut){
-	       cur.l <- cur.l+1
-	     }
-	     l[i] <- cur.l
-	   }
-	}
-   ruler[[j]][["mtl.id"]] <- l
- }
- return(ruler)
-}
-
-## add cluster memberships based on ruler
-## require that peaks span will have a non-empty intersection
-## cur.mtl is the current loci number (or cluster number)
-assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){
-cur.mtl <- 0
-for(j in 1:length(ruler)){
-	s <- ruler[[j]][["start"]]
-	e <- ruler[[j]][["end"]]
-	l <- numeric(length=length(s))
-	if(length(l)>0){
-	   cur.mtl <- cur.mtl+1
-	}
-	if(length(l)==1){
-	  	l[1] <- cur.mtl
-	} else if(length(l)>1) {
-		l[1] <- cur.mtl
-		cur.e <- e[1] # the right-most bp belonging to the current mtl
-	   	for(i in 2:length(l)){
-	     	if(cur.e<s[i]){
-	       		cur.mtl <- cur.mtl+1
-	     	}
-	     	l[i] <- cur.mtl
-			cur.e <- max(cur.e,e[i])
-	   	}
-	}
-	ruler[[j]][["mtl.id"]] <- l
-}
-return(ruler)
-}
-
-## here we create a list of TF enriched loci (clusters)
-## input:
-##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
-## output:
-##     -ll: a list of clusters ll where each cluster (elem in ll holds):
-##        - mtl.id: the current loci (elem in ll)
-##        - s: summits vector as before
-##        - pval: pvals vector matching the summits
-##        - tfs: a vector of tfs matching s, the summits in loci l
-##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
-create.cluster.list.no.pml <- function(ruler){
-  tmp <- list()
-  ll <- list()
-  for(j in 1:length(ruler)){
-    r <- names(ruler)[j]
-    # cat("working on ",r,"\n")
-    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
-    l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique)
-    n <- length(l.vec) # iterate over n clusters
-    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
-  }
-  ## concatenate tmp into one list
-  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
-  ll=eval(parse(text=command))
-  return(ll)
-}
-get.cluster.params.no.pml <- function(i,x,l.vec,r){
-  # ix <- which(x[["l"]]==l.vec[i])
-  # l <- l.vec[i]
-  ix <- which(x[["mtl.id"]]==l.vec[i])
-  l <- l.vec[i]
-  start <- min(x[["start"]][ix])
-  end <- max(x[["end"]][ix])
-  # s <- x[["s"]][ix]
-  summit <- x[["summit"]][ix]
-  # pval <- x[["pval"]][ix]
-  score <- x[["score"]][ix]
-  # pval.mean <- mean(pval)
-  score.mean <- mean(score)
-  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
-  span.l <- end-start
-  peak.ids <- x[["peak.ids"]][ix]
-  expt <- x[["expt"]][ix]
-  expt.alphanum.sorted <- sort(x[["expt"]][ix])
-
-  chr <- rep(r,length(ix))
-  return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
-			  end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs,
-			  span.l=span.l,peak.ids=peak.ids))
-}
-
-## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
-## input:
-##     - ll: a cluster list
-##     - f.nm: a file name (include path) to where you want files to print
-##     - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
-## output
-##     - a tab delim file with clusers as rows and elems tab delim for each cluster
-print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
-  options(digits=5)
-  cat(file=f.nm,names(ll[[1]]),sep="\t")
-  cat(file=f.nm,"\n",append=TRUE)
-  for(i in 1:length(ll)){
-	line <- ll[[i]][[1]] # put MTL number
-	line <- paste(line,ll[[i]][[2]][1],sep="\t")
-	for(j in 3:length(ll[[i]])){
-		val <- ll[[i]][[j]]
-		if(length(val) == 1){
-			line <- paste(line,val,sep="\t")
-		} else {
-			line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
-		}
-	}
-  	cat(file=f.nm,line,"\n",append=TRUE)
-  }
-}
-# read command line paramters that are not optional
-read.cmd.line.params.must <- function(args.nms, cmd.line.args){
-	if(length(grep("--version",cmd.line.args))){
-		cat("version",script.version,"\n")
-		q()
-	}
-	args <- sapply(strsplit(cmd.line.args," "),function(i) i)
-	vals <- character(length(args.nms))
-	# split cmd.line to key and value pairs
-	for(i in 1:length(args.nms)){
-		ix <- grep(args.nms[i],args)
-		if(length(ix)>1){
-			stop("arg ",args.nms[i]," used more than once.  Bailing out...\n",print.error())
-		} else if (length(ix)==0){
-			stop("could not find ",args.nms[i],". Bailing out...\n",print.error())
-		} else {
-			vals[i] <- args[ix+1]
-		}
-	}
-	return(vals)
-}
-# read command line paramters that are optional, if an optional param is not give functions return na for this param
-read.cmd.line.params.optional <- function(args.nms, cmd.line.args){
-	args <- sapply(strsplit(cmd.line.args," "),function(i) i)
-	vals <- character(length(args.nms))
-	# split cmd.line to key and value pairs
-	for(i in 1:length(args.nms)){
-		ix <- grep(args.nms[i],args)
-		if(length(ix)>1){
-			stop("arg ",args.nms[i]," used more than once.  Bailing out...\n",print.error())
-		} else if (length(ix)==0){ # if --param was not written in cmd line
-			vals[i] <- NA
-		} else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value
-		 	vals[i] <- args[ix+1]
-		} else { # otherwise
-			vals[i] <- NA
-		}
-	}
-	return(vals)
-}
-
-############# Code:
-# retrieve args
-
-if(debug==T){
-	cmd.args <- c(
-		"--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls",
-		#"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed",
-		"--input_type MACS", #BED
-		"--path_output ./results/",
-		"--expt_names macs1::macs2::macs3",
-		#"--expt_names bed1::bed2::bed3",
-		"--mtl_type interval", #interval summit
-		"--dist_summits 100"
-	)
-} else {
-	cmd.args <- commandArgs(trailingOnly = T);
-}
-
-# if(length(grep("--version",cmd.args))){
-# 	cat("version",script.version,"\n")
-# 	q()
-# }
-args.nms.must <- c(
-		"--input_files",   #1
-		"--path_output",  	 #2
-		"--expt_names" #3
-)
-
-# all numeric params must come at the end of args.nms.optional
-n.start.numeric <- 3
-args.nms.optional <- c(
-		"--input_type",	#1
-		"--mtl_type", 	 #2
-		"--dist_summits" #3
-)
-
-# get must parameters
-vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args)
-if(verbose){
-	cat("\nfinshed reading vals: \n")
-	cat("\nvals.must", unlist(vals.must), "\n")
-}
-input.files <- strsplit(vals.must[1],"::")[[1]]
-if(length(input.files)==1){
-	cat("only provided one MACS file to cluster.")
-	print.error()
-}
-path.output <- vals.must[2]
-expt.names <- strsplit(vals.must[3],"::")[[1]]
-# get optional parameters
-vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args)
-if(verbose){
-	cat("\nvals.optional:", unlist(vals.optional),"\n")
-}
-if(is.na(vals.optional[1])){
-	input.type <- "MACS"
-} else {
-	input.type <- vals.optional[1]
-}
-if(is.na(vals.optional[2])){
-	mtl.type <- "interval"
-} else {
-	mtl.type <- vals.optional[2]
-}
-if(is.na(vals.optional[2])){
-	dist.summits <- 100
-} else {
-	dist.summits <- as.numeric(vals.optional[3])
-}
-
-# read MACS files
-unique.chr <- character()
-infile.list <- list()
-col.nms.keepers <- c("chr","start","end","summit","score")
-for(i in 1:length(input.files)){
-	e <- expt.names[i]
-	if(toupper(input.type)=="MACS"){
-		tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F)
-		columns <- c("chr","start","end","summit","pvalue")
-		ix.cols <- numeric()
-		for(j in 1:length(columns)){
-			ix.j <- grep(columns[j],names(tmp))
-			if(length(ix.j)==0){
-				stop("can't find (grep) column ",columns[j]," in MACS input file ", e)
-			}
-			ix.cols <- c(ix.cols,ix.j)
-		}
-		infile.list[[e]] <- tmp[,ix.cols]
-		colnames(infile.list[[e]]) <- col.nms.keepers
-	}
-	if(toupper(input.type)=="BED"){
-		tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE)
-		tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column
-		colnames(tmp)  <- col.nms.keepers
-		infile.list[[e]] <- tmp[,1:5]
-	}
-	unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"]))
-}
-
-unique.chr <- sort(unique.chr)
-
-# take all macs files and put peaks together on each chromosome
-# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
-cat("splitting input files per chromosome\n")
-x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr)
-cat("adding peaks from all input files into chromosome rulers\n")
-ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x)
-cat("add MTL membership to the ruler\n")
-
-if(mtl.type=="interval"){
-	ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler)
-} else {
-	ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits)
-}
-
-cat("creating MTL list\n")
-ll <- create.cluster.list.no.pml(ruler=ruler)
-cat("writing MTL table\n")
-f.nm <- paste(sep="",path.output,"mtls",".xls")
-print.ll.verbose.all(ll=ll,f.nm=f.nm)
-
-
-
-
-
-
-
-
-
--- a/mtls_analyze/mtls_analyze/gtfToMapFriendlyAnnotation.R	Mon Jul 23 12:58:55 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,102 +0,0 @@
-rm (list = ls())
-args <- commandArgs()
-print(args)
-gtf <- read.table(args[6],as.is=T)
-colnames(gtf) <- c("chrom",
-                   "source",
-                   "type",
-                   "five_prime",
-                   "three_prime",
-                   "score",
-                   "strand",
-                   "frame",
-                   "nothinggene",
-                   "gene",
-                   "nothingSemi1",
-                   "nothingtranscript",
-                   "transcript",
-                   "nothingSemi2")
-gtf.exon <- gtf[which(gtf[,"type"]=="exon"),]
-gtf.exon.slim <-gtf.exon[,c("chrom", "five_prime", "three_prime", "strand", "transcript")]
-gtf.exon.slim.sort <- gtf.exon.slim[sort.list(gtf.exon.slim[,"transcript"]),]
-
-
-
-transcripts <- as.vector(unique(gtf.exon.slim[,"transcript"]))
-#transcripts <- as.vector(unique(gtf.exon.slim[,"transcript"]))
-
-
-
-output <- matrix(0,nr=length(transcripts),nc=ncol(gtf.exon.slim.sort))
-
-all.chrom <- gtf.exon.slim.sort[,"chrom"]
-all.five_prime <- gtf.exon.slim.sort[,"five_prime"]
-all.three_prime <- gtf.exon.slim.sort[,"three_prime"]
-all.strand <- gtf.exon.slim.sort[,"strand"]
-all.transcript <- gtf.exon.slim.sort[,"transcript"]
-
-colnames(output) <- colnames(gtf.exon.slim.sort)
-
-j <- 0
-i <- 1
-
-previous.strand <- gtf.exon.slim.sort[i,"strand"]
-previous.chrom <- gtf.exon.slim.sort[i,"chrom"]
-previous.transcript <- gtf.exon.slim.sort[i,"transcript"]
-previous.five_prime.min <- gtf.exon.slim.sort[i,"five_prime"]
-previous.three_prime.max <- gtf.exon.slim.sort[i,"three_prime"]
-
-# Progress bar:
-total <- dim(gtf.exon.slim.sort)[1]
-# create progress bar
-pb <- txtProgressBar(min = 0, max = total, style = 3)
-
-
-
-
-for ( i in 1:length(all.transcript)) {
-  current.transcript <- all.transcript[i]
-  setTxtProgressBar(pb, i)
-  if (previous.transcript != current.transcript) {
-    j <- j + 1
-    # Write out the current transcript info
-    output[j,"chrom"] <- previous.chrom
-    output[j,"five_prime"] <- previous.five_prime.min
-    output[j,"three_prime"] <- previous.three_prime.max
-    output[j,"strand"] <- previous.strand
-    output[j,"transcript"] <- previous.transcript
-
-    # Save the new transcript info (convert the current to previous)
-
-    previous.strand <- all.strand[i]
-    previous.chrom <- all.chrom[i]
-    previous.transcript <- all.transcript[i] # current.transcript
-    previous.five_prime.min <- all.five_prime[i]
-    previous.three_prime.max <- all.three_prime[i]
-    }
-  else {
-    previous.five_prime.min <- min(previous.five_prime.min, all.five_prime[i])
-    previous.three_prime.max <- max(previous.three_prime.max, all.three_prime[i])
-    previous.transcript <- all.transcript[i] # current.transcript
-  }
-}
-# Write the last item
-j <- j + 1
-# Write out the current transcript info
-output[j,"chrom"] <- previous.chrom
-output[j,"five_prime"] <- previous.five_prime.min
-output[j,"three_prime"] <- previous.three_prime.max
-output[j,"strand"] <- previous.strand
-output[j,"transcript"] <- previous.transcript
-
-close(pb)
-
-colnames(output) <- c("chrom", "txStart", "txEnd", "strand", "name2")
-write.table(output, file="gene_annotation.txt", sep="\t", row.names=F)
-
-  #ix <- which(gtf.exon.slim[,"transcript"] == transcripts[i])
-  #output[i,"transcript"] <- transcripts[i]
-  #output[i,"five_prime"] <- min(gtf.exon.slim[ix,"five_prime"])
-  #output[i,"three_prime"] <- max(gtf.exon.slim[ix,"three_prime"])
-  #output[i,"strand"] <- gtf.exon.slim[ix,"strand"][1]
-  #output[i,"chrom"] <- gtf.exon.slim[ix,"chrom"][1]
--- a/mtls_analyze/mtls_analyze/heatmap.R	Mon Jul 23 12:58:55 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1524 +0,0 @@
-GeneratePeakMatrix <- function(experiments, scores) {
-  # Generates a score matrix for all mtls.
-  #
-  # Args:
-  #   experiments: A list of underscore deliminated experiments for each mtl.
-  #                There should never be a completely empty item in this list.
-  #                for eg. experiments[1] = expA_expD_expB.
-  #   scores: A list of underscore deliminated scores for each mtl, the length
-  #           of these scores should be identicle to the length of the
-  #           experiments. eg. scores[1] = 55_33_245.
-  #
-  # Returns:
-  #   The peak score matrix for all mtls.
-  experiments <- sapply(experiments, function(x) strsplit(x, split = "_"))
-  scores <- sapply(scores, function(x) strsplit(x, split = "_"))
-  unique.experiments <- unique(unlist(experiments))
-
-  peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments))
-  colnames(peaks) <- unique.experiments
-
-  for(i in 1:length(experiments)){
-    for(j in 1:length(experiments[[i]])){
-      peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j])
-    }
-  }
-  return(peaks)
-##################################################################
-}
-GetChipData <- function(file.name,
-                        proximity = "distal",
-                        include.targetless = TRUE,
-                        column.order = NA) {
-  # Reads in, filters, and organizes mtls data
-  #
-  # Args:
-  #   file.name: The path to the mtls file (including the file name).
-  #   proximity: Either "distal" or "proximal", defines the gene target distance
-  #              from the mtl. Default is "distal".
-  #   include.targetless: If TRUE, includes mtls with no targets (applied after
-  #                       the proximity filter); if not, mtls with no target
-  #                       will be exclided. Default is TRUE.
-  #   column.order: An optional vector of column names in the order in which
-  #                 they will be presented. If this is left to default (NA) the
-  #                 presented order of the chip columns will be the order in
-  #                 which they are seen.
-  #
-  # Returns:
-  #   An organized list of mtls data with the following elements:
-  #     $peaks - a matrix of peak p-values
-  #     $targets - a list of underscore deliminated gene targets for each mtl
-  if(param$debug) {
-    print("In GetChipData")
-  }
-  # Set Constants for the mtls file type:
-  MTLNUM <- "mtl.id"
-  CHROMOSOME <- "chr"
-  EXPERIMENTS <- "expt"
-  EXPERIMENTS.SORTED <- "expt.alphanum.sorted"
-  START <- "start"
-  END <- "end"
-  SUMMIT <- "summit"
-  SCORES <- "score"
-  SCORE.MEAN <- "score.mean"
-  SPANS <- "span.tfs"
-  SPAN.TOTAL <- "span.l"
-  PEAK.IDS <- "peak.ids"
-  TARGETS <- "trgt"
-#  PROXIMAL.TARGETS <- "trgt.prox"
-#  DISTAL.TARGETS <- "trgt.dist"
-#  PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox"
-#  DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist"
-
-
-  #Get chip data in from files:
-  #TARGETS <- switch(proximity,
-  #                   distal = DISTAL.TARGETS,
-  #                   proximal = PROXIMAL.TARGETS,
-  #                   stop(" Bad proximity argument supplied."))  # Default
-cat("param$rna.files = ", param$rna.files, "\n")
-if(!is.na(param$rna.files) && param$rna.files != "none") {
-  keep.columns <- c(EXPERIMENTS, SCORES, TARGETS)
-} else {
-  keep.columns <- c(EXPERIMENTS, SCORES)
-}
-  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
-if(!is.na(param$rna.files) && param$rna.files != "none") {
-  if (!include.targetless) {
-    ix.has.trgt <- which(as.character(file[, TARGETS])!="")
-    file <- file[ix.has.trgt, ]
-  }
-}
-  chip = list()
-
-  chip$peaks <- GeneratePeakMatrix(file[, EXPERIMENTS], file[, SCORES])
-  if(!is.na(column.order)) {
-  #If you specify an order, or a subset of experiments to include, this is where
-  #that gets done.
-  order <- unlist(strsplit(column.order, split="::"))
-  chip$peaks <- chip$peaks[,order]
-  }
-  if(!is.na(param$rna.files) && param$rna.files != "none") {
-  chip$targets <- file[ , TARGETS]
-}
-
-  return (chip)
-}
-GetRNADataFromOneFile <- function(file.name, fpkm = "avg") {
-  # Reads in rna expression data from cufflinks
-  #
-  # Args:
-  #   file.name: The path to the cufflinks file (including the file name).
-  #   fpkm: What fpkm is chosen, options are hi, low and avg
-  # Returns:
-  #   An organized vector of rnaseq data with the following elements:
-  #     names(data) - the genes from the file. eg names(data[1]) = JUND
-  #     data - the fpkm score for that gene data[1] = 31
-  if(param$debug) {
-    cat("In GetRNADataFromOneFile for ",file.name)
-  }
-
- # Set Constants for the cufflinks file type:
-  GENE <- "gene_id"
-  BUNDLE <- "bundle_id"
-  CHROMOSOME <- "chr"
-  LEFT <- "left"
-  RIGHT <- "right"
-  FPKM_AVG <- "FPKM"
-  FPKM_LOW <- "FPKM_conf_lo"
-  FPKM_HIGH <- "FPKM_conf_hi"
-  STATUS <- "status"
-
-
-  #Get chip data in from files:
-   FPKM <- switch(fpkm,
-                  hi = FPKM_HIGH,
-                  avg = FPKM_AVG,
-                  low = FPKM_LOW,
-                  stop(" Bad fpkm quality argument supplied."))  # Default
-
-  keep.columns <- c(GENE, FPKM)
-
-  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
-
-  rna = vector(mode = "numeric", length = nrow(file))
-
-  rna <- file[, FPKM]
-  names(rna) <- file[ , GENE]
-  return (rna)
-}
-
-GetRNAData <- function(file.names, file.lables = file.names, fpkm = "avg") {
-  # Reads in rna expression data from cufflinks
-  #
-  # Args:
-  #   file.names: The list of paths to the cufflinks files (including the file name).
-  #   fpkm: What fpkm is chosen, options are hi, low and avg
-  # Returns:
-  #   A matrix of rnaseq data with the following description:
-  #     each col corresponds to an rnaseq run
-  #     eacg row corresponds to a gene
-  if(param$debug) {
-    print("In GetRNAData")
-  }
-  files <- list()
-  for(i in 1:length(file.names)) {
-  files[[i]] <-GetRNADataFromOneFile(file.names[i])
-  }
-  genes <- unique(names(unlist(files)))
-  scores <- matrix(0,nrow=length(genes),ncol=length(file.names))
-  rownames(scores) <- genes
-  colnames(scores) <- file.names
-
-  for(j in 1:length(file.names)) {
-    scores[names(files[[j]]),j] <- files[[j]]
-  }
-print("# of cols for scores is ")
-print(ncol(scores))
-print ("file lables are ")
-print (file.lables)
-  colnames(scores) <- file.lables
-  return(scores)
-  #scores[genes,file.names] <- files[[]]
-}
-
-NormalizeRNA <- function(scores) {
-
-#add psudocount
-scores <- scores+1
-
-numerator <- scores[,1:(ncol(scores)/2)]
-denominator <- scores[,(ncol(scores)/2 + 1):ncol(scores)]
-
-#new.scores <- scores[,-which(colnames(scores) == norm.exp)]
-#norm.score <- scores[,which(colnames(scores) == norm.exp)]
-#return(log2(new.scores/norm.score))
-
-return(log2(numerator/denominator))
-
-}
-
-PrepareRNAforHeatmap <- function(new.scores.foldchange){
-new.scores.split <- matrix(0, nr=nrow(new.scores.foldchange), nc=2*ncol(new.scores.foldchange))
-is.even <- function(x){ x %% 2 == 0 }
-corresponding.col <- function(x){ceiling(x/2)}
-new.col.names <- vector(length = ncol(new.scores.split))
-rownames(new.scores.split) <- rownames(new.scores.foldchange)
-  for(i in 1:ncol(new.scores.split)) {
-	if(is.even(i)) {sign <- "down"} else {sign <- "up"}
-  new.col.names[i] <- paste(colnames(new.scores.foldchange)[corresponding.col(i)],
-					sign, sep =".")
-	new.scores.split[,i] <- new.scores.foldchange[,corresponding.col(i)]
-	if(is.even(i)) {
-		new.scores.split[which(new.scores.split[,i]>0),i] <- 0
-		new.scores.split[,i] <- -1*new.scores.split[,i]
-	} else {
-		new.scores.split[which(new.scores.split[,i]<0),i] <- 0
-	}
-	colnames(new.scores.split) <- new.col.names
-}
-return(new.scores.split)}
-
-#ConvertExpressionToPval = function(expression, threshold = 1)
-#{
-#    print("In ConvertExpressionToPval")
-#    #Generate Normal Stats
-#    # expression.mean = apply(expression, 2, mean)
-#    # expression.sd = apply(expression, 2, sd)
-#
-#    ix.above.threshold = which(expression > threshold)
-#    expression.mean = apply(expression, 2, function(i) mean(i[which(i > threshold)]))
-#    expression.sd = apply(expression, 2, function(i)     sd(i[which(i > threshold)]))
-#
-
-#    # CDF from zero to point
-#    # expression.pval = matrix(,nr=nrow(expression),nc=ncol(expression),dimnames=dimnames(expression))
-#    # ix.low = which(expression < expression.mean)
-#    # ix.upper = which(expression >= expression.mean)
-#    # expression.pval[ix.low] = (-10 * pnorm(expression[ix.low], mean = expression.mean, sd = expression.sd, log=T) *log10(exp(1)))
-#
-#    expression.pval = (-10 * log10(exp(1)) * pnorm(expression,
-#                                                    mean = expression.mean,
-#                                                    sd = expression.sd,
-#                                                    log=T,
-#                                                    lower.tail=F
-#    )
-#    )
-#    #squash those under threshold to zero
-#    expression.pval[-ix.above.threshold] = 0
-
-#    # correct for upper tail (ie from point to inf)
-#    #Still need to do
-#    return (expression.pval)
-#}
-MapExpressiontoChip2 = function(chip.targets, expression)
-{
-mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression))
-rownames(mymatrix) <- chip.targets
-colnames(mymatrix) <- colnames(expression)
-
-
-j <- 1 #expression counter
-i <- 1 #mymatrix counter
-expression.genes <- rownames(expression)
-previous.expression.gene <- expression.genes[j]
-
-# Progress bar:
-total <- length(chip.targets)
-# create progress bar
-pb <- txtProgressBar(min = 0, max = total, style = 3)
-
-for ( i in 1:length(chip.targets)) {
-  current.target <- chip.targets[i]
-  setTxtProgressBar(pb, i)
-  while (current.target > previous.expression.gene && j<=length(expression.genes)) {
-    j <- j + 1
-    previous.expression.gene <- expression.genes[j]
-  }
-  if (current.target == previous.expression.gene){
-    mymatrix[i,] <- expression[j,]
-  }
-}
-
-close(pb)
-
-           print("Leaving MapRNAtoChip")
-           return(mymatrix)
-}
-
-
-MapExpressiontoChip = function(chip.targets, expression)
-{
-    print("In MapRNAtoChip")
-    # targets = unlist(strsplit(chip$targets[i], "_"))
-    #   exp = matrix(rna$all.pval[targets, ])
-    #   return (apply(exp, 2, mean))
-
-    #rna col 1 = th0 overexpressed
-    #rna col 2 = th17 overexpressed
-    #rna col 3 = true zscores
-
-
-
-
-
-
-    ####################################################################
-    chip.targets.notnull.unique <- unique(chip.targets[which(chip.targets!="")])
-    gene.intersect <- intersect(rownames(expression), chip.targets.notnull.unique)
-    chip.targets.notnull.unique.with.data <- chip.targets.notnull.unique[which(chip.targets.notnull.unique %in% gene.intersect)]
-    expression.useful.data <- expression[which(rownames(expression) %in% gene.intersect),]
-    chip.targets.notnull.unique.with.data.sorted <- chip.targets.notnull.unique.with.data[order(chip.targets.notnull.unique.with.data)]
-    expression.useful.data.sorted <- expression.useful.data[order(rownames(expression.useful.data)),]
-    ####################################################################
-
-    mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression))
-       rownames(mymatrix) <- chip.targets
-       colnames(mymatrix) <- colnames(expression)
-       head(chip.targets.notnull.unique.with.data.sorted)
-       head(rownames(expression.useful.data.sorted))
-       if(!identical(chip.targets.notnull.unique.with.data.sorted, rownames(expression.useful.data.sorted))){
-        stop("We have a serious problem, chip is not alligned to expression")
-      }
-       for(i in 1:length(chip.targets.notnull.unique.with.data.sorted))
-          {
-            mtls.ix <- which(chip.targets == chip.targets.notnull.unique.with.data.sorted[i])
-            for(j in 1:length(mtls.ix)){
-                mymatrix[mtls.ix[j],] <- expression.useful.data.sorted[i,]
-              }
-          }
-           print("Leaving MapRNAtoChip")
-           return(mymatrix)
-
-   #return( sapply(chip$targets, function(x) apply(rna$all.pval[unlist(strsplit(chip$targets[3], "_")), ], 2, mean)) )
-}
-
-
-
-
-
-GenerateKMOrder <- function(data, km) {
-  # generates the order of clusters from high to low mean values
-  #
-  # Args:
-  #   data: A matrix of scores that have already been clustered.
-  #   km: The K-means object generated from running kmeans on data. Another
-  #       method could be used so long as it supplies a (km)$cluser list. Must
-  #       have the same length as the number of rows in data
-  #
-  # Returns:
-  #   cluster.order: The order in which the clusters should be displayed.
-
-    km.cluster = km$cluster
-    clusters = unique(km.cluster)
-    clusters.avg = numeric()
-    for(i in clusters) {
-        clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ]))
-    }
-    if(param$debug) {
-      print ("straight clusters")
-      print (clusters)
-      print ("straigth average")
-      print (clusters.avg)
-      print ("ordered clusters")
-      print (clusters[order(clusters.avg)])
-      print("ordered average")
-      print (clusters.avg[order(clusters.avg)])
-    }
-    return(clusters[rev(order(clusters.avg))])
-}
-
-OrderMTL <- function(data, km, cluster.order) {
-  # Orders a matrix of data according to a clustering algorithm
-  #
-  # Args:
-  #   data: A matrix of scores that have already been clustered.
-  #   km: The K-means object generated from running kmeans on data. Another
-  #       method could be used so long as it supplies a (km)$cluser list. Must
-  #       have the same length as the number of rows in data
-  #   cluster.order: The order in which the clusters should be displayed.
-  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2
-  #                  being on top, then cluster 3 and lastly cluster 1.
-  #
-  # Returns:
-  #   a list that contains 3 objects:
-  #     list$data: the ordered version of the data.
-  #     list$color.vector: a list of colors that should be assigned to each row.
-  #     list$start.row: the starting row of each cluster in data.
-  number.clusters <- length(cluster.order)
-  cluster.colors <- sample(rainbow(number.clusters))
-
-  # Set up return objects
-  sorted.data <- matrix(,nr=nrow(data), nc=ncol(data))
-  colnames(sorted.data) <- colnames(data)
-  cluster.color.vector = vector(length=length(km$cluster))
-  cluster.start.row = numeric(number.clusters)
-  cluster.start.row[1]=1
-
-  for (i in 1:number.clusters)
-  {
-      current.cluster = cluster.order[i]
-      ix = which(km$cluster == current.cluster)
-      current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1)
-      sorted.data[current.cluster.range, ] = data[ix, ]
-      cluster.color.vector[current.cluster.range] = cluster.colors[i]
-      cluster.start.row[i+1] = (cluster.start.row[i]+length(ix))
-  }
-  ret.list = list()
-  ret.list$data = sorted.data
-  ret.list$color.vector = cluster.color.vector
-  ret.list$cluster.index = cluster.start.row
-  return(ret.list)
-}
-
-
-CreateHeatMap <- function(data,
-                         km,
-                         cluster.order,
-                         document.name,
-                         document.type = "png",
-                         number.colors = 30) {
-  # Generates a heatmap image based for a matrix based on a clustering
-  # algorithm.
-  #
-  # Args:
-  #   data: A matrix of scores that have already been clustered, the column
-  #         names of this matrix will become the column titels of the heatmap.
-  #   km: The K-means object generated from running kmeans on data. Another
-  #       method could be used so long as it supplies a (km)$cluser list.
-  #   cluster.order: The order in which the clusters should be displayed.
-  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2
-  #                  being on top, then cluster 3 and lastly cluster 1.
-  #   document.name: A name for the produced file. there is no need to
-  #                  supply the .png/.pdf in your argument.
-  #   document.type: The type of file you want to produce. current options are
-  #                  png and pdf. Default is pdf.
-  #
-  # Returns:
-  #   Nothing to the script that calls it, however it creates an image at the
-  #   path specified.
-  if(param$debug) {
-    print("In CreateHeatMap")
-  }
-
-  data.ordered <- OrderMTL(data, km, cluster.order)
-
-
-	#Load Lib
-	library(gplots)
-
-	#Set Color gradient
-	color.ramp = colorRampPalette(c("black",
-	                                "darkblue",
-	                                "blue",
-	                                "yellow",
-	                                "orange",
-	                                "red"))(number.colors) #7
-
-	if(document.type == "png") {
-          png(paste(document.name,".png", sep = ""),
-              #width=15360,#1920,
-              #height=204080,#2560,
-              #res=500,
-              antialias="gray")
-        }
-        else if(document.type == "pdf") {
-        pdf(paste(document.name,".pdf", sep = ""))
-        }
-        else if(document.type == "tiff") {
-        tiff(paste(document.name,".tiff", sep = ""),
-        res=800,
-        pointsize=2,
-        width=1920,
-        height=1920)
-        }
-        else {
-        bitmap(paste(document.name,".bmp", sep = ""),
-        height = 5,
-        width = 5,
-        res = 500)
-        }
-#op <- par(mar = rep(0, 4))
-
-	heatmap.2(
-		data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))],
-		rowsep = data.ordered$cluster.index[-1],
-		sepwidth = c(0.5, ncol(data)/100),
-		dendrogram = "none",
-		Rowv = F,
-		Colv = F,
-		trace = "none",
-		labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString),
-		labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)),
-		RowSideColors = data.ordered$color.vector,
-		keysize=0.6,
-		key=F,
-		col = color.ramp,
-		cexCol = 0.8,
-		cexRow = 0.8)
-
-	dev.off()
-
-}
-
-CreateIndividualHeatMap <- function(data,
-                         km,
-                         cluster.order,
-                         color.ramp = colorRampPalette(c("black",
-                                  "darkblue",
-                                  "blue",
-                                  "yellow",
-                                  "orange",
-                                  "red"))(30)) {
-  # Generates a heatmap image based for a matrix based on a clustering
-  # algorithm.
-  #
-  # Args:
-  #   data: A matrix of scores that have already been clustered, the column
-  #         names of this matrix will become the column titels of the heatmap.
-  #   km: The K-means object generated from running kmeans on data. Another
-  #       method could be used so long as it supplies a (km)$cluser list.
-  #   cluster.order: The order in which the clusters should be displayed.
-  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2
-  #                  being on top, then cluster 3 and lastly cluster 1.
-  #   document.name: A name for the produced file. there is no need to
-  #                  supply the .png/.pdf in your argument.
-  #   document.type: The type of file you want to produce. current options are
-  #                  png and pdf. Default is pdf.
-  #
-  # Returns:
-  #   Nothing to the script that calls it, however it creates an image at the
-  #   path specified.
-  if(param$debug) {
-    print("In CreateHeatMap")
-  }
-
-  data.ordered <- OrderMTL(data, km, cluster.order)
-
-
-  #Load Lib
-  library(gplots)
-
-
-
-
-  heatmap.3(
-    data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))],
-    rowsep = data.ordered$cluster.index[-1],
-    sepwidth = c(0.5, nrow(data.ordered$data)/100),
-    dendrogram = "none",
-    Rowv = F,
-    Colv = F,
-    trace = "none",
-    labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString),
-    labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)),
-    #sourcRowSideColors = data.ordered$color.vector,
-    keysize=0.6,
-    key=F,
-    col = color.ramp,
-    cexCol = 0.8,
-    cexRow = 0.8)
-
-
-}
-
-ReadCommadLineParameters <- function(argument.names, command.line.arguments, optional = F) {
-  # Reads the parameters from the command line arguments.
-  #
-  # Args:
-  #   argument.names: A list of expected argument names.
-  #   command.line.arguments: The list of recieved command line arguments
-  #   optional: Are the areguments optional, or are they required, default is required
-  #
-  # Returns:
-  #   The arguments for argument.names. As strings In that order.
-
-    if(length(grep("--version",command.line.arguments))) {
-        cat("version",script.version,"\n")
-        q()
-    }
-    # Split command line arguments
-    args <- sapply(strsplit(command.line.arguments, " "),function(i) i)
-    vals <- character(length(argument.names))
-    # split cmd.line to key and value pairs
-    for(i in 1:length(argument.names)) {
-        ix <- grep(argument.names[i], args)
-        if(length(ix)>1) {
-            stop("arg ",
-                 argument.names[i],
-                 " used more than once.  Bailing out...\n",
-                 PrintParamError())
-        }
-        else if (length(ix)==0 && !optional) {
-            stop("could not find ",
-                 argument.names[i],
-                 ". Bailing out...\n",
-                 PrintParamError())
-        }
-        else if (length(ix)==0 && optional) {
-        vals[i] <- NA
-        }
-        else {
-        vals[i] <- args[ix+1]
-        }
-    }
-    return(vals)
-}
-
-PrintParamError <- function(){
-  # Prints the usage of the function, shows users what arguments they can use
-  #
-  # Args:
-  #   param: A list that contains all the paramaters.
-  #
-  # Returns:
-  #   A modified version of the param list, with the default values loaded.
-    cat("
-DESCRIPTIION:
-    heatmap.R takes a ...
-
-INPUT:
-    1.--mtls_file: path to mtls file.\n
-    2.--cluster_file: the destination path for the output cluster file.\n
-    3.--chip_experiment_order: The order of desired chip experiments (optional).\n
-    4.--heatmap_file: path for output heatmap image (no extension).\n
-    5.--heatmap_type: choice of image format, currently support png, pdf, tiff and bmp (optional)\n
-    6.--expression_file: list of expression files to be included in analysis (optional).\n
-    7.--expression_name: lables for the expression files (optional).\n
-    8.--normalization_file: a list of files to be used for normalization,
-    they can be the same file, however the number of expression nominated
-    normalization files must match the number of expression files (optional)\n
-    9.--n_clusters: number of clusters\n
-    10.--filter_percentage: percentage of mtls that will be analysed. Eg: if
-    we make filter_percentage 30, we will take the union of the top mtls in
-    mean, non-zero mean and variance (optional).\n
-
-EXAMPLE RUN:
-    Rscript heatmap.R
-        --mtls_file path/to/mtls.xls
-        --cluster_file path/to/output/cluster
-        --chip_experiment_order tf1::tf2::tf5::tf3
-        --heatmap_file path/to/output/heatmap
-        --heatmap_type png
-        --expression_file path/to/exp1::path/to/exp2
-        --expression_name myexp1::myexp2
-        --normalization_file path/to/exp3::path/to/exp3
-        --n_clusters 13
-        --filter_percentage 100
-        --include_targetless yes
-        --number_bins 30
-
-    ")
-}
-
-#LoadDefaultParams <- function(param) {
-#  # Loads default paramaters for the heatmap application
-#  #
-#  # Args:
-#  #   param: A list that contains all the previous paramaters.
-#  #
-#  # Returns:
-#  #   A modified version of the param list, with the default values loaded.
-
-#script.version=0.1
-#param$debug = F
-
-## RNA data:
-#param$rna.files = ""
-#param$rna.normalization = "none"
-
-## Filter:
-#param$filter.percentage <- 100
-
-## Clustering:
-#param$clustering.number.of.clusters <- 13
-
-## Heatmap:
-#param$heatmap.document.name <- "heatmap"
-#param$heatmap.document.type <- "png"
-##Cluster Groups:
-#param$cluster.groups.document.name <- "clusters"
-
-#return(param)
-
-#}
-
-
-
-LoadParams <- function(cmd.args, args.nms, n.start.numeric, optional = F) {
-  # Loads user defined paramaters for the heatmap application
-  #
-  # Args:
-  #   cmd.args: The command line arguments given.
-  #   arg.nms: A list of possible command arguments.
-  #   n.start.numeric: the first argument that should be numeric (alway put
-  #                        these last).
-  #   optional: This specifies if the params in cmd.args are optional or
-  #             required.
-  # Returns:
-  #   A list of values assigned to each argument.
-
-
-  vals <- ReadCommadLineParameters(argument.names = args.nms,
-                                 command.line.arguments = cmd.args,
-                                 optional = optional)
-
-   #check if numeric params are indeed numeric
-  if(!optional) {
-  for(i in n.start.numeric:length(vals)){
-    if(is.na(as.numeric(vals[i]))){
-      stop("arg ",args.nms[i]," is not numeric.  Bailing out...\n",print.error())
-    }
-  }
-}
-  return (vals)
-
-}
-
-#ValidateParams <- function(params) {
-#    return(T)
-#}
-
-
-##########
-
-LoadDebugParams <- function(param) {
-
-    cmd.args <- c(
-      "--mtls_file data/test/mtls.xls",
-      "--cluster_file data/test/cluster",
-      "--heatmap_file data/test/heatmap",
-      "--heatmap_type bmp",
-      "--n_clusters 13",
-      "--filter_percentage 100",
-      "--expression_file /home/kieran/code/scorchR-heatmap/data/expression/rna1.tabular::/home/kieran/code/scorchR-heatmap/data/expression/rna2.tabular",
-      "--expression_name batf.ko.0::batf.ko.17",
-      "--normalization_file mean",
-      "--chip_experiment_order ac::bc::cc::dc::ec",
-      "--include_targetless yes",
-      "--number_bins 20"
-    )
-
-args.nms <- c(            "--mtls_file",        #1
-                "--cluster_file",     #2
-                "--chip_experiment_order",    #3
-                "--heatmap_file",     #4
-                "--heatmap_type",         #5
-                "--expression_file",    #6
-                "--expression_name",    #7
-                "--normalization_file",    #8
-                "--include_targetless",    #9
-                 "--n_clusters",     #10
-                "--filter_percentage", #11
-                "--number_bins")    #12
-
-
-# vals has the same order as args.nms
-vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 10, optional = F)
-
-
-# ChIP data:
-param$annotated.macs.file <- vals[1]
-param$chip.order <- vals[3]
-# RNA data:
-param$rna.files = vals[6]
-param$rna.names = vals[7]
-param$rna.normalization.file = vals[8]
-param$include.targetless = vals[9]
-
-# Filter:
-param$filter.percentage <- as.numeric(vals[11])
-
-# Clustering:
-param$number.bins <- as.numeric(vals[12])
-param$clustering.number.of.clusters <- as.numeric(vals[10])
-
-# Heatmap:
-param$heatmap.document.name <- vals[4]
-param$heatmap.document.type <- vals[5]
-#Cluster Groups:
-param$cluster.groups.document.name <- vals[2]
-
-return(param)
-
-}
-
-
-
-LoadOptionalParams <- function(param) {
-
-cmd.args <- commandArgs(trailingOnly = T)
-
-args.nms <- c(  "--chip_experiment_order",  #1
-                "--expression_file",        #2
-                "--expression_name",        #3
-                "--normalization_file",     #4
-                "--heatmap_type",           #5
-                "--include_targetless",     #6
-                "--filter_percentage",      #7
-                "--number_bins")            #8
-
-
-# vals has the same order as args.nms
-vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 7, optional = T)
-
-# ChIP data:
-param$chip.order <- if(!is.na(vals[1])){vals[1]}else{NA}
-
-# RNA data:
-param$rna.files <- if(!is.na(vals[2])){vals[2]}else{"none"}
-param$rna.names <- if(!is.na(vals[3])){vals[3]}else{"none"}
-param$rna.normalization.file <- if(!is.na(vals[4])){vals[4]}else{"no"}
-param$include.targetless <- if(!is.na(vals[6])){vals[6]}else{"yes"}
-# Filter:
-param$filter.percentage <- if(!is.na(vals[7])){as.numeric(vals[7])}else{100}
-param$number.bins <- if(!is.na(vals[8])){as.numeric(vals[8])}else{30}
-# Heatmap file (output)
-param$heatmap.document.type <- if(!is.na(vals[5])){vals[5]}else{"none"}
-
-return(param)
-}
-
-
-LoadReqiredParams <- function(param){
-
-cmd.args <- commandArgs(trailingOnly = T)
-
-args.nms <- c(  "--mtls_file",          #1
-                "--cluster_file",       #2
-                "--heatmap_file",       #3
-                 "--n_clusters")        #4
-
-
-# vals has the same order as args.nms
-vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 4, optional = F)
-
-# ChIP data:
-param$annotated.macs.file <- vals[1]
-
-# Clustering
-param$clustering.number.of.clusters <- as.numeric(vals[4])
-
-# Cluster file (output):
-param$cluster.groups.document.name <- vals[2]
-
-
-# Heatmap file (output):
-param$heatmap.document.name <- vals[3]
-
-return(param)
-
-}
-
-###########
-# here we output the fasta file of the targets of each kmeans cluster
-CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
-  f.nm = paste(f.nm, ".fasta", sep="")
-    clusters = km.order
-    for(i in 1:length(clusters)){
-        v=trgts[which(k.ix==clusters[i])]
-        v.split = unlist(sapply(v,strsplit, "_"))
-        if(i == 1){
-            cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n")
-        } else {
-            cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
-        }
-        cat(sep="\n",file=f.nm,v.split,append=TRUE)
-    }
-}
-
-
-PrintClusters <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
-	f.nm = paste(f.nm, ".tsv", sep="")
-	cat(sep="\t",file=f.nm,"row_number/target","cluster","\n")
-	trgts[which(trgts=="")] <- "no_target"
-    for(i in 1:length(trgts)){
-		cat(sep="",file=f.nm,trgts[i],"\t","cluster_",k.ix[i],"\n",append=TRUE)
-			}
-#  	  if(i == 1){
-#  	    cat(sep="",file=f.nm,"cluster_",i,"\n")#clusters[i],"\n")
-#  	  } else {
-#  	    cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
-#  	  }
-      #cat(sep="\n",file=f.nm,v.split,append=TRUE)
-  	}
-GetTopRowsFromMatrix = function(mtrx, percentage = 10)
-{
-  if (param$debug) {
-    print("In GetTopRowsFromMatrix")
-	}
-	#Store the stats for the mtrx
-	stats = list()
-	stats$mean=apply(mtrx,1,mean)
-	stats$sd=apply(mtrx,1,sd)
-	stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)]))
-
-
-	#Store the indexes for the mtrx
-	index = list()
-	index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix
-	index$sd   = sort(stats$sd, decreasing=T, index.return=T)$ix
-	index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix
-
-	#Calculate how many data points we want to take
-	cutoff = floor(length(mtrx[ ,1])*(percentage/100))
-
-	#Extract Indexes and return to caller
-	index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff]))
-
-	return(index)
-}
-
-#GetUniq = function(targets.cluster)
-#{
-#    #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order
-#
-#    return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))))
-#    #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))
-#
-#}
-
-bindata = function(d,qunts=seq(.4,.9,.1))
-{
-    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
-    for(j in 1:ncol(d))
-    {
-        bins=quantile(d[,j],qunts)
-            for(i in 1:length(bins))
-            {
-                tmp[which(d[,j]>bins[i]),j] = i
-         }
-    }
-    return(tmp)
-}
-
-bindata.non.zero = function(d,qunts=seq(0,0.9,0.1))
-{
-    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
-    for(j in 1:ncol(d))
-    {
-        ix.non.zero=which(d[,j]!=0)
-        bins=quantile(d[ix.non.zero,j],qunts)
-            for(i in 1:length(bins))
-            {
-                tmp[which(d[,j]>bins[i]),j] = i
-         }
-    }
-
-    return(tmp)
-}
-bindata.matrix = function(d,qunts=seq(0,0.9,0.1))
-{
-    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
-    #ix.non.zero=which(d!=0)
-    bins=quantile(d[],qunts)
-    for(i in 1:length(bins))
-    {
-        tmp[which(d>bins[i])] = i
-    }
-    return(tmp)
-}
-bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1))
-{
-    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
-    ix.non.zero=which(d!=0)
-    bins=quantile(d[ix.non.zero],qunts)
-    for(i in 1:length(bins))
-    {
-        tmp[which(d>bins[i])] = i
-    }
-    return(tmp)
-}
-heatmap.3 <- function (x, Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE,
-    distfun = dist, hclustfun = hclust, dendrogram = c("both",
-        "row", "column", "none"), symm = FALSE, scale = c("none",
-        "row", "column"), na.rm = TRUE, revC = identical(Colv,
-        "Rowv"), add.expr, breaks, symbreaks = min(x < 0, na.rm = TRUE) ||
-        scale != "none", col = "heat.colors", colsep, rowsep,
-    sepcolor = "white", sepwidth = c(0.05, 0.05), cellnote, notecex = 1,
-    notecol = "cyan", na.color = par("bg"), trace = c("column",
-        "row", "both", "none"), tracecol = "cyan", hline = median(breaks),
-    vline = median(breaks), linecol = tracecol, margins = c(5,
-        5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(nr),
-    cexCol = 0.2 + 1/log10(nc), labRow = NULL, labCol = NULL,
-    key = TRUE, keysize = 1.5, density.info = c("histogram",
-        "density", "none"), denscol = tracecol, symkey = min(x <
-        0, na.rm = TRUE) || symbreaks, densadj = 0.25, main = NULL,
-    xlab = NULL, ylab = NULL, lmat = NULL, lhei = NULL, lwid = NULL,
-    ...)
-{
-    scale01 <- function(x, low = min(x), high = max(x)) {
-        x <- (x - low)/(high - low)
-        x
-    }
-    retval <- list()
-    scale <- if (symm && missing(scale))
-        "none"
-    else match.arg(scale)
-    dendrogram <- match.arg(dendrogram)
-    trace <- match.arg(trace)
-    density.info <- match.arg(density.info)
-    if (length(col) == 1 && is.character(col))
-        col <- get(col, mode = "function")
-    if (!missing(breaks) && (scale != "none"))
-        warning("Using scale=\"row\" or scale=\"column\" when breaks are",
-            "specified can produce unpredictable results.", "Please consider using only one or the other.")
-    if (is.null(Rowv) || is.na(Rowv))
-        Rowv <- FALSE
-    if (is.null(Colv) || is.na(Colv))
-        Colv <- FALSE
-    else if (Colv == "Rowv" && !isTRUE(Rowv))
-        Colv <- FALSE
-    if (length(di <- dim(x)) != 2 || !is.numeric(x))
-        stop("`x' must be a numeric matrix")
-    nr <- di[1]
-    nc <- di[2]
-    if (nr <= 1 || nc <= 1)
-        stop("`x' must have at least 2 rows and 2 columns")
-    if (!is.numeric(margins) || length(margins) != 2)
-        stop("`margins' must be a numeric vector of length 2")
-    if (missing(cellnote))
-        cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x))
-    if (!inherits(Rowv, "dendrogram")) {
-        if (((!isTRUE(Rowv)) || (is.null(Rowv))) && (dendrogram %in%
-            c("both", "row"))) {
-            if (is.logical(Colv) && (Colv))
-                dendrogram <- "column"
-            else dedrogram <- "none"
-            warning("Discrepancy: Rowv is FALSE, while dendrogram is `",
-                dendrogram, "'. Omitting row dendogram.")
-        }
-    }
-    if (!inherits(Colv, "dendrogram")) {
-        if (((!isTRUE(Colv)) || (is.null(Colv))) && (dendrogram %in%
-            c("both", "column"))) {
-            if (is.logical(Rowv) && (Rowv))
-                dendrogram <- "row"
-            else dendrogram <- "none"
-            warning("Discrepancy: Colv is FALSE, while dendrogram is `",
-                dendrogram, "'. Omitting column dendogram.")
-        }
-    }
-    if (inherits(Rowv, "dendrogram")) {
-        ddr <- Rowv
-        rowInd <- order.dendrogram(ddr)
-    }
-    else if (is.integer(Rowv)) {
-        hcr <- hclustfun(distfun(x))
-        ddr <- as.dendrogram(hcr)
-        ddr <- reorder(ddr, Rowv)
-        rowInd <- order.dendrogram(ddr)
-        if (nr != length(rowInd))
-            stop("row dendrogram ordering gave index of wrong length")
-    }
-    else if (isTRUE(Rowv)) {
-        Rowv <- rowMeans(x, na.rm = na.rm)
-        hcr <- hclustfun(distfun(x))
-        ddr <- as.dendrogram(hcr)
-        ddr <- reorder(ddr, Rowv)
-        rowInd <- order.dendrogram(ddr)
-        if (nr != length(rowInd))
-            stop("row dendrogram ordering gave index of wrong length")
-    }
-    else {
-        rowInd <- nr:1
-    }
-    if (inherits(Colv, "dendrogram")) {
-        ddc <- Colv
-        colInd <- order.dendrogram(ddc)
-    }
-    else if (identical(Colv, "Rowv")) {
-        if (nr != nc)
-            stop("Colv = \"Rowv\" but nrow(x) != ncol(x)")
-        if (exists("ddr")) {
-            ddc <- ddr
-            colInd <- order.dendrogram(ddc)
-        }
-        else colInd <- rowInd
-    }
-    else if (is.integer(Colv)) {
-        hcc <- hclustfun(distfun(if (symm)
-            x
-        else t(x)))
-        ddc <- as.dendrogram(hcc)
-        ddc <- reorder(ddc, Colv)
-        colInd <- order.dendrogram(ddc)
-        if (nc != length(colInd))
-            stop("column dendrogram ordering gave index of wrong length")
-    }
-    else if (isTRUE(Colv)) {
-        Colv <- colMeans(x, na.rm = na.rm)
-        hcc <- hclustfun(distfun(if (symm)
-            x
-        else t(x)))
-        ddc <- as.dendrogram(hcc)
-        ddc <- reorder(ddc, Colv)
-        colInd <- order.dendrogram(ddc)
-        if (nc != length(colInd))
-            stop("column dendrogram ordering gave index of wrong length")
-    }
-    else {
-        colInd <- 1:nc
-    }
-    retval$rowInd <- rowInd
-    retval$colInd <- colInd
-    retval$call <- match.call()
-    x <- x[rowInd, colInd]
-    x.unscaled <- x
-    cellnote <- cellnote[rowInd, colInd]
-    if (is.null(labRow))
-        labRow <- if (is.null(rownames(x)))
-            (1:nr)[rowInd]
-        else rownames(x)
-    else labRow <- labRow[rowInd]
-    if (is.null(labCol))
-        labCol <- if (is.null(colnames(x)))
-            (1:nc)[colInd]
-        else colnames(x)
-    else labCol <- labCol[colInd]
-    if (scale == "row") {
-        retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm)
-        x <- sweep(x, 1, rm)
-        retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm)
-        x <- sweep(x, 1, sx, "/")
-    }
-    else if (scale == "column") {
-        retval$colMeans <- rm <- colMeans(x, na.rm = na.rm)
-        x <- sweep(x, 2, rm)
-        retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm)
-        x <- sweep(x, 2, sx, "/")
-    }
-    if (missing(breaks) || is.null(breaks) || length(breaks) <
-        1) {
-        if (missing(col) || is.function(col))
-            breaks <- 16
-        else breaks <- length(col) + 1
-    }
-    if (length(breaks) == 1) {
-        if (!symbreaks)
-            breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm),
-                length = breaks)
-        else {
-            extreme <- max(abs(x), na.rm = TRUE)
-            breaks <- seq(-extreme, extreme, length = breaks)
-        }
-    }
-    nbr <- length(breaks)
-    ncol <- length(breaks) - 1
-    if (class(col) == "function")
-        col <- col(ncol)
-    min.breaks <- min(breaks)
-    max.breaks <- max(breaks)
-    x[x < min.breaks] <- min.breaks
-    x[x > max.breaks] <- max.breaks
-    # if (missing(lhei) || is.null(lhei))
-    #     lhei <- c(keysize, 4)
-    # if (missing(lwid) || is.null(lwid))
-    #     lwid <- c(keysize, 4)
-    # if (missing(lmat) || is.null(lmat)) {
-    #     lmat <- rbind(4:3, 2:1)
-    #     if (!missing(ColSideColors)) {
-    #         if (!is.character(ColSideColors) || length(ColSideColors) !=
-    #             nc)
-    #             stop("'ColSideColors' must be a character vector of length ncol(x)")
-    #         lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] +
-    #             1)
-    #         lhei <- c(lhei[1], 0.2, lhei[2])
-    #     }
-    #     if (!missing(RowSideColors)) {
-    #         if (!is.character(RowSideColors) || length(RowSideColors) !=
-    #             nr)
-    #             stop("'RowSideColors' must be a character vector of length nrow(x)")
-    #         lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) -
-    #             1), 1), lmat[, 2] + 1)
-    #         lwid <- c(lwid[1], 0.2, lwid[2])
-    #     }
-    #     lmat[is.na(lmat)] <- 0
-    # }
-    # if (length(lhei) != nrow(lmat))
-    #     stop("lhei must have length = nrow(lmat) = ", nrow(lmat))
-    # if (length(lwid) != ncol(lmat))
-    #     stop("lwid must have length = ncol(lmat) =", ncol(lmat))
-    # op <- par(no.readonly = TRUE)
-    # on.exit(par(op))
-    # layout(lmat, widths = lwid, heights = lhei, respect = FALSE)
-    if (!missing(RowSideColors)) {
-        par(mar = c(margins[1], 0, 0, 0.5))
-        image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE)
-    }
-    if (!missing(ColSideColors)) {
-        par(mar = c(0.5, 0, 0, margins[2]))
-        image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE)
-    }
-    par(mar = c(margins[1], 0, 0, margins[2]))
-    x <- t(x)
-    cellnote <- t(cellnote)
-    if (revC) {
-        iy <- nr:1
-        if (exists("ddr"))
-            ddr <- rev(ddr)
-        x <- x[, iy]
-        cellnote <- cellnote[, iy]
-    }
-    else iy <- 1:nr
-    image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 +
-        c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col,
-        breaks = breaks, ...)
-    retval$carpet <- x
-    if (exists("ddr"))
-        retval$rowDendrogram <- ddr
-    if (exists("ddc"))
-        retval$colDendrogram <- ddc
-    retval$breaks <- breaks
-    retval$col <- col
-    if (!invalid(na.color) & any(is.na(x))) {
-        mmat <- ifelse(is.na(x), 1, NA)
-        image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "",
-            col = na.color, add = TRUE)
-    }
-    axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0,
-        cex.axis = cexCol)
-    if (!is.null(xlab))
-        mtext(xlab, side = 1, line = margins[1] - 1.25)
-    axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0,
-        cex.axis = cexRow)
-    if (!is.null(ylab))
-        mtext(ylab, side = 4, line = margins[2] - 1.25)
-    if (!missing(add.expr))
-        eval(substitute(add.expr))
-    if (!missing(colsep))
-        for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0,
-            length(csep)), xright = csep + 0.5 + sepwidth[1],
-            ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1,
-            col = sepcolor, border = sepcolor)
-    if (!missing(rowsep))
-        for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) +
-            1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) +
-            1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1,
-            col = sepcolor, border = sepcolor)
-    min.scale <- min(breaks)
-    max.scale <- max(breaks)
-    x.scaled <- scale01(t(x), min.scale, max.scale)
-    if (trace %in% c("both", "column")) {
-        retval$vline <- vline
-        vline.vals <- scale01(vline, min.scale, max.scale)
-        for (i in colInd) {
-            if (!is.null(vline)) {
-                abline(v = i - 0.5 + vline.vals, col = linecol,
-                  lty = 2)
-            }
-            xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5
-            xv <- c(xv[1], xv)
-            yv <- 1:length(xv) - 0.5
-            lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s")
-        }
-    }
-    if (trace %in% c("both", "row")) {
-        retval$hline <- hline
-        hline.vals <- scale01(hline, min.scale, max.scale)
-        for (i in rowInd) {
-            if (!is.null(hline)) {
-                abline(h = i + hline, col = linecol, lty = 2)
-            }
-            yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5
-            yv <- rev(c(yv[1], yv))
-            xv <- length(yv):1 - 0.5
-            lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s")
-        }
-    }
-    if (!missing(cellnote))
-        text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote),
-            col = notecol, cex = notecex)
-    par(mar = c(margins[1], 0, 0, 0))
-    # if (dendrogram %in% c("both", "row")) {
-    #     plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none")
-    # }
-    # else plot.new()
-    par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2]))
-    # if (dendrogram %in% c("both", "column")) {
-    #     plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none")
-    # }
-    # else plot.new()
-    if (!is.null(main))
-        title(main, cex.main = 1.5 * op[["cex.main"]])
-    # if (key) {
-    #     par(mar = c(5, 4, 2, 1), cex = 0.75)
-    #     tmpbreaks <- breaks
-    #     if (symkey) {
-    #         max.raw <- max(abs(c(x, breaks)), na.rm = TRUE)
-    #         min.raw <- -max.raw
-    #         tmpbreaks[1] <- -max(abs(x), na.rm = TRUE)
-    #         tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE)
-    #     }
-    #     else {
-    #         min.raw <- min(x, na.rm = TRUE)
-    #         max.raw <- max(x, na.rm = TRUE)
-    #     }
-    #     z <- seq(min.raw, max.raw, length = length(col))
-    #     image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks,
-    #         xaxt = "n", yaxt = "n")
-    #     par(usr = c(0, 1, 0, 1))
-    #     lv <- pretty(breaks)
-    #     xv <- scale01(as.numeric(lv), min.raw, max.raw)
-    #     axis(1, at = xv, labels = lv)
-    #     if (scale == "row")
-    #         mtext(side = 1, "Row Z-Score", line = 2)
-    #     else if (scale == "column")
-    #         mtext(side = 1, "Column Z-Score", line = 2)
-    #     else mtext(side = 1, "Value", line = 2)
-    #     if (density.info == "density") {
-    #         dens <- density(x, adjust = densadj, na.rm = TRUE)
-    #         omit <- dens$x < min(breaks) | dens$x > max(breaks)
-    #         dens$x <- dens$x[-omit]
-    #         dens$y <- dens$y[-omit]
-    #         dens$x <- scale01(dens$x, min.raw, max.raw)
-    #         lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol,
-    #             lwd = 1)
-    #         axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y))
-    #         title("Color Key\nand Density Plot")
-    #         par(cex = 0.5)
-    #         mtext(side = 2, "Density", line = 2)
-    #     }
-    #     else if (density.info == "histogram") {
-    #         h <- hist(x, plot = FALSE, breaks = breaks)
-    #         hx <- scale01(breaks, min.raw, max.raw)
-    #         hy <- c(h$counts, h$counts[length(h$counts)])
-    #         lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s",
-    #             col = denscol)
-    #         axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy))
-    #         title("Color Key\nand Histogram")
-    #         par(cex = 0.5)
-    #         mtext(side = 2, "Count", line = 2)
-    #     }
-    #     else title("Color Key")
-    # }
-    # else plot.new()
-    retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)],
-        high = retval$breaks[-1], color = retval$col)
-    invisible(retval)
-}
-
-#rm(list=ls())
-param <- list()
-param$debug <- FALSE #T
-
-print("Finished loading functions")
-
-
-if(param$debug) {
-    param <- LoadDebugParams(param)
-} else {
-    param <- LoadReqiredParams(param)
-    param <- LoadOptionalParams(param)
-}
-print (param)
-Rprof() #Remove me
-# Load Chip data, alread in pval format
-# Column.order need to be a vecor of strings, that correspond to the order (and
-# inclustion) of chip experiments.
-chip <- GetChipData(param$annotated.macs.file, column.order = param$chip.order)
-
-#Get the top percentages on different criteria
-#ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage)
-#chip$peaks <- chip$peaks[ix.top$union, ]
-#chip$targets <- chip$targets[ix.top$union]
-#Bin data:
-chip$peaks <- bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=(param$number.bins+1)))
-
-file.names <- unlist(strsplit(param$rna.files, split="::"))
-print(file.names)
-file.lables <- unlist(strsplit(param$rna.names, split="::"))
-print(file.lables)
-
-library("RColorBrewer")
-color.2 =  rev(colorRampPalette(c("darkblue", "steelblue", "lightgrey"))(param$number.bins))
-if(!is.na(file.names) && file.names != "none") {
-    #rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
-    #do differential expression if the user specifies such:
-    if(param$rna.normalization != "no") {
-        if(param$rna.normalization != "mean"){
-            norm.file.names <- unlist(strsplit(param$rna.normalization.file, split="::"))
-            all.file.names <- c(file.names, norm.file.names)
-            all.file.lables <- c(file.lables, norm.file.names)
-            print (all.file.names)
-            print (all.file.lables)
-            rna.scores <- GetRNAData(all.file.names, file.lables = all.file.lables, fpkm = "avg")
-            rna.scores <- NormalizeRNA(scores = rna.scores)
-            rna.scores.sign <- rna.scores/abs(rna.scores)
-            rna.scores.sign[which(is.nan(rna.scores.sign))] <- 0
-            rna.scores <- bindata.non.zero.matrix(abs(rna.scores), qunts = seq(0, 0.9, length=(param$number.bins+1)))
-            rna.scores <- rna.scores.sign * rna.scores
-            color.2 = colorRampPalette(c("darkblue", "steelblue", "lightgrey", "pink", "darkred"))(param$number.bins)
-        } else {
-            print("we are normalizing by average")
-            print("file lables are:")
-            print(file.lables)
-            rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
-            rna.scores <- t(apply(rna.scores,1, function(x) { if(mean(x)!=0){ return(x/mean(x)) } else { return(x) }  }))
-            rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1)))
-        }
-    } else {
-        rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg")
-        rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1)))
-    }
-
-    # chip$peaks <- chip$peaks[order(chip$targets), ]
-    # rna.scores <- rna.scores[order(rownames(rna.scores)), ]
-    # chip$targets <- chip$targets[order(chip$targets)]
-
-    rna.scores.mapped <- MapExpressiontoChip(chip.targets = chip$targets, expression=rna.scores)
-    all.data <- cbind(chip$peaks, rna.scores.mapped)
-    if(param$include.targetless!="yes"){
-        all.data <- all.data[-which(chip$targets==""),]
-        chip$peaks <- chip$peaks[-which(chip$targets==""),]
-        rna.scores.mapped <- rna.scores.mapped[-which(chip$targets==""),]
-        chip$targets <- chip$targets[-which(chip$targets=="")]
-    }
-} else {
-    all.data <- chip$peaks
-}
-
-print("Clustering")
-set.seed(1234)
-km <- kmeans(all.data, param$clustering.number.of.clusters, iter.max = 50)
-print("Ordering")
-km.order <- GenerateKMOrder(all.data, km)
-
-##AM edits##
-km.new.order <- numeric(length(km$cluster))
-for(i in 1:length(km.order)){
-    cur.clst <- km.order[i]
-    ix <- which(km$cluster==cur.clst)
-    km.new.order[ix] <- i
-}
-km.order <- 1:length(km.order)
-km$cluster <- km.new.order
-## Done AM ##
-print("Creating image")
-#bmp(param$heatmap.document.name, 640, 480)#1280, 960)
-bmp(paste(param$heatmap.document.name,".bmp", sep = ""), 5120, 3840)#2560, 1920)
-#pdf(paste(param$heatmap.document.name,".pdf", sep = ""))
-color.1 <- c("#000000", colorRampPalette(c("blue", "yellow","orange","red"))(param$number.bins))
-if(!is.na(file.names) && file.names != "none") {
-    PrintClusters(trgts=chip$targets,
-        k.ix=km$cluster,
-        f.nm = param$cluster.groups.document.name,
-        km.order = NA)
-
-    #if(param$rna.normalization != "no") {
-    #    data.split <- cbind(chip$peaks, PrepareRNAforHeatmap(rna.scores.mapped))
-    #} else {
-    #    data.split <- cbind(chip$peaks, rna.scores.mapped)
-    #}
-    expression.width.multiplier <- 2
-    if(ncol(rna.scores.mapped)==1) {
-        rna.scores.mapped <- cbind(rna.scores.mapped, rna.scores.mapped)
-        expression.width.multiplier <- 1
-    }
-
-    layout(matrix(c(1,2,2,5,
-        1,3,4,6,
-        1,3,4,7,
-        1,3,4,8,
-        1,3,4,9),
-    5,4,
-    byrow=T),
-    widths=c(1,2*ncol(chip$peaks),expression.width.multiplier*ncol(rna.scores.mapped),1),
-    heights=c(1,10,2,10,2))
-    #1
-    plot.new()
-    #2
-    plot.new()
-    #3
-    print("Creating peak image")
-    CreateIndividualHeatMap(chip$peaks,
-        km,
-        km.order, color.ramp=color.1)
-    #4
-    print("Creating rna image")
-    CreateIndividualHeatMap(rna.scores.mapped,
-        km,
-        km.order, color.ramp=color.2)
-    #5
-    plot.new()
-    #6
-    image(t(matrix(1:param$number.bins,nc=1)), col=color.1)
-    #7
-    plot.new()
-    #8
-    image(t(matrix(1:param$number.bins,nc=1)), col=color.2)
-    #9
-    plot.new()
-} else {
-    PrintClusters(trgts=1:nrow(chip$peaks),
-        k.ix=km$cluster,
-        f.nm = param$cluster.groups.document.name,
-        km.order = NA)
-
-
-    layout(matrix(c(1,2,2,
-        1,3,4,
-        1,3,5),3,3,byrow=T),
-        widths=c(1,3*ncol(chip$peaks),1),
-        heights=c(1,8,1))
-    #1
-    plot.new()
-    #2
-    plot.new()
-    #3
-    CreateIndividualHeatMap(chip$peaks,
-        km,
-        km.order, color.ramp=color.1)
-    #4
-    image(t(matrix(1:param$number.bins,nc=1)), col=color.1)
-    #5
-    plot.new()
-
-}
-
-dev.off()
-Rprof(NULL)
-profile <- summaryRprof()
-print(str(profile))
-print(profile)
-#save.image("test/testData.RData")
--- a/mtls_analyze/mtls_analyze/map_ChIP_peaks_to_genes.R	Mon Jul 23 12:58:55 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,304 +0,0 @@
-##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
-## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
-##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
-## Nov 2011 th17
-## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>,
-## NYU - Center for Genomics and Systems Biology
-##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
-## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
-##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
-
-rm(list=ls())
-debug=F
-script.version=0.1
-print.error <- function(){
-  cat("
-NAME
-       map_ChIP_peaks_to_genes.R
-
-DESCRIPTION
-       This script takes a MACS tab delimited file as input and
-       produces two files:
-
-       1) genes.xls
-          A gene centered table that stores the peaks that are
-          proximal (within x[kb] of TSS) and distal (within the
-          gene's region + y[kb] upstream or downstream of TSS and
-          TES respectively).
-
-       2) peaks.xls
-          A peak centered table that equals the MACS input table plus
-          colums for proximal and distal targets of each peak and
-          their distance from TSS (if exist). For gene.xls script also
-          gives a poisson model based -log10(pval) score for proximal
-          and genewide enrichment of peaks for each gene with at least
-          one prox/dist peak associated with it.
-
-ARGUMENTS
-       input_file=path
-         Path to MACS file.
-
-       refseq_table=path
-         Path to refseq table (gives TSS/TES locations for all genes).
-
-       path_output=path
-         Path to save genes.xls and peaks.xls output files.
-
-       tss.dist=N
-         The absolute distance from TSS where we connect a peak to
-         gene (for proximal peaks).
-
-       gene.wide.dist=N
-         The absolute distance from TSS or TES where we connect a peak
-         to gene (for peaks hitting anywhere in gene).
-
-       effective.genome.size=N
-         The effective mappable genome size (for mm9 the value is
-         1.87e9. For hg19 the value is 2.7e9 (from MACS manual)).
-
-       macs.skip.lines=N
-         Number of lines to skip in input_file.
-
-EXAMPLE
-       Rscript map_ChIP_peaks_to_genes_v.1.R \\
-         input_file=SL971_SL970_peaks.xls \\
-         refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\
-         path_output=./ \\
-         tss.dist=5000 \\
-         gene.wide.dist=10000 \\
-         effective.genome.size=1.87e9 \\
-         macs.skip.lines=23
-
-CITATION
-       Please cite us if you used this script: The transcription
-       factor network regulating Th17 lineage specification and
-       function.  Maria Ciofani, Aviv Madar, Carolina Galan, Kieran
-       Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard
-       Bonneau and Dan R. Littman et. al. (in preperation).
-
-")
-}
-
-# retrieve args
-if(debug==T){
-	cmd.args <- c(
-		# "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls",
-		"input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls",
-		"refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt",
-		"path_output=/Users/aviv/Desktop/test_script/",
-		"tss.dist=5000",
-		"tes.dist=10000",
-		"effective.genome.size=1.87e9",
-		"gene.wide.dist=10000",
-		"macs.skip.lines=23"
-	)
-} else {
-	cmd.args <- commandArgs();
-}
-
-if(length(grep("--version",cmd.args))){
-	cat("version",script.version,"\n")
-	q()
-}
-
-arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
-args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
-
-arg.nms <- c("input_file","refseq_table","path_output","tss.dist",
-			 "gene.wide.dist","effective.genome.size","macs.skip.lines")
-arg.val <- character(length=length(arg.nms))
-for(i in 1:length(arg.nms)){
-	ix <- which(arg.names.cmd.line==arg.nms[i])
-	if(length(ix)==1){
-		arg.val[i] <- args.val.cmd.line[ix]
-	} else {
-		stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
-
-	}
-}
-if(debug==T){
-	print(paste(arg.nms,"=",arg.val))
-}
-# the files here adhere to tab delim format
-path.input.macs <- arg.val[1]
-path.input.refseq <- arg.val[2]
-path.output <- arg.val[3]
-tss.dist <- as.numeric(arg.val[4])
-gene.wide.dist <- as.numeric(arg.val[5])
-effective.genome.size <- as.numeric(arg.val[6])
-macs.skip.lines=as.numeric(arg.val[7])
-
-if(debug==T){
-# if(1){
-	load("/Users/aviv/Desktop/r.u.RData")
-	gn.nms <- rownames(r.u)
-} else {
-	##################
-	### step 1 handle refseq table file (read, make unique)
-	cat("reading refseq table",path.input.refseq,"\n")
-	refseq <- read.delim(file=path.input.refseq)
-	# refseq can have many transcripts for each gene
-	# here i make it have only one transcript for each gene (the longest one)
-	cat("for transcripts matching more than one gene keeping only the longest transcript\n")
-	refseq$name2 <- as.character(refseq$name2)
-	refseq$chrom <- as.character(refseq$chrom)
-	refseq$strand <- as.character(refseq$strand)
-	gn.nms <- unique(refseq$name2)
-	#x <- sort(table(refseq$name2),decreasing=T)
-	#gn.nms.non.unique <- names(x)[which(x>1)]
-	#gn.nms.unique <- names(x)[which(x==1)]
-	# create refseq unique r.u
-	n <- length(gn.nms)
-	r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms)
-	for(i in 1:n){
-	  ix <- which(refseq$name2==gn.nms[i])
-	  if(length(ix)==0) {
-	    error("could not find gene", ng.nms[i], "in refseq table.  Bailing out...\n")
-	  } else if (length(ix)>1){
-	    l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) )
-	    l.max <- max(l)
-	    ix <- ix[which(l==l.max)[1]]
-	  }
-	  r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")]
-	}
-	r.u[,"txStart"] <- as.numeric(r.u[,"txStart"])
-	r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"])
-
-	# switch TSS and TES if we have chr "-"
-	cat("correcting TSS, TES assiments in refseq table based on strand\n")
-	ix <- which(r.u$strand=="-")
-	tmp.tes <- r.u$txStart[ix]
-	tmp.tss <- r.u$txEnd[ix]
-	r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes)
-	# ############
-}
-
-
-#### step 2 read macs file
-cat("reading Macs file",path.input.macs,"\n")
-
-# read macs file
-M <- read.delim(file=path.input.macs,skip=macs.skip.lines)
-trgt.prox <- NA
-trgt.dist <- NA
-dtss.prox <- NA
-dtss.dist <- NA
-M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist)
-M.annot[is.na(M.annot)] <- ""
-# get the number of genes
-m <- length(gn.nms)
-if(debug==T){
-	# m <- 100
-}
-f.nm.peak <- paste(sep="",path.output,"peaks.xls")
-f.nm.gene <- paste(sep="",path.output,"genes.xls")
-# for each genes in the expt (j goes over genes)
-cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval",
-			"prox_max_pval","genewide_max_pval",
-			"prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n")
-peaks.summit <- (M[,"start"]+M[,"summit"])
-cat(sep="","mapping MACS peaks to genes\n")
-for (j in 1:m){
-  if(j%%20==0){cat(".")}
-  tss <- r.u[j,"txStart"]
-  tes <- r.u[j,"txEnd"]
-  gn.lngth <- abs(tss-tes)
-  strand <- r.u[j,"strand"]
-  chr <- r.u[j,"chrom"]
-  if(strand=="+"){
-    d.tss.all <- peaks.summit-tss
-   	d.tes.all <- peaks.summit-tes
-  } else {
-    d.tss.all <- tss-peaks.summit
-   	d.tes.all <- tes-peaks.summit
-  }
-  ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )),
-               which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE)
-  ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE)
-  ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)]
-  ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)]
-  n.peaks.prox <- length(ix.prox)
-  n.peaks.distal <- length(ix.distal)
-	# if there is at least one peak hitting gene j
-  if(n.peaks.distal > 0){
-    # for each peak (l goes over peaks)
-    peaks.line <- paste(sep="","(")
-    for (k in 1:n.peaks.distal){
-      if(k>1){
-        peaks.line <- paste(sep="",peaks.line,";(")
-      }
-      d.tss <- d.tss.all[ ix.distal[k] ]
-      d.tes <- d.tes.all[ ix.distal[k] ]
-      if(sign(d.tss)!=sign(d.tes)){
-        class="intra"
-      } else if(d.tss<=0){
-        class="upstream"
-      } else if(d.tss>0){
-        class="downstream"
-      }
-      peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ],
-					d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")")
-	  # handle M.annot
-	  dtss <- d.tss.all[ ix.distal[k] ]
-	  prev.trgt <- M.annot[ix.distal[k],"trgt.dist"]
-	  if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
-		if(abs(dtss)<abs(as.numeric(M.annot[ix.distal[k],"dtss.dist"]))){
-			M.annot[ix.distal[k],"dtss.dist"] <- dtss
-			M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
-		}
-	  } else { # if peak does not have trgt add current trgt
-		M.annot[ix.distal[k],"dtss.dist"] <- dtss
-		M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
-	  }
-	# calc the pval for these many peaks in proximal region and in gene-wide region
-	}
-	if(n.peaks.prox > 0){
-		mean.pval.prox <- mean(M[ ix.prox ,7])
-		max.pval.prox <- max(M[ ix.prox ,7])
-		expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox))
-		# lambda = num peaks / genome size * searched region
-		lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2)
-		pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE))
-		# handle M.annot
-		for(k in 1:n.peaks.prox){
-			dtss <- d.tss.all[ ix.prox[k] ]
-			prev.trgt <- M.annot[ix.prox[k],"trgt.prox"]
-			if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
-				if(abs(dtss)<abs(as.numeric(M.annot[ix.prox[k],"dtss.prox"]))){
-					M.annot[ix.prox[k],"dtss.prox"] <- dtss
-					M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
-				}
-			} else { # if peak does not have trgt add current trgt
-				M.annot[ix.prox[k],"dtss.prox"] <- dtss
-				M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
-			}
-		}
-	} else {
-		mean.pval.prox <- "NA"
-		max.pval.prox <- "NA"
-		pval.pois.prox <- "NA"
-	}
-	mean.pval.distal <- mean(M[ ix.distal ,7])
-	max.pval.distal <- max(M[ ix.distal ,7])
-	expected.num.peaks.genome.wide.distal <- length(which(M[,7]>=mean.pval.distal))
-	# lambda = num peaks / genome size * searched region
-	lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2)
-    pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE))
-    # pring peaks for gene j
-    core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand,
-							mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal,
-							pval.pois.prox,pval.pois.distal)
-    cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n")
-  }
-}
-cat("Done!\n")
-
-cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t")
-write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t")
-
-
-
-
-
-
-
--- a/mtls_analyze/mtls_analyze/mtls_analyze.xml	Mon Jul 23 12:58:55 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,333 +0,0 @@
-<tool name="Chip-Cluster: Cluster ChIP-seq peaks and create a heatmap" id="chip-cluster">
-  <description>
-    Merge multiple ChIP-seq experiments, alligning their peaks to MTLs (Multi
-    Transcription Factor Loci(us)) and optionally incorperate expression
-  </description>
-  <command interpreter="command">/bin/bash $shscript </command>
-  <inputs>
-    <param name="chipInputFormat" type="select" display="radio" label="ChIP Input Format">
-      <option name="macs" value="MACS">MACS</option>
-      <option name="bed" value="BED">BED</option>
-    </param>
-    <param name="mtlType" type="select" display="radio" label="Cluster by: ">
-      <option name="summit" value="summit">Summit</option>
-      <option name="interval" value="interval">Interval</option>
-    </param>
-    <param name="summitDistance" type="text" label="Summit Distance (BP) - Summit only" value="100">
-    </param>
-    <param name="numberBins" type="text" label="Number of Bins" value="30">
-    </param>
-    <repeat name="chip_tracks" title="MACS/BED Files">
-      <param name="file" type="data" format="tabular" label="Dataset"/>
-      <param name="name" type="text" label="Dataset Name"/>
-    </repeat>
-    <param name="map_rna" type="boolean" truevalue="yes" falsevalue="no" label="Incorperate RNA?"/>
-    <param name="includeTargetless" checked="true" type="boolean" truevalue="yes" falsevalue="no" label="Include Targetless MTLs?"/>
-    <param name="reference_file" type="data" format="tabular" label="Reference Genome File"/>
-
-    <param name="normalize_rna" type="boolean" truevalue="yes" falsevalue="no" label="Normalize Expression?"/>
-    <param name="use_mean" type="boolean" truevalue="yes" falsevalue="no" label="Use mean expression across exp. to normalize?"/>
-    <param name="rnaInputFormat" type="select" display="radio" label="RNA Input Format">
-      <option name="cufflinks" value="cufflinks">Cufflinks</option>
-      <option name="bed" value="bed">BED</option>
-    </param>
-    <param name="numClusters" type="text" label="Number of Clusters (kmeans)" value="8">
-    </param>
-    <param name="trgtDistance" type="text" label="Transcript threshold distance" value="5000">
-    </param>
-    <repeat name="rna_tracks" title="Cufflinks/BED Files">
-      <param name="file" type="data" format="tabular" label="Dataset"/>
-      <param name="name" type="text" label="Dataset Name"/>
-      <param name="norm" type="data" label="Normalization Dataset"/>
-    </repeat>
-  </inputs>
-  <outputs>
-    <data format="xls" name="cluster_assignments" label="Cluster Assignments"/>
-    <data format="xls" name="mtls" label="MTLS File"/>
-    <data format="txt" name="log" label="Log file" />
-    <data format="bmp" name="heatmap_image" label="Heatmap Image" />
-<!--    <data format="png" name="heatmap_image" label="Heatmap Image" >-->
-<!--      <filter>imageFormat=="png"</filter>-->
-<!--    </data>-->
-<!--    <data format="pdf" name="heatmap_image" label="Heatmap Image" >-->
-<!--      <filter>imageFormat=="pdf"</filter>-->
-<!--    </data>-->
-
-  </outputs>
-  <configfiles>
-    <configfile name="shscript">
-<!-- This is the script that runs (Chettah/bash code)-->
-#!/bin/bash
-
-#import os
-#set $path = $os.path.abspath($__app__.config.tool_path)
-
-
-## Set symbols so that they are not incorrectly interpreted:
-#set $dollar = chr(36)
-#set $gt = chr(62)
-#set $lt = chr(60)
-#set $ad = chr(38)
-#set $bs = chr(92)
-
-echo $map_rna ${ad}${gt}${gt} $log
-echo "This is the Bash log file: " ${ad}${gt}${gt} $log
-###############################################################################
-## Convert the gtf file to a file that aviv's script can hadel
-#if str($map_rna)=='yes'
-  echo "Converting gtf file" ${ad}${gt}${gt} $log
-  Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log
-  echo "done converting gtf file" ${ad}${gt}${gt} $log
-#end if
-###############################################################################
-## Get ChIP data in correctly formated strings and annotate if nessisary.
-#set $sep = '::'
-#for $i, $chip in enumerate( $chip_tracks )
-  #if $i==0
-    echo "Chip Files:" ${ad}${gt}${gt} $log
-    echo "The first file label is:  ${chip.name}" ${ad}${gt}${gt} $log
-    echo "The first file path is:  ${chip.file}" ${ad}${gt}${gt} $log
-    chip_labels=${chip.name}
-    chip_paths=${chip.file}
-  #else
-    echo "The next file label is:  ${chip.name}" ${ad}${gt}${gt} $log
-    echo "The next file path is:  ${chip.file}" ${ad}${gt}${gt} $log
-    chip_labels=${dollar}chip_labels${sep}${chip.name}
-    chip_paths=${dollar}chip_paths${sep}${chip.file}
-  #end if
-#end for
-
-echo chip paths are - ${dollar}chip_paths ${ad}${gt}${gt} $log
-echo chip labels are - ${dollar}chip_labels ${ad}${gt}${gt} $log
-
-###############################################################################
-## Cluster peaks
-
-Rscript $path/visualization/cluster_peaks.R \
---input_files ${dollar}chip_paths \
---input_type $chipInputFormat \
---path_output ./ \
---expt_names ${dollar}chip_labels \
---dist_summits $summitDistance \
---mtl_type $mtlType ${ad}${gt}${gt} $log
-
-###############################################################################
-## Annotate mtls.xls if nessisary
-#if str($map_rna)=="yes"
-  echo "annotating mtls.xls..." ${ad}${gt}${gt} $log
-  Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log
-#end if
-###############################################################################
-## If rna is specified, then get RNA data in correctly formated strings:
-#if str($map_rna)=='yes'
-  #set $sep = '::'
-  #for $i, $rna in enumerate( $rna_tracks )
-    #if $i==0
-      echo "The first file label is:  ${rna.name}" ${ad}${gt}${gt} $log
-      echo "The first file path is:  ${rna.file}" ${ad}${gt}${gt} $log
-      rna_labels=${rna.name}
-      rna_paths=${rna.file}
-      rna_norm_paths=${rna.norm}
-    #else
-      echo "The next file label is:  ${rna.name}" ${ad}${gt}${gt} $log
-      echo "The next file path is:  ${rna.file}" ${ad}${gt}${gt} $log
-      rna_labels=${dollar}rna_labels${sep}${rna.name}
-      rna_paths=${dollar}rna_paths${sep}${rna.file}
-      rna_norm_paths=${dollar}rna_norm_paths${sep}${rna.norm}
-    #end if
-  #end for
-  echo rna paths are - ${dollar}rna_paths ${ad}${gt}${gt} $log
-  echo rna labels are - ${dollar}rna_labels ${ad}${gt}${gt} $log
-  echo rna norm files are - ${dollar}rna_norm_paths ${ad}${gt}${gt} $log
-#end if
-###############################################################################
-
-#if str($normalize_rna)=='no'
-  echo "Normalization by file is set to no" ${ad}${gt}${gt} $log
-  rna_norm_paths=no
-#end if
-
-#if str($use_mean)=='yes'
-  echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log
-  rna_norm_paths=mean
-#end if
-
-#if str($map_rna)=='no'
-  mtls_file=mtls.xls
-  rna_paths=none
-  rna_labels=none
-#else
-  mtls_file=annotated_mtls.xls
-#end if
-
-echo "
-Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
---cluster_file ./cluster \
---chip_experiment_order ${dollar}chip_labels \
---heatmap_file ./heatmap \
---heatmap_type bmp \
---n_clusters $numClusters \
---filter_percentage 100 \
---expression_file ${dollar}rna_paths \
---expression_name ${dollar}rna_labels \
---normalization_file ${dollar}rna_norm_paths \
-${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log
-
-Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
---cluster_file ./cluster \
---chip_experiment_order ${dollar}chip_labels \
---heatmap_file ./heatmap \
---heatmap_type bmp \
---n_clusters $numClusters \
---filter_percentage 100 \
---number_bins $numberBins \
---include_targetless $includeTargetless \
---expression_file ${dollar}rna_paths \
---expression_name ${dollar}rna_labels \
---normalization_file ${dollar}rna_norm_paths \
-${ad}${gt}${gt} $log
-
-ls ${ad}${gt}${gt} $log
-
-
-
-
-##################################################################
-#if str($map_rna)=='yes'
-    mv ./annotated_mtls.xls $mtls
-#else
-    mv ./mtls.xls $mtls
-#end if
-mv ./heatmap.* $heatmap_image
-mv ./cluster.tsv $cluster_assignments
-
-    </configfile>
-  </configfiles>
-<!--<tests>-->
-<!--  <test maxseconds="3600" name="GCA_1">-->
-<!--    <param name="bfile" value="bedfile.bed" />-->
-<!--    <param name="span" value="3000" />-->
-<!--    <param name="genome" value="hg18" />-->
-<!--    <output name="output" file="gca_1/gca_1.xls" />-->
-<!--    <output name="output" file="gca_1/gca_1.log" lines_diff = "200" />-->
-<!--  </test>-->
-<!--  <test maxseconds="3600" name="GCA_2">-->
-<!--    <param name="bfile" value="bedfile.bed" />-->
-<!--    <param name="span" value="100" />-->
-<!--    <param name="genome" value="hg18" />-->
-<!--    <output name="output" file="gca_2/gca_2.xls" />-->
-<!--    <output name="output" file="gca_2/gca_2.log" lines_diff = "200" />-->
-<!--  </test>-->
-<!--  <test maxseconds="3600" name="GCA_3">-->
-<!--    <param name="bfile" value="bedfile.bed" />-->
-<!--    <param name="span" value="500" />-->
-<!--    <param name="genome" value="hg18" />-->
-<!--    <output name="output" file="gca_3/gca_3.xls" />-->
-<!--    <output name="output" file="gca_3/gca_3.log" lines_diff = "200" />-->
-<!--  </test>-->
-<!--  <test maxseconds="3600" name="GCA_4">-->
-<!--    <param name="bfile" value="bedfile.bed" />-->
-<!--    <param name="span" value="1000" />-->
-<!--    <param name="genome" value="hg18" />-->
-<!--    <output name="output" file="gca_4/gca_4.xls" />-->
-<!--    <output name="output" file="gca_4/gca_4.log" lines_diff = "200" />-->
-<!--  </test>-->
-<!--  <test maxseconds="3600" name="GCA_5">-->
-<!--    <param name="bfile" value="bedfile.bed" />-->
-<!--    <param name="span" value="10000" />-->
-<!--    <param name="genome" value="hg18" />-->
-<!--    <output name="output" file="gca_5/gca_5.xls" />-->
-<!--    <output name="output" file="gca_5/gca_5.log" lines_diff = "200" />-->
-<!--  </test>-->
-<!--</tests>-->
-  <help>
-This tool will merge peaks form multiple chip-seq experiments, creating MTLs for
-each overlapping region. It will then cluster each MTL based on the score of
-each peak within each MTL (using K-means clustering, with k set by user). A
-heatmap is then generated from the resulting cluster along with the MTLs
-generated. This module in writin in R and is will be made available on github
-and bioconductor. This work was done by Kieran Mace and Aviv Madar.
-
-**NEED IMPROVEMENT**
-
------
-
-**Parameters**
-
-- **Input files** contains either macs or BED files to be merged. This list of files must be two or larger.
-- **Experiment names** contains the name given to each track.
-- **Summit distance** is the cuttoff distance (in BP) to be included in an MTL. This option is not used with the summit option below
-- **Input Format** Either bed of MACS file format, all files must be of one type. Defaults to MACS
-- **MTL Type** Either interval or summit (defaults to summit).
-- **Number clusters** the value of k for kmeans clustering.
-- **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores).
------
-
-**Output**
-
-- **XLS file** is the tab-delimited file containing the MTL data.
-- **PNG file** is the heatmap image generated after clustering the MTL data.
-
------
-
-**script parameter list of Chip-Cluster**
-
-Options:
-DESCRIPTIION:
-	cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where
-	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
-	fall within a certain distance between summits from eachother.
-
-INPUT:
-	1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
-	2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
-	3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
-	4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100)
-	5.input_type=the type of input file used (MACS or .bed; defaults to MACS)
-	6.mtl_type=interval or summit (defaults to summit)
-
-EXAMPLE RUN:
-	cluster_peaks.R
-	--input_macs_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
-	--input_type MACS
-	--path_output results/
-	--expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
-	--dist_summits 100
-	--mtl_type summit
-
-	DESCRIPTIION:
-	heatmap.R takes a ...
-
-INPUT:
-	1.--mtls_file path to mtls file.
-
-	2.--cluster_file the destination path for the cluster file.
-
-	3.--heatmap_file the destination path for heatmap image (no extension).
-
-	4.--heatmap_type choice of image type, currently support png and pdf.
-
-	5.--n_clusters number of clusters in the heatmap
-
-	6.--filter_percentage percentage of mtls that will be analysed. for eg. if
-	we make filter_percentage 30, we will take the union of the top mtls in
-	mean, non-zero mean and variance.
-
-
-EXAMPLE RUN:
-	Rscript heatmap.R
-				--mtls_file mtls.xls
-				--cluster_file output/cluster
-				--heatmap_file output/heatmap
-				--heatmap_type png
-				--n_clusters 13
-				--filter_percentage 60
-
-Please cite us if you used this script:
-	The transcription factor network regulating Th17 lineage specification and function.
-	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
-	Richard Bonneau and Dan R. Littman et. al. (in preperation)
-
-  </help>
-
-</tool>