changeset 0:a903c48f05b4

Added non-integrated scripts to galaxy
author kmace
date Thu, 22 Mar 2012 15:21:00 -0400
parents
children 797901557e32
files mtls_analyze/cluster_peaks.R mtls_analyze/heatmap.R mtls_analyze/map_ChIP_peaks_to_genes.R
diffstat 3 files changed, 1235 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/cluster_peaks.R	Thu Mar 22 15:21:00 2012 -0400
@@ -0,0 +1,376 @@
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+## Feb 2012 th17
+## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, 
+## NYU - Center for Genomics and Systems Biology
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+
+rm(list=ls())
+debug=F
+script.version=0.1
+print.error <- function(){
+	cat("
+DESCRIPTIION:	
+	cluster_peaks.R takes MACS tab delimited files as input and produces one tab delimeted file (named mtls.xls) where 
+	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS files)
+	fall within a certain distance between summits from eachother.
+
+INPUT:
+	1.path_input=path to MACS files '::' delim [path_input=f1::f2::f3::...::fk]
+	2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
+	3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
+	4.dist.summits=maximum distance between summits belonging to the same MTL
+	5.n.autosome.chr=19 for mouse, 22 for human
+	
+EXAMPLE RUN: 
+	cluster_peaks.R
+	input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls
+	path_output=~/Desktop/
+	expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
+	dist_summits=100
+	n_autosome_chr=19
+	
+Please cite us if you used this script: 
+	The transcription factor network regulating Th17 lineage specification and function.
+	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, 
+	Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
+}
+
+############# helper functions:
+## split.macs.list.to.chrmosomes
+# input:
+#      - macs.list: a list of macs expts: here are a few lines of one expt
+## chr	start	end	length	summit	tags	#NAME?	fold_enrichment	FDR(%)
+## chr1	4322210	4323069	860	494	55	158.95	6.03	0.05
+## chr1	4797749	4798368	620	211	29	119.82	3.47	0.09
+## chr1	4848182	4849113	932	494	46	105.42	2.9	0.09
+#      - expts: a list of the expts names from macs.list that you want to process
+#      - chr: chrmosomes names
+# output:
+#      - x: a list with as many elements as chr specified by input.
+#      -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
+split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
+  x <- list()
+  n <- length(expts)
+  for(i in 1:n){
+    e <- expts[i] #experiment name
+    cat("wroking on expt", e,"\n")
+    x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
+    names(x[[e]]) <- chr
+  }
+  return(x)
+}
+# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
+# input:
+#     - r is chromosome
+#     - m is macs matrix for expt e from above function
+split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
+  ix.chr.i <- which(m[,"chr"]==r)
+  # cat("working on",r,"\n")
+  o <- list()
+  o[[r]] <- m[ix.chr.i,]
+  o[[r]]$peak.id <- ix.chr.i
+  return(o[[r]])
+}
+
+## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+# also match these summit locations with corresponding:
+# pvals, tfs, peak start and peak end trgts
+# input:
+#     - chr: a list of chromosome names
+#     - macs.list.per.chrom: a list of macs peaks for each chromosome
+# output:
+#     - o: a list each chormosome ruler as an element
+make.ruler.no.pml <- function(chr,macs.list.per.chrom){
+  x <- macs.list.per.chrom
+  o <- list()
+  for(j in 1:length(chr)){
+    r <- chr[j] # chrmosome we go over
+    s <- numeric()
+    pval <- numeric()
+    tf.to.s <- character()
+    trgt.prox <- character()
+    trgt.dist <- character()
+    dtss.prox <- character() 
+    dtss.dist <- character()
+    start <- numeric()
+    end <- numeric()
+    trtmnts <- names(x) # the treatments name (expt names)
+    ## debug parameters ###
+    ## which experiment peaks come from
+    expt <- character()
+    ## what was the peak id in that experiment
+    peak.ids <- numeric()
+    ## this will allow us to always back track from a cluster to the actual peaks in it
+    ## debug params end ###
+    for(i in 1:length(trtmnts)){
+      o[[r]] <- list()
+      e <- trtmnts[i] #experiment name
+      tf <- strsplit(e,"_")[[1]][1]
+      s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
+      pval <- c(pval,x[[e]][[r]][,7]) # the name of 7th column is X.10.log10.pvalue. this is pval
+      # tf.to.s <- c(tf.to.s,rep(tf,length(x[[e]][[r]][,7]))) # all summits belong to tf
+      start <- c(start,x[[e]][[r]][,"start"])
+      end <- c(end,x[[e]][[r]][,"end"])
+      expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
+      peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])    
+      trgt.prox <- c(trgt.prox,x[[e]][[r]][,"trgt.prox"])
+	  trgt.dist <- c(trgt.dist,x[[e]][[r]][,"trgt.dist"])
+      dtss.prox <- c(dtss.prox,x[[e]][[r]][,"dtss.prox"])
+      dtss.dist <- c(dtss.dist,x[[e]][[r]][,"dtss.dist"])
+    }
+    ix <- sort(s,index.return=TRUE)$ix
+    o[[r]] <- list(s=s[ix],pval=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix],
+				   trgt.prox=trgt.prox[ix],trgt.dist=trgt.dist[ix],dtss.prox=dtss.prox[ix],dtss.dist=dtss.dist[ix])
+  }
+  return(o)
+}
+
+## add cluster memberships based on ruler
+## require no more than d.cut distance between tf summits
+## cur.l is the current loci number (or cluster number)
+assign.clusters.ids.to.peaks <- function(ruler,d.cut){
+ cur.l <- 0
+ for(j in 1:length(ruler)){
+   s <- ruler[[j]][["s"]]
+   l <- numeric(length=length(s))
+   if(length(l)>0){
+	   cur.l <- cur.l+1
+   }
+   if(length(l)==1){
+   	l[1] <- cur.l
+   } else if(length(l)>1) {
+	l[1] <- cur.l
+	   for(i in 2:length(l)){
+	     d <- s[i]-s[i-1] # assumes s is sorted increasingly
+	     if(d>d.cut){
+	       cur.l <- cur.l+1
+	     }
+	     l[i] <- cur.l    
+	   }
+	}
+   ruler[[j]][["l"]] <- l
+ }
+ return(ruler)
+}
+
+## here we create a list of TF enriched loci (clusters)
+## input:
+##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+## output:
+##     -ll: a list of clusters ll where each cluster (elem in ll holds):
+##        - l: the current loci (elem in ll)
+##        - s: summits vector as before
+##        - pval: pvals vector matching the summits
+##        - tfs: a vector of tfs matching s, the summits in loci l
+##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
+##        - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream)
+create.cluster.list.no.pml <- function(ruler){
+  tmp <- list()
+  ll <- list()
+  for(j in 1:length(ruler)){
+    r <- names(ruler)[j]
+    # cat("working on ",r,"\n")
+    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
+    l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique)
+    n <- length(l.vec) # iterate over n clusters
+    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
+  }
+  ## concatenate tmp into one list
+  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
+  ll=eval(parse(text=command))
+  return(ll)
+}
+get.cluster.params.no.pml <- function(i,x,l.vec,r){
+  ix <- which(x[["l"]]==l.vec[i])
+  l <- l.vec[i]
+  start <- min(x[["start"]][ix])
+  end <- max(x[["end"]][ix])
+  s <- x[["s"]][ix]
+  # tf.to.s <- x[["tf.to.s"]][ix]
+  pval <- x[["pval"]][ix]
+  pval.mean <- mean(pval)
+  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
+  span.l <- end-start
+  peak.ids <- x[["peak.ids"]][ix]
+  expt <- x[["expt"]][ix]
+  expt.alphanum.sorted <- sort(x[["expt"]][ix])
+  trgt.prox <- unique(x[["trgt.prox"]][ix])
+  trgt.dist <- unique(x[["trgt.dist"]][ix])
+  dtss.prox <- unique(x[["dtss.prox"]][ix])
+  dtss.dist <- unique(x[["dtss.dist"]][ix])
+  chr <- rep(r,length(ix))
+  return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
+			  end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs,
+			  span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist,
+			  dtss.prox=dtss.prox,dtss.dist=dtss.dist))
+}
+
+## here we create a list of TF enriched loci (clusters)
+## input:
+##     - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
+## output:
+##     -ll: a list of clusters ll where each cluster (elem in ll holds):
+##        - l: the current loci (elem in ll)
+##        - s: summits vector as before
+##        - pval: pvals vector matching the summits
+##        - tfs: a vector of tfs matching s, the summits in loci l
+##        - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
+##        - trgts: genes that are targeted by the cluster (10kb upstream, in gene, 10kb downstream)
+create.cluster.list.no.pml <- function(ruler){
+  tmp <- list()
+  ll <- list()
+  for(j in 1:length(ruler)){
+    r <- names(ruler)[j]
+    # cat("working on ",r,"\n")
+    x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
+    l.vec <- unique(x[["l"]]) # the clusters ids on j chr (only unique)
+    n <- length(l.vec) # iterate over n clusters
+    tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
+  }
+  ## concatenate tmp into one list
+  command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
+  ll=eval(parse(text=command))
+  return(ll)
+}
+get.cluster.params.no.pml <- function(i,x,l.vec,r){
+  ix <- which(x[["l"]]==l.vec[i])
+  l <- l.vec[i]
+  start <- min(x[["start"]][ix])
+  end <- max(x[["end"]][ix])
+  s <- x[["s"]][ix]
+  # tf.to.s <- x[["tf.to.s"]][ix]
+  pval <- x[["pval"]][ix]
+  pval.mean <- mean(pval)
+  span.tfs <- x[["end"]][ix]-x[["start"]][ix]
+  span.l <- end-start
+  peak.ids <- x[["peak.ids"]][ix]
+  expt <- x[["expt"]][ix]
+  expt.alphanum.sorted <- sort(x[["expt"]][ix])
+  trgt.prox <- unique(x[["trgt.prox"]][ix])
+  trgt.dist <- unique(x[["trgt.dist"]][ix])
+  dtss.prox <- unique(x[["dtss.prox"]][ix])
+  dtss.dist <- unique(x[["dtss.dist"]][ix])
+  chr <- rep(r,length(ix))
+  return(list(l=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
+			  end=end,s=s,pval=pval,pval.mean=pval.mean,span.tfs=span.tfs,
+			  span.l=span.l,peak.ids=peak.ids,trgt.prox=trgt.prox,trgt.dist=trgt.dist,
+			  dtss.prox=dtss.prox,dtss.dist=dtss.dist))
+}
+## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
+## input:
+##     - ll: a cluster list
+##     - f.nm: a file name (include path) to where you want files to print
+##     - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
+## output
+##     - a tab delim file with clusers as rows and elems tab delim for each cluster
+print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
+  options(digits=5)
+  cat(file=f.nm,names(ll[[1]]),sep="\t")
+  cat(file=f.nm,"\n",append=TRUE)
+  for(i in 1:length(ll)){
+	line <- ll[[i]][[1]] # put MTL number
+	line <- paste(line,ll[[i]][[2]][1],sep="\t")
+	for(j in 3:length(ll[[i]])){
+		val <- ll[[i]][[j]]
+		if(length(val) == 1){
+			line <- paste(line,val,sep="\t")
+		} else {
+			line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
+		}
+	}
+  	cat(file=f.nm,line,"\n",append=TRUE)	
+  }
+}
+############# Code:
+# retrieve args
+if(debug==T){
+	cmd.args <- c(
+		"input_macs_files=SL2870_SL2871_peak.xls::SL2872_SL2876_peak.xls::SL3032_SL2871_peak.xls::SL3037_SL3036_peak.xls::SL3315_SL3319_peak.xls",
+		"path_output=~/Desktop/",
+		"expt_names=RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17",
+		"dist_summits=100",
+		"n_autosome_chr=19"
+	)
+} else {
+	cmd.args <- commandArgs();      
+}
+
+if(length(grep("--version",cmd.args))){
+	cat("version",script.version,"\n")
+	q()
+}
+
+arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
+args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
+
+arg.nms <- c("input_macs_files","path_output","expt_names","dist_summits","n_autosome_chr")
+arg.val <- character(length=length(arg.nms))
+for(i in 1:length(arg.nms)){
+	ix <- which(arg.names.cmd.line==arg.nms[i])
+	if(length(ix)==1){
+		arg.val[i] <- args.val.cmd.line[ix]
+	} else {
+		stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
+		
+	} 
+}
+if(debug==T){
+	print(paste(arg.nms,"=",arg.val))
+}
+# the files here adhere to tab delim format
+input.macs.files <- strsplit(arg.val[1],"::")[[1]]
+if(length(input.macs.files)==1){
+	cat("only provided one MACS file to cluster.")
+	print.error()
+}
+path.output <- arg.val[2]
+expt.names <- strsplit(arg.val[3],"::")[[1]]
+dist.summits <- as.numeric(arg.val[4])
+n.autosome.chr <- as.numeric(arg.val[5])
+
+# source("~/Documents/nyu/littmanLab/th17_used_for_paper/r_scripts/th17/used_for_paper/cluster_peaks_util.R")
+chr <- c(paste(sep="","chr",1:n.autosome.chr),paste(sep="","chr",c("X","Y")))
+
+# read MACS files
+macs.list <- list()
+for(i in 1:length(input.macs.files)){
+	e <- expt.names[i]
+	macs.list[[ e ]] <- read.delim(file=input.macs.files[i])
+	macs.list[[ e ]][,"chr"] <- as.character(macs.list[[ e ]][,"chr"])
+	macs.list[[ e ]][,"trgt.prox"] <- as.character(macs.list[[ e ]][,"trgt.prox"])
+	macs.list[[ e ]][,"trgt.dist"] <- as.character(macs.list[[ e ]][,"trgt.dist"])
+}
+
+# take all macs files and put peaks together on each chromosome 
+# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
+cat("splitting macs files per chromosome\n")
+x <- split.macs.list.to.chrmosomes.no.pml(macs.list=macs.list,expts=expt.names,chr=chr)
+cat("adding peaks from all macs files into chromosome rulers\n")
+ruler <- make.ruler.no.pml(chr,macs.list.per.chrom=x)
+cat("add MTL membership to the ruler\n")
+ruler <- assign.clusters.ids.to.peaks(ruler,d.cut=dist.summits)
+for(i in 1:length(ruler)){
+	ix <- which(is.na(ruler[[i]][["dtss.prox"]]))
+	ruler[[i]][["dtss.prox"]][ix] <- ""
+	ix <- which(is.na(ruler[[i]][["dtss.dist"]]))
+	ruler[[i]][["dtss.dist"]][ix] <- ""
+}
+cat("creating MTL list\n")
+ll <- create.cluster.list.no.pml(ruler=ruler)
+cat("writing MTL table\n")
+# f.nm <- paste(sep="",path.output,paste(expt.names,collapse="_"),"_MTLs.xls")
+f.nm <- paste(sep="",path.output,"mtls",".xls")
+print.ll.verbose.all(ll=ll,f.nm=f.nm)
+
+
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/heatmap.R	Thu Mar 22 15:21:00 2012 -0400
@@ -0,0 +1,555 @@
+rm(list = ls())
+
+GeneratePeakMatrix <- function(experiments, scores) {
+  # Generates a score matrix for all mtls.
+  #
+  # Args:
+  #   experiments: A list of underscore deliminated experiments for each mtl.
+  #                There should never be a completely empty item in this list.
+  #                for eg. experiments[1] = expA_expD_expB.
+  #   scores: A list of underscore deliminated scores for each mtl, the length
+  #           of these scores should be identicle to the length of the
+  #           experiments. eg. scores[1] = 55_33_245.
+  #
+  # Returns:
+  #   The peak score matrix for all mtls.
+  experiments <- sapply(experiments, function(x) strsplit(x, split = "_"))
+  scores <- sapply(scores, function(x) strsplit(x, split = "_"))
+  unique.experiments <- unique(unlist(experiments))
+  
+  peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments))
+  colnames(peaks) <- unique.experiments
+  
+  for(i in 1:length(experiments)){
+    for(j in 1:length(experiments[[i]])){
+      peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j])
+    }
+  }
+  return(peaks)
+##################################################################
+}
+GetChipData <- function(file.name, 
+                        proximity = "distal", 
+                        include.targetless = TRUE) {
+  # Reads in, filters, and organizes mtls data
+  #
+  # Args:
+  #   file.name: The path to the mtls file (including the file name).
+  #   proximity: Either "distal" or "proximal", defines the gene target distance
+  #              from the mtl. Default is "distal".
+  #   include.targetless: If TRUE, includes mtls with no targets (applied after 
+  #                       the proximity filter); if not, mtls with no target
+  #                       will be exclided. Default is TRUE. 
+  #
+  # Returns:
+  #   An organized list of mtls data with the following elements:
+  #     $peaks - a matrix of peak p-values
+  #     $targets - a list of underscore deliminated gene targets for each mtl 
+  if(param$debug) {
+    print("In GetChipData")
+  }
+  # Set Constants for the mtls file type:
+  MTLNUM <- "l"
+  CHROMOSOME <- "chr"
+  EXPERIMENTS <- "expt"
+  EXPERIMENTS.SORTED <- "expt.alphanum.sorted"
+  START <- "start"
+  END <- "end"
+  SUMMIT <- "s"
+  PVALS <- "pval"
+  PVAL.MEAN <- "pval.mean"
+  SPANS <- "span.tfs"
+  SPAN.TOTAL <- "span.l"
+  PEAK.IDS <- "peak.ids"  # TODO(kieran): Find out what this is
+  PROXIMAL.TARGETS <- "trgt.prox"
+  DISTAL.TARGETS <- "trgt.dist"
+  PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox"
+  DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist"
+  
+  
+  #Get chip data in from files:
+  TARGETS <- switch(proximity,
+                     distal = DISTAL.TARGETS,
+                     proximal = PROXIMAL.TARGETS,
+                     stop(" Bad proximity argument supplied."))  # Default
+  
+  keep.columns <- c(EXPERIMENTS, PVALS, TARGETS) 
+  
+  file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
+  
+  if (!include.targetless) {
+    ix.has.trgt <- which(as.character(file[, TARGETS])!="")  
+    file <- file[ix.has.trgt, ]
+  }
+  
+  chip = list()
+  
+  chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS])
+  chip$targets = file[ , TARGETS] 
+  return (chip)
+}
+
+
+
+GenerateKMOrder <- function(data, km) {
+  # generates the order of clusters from high to low mean values
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list. Must
+  #       have the same length as the number of rows in data
+  #
+  # Returns:
+  #   cluster.order: The order in which the clusters should be displayed.
+
+    km.cluster = km$cluster
+    clusters = unique(km.cluster)
+    clusters.avg = numeric()
+    for(i in clusters) {
+        clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ]))
+    }
+    if(param$debug) {
+      print ("straight clusters")
+      print (clusters)
+      print ("straigth average")
+      print (clusters.avg)
+      print ("ordered clusters")
+      print (clusters[order(clusters.avg)])
+      print("ordered average")
+      print (clusters.avg[order(clusters.avg)])
+    }
+    return(clusters[rev(order(clusters.avg))])
+}
+
+OrderMTL <- function(data, km, cluster.order) {
+  # Orders a matrix of data according to a clustering algorithm
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list. Must
+  #       have the same length as the number of rows in data
+  #   cluster.order: The order in which the clusters should be displayed. 
+  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
+  #                  being on top, then cluster 3 and lastly cluster 1.
+  #
+  # Returns:
+  #   a list that contains 3 objects:
+  #     list$data: the ordered version of the data.
+  #     list$color.vector: a list of colors that should be assigned to each row.
+  #     list$start.row: the starting row of each cluster in data.
+  number.clusters <- length(cluster.order)
+  cluster.colors <- sample(rainbow(number.clusters))
+  
+  # Set up return objects
+  sorted.data <- matrix(,nr=nrow(data), nc=ncol(data))
+  colnames(sorted.data) <- colnames(data)
+  cluster.color.vector = vector(length=length(km$cluster))
+  cluster.start.row = numeric(number.clusters)
+  cluster.start.row[1]=1
+  
+  for (i in 1:number.clusters)
+  {
+      current.cluster = cluster.order[i]
+      ix = which(km$cluster == current.cluster)
+      current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1)
+      sorted.data[current.cluster.range, ] = data[ix, ]
+      cluster.color.vector[current.cluster.range] = cluster.colors[i]
+      cluster.start.row[i+1] = (cluster.start.row[i]+length(ix))  
+  }
+  ret.list = list()
+  ret.list$data = sorted.data
+  ret.list$color.vector = cluster.color.vector
+  ret.list$cluster.index = cluster.start.row
+  return(ret.list)
+}    
+
+
+CreateHeatMap <- function(data,
+                         km,
+                         cluster.order, 
+                         document.name,
+                         document.type = "png",
+                         number.colors = 30) {
+  # Generates a heatmap image based for a matrix based on a clustering 
+  # algorithm.
+  #
+  # Args:
+  #   data: A matrix of scores that have already been clustered, the column
+  #         names of this matrix will become the column titels of the heatmap.
+  #   km: The K-means object generated from running kmeans on data. Another
+  #       method could be used so long as it supplies a (km)$cluser list.
+  #   cluster.order: The order in which the clusters should be displayed. 
+  #                  for eg. km.order = c(2, 3, 1) would result in cluster 2 
+  #                  being on top, then cluster 3 and lastly cluster 1.
+  #   document.name: A name for the produced file. there is no need to
+  #                  supply the .png/.pdf in your argument.
+  #   document.type: The type of file you want to produce. current options are
+  #                  png and pdf. Default is pdf.
+  #
+  # Returns:
+  #   Nothing to the script that calls it, however it creates an image at the 
+  #   path specified.
+  if(param$debug) {
+    print("In CreateHeatMap")    
+  }
+  
+  data.ordered <- OrderMTL(data, km, cluster.order)
+    
+	
+	#Load Lib
+	library(gplots)
+	
+	#Set Color gradient
+	color.ramp = colorRampPalette(c("black",
+	                                "darkblue",
+	                                "blue",
+	                                "yellow",
+	                                "orange",
+	                                "red"))(number.colors) #7
+	
+	if(document.type == "png") {
+	  png(paste(document.name,".png", sep = ""), 
+	      width=1920, 
+	      height=2560, 
+	      res=100, 
+	      antialias="gray")
+	}
+	else {
+	pdf(paste(document.name,".pdf", sep = ""))
+	}	
+	
+	heatmap.2(
+		data.ordered$data[,sort(colnames(data.ordered$data))],
+		rowsep = data.ordered$cluster.index[-1],
+		sepwidth = c(0.5, 60), 
+		dendrogram = "none",
+		Rowv = F,
+		Colv = F,
+		trace = "none",
+		labRow = F,
+		labCol = sort(colnames(data.ordered$data)), 
+		RowSideColors = data.ordered$color.vector, 
+		col = color.ramp,
+		cexCol = 1.8,
+		cexRow = 0.8,
+		useRaster=F)
+  
+	dev.off()  
+
+}
+
+
+
+
+ReadCommadLineParameters <- function(argument.names, command.line.arguments) {
+  # Reads the parameters from the command line arguments.
+  #
+  # Args:
+  #   argument.names: A list of expected argument names.
+  #   command.line.arguments: The list of recieved command line arguments
+  #
+  # Returns:
+  #   The arguments for argument.names. As strings In that order.
+
+	if(length(grep("--version",command.line.arguments))){
+		cat("version",script.version,"\n")
+		q()
+	}
+	# Split command line arguments
+	args <- sapply(strsplit(command.line.arguments, " "),function(i) i)
+	vals <- character(length(argument.names))
+	# split cmd.line to key and value pairs
+	for(i in 1:length(argument.names)) {
+		ix <- grep(argument.names[i], args)
+		if(length(ix)>1) {
+			stop("arg ",
+			     argument.names[i],
+			     " used more than once.  Bailing out...\n",
+			     PrintParamError())
+		} 
+		else if (length(ix)==0) {
+			stop("could not find --",
+			     argument.names[i],
+			     ". Bailing out...\n",
+			     PrintParamError())
+		}
+		vals[i] <- args[ix+1]
+	}
+	return(vals)
+}
+
+PrintParamError <- function(){
+  # Prints the usage of the function, shows users what arguments they can use
+  #
+  # Args:
+  #   param: A list that contains all the paramaters.
+  #
+  # Returns:
+  #   A modified version of the param list, with the default values loaded.
+	cat("
+DESCRIPTIION:	
+	heatmap.R takes a ...
+	
+INPUT:
+	1.--mtls_file path to mtls file.\n
+	2.--cluster_file the destination path for the cluster file.\n
+	3.--heatmap_file the destination path for heatmap image (no extension).\n
+	4.--heatmap_type choice of image type, currently support png and pdf.\n
+	5.--n_clusters number of clusters in the heatmap\n
+	6.--filter_percentage percentage of mtls that will be analysed. for eg. if 
+	we make filter_percentage 30, we will take the union of the top mtls in
+	mean, non-zero mean and variance.\n
+	
+EXAMPLE RUN:
+	Rscript heatmap.R
+				--mtls_file mtls.xls
+				--cluster_file output/cluster
+				--heatmap_file output/heatmap
+				--heatmap_type png
+				--n_clusters 13
+				--filter_percentage 60
+	
+	")
+}
+
+LoadDefaultParams <- function(param) {
+  # Loads default paramaters for the heatmap application
+  #
+  # Args:
+  #   param: A list that contains all the previous paramaters.
+  #
+  # Returns:
+  #   A modified version of the param list, with the default values loaded.
+
+script.version=0.1
+param$debug = F
+
+# ChIP data:
+param$annotated.macs.file = "data/mtls/mtls.xls"
+
+# Filter:
+param$filter.percentage <- 100
+
+# Analyze Clustering:
+param$analyze.number.of.clusters = F
+param$analyze.number.of.clusters.sample.size = 10000 
+param$analyze.number.of.clusters.min = 4
+param$analyze.number.of.clusters.max = 9
+param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis" 
+param$analyze.number.of.clusters.itterations = 25
+
+# Clustering:
+param$clustering.number.of.clusters <- 13
+
+# Heatmap:
+param$heatmap.document.name <- "data/heatmap/heatmap"
+param$heatmap.document.type <- "png"
+#Cluster Groups:
+param$cluster.groups.document.name <- "data/clusters/clusters"
+
+return(param)
+
+}
+
+
+
+LoadDefinedParams <- function(param) {
+  # Loads user defined paramaters for the heatmap application
+  #
+  # Args:
+  #   param: A list that contains all the previous paramaters.
+  #
+  # Returns:
+  #   A modified version of the param list, with the user defined values loaded.
+
+cmd.args <- commandArgs(trailingOnly = T);
+
+
+# put the numeric params at the end!
+n.start.numeric <- 5
+args.nms <- c(	"--mtls_file", 	#1
+				"--cluster_file", 	    #2
+				"--heatmap_file", 	    #3
+				"--heatmap_type",     	#4
+			 	"--n_clusters", 	    	#5
+				"--filter_percentage")	#6
+				
+				# get parameters
+vals <- ReadCommadLineParameters(argument.names = args.nms, 
+                                 command.line.arguments = cmd.args)
+
+# check if numeric params are indeed numeric
+for(i in n.start.numeric:length(vals)){
+	if(is.na(as.numeric(vals[i]))){
+		stop("arg ",args.nms[i]," is not numeric.  Bailing out...\n",print.error())
+	}
+}
+
+# vals has the same order as args.nms
+
+# ChIP data:
+param$annotated.macs.file <- vals[1]
+
+# Filter:
+param$filter.percentage <- as.numeric(vals[6])
+
+# Clustering:
+param$clustering.number.of.clusters <- as.numeric(vals[5])
+
+# Heatmap:
+param$heatmap.document.name <- vals[3]
+param$heatmap.document.type <- vals[4]
+#Cluster Groups:
+param$cluster.groups.document.name <- vals[2]
+
+return(param)
+}
+
+
+
+
+# here we output the fasta file of the targets of each kmeans cluster
+CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
+  f.nm = paste(f.nm, ".fasta", sep="")
+    clusters = km.order
+    for(i in 1:length(clusters)){
+        v=trgts[which(k.ix==clusters[i])]
+        v.split = unlist(sapply(v,strsplit, "_"))
+        if(i == 1){
+            cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n")    
+        } else {
+            cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
+        }
+        cat(sep="\n",file=f.nm,v.split,append=TRUE)
+    }
+}
+
+
+GetTopRowsFromMatrix = function(mtrx, percentage = 10)
+{
+  if (param$debug) {
+    print("In GetTopRowsFromMatrix")
+	}
+	#Store the stats for the mtrx
+	stats = list()
+	stats$mean=apply(mtrx,1,mean)
+	stats$sd=apply(mtrx,1,sd)
+	stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)]))
+    
+	
+	#Store the indexes for the mtrx
+	index = list()
+	index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix
+	index$sd   = sort(stats$sd, decreasing=T, index.return=T)$ix
+	index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix
+	
+	#Calculate how many data points we want to take
+	cutoff = floor(length(mtrx[ ,1])*(percentage/100))
+	
+	#Extract Indexes and return to caller
+	index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff]))
+	
+	return(index)
+}
+
+#GetUniq = function(targets.cluster)
+#{
+#    #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order
+#    
+#    return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))))
+#    #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))
+#    
+#}
+
+bindata = function(d,qunts=seq(.4,.9,.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
+    for(j in 1:ncol(d))
+    {
+        bins=quantile(d[,j],qunts)
+            for(i in 1:length(bins))
+            {
+                tmp[which(d[,j]>bins[i]),j] = i
+         }
+    }
+    return(tmp)
+}
+
+bindata.non.zero = function(d,qunts=seq(0,0.9,0.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d))
+    for(j in 1:ncol(d))
+    {
+        ix.non.zero=which(d[,j]!=0)
+        bins=quantile(d[ix.non.zero,j],qunts)
+            for(i in 1:length(bins))
+            {
+                tmp[which(d[,j]>bins[i]),j] = i
+         }
+    }
+    
+    return(tmp)
+}
+
+bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1))
+{
+    tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
+    ix.non.zero=which(d!=0)
+    bins=quantile(d[ix.non.zero],qunts)
+    for(i in 1:length(bins))
+    {
+        tmp[which(d>bins[i])] = i
+    }
+    return(tmp)
+}
+
+print("Finished loading functions")
+
+param <- list()
+param <- LoadDefaultParams(param)
+param <- LoadDefinedParams(param)
+
+print (param)
+
+#Load Chip data, alread in pval format
+chip <- GetChipData(param$annotated.macs.file)
+
+#Get the top percentages on different criteria
+ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage)
+chip$peaks <- chip$peaks[ix.top$union, ]
+chip$targets <- chip$targets[ix.top$union]
+
+#Bin data:
+chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30))
+
+
+#Analyze-cluster
+if(param$analyze.number.of.clusters)
+{
+    AnalyzeCluster(data, 
+    sample.size = param$analyze.number.of.clusters.sample.size, 
+    clusters.min = param$analyze.number.of.clusters.min, 
+    clusters.max = param$analyze.number.of.clusters.max, 
+    document.name = param$analyze.number.of.clusters.document.name,
+    itterations = param$analyze.number.of.clusters.itterations)
+    
+}
+
+#Cluster
+set.seed(1234)
+km <- kmeans(chip$peaks, param$clustering.number.of.clusters)
+km.order <- GenerateKMOrder(chip$peaks, km)
+
+CreateHeatMap(chip$peaks, 
+              km,
+              km.order, 
+              document.name = param$heatmap.document.name,
+              document.type = param$heatmap.document.type)
+
+CreateClusterGroups(trgts=chip$targets,
+                    k.ix=km$cluster,
+                    f.nm = param$cluster.groups.document.name, 
+                    km.order = km.order)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/map_ChIP_peaks_to_genes.R	Thu Mar 22 15:21:00 2012 -0400
@@ -0,0 +1,304 @@
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+## Nov 2011 th17
+## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, 
+## NYU - Center for Genomics and Systems Biology
+##  .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.   .-.-.
+## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
+##`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   `-`-'   ' '
+
+rm(list=ls())
+debug=F
+script.version=0.1
+print.error <- function(){
+  cat("
+NAME
+       map_ChIP_peaks_to_genes.R
+
+DESCRIPTION
+       This script takes a MACS tab delimited file as input and
+       produces two files:
+
+       1) genes.xls
+          A gene centered table that stores the peaks that are
+          proximal (within x[kb] of TSS) and distal (within the
+          gene's region + y[kb] upstream or downstream of TSS and
+          TES respectively).
+
+       2) peaks.xls
+          A peak centered table that equals the MACS input table plus
+          colums for proximal and distal targets of each peak and
+          their distance from TSS (if exist). For gene.xls script also
+          gives a poisson model based -log10(pval) score for proximal
+          and genewide enrichment of peaks for each gene with at least
+          one prox/dist peak associated with it.
+
+ARGUMENTS
+       input_file=path
+         Path to MACS file.
+
+       refseq_table=path
+         Path to refseq table (gives TSS/TES locations for all genes).
+
+       path_output=path
+         Path to save genes.xls and peaks.xls output files.
+
+       tss.dist=N
+         The absolute distance from TSS where we connect a peak to
+         gene (for proximal peaks).
+
+       gene.wide.dist=N
+         The absolute distance from TSS or TES where we connect a peak
+         to gene (for peaks hitting anywhere in gene).
+
+       effective.genome.size=N
+         The effective mappable genome size (for mm9 the value is
+         1.87e9. For hg19 the value is 2.7e9 (from MACS manual)).
+
+       macs.skip.lines=N
+         Number of lines to skip in input_file.
+
+EXAMPLE 
+       Rscript map_ChIP_peaks_to_genes_v.1.R \\
+         input_file=SL971_SL970_peaks.xls \\
+         refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\
+         path_output=./ \\
+         tss.dist=5000 \\
+         gene.wide.dist=10000 \\
+         effective.genome.size=1.87e9 \\
+         macs.skip.lines=23
+
+CITATION
+       Please cite us if you used this script: The transcription
+       factor network regulating Th17 lineage specification and
+       function.  Maria Ciofani, Aviv Madar, Carolina Galan, Kieran
+       Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard
+       Bonneau and Dan R. Littman et. al. (in preperation).
+
+")
+}
+
+# retrieve args
+if(debug==T){
+	cmd.args <- c(
+		# "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls",
+		"input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls",		
+		"refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt",
+		"path_output=/Users/aviv/Desktop/test_script/",
+		"tss.dist=5000",
+		"tes.dist=10000",
+		"effective.genome.size=1.87e9",
+		"gene.wide.dist=10000",
+		"macs.skip.lines=23"
+	)
+} else {
+	cmd.args <- commandArgs();      
+}
+
+if(length(grep("--version",cmd.args))){
+	cat("version",script.version,"\n")
+	q()
+}
+
+arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
+args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
+
+arg.nms <- c("input_file","refseq_table","path_output","tss.dist",
+			 "gene.wide.dist","effective.genome.size","macs.skip.lines")
+arg.val <- character(length=length(arg.nms))
+for(i in 1:length(arg.nms)){
+	ix <- which(arg.names.cmd.line==arg.nms[i])
+	if(length(ix)==1){
+		arg.val[i] <- args.val.cmd.line[ix]
+	} else {
+		stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
+		
+	} 
+}
+if(debug==T){
+	print(paste(arg.nms,"=",arg.val))
+}
+# the files here adhere to tab delim format
+path.input.macs <- arg.val[1]
+path.input.refseq <- arg.val[2]
+path.output <- arg.val[3]
+tss.dist <- as.numeric(arg.val[4])
+gene.wide.dist <- as.numeric(arg.val[5])
+effective.genome.size <- as.numeric(arg.val[6])
+macs.skip.lines=as.numeric(arg.val[7])
+
+if(debug==T){
+# if(1){
+	load("/Users/aviv/Desktop/r.u.RData")
+	gn.nms <- rownames(r.u)
+} else {
+	##################
+	### step 1 handle refseq table file (read, make unique)
+	cat("reading refseq table",path.input.refseq,"\n")
+	refseq <- read.delim(file=path.input.refseq)
+	# refseq can have many transcripts for each gene
+	# here i make it have only one transcript for each gene (the longest one)
+	cat("for transcripts matching more than one gene keeping only the longest transcript\n")
+	refseq$name2 <- as.character(refseq$name2)
+	refseq$chrom <- as.character(refseq$chrom)
+	refseq$strand <- as.character(refseq$strand)
+	gn.nms <- unique(refseq$name2)
+	#x <- sort(table(refseq$name2),decreasing=T)
+	#gn.nms.non.unique <- names(x)[which(x>1)]
+	#gn.nms.unique <- names(x)[which(x==1)]
+	# create refseq unique r.u
+	n <- length(gn.nms)
+	r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms)
+	for(i in 1:n){
+	  ix <- which(refseq$name2==gn.nms[i])
+	  if(length(ix)==0) {
+	    error("could not find gene", ng.nms[i], "in refseq table.  Bailing out...\n")
+	  } else if (length(ix)>1){
+	    l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) )
+	    l.max <- max(l)
+	    ix <- ix[which(l==l.max)[1]]
+	  }
+	  r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")]
+	}
+	r.u[,"txStart"] <- as.numeric(r.u[,"txStart"])
+	r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"])
+
+	# switch TSS and TES if we have chr "-"
+	cat("correcting TSS, TES assiments in refseq table based on strand\n")
+	ix <- which(r.u$strand=="-")
+	tmp.tes <- r.u$txStart[ix]
+	tmp.tss <- r.u$txEnd[ix]
+	r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes)
+	# ############
+}
+
+
+#### step 2 read macs file
+cat("reading Macs file",path.input.macs,"\n")
+
+# read macs file
+M <- read.delim(file=path.input.macs,skip=macs.skip.lines)
+trgt.prox <- NA
+trgt.dist <- NA
+dtss.prox <- NA
+dtss.dist <- NA
+M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist)
+M.annot[is.na(M.annot)] <- ""
+# get the number of genes
+m <- length(gn.nms)
+if(debug==T){
+	# m <- 100
+}
+f.nm.peak <- paste(sep="",path.output,"peaks.xls")
+f.nm.gene <- paste(sep="",path.output,"genes.xls")
+# for each genes in the expt (j goes over genes)
+cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval",
+			"prox_max_pval","genewide_max_pval",
+			"prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n")
+peaks.summit <- (M[,"start"]+M[,"summit"])
+cat(sep="","mapping MACS peaks to genes\n")
+for (j in 1:m){
+  if(j%%20==0){cat(".")}
+  tss <- r.u[j,"txStart"]
+  tes <- r.u[j,"txEnd"]
+  gn.lngth <- abs(tss-tes)
+  strand <- r.u[j,"strand"]
+  chr <- r.u[j,"chrom"]
+  if(strand=="+"){
+    d.tss.all <- peaks.summit-tss
+   	d.tes.all <- peaks.summit-tes
+  } else {
+    d.tss.all <- tss-peaks.summit
+   	d.tes.all <- tes-peaks.summit
+  }
+  ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )),
+               which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE)
+  ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE)	
+  ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)]    
+  ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)]
+  n.peaks.prox <- length(ix.prox)
+  n.peaks.distal <- length(ix.distal)
+	# if there is at least one peak hitting gene j
+  if(n.peaks.distal > 0){
+    # for each peak (l goes over peaks)
+    peaks.line <- paste(sep="","(")
+    for (k in 1:n.peaks.distal){
+      if(k>1){
+        peaks.line <- paste(sep="",peaks.line,";(")
+      }
+      d.tss <- d.tss.all[ ix.distal[k] ]
+      d.tes <- d.tes.all[ ix.distal[k] ]
+      if(sign(d.tss)!=sign(d.tes)){
+        class="intra"
+      } else if(d.tss<=0){
+        class="upstream"
+      } else if(d.tss>0){
+        class="downstream"
+      }
+      peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ], 
+					d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")")
+	  # handle M.annot
+	  dtss <- d.tss.all[ ix.distal[k] ]
+	  prev.trgt <- M.annot[ix.distal[k],"trgt.dist"]
+	  if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
+		if(abs(dtss)<abs(as.numeric(M.annot[ix.distal[k],"dtss.dist"]))){
+			M.annot[ix.distal[k],"dtss.dist"] <- dtss
+			M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
+		}
+	  } else { # if peak does not have trgt add current trgt
+		M.annot[ix.distal[k],"dtss.dist"] <- dtss
+		M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
+	  }
+	# calc the pval for these many peaks in proximal region and in gene-wide region
+	}
+	if(n.peaks.prox > 0){
+		mean.pval.prox <- mean(M[ ix.prox ,7])
+		max.pval.prox <- max(M[ ix.prox ,7])
+		expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox))
+		# lambda = num peaks / genome size * searched region
+		lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2)		
+		pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE))
+		# handle M.annot
+		for(k in 1:n.peaks.prox){
+			dtss <- d.tss.all[ ix.prox[k] ]
+			prev.trgt <- M.annot[ix.prox[k],"trgt.prox"]
+			if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
+				if(abs(dtss)<abs(as.numeric(M.annot[ix.prox[k],"dtss.prox"]))){
+					M.annot[ix.prox[k],"dtss.prox"] <- dtss
+					M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
+				}
+			} else { # if peak does not have trgt add current trgt
+				M.annot[ix.prox[k],"dtss.prox"] <- dtss
+				M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
+			}
+		}
+	} else {
+		mean.pval.prox <- "NA"
+		max.pval.prox <- "NA"
+		pval.pois.prox <- "NA"
+	}
+	mean.pval.distal <- mean(M[ ix.distal ,7])
+	max.pval.distal <- max(M[ ix.distal ,7])
+	expected.num.peaks.genome.wide.distal <- length(which(M[,7]>=mean.pval.distal))
+	# lambda = num peaks / genome size * searched region
+	lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2)
+    pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE))
+    # pring peaks for gene j    
+    core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand,
+							mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal,
+							pval.pois.prox,pval.pois.distal)
+    cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n")
+  }
+}
+cat("Done!\n")
+
+cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t")
+write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t")
+
+
+
+
+
+
+