# HG changeset patch
# User modencode-dcc
# Date 1358455515 18000
# Node ID 5e6efd5f3567561357aac0475060327bfdcde23f
# Parent  e7295ec4e7500c7c0b14355fdf29a8c724b5ad61
Uploaded

diff -r e7295ec4e750 -r 5e6efd5f3567 functions-all-clayton-12-13.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/functions-all-clayton-12-13.r	Thu Jan 17 15:45:15 2013 -0500
@@ -0,0 +1,3099 @@
+# revised on 2-20-10
+# - fix error in pass.structure: reverse rank.combined, so that big sig.value
+#  are ranked with small numbers (1, 2, ...)
+# - fix error on get.ez.tt.all: get ez.cutoff from sorted e.z
+
+#
+# modified EM procedure to compute empirical CDF more precisely - 09/2009
+
+
+
+# this file contains the functions for  
+# 1. computing the correspondence profile (upper rank intersection and derivatives)
+# 2. inference of copula mixture model
+#
+# It also has functions for
+# 1. reading peak caller results
+# 2. processing and matching called peaks
+# 3. plotting results
+
+
+################ read peak caller results
+
+# process narrow peak format
+# some peak callers may not report q-values, p-values or fold of enrichment
+# need further process before comparison
+#
+# stop.exclusive: Is the basepair of peak.list$stop exclusive? In narrowpeak and broadpeak format they are exclusive.
+# If it is exclusive, we need subtract peak.list$stop by 1 to avoid the same basepair being both a start and a stop of two 
+# adjacent peaks, which creates trouble for finding correct intersect  
+process.narrowpeak <- function(narrow.file, chr.size, half.width=NULL, summit="offset", stop.exclusive=T, broadpeak=F){
+
+
+  aa <- read.table(narrow.file)
+
+  if(broadpeak){
+    bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9)
+  }else{
+    bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9, summit=aa$V10)
+  }
+    
+  if(summit=="summit"){
+    bb.ori$summit <- bb.ori$summit-bb.ori$start # change summit to offset to avoid error when concatenating chromosomes
+  }
+ 
+  bb <- concatenate.chr(bb.ori, chr.size)
+
+  #bb <- bb.ori
+
+  # remove the peaks that has the same start and stop value
+  bb <- bb[bb$start != bb$stop,]
+
+  if(stop.exclusive==T){
+    bb$stop <- bb$stop-1
+  }
+
+  if(!is.null(half.width)){
+    bb$start.ori <- bb$start    
+    bb$stop.ori <- bb$stop 
+
+    # if peak is narrower than the specified window, stay with its width
+    # otherwise chop wider peaks to specified width
+    width <- bb$stop-bb$start +1
+    is.wider <- width > 2*half.width
+
+    if(summit=="offset" | summit=="summit"){ # if summit is offset from start
+      bb$start[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]-half.width
+      bb$stop[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]+half.width
+    } else { 
+      if(summit=="unknown"){
+        bb$start[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) - half.width
+        bb$stop[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) + half.width
+      }
+    }
+  }
+
+  bb <- clean.data(bb)
+  invisible(list(data.ori=bb.ori, data.cleaned=bb))
+}
+
+# clean data 
+# and concatenate chromosomes if needed
+clean.data <- function(adata){
+
+  # remove the peaks that has the same start and stop value
+  adata <- adata[adata$start != adata$stop,]
+
+  # if some stops and starts are the same, need fix them
+  stop.in.start <- is.element(adata$stop, adata$start)
+  n.fix <- sum(stop.in.start)
+  if(n.fix >0){
+    print(paste("Fix", n.fix, "stops\n"))
+    adata$stop[stop.in.start] <- adata$stop[stop.in.start]-1 
+  }  
+ 
+  return(adata) 
+}
+
+# concatenate peaks
+# peaks: the dataframe to have all the peaks
+# chr.file: the file to keep the length of each chromosome 
+# chr files should come from the species that the data is from
+#concatenate.chr <- function(peaks, chr.size){
+
+ # chr.size <- read.table(chr.file)
+#  chr.o <- order(chr.size[,1])
+#  chr.size <- chr.size[chr.o,]
+#
+#  chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
+#  chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)  
+#
+#  for(i in 1:nrow(chr.size)){
+#    is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i])
+#    if(sum(is.in)>0){
+#      peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i]
+#      peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i]
+#    }
+#  }
+#
+#  invisible(peaks)
+#}
+
+
+
+
+# concatenate peaks
+# peaks: the dataframe to have all the peaks
+# chr.file: the file to keep the length of each chromosome 
+# chr files should come from the species that the data is from
+concatenate.chr <- function(peaks, chr.size){
+
+ # chr.size <- read.table(chr.file)
+  chr.o <- order(chr.size[,1])
+  chr.size <- chr.size[chr.o,]
+
+  chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
+  chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)  
+
+  peaks$start.ori <- peaks$start
+  peaks$stop.ori <- peaks$stop
+  
+  for(i in 1:nrow(chr.size)){
+    is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i])
+    if(sum(is.in)>0){
+      peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i]
+      peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i]
+    }
+  }
+
+  invisible(peaks)
+}
+
+
+deconcatenate.chr <- function(peaks, chr.size){
+
+  chr.o <- order(chr.size[,1])
+  chr.size <- chr.size[chr.o,]
+
+  chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
+  chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)  
+
+  peaks$chr <- rep(NA, nrow(peaks))
+  
+  for(i in 1:(nrow(chr.size.cum)-1)){
+    is.in <- peaks$start > chr.size.cum[i,2] & peaks$start <= chr.size.cum[i+1, 2]
+    if(sum(is.in)>0){
+      peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2]
+      peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1    
+      peaks[is.in,]$chr <- chr.size[i,1]
+    }
+  }
+
+  if(i == nrow(chr.size.cum)){
+    is.in <- peaks$start > chr.size.cum[i, 2]
+    if(sum(is.in)>0){
+      peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2]
+      peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1    
+      peaks[is.in,]$chr <- chr.size[i,1]
+    }
+  }
+  
+  invisible(peaks)
+}
+
+################ preprocessing peak calling output
+
+
+# 
+# read two calling results and sort by peak starting locations, 
+# then find overlap between peaks
+# INPUT:
+#   rep1: the 1st replicate
+#   rep2: the 2nd replicate
+# OUTPUT:
+#   id1, id2: the labels for the identified peaks on the replicates
+find.overlap <- function(rep1, rep2){
+
+  o1 <- order(rep1$start)
+  rep1 <- rep1[o1,]
+    
+  o2 <- order(rep2$start)
+  rep2 <- rep2[o2,]
+
+  n1 <- length(o1)
+  n2 <- length(o2)
+  
+  # assign common ID to peaks
+  id1 <- rep(0, n1) # ID assigned on rep1
+  id2 <- rep(0, n2) # ID assigned on rep2
+  id <- 1 # keep track common id's
+  
+  # check if two replicates overlap with each other
+  i <- 1
+  j <- 1
+
+  while(i <= n1|| j <= n2){
+
+    # && (id1[n1] ==0 || id2[n2] ==0)
+    
+    # if one list runs out
+    if(i > n1 && j < n2){
+      
+      j <- j+1
+      id2[j] <- id
+      id <- id +1
+      next
+    } else{
+      if(j > n2 && i < n1){
+        i <- i+1        
+        id1[i] <- id
+        id <- id +1
+        next
+      } else {
+        if(i >= n1 && j >=n2)
+          break
+      }
+    }
+
+    # if not overlap
+
+    if(!(rep1$start[i] <= rep2$stop[j] && rep2$start[j] <= rep1$stop[i])){
+
+      # at the start of loop, when both are not assigned an ID
+      # the one locates in front is assigned first
+      if(id1[i] ==0 && id2[j]==0){
+        if(rep1$stop[i] < rep2$stop[j]){
+          id1[i] <- id
+        } else {
+          id2[j] <- id
+        }
+      } else { # in the middle of the loop, when one is already assigned
+      # The one that has not assigned gets assigned
+      #  if(id1[i] ==0){ # id1[i] is not assigned
+      #    id1[i] <- id
+      #  } else { # id2[i] is not assigned
+      #    id2[j] <- id 
+      #  }
+
+        # order the id according to location
+        if(rep1$stop[i] <= rep2$stop[j]){
+          id1[i] <- max(id2[j], id1[i])
+          id2[j] <- id  
+        } else {
+          if(rep1$stop[i] > rep2$stop[j]){
+            id2[j] <- max(id1[i], id2[j])
+            id1[i] <- id
+          }
+        }
+          
+      }
+      
+      id <- id +1
+      
+    } else { # if overlap
+    
+      if(id1[i] == 0 && id2[j] == 0){ # not assign label yet
+        id1[i] <- id 
+        id2[j] <- id
+        id <- id +1
+      } else { # one peak is already assigned label, the other is 0
+        
+        id1[i] <- max(id1[i], id2[j]) # this is a way to copy the label of the assigned peak without knowing which one is already assigned
+        id2[j] <- id1[i] # syncronize the labels        
+      }
+      
+    }
+    
+    if(rep1$stop[i] < rep2$stop[j]){
+      i <- i+1
+    } else {
+      j <- j+1
+    } 
+    
+  }
+
+  invisible(list(id1=id1, id2=id2))
+  
+}
+
+# Impute the missing significant value for the peaks called only on one replicate.
+# value 
+# INPUT:
+#   rep1, rep2: the two peak calling output 
+#   id1, id2: the IDs assigned by function find.overlap, vectors
+#        If id1[i]==id2[j], peak i on rep1 overlaps with peak j on rep2
+#   p.value.impute: the significant value to impute for the missing peaks 
+# OUTPUT:   
+#   rep1, rep2: peaks ordered by the start locations with imputed peaks
+#   id1, id2: the IDs with imputed peaks
+fill.missing.peaks <- function(rep1, rep2, id1, id2, p.value.impute){
+
+#   rep1 <- data.frame(chr=rep1$chr, start=rep1$start, stop=rep1$stop, sig.value=rep1$sig.value)
+#   rep2 <- data.frame(chr=rep2$chr, start=rep2$start, stop=rep2$stop, sig.value=rep2$sig.value)   
+   
+   o1 <- order(rep1$start)
+   rep1 <- rep1[o1,]
+    
+   o2 <- order(rep2$start)
+   rep2 <- rep2[o2,]  
+     
+   entry.in1.not2 <- !is.element(id1, id2)
+   entry.in2.not1 <- !is.element(id2, id1)
+
+   if(sum(entry.in1.not2) > 0){
+   
+     temp1 <- rep1[entry.in1.not2, ]
+
+     # impute sig.value
+     temp1$sig.value <- p.value.impute
+     temp1$signal.value <- p.value.impute
+     temp1$p.value <- p.value.impute
+     temp1$q.value <- p.value.impute
+     
+     rep2.filled <- rbind(rep2, temp1)
+     id2.filled <- c(id2, id1[entry.in1.not2])
+   } else {
+     id2.filled <- id2
+     rep2.filled <- rep2
+   }
+
+   if(sum(entry.in2.not1) > 0){
+
+     temp2 <- rep2[entry.in2.not1, ]
+
+     # fill in p.values to 1
+     temp2$sig.value <- p.value.impute
+     temp2$signal.value <- p.value.impute
+     temp2$p.value <- p.value.impute
+     temp2$q.value <- p.value.impute
+   
+
+     # append to the end
+     rep1.filled <- rbind(rep1, temp2)
+
+     id1.filled <- c(id1, id2[entry.in2.not1])
+   } else {
+     id1.filled <- id1
+     rep1.filled <- rep1
+   }
+
+   # sort rep1 and rep2 by the same id
+   o1 <- order(id1.filled)
+   rep1.ordered <- rep1.filled[o1, ]
+
+   o2 <- order(id2.filled)
+   rep2.ordered <- rep2.filled[o2, ]   
+   
+   invisible(list(rep1=rep1.ordered, rep2=rep2.ordered,
+                  id1=id1.filled[o1], id2=id2.filled[o2]))
+ }
+
+# Merge peaks with same ID on the same replicates 
+# (They are generated if two peaks on rep1 map to the same peak on rep2)
+# need peak.list have 3 columns: start, stop and sig.value 
+merge.peaks <- function(peak.list, id){
+
+  i <- 1
+  j <- 1
+  dup.index <- c()
+  sig.value <- c()
+  start.new <- c()
+  stop.new <- c()
+  id.new <- c()
+
+  # original data
+  chr <- c()
+  start.ori <- c()
+  stop.ori <- c()
+  
+  signal.value <- c()
+  p.value <- c()
+  q.value <- c()
+
+  while(i < length(id)){
+    
+    if(id[i] == id[i+1]){
+      dup.index <- c(dup.index, i, i+1) # push on dup.index
+    } else {
+      if(length(dup.index)>0){ # pop from dup.index
+        sig.value[j] <- mean(peak.list$sig.value[unique(dup.index)]) # mean of -log(pvalue)
+        start.new[j] <- peak.list$start[min(dup.index)]
+        stop.new[j] <- peak.list$stop[max(dup.index)]
+        id.new[j] <- id[max(dup.index)]
+        
+        signal.value[j] <- mean(peak.list$signal.value[unique(dup.index)]) # mean of -log(pvalue)
+        p.value[j] <- mean(peak.list$p.value[unique(dup.index)]) # mean of -log(pvalue)
+        q.value[j] <- mean(peak.list$q.value[unique(dup.index)]) # mean of -log(pvalue)
+
+        chr[j] <- as.character(peak.list$chr[min(dup.index)])
+        start.ori[j] <- peak.list$start.ori[min(dup.index)]
+        stop.ori[j] <- peak.list$stop.ori[max(dup.index)]
+        
+        dup.index <- c()
+      } else { # nothing to pop
+        sig.value[j] <- peak.list$sig.value[i]
+        start.new[j] <- peak.list$start[i]
+        stop.new[j] <- peak.list$stop[i]
+        id.new[j] <- id[i]
+
+        signal.value[j] <- peak.list$signal.value[i] 
+        p.value[j] <- peak.list$p.value[i] 
+        q.value[j] <- peak.list$q.value[i] 
+
+        chr[j] <- as.character(peak.list$chr[i])
+        start.ori[j] <- peak.list$start.ori[i]
+        stop.ori[j] <- peak.list$stop.ori[i]
+        
+      }
+      j <- j+1
+    }
+    i <- i+1
+  }
+
+  data.new <- data.frame(id=id.new, sig.value=sig.value, start=start.new, stop=stop.new, signal.value=signal.value, p.value=p.value, q.value=q.value, chr=chr, start.ori=start.ori, stop.ori=stop.ori)
+  invisible(data.new)
+}
+
+
+
+
+
+# a wrap function to fill in missing peaks, merge peaks and impute significant values
+# out1 and out2 are two peak calling outputs
+pair.peaks <- function(out1, out2, p.value.impute=0){
+
+  aa <- find.overlap(out1, out2)
+  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=0)
+
+  cc1 <- merge.peaks(bb$rep1, bb$id1)
+  cc2 <- merge.peaks(bb$rep2, bb$id2)
+
+  invisible(list(merge1=cc1, merge2=cc2))
+}
+
+
+
+# overlap.ratio is a parameter to define the percentage of overlap
+# if overlap.ratio =0, 1 basepair overlap is counted as overlap
+# if overlap.ratio between 0 and 1, it is the minimum proportion of
+# overlap required to be called as a match
+# it is computed as the overlap part/min(peak1.length, peak2.length)
+pair.peaks.filter <- function(out1, out2, p.value.impute=0, overlap.ratio=0){
+
+  aa <- find.overlap(out1, out2)
+  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=0)
+
+  cc1 <- merge.peaks(bb$rep1, bb$id1)
+  cc2 <- merge.peaks(bb$rep2, bb$id2)
+
+  frag12 <- cbind(cc1$start, cc1$stop, cc2$start, cc2$stop)
+  
+  frag.ratio <- apply(frag12, 1, overlap.middle)
+
+  frag.ratio[cc1$sig.value==p.value.impute | cc2$sig.value==p.value.impute] <- 0
+
+  cc1$frag.ratio <- frag.ratio
+  cc2$frag.ratio <- frag.ratio
+
+  merge1 <- cc1[cc1$frag.ratio >= overlap.ratio,]
+  merge2 <- cc2[cc2$frag.ratio >= overlap.ratio,]
+  
+  invisible(list(merge1=merge1, merge2=merge2))
+}
+
+# x[1], x[2] are the start and end of the first fragment
+# and x[3] and x[4] are the start and end of the 2nd fragment 
+# If there are two fragments, we can find the overlap by ordering the
+# start and stop of all the ends and find the difference between the middle two
+overlap.middle  <- function(x){
+
+  x.o <- x[order(x)]
+  f1 <- x[2]-x[1]
+  f2 <- x[4]-x[3]
+  
+  f.overlap <- abs(x.o[3]-x.o[2])
+  f.overlap.ratio <- f.overlap/min(f1, f2)
+
+  return(f.overlap.ratio)
+}
+
+
+
+#######
+####### compute correspondence profile
+#######
+
+# compute upper rank intersection for one t
+# tv: the upper percentile
+# x is sorted by the order of paired variable
+comp.uri <- function(tv, x){
+  n <- length(x)
+  qt <- quantile(x, prob=1-tv[1]) # tv[1] is t
+#  sum(x[1:ceiling(n*tv[2])] >= qt)/n/tv[2]- tv[1]*tv[2] #tv[2] is v
+  sum(x[1:ceiling(n*tv[2])] >= qt)/n
+
+}
+
+# compute the correspondence profile
+# tt, vv: vector between (0, 1) for percentages
+get.uri.2d <- function(x1, x2, tt, vv, spline.df=NULL){
+
+  o <- order(x1, x2, decreasing=T)
+  
+  # sort x2 by the order of x1
+  x2.ordered <- x2[o]
+  
+  tv <- cbind(tt, vv)
+  ntotal <- length(x1) # number of peaks    
+
+  uri <- apply(tv, 1, comp.uri, x=x2.ordered)
+
+  # compute the derivative of URI vs t using small bins
+  uri.binned <- uri[seq(1, length(uri), by=4)]
+  tt.binned <- tt[seq(1, length(uri), by=4)]
+  uri.slope <- (uri.binned[2:(length(uri.binned))] - uri.binned[1:(length(uri.binned)-1)])/(tt.binned[2:(length(uri.binned))] - tt.binned[1:(length(tt.binned)-1)])
+
+  # smooth uri using spline
+  # first find where the jump is and don't fit the jump
+  # this is the index on the left
+  # jump.left.old  <- which.max(uri[-1]-uri[-length(uri)])
+  short.list.length <- min(sum(x1>0)/length(x1), sum(x2>0)/length(x2))
+
+  if(short.list.length < max(tt)){
+    jump.left <- which(tt>short.list.length)[1]-1
+  } else {
+    jump.left <- which.max(tt)
+  }
+
+#  reversed.index <- seq(length(tt), 1, by=-1)
+#  nequal <- sum(uri[reversed.index]== tt[reversed.index])
+#  temp  <- which(uri[reversed.index]== tt[reversed.index])[nequal]
+#  jump.left <- length(tt)-temp
+ 
+  if(jump.left < 6){
+   jump.left <- length(tt)
+  }
+    
+ 
+  if(is.null(spline.df))
+    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=6.4)
+  else{
+    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=spline.df)
+  }
+  # predict the first derivative
+  uri.der <- predict(uri.spl, tt[1:jump.left], deriv=1)
+
+  invisible(list(tv=tv, uri=uri, 
+                 uri.slope=uri.slope, t.binned=tt.binned[2:length(uri.binned)], 
+                 uri.spl=uri.spl, uri.der=uri.der, jump.left=jump.left,
+                 ntotal=ntotal))
+ }
+
+
+# change the scale of uri from based on t (percentage) to n (number of peaks or basepairs)
+# this is for plotting multiple pairwise URI's on the same plot 
+scale.t2n <- function(uri){
+
+  ntotal <- uri$ntotal
+  tv <- uri$tv*uri$ntotal
+  uri.uri <- uri$uri*uri$ntotal
+  jump.left <- uri$jump.left
+  uri.spl <- uri$uri.spl
+  uri.spl$x <- uri$uri.spl$x*uri$ntotal 
+  uri.spl$y <- uri$uri.spl$y*uri$ntotal
+
+  t.binned <- uri$t.binned*uri$ntotal
+  uri.slope <- uri$uri.slope
+  uri.der <- uri$uri.der
+  uri.der$x <- uri$uri.der$x*uri$ntotal
+  uri.der$y <- uri$uri.der$y
+
+  uri.n <- list(tv=tv, uri=uri.uri, t.binned=t.binned, uri.slope=uri.slope, uri.spl=uri.spl, uri.der=uri.der, ntotal=ntotal, jump.left=jump.left)
+  return(uri.n)
+} 
+
+
+
+
+# a wrapper for running URI for peaks from peak calling results
+# both data1 and data2 are calling results in narrowpeak format
+compute.pair.uri <- function(data.1, data.2, sig.value1="signal.value", sig.value2="signal.value", spline.df=NULL, overlap.ratio=0){
+
+  tt <- seq(0.01, 1, by=0.01)
+  vv <- tt
+
+  if(sig.value1=="signal.value"){
+    data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$signal.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
+  } else {
+    if(sig.value1=="p.value"){ 
+      data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$p.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
+    } else {
+      if(sig.value1=="q.value"){
+        data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$q.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
+      }
+    }
+  }
+
+  if(sig.value2=="signal.value"){
+    data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$signal.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
+  } else {
+    if(sig.value2=="p.value"){ 
+      data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$p.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
+    } else {
+      if(sig.value2=="q.value"){
+        data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$q.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
+      }
+    }
+  }
+
+  ### by peaks
+  # data12.enrich <- pair.peaks(data.1.enrich, data.2.enrich)
+  data12.enrich <- pair.peaks.filter(data.1.enrich, data.2.enrich, p.value.impute=0, overlap.ratio)
+  uri <- get.uri.2d(as.numeric(as.character(data12.enrich$merge1$sig.value)), as.numeric(as.character(data12.enrich$merge2$sig.value)), tt, vv, spline.df=spline.df)
+  uri.n <- scale.t2n(uri)
+
+  return(list(uri=uri, uri.n=uri.n, data12.enrich=data12.enrich, sig.value1=sig.value1, sig.value2=sig.value2))
+
+
+}
+
+
+
+# compute uri for matched sample
+get.uri.matched <- function(data12, df=10){
+
+  tt <- seq(0.01, 1, by=0.01)
+  vv <- tt
+  uri <- get.uri.2d(data12$sample1$sig.value, data12$sample2$sig.value, tt, vv, spline.df=df)
+
+  # change scale from t to n
+  uri.n <- scale.t2n(uri)
+
+  return(list(uri=uri, uri.n=uri.n))
+  
+}
+
+# map.uv is a pair of significant values corresponding to specified consistency FDR
+# assuming values in map.uv and qvalue are linearly related
+# data.set is the original data set
+# sig.value is the name of the significant value in map.uv, say enrichment
+# nominal.value is the one we want to map to, say q-value
+# 
+map.sig.value <- function(data.set, map.uv, nominal.value){
+
+  index.nominal <- which(names(data.set$merge1)==nominal.value)
+  nentry <- nrow(map.uv)  
+  map.nominal <- rbind(map.uv[, c("sig.value1", "sig.value2")])
+
+  for(i in 1:nentry){
+
+    map.nominal[i, "sig.value1"] <- data.set$merge1[unique(which.min(abs(data.set$merge1$sig.value-map.uv[i, "sig.value1"]))), index.nominal]
+    map.nominal[i, "sig.value2"] <- data.set$merge2[unique(which.min(abs(data.set$merge2$sig.value-map.uv[i, "sig.value2"]))), index.nominal]
+  }
+
+  invisible(map.nominal)
+}
+
+
+############### plot correspondence profile
+
+# plot multiple comparison wrt one template
+# uri.list contains the total number of peaks
+# plot.missing=F: not plot the missing points on the right 
+plot.uri.group <- function(uri.n.list, plot.dir, file.name=NULL, legend.txt, xlab.txt="num of significant peaks", ylab.txt="num of peaks in common", col.start=0, col.txt=NULL, plot.missing=F, title.txt=NULL){
+
+  if(is.null(col.txt))
+    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")
+
+  n <- length(uri.n.list)  
+
+  ntotal <- c()
+  for(i in 1:n)
+    ntotal[i] <- uri.n.list[[i]]$ntotal
+
+  jump.left <- c()
+  jump.left.der <- c()
+  ncommon <- c()
+  for(i in 1:n){
+#    jump.left[i]  <- which.max(uri.n.list[[i]]$uri[-1]-uri.n.list[[i]]$uri[-length(uri.n.list[[i]]$uri)])
+#    if(jump.left[i] < 6)
+#      jump.left[i] <- length(uri.n.list[[i]]$uri)
+
+##  reversed.index <- seq(length(uri.n.list[[i]]$tv[,1]), 1, by=-1)
+##  nequal <- sum(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1])
+##  temp  <- which(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1])[nequal]
+##  jump.left[i] <- length(uri.n.list[[i]]$tv[,1])-temp
+##print(uri.n.list[[i]]$uri)
+##print(uri.n.list[[i]]$tv[,1])
+##   jump.left[i] <- uri.n.list[[i]]$jump.left
+
+#    jump.left.der[i] <- sum(uri.n.list[[i]]$t.binned < uri.n.list[[i]]$uri.der$x[length(uri.n.list[[i]]$uri.der$x)])
+
+    jump.left[i] <- uri.n.list[[i]]$jump.left
+    jump.left.der[i] <- jump.left[i]
+    ncommon[i] <- uri.n.list[[i]]$tv[jump.left[i],1]
+  }
+
+
+  if(plot.missing){
+    max.peak <- max(ntotal)
+  } else {
+    max.peak <- max(ncommon)*1.05
+  }
+
+  if(!is.null(file.name)){
+    postscript(paste(plot.dir, "uri.", file.name, sep=""))
+    par(mfrow=c(1,1), mar=c(5,5,4,2))
+  }
+
+  plot(uri.n.list[[1]]$tv[,1], uri.n.list[[1]]$uri, type="n", xlab=xlab.txt, ylab=ylab.txt, xlim=c(0, max.peak), ylim=c(0, max.peak), cex.lab=2)
+
+  for(i in 1:n){
+
+    if(plot.missing){ 
+      points(uri.n.list[[i]]$tv[,1], uri.n.list[[i]]$uri, col=col.txt[i+col.start], cex=0.5 )
+    } else {
+      points(uri.n.list[[i]]$tv[1:jump.left[i],1], uri.n.list[[i]]$uri[1:jump.left[i]], col=col.txt[i+col.start], cex=0.5)
+    }
+    lines(uri.n.list[[i]]$uri.spl, col=col.txt[i+col.start], lwd=4)
+  }
+  abline(coef=c(0,1), lty=3)
+  legend(0, max.peak, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2)
+  if(!is.null(title))
+    title(title.txt)
+
+  if(!is.null(file.name)){
+    dev.off()
+  }
+
+  if(!is.null(file.name)){
+    postscript(paste(plot.dir, "duri.", file.name, sep=""))
+    par(mfrow=c(1,1), mar=c(5,5,4,2))
+  } 
+  plot(uri.n.list[[1]]$t.binned, uri.n.list[[1]]$uri.slope, type="n", xlab=xlab.txt, ylab="slope", xlim=c(0, max.peak), ylim=c(0, 1.5), cex.lab=2)
+
+  for(i in 1:n){
+#    if(plot.missing){ 
+#      points(uri.n.list[[i]]$t.binned, uri.n.list[[i]]$uri.slope, col=col.txt[i+col.start], cex=0.5)
+#    } else {
+#      points(uri.n.list[[i]]$t.binned[1:jump.left.der[i]], uri.n.list[[i]]$uri.slope[1:jump.left.der[i]], col=col.txt[i+col.start], cex=0.5)
+#    }
+    lines(uri.n.list[[i]]$uri.der, col=col.txt[i+col.start], lwd=4)
+  }
+  abline(h=1, lty=3)
+  legend(0.5*max.peak, 1.5, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2)
+
+  if(!is.null(title))
+    title(title.txt)
+
+  if(!is.null(file.name)){
+    dev.off()
+  }
+  
+}
+
+
+
+#######################
+####################### copula fitting for matched peaks
+#######################
+
+# estimation from mixed copula model 
+
+# 4-5-09
+# A nonparametric estimation of mixed copula model
+
+
+# updated
+
+# c1, c2, f1, f2, g1, g2 are vectors
+# c1*f1*g1 and c2*f2*g2 are copula densities for the two components
+# xd1 and yd1 are the values of marginals for the first component
+# xd2 and yd2 are the values of marginals for the 2nd component
+#
+# ez is the prob for being in the consistent group
+get.ez <- function(p, c1, c2, xd1, yd1, xd2, yd2){
+
+  return(p*c1*xd1*yd1/(p*c1*xd1*yd1 + (1-p)*c2*xd2*yd2))
+}
+
+# checked
+
+# this is C_12 not the copula density function c=C_12 * f1* f2
+# since nonparametric estimation is used here for f1 and f2, which
+# are constant throughout the iterations, we don't need them for optimization
+# 
+# bivariate gaussian copula function
+# t and s are vectors of same length, both are percentiles 
+# return a vector
+gaussian.cop.den <- function(t, s, rho){
+
+  A <- qnorm(t)^2 + qnorm(s)^2
+  B <- qnorm(t)*qnorm(s)
+
+  loglik <-  -log(1-rho^2)/2 - rho/(2*(1-rho^2))*(rho*A-2*B)
+
+  return(exp(loglik))
+}
+
+clayton.cop.den <- function(t, s, rho){
+
+  if(rho > 0)
+    return(exp(log(rho+1)-(rho+1)*(log(t)+log(s))-(2+1/rho)*log(t^(-rho) + s^(-rho)-1)))
+
+  if(rho==0)
+    return(1)
+
+  if(rho<0)
+    stop("Incorrect Clayton copula coefficient")
+  
+}
+
+
+# checked
+# estimate rho from Gaussian copula
+mle.gaussian.copula <- function(t, s, e.z){
+
+  # reparameterize to bound from rho=+-1
+  l.c <- function(rho, t, s, e.z){
+#    cat("rho=", rho, "\n")
+    sum(e.z*log(gaussian.cop.den(t, s, rho)))}
+
+  rho.max <- optimize(f=l.c, c(-0.998, 0.998), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z)
+
+#print(rho.max$m)
+
+#cat("cor=", cor(qnorm(t)*e.z, qnorm(s)*e.z), "\t", "rho.max=", rho.max$m, "\n")
+#  return(sign(rho.max$m)/(1+rho.max$m))
+  return(rho.max$m)
+}
+
+
+# estimate mle from Clayton copula, 
+mle.clayton.copula <- function(t, s, e.z){
+
+  l.c <- function(rho, t, s, e.z){
+    lc <- sum(e.z*log(clayton.cop.den(t, s, rho)))
+#    cat("rho=", rho, "\t", "l.c=", lc, "\n")
+    return(lc)
+  }
+
+  rho.max <- optimize(f=l.c, c(0.1, 20), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z)
+
+  return(rho.max$m)
+}
+
+
+
+# updated
+# mixture likelihood of two gaussian copula
+# nonparametric and ranked transformed
+loglik.2gaussian.copula <- function(x, y, p, rho1, rho2, x.mar, y.mar){
+ 
+  px.1 <- get.pdf.cdf(x, x.mar$f1)
+  px.2 <- get.pdf.cdf(x, x.mar$f2)
+  py.1 <- get.pdf.cdf(y, y.mar$f1)
+  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
+  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
+
+  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
+}
+
+loglik.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){
+
+  px.1 <- pdf.cdf$px.1
+  px.2 <- pdf.cdf$px.2
+  py.1 <- pdf.cdf$py.1
+  py.2 <- pdf.cdf$py.2
+
+  if(copula.txt=="gaussian"){
+    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
+    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
+  } else {
+    if(copula.txt=="clayton"){
+      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
+      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
+    }
+  }  
+  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
+}
+
+
+# estimate the marginals of each component using histogram estimator in EM
+# return the density, breaks, and cdf of the histogram estimator 
+est.mar.hist <- function(x, e.z, breaks){
+
+  binwidth <- c()
+  nbin <- length(breaks)-1
+  nx <- length(x) 
+
+  # the histogram
+  x1.pdf <- c()
+  x2.pdf <- c()
+  x1.cdf <- c()
+  x2.cdf <- c()
+
+  # the pdf for each point
+  x1.pdf.value <- rep(NA, nx)
+  x2.pdf.value <- rep(NA, nx)
+
+  x1.cdf.value <- rep(NA, nx)
+  x2.cdf.value <- rep(NA, nx) 
+
+  for(i in 1:nbin){
+
+    binwidth[i] <- breaks[i+1] - breaks[i]
+    if(i < nbin)
+      in.bin <- x>= breaks[i] & x < breaks[i+1]
+    else    # last bin
+      in.bin <- x>= breaks[i] & x <=breaks[i+1]
+
+    # each bin add one observation to avoid empty bins
+    # multiple (nx+nbin)/(nx+nbin+1) to avoid blowup when looking up for
+    # quantiles 
+    x1.pdf[i] <- (sum(e.z[in.bin])+1)/(sum(e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1)        
+    x2.pdf[i] <- (sum(1-e.z[in.bin])+1)/(sum(1-e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1) 
+
+
+#    x1.pdf[i] <- sum(e.z[in.bin])/sum(e.z)/binwidth[i]*nx/(nx+1)        
+#    x2.pdf[i] <- sum(1-e.z[in.bin])/sum(1-e.z)/binwidth[i]*nx/(nx+1) 
+    
+# treat each bin as a value for a discrete variable    
+#    x1.cdf[i] <- sum(x1.pdf[1:i]*binwidth[1:i])
+#    x2.cdf[i] <- sum(x2.pdf[1:i]*binwidth[1:i])
+
+
+    # cumulative density before reaching i
+    if(i>1){
+      x1.cdf[i] <- sum(x1.pdf[1:(i-1)]*binwidth[1:(i-1)])
+      x2.cdf[i] <- sum(x2.pdf[1:(i-1)]*binwidth[1:(i-1)])    
+    } else{
+      x1.cdf[i] <- 0
+      x2.cdf[i] <- 0
+    }
+
+    # make a vector of nx to store the values of pdf and cdf for each x
+    # this will speed up the computation dramatically
+    x1.pdf.value[in.bin] <- x1.pdf[i]
+    x2.pdf.value[in.bin] <- x2.pdf[i]
+
+    x1.cdf.value[in.bin] <- x1.cdf[i] + x1.pdf[i]*(x[in.bin]-breaks[i])
+    x2.cdf.value[in.bin] <- x2.cdf[i] + x2.pdf[i]*(x[in.bin]-breaks[i])      
+  }
+
+#  x1.cdf <- cumsum(x1.pdf*binwidth)
+#  x2.cdf <- cumsum(x2.pdf*binwidth)
+
+  f1 <-list(breaks=breaks, density=x1.pdf, cdf=x1.cdf)
+  f2 <-list(breaks=breaks, density=x2.pdf, cdf=x2.cdf)
+
+  f1.value <- list(pdf=x1.pdf.value, cdf=x1.cdf.value)
+  f2.value <- list(pdf=x2.pdf.value, cdf=x2.cdf.value)
+
+  return(list(f1=f1, f2=f2, f1.value=f1.value, f2.value=f2.value))
+}
+
+# estimate the marginal cdf from rank
+est.cdf.rank <- function(x, conf.z){
+
+  # add 1 to prevent blow up
+  x1.cdf <- rank(x[conf.z==1])/(length(x[conf.z==1])+1)
+
+  x2.cdf <- rank(x[conf.z==0])/(length(x[conf.z==0])+1)
+
+  return(list(cdf1=x1.cdf, cdf2=x2.cdf))
+}
+
+# df is a density function with fields: density, cdf and breaks, x is a scalar
+get.pdf <- function(x, df){
+
+  if(x < df$breaks[1])
+    cat("x is out of the range of df\n")
+
+  index <- which(df$breaks >= x)[1]
+
+  if(index==1)
+    index <- index +1
+  return(df$density[index-1])  
+}
+
+# get cdf from histgram estimator for a single value
+get.cdf <- function(x, df){
+
+  index <- which(df$breaks >= x)[1]
+  if(index==1)
+    index <- index +1
+  return(df$cdf[index-1])   
+}
+
+# df is a density function with fields: density, cdf and breaks
+get.pdf.cdf <- function(x.vec, df){
+
+  x.pdf <- sapply(x.vec, get.pdf, df=df)
+  x.cdf <- sapply(x.vec, get.cdf, df=df) 
+  return(list(cdf=x.cdf, pdf=x.pdf))
+}
+
+# E-step
+# x and y are the original observations or ranks
+# rho1 and rho2 are the parameters of each copula
+# f1, f2, g1, g2 are functions, each is a histogram 
+e.step.2gaussian <- function(x, y, p, rho1, rho2, x.mar, y.mar){
+
+  # get pdf and cdf of each component from functions in the corresponding component 
+  px.1 <- get.pdf.cdf(x, x.mar$f1)
+  px.2 <- get.pdf.cdf(x, x.mar$f2)
+  py.1 <- get.pdf.cdf(y, y.mar$f1)
+  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
+  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
+  
+  return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf))
+}
+
+# E-step
+# rho1 and rho2 are the parameters of each copula 
+e.step.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){
+
+  # get pdf and cdf of each component from functions in the corresponding component 
+  px.1 <- get.pdf.cdf(x, x.mar$f1)
+  px.2 <- get.pdf.cdf(x, x.mar$f2)
+  py.1 <- get.pdf.cdf(y, y.mar$f1)
+  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  if(copula.txt=="gaussian"){
+    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
+    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
+  } else {
+    if(copula.txt=="clayton"){
+      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
+      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
+    } 
+  }
+  return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf))
+}
+
+
+
+
+# M-step
+m.step.2gaussian <- function(x, y, e.z, breaks){
+
+  # compute f1, f2, g1 and g2
+  x.mar <- est.mar.hist(x, e.z, breaks)
+  y.mar <- est.mar.hist(y, e.z, breaks)  
+
+  px.1 <- get.pdf.cdf(x, x.mar$f1)
+  px.2 <- get.pdf.cdf(x, x.mar$f2)
+  py.1 <- get.pdf.cdf(y, y.mar$f1)
+  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)  
+  rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) 
+
+  p <- sum(e.z)/length(e.z) 
+
+  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar))
+}
+
+m.step.2copula <- function(x, y, e.z, breaks, copula.txt){
+
+  # compute f1, f2, g1 and g2
+  x.mar <- est.mar.hist(x, e.z, breaks)
+  y.mar <- est.mar.hist(y, e.z, breaks)  
+
+  px.1 <- get.pdf.cdf(x, x.mar$f1)
+  px.2 <- get.pdf.cdf(x, x.mar$f2)
+  py.1 <- get.pdf.cdf(y, y.mar$f1)
+  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  if(copula.txt=="gaussian"){
+    rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)  
+    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) 
+  } else {
+    if(copula.txt=="clayton"){
+      rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z)  
+      rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z)      
+    }
+  }
+  
+  p <- sum(e.z)/length(e.z) 
+
+  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar))
+}
+
+
+
+# E-step: pass values
+# x and y are the original observations or ranks
+# rho1 and rho2 are the parameters of each copula
+# f1, f2, g1, g2 are functions, each is a histogram 
+e.step.2gaussian.value <- function(x, y, p, rho1, rho2, pdf.cdf){
+
+  c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
+  c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)
+  
+  e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf, 
+               pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf)
+  return(e.z)
+}
+
+
+e.step.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){
+
+  if(copula.txt =="gaussian"){
+    c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
+    c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)
+  } else {
+    if(copula.txt =="clayton"){
+      c1 <- clayton.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
+      c2 <- clayton.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)      
+    }
+  }
+  
+  e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf, 
+               pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf)
+  return(e.z)
+}
+
+
+# M-step: pass values
+m.step.2gaussian.value <- function(x, y, e.z, breaks, fix.rho2){
+
+  # compute f1, f2, g1 and g2
+  x.mar <- est.mar.hist(x, e.z, breaks)
+  y.mar <- est.mar.hist(y, e.z, breaks)  
+
+#  px.1 <- get.pdf.cdf(x, x.mar$f1)
+#  px.2 <- get.pdf.cdf(x, x.mar$f2)
+#  py.1 <- get.pdf.cdf(y, y.mar$f1)
+#  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  px.1 <- x.mar$f1.value
+  px.2 <- x.mar$f2.value
+  py.1 <- y.mar$f1.value
+  py.2 <- y.mar$f2.value
+
+  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)  
+
+  if(!fix.rho2)
+    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) 
+  else
+    rho2 <- 0
+
+  p <- sum(e.z)/length(e.z) 
+
+  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)
+
+  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
+              pdf.cdf=pdf.cdf))
+}
+
+m.step.2gaussian.value2 <- function(x, y, e.z, breaks, fix.rho2, x.mar, y.mar){
+
+  # compute f1, f2, g1 and g2
+#  x.mar <- est.mar.hist(x, e.z, breaks)
+#  y.mar <- est.mar.hist(y, e.z, breaks)  
+
+#  px.1 <- get.pdf.cdf(x, x.mar$f1)
+#  px.2 <- get.pdf.cdf(x, x.mar$f2)
+#  py.1 <- get.pdf.cdf(y, y.mar$f1)
+#  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  px.1 <- x.mar$f1.value
+  px.2 <- x.mar$f2.value
+  py.1 <- y.mar$f1.value
+  py.2 <- y.mar$f2.value
+
+  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)  
+
+  if(!fix.rho2)
+    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) 
+  else
+    rho2 <- 0
+
+  p <- sum(e.z)/length(e.z) 
+
+  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)
+
+  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
+              pdf.cdf=pdf.cdf))
+}
+
+
+
+m.step.2copula.value <- function(x, y, e.z, breaks, fix.rho2, copula.txt){
+
+  # compute f1, f2, g1 and g2
+  x.mar <- est.mar.hist(x, e.z, breaks)
+  y.mar <- est.mar.hist(y, e.z, breaks)  
+
+#  px.1 <- get.pdf.cdf(x, x.mar$f1)
+#  px.2 <- get.pdf.cdf(x, x.mar$f2)
+#  py.1 <- get.pdf.cdf(y, y.mar$f1)
+#  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  px.1 <- x.mar$f1.value
+  px.2 <- x.mar$f2.value
+  py.1 <- y.mar$f1.value
+  py.2 <- y.mar$f2.value
+
+  if(copula.txt=="gaussian"){
+    rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)  
+    
+    if(!fix.rho2)
+      rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) 
+    else
+      rho2 <- 0
+  } else {
+
+    if(copula.txt=="clayton"){
+      rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z)  
+    
+      if(!fix.rho2)
+        rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z) 
+      else
+        rho2 <- 0
+    }    
+  }
+    
+  p <- sum(e.z)/length(e.z) 
+
+  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)
+
+  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
+              pdf.cdf=pdf.cdf))
+}
+
+
+
+
+# updated
+# mixture likelihood of two gaussian copula
+# nonparametric and ranked transformed
+loglik.2gaussian.copula.value <- function(x, y, p, rho1, rho2, pdf.cdf){
+
+  px.1 <- pdf.cdf$px.1
+  px.2 <- pdf.cdf$px.2
+  py.1 <- pdf.cdf$py.1
+  py.2 <- pdf.cdf$py.2
+
+  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
+  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
+
+  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
+}
+
+
+
+# updated
+# mixture likelihood of two gaussian copula
+# nonparametric and ranked transformed
+loglik.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){
+
+  px.1 <- pdf.cdf$px.1
+  px.2 <- pdf.cdf$px.2
+  py.1 <- pdf.cdf$py.1
+  py.2 <- pdf.cdf$py.2
+
+  if(copula.txt=="gaussian"){
+    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
+    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
+  } else {
+    if(copula.txt=="clayton"){
+      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
+      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
+    }
+  }
+
+  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
+}
+
+
+
+# EM for 2 Gaussian, speed up computation, unfinished
+
+em.2gaussian.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T){
+
+  x <- rank(x, tie="random")
+  y <- rank(y, tie="random")
+
+#  x <- rank(x, tie="average")
+#  y <- rank(y, tie="average")
+
+  # nbin=20
+  xy.min <- min(x, y)
+  xy.max <- max(x, y)
+  binwidth <- (xy.max-xy.min)/50
+  breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/50)
+#  breaks <- seq(xy.min, xy.max, by=binwidth)
+  
+
+  # initiate marginals 
+  # initialization: first p0 data has 
+#  e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes two components are overlapped
+
+  e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0)))
+
+  if(!stoc)
+    para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2)
+  else 
+    para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)
+
+
+  if(fix.p){
+    p <- p0
+  } else {
+    p <- para$p  
+  }
+
+  if(fix.rho2){
+    rho2 <- rho2.0
+  } else {
+    rho2 <- para$rho2
+  }
+
+#  rho1 <- 0.8
+  rho1 <- para$rho1
+
+  l0 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf)
+
+  loglik.trace <- c()
+  loglik.trace[1] <- l0
+#  loglik.trace[2] <- l1
+  to.run <- T
+
+  i <- 2
+
+  # this two lines to remove
+#  x.mar <- est.mar.hist(x, e.z, breaks)
+#  y.mar <- est.mar.hist(y, e.z, breaks)  
+  
+  while(to.run){
+
+    e.z <- e.step.2gaussian.value(x, y, p, rho1, rho2, para$pdf.cdf) 
+    if(!stoc)
+      para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2)
+    else
+      para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)
+
+    # fix x.mar and y.mar : to remove
+#    if(!stoc)
+#      para <- m.step.2gaussian.value2(x, y, e.z, breaks, fix.rho2, x.mar, y.mar)
+#    else
+#      para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)
+
+    
+    if(fix.p){
+      p <- p0
+    } else {
+      p <- para$p  
+    }
+
+    if(fix.rho2){
+      rho2 <- rho2.0
+    } else {
+      rho2 <- para$rho2
+    }
+
+#    rho1 <- 0.8
+    rho1 <- para$rho1
+
+  #  l0 <- l1
+    l1 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf)
+    loglik.trace[i] <- l1
+
+#cat("l1=", l1, "\n") 
+
+    # Aitken acceleration criterion
+    if(i > 2){
+      l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2])) 
+      to.run <- abs(l.inf - loglik.trace[i]) > eps 
+#cat("para=", "p=", para$p, " rho1=", rho1, " rho2=", rho2, "\n")
+#cat("l.inf=", l.inf, "\n")
+#cat(l.inf-loglik.trace[i], "\n")     
+    }
+
+    i <- i+1
+  }
+
+  bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # parameters
+  return(list(para=list(p=para$p, rho1=rho1, rho2=rho2), 
+              loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z, 
+              loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar,
+              breaks=breaks))
+}
+
+
+
+em.2copula.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T, copula.txt, nbin=50){
+
+  x <- rank(x, tie="random")
+  y <- rank(y, tie="random")
+
+#  x <- rank(x, tie="first")
+#  y <- rank(y, tie="first")
+
+  # nbin=50
+  xy.min <- min(x, y)
+  xy.max <- max(x, y)
+  binwidth <- (xy.max-xy.min)/50
+  breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/nbin)  
+#  breaks <- seq(xy.min, xy.max, by=binwidth)
+  
+  # initiate marginals 
+  # initialization: first p0 data has 
+#  e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes two components are overlapped
+
+  e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0)))
+
+
+  if(!stoc)
+    para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt)
+  else 
+    para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt)
+
+  if(fix.p){
+    p <- p0
+  } else {
+    p <- para$p  
+  }
+
+  if(fix.rho2){
+    rho2 <- rho2.0
+  } else {
+    rho2 <- para$rho2
+  }
+
+  l0 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
+
+  loglik.trace <- c()
+  loglik.trace[1] <- l0
+#  loglik.trace[2] <- l1
+  to.run <- T
+
+  i <- 2
+
+  while(to.run){
+
+    e.z <- e.step.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt) 
+    if(!stoc)
+      para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt)
+    else
+      para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt)
+
+    if(fix.p){
+      p <- p0
+    } else {
+      p <- para$p  
+    }
+
+    if(fix.rho2){
+      rho2 <- rho2.0
+    } else {
+      rho2 <- para$rho2
+    }
+
+
+  #  l0 <- l1
+    l1 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
+    loglik.trace[i] <- l1
+
+cat("l1=", l1, "\n") 
+
+    # Aitken acceleration criterion
+    if(i > 2){
+      l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2])) 
+      to.run <- abs(l.inf - loglik.trace[i]) > eps 
+cat("para=", "p=", para$p, " rho1=", para$rho1, " rho2=", rho2, "\n")
+#cat("l.inf=", l.inf, "\n")
+#cat(l.inf-loglik.trace[i], "\n")     
+    }
+
+    i <- i+1
+  }
+
+  bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # parameters
+  return(list(para=list(p=para$p, rho1=para$rho1, rho2=rho2), 
+              loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z, 
+              loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar,
+              breaks=breaks))
+}
+
+
+#######################
+####################### fit EM procedure for the matched peaks
+#######################
+
+# remove the unmatched ones
+#rm.unmatch <- function(sample1, sample2, p.value.impute=0){
+#
+#  sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
+#  sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
+# 
+#  invisible(list(sample1=sample1.prune$sig.value, sample2=sample2.prune$sig.value))
+#}
+
+
+# fit 2-component model
+#fit.em <- function(sample12, fix.rho2=T){
+#
+#  prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)
+#
+#  em.fit <- em.2gaussian.quick(-prune.sample$sample1, -prune.sample$sample2,
+# p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2)
+#
+#  invisible(list(em.fit=em.fit, data.pruned=prune.sample))
+#}
+
+
+rm.unmatch <- function(sample1, sample2, p.value.impute=0){
+
+  sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
+  sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
+ 
+  invisible(list(sample1=sample1.prune, sample2=sample2.prune))
+}
+
+
+# fit 2-component model
+fit.em <- function(sample12, fix.rho2=T){
+
+  prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)
+
+  em.fit <- em.2gaussian.quick(-prune.sample$sample1$sig.value, -prune.sample$sample2$sig.value,
+ p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2)
+
+  invisible(list(em.fit=em.fit, data.pruned=prune.sample))
+}
+
+
+
+fit.2copula.em <- function(sample12, fix.rho2=T, copula.txt){
+
+  prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)
+
+#  o <- order(prune.sample$sample1)
+#  n <- length(prune.sample$sample1)
+    
+#  para <- init(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, c(rep(0, round(n/3)), rep(c(0,1), round(n/6)), rep(1, n-round(n/3)-round(n/6))))
+
+#  temp <- init.dist(f0, f1)
+  para <- list()
+  para$rho <- 0.6
+  para$p <- 0.3
+  para$mu <- 2.5
+  para$sigma <- 1
+##  para$mu <- -temp$mu
+##  para$sigma <- temp$sigma
+#cat("mu=", para$mu, "sigma=", para$sigma, "\n")
+  
+#  em.fit <- em.transform.1loop(-prune.sample$sample1, -prune.sample$sample2,
+  cat("EM is running")
+  em.fit <- em.transform(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, para$mu, para$sigma, para$rho, para$p, eps=0.01)
+
+  invisible(list(em.fit=em.fit, data.pruned=prune.sample))
+}
+
+
+
+
+# fit 1-component model
+fit.1.component <- function(data.pruned, breaks){
+
+#  gaussian.1 <- fit.gaussian.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks)
+#  clayton.1 <- fit.clayton.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks)
+
+  gaussian.1 <- fit.gaussian.1(-data.pruned$sample1, -data.pruned$sample2, breaks)
+  clayton.1 <- fit.clayton.1(-data.pruned$sample1, -data.pruned$sample2, breaks)
+
+  return(list(gaussian.1=gaussian.1, clayton.1=clayton.1))
+}
+
+
+
+#################
+# Fit a single component  
+#################
+
+# a single gaussian copula
+# if breaks=NULL, use empirical pdf, otherwise use histogram estimate
+fit.gaussian.1 <- function(x, y, breaks=NULL){
+
+  # rank transformed and compute the empirical cdf
+  t <- emp.mar.cdf.rank(x)
+  s <- emp.mar.cdf.rank(y)
+
+  mle.rho <- mle.gaussian.copula(t, s, rep(1, length(t)))
+
+  c1 <- gaussian.cop.den(t, s, mle.rho)
+cat("c1", sum(log(c1)), "\n")
+
+  if(is.null(breaks)){
+    f1 <- emp.mar.pdf.rank(t)
+    f2 <- emp.mar.pdf.rank(s)
+  } else {
+    x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks)
+    y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks)
+
+    f1 <- x.mar$f1.value$pdf  # only one component
+    f2 <- y.mar$f1.value$pdf
+  }
+
+
+cat("f1", sum(log(f1)), "\n")
+cat("f2", sum(log(f2)), "\n")
+
+  loglik <- sum(log(c1)+log(f1)+log(f2))
+
+  bic <- -2*loglik + log(length(t))*(1+length(breaks)-1)
+
+  return(list(rho=mle.rho, loglik=loglik, bic=bic))
+}
+
+
+# a single Clayton copula
+fit.clayton.1 <- function(x, y, breaks=NULL){
+
+  # rank transformed and compute the empirical cdf
+  t <- emp.mar.cdf.rank(x)
+  s <- emp.mar.cdf.rank(y)
+
+  mle.rho <- mle.clayton.copula(t, s, rep(1, length(t)))
+
+  c1 <- clayton.cop.den(t, s, mle.rho)
+
+  if(is.null(breaks)){
+    f1 <- emp.mar.pdf.rank(t)
+    f2 <- emp.mar.pdf.rank(s)
+  } else {
+    x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks)
+    y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks)
+
+    f1 <- x.mar$f1.value$pdf  # only one component
+    f2 <- y.mar$f1.value$pdf
+  }
+
+  loglik <- sum(log(c1)+log(f1)+log(f2))
+
+  bic <- -2*loglik + log(length(t))*(1+length(breaks)-1)
+
+  return(list(rho=mle.rho, tau=rho/(rho+2), loglik=loglik, bic=bic)) 
+}
+
+## obsolete function (01-06-2010)
+## compute the average posterior probability to belong to the random component
+## for peaks selected at different cutoffs 
+comp.uri.ez <- function(tt, u, v, e.z){
+
+   u.t <- quantile(u, prob=(1-tt))
+   v.t <- quantile(v, prob=(1-tt))
+
+ #  ez <- mean(e.z[u >= u.t & v >=u.t]) Is this wrong?
+   ez <- mean(e.z[u >= u.t & v >=v.t])
+
+   return(ez)
+}
+
+## obsolete function (01-06-2010)
+# compute the largest posterior error probability corresponding to
+# the square centered at the origin and spanned top tt% on both coordinates
+# so the consistent low rank ones are excluded
+# boundary.txt: either "max" or "min", if it is error prob, use "max"
+comp.ez.cutoff <- function(tt, u, v, e.z, boundary.txt){
+
+   u.t <- quantile(u, prob=(1-tt))
+   v.t <- quantile(v, prob=(1-tt))
+
+   if(boundary.txt == "max"){
+ #    ez.bound <- max(e.z[u >= u.t & v >=u.t])
+     ez.bound <- max(e.z[u >= u.t & v >=v.t])
+   } else {
+ #    ez.bound <- min(e.z[u >= u.t & v >=u.t])
+     ez.bound <- min(e.z[u >= u.t & v >=v.t])     
+   }
+
+   return(ez.bound)
+
+}
+
+# obsolete functions: 01-06-2010
+# compute the error rate
+# u.t and v.t are the quantiles
+# this one is used for the plots generated initially in the brief writeup  
+# and it was used for processing merged data in July before the IDR definition
+# is formalized
+# It does not implement the current definition of IDR
+get.ez.tt.old  <- function(em.fit, reverse=T, fdr.level=c(0.01, 0.05, 0.1)){
+
+  u <- em.fit$data.pruned$sample1
+  v <- em.fit$data.pruned$sample2
+
+  tt <- seq(0.01, 0.99, by=0.01)
+  if(reverse){ 
+    e.z <-  1-em.fit$em.fit$e.z # this is the error prob
+    uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+    ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max") 
+  } else {
+    e.z <-  em.fit$em.fit$e.z
+    uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+    ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min") 
+  }
+
+  u.t <- quantile(u, prob=(1-tt))
+  v.t <- quantile(v, prob=(1-tt))  
+
+  # find the levels on the two replicates
+  sig.value1 <- c()
+  sig.value2 <- c()
+  error.prob.cutoff <- c()
+  n.selected.match <- c()
+
+  for(i in 1:length(fdr.level)){
+
+    # find which uri.ez is closet to fdr.level
+    index <- which.min(abs(uri.ez - fdr.level[i]))
+    sig.value1[i] <- u.t[index]
+    sig.value2[i] <- v.t[index]
+    error.prob.cutoff[i] <- ez.bound[index]  
+    if(reverse){
+      n.selected.match[i] <- sum(e.z<=ez.bound[index])    
+    } else {
+      n.selected.match[i] <- sum(e.z>=ez.bound[index])    
+    }
+  }   
+
+  # output the cutoff of posterior probability, signal values on two replicates
+  map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match)
+
+  return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, fdr.level=fdr.level,  map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+}
+
+# created: 01-06-2010
+# Output IDR at various number of selected peaks
+# Find cutoff (idr cutoff, sig.value cutoff on each replicate) for specified IDR level
+# IDR definition is similar to FDR
+get.ez.tt <- function(em.fit, idr.level=c(0.01, 0.05, 0.1)){
+
+#  u <- em.fit$data.pruned$sample1$sig.value
+#  v <- em.fit$data.pruned$sample2$sig.value
+  u <- em.fit$data.pruned$sample1
+  v <- em.fit$data.pruned$sample2
+  
+  e.z <-  1-em.fit$em.fit$e.z # this is the error prob
+  
+  o <- order(e.z)
+  e.z.ordered <- e.z[o]
+  n.select <- c(1:length(e.z))
+  IDR <- cumsum(e.z.ordered)/n.select
+
+  u.o <- u[o]
+  v.o <- v[o]
+
+  n.level <- length(idr.level)
+#  sig.value1 <- rep(NA, n.level)
+#  sig.value2 <- rep(NA, n.level)
+  ez.cutoff <- rep(NA, n.level)
+  n.selected <- rep(NA, n.level)
+  
+  for(i in 1:length(idr.level)){
+
+    # find which uri.ez is closet to fdr.level
+    index <- which.min(abs(IDR - idr.level[i]))
+#    sig.value1[i] <- min(u.o[1:index])
+#    sig.value2[i] <- min(v.o[1:index])
+    ez.cutoff[i] <- e.z[index]      
+    n.selected[i] <- sum(e.z<=ez.cutoff[i])    
+  }   
+
+  # output the cutoff of posterior probability, number of selected overlapped peaks 
+#  map.uv <- cbind(ez.cutoff, sig.value1, sig.value2, n.selected)
+
+  map.uv <- cbind(ez.cutoff, n.selected)
+
+  return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv))
+}   
+  
+#  return(list(n=tt*length(u), uri.ez=uri.ez,  fdr.level=fdr.level,  map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))  
+  
+
+
+
+
+### compute the mean of the marginals
+get.mar.mean <- function(em.out){
+
+  x.f1 <- em.out$x.mar$f1
+  x.f2 <- em.out$x.mar$f2
+
+  y.f1 <- em.out$y.mar$f1
+  y.f2 <- em.out$y.mar$f2
+
+  x.stat1 <- get.hist.mean(x.f1)
+  x.stat2 <- get.hist.mean(x.f2)
+  y.stat1 <- get.hist.mean(y.f1)
+  y.stat2 <- get.hist.mean(y.f2)
+
+  return(list(x.mean1=x.stat1$mean, x.mean2=x.stat2$mean, 
+              y.mean1=y.stat1$mean, y.mean2=y.stat2$mean,
+              x.sd1=x.stat1$sd, x.sd2=x.stat2$sd, 
+              y.sd1=y.stat1$sd, y.sd2=y.stat2$sd
+              ))
+
+}
+
+
+# compute the mean of marginals
+get.hist.mean  <- function(x.f){
+
+  nbreaks <- length(x.f$breaks)
+  x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks]
+
+  x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2
+  x.mean <- sum(x.mid*x.f$density*x.bin)
+  x.sd <- sqrt(sum(x.mid*x.mid*x.f$density*x.bin)-x.mean^2)
+  
+  return(list(mean=x.mean, sd=x.sd))
+}
+
+get.hist.var <- function(x.f){
+
+  nbreaks <- length(x.f$breaks)
+  x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks]
+
+  x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2
+  x.mean <- sum(x.mid*x.f$density*x.bin)
+
+  return(mean=x.mean)  
+}
+
+# obsolete function (01-06-2010)
+# plot 
+plot.ez.group.old <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks",  ylab.txt="avg posterior prob of being random", col.txt=NULL, title.txt=NULL){
+
+  if(is.null(col.txt))
+    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")
+
+  x <- c()
+  y <- c()
+
+  for(i in 1:length(ez.list)){
+    x <- c(x, ez.list[[i]]$n)
+      
+    y <- c(y, ez.list[[i]]$uri.ez)
+  }
+
+  if(is.null(y.lim))
+    y.lim <- c(0, max(y))
+
+  if(!is.null(file.name)){
+    postscript(paste(plot.dir, "ez.", file.name, sep=""))
+    par(mfrow=c(1,1), mar=c(5,5,4,2))
+  }
+
+  plot(x, y, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+
+  for(i in 1:length(ez.list)){
+    lines(ez.list[[i]]$n, ez.list[[i]]$uri.ez, col=col.txt[i], cex=2, lwd=5)    
+  }
+
+#   plot(ez.list[[1]]$u.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+#   plot(ez.list[[1]]$v.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+  
+
+  legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2)
+
+  if(!is.null(title))
+    title(title.txt)
+
+  if(!is.null(file.name)){
+    dev.off()
+  }
+  
+}
+
+
+plot.ez.group <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks",  ylab.txt="IDR", col.txt=NULL, title.txt=NULL){
+
+  if(is.null(col.txt))
+    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")
+
+  n.entry <- length(ez.list)
+  x <- rep(NA, n.entry)
+  y.max <- rep(NA, n.entry)
+
+  for(i in 1:n.entry){
+    x[i] <- max(ez.list[[i]]$n)
+      
+    y.max[i] <- max(ez.list[[i]]$IDR)
+  
+  }
+
+  if(is.null(y.lim))
+    y.lim <- c(0, max(y.max))
+
+  if(!is.null(file.name)){
+    postscript(paste(plot.dir, "ez.", file.name, sep=""))
+    par(mfrow=c(1,1), mar=c(5,5,4,2))
+  }
+
+
+  
+  plot(c(0, max(x)), y.lim, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+
+  q <- seq(0.01, 0.99, by=0.01)
+  
+  for(i in 1:length(ez.list)){
+
+    n.plot <- round(quantile(ez.list[[i]]$n, prob=q))
+    IDR.plot <- ez.list[[i]]$IDR[n.plot]
+    lines(n.plot, IDR.plot, col=col.txt[i], cex=2, lwd=5)    
+  }
+
+
+  legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2)
+
+  if(!is.null(title))
+    title(title.txt)
+
+  if(!is.null(file.name)){
+    dev.off()
+  }
+  
+}
+
+
+
+#############################################################################
+#############################################################################
+# statistics about peaks selected on the individual replicates
+#
+# idr.level: the consistency cutoff, say 0.05
+# uri.output: a list of uri.output from consistency analysis generated by batch-consistency-analysis.r
+# ez.list : a list of IDRs computed from get.ez.tt using the same idr.level
+#
+##################
+
+
+# obsolete?
+# compute the error rate
+# u.t and v.t are the quantiles
+# 
+# map back to all peaks and report the number of peaks selected
+get.ez.tt.all.old  <- function(em.fit, all.data1, all.data2, idr.level){
+
+  u <- em.fit$data.pruned$sample1
+  v <- em.fit$data.pruned$sample2
+
+  tt <- seq(0.01, 0.99, by=0.01)
+#  if(reverse){ 
+    e.z <-  1-em.fit$em.fit$e.z # this is the error prob
+    uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+    ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max") 
+#  } else {
+#    e.z <-  em.fit$em.fit$e.z
+#    uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+#    ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min") 
+#  }
+
+  u.t <- quantile(u, prob=(1-tt))
+  v.t <- quantile(v, prob=(1-tt))  
+
+  # find the levels on the two replicates
+  sig.value1 <- c()
+  sig.value2 <- c()
+  error.prob.cutoff <- c()
+  n.selected.match <- c()
+  npeak.rep1 <- c()
+  npeak.rep2 <- c()
+
+  for(i in 1:length(idr.level)){
+
+    # find which uri.ez is closet to idr.level
+    index <- which.min(abs(uri.ez - as.numeric(idr.level[i])))
+
+    sig.value1[i] <- u.t[index]
+    sig.value2[i] <- v.t[index]
+    error.prob.cutoff[i] <- ez.bound[index]  
+    n.selected.match[i] <- sum(u>= u.t[index] & v>=v.t[index])
+
+    npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i])
+    npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i])    
+  }   
+
+
+  # output the cutoff of posterior probability, signal values on two replicates
+  map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match, npeak.rep1, npeak.rep2)
+
+  return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, idr.level=idr.level,  map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+}
+
+
+get.ez.tt.all <- function(em.fit, all.data1, all.data2, idr.level=c(0.01, 0.05, 0.1)){
+
+  u <- em.fit$data.pruned$sample1$sig.value
+  v <- em.fit$data.pruned$sample2$sig.value
+#  u <- em.fit$data.pruned$sample1
+#  v <- em.fit$data.pruned$sample2
+  
+  e.z <-  1-em.fit$em.fit$e.z # this is the error prob
+  
+  o <- order(e.z)
+  e.z.ordered <- e.z[o]
+  n.select <- c(1:length(e.z))
+  IDR <- cumsum(e.z.ordered)/n.select
+
+  u.o <- u[o]
+  v.o <- v[o]
+
+  n.level <- length(idr.level)
+#  sig.value1 <- rep(NA, n.level)
+#  sig.value2 <- rep(NA, n.level)
+  ez.cutoff <- rep(NA, n.level)
+  n.selected <- rep(NA, n.level)
+  npeak.rep1 <- rep(NA, n.level)
+  npeak.rep2 <- rep(NA, n.level)
+  
+  for(i in 1:length(idr.level)){
+
+    # find which uri.ez is closet to fdr.level
+    index <- which.min(abs(IDR - idr.level[i]))
+#    sig.value1[i] <- min(u.o[1:index])
+#    sig.value2[i] <- min(v.o[1:index])
+    ez.cutoff[i] <- e.z.ordered[index]      # fixed on 02/20/10
+    n.selected[i] <- sum(e.z<=ez.cutoff[i])
+#    npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i])
+#    npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i])     
+  }   
+
+  # output the cutoff of posterior probability, number of selected overlapped peaks 
+  map.uv <- cbind(ez.cutoff, n.selected)
+
+  return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv))
+}   
+  
+#  return(list(n=tt*length(u), uri.ez=uri.ez,  fdr.level=fdr.level,  map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))  
+  
+
+
+
+
+
+####### the following is for determining thresholds for merged dataset
+
+############# select peaks above a given threshold
+#
+# pass.threshold: a simple method, passing the threshold on the threshold on the individual replicate to the pooled sample 
+#                 
+# sig.map.list: a list of matrix to include all the cutoff values, each row corresponds to a cutoff. The first column is idr.level
+#          the 2nd column is the cutoff of ez, the rest of columns are consistency analysis for other replicates 
+# sig.value.name: the name of the sig.value column
+# combined: combined dataset
+# nrep: number of pairs of comparisons
+#
+# Procedure:
+# 1. Find the significant threshold corresponding to the idr cutoff on the matched peaks. 
+# 2. Each time we will get two or more (if >2 replicates) cutoffs and will report the most stringent and the least stringent
+#    cutoff and the number of peaks selected at those two cutoffs
+#############
+
+pass.threshold <- function(sig.map.list, sig.value.name, combined, idr.level, nrep, chr.size){
+
+  sig.map <- c()
+
+  # choose idr.level
+  idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level)
+  if(length(i) ==0){
+    print("no level matches specified idr.level")
+    return(-1)
+  }
+
+  for(i in 1:length(sig.map.list))
+    sig.map <- c(sig.map, rbind(sig.map.list[[i]])[idr.index, c("sig.value1", "sig.value2")])
+  
+  
+  npeak.tight <- c()  
+  npeak.loose <- c()
+
+
+  max.sig <- max(sig.map)
+  min.sig <- min(sig.map)
+  selected.sig.tight <- combined[combined[,sig.value.name]>=max.sig, ]
+  selected.sig.loose <- combined[combined[,sig.value.name]>=min.sig, ]
+
+  selected.sig.tight <- deconcatenate.chr(selected.sig.tight, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+  selected.sig.loose <- deconcatenate.chr(selected.sig.loose, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+  
+  npeak.tight <- nrow(selected.sig.tight)
+  npeak.loose <- nrow(selected.sig.loose)
+  
+  
+  npeak.stat <- list(idr.level=idr.level, max.sig=max.sig, min.sig=min.sig, npeak.tight=npeak.tight, npeak.loose=npeak.loose)
+
+  invisible(list(npeak.stat=npeak.stat, combined.selected.tight=selected.sig.tight, combined.selected.loose=selected.sig.loose))  
+}
+
+#################
+# pass the regions selected from consistency analysis to combined data
+# Threshold is determined on the replicates, the regions above the threshold are selected
+# then peaks on the combined data are selected from the selected regions
+#
+# To avoid being too stringent, regions satisfying the following conditions are selected
+# 1. regions above the significant threshold determined by consistency analysis on either replicate
+# 2. regions that have consistent low peaks, i.e. posterior prob > threshold but not passing the significant threshold
+#
+# This method doesn't make a difference when using different thresholds
+#################
+
+pass.region <- function(sig.map.list, uri.output, ez.list, em.output, combined, idr.level, sig.value.impute=0, chr.size){
+ 
+  combined <- combined[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+  npair <- length(uri.output) # number of pairs of consistency analysis
+  combined.region <- c()
+
+  # choose idr.level
+  idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level)
+  if(length(idr.index) ==0){
+    print("no level matches specified idr.level")
+    return(-1)
+  }
+
+    for(j in 1:npair){
+      # select peaks from individual replicates using individual cutoff
+      above.1 <- uri.output[[j]]$data12.enrich$merge1["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value1"]
+      above.2 <- uri.output[[j]]$data12.enrich$merge1["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value2"]
+      selected.sig.rep1 <- uri.output[[j]]$data12.enrich$merge1[above.1, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+      selected.sig.rep2 <- uri.output[[j]]$data12.enrich$merge2[above.2, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] 
+      
+      # find the peaks that are overlapped with reliable peaks in the individual replicates
+      overlap.1 <- pair.peaks(selected.sig.rep1, combined)$merge2
+      overlap.2 <- pair.peaks(selected.sig.rep2, combined)$merge2
+
+      # choose the ones with significant value > 0, which are the overlapped ones
+
+      combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+      combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+
+      ## consistent low significant ones
+      ## first find consistenct ones, ie. high posterior prob
+      # is.consistent <- ez.list[[j]]$e.z < ez.list[[j]]$ez.cutoff 
+
+      # data.matched <- keep.match(uri.output[[j]]$data12.enrich$merge1[!above.1, ], uri.output[[j]]$data12.enrich$merge2[!above.2, ], sig.value.impute=0)
+      # data.matched$sample1 <- data.matched$sample1[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+      # data.matched$sample2 <- data.matched$sample2[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+
+      # consistent.in1 <- data.matched$sample1[is.consistent, ]
+      # consistent.in2 <- data.matched$sample2[is.consistent, ]
+
+      # overlap.consistent.1 <- pair.peaks(consistent.in1, combined)$merge2
+      # overlap.consistent.2 <- pair.peaks(consistent.in2, combined)$merge2
+
+      ## choose the ones with significant value > 0, which are the overlapped ones
+
+      # combined.consistent.in1 <- overlap.consistent.1[overlap.consistent.1$sig.value > sig.value.impute, ]
+      # combined.consistent.in2 <- overlap.consistent.2[overlap.consistent.2$sig.value > sig.value.impute, ]
+
+      # combined.region <- rbind(combined.region, combined.in1, combined.in2, combined.consistent.in1, combined.consistent.in2)
+
+       combined.region <- rbind(combined.region, combined.in1, combined.in2)
+
+      is.repeated <- duplicated(combined.region$start)
+      combined.region <- combined.region[!is.repeated, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+      
+    }
+    npeak <- nrow(combined.region)
+  
+   sig.combined <- c(min(combined.region[,"sig.value"], na.rm=T), max(combined.region[,"sig.value"], na.rm=T))
+
+  # idr.combined <- c(min(combined.region[,"q.value"], na.rm=T), max(combined.region[,"q.value"], na.rm=T))
+
+   npeak.stat <- list(idr.level=idr.level, npeak=npeak)
+
+   combined.region <- deconcatenate.chr(combined.region, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+
+  invisible(list(npeak.stat=npeak.stat, combined.selected=combined.region, sig.combined=sig.combined))
+}
+
+################
+# pass structure: this method does another round of inference on the combined data
+#
+# To make the mixture structure comparable on the replicates and the combined data, the 2nd inference is done on the peaks
+# at the reliable regions on the combined data, using rank transformed significant values. The mixture structure is estimated using my consistency analysis, which 
+# estimates marginal distributions of ranks using nonparametric ways. Then the significant values are found out.
+# There are several advantages to do it this way:  
+# 1. The premise of passing structure is that the means and variance (i.e. distribution) of two replicates should be the same
+#    The significant values on the two replicates clearly have different distributions. The structure estimated from consistency
+#    analysis will generate similar rank distribution on two replicates by its setup (i.e. same number of peaks are paired up).  
+# 2. Because pooled sample is a black box, the structure is more likely to be followed in the matched regions than other locations,
+#    after all, we don't know what other things are. If even the structure doesn't hold on the matched regions, 
+#    which is possible, let alone the other regions. Focusing on the reliable regions helps to get rid of those unknown noises.
+#  
+# 
+# modified on 2-20-10: reverse rank.combined, make big sig.value with small
+# ranks, to be consistent with f1 and f2
+################ 
+
+pass.structure <- function(uri.output, em.output, combined, idr.level, sig.value.impute, chr.size, overlap.ratio=0){
+
+  columns.keep <- c("sig.value", "start", "stop", "signal.value", "p.value", "q.value", "chr", "start.ori", "stop.ori")
+  combined <- combined[, columns.keep]
+  combined.selected.all <- c()
+
+  for(j in 1:npair){
+
+    sample1 <- uri.output[[j]]$data12.enrich$merge1[, columns.keep]
+    sample2 <- uri.output[[j]]$data12.enrich$merge2[, columns.keep]
+        
+    # find peaks on the matched region on the combined one
+    data.matched <- keep.match(sample1, sample2, sig.value.impute=sig.value.impute)
+
+    data.matched$sample1 <- data.matched$sample1[, columns.keep]
+    data.matched$sample2 <- data.matched$sample2[, columns.keep]
+
+    overlap.1 <- pair.peaks.filter(data.matched$sample1, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2
+    overlap.2 <- pair.peaks.filter(data.matched$sample2, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2
+
+    # choose the ones with significant value > sig.value.impute, which are the overlapped ones
+
+    combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, ]
+    combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, ]
+
+    combined.region <- rbind(combined.in1, combined.in2)
+  
+    is.repeated <- duplicated(combined.region$start)
+    combined.region <- combined.region[!is.repeated,]
+
+    # now rank the peaks in matched region
+    rank.combined <- rank(-combined.region$sig.value)
+    
+    # now transform the parameters estimated into the new scale
+    npeaks.overlap <- nrow(combined.region)
+    npeaks.consistent <- nrow(cbind(em.output[[j]]$data.pruned$sample1))
+
+    
+    # the breaks are the same for x and y
+    f1 <- list(breaks=em.output[[j]]$em.fit$x.mar$f1$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f1$density+em.output[[j]]$em.fit$y.mar$f1$density)/2)
+    # the first break boundary goes up when changing scale, need set it back to be a bit smaller than 1
+    f1$breaks[1] <- min(f1$breaks[1], 0.95)
+    
+    f2 <- list(breaks=em.output[[j]]$em.fit$x.mar$f2$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f2$density+em.output[[j]]$em.fit$y.mar$f2$density)/2)
+    # the first break boundary goes up when changing scale, need set it back to be a bit smaller than 1
+    f2$breaks[1] <- min(f2$breaks[1], 0.95)
+    
+    p <- em.output[[j]]$em.fit$para$p
+ 
+    # find the posterior probability
+    errorprob.combined <- get.comp2.prob(rank.combined, p, f1, f2)
+
+    # compute the FDR and find cutoff of posterior prob and the sig value
+    o <- order(errorprob.combined)
+    idr <- cumsum(errorprob.combined[o])/c(1:length(o))
+    idr.index <- which(idr > idr.level)[1]
+    errorprob.cutoff <- errorprob.combined[o][idr.index]
+
+    # find the minimum significant measure among selected peaks
+    sig.value <- min(combined.region$sig.value[o][1:idr.index])
+  #  sig.value <- quantile(combined.region$sig.value[o][1:idr.index], prob=0.05)
+#sig.value <- quantile(combined.region$sig.value[errorprob.combined<=errorprob.cutoff], prob=0.05)
+    
+    # apply the significant value on the whole pooled list
+    combined.selected <- combined[combined$sig.value >= sig.value,]
+
+    combined.selected.all <- rbind(combined.selected.all, combined.selected)
+  }
+
+  is.repeated <- duplicated(combined.selected.all$start)
+  combined.selected.all <- combined.selected.all[!is.repeated,]
+
+  npeak <- nrow(combined.selected.all)  
+  
+  npeak.stat <- list(idr.level=idr.level, npeak=npeak)
+
+  sig.combined <- c(min(combined.selected.all[,"sig.value"], na.rm=T), max(combined.selected.all[,"sig.value"], na.rm=T))
+
+ #  idr.combined <- c(min(combined.selected.all[,"q.value"], na.rm=T), max(combined.selected.all[,"q.value"], na.rm=T))
+ # combined.selected.all <- deconcatenate.chr(combined.selected.all, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+
+  combined.selected.all <- combined.selected.all[,  c("chr", "start.ori", "stop.ori", "signal.value", "p.value", "q.value")]
+  colnames(combined.selected.all) <- c("chr", "start", "stop", "signal.value", "p.value", "q.value")
+  
+  invisible(list(npeak.stat=npeak.stat, combined.selected=combined.selected.all, sig.combined=sig.combined))
+}
+
+
+
+# get the posterior probability of the 2nd component
+get.comp2.prob <- function(x, p, f1, f2){
+
+  # get pdf and cdf of each component from functions in the corresponding component 
+  px.1 <- sapply(x, get.pdf, df=f1)
+  px.2 <- sapply(x, get.pdf, df=f2)
+
+  comp2prob <- 1 - p*px.1/(p*px.1+(1-p)*px.2)
+  
+  return(comp2prob)
+}
+
+keep.match <- function(sample1, sample2, sig.value.impute=0){
+
+  sample1.prune <- sample1[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,]
+  sample2.prune <- sample2[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,]
+ 
+  invisible(list(sample1=sample1.prune, sample2=sample2.prune))
+}
+
+
+##############################################
+#
+# The following is for simulation
+#
+##############################################
+
+
+# simulate gaussian copula
+# u is the uniform random variable and rho is correlation coefficient 
+simu.gaussian.copula <- function(u, rho){
+
+  n <- length(u)
+
+  # simulate y given x=qnorm(u)
+  y <- qnorm(u)*rho + rnorm(n)*sqrt(1-rho^2)
+
+  v <- pnorm(y)
+
+  invisible(v)
+}
+
+## simulate Clayton copula from its generating function
+## Genest and MacKay (1986)
+
+phi.ori <- function(t, s){
+
+ (t^(-s) -1)/s
+}
+
+
+phi.inv <- function(y, s){
+
+  exp(-log(s*y+1)/s)
+}
+
+phi.der <- function(t, s){
+
+  -t^(-s-1)
+}
+
+phi.der.inv <- function(y, s){
+
+  exp(log(-y)/(-s-1))
+}
+
+get.w <- function(u, t, s){
+
+  phi.der.inv(phi.der(u, s)/t, s)
+} 
+
+get.v <- function(w, u, s){
+
+  phi.inv(phi.ori(w, s) - phi.ori(u, s), s) 
+}
+
+# u is a uniform random variable, s is the association parameter
+simu.clayton.copula <- function(u, s){
+
+  t <- runif(length(u))
+
+  if(s>0){
+    w <- get.w(u, t, s)
+    v <- get.v(w, u, s)
+    return(v)    
+  }
+
+  if(s==0){
+    return(t)
+  }
+
+  if(s <0){
+    print("Invalid association parameters for clayton copula")
+  }
+  
+}
+
+
+
+###### 09-09-09
+
+# simulate a two-component copula mixture:
+# - marginal distributions for the two variables in each component are both 
+#   normal and with the same parameters 
+# p is the mixing proportion of component 1
+# n is the total sample size
+simu.copula.2mix <- function(s1, s2, p, n, mu1, mu2, sd1, sd2, copula.txt){
+
+  n1 <- round(n*p)
+  n2 <- n-n1
+
+  u1 <- runif(n1)
+  
+  if(copula.txt =="clayton")
+    v1 <- simu.clayton.copula(u1, s1)
+  else{
+    if(copula.txt =="gaussian")
+      v1 <- simu.gaussian.copula(u1, s1)
+  }
+
+  u2 <- runif(n2)
+
+  if(copula.txt =="clayton")
+    v2 <- simu.clayton.copula(u2, s2)
+  else{
+    if(copula.txt =="gaussian")
+      v2 <- simu.gaussian.copula(u2, s2)
+  }
+
+  # generate test statistics
+  sample1.1 <- qnorm(u1, mu1, sd1)
+  sample1.2 <- qnorm(v1, mu1, sd1)
+
+  sample2.1 <- qnorm(u2, mu2, sd2)
+  sample2.2 <- qnorm(v2, mu2, sd2)
+
+  return(list(u=c(u1, u2), v=c(v1, v2), 
+              u.inv=c(sample1.1, sample2.1), v.inv=c(sample1.2, sample2.2),
+              label=c(rep(1, n1), rep(2, n2))))
+}
+
+# using inverse of the cdf to generate original observations 
+
+simu.copula.2mix.inv <- function(s1, s2, p, n, cdf1.x, cdf1.y, cdf2.x, cdf2.y, copula.txt){
+
+  n1 <- round(n*p)
+  n2 <- n-n1
+
+  u1 <- runif(n1)
+  
+  if(copula.txt =="clayton")
+    v1 <- simu.clayton.copula(u1, s1)
+  else{
+    if(copula.txt =="gaussian")
+      v1 <- simu.gaussian.copula(u1, s1)
+  }
+
+  u2 <- runif(n2)
+
+  if(copula.txt =="clayton")
+    v2 <- simu.clayton.copula(u2, s2)
+  else{
+    if(copula.txt =="gaussian")
+      v2 <- simu.gaussian.copula(u2, s2)
+  }
+
+  # generate test statistics
+#  sample1.1 <- qnorm(u1, mu1, sd1)
+#  sample1.2 <- qnorm(v1, mu1, sd1)
+
+#  sample2.1 <- qnorm(u2, mu2, sd2)
+#  sample2.2 <- qnorm(v2, mu2, sd2)
+  
+  sample1.x <- inv.cdf.vec(u1, cdf1.x)
+  sample1.y <- inv.cdf.vec(v1, cdf1.y)
+
+  sample2.x <- inv.cdf.vec(u2, cdf2.x)
+  sample2.y <- inv.cdf.vec(v2, cdf2.y)
+  
+  
+  return(list(u=c(u1, u2), v=c(v1, v2), 
+              u.inv=c(sample1.x, sample2.x), v.inv=c(sample1.y, sample2.y),
+              label=c(rep(1, n1), rep(2, n2))))
+}
+
+# obtain original observation by converting cdf into quantiles
+# u is one cdf
+# u.cdf is a cdf (assuming it is a histogram) and has the break points (cdf$cdf and cdf$breaks)
+# the smallest value of cdf=0 and the largest =1 
+inv.cdf <- function(u, u.cdf){
+
+  # which bin it falls into
+  i <- which(u.cdf$cdf> u)[1]
+  q.u  <- (u - u.cdf$cdf[i-1])/(u.cdf$cdf[i] - u.cdf$cdf[i-1])* (u.cdf$breaks[i]-u.cdf$breaks[i-1]) + u.cdf$breaks[i-1]
+
+  return(q.u)
+}
+
+inv.cdf.vec <- function(u, u.cdf){
+
+  # check if cdf has the right range (0, 1)  
+  ncdf <- length(u.cdf$cdf)
+  nbreaks <- length(u.cdf$breaks)
+  
+  if(ncdf == nbreaks-1 & u.cdf$cdf[ncdf]< 1)
+    u.cdf[ncdf] <- 1
+    
+  q.u <- sapply(u, inv.cdf, u.cdf)
+
+  return(q.u) 
+}
+
+# here we simulate a likely real situation
+# the test statistics from two normal distributions
+# according to their labels, then convert them into p-values w.r.t H0 using
+# one-sided test.
+# The test statistics are correlated for the signal component and independent
+# for the noise component
+# For the signal component, Y = X + eps, where eps ~ N(0, sigma^2)
+simu.test.stat <- function(p, n, mu1, sd1, mu0, sd0, sd.e){
+
+  # first component - signal
+  n.signal <- round(n*p)
+  n.noise <- n - n.signal
+
+  # labels
+  labels <- c(rep(1, n.signal), rep(0, n.noise))
+  
+  # test statistics for signal and noise
+  mu.signal <- rnorm(n.signal, mu1, sd1)
+  x.signal <- mu.signal + rnorm(n.signal, 0, sd.e)
+  x.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e)
+
+  y.signal <- mu.signal + rnorm(n.signal, 0, sd.e)
+                                        # sd.e can be dependent on signal
+  y.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e)
+
+  # concatenate
+  x <- c(x.signal, x.noise)
+  y <- c(y.signal, y.noise)
+  
+  # convert to p-values based on H0
+  p.x <- 1-pnorm(x, mu0, sqrt(sd0^2+sd.e^2))
+  p.y <- 1-pnorm(y, mu0, sqrt(sd0^2+sd.e^2))
+
+  return(list(p.x=p.x, p.y=p.y, x=x, y=y, labels=labels))
+  
+}
+
+# compute the tradeoff and calibration
+forward.decoy.tradeoff.ndecoy <- function(xx, labels, ndecoy){
+
+  xx <- round(xx, 5) 
+  o <- order(xx, decreasing=T)
+
+  rand <- 1-labels # if rand==0, consistent
+  # order the random indicator in the same order
+  rand.o <- rand[o]
+
+  if(sum(rand.o) > ndecoy){
+    index.decoy <- which(cumsum(rand.o)==ndecoy)
+  } else {
+    index.decoy <- which(cumsum(rand.o)==sum(rand.o))
+  }
+  
+  cutoff.decoy <- xx[o][index.decoy]
+    
+  # only consider the unique ones
+  cutoff.unique <- unique(xx[o])
+
+  cutoff <- cutoff.unique[cutoff.unique >= cutoff.decoy[length(cutoff.decoy)]]
+
+   get.decoy.count <- function(cut.off){
+     above <- rep(0, length(xx))
+     above[xx >= cut.off] <- 1
+     decoy.count <- sum(above==1 & rand==1)     
+     return(decoy.count)
+   }
+
+   get.forward.count <- function(cut.off){
+     above <- rep(0, length(xx))
+     above[xx >= cut.off] <- 1
+     forward.count <- sum(above==1 & rand==0)
+     return(forward.count)
+   }
+
+   get.est.fdr <- function(cut.off){
+     above <- rep(0, length(xx))
+     above[xx >= cut.off] <- 1
+     est.fdr <- 1-mean(xx[above==1])
+     return(est.fdr)
+   }
+
+  # assuming rand=0 is right
+   get.false.neg.count <- function(cut.off){
+     below <- rep(0, length(xx))
+     below[xx < cut.off] <- 1
+     false.neg.count <- sum(below==1 & rand==0)
+     return(false.neg.count)
+   }
+
+  get.false.pos.count <- function(cut.off){
+     above <- rep(0, length(xx))
+     above[xx >= cut.off] <- 1
+     false.pos.count <- sum(above==1 & rand==1)
+     return(false.pos.count)
+   } 
+
+   decoy <- sapply(cutoff, get.decoy.count)
+   forward <- sapply(cutoff, get.forward.count)
+
+   est.fdr <- sapply(cutoff, get.est.fdr)
+   emp.fdr <- decoy/(decoy+forward)
+  
+   # compute specificity and sensitivity
+   # assuming rand=1 is wrong and rand=0 is right
+   false.neg <- sapply(cutoff, get.false.neg.count)
+   false.pos <- sapply(cutoff, get.false.pos.count)
+  
+   true.pos <- sum(rand==0)-false.neg
+   true.neg <- sum(rand==1)-false.pos
+  
+   sensitivity <- true.pos/(true.pos+false.neg)
+   specificity <- true.neg/(true.neg+false.pos)
+  
+   return(list(decoy=decoy, forward=forward, cutoff=cutoff, est.fdr=est.fdr, emp.fdr=emp.fdr, sensitivity=sensitivity, specificity=specificity))  
+}
+
+
+# compute the em for jackknife and all data, and find FDR
+get.emp.jack <- function(a, p0){
+
+  nobs <- length(a$labels)
+  est <- list()
+  est.all <- list()
+
+  temp.all <- em.transform(-a$p.x, -a$p.y, mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
+#  temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7,
+#      rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+
+  est.all$p <- temp.all$para$p
+  est.all$rho1 <- temp.all$para$rho1
+  est.all$FDR <- get.FDR(temp.all$e.z)
+
+  FDR <- list()
+  p <- c()
+  rho1 <- c()
+
+
+  for(i in 1:nobs){
+
+    temp <- em.transform(-a$p.x[-i], -a$p.y[-i], mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)    
+#    temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7,
+#      rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+
+    est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z))
+
+    FDR[[i]] <- est[[i]]$FDR # this is the FDR for top n peaks
+    p[i] <- est[[i]]$p
+    rho1[i] <- est[[i]]$rho1 
+  }
+
+  est.jack <- list(FDR=FDR, p=p, rho1=rho1) 
+  return(list(est.jack=est.jack, est.all=est.all))
+}
+
+
+# get the npeaks corresponding to the nominal FDR estimated from the sample
+# and find the corresponding FDR from the entire data
+get.FDR.jack <- function(est, FDR.nominal){
+  
+  nobs <- length(est$est.jack$FDR)
+  FDR.all <- c()  
+  top.n <- c()
+  
+  for(i in 1:nobs){
+    top.n[i] <- max(which(est$est.jack$FDR[[i]] <= FDR.nominal))
+    FDR.all[i] <- est$est.all$FDR[top.n[i]]
+  }
+  
+  invisible(list(FDR.all=FDR.all, top.n=top.n))
+}
+
+# compute Jackknife peudonumber
+# a is the dataset
+get.emp.IF <- function(a, p0){
+
+  nobs <- length(a$labels)
+  est <- list()
+  est.all <- list()
+
+  temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7,
+      rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+
+  est.all$p <- temp.all$para$p
+  est.all$rho1 <- temp.all$para$rho1
+  est.all$FDR <- get.FDR(temp.all$e.z)
+
+  IF.FDR <- list()
+  IF.p <- c()
+  IF.rho1 <- c()
+  
+  for(i in 1:nobs){
+    
+    temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7,
+      rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+
+    est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z))
+
+    IF.FDR[[i]] <- (nobs-1)*(est.all$FDR[-nobs] - est[[i]]$FDR) # this is the FDR for top n peaks
+    IF.p[i] <- (nobs-1)*(est.all$p - est[[i]]$p)
+    IF.rho1[i] <- (nobs-1)*(est.all$rho1 - est[[i]]$rho1) 
+  }
+
+  emp.IF <- list(FDR=IF.FDR, p=IF.p, rho1=IF.rho1) 
+
+  invisible(list(emp.IF=emp.IF, est.all=est.all, est=est))
+}
+
+# e.z is the posterior probability of being in signal component
+get.FDR <- function(e.z){
+
+  e.z.o <- order(1-e.z)
+  FDR <- cumsum(1-e.z[e.z.o])/c(1:length(e.z.o))
+
+  invisible(FDR)
+}
+
+# get the FDR of selecting the top n peaks
+# IF.est is the sample influence function
+# top.n
+get.IF.FDR <- function(IF.est, top.n){
+
+  nobs <- length(IF.est$emp.IF$FDR)
+  FDR <- c()  
+  
+  # influence function of p
+  for(i in 1:nobs)
+    FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n]
+  
+  invisible(FDR)
+}
+
+# get the sample influence function for FDR at a given FDR size
+# 1. find the number of peaks selected at a given FDR computed from all obs
+# 2. use the number to find the sample influence function for FDR
+# IF.est$est.all is the FDR with all peaks
+get.IF.FDR.all <- function(IF.est, FDR.size){
+
+  top.n <- which.min(abs(IF.est$est.all$FDR -FDR.size))
+  nobs <- length(IF.est$est.all$FDR)
+  FDR <- c()  
+  
+  # influence function of p
+  for(i in 1:nobs)
+    FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n]
+  
+  invisible(list(FDR=FDR, top.n=top.n))
+}
+
+plot.simu.uri <- function(x, y){
+
+  tt <- seq(0.01, 0.99, by=0.01)
+  uri <- sapply(tt, comp.uri.prob, u=x, v=y)
+  uri.thin <- uri[seq(1, length(tt), by=3)]
+  tt.thin <- tt[seq(1, length(tt), by=3)]
+  duri <- (uri.thin[-1]-uri.thin[-length(uri.thin)])/(tt.thin[-1]-tt.thin[-length(tt.thin)])
+  uri.spl <- smooth.spline(tt, uri, df=6.4)
+  uri.der <- predict(uri.spl, tt, deriv=1)
+
+  par(mfrow=c(2,2))
+  plot(x[1:n0], y[1:n0])
+  points(x[(n0+1):n], y[(n0+1):n], col=2)
+  plot(rank(-x)[1:n0], rank(-y)[1:n0])
+  points(rank(-x)[(1+n0):n], rank(-y)[(1+n0):n])
+  plot(tt, uri)
+  lines(c(0,1), c(0,1), lty=2)
+  title(paste("rho1=", rho1, " rho2=", rho2, "p=", p, sep=""))
+  plot(tt.thin[-1], duri)
+  lines(uri.der)
+  abline(h=1)
+  invisible(list(x=x, y=y, uri=uri, tt=tt, duri=duri, tt.thin=tt.thin, uri.der=uri.der))
+
+}
+
+
+###### new fitting procedure
+
+
+
+
+# 1. rank pairs
+
+# 2. initialization
+# 3. convert to pseudo-number
+
+# 4. EM
+
+# need plugin and test
+# find the middle point between the bins
+get.pseudo.mix <- function(x, mu, sigma, rho, p){
+
+  
+  # first compute cdf for points on the grid
+  # generate 200 points between [-3, mu+3*sigma]
+  nw <- 1000
+  w <- seq(min(-3, mu-3*sigma), max(mu+3*sigma, 3), length=nw) 
+  w.cdf <- p*pnorm(w, mean=mu, sd=sigma) + (1-p)*pnorm(w, mean=0, sd=1)
+
+  i <- 1
+
+  quan.x <- rep(NA, length(x))
+
+  for(i in c(1:nw)){
+    index <- which(x >= w.cdf[i] & x < w.cdf[i+1])
+    quan.x[index] <- (x[index]-w.cdf[i])*(w[i+1]-w[i])/(w.cdf[i+1]-w.cdf[i]) +w[i]
+  }
+
+  index <- which(x < w.cdf[1])
+  if(length(index)>0)
+    quan.x[index] <- w[1]
+
+  index <- which(x > w.cdf[nw])
+  if(length(index)>0)
+    quan.x[index] <- w[nw]  
+  
+#  linear.ext <- function(x, w, w.cdf){
+  # linear interpolation
+#    index.up <- which(w.cdf>= x)[1]
+#    left.index <- which(w.cdf <=x)
+#    index.down <- left.index[length(left.index)]
+#    quan.x <- (w[index.up] + w[index.down])/2  
+#  }
+  
+#  x.pseudo <- sapply(x, linear.ext, w=w, w.cdf=w.cdf)
+
+#  invisible(x.pseudo)
+  invisible(quan.x)
+}
+
+
+# EM to compute the latent structure
+# steps:
+# 1. raw values are first transformed into pseudovalues
+# 2. EM is used to compute the underlining structure, which is a mixture
+#    of two normals
+em.transform <- function(x, y, mu, sigma, rho, p, eps){
+  
+  x.cdf.func <- ecdf(x)
+  y.cdf.func <- ecdf(y)
+  afactor <- length(x)/(length(x)+1)
+  x.cdf <- x.cdf.func(x)*afactor
+  y.cdf <- y.cdf.func(y)*afactor
+  
+  # initialization
+  para <- list()
+  para$mu <- mu
+  para$sigma <- sigma
+  para$rho <- rho
+  para$p <- p  
+
+  j <- 1
+  to.run <- T
+  loglik.trace <- c()
+  loglik.inner.trace <- c()
+  
+  #to.run.inner <- T
+  z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+  z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+
+#  cat("length(z1)", length(z.1), "\n")
+  while(to.run){
+    
+    # get pseudo value in each cycle
+#    z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+#    z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+
+    i <- 1
+    while(to.run){
+      
+      # EM for latent structure
+      e.z <- e.step.2normal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+      para <- m.step.2normal(z.1, z.2, e.z)
+#para$rho <- rho
+#para$p <- p    
+#para$mu <- mu
+#para$sigma <- sigma    
+      if(i > 1)
+        l.old <- l.new
+    
+      # this is just the mixture likelihood of two-component Gaussian
+      l.new <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+
+      loglik.inner.trace[i] <- l.new 
+
+      if(i > 1){
+        to.run <- loglik.inner.trace[i]-loglik.inner.trace[i-1]>eps         
+      }
+        
+    
+#      if(i > 2){
+#        l.inf <- loglik.inner.trace[i-2] + (loglik.inner.trace[i-1] - loglik.inner.trace[i-2])/(1-(loglik.inner.trace[i]-loglik.inner.trace[i-1])/(loglik.inner.trace[i-1]-loglik.inner.trace[i-2]))
+
+#        if(loglik.inner.trace[i-1]!=loglik.inner.trace[i-2])
+#          to.run <- abs(l.inf - loglik.inner.trace[i]) > eps
+#        else
+#          to.run <- F
+          
+#      }
+
+      cat("loglik.inner.trace[", i, "]=", loglik.inner.trace[i], "\n")
+    cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n\n")
+      
+      i <- i+1
+    }
+    
+
+    # get pseudo value in each cycle
+    z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+    z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+
+    if(j > 1)
+      l.old.outer <- l.new.outer
+
+    l.new.outer <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+
+    loglik.trace[j] <- l.new.outer
+    
+    if(j == 1)
+      to.run <- T
+    else{ # stop when iteration>100
+      if(j > 100)
+        to.run <- F
+      else
+        to.run <- l.new.outer - l.old.outer > eps
+    }
+
+#    if(j %% 10==0)
+      cat("loglik.trace[", j, "]=", loglik.trace[j], "\n")
+    cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n")
+    
+    j <- j+1
+  }
+
+  bic <- -2*l.new + 4*log(length(z.1))
+  
+  return(list(para=list(p=para$p, rho=para$rho, mu=para$mu, sigma=para$sigma),
+              loglik=l.new, bic=bic, e.z=e.z, loglik.trace=loglik.trace))
+}  
+
+
+
+
+# compute log-likelihood for mixture of two bivariate normals
+loglik.2binormal <- function(z.1, z.2, mu, sigma, rho, p){
+
+  l.m <- sum(d.binormal(z.1, z.2, 0, 1, 0)+log(p*exp(d.binormal(z.1, z.2, mu, sigma, rho)-d.binormal(z.1, z.2, 0, 1, 0))+(1-p)))
+  
+#  l.m <- sum((p*d.binormal(z.1, z.2, mu, sigma, rho) + (1-p)*d.binormal(z.1, z.2, 0, 1, 0)))
+  return(l.m) 
+}
+
+# check this when rho=1
+
+# density of binomial distribution with equal mean and sigma on both dimensions
+d.binormal <- function(z.1, z.2, mu, sigma, rho){
+
+  loglik <- (-log(2)-log(pi)-2*log(sigma) - log(1-rho^2)/2 - (0.5/(1-rho^2)/sigma^2)*((z.1-mu)^2 -2*rho*(z.1-mu)*(z.2-mu) + (z.2-mu)^2))
+
+  return(loglik)
+}
+
+# E-step for computing the latent strucutre
+# e.z is the prob to be in the consistent group
+# e.step for estimating posterior prob
+# z.1 and z.2 can be vectors or scalars
+e.step.2normal <- function(z.1, z.2, mu, sigma, rho, p){
+
+  e.z <- p/((1-p)*exp(d.binormal(z.1, z.2, 0, 1, 0)-d.binormal(z.1, z.2, mu, sigma, rho))+ p)
+  
+  invisible(e.z)
+}
+
+# M-step for computing the latent structure
+# m.step for estimating proportion, mean, sd and correlation coefficient
+m.step.2normal <- function(z.1, z.2, e.z){
+
+  p <- mean(e.z)
+  mu <- sum((z.1+z.2)*e.z)/2/sum(e.z) 
+  sigma <- sqrt(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))/2/sum(e.z))
+  rho <- 2*sum(e.z*(z.1-mu)*(z.2-mu))/(sum(e.z*((z.1-mu)^2+(z.2-mu)^2)))
+
+  return(list(p=p, mu=mu, sigma=sigma, rho=rho))
+}
+
+
+# assume top p percent of observations are true
+# x and y are ranks, estimate
+init <- function(x, y, x.label){
+  
+  x.o <- order(x)
+
+  x.ordered <- x[x.o]
+  y.ordered <- y[x.o]
+  x.label.ordered <- x.label[x.o]
+  
+  n <- length(x)
+  p <- sum(x.label)/n
+  
+  rho <- cor(x.ordered[1:ceiling(p*n)], y.ordered[1:ceiling(p*n)])
+
+  temp <- find.mu.sigma(x.ordered, x.label.ordered)
+  mu <- temp$mu
+  sigma <- temp$sigma
+  
+  invisible(list(mu=mu, sigma=sigma, rho=rho, p=p))
+
+}
+
+# find mu and sigma if the distributions of marginal ranks are known
+# take the medians of the two dist and map back to the original
+init.dist <- function(f0, f1){
+
+  # take the median in f0
+  index.median.0 <- which(f0$cdf>0.5)[1]
+  q.0.small <- f0$cdf[index.median.0] # because f0 and f1 have the same bins
+  q.1.small <- f1$cdf[index.median.0]
+
+  # take the median in f1
+  index.median.1 <- which(f1$cdf>0.5)[1]
+  q.0.big <- f0$cdf[index.median.1] # because f0 and f1 have the same bins
+  q.1.big <- f1$cdf[index.median.1]
+
+  # find pseudo value for x.middle[1] on normal(0,1) 
+  pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)
+  pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)
+
+  # find pseudo value for x.middle[2] on normal(0,1) 
+  pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)
+  pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)
+
+  mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1) 
+
+  sigma <- (pseudo.small.0-mu)/pseudo.small.1
+
+  return(list(mu=mu, sigma=sigma))  
+}
+
+# generate labels
+
+# find the part of data with overlap
+
+# find the percentile on noise and signal
+
+# Suppose there are signal and noise components, with mean=0 and sd=1 for noise
+# x and x.label are the rank of the observations and their labels,
+# find the mean and sd of the other component
+# x.label takes values of 0 and 1
+find.mu.sigma <- function(x, x.label){
+
+  x.0 <- x[x.label==0]
+  x.1 <- x[x.label==1]
+
+  n.x0 <- length(x.0)
+  n.x1 <- length(x.1)
+
+  x.end <- c(min(x.0), min(x.1), max(x.0), max(x.1))
+  o <- order(x.end)
+  x.middle <- x.end[o][c(2,3)]
+
+  # the smaller end of the overlap
+  q.1.small <- mean(x.1 <= x.middle[1])*n.x1/(n.x1+1)
+  q.0.small <- mean(x.0 <= x.middle[1])*n.x0/(n.x0+1)
+
+  # the bigger end of the overlap
+  q.1.big <- mean(x.1 <= x.middle[2])*n.x1/(n.x1+1)
+  q.0.big <- mean(x.0 <= x.middle[2])*n.x0/(n.x0+1)
+
+  # find pseudo value for x.middle[1] on normal(0,1) 
+  pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)
+  pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)
+
+  # find pseudo value for x.middle[2] on normal(0,1) 
+  pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)
+  pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)
+
+  mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1) 
+
+  sigma <- (pseudo.small.0-mu)/pseudo.small.1
+
+  return(list(mu=mu, sigma=sigma))
+}