Previous changeset 17:70dc9bb76f92 (2013-01-21) Next changeset 19:11269f3b68a0 (2013-01-21) |
Commit message:
Uploaded |
added:
functions-all-clayton-12-13.r |
b |
diff -r 70dc9bb76f92 -r 7dd341a53e77 functions-all-clayton-12-13.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/functions-all-clayton-12-13.r Mon Jan 21 13:36:04 2013 -0500 |
[ |
b'@@ -0,0 +1,3099 @@\n+# revised on 2-20-10\n+# - fix error in pass.structure: reverse rank.combined, so that big sig.value\n+# are ranked with small numbers (1, 2, ...)\n+# - fix error on get.ez.tt.all: get ez.cutoff from sorted e.z\n+\n+#\n+# modified EM procedure to compute empirical CDF more precisely - 09/2009\n+\n+\n+\n+# this file contains the functions for \n+# 1. computing the correspondence profile (upper rank intersection and derivatives)\n+# 2. inference of copula mixture model\n+#\n+# It also has functions for\n+# 1. reading peak caller results\n+# 2. processing and matching called peaks\n+# 3. plotting results\n+\n+\n+################ read peak caller results\n+\n+# process narrow peak format\n+# some peak callers may not report q-values, p-values or fold of enrichment\n+# need further process before comparison\n+#\n+# stop.exclusive: Is the basepair of peak.list$stop exclusive? In narrowpeak and broadpeak format they are exclusive.\n+# If it is exclusive, we need subtract peak.list$stop by 1 to avoid the same basepair being both a start and a stop of two \n+# adjacent peaks, which creates trouble for finding correct intersect \n+process.narrowpeak <- function(narrow.file, chr.size, half.width=NULL, summit="offset", stop.exclusive=T, broadpeak=F){\n+\n+\n+ aa <- read.table(narrow.file)\n+\n+ if(broadpeak){\n+ bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9)\n+ }else{\n+ bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9, summit=aa$V10)\n+ }\n+ \n+ if(summit=="summit"){\n+ bb.ori$summit <- bb.ori$summit-bb.ori$start # change summit to offset to avoid error when concatenating chromosomes\n+ }\n+ \n+ bb <- concatenate.chr(bb.ori, chr.size)\n+\n+ #bb <- bb.ori\n+\n+ # remove the peaks that has the same start and stop value\n+ bb <- bb[bb$start != bb$stop,]\n+\n+ if(stop.exclusive==T){\n+ bb$stop <- bb$stop-1\n+ }\n+\n+ if(!is.null(half.width)){\n+ bb$start.ori <- bb$start \n+ bb$stop.ori <- bb$stop \n+\n+ # if peak is narrower than the specified window, stay with its width\n+ # otherwise chop wider peaks to specified width\n+ width <- bb$stop-bb$start +1\n+ is.wider <- width > 2*half.width\n+\n+ if(summit=="offset" | summit=="summit"){ # if summit is offset from start\n+ bb$start[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]-half.width\n+ bb$stop[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]+half.width\n+ } else { \n+ if(summit=="unknown"){\n+ bb$start[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) - half.width\n+ bb$stop[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) + half.width\n+ }\n+ }\n+ }\n+\n+ bb <- clean.data(bb)\n+ invisible(list(data.ori=bb.ori, data.cleaned=bb))\n+}\n+\n+# clean data \n+# and concatenate chromosomes if needed\n+clean.data <- function(adata){\n+\n+ # remove the peaks that has the same start and stop value\n+ adata <- adata[adata$start != adata$stop,]\n+\n+ # if some stops and starts are the same, need fix them\n+ stop.in.start <- is.element(adata$stop, adata$start)\n+ n.fix <- sum(stop.in.start)\n+ if(n.fix >0){\n+ print(paste("Fix", n.fix, "stops\\n"))\n+ adata$stop[stop.in.start] <- adata$stop[stop.in.start]-1 \n+ } \n+ \n+ return(adata) \n+}\n+\n+# concatenate peaks\n+# peaks: the dataframe to have all the peaks\n+# chr.file: the file to keep the length of each chromosome \n+# chr files should come from the species that the data is from\n+#concatenate.chr <- function(peaks, chr.size){\n+\n+ # chr.size <- read.table(chr.file)\n+# chr.o <- order(chr.size[,1])\n+# chr.size <- chr.size[chr.o,]\n+#\n+# chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))\n+# chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift) \n+#\n+# for(i in 1:nrow(chr.size)){\n+# is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i])\n+# if(sum(is.in)>0){\n+# peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shi'..b'\n+# density of binomial distribution with equal mean and sigma on both dimensions\n+d.binormal <- function(z.1, z.2, mu, sigma, rho){\n+\n+ loglik <- (-log(2)-log(pi)-2*log(sigma) - log(1-rho^2)/2 - (0.5/(1-rho^2)/sigma^2)*((z.1-mu)^2 -2*rho*(z.1-mu)*(z.2-mu) + (z.2-mu)^2))\n+\n+ return(loglik)\n+}\n+\n+# E-step for computing the latent strucutre\n+# e.z is the prob to be in the consistent group\n+# e.step for estimating posterior prob\n+# z.1 and z.2 can be vectors or scalars\n+e.step.2normal <- function(z.1, z.2, mu, sigma, rho, p){\n+\n+ e.z <- p/((1-p)*exp(d.binormal(z.1, z.2, 0, 1, 0)-d.binormal(z.1, z.2, mu, sigma, rho))+ p)\n+ \n+ invisible(e.z)\n+}\n+\n+# M-step for computing the latent structure\n+# m.step for estimating proportion, mean, sd and correlation coefficient\n+m.step.2normal <- function(z.1, z.2, e.z){\n+\n+ p <- mean(e.z)\n+ mu <- sum((z.1+z.2)*e.z)/2/sum(e.z) \n+ sigma <- sqrt(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))/2/sum(e.z))\n+ rho <- 2*sum(e.z*(z.1-mu)*(z.2-mu))/(sum(e.z*((z.1-mu)^2+(z.2-mu)^2)))\n+\n+ return(list(p=p, mu=mu, sigma=sigma, rho=rho))\n+}\n+\n+\n+# assume top p percent of observations are true\n+# x and y are ranks, estimate\n+init <- function(x, y, x.label){\n+ \n+ x.o <- order(x)\n+\n+ x.ordered <- x[x.o]\n+ y.ordered <- y[x.o]\n+ x.label.ordered <- x.label[x.o]\n+ \n+ n <- length(x)\n+ p <- sum(x.label)/n\n+ \n+ rho <- cor(x.ordered[1:ceiling(p*n)], y.ordered[1:ceiling(p*n)])\n+\n+ temp <- find.mu.sigma(x.ordered, x.label.ordered)\n+ mu <- temp$mu\n+ sigma <- temp$sigma\n+ \n+ invisible(list(mu=mu, sigma=sigma, rho=rho, p=p))\n+\n+}\n+\n+# find mu and sigma if the distributions of marginal ranks are known\n+# take the medians of the two dist and map back to the original\n+init.dist <- function(f0, f1){\n+\n+ # take the median in f0\n+ index.median.0 <- which(f0$cdf>0.5)[1]\n+ q.0.small <- f0$cdf[index.median.0] # because f0 and f1 have the same bins\n+ q.1.small <- f1$cdf[index.median.0]\n+\n+ # take the median in f1\n+ index.median.1 <- which(f1$cdf>0.5)[1]\n+ q.0.big <- f0$cdf[index.median.1] # because f0 and f1 have the same bins\n+ q.1.big <- f1$cdf[index.median.1]\n+\n+ # find pseudo value for x.middle[1] on normal(0,1) \n+ pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)\n+ pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)\n+\n+ # find pseudo value for x.middle[2] on normal(0,1) \n+ pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)\n+ pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)\n+\n+ mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1) \n+\n+ sigma <- (pseudo.small.0-mu)/pseudo.small.1\n+\n+ return(list(mu=mu, sigma=sigma)) \n+}\n+\n+# generate labels\n+\n+# find the part of data with overlap\n+\n+# find the percentile on noise and signal\n+\n+# Suppose there are signal and noise components, with mean=0 and sd=1 for noise\n+# x and x.label are the rank of the observations and their labels,\n+# find the mean and sd of the other component\n+# x.label takes values of 0 and 1\n+find.mu.sigma <- function(x, x.label){\n+\n+ x.0 <- x[x.label==0]\n+ x.1 <- x[x.label==1]\n+\n+ n.x0 <- length(x.0)\n+ n.x1 <- length(x.1)\n+\n+ x.end <- c(min(x.0), min(x.1), max(x.0), max(x.1))\n+ o <- order(x.end)\n+ x.middle <- x.end[o][c(2,3)]\n+\n+ # the smaller end of the overlap\n+ q.1.small <- mean(x.1 <= x.middle[1])*n.x1/(n.x1+1)\n+ q.0.small <- mean(x.0 <= x.middle[1])*n.x0/(n.x0+1)\n+\n+ # the bigger end of the overlap\n+ q.1.big <- mean(x.1 <= x.middle[2])*n.x1/(n.x1+1)\n+ q.0.big <- mean(x.0 <= x.middle[2])*n.x0/(n.x0+1)\n+\n+ # find pseudo value for x.middle[1] on normal(0,1) \n+ pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)\n+ pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)\n+\n+ # find pseudo value for x.middle[2] on normal(0,1) \n+ pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)\n+ pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)\n+\n+ mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1) \n+\n+ sigma <- (pseudo.small.0-mu)/pseudo.small.1\n+\n+ return(list(mu=mu, sigma=sigma))\n+}\n' |