notos: Functions/Kernel_function

comparison Functions/Kernel_function_form.R @ 0:1535ffddeff4 draft

planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty

author	cristian
date	Thu, 07 Sep 2017 08:51:57 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1535ffddeff4
+##############################################
+# KDE function for single observation sequence #
+##############################################
+# load packages
+require(moments, quietly = TRUE)
+require(doParallel, quietly = TRUE)
+# get names for the columns of the table characterizing the peaks
+col.names.peaks <- function()
+{
+v <- c("Number of modes", "Number of modes (5% excluded)",
+"Number of modes (10% excluded)", "Skewness",
+"Mode skewness", "Nonparametric skewness", "Q50 skewness",
+"Absolute Q50 mode skewness", "Absolute Q80 mode skewness",
+"Peak 1", "Probability Mass 1",
+"Peak 2", "Probability Mass 2",
+"Peak 3", "Probability Mass 3",
+"Peak 4", "Probability Mass 4",
+"Peak 5", "Probability Mass 5",
+"Peak 6", "Probability Mass 6",
+"Peak 7", "Probability Mass 7",
+"Peak 8", "Probability Mass 8",
+"Peak 9", "Probability Mass 9",
+"Peak 10", "Probability Mass 10",
+"Warning close modes",
+"Number close modes",
+"Modes (close modes excluded)",
+"SD", "IQR 80", "IQR 90",
+"Total number of sequences")
+return(v)
+}
+# get names for the columns of the table containing the boostrap results
+col.names.bs <- function()
+{
+v <- c("Number of modes (NM)",
+"% of samples with same NM",
+"% of samples with more NM",
+"% of samples with less NM",
+"no. of samples with same NM",
+"% BS samples excluded by prob. mass crit.",
+"Warning CI")
+return(v)
+}
+# plot KDE for one set up CpGo/e ratios
+# obs: CpGo/e ratios
+# bs.cis: Is bootstrap done?
+# t.name: Title of the plot
+# t.sub: Text that is added below the title
+# t.legend: Is a legend printed?
+plot.KDE <- function(obs, t.name, bs.cis = FALSE, bstrp.reps = 1500, conf.lev = 0.95, t.sub = NULL, t.legend = TRUE, min.dist = 0.2, mode.mass = 0.01, band.width = 1.06)
+{
+# determine directory where functions are located
+cmdArgs <- commandArgs(trailingOnly = FALSE)
+str <- "--file="
+match <- grep(str, cmdArgs)
+if (length(match) == 0) {
+FCTN.DIR <- "../Galaxy/Functions"
+} else {
+path <- normalizePath( sub(str, "", cmdArgs[match]) )
+FCTN.DIR <- file.path(dirname(path), "Functions")
+}
+# part 1: initialize parameters etc
+# ---------------------------------
+# table with names and number of peaks
+v <- col.names.peaks()
+tab1.m <- data.frame(matrix(NA, nrow = 1, ncol = length(v)))
+names(tab1.m) <- col.names.peaks()
+# parameters to set in any case
+num.points <- 10 ^ 4 # number of points where to estimate the density
+p.bw <- "nrd" # algorithm for the bandwidth selection, "nrd" for Scott's bandwith
+use.seed <- TRUE
+threshold.modes <- mode.mass
+threshold.bs.ci <- 0.2 # only changes of +- threshold.bs.ci% in prob. mass is allowed for entering the CI calculation
+# bootstrap with optional parameters
+if (bs.cis) {
+ncpus = max(1, detectCores()) # number of processsors
+cl <- makeCluster(ncpus)
+registerDoParallel(cl)
+# table for the bootstrap
+tab2.m <- data.frame(matrix(NA, nrow = 1, ncol = 7))
+names(tab2.m) <- col.names.bs()
+}
+# part 2: estimate the KD and calculate probability masses
+# --------------------------------------------------------
+source( file.path(FCTN.DIR, "density_pm.R") )
+estimate <- density_pm(obs, num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes)
+ker <- estimate$ker # kernel density
+p <- estimate$peaks
+v <- estimate$valleys
+pm <- estimate$pm
+# select for each peak a line type
+num.pm <- length(p) # number of peaks
+p5 <- estimate$p5
+p10 <- estimate$p10
+lty = rep(1, num.pm) #pm > 0.10
+if (length(p10) > 0) { #pm < 0.10
+lty[p10] <- 2
+}
+if (length(p5) > 0) { #pm < 0.05
+lty[p5] <- 4
+}
+p.lty <- lty
+# part 3: bootstrap
+# -----------------
+if (bs.cis == TRUE) {
+# do bootstrapping
+estimateB = foreach(j = 1 : bstrp.reps ) %dopar% {
+if (use.seed == TRUE) {set.seed(j)}
+source( file.path(FCTN.DIR, "density_pm.R") )   # As doParallel is used, the source code has to be included here
+obs.boot = obs[sample(seq(obs), replace = TRUE)]
+density_pm_boot(obs.boot, num.points = num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes)
+}
+stopCluster(cl)
+# calculate CIs based on samples where the number of peaks of the KDE coincide with the original data
+# and the probability mass of the peaks of the sample don not divert to strongly from the original data
+# ... extract modes and pm for samples
+num.pmB <- sapply(lapply(estimateB, "[[", "peaks"), length)   # no of peaks before cleaning
+num.peaks.ok <- num.pmB == num.pm    # samples with same number of peaks before cleaning
+pB <- t(sapply( estimateB[num.peaks.ok], "[[", "peaks") )
+pmB <- t(sapply( estimateB[num.peaks.ok], "[[", "pm") )
+if (num.pm == 1) { # put into right matrix form
+pB <- t(pB)
+pmB <- t(pmB)
+}
+# ... check if prob. mass changes too strongly for any mode
+t.pmB <- t(pmB)
+keep.ub <- !apply(pm * (1 + threshold.bs.ci) < t.pmB, 2, any)
+keep.lb <- !apply(pm * (1 - threshold.bs.ci) > t.pmB, 2, any)
+mass.ok <- keep.ub & keep.lb
+pB.cl <- as.matrix( pB[mass.ok, ] )
+# ... determine CIs
+q <- (1 - conf.lev) / 2
+p.CI <- apply(pB.cl, 2, quantile, probs = c(q, 1 - q))
+}
+# part 4: plots
+# -------------
+#t.breaks <- seq(0, max(obs)*1.05, by = 0.03)
+t.breaks = 50
+hist_data <- hist(obs, breaks = t.breaks, plot = FALSE)
+hist(obs, breaks = t.breaks, prob = TRUE, main = t.name,
+# sub = paste("Gaussian kernel with band width", band.width),
+	   xlab = "CpG o/e ratio",
+col = grey(0.9), border = grey(0.6)) #, xlim = c(-0.05, max(obs)*1.1))
+if (!is.null(t.sub)) {
+mtext(t.sub)
+}
+if (bs.cis == TRUE) { # CI
+j <- 1 : ncol(p.CI)
+rect(ker$x[p.CI[1, j]], 0, ker$x[p.CI[2, j]],
+15, density = 20, angle = 45 + (j - 1) * 90, col = "blue")
+}
+lines(ker, col = "red", lwd = 2) # density
+# vertical lines at peaks
+x.pos = ker$x[p]
+ok <- c()
+close <- c()
+for (i in 1:length(x.pos)) {
+b <- TRUE
+if (i > 1) {
+if (x.pos[i] - x.pos[i - 1] < min.dist) {
+b <- FALSE
+}
+}
+if (i < length(x.pos)) {
+if (x.pos[i + 1] - x.pos[i] < min.dist) {
+b <- FALSE
+}
+}
+if (b) {
+ok <- c(ok, i)
+} else {
+close <- c(close, i)
+}
+}
+# peaks
+if (length(ok) > 0) {
+abline(v = ker$x[p][ok], col = "blue", lwd = 3, lty = p.lty)
+}
+if (length(close) > 0) {
+abline(v = ker$x[p][close], col = "orange", lwd = 3, lty = p.lty)
+}
+# valleys
+vals = ""
+if (length(x.pos)>1) {
+#	ker$x[v][c(-1, -length(ker$x[v]))]
+	  vals = ker$x[v][c(-1, -length(ker$x[v]))]
+	  abline(v = vals, col = "black", lwd = 1)
+}
+# legend
+if(t.legend == TRUE) {
+t.sym <- expression(""<="", ""<"", "">="")
+thr = threshold.modes
+if (thr >= 0.1) {
+md.labs <- substitute(paste("Mode with PM ", sym3, " ", thr, sep = ""), list(thr = thr, sym3 = t.sym[[3]]))
+} else {
+md.labs <- substitute(paste("Mode with PM ", sym3, " 0.1", sep = ""), list(sym3 = t.sym[[3]]))
+if (thr >= 0.05) {
+md.labs <- c( md.labs, substitute(paste("Mode with ", thr, " ", sym1, " PM ", sym2, " 0.1", sep = ""), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])) )
+} else {
+md.labs <- c( md.labs, substitute(paste("Mode with 0.05 ", sym1, " PM ", sym2, " 0.1", sep = ""), list(sym1 = t.sym[[1]], sym2 = t.sym[[2]])),
+substitute(paste("Mode with ", thr, sym1, " PM ", sym2, " 0.05"), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])))
+}
+}
+legend("topright",
+c(expression("Estimated density"), md.labs),
+lty = c(1, 1, 2, 4), lwd = c(2, 3, 3, 3),
+col = c("red", "blue", "blue", "blue"), bg = "white")
+}
+# part 5: return results
+# ----------------------
+# part 5 a): results table 1
+# tbd (maybe): introduce maximum number of modes (10 right now)
+tab1.m[1, "Number of modes"] <- num.pm
+tab1.m[1, 2] = num.pm - length(estimate$p5)
+tab1.m[1, 3] = num.pm - length(estimate$p10)
+for(j in 1 : 10){
+if(num.pm < j){
+tab1.m[1, j * 2 + 8] = c(" ")
+tab1.m[1, j * 2 + 9] = c(" ")
+} else{
+tab1.m[1, j * 2 + 8] = ker$x[p][j]
+tab1.m[1, j * 2 + 9] = pm[j]
+}
+}
+# fill table 1 with descriptives
+tab1.m[1, "Skewness"] <- skewness(obs)
+mode <- ker$x[ which.max(ker$y) ]
+tab1.m[1, "Mode skewness"] <- (mean(obs) - mode) / sd(obs)
+tab1.m[1, "Nonparametric skewness"] <- (mean(obs) - median(obs)) / sd(obs)
+q <- quantile(obs, c(0.25, 0.5, 0.75))
+tab1.m[1, "Q50 skewness"] <- (q[3] + q[1] - 2 * q[2]) / (q[3] - q[1])
+tab1.m[1, "Absolute Q50 mode skewness"] <- (q[3] + q[1]) / 2 - mode
+q <- quantile(obs, c(0.1, 0.5, 0.9))
+tab1.m[1, "Absolute Q80 mode skewness"] <- (q[3] + q[1]) / 2 - mode
+tab1.m[1, "SD"] <- sd(obs)
+tab1.m[1, "IQR 80"] <- diff(quantile(obs, c(0.1, 0.9)))
+tab1.m[1, "IQR 90"] <- diff(quantile(obs, c(0.05, 0.95)))
+tab1.m[1, "Total number of sequences"] = length(obs)
+# check if any peak is closer than a given threshold to any other
+num.close.modes <- sum(diff(ker$x[p]) < min.dist)
+if ( any(diff(ker$x[p]) < min.dist) && (num.pm > 1) ) {
+tab1.m[1, "Warning close modes"] <- "Modes too close"
+tab1.m[1, "Number close modes"] <- num.close.modes
+tab1.m[1, "Modes (close modes excluded)"] <- num.pm - num.close.modes
+} else {
+tab1.m[1, "Modes (close modes excluded)"] <- num.pm
+}
+# part 5 b): results table 2
+if (bs.cis == TRUE) {
+ker <- lapply( estimateB, "[[", "ker")
+peaks <- lapply( estimateB, "[[", "peaks")
+num.peaks <- c()
+for (i in 1:length(peaks)) {
+curr.peaks <- ker[[i]]$x[ peaks[[i]] ]
+num.cl <- sum(diff(curr.peaks) < min.dist)
+num.peaks <- c(num.peaks, length(curr.peaks) - num.cl)
+}
+# fill table 2 with stats on number of modes in bs samples
+num <- num.pm - num.close.modes
+tab2.m[1, "Number of modes (NM)"] <- num
+tab2.m[1, "% of samples with same NM"] <- 100 * sum(num.peaks == num) / bstrp.reps  # equal
+tab2.m[1, "% of samples with more NM"] <- 100 * sum(num.peaks > num) / bstrp.reps  # more
+tab2.m[1, "% of samples with less NM"] <- 100 * sum(num.peaks < num) / bstrp.reps  # less
+if (num.pm > 1) {
+tab2.m[1, "Warning CI"] <- "CI's may be unreliable"
+}
+tab2.m[1, "no. of samples with same NM"] <- sum(num.peaks == num)
+tab2.m[1,  "% BS samples excluded by prob. mass crit."] <- (1 - sum(mass.ok) / sum(num.peaks.ok)) * 100
+}
+# return the results
+if (bs.cis){
+return(list(tab.des = tab1.m, tab.bs = tab2.m, valleys = vals))
+} else {
+return(list(tab.des = tab1.m, valleys = vals))
+}
+}

Mercurial > repos > cristian > notos

comparison Functions/Kernel_function_form.R @ 0:1535ffddeff4 draft