Mercurial > repos > cristian > notos
diff Functions/Kernel_function_form.R @ 0:1535ffddeff4 draft
planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
author | cristian |
---|---|
date | Thu, 07 Sep 2017 08:51:57 -0400 (2017-09-07) |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Functions/Kernel_function_form.R Thu Sep 07 08:51:57 2017 -0400 @@ -0,0 +1,311 @@ +############################################## +# KDE function for single observation sequence # +############################################## + +# load packages +require(moments, quietly = TRUE) +require(doParallel, quietly = TRUE) + + +# get names for the columns of the table characterizing the peaks +col.names.peaks <- function() +{ + v <- c("Number of modes", "Number of modes (5% excluded)", + "Number of modes (10% excluded)", "Skewness", + "Mode skewness", "Nonparametric skewness", "Q50 skewness", + "Absolute Q50 mode skewness", "Absolute Q80 mode skewness", + "Peak 1", "Probability Mass 1", + "Peak 2", "Probability Mass 2", + "Peak 3", "Probability Mass 3", + "Peak 4", "Probability Mass 4", + "Peak 5", "Probability Mass 5", + "Peak 6", "Probability Mass 6", + "Peak 7", "Probability Mass 7", + "Peak 8", "Probability Mass 8", + "Peak 9", "Probability Mass 9", + "Peak 10", "Probability Mass 10", + "Warning close modes", + "Number close modes", + "Modes (close modes excluded)", + "SD", "IQR 80", "IQR 90", + "Total number of sequences") + return(v) +} + + +# get names for the columns of the table containing the boostrap results +col.names.bs <- function() +{ + v <- c("Number of modes (NM)", + "% of samples with same NM", + "% of samples with more NM", + "% of samples with less NM", + "no. of samples with same NM", + "% BS samples excluded by prob. mass crit.", + "Warning CI") + return(v) +} + + +# plot KDE for one set up CpGo/e ratios +# obs: CpGo/e ratios +# bs.cis: Is bootstrap done? +# t.name: Title of the plot +# t.sub: Text that is added below the title +# t.legend: Is a legend printed? +plot.KDE <- function(obs, t.name, bs.cis = FALSE, bstrp.reps = 1500, conf.lev = 0.95, t.sub = NULL, t.legend = TRUE, min.dist = 0.2, mode.mass = 0.01, band.width = 1.06) +{ + # determine directory where functions are located + cmdArgs <- commandArgs(trailingOnly = FALSE) + str <- "--file=" + match <- grep(str, cmdArgs) + if (length(match) == 0) { + FCTN.DIR <- "../Galaxy/Functions" + } else { + path <- normalizePath( sub(str, "", cmdArgs[match]) ) + FCTN.DIR <- file.path(dirname(path), "Functions") + } + + # part 1: initialize parameters etc + # --------------------------------- + + # table with names and number of peaks + v <- col.names.peaks() + tab1.m <- data.frame(matrix(NA, nrow = 1, ncol = length(v))) + names(tab1.m) <- col.names.peaks() + # parameters to set in any case + num.points <- 10 ^ 4 # number of points where to estimate the density + p.bw <- "nrd" # algorithm for the bandwidth selection, "nrd" for Scott's bandwith + use.seed <- TRUE + threshold.modes <- mode.mass + threshold.bs.ci <- 0.2 # only changes of +- threshold.bs.ci% in prob. mass is allowed for entering the CI calculation + # bootstrap with optional parameters + if (bs.cis) { + ncpus = max(1, detectCores()) # number of processsors + cl <- makeCluster(ncpus) + registerDoParallel(cl) + # table for the bootstrap + tab2.m <- data.frame(matrix(NA, nrow = 1, ncol = 7)) + names(tab2.m) <- col.names.bs() + } + + + # part 2: estimate the KD and calculate probability masses + # -------------------------------------------------------- + + source( file.path(FCTN.DIR, "density_pm.R") ) + estimate <- density_pm(obs, num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes) + ker <- estimate$ker # kernel density + p <- estimate$peaks + v <- estimate$valleys + pm <- estimate$pm + + # select for each peak a line type + num.pm <- length(p) # number of peaks + p5 <- estimate$p5 + p10 <- estimate$p10 + lty = rep(1, num.pm) #pm > 0.10 + if (length(p10) > 0) { #pm < 0.10 + lty[p10] <- 2 + } + if (length(p5) > 0) { #pm < 0.05 + lty[p5] <- 4 + } + p.lty <- lty + + # part 3: bootstrap + # ----------------- + + if (bs.cis == TRUE) { + # do bootstrapping + estimateB = foreach(j = 1 : bstrp.reps ) %dopar% { + if (use.seed == TRUE) {set.seed(j)} + source( file.path(FCTN.DIR, "density_pm.R") ) # As doParallel is used, the source code has to be included here + obs.boot = obs[sample(seq(obs), replace = TRUE)] + density_pm_boot(obs.boot, num.points = num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes) + } + stopCluster(cl) + + # calculate CIs based on samples where the number of peaks of the KDE coincide with the original data + # and the probability mass of the peaks of the sample don not divert to strongly from the original data + # ... extract modes and pm for samples + num.pmB <- sapply(lapply(estimateB, "[[", "peaks"), length) # no of peaks before cleaning + num.peaks.ok <- num.pmB == num.pm # samples with same number of peaks before cleaning + pB <- t(sapply( estimateB[num.peaks.ok], "[[", "peaks") ) + pmB <- t(sapply( estimateB[num.peaks.ok], "[[", "pm") ) + if (num.pm == 1) { # put into right matrix form + pB <- t(pB) + pmB <- t(pmB) + } + + # ... check if prob. mass changes too strongly for any mode + t.pmB <- t(pmB) + keep.ub <- !apply(pm * (1 + threshold.bs.ci) < t.pmB, 2, any) + keep.lb <- !apply(pm * (1 - threshold.bs.ci) > t.pmB, 2, any) + mass.ok <- keep.ub & keep.lb + pB.cl <- as.matrix( pB[mass.ok, ] ) + + # ... determine CIs + q <- (1 - conf.lev) / 2 + p.CI <- apply(pB.cl, 2, quantile, probs = c(q, 1 - q)) + } + + + # part 4: plots + # ------------- + + #t.breaks <- seq(0, max(obs)*1.05, by = 0.03) + t.breaks = 50 + hist_data <- hist(obs, breaks = t.breaks, plot = FALSE) + hist(obs, breaks = t.breaks, prob = TRUE, main = t.name, + # sub = paste("Gaussian kernel with band width", band.width), + xlab = "CpG o/e ratio", + col = grey(0.9), border = grey(0.6)) #, xlim = c(-0.05, max(obs)*1.1)) + if (!is.null(t.sub)) { + mtext(t.sub) + } + if (bs.cis == TRUE) { # CI + j <- 1 : ncol(p.CI) + rect(ker$x[p.CI[1, j]], 0, ker$x[p.CI[2, j]], + 15, density = 20, angle = 45 + (j - 1) * 90, col = "blue") + } + lines(ker, col = "red", lwd = 2) # density + + # vertical lines at peaks + x.pos = ker$x[p] + ok <- c() + close <- c() + for (i in 1:length(x.pos)) { + b <- TRUE + if (i > 1) { + if (x.pos[i] - x.pos[i - 1] < min.dist) { + b <- FALSE + } + } + if (i < length(x.pos)) { + if (x.pos[i + 1] - x.pos[i] < min.dist) { + b <- FALSE + } + } + + if (b) { + ok <- c(ok, i) + } else { + close <- c(close, i) + } + } + # peaks + if (length(ok) > 0) { + abline(v = ker$x[p][ok], col = "blue", lwd = 3, lty = p.lty) + } + if (length(close) > 0) { + abline(v = ker$x[p][close], col = "orange", lwd = 3, lty = p.lty) + } + # valleys + vals = "" + if (length(x.pos)>1) { +# ker$x[v][c(-1, -length(ker$x[v]))] + vals = ker$x[v][c(-1, -length(ker$x[v]))] + abline(v = vals, col = "black", lwd = 1) + } + + # legend + if(t.legend == TRUE) { + t.sym <- expression(""<="", ""<"", "">="") + thr = threshold.modes + if (thr >= 0.1) { + md.labs <- substitute(paste("Mode with PM ", sym3, " ", thr, sep = ""), list(thr = thr, sym3 = t.sym[[3]])) + } else { + md.labs <- substitute(paste("Mode with PM ", sym3, " 0.1", sep = ""), list(sym3 = t.sym[[3]])) + if (thr >= 0.05) { + md.labs <- c( md.labs, substitute(paste("Mode with ", thr, " ", sym1, " PM ", sym2, " 0.1", sep = ""), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])) ) + } else { + md.labs <- c( md.labs, substitute(paste("Mode with 0.05 ", sym1, " PM ", sym2, " 0.1", sep = ""), list(sym1 = t.sym[[1]], sym2 = t.sym[[2]])), + substitute(paste("Mode with ", thr, sym1, " PM ", sym2, " 0.05"), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]]))) + } + } + legend("topright", + c(expression("Estimated density"), md.labs), + lty = c(1, 1, 2, 4), lwd = c(2, 3, 3, 3), + col = c("red", "blue", "blue", "blue"), bg = "white") + } + + + # part 5: return results + # ---------------------- + + # part 5 a): results table 1 + # tbd (maybe): introduce maximum number of modes (10 right now) + tab1.m[1, "Number of modes"] <- num.pm + tab1.m[1, 2] = num.pm - length(estimate$p5) + tab1.m[1, 3] = num.pm - length(estimate$p10) + for(j in 1 : 10){ + if(num.pm < j){ + tab1.m[1, j * 2 + 8] = c(" ") + tab1.m[1, j * 2 + 9] = c(" ") + } else{ + tab1.m[1, j * 2 + 8] = ker$x[p][j] + tab1.m[1, j * 2 + 9] = pm[j] + } + } + + # fill table 1 with descriptives + tab1.m[1, "Skewness"] <- skewness(obs) + mode <- ker$x[ which.max(ker$y) ] + tab1.m[1, "Mode skewness"] <- (mean(obs) - mode) / sd(obs) + tab1.m[1, "Nonparametric skewness"] <- (mean(obs) - median(obs)) / sd(obs) + q <- quantile(obs, c(0.25, 0.5, 0.75)) + tab1.m[1, "Q50 skewness"] <- (q[3] + q[1] - 2 * q[2]) / (q[3] - q[1]) + tab1.m[1, "Absolute Q50 mode skewness"] <- (q[3] + q[1]) / 2 - mode + q <- quantile(obs, c(0.1, 0.5, 0.9)) + tab1.m[1, "Absolute Q80 mode skewness"] <- (q[3] + q[1]) / 2 - mode + tab1.m[1, "SD"] <- sd(obs) + tab1.m[1, "IQR 80"] <- diff(quantile(obs, c(0.1, 0.9))) + tab1.m[1, "IQR 90"] <- diff(quantile(obs, c(0.05, 0.95))) + tab1.m[1, "Total number of sequences"] = length(obs) + + # check if any peak is closer than a given threshold to any other + num.close.modes <- sum(diff(ker$x[p]) < min.dist) + if ( any(diff(ker$x[p]) < min.dist) && (num.pm > 1) ) { + tab1.m[1, "Warning close modes"] <- "Modes too close" + tab1.m[1, "Number close modes"] <- num.close.modes + tab1.m[1, "Modes (close modes excluded)"] <- num.pm - num.close.modes + } else { + tab1.m[1, "Modes (close modes excluded)"] <- num.pm + } + + # part 5 b): results table 2 + if (bs.cis == TRUE) { + ker <- lapply( estimateB, "[[", "ker") + peaks <- lapply( estimateB, "[[", "peaks") + num.peaks <- c() + for (i in 1:length(peaks)) { + curr.peaks <- ker[[i]]$x[ peaks[[i]] ] + num.cl <- sum(diff(curr.peaks) < min.dist) + num.peaks <- c(num.peaks, length(curr.peaks) - num.cl) + } + + # fill table 2 with stats on number of modes in bs samples + num <- num.pm - num.close.modes + tab2.m[1, "Number of modes (NM)"] <- num + tab2.m[1, "% of samples with same NM"] <- 100 * sum(num.peaks == num) / bstrp.reps # equal + tab2.m[1, "% of samples with more NM"] <- 100 * sum(num.peaks > num) / bstrp.reps # more + tab2.m[1, "% of samples with less NM"] <- 100 * sum(num.peaks < num) / bstrp.reps # less + if (num.pm > 1) { + tab2.m[1, "Warning CI"] <- "CI's may be unreliable" + } + + tab2.m[1, "no. of samples with same NM"] <- sum(num.peaks == num) + tab2.m[1, "% BS samples excluded by prob. mass crit."] <- (1 - sum(mass.ok) / sum(num.peaks.ok)) * 100 + } + + # return the results + if (bs.cis){ + return(list(tab.des = tab1.m, tab.bs = tab2.m, valleys = vals)) + } else { + return(list(tab.des = tab1.m, valleys = vals)) + } +} + + +