diff Functions/Kernel_function_form.R @ 0:1535ffddeff4 draft

planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
author cristian
date Thu, 07 Sep 2017 08:51:57 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Functions/Kernel_function_form.R	Thu Sep 07 08:51:57 2017 -0400
@@ -0,0 +1,311 @@
+##############################################
+# KDE function for single observation sequence #
+##############################################
+
+# load packages
+require(moments, quietly = TRUE)
+require(doParallel, quietly = TRUE)
+
+
+# get names for the columns of the table characterizing the peaks
+col.names.peaks <- function()
+{
+  v <- c("Number of modes", "Number of modes (5% excluded)",
+         "Number of modes (10% excluded)", "Skewness",
+         "Mode skewness", "Nonparametric skewness", "Q50 skewness",
+         "Absolute Q50 mode skewness", "Absolute Q80 mode skewness",
+         "Peak 1", "Probability Mass 1",
+         "Peak 2", "Probability Mass 2",
+         "Peak 3", "Probability Mass 3",
+         "Peak 4", "Probability Mass 4",
+         "Peak 5", "Probability Mass 5",
+         "Peak 6", "Probability Mass 6",
+         "Peak 7", "Probability Mass 7",
+         "Peak 8", "Probability Mass 8",
+         "Peak 9", "Probability Mass 9",
+         "Peak 10", "Probability Mass 10", 
+         "Warning close modes",
+         "Number close modes", 
+         "Modes (close modes excluded)",
+         "SD", "IQR 80", "IQR 90",
+         "Total number of sequences")
+  return(v)
+}
+
+
+# get names for the columns of the table containing the boostrap results
+col.names.bs <- function()
+{
+  v <- c("Number of modes (NM)", 
+        "% of samples with same NM",
+        "% of samples with more NM",
+        "% of samples with less NM",
+        "no. of samples with same NM",
+        "% BS samples excluded by prob. mass crit.",
+        "Warning CI")
+  return(v)
+}
+
+
+# plot KDE for one set up CpGo/e ratios
+# obs: CpGo/e ratios
+# bs.cis: Is bootstrap done?
+# t.name: Title of the plot
+# t.sub: Text that is added below the title
+# t.legend: Is a legend printed?
+plot.KDE <- function(obs, t.name, bs.cis = FALSE, bstrp.reps = 1500, conf.lev = 0.95, t.sub = NULL, t.legend = TRUE, min.dist = 0.2, mode.mass = 0.01, band.width = 1.06)
+{ 
+  # determine directory where functions are located
+  cmdArgs <- commandArgs(trailingOnly = FALSE)
+  str <- "--file="
+  match <- grep(str, cmdArgs)
+  if (length(match) == 0) { 
+    FCTN.DIR <- "../Galaxy/Functions"
+  } else {
+    path <- normalizePath( sub(str, "", cmdArgs[match]) )
+    FCTN.DIR <- file.path(dirname(path), "Functions")
+  }
+  
+ # part 1: initialize parameters etc
+  # ---------------------------------
+  
+  # table with names and number of peaks
+  v <- col.names.peaks()
+  tab1.m <- data.frame(matrix(NA, nrow = 1, ncol = length(v)))
+  names(tab1.m) <- col.names.peaks()
+  # parameters to set in any case
+  num.points <- 10 ^ 4 # number of points where to estimate the density
+  p.bw <- "nrd" # algorithm for the bandwidth selection, "nrd" for Scott's bandwith
+  use.seed <- TRUE
+  threshold.modes <- mode.mass 
+  threshold.bs.ci <- 0.2 # only changes of +- threshold.bs.ci% in prob. mass is allowed for entering the CI calculation
+  # bootstrap with optional parameters
+  if (bs.cis) {
+    ncpus = max(1, detectCores()) # number of processsors
+    cl <- makeCluster(ncpus)
+    registerDoParallel(cl)
+    # table for the bootstrap
+    tab2.m <- data.frame(matrix(NA, nrow = 1, ncol = 7))
+    names(tab2.m) <- col.names.bs()
+  }
+
+
+  # part 2: estimate the KD and calculate probability masses
+  # --------------------------------------------------------
+ 
+  source( file.path(FCTN.DIR, "density_pm.R") )
+  estimate <- density_pm(obs, num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes)
+  ker <- estimate$ker # kernel density
+  p <- estimate$peaks
+  v <- estimate$valleys
+  pm <- estimate$pm
+  
+  # select for each peak a line type
+  num.pm <- length(p) # number of peaks
+  p5 <- estimate$p5
+  p10 <- estimate$p10
+  lty = rep(1, num.pm) #pm > 0.10
+  if (length(p10) > 0) { #pm < 0.10
+    lty[p10] <- 2
+  }
+  if (length(p5) > 0) { #pm < 0.05
+    lty[p5] <- 4
+  }
+  p.lty <- lty
+  
+  # part 3: bootstrap
+  # -----------------
+  
+  if (bs.cis == TRUE) { 
+    # do bootstrapping
+    estimateB = foreach(j = 1 : bstrp.reps ) %dopar% {
+      if (use.seed == TRUE) {set.seed(j)}
+      source( file.path(FCTN.DIR, "density_pm.R") )   # As doParallel is used, the source code has to be included here
+      obs.boot = obs[sample(seq(obs), replace = TRUE)]
+      density_pm_boot(obs.boot, num.points = num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes)
+    }
+    stopCluster(cl)
+    
+    # calculate CIs based on samples where the number of peaks of the KDE coincide with the original data
+    # and the probability mass of the peaks of the sample don not divert to strongly from the original data
+    # ... extract modes and pm for samples
+    num.pmB <- sapply(lapply(estimateB, "[[", "peaks"), length)   # no of peaks before cleaning
+    num.peaks.ok <- num.pmB == num.pm    # samples with same number of peaks before cleaning
+    pB <- t(sapply( estimateB[num.peaks.ok], "[[", "peaks") )
+    pmB <- t(sapply( estimateB[num.peaks.ok], "[[", "pm") )
+    if (num.pm == 1) { # put into right matrix form
+      pB <- t(pB)
+      pmB <- t(pmB)
+    }
+    
+    # ... check if prob. mass changes too strongly for any mode
+    t.pmB <- t(pmB)
+    keep.ub <- !apply(pm * (1 + threshold.bs.ci) < t.pmB, 2, any)
+    keep.lb <- !apply(pm * (1 - threshold.bs.ci) > t.pmB, 2, any)
+    mass.ok <- keep.ub & keep.lb
+    pB.cl <- as.matrix( pB[mass.ok, ] )
+    
+    # ... determine CIs
+    q <- (1 - conf.lev) / 2
+    p.CI <- apply(pB.cl, 2, quantile, probs = c(q, 1 - q))
+  }
+  
+  
+  # part 4: plots
+  # -------------
+    
+  #t.breaks <- seq(0, max(obs)*1.05, by = 0.03)
+  t.breaks = 50
+  hist_data <- hist(obs, breaks = t.breaks, plot = FALSE)
+  hist(obs, breaks = t.breaks, prob = TRUE, main = t.name,
+       # sub = paste("Gaussian kernel with band width", band.width),
+	   xlab = "CpG o/e ratio",
+       col = grey(0.9), border = grey(0.6)) #, xlim = c(-0.05, max(obs)*1.1))
+  if (!is.null(t.sub)) {
+    mtext(t.sub)
+  }
+  if (bs.cis == TRUE) { # CI
+    j <- 1 : ncol(p.CI)
+    rect(ker$x[p.CI[1, j]], 0, ker$x[p.CI[2, j]],
+         15, density = 20, angle = 45 + (j - 1) * 90, col = "blue") 
+  }
+  lines(ker, col = "red", lwd = 2) # density
+  
+  # vertical lines at peaks
+  x.pos = ker$x[p]
+  ok <- c()
+  close <- c()
+  for (i in 1:length(x.pos)) {
+    b <- TRUE
+    if (i > 1) {
+      if (x.pos[i] - x.pos[i - 1] < min.dist) {
+        b <- FALSE
+      }
+    } 
+    if (i < length(x.pos)) {
+      if (x.pos[i + 1] - x.pos[i] < min.dist) {
+        b <- FALSE
+      }
+    } 
+     
+    if (b) {
+      ok <- c(ok, i)
+    } else {
+      close <- c(close, i)
+    }
+  }
+  # peaks
+  if (length(ok) > 0) {
+    abline(v = ker$x[p][ok], col = "blue", lwd = 3, lty = p.lty) 
+  }
+  if (length(close) > 0) {
+    abline(v = ker$x[p][close], col = "orange", lwd = 3, lty = p.lty) 
+  }
+  # valleys
+  vals = ""
+  if (length(x.pos)>1) {
+#	ker$x[v][c(-1, -length(ker$x[v]))]
+	  vals = ker$x[v][c(-1, -length(ker$x[v]))]
+	  abline(v = vals, col = "black", lwd = 1) 
+  }
+  
+  # legend
+  if(t.legend == TRUE) {
+    t.sym <- expression(""<="", ""<"", "">="")
+    thr = threshold.modes
+    if (thr >= 0.1) {
+      md.labs <- substitute(paste("Mode with PM ", sym3, " ", thr, sep = ""), list(thr = thr, sym3 = t.sym[[3]]))
+    } else {
+      md.labs <- substitute(paste("Mode with PM ", sym3, " 0.1", sep = ""), list(sym3 = t.sym[[3]]))
+      if (thr >= 0.05) {
+        md.labs <- c( md.labs, substitute(paste("Mode with ", thr, " ", sym1, " PM ", sym2, " 0.1", sep = ""), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])) )     
+      } else {
+        md.labs <- c( md.labs, substitute(paste("Mode with 0.05 ", sym1, " PM ", sym2, " 0.1", sep = ""), list(sym1 = t.sym[[1]], sym2 = t.sym[[2]])),
+                               substitute(paste("Mode with ", thr, sym1, " PM ", sym2, " 0.05"), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])))    
+      }
+    }
+    legend("topright", 
+           c(expression("Estimated density"), md.labs),
+           lty = c(1, 1, 2, 4), lwd = c(2, 3, 3, 3),
+           col = c("red", "blue", "blue", "blue"), bg = "white") 
+  }
+  
+  
+  # part 5: return results
+  # ----------------------
+  
+  # part 5 a): results table 1
+  # tbd (maybe): introduce maximum number of modes (10 right now)
+  tab1.m[1, "Number of modes"] <- num.pm
+  tab1.m[1, 2] = num.pm - length(estimate$p5)
+  tab1.m[1, 3] = num.pm - length(estimate$p10)
+  for(j in 1 : 10){
+    if(num.pm < j){
+      tab1.m[1, j * 2 + 8] = c(" ")
+      tab1.m[1, j * 2 + 9] = c(" ")
+    } else{
+      tab1.m[1, j * 2 + 8] = ker$x[p][j]
+      tab1.m[1, j * 2 + 9] = pm[j]
+    }
+  }
+  
+  # fill table 1 with descriptives
+  tab1.m[1, "Skewness"] <- skewness(obs)
+  mode <- ker$x[ which.max(ker$y) ]
+  tab1.m[1, "Mode skewness"] <- (mean(obs) - mode) / sd(obs)
+  tab1.m[1, "Nonparametric skewness"] <- (mean(obs) - median(obs)) / sd(obs)
+  q <- quantile(obs, c(0.25, 0.5, 0.75))
+  tab1.m[1, "Q50 skewness"] <- (q[3] + q[1] - 2 * q[2]) / (q[3] - q[1])
+  tab1.m[1, "Absolute Q50 mode skewness"] <- (q[3] + q[1]) / 2 - mode
+  q <- quantile(obs, c(0.1, 0.5, 0.9))
+  tab1.m[1, "Absolute Q80 mode skewness"] <- (q[3] + q[1]) / 2 - mode
+  tab1.m[1, "SD"] <- sd(obs)
+  tab1.m[1, "IQR 80"] <- diff(quantile(obs, c(0.1, 0.9)))
+  tab1.m[1, "IQR 90"] <- diff(quantile(obs, c(0.05, 0.95)))
+  tab1.m[1, "Total number of sequences"] = length(obs)
+  
+  # check if any peak is closer than a given threshold to any other
+  num.close.modes <- sum(diff(ker$x[p]) < min.dist)
+  if ( any(diff(ker$x[p]) < min.dist) && (num.pm > 1) ) {
+    tab1.m[1, "Warning close modes"] <- "Modes too close"
+    tab1.m[1, "Number close modes"] <- num.close.modes
+    tab1.m[1, "Modes (close modes excluded)"] <- num.pm - num.close.modes
+  } else {
+    tab1.m[1, "Modes (close modes excluded)"] <- num.pm 
+  }
+  
+  # part 5 b): results table 2  
+  if (bs.cis == TRUE) {
+    ker <- lapply( estimateB, "[[", "ker")
+    peaks <- lapply( estimateB, "[[", "peaks")
+    num.peaks <- c()
+    for (i in 1:length(peaks)) {
+      curr.peaks <- ker[[i]]$x[ peaks[[i]] ]
+      num.cl <- sum(diff(curr.peaks) < min.dist)  
+      num.peaks <- c(num.peaks, length(curr.peaks) - num.cl)
+    }
+    
+    # fill table 2 with stats on number of modes in bs samples 
+    num <- num.pm - num.close.modes
+    tab2.m[1, "Number of modes (NM)"] <- num
+    tab2.m[1, "% of samples with same NM"] <- 100 * sum(num.peaks == num) / bstrp.reps  # equal
+    tab2.m[1, "% of samples with more NM"] <- 100 * sum(num.peaks > num) / bstrp.reps  # more
+    tab2.m[1, "% of samples with less NM"] <- 100 * sum(num.peaks < num) / bstrp.reps  # less
+    if (num.pm > 1) {
+      tab2.m[1, "Warning CI"] <- "CI's may be unreliable"
+    }
+    
+    tab2.m[1, "no. of samples with same NM"] <- sum(num.peaks == num)
+    tab2.m[1,  "% BS samples excluded by prob. mass crit."] <- (1 - sum(mass.ok) / sum(num.peaks.ok)) * 100
+  }
+
+  # return the results
+  if (bs.cis){
+    return(list(tab.des = tab1.m, tab.bs = tab2.m, valleys = vals))
+  } else {
+    return(list(tab.des = tab1.m, valleys = vals))
+  }
+}
+
+
+