comparison Functions/Kernel_function_form.R @ 0:1535ffddeff4 draft

planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
author cristian
date Thu, 07 Sep 2017 08:51:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1535ffddeff4
1 ##############################################
2 # KDE function for single observation sequence #
3 ##############################################
4
5 # load packages
6 require(moments, quietly = TRUE)
7 require(doParallel, quietly = TRUE)
8
9
10 # get names for the columns of the table characterizing the peaks
11 col.names.peaks <- function()
12 {
13 v <- c("Number of modes", "Number of modes (5% excluded)",
14 "Number of modes (10% excluded)", "Skewness",
15 "Mode skewness", "Nonparametric skewness", "Q50 skewness",
16 "Absolute Q50 mode skewness", "Absolute Q80 mode skewness",
17 "Peak 1", "Probability Mass 1",
18 "Peak 2", "Probability Mass 2",
19 "Peak 3", "Probability Mass 3",
20 "Peak 4", "Probability Mass 4",
21 "Peak 5", "Probability Mass 5",
22 "Peak 6", "Probability Mass 6",
23 "Peak 7", "Probability Mass 7",
24 "Peak 8", "Probability Mass 8",
25 "Peak 9", "Probability Mass 9",
26 "Peak 10", "Probability Mass 10",
27 "Warning close modes",
28 "Number close modes",
29 "Modes (close modes excluded)",
30 "SD", "IQR 80", "IQR 90",
31 "Total number of sequences")
32 return(v)
33 }
34
35
36 # get names for the columns of the table containing the boostrap results
37 col.names.bs <- function()
38 {
39 v <- c("Number of modes (NM)",
40 "% of samples with same NM",
41 "% of samples with more NM",
42 "% of samples with less NM",
43 "no. of samples with same NM",
44 "% BS samples excluded by prob. mass crit.",
45 "Warning CI")
46 return(v)
47 }
48
49
50 # plot KDE for one set up CpGo/e ratios
51 # obs: CpGo/e ratios
52 # bs.cis: Is bootstrap done?
53 # t.name: Title of the plot
54 # t.sub: Text that is added below the title
55 # t.legend: Is a legend printed?
56 plot.KDE <- function(obs, t.name, bs.cis = FALSE, bstrp.reps = 1500, conf.lev = 0.95, t.sub = NULL, t.legend = TRUE, min.dist = 0.2, mode.mass = 0.01, band.width = 1.06)
57 {
58 # determine directory where functions are located
59 cmdArgs <- commandArgs(trailingOnly = FALSE)
60 str <- "--file="
61 match <- grep(str, cmdArgs)
62 if (length(match) == 0) {
63 FCTN.DIR <- "../Galaxy/Functions"
64 } else {
65 path <- normalizePath( sub(str, "", cmdArgs[match]) )
66 FCTN.DIR <- file.path(dirname(path), "Functions")
67 }
68
69 # part 1: initialize parameters etc
70 # ---------------------------------
71
72 # table with names and number of peaks
73 v <- col.names.peaks()
74 tab1.m <- data.frame(matrix(NA, nrow = 1, ncol = length(v)))
75 names(tab1.m) <- col.names.peaks()
76 # parameters to set in any case
77 num.points <- 10 ^ 4 # number of points where to estimate the density
78 p.bw <- "nrd" # algorithm for the bandwidth selection, "nrd" for Scott's bandwith
79 use.seed <- TRUE
80 threshold.modes <- mode.mass
81 threshold.bs.ci <- 0.2 # only changes of +- threshold.bs.ci% in prob. mass is allowed for entering the CI calculation
82 # bootstrap with optional parameters
83 if (bs.cis) {
84 ncpus = max(1, detectCores()) # number of processsors
85 cl <- makeCluster(ncpus)
86 registerDoParallel(cl)
87 # table for the bootstrap
88 tab2.m <- data.frame(matrix(NA, nrow = 1, ncol = 7))
89 names(tab2.m) <- col.names.bs()
90 }
91
92
93 # part 2: estimate the KD and calculate probability masses
94 # --------------------------------------------------------
95
96 source( file.path(FCTN.DIR, "density_pm.R") )
97 estimate <- density_pm(obs, num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes)
98 ker <- estimate$ker # kernel density
99 p <- estimate$peaks
100 v <- estimate$valleys
101 pm <- estimate$pm
102
103 # select for each peak a line type
104 num.pm <- length(p) # number of peaks
105 p5 <- estimate$p5
106 p10 <- estimate$p10
107 lty = rep(1, num.pm) #pm > 0.10
108 if (length(p10) > 0) { #pm < 0.10
109 lty[p10] <- 2
110 }
111 if (length(p5) > 0) { #pm < 0.05
112 lty[p5] <- 4
113 }
114 p.lty <- lty
115
116 # part 3: bootstrap
117 # -----------------
118
119 if (bs.cis == TRUE) {
120 # do bootstrapping
121 estimateB = foreach(j = 1 : bstrp.reps ) %dopar% {
122 if (use.seed == TRUE) {set.seed(j)}
123 source( file.path(FCTN.DIR, "density_pm.R") ) # As doParallel is used, the source code has to be included here
124 obs.boot = obs[sample(seq(obs), replace = TRUE)]
125 density_pm_boot(obs.boot, num.points = num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes)
126 }
127 stopCluster(cl)
128
129 # calculate CIs based on samples where the number of peaks of the KDE coincide with the original data
130 # and the probability mass of the peaks of the sample don not divert to strongly from the original data
131 # ... extract modes and pm for samples
132 num.pmB <- sapply(lapply(estimateB, "[[", "peaks"), length) # no of peaks before cleaning
133 num.peaks.ok <- num.pmB == num.pm # samples with same number of peaks before cleaning
134 pB <- t(sapply( estimateB[num.peaks.ok], "[[", "peaks") )
135 pmB <- t(sapply( estimateB[num.peaks.ok], "[[", "pm") )
136 if (num.pm == 1) { # put into right matrix form
137 pB <- t(pB)
138 pmB <- t(pmB)
139 }
140
141 # ... check if prob. mass changes too strongly for any mode
142 t.pmB <- t(pmB)
143 keep.ub <- !apply(pm * (1 + threshold.bs.ci) < t.pmB, 2, any)
144 keep.lb <- !apply(pm * (1 - threshold.bs.ci) > t.pmB, 2, any)
145 mass.ok <- keep.ub & keep.lb
146 pB.cl <- as.matrix( pB[mass.ok, ] )
147
148 # ... determine CIs
149 q <- (1 - conf.lev) / 2
150 p.CI <- apply(pB.cl, 2, quantile, probs = c(q, 1 - q))
151 }
152
153
154 # part 4: plots
155 # -------------
156
157 #t.breaks <- seq(0, max(obs)*1.05, by = 0.03)
158 t.breaks = 50
159 hist_data <- hist(obs, breaks = t.breaks, plot = FALSE)
160 hist(obs, breaks = t.breaks, prob = TRUE, main = t.name,
161 # sub = paste("Gaussian kernel with band width", band.width),
162 xlab = "CpG o/e ratio",
163 col = grey(0.9), border = grey(0.6)) #, xlim = c(-0.05, max(obs)*1.1))
164 if (!is.null(t.sub)) {
165 mtext(t.sub)
166 }
167 if (bs.cis == TRUE) { # CI
168 j <- 1 : ncol(p.CI)
169 rect(ker$x[p.CI[1, j]], 0, ker$x[p.CI[2, j]],
170 15, density = 20, angle = 45 + (j - 1) * 90, col = "blue")
171 }
172 lines(ker, col = "red", lwd = 2) # density
173
174 # vertical lines at peaks
175 x.pos = ker$x[p]
176 ok <- c()
177 close <- c()
178 for (i in 1:length(x.pos)) {
179 b <- TRUE
180 if (i > 1) {
181 if (x.pos[i] - x.pos[i - 1] < min.dist) {
182 b <- FALSE
183 }
184 }
185 if (i < length(x.pos)) {
186 if (x.pos[i + 1] - x.pos[i] < min.dist) {
187 b <- FALSE
188 }
189 }
190
191 if (b) {
192 ok <- c(ok, i)
193 } else {
194 close <- c(close, i)
195 }
196 }
197 # peaks
198 if (length(ok) > 0) {
199 abline(v = ker$x[p][ok], col = "blue", lwd = 3, lty = p.lty)
200 }
201 if (length(close) > 0) {
202 abline(v = ker$x[p][close], col = "orange", lwd = 3, lty = p.lty)
203 }
204 # valleys
205 vals = ""
206 if (length(x.pos)>1) {
207 # ker$x[v][c(-1, -length(ker$x[v]))]
208 vals = ker$x[v][c(-1, -length(ker$x[v]))]
209 abline(v = vals, col = "black", lwd = 1)
210 }
211
212 # legend
213 if(t.legend == TRUE) {
214 t.sym <- expression(""<="", ""<"", "">="")
215 thr = threshold.modes
216 if (thr >= 0.1) {
217 md.labs <- substitute(paste("Mode with PM ", sym3, " ", thr, sep = ""), list(thr = thr, sym3 = t.sym[[3]]))
218 } else {
219 md.labs <- substitute(paste("Mode with PM ", sym3, " 0.1", sep = ""), list(sym3 = t.sym[[3]]))
220 if (thr >= 0.05) {
221 md.labs <- c( md.labs, substitute(paste("Mode with ", thr, " ", sym1, " PM ", sym2, " 0.1", sep = ""), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])) )
222 } else {
223 md.labs <- c( md.labs, substitute(paste("Mode with 0.05 ", sym1, " PM ", sym2, " 0.1", sep = ""), list(sym1 = t.sym[[1]], sym2 = t.sym[[2]])),
224 substitute(paste("Mode with ", thr, sym1, " PM ", sym2, " 0.05"), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])))
225 }
226 }
227 legend("topright",
228 c(expression("Estimated density"), md.labs),
229 lty = c(1, 1, 2, 4), lwd = c(2, 3, 3, 3),
230 col = c("red", "blue", "blue", "blue"), bg = "white")
231 }
232
233
234 # part 5: return results
235 # ----------------------
236
237 # part 5 a): results table 1
238 # tbd (maybe): introduce maximum number of modes (10 right now)
239 tab1.m[1, "Number of modes"] <- num.pm
240 tab1.m[1, 2] = num.pm - length(estimate$p5)
241 tab1.m[1, 3] = num.pm - length(estimate$p10)
242 for(j in 1 : 10){
243 if(num.pm < j){
244 tab1.m[1, j * 2 + 8] = c(" ")
245 tab1.m[1, j * 2 + 9] = c(" ")
246 } else{
247 tab1.m[1, j * 2 + 8] = ker$x[p][j]
248 tab1.m[1, j * 2 + 9] = pm[j]
249 }
250 }
251
252 # fill table 1 with descriptives
253 tab1.m[1, "Skewness"] <- skewness(obs)
254 mode <- ker$x[ which.max(ker$y) ]
255 tab1.m[1, "Mode skewness"] <- (mean(obs) - mode) / sd(obs)
256 tab1.m[1, "Nonparametric skewness"] <- (mean(obs) - median(obs)) / sd(obs)
257 q <- quantile(obs, c(0.25, 0.5, 0.75))
258 tab1.m[1, "Q50 skewness"] <- (q[3] + q[1] - 2 * q[2]) / (q[3] - q[1])
259 tab1.m[1, "Absolute Q50 mode skewness"] <- (q[3] + q[1]) / 2 - mode
260 q <- quantile(obs, c(0.1, 0.5, 0.9))
261 tab1.m[1, "Absolute Q80 mode skewness"] <- (q[3] + q[1]) / 2 - mode
262 tab1.m[1, "SD"] <- sd(obs)
263 tab1.m[1, "IQR 80"] <- diff(quantile(obs, c(0.1, 0.9)))
264 tab1.m[1, "IQR 90"] <- diff(quantile(obs, c(0.05, 0.95)))
265 tab1.m[1, "Total number of sequences"] = length(obs)
266
267 # check if any peak is closer than a given threshold to any other
268 num.close.modes <- sum(diff(ker$x[p]) < min.dist)
269 if ( any(diff(ker$x[p]) < min.dist) && (num.pm > 1) ) {
270 tab1.m[1, "Warning close modes"] <- "Modes too close"
271 tab1.m[1, "Number close modes"] <- num.close.modes
272 tab1.m[1, "Modes (close modes excluded)"] <- num.pm - num.close.modes
273 } else {
274 tab1.m[1, "Modes (close modes excluded)"] <- num.pm
275 }
276
277 # part 5 b): results table 2
278 if (bs.cis == TRUE) {
279 ker <- lapply( estimateB, "[[", "ker")
280 peaks <- lapply( estimateB, "[[", "peaks")
281 num.peaks <- c()
282 for (i in 1:length(peaks)) {
283 curr.peaks <- ker[[i]]$x[ peaks[[i]] ]
284 num.cl <- sum(diff(curr.peaks) < min.dist)
285 num.peaks <- c(num.peaks, length(curr.peaks) - num.cl)
286 }
287
288 # fill table 2 with stats on number of modes in bs samples
289 num <- num.pm - num.close.modes
290 tab2.m[1, "Number of modes (NM)"] <- num
291 tab2.m[1, "% of samples with same NM"] <- 100 * sum(num.peaks == num) / bstrp.reps # equal
292 tab2.m[1, "% of samples with more NM"] <- 100 * sum(num.peaks > num) / bstrp.reps # more
293 tab2.m[1, "% of samples with less NM"] <- 100 * sum(num.peaks < num) / bstrp.reps # less
294 if (num.pm > 1) {
295 tab2.m[1, "Warning CI"] <- "CI's may be unreliable"
296 }
297
298 tab2.m[1, "no. of samples with same NM"] <- sum(num.peaks == num)
299 tab2.m[1, "% BS samples excluded by prob. mass crit."] <- (1 - sum(mass.ok) / sum(num.peaks.ok)) * 100
300 }
301
302 # return the results
303 if (bs.cis){
304 return(list(tab.des = tab1.m, tab.bs = tab2.m, valleys = vals))
305 } else {
306 return(list(tab.des = tab1.m, valleys = vals))
307 }
308 }
309
310
311