Mercurial > repos > cristian > notos
comparison Functions/Kernel_function_form.R @ 0:1535ffddeff4 draft
planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
author | cristian |
---|---|
date | Thu, 07 Sep 2017 08:51:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1535ffddeff4 |
---|---|
1 ############################################## | |
2 # KDE function for single observation sequence # | |
3 ############################################## | |
4 | |
5 # load packages | |
6 require(moments, quietly = TRUE) | |
7 require(doParallel, quietly = TRUE) | |
8 | |
9 | |
10 # get names for the columns of the table characterizing the peaks | |
11 col.names.peaks <- function() | |
12 { | |
13 v <- c("Number of modes", "Number of modes (5% excluded)", | |
14 "Number of modes (10% excluded)", "Skewness", | |
15 "Mode skewness", "Nonparametric skewness", "Q50 skewness", | |
16 "Absolute Q50 mode skewness", "Absolute Q80 mode skewness", | |
17 "Peak 1", "Probability Mass 1", | |
18 "Peak 2", "Probability Mass 2", | |
19 "Peak 3", "Probability Mass 3", | |
20 "Peak 4", "Probability Mass 4", | |
21 "Peak 5", "Probability Mass 5", | |
22 "Peak 6", "Probability Mass 6", | |
23 "Peak 7", "Probability Mass 7", | |
24 "Peak 8", "Probability Mass 8", | |
25 "Peak 9", "Probability Mass 9", | |
26 "Peak 10", "Probability Mass 10", | |
27 "Warning close modes", | |
28 "Number close modes", | |
29 "Modes (close modes excluded)", | |
30 "SD", "IQR 80", "IQR 90", | |
31 "Total number of sequences") | |
32 return(v) | |
33 } | |
34 | |
35 | |
36 # get names for the columns of the table containing the boostrap results | |
37 col.names.bs <- function() | |
38 { | |
39 v <- c("Number of modes (NM)", | |
40 "% of samples with same NM", | |
41 "% of samples with more NM", | |
42 "% of samples with less NM", | |
43 "no. of samples with same NM", | |
44 "% BS samples excluded by prob. mass crit.", | |
45 "Warning CI") | |
46 return(v) | |
47 } | |
48 | |
49 | |
50 # plot KDE for one set up CpGo/e ratios | |
51 # obs: CpGo/e ratios | |
52 # bs.cis: Is bootstrap done? | |
53 # t.name: Title of the plot | |
54 # t.sub: Text that is added below the title | |
55 # t.legend: Is a legend printed? | |
56 plot.KDE <- function(obs, t.name, bs.cis = FALSE, bstrp.reps = 1500, conf.lev = 0.95, t.sub = NULL, t.legend = TRUE, min.dist = 0.2, mode.mass = 0.01, band.width = 1.06) | |
57 { | |
58 # determine directory where functions are located | |
59 cmdArgs <- commandArgs(trailingOnly = FALSE) | |
60 str <- "--file=" | |
61 match <- grep(str, cmdArgs) | |
62 if (length(match) == 0) { | |
63 FCTN.DIR <- "../Galaxy/Functions" | |
64 } else { | |
65 path <- normalizePath( sub(str, "", cmdArgs[match]) ) | |
66 FCTN.DIR <- file.path(dirname(path), "Functions") | |
67 } | |
68 | |
69 # part 1: initialize parameters etc | |
70 # --------------------------------- | |
71 | |
72 # table with names and number of peaks | |
73 v <- col.names.peaks() | |
74 tab1.m <- data.frame(matrix(NA, nrow = 1, ncol = length(v))) | |
75 names(tab1.m) <- col.names.peaks() | |
76 # parameters to set in any case | |
77 num.points <- 10 ^ 4 # number of points where to estimate the density | |
78 p.bw <- "nrd" # algorithm for the bandwidth selection, "nrd" for Scott's bandwith | |
79 use.seed <- TRUE | |
80 threshold.modes <- mode.mass | |
81 threshold.bs.ci <- 0.2 # only changes of +- threshold.bs.ci% in prob. mass is allowed for entering the CI calculation | |
82 # bootstrap with optional parameters | |
83 if (bs.cis) { | |
84 ncpus = max(1, detectCores()) # number of processsors | |
85 cl <- makeCluster(ncpus) | |
86 registerDoParallel(cl) | |
87 # table for the bootstrap | |
88 tab2.m <- data.frame(matrix(NA, nrow = 1, ncol = 7)) | |
89 names(tab2.m) <- col.names.bs() | |
90 } | |
91 | |
92 | |
93 # part 2: estimate the KD and calculate probability masses | |
94 # -------------------------------------------------------- | |
95 | |
96 source( file.path(FCTN.DIR, "density_pm.R") ) | |
97 estimate <- density_pm(obs, num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes) | |
98 ker <- estimate$ker # kernel density | |
99 p <- estimate$peaks | |
100 v <- estimate$valleys | |
101 pm <- estimate$pm | |
102 | |
103 # select for each peak a line type | |
104 num.pm <- length(p) # number of peaks | |
105 p5 <- estimate$p5 | |
106 p10 <- estimate$p10 | |
107 lty = rep(1, num.pm) #pm > 0.10 | |
108 if (length(p10) > 0) { #pm < 0.10 | |
109 lty[p10] <- 2 | |
110 } | |
111 if (length(p5) > 0) { #pm < 0.05 | |
112 lty[p5] <- 4 | |
113 } | |
114 p.lty <- lty | |
115 | |
116 # part 3: bootstrap | |
117 # ----------------- | |
118 | |
119 if (bs.cis == TRUE) { | |
120 # do bootstrapping | |
121 estimateB = foreach(j = 1 : bstrp.reps ) %dopar% { | |
122 if (use.seed == TRUE) {set.seed(j)} | |
123 source( file.path(FCTN.DIR, "density_pm.R") ) # As doParallel is used, the source code has to be included here | |
124 obs.boot = obs[sample(seq(obs), replace = TRUE)] | |
125 density_pm_boot(obs.boot, num.points = num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes) | |
126 } | |
127 stopCluster(cl) | |
128 | |
129 # calculate CIs based on samples where the number of peaks of the KDE coincide with the original data | |
130 # and the probability mass of the peaks of the sample don not divert to strongly from the original data | |
131 # ... extract modes and pm for samples | |
132 num.pmB <- sapply(lapply(estimateB, "[[", "peaks"), length) # no of peaks before cleaning | |
133 num.peaks.ok <- num.pmB == num.pm # samples with same number of peaks before cleaning | |
134 pB <- t(sapply( estimateB[num.peaks.ok], "[[", "peaks") ) | |
135 pmB <- t(sapply( estimateB[num.peaks.ok], "[[", "pm") ) | |
136 if (num.pm == 1) { # put into right matrix form | |
137 pB <- t(pB) | |
138 pmB <- t(pmB) | |
139 } | |
140 | |
141 # ... check if prob. mass changes too strongly for any mode | |
142 t.pmB <- t(pmB) | |
143 keep.ub <- !apply(pm * (1 + threshold.bs.ci) < t.pmB, 2, any) | |
144 keep.lb <- !apply(pm * (1 - threshold.bs.ci) > t.pmB, 2, any) | |
145 mass.ok <- keep.ub & keep.lb | |
146 pB.cl <- as.matrix( pB[mass.ok, ] ) | |
147 | |
148 # ... determine CIs | |
149 q <- (1 - conf.lev) / 2 | |
150 p.CI <- apply(pB.cl, 2, quantile, probs = c(q, 1 - q)) | |
151 } | |
152 | |
153 | |
154 # part 4: plots | |
155 # ------------- | |
156 | |
157 #t.breaks <- seq(0, max(obs)*1.05, by = 0.03) | |
158 t.breaks = 50 | |
159 hist_data <- hist(obs, breaks = t.breaks, plot = FALSE) | |
160 hist(obs, breaks = t.breaks, prob = TRUE, main = t.name, | |
161 # sub = paste("Gaussian kernel with band width", band.width), | |
162 xlab = "CpG o/e ratio", | |
163 col = grey(0.9), border = grey(0.6)) #, xlim = c(-0.05, max(obs)*1.1)) | |
164 if (!is.null(t.sub)) { | |
165 mtext(t.sub) | |
166 } | |
167 if (bs.cis == TRUE) { # CI | |
168 j <- 1 : ncol(p.CI) | |
169 rect(ker$x[p.CI[1, j]], 0, ker$x[p.CI[2, j]], | |
170 15, density = 20, angle = 45 + (j - 1) * 90, col = "blue") | |
171 } | |
172 lines(ker, col = "red", lwd = 2) # density | |
173 | |
174 # vertical lines at peaks | |
175 x.pos = ker$x[p] | |
176 ok <- c() | |
177 close <- c() | |
178 for (i in 1:length(x.pos)) { | |
179 b <- TRUE | |
180 if (i > 1) { | |
181 if (x.pos[i] - x.pos[i - 1] < min.dist) { | |
182 b <- FALSE | |
183 } | |
184 } | |
185 if (i < length(x.pos)) { | |
186 if (x.pos[i + 1] - x.pos[i] < min.dist) { | |
187 b <- FALSE | |
188 } | |
189 } | |
190 | |
191 if (b) { | |
192 ok <- c(ok, i) | |
193 } else { | |
194 close <- c(close, i) | |
195 } | |
196 } | |
197 # peaks | |
198 if (length(ok) > 0) { | |
199 abline(v = ker$x[p][ok], col = "blue", lwd = 3, lty = p.lty) | |
200 } | |
201 if (length(close) > 0) { | |
202 abline(v = ker$x[p][close], col = "orange", lwd = 3, lty = p.lty) | |
203 } | |
204 # valleys | |
205 vals = "" | |
206 if (length(x.pos)>1) { | |
207 # ker$x[v][c(-1, -length(ker$x[v]))] | |
208 vals = ker$x[v][c(-1, -length(ker$x[v]))] | |
209 abline(v = vals, col = "black", lwd = 1) | |
210 } | |
211 | |
212 # legend | |
213 if(t.legend == TRUE) { | |
214 t.sym <- expression(""<="", ""<"", "">="") | |
215 thr = threshold.modes | |
216 if (thr >= 0.1) { | |
217 md.labs <- substitute(paste("Mode with PM ", sym3, " ", thr, sep = ""), list(thr = thr, sym3 = t.sym[[3]])) | |
218 } else { | |
219 md.labs <- substitute(paste("Mode with PM ", sym3, " 0.1", sep = ""), list(sym3 = t.sym[[3]])) | |
220 if (thr >= 0.05) { | |
221 md.labs <- c( md.labs, substitute(paste("Mode with ", thr, " ", sym1, " PM ", sym2, " 0.1", sep = ""), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])) ) | |
222 } else { | |
223 md.labs <- c( md.labs, substitute(paste("Mode with 0.05 ", sym1, " PM ", sym2, " 0.1", sep = ""), list(sym1 = t.sym[[1]], sym2 = t.sym[[2]])), | |
224 substitute(paste("Mode with ", thr, sym1, " PM ", sym2, " 0.05"), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]]))) | |
225 } | |
226 } | |
227 legend("topright", | |
228 c(expression("Estimated density"), md.labs), | |
229 lty = c(1, 1, 2, 4), lwd = c(2, 3, 3, 3), | |
230 col = c("red", "blue", "blue", "blue"), bg = "white") | |
231 } | |
232 | |
233 | |
234 # part 5: return results | |
235 # ---------------------- | |
236 | |
237 # part 5 a): results table 1 | |
238 # tbd (maybe): introduce maximum number of modes (10 right now) | |
239 tab1.m[1, "Number of modes"] <- num.pm | |
240 tab1.m[1, 2] = num.pm - length(estimate$p5) | |
241 tab1.m[1, 3] = num.pm - length(estimate$p10) | |
242 for(j in 1 : 10){ | |
243 if(num.pm < j){ | |
244 tab1.m[1, j * 2 + 8] = c(" ") | |
245 tab1.m[1, j * 2 + 9] = c(" ") | |
246 } else{ | |
247 tab1.m[1, j * 2 + 8] = ker$x[p][j] | |
248 tab1.m[1, j * 2 + 9] = pm[j] | |
249 } | |
250 } | |
251 | |
252 # fill table 1 with descriptives | |
253 tab1.m[1, "Skewness"] <- skewness(obs) | |
254 mode <- ker$x[ which.max(ker$y) ] | |
255 tab1.m[1, "Mode skewness"] <- (mean(obs) - mode) / sd(obs) | |
256 tab1.m[1, "Nonparametric skewness"] <- (mean(obs) - median(obs)) / sd(obs) | |
257 q <- quantile(obs, c(0.25, 0.5, 0.75)) | |
258 tab1.m[1, "Q50 skewness"] <- (q[3] + q[1] - 2 * q[2]) / (q[3] - q[1]) | |
259 tab1.m[1, "Absolute Q50 mode skewness"] <- (q[3] + q[1]) / 2 - mode | |
260 q <- quantile(obs, c(0.1, 0.5, 0.9)) | |
261 tab1.m[1, "Absolute Q80 mode skewness"] <- (q[3] + q[1]) / 2 - mode | |
262 tab1.m[1, "SD"] <- sd(obs) | |
263 tab1.m[1, "IQR 80"] <- diff(quantile(obs, c(0.1, 0.9))) | |
264 tab1.m[1, "IQR 90"] <- diff(quantile(obs, c(0.05, 0.95))) | |
265 tab1.m[1, "Total number of sequences"] = length(obs) | |
266 | |
267 # check if any peak is closer than a given threshold to any other | |
268 num.close.modes <- sum(diff(ker$x[p]) < min.dist) | |
269 if ( any(diff(ker$x[p]) < min.dist) && (num.pm > 1) ) { | |
270 tab1.m[1, "Warning close modes"] <- "Modes too close" | |
271 tab1.m[1, "Number close modes"] <- num.close.modes | |
272 tab1.m[1, "Modes (close modes excluded)"] <- num.pm - num.close.modes | |
273 } else { | |
274 tab1.m[1, "Modes (close modes excluded)"] <- num.pm | |
275 } | |
276 | |
277 # part 5 b): results table 2 | |
278 if (bs.cis == TRUE) { | |
279 ker <- lapply( estimateB, "[[", "ker") | |
280 peaks <- lapply( estimateB, "[[", "peaks") | |
281 num.peaks <- c() | |
282 for (i in 1:length(peaks)) { | |
283 curr.peaks <- ker[[i]]$x[ peaks[[i]] ] | |
284 num.cl <- sum(diff(curr.peaks) < min.dist) | |
285 num.peaks <- c(num.peaks, length(curr.peaks) - num.cl) | |
286 } | |
287 | |
288 # fill table 2 with stats on number of modes in bs samples | |
289 num <- num.pm - num.close.modes | |
290 tab2.m[1, "Number of modes (NM)"] <- num | |
291 tab2.m[1, "% of samples with same NM"] <- 100 * sum(num.peaks == num) / bstrp.reps # equal | |
292 tab2.m[1, "% of samples with more NM"] <- 100 * sum(num.peaks > num) / bstrp.reps # more | |
293 tab2.m[1, "% of samples with less NM"] <- 100 * sum(num.peaks < num) / bstrp.reps # less | |
294 if (num.pm > 1) { | |
295 tab2.m[1, "Warning CI"] <- "CI's may be unreliable" | |
296 } | |
297 | |
298 tab2.m[1, "no. of samples with same NM"] <- sum(num.peaks == num) | |
299 tab2.m[1, "% BS samples excluded by prob. mass crit."] <- (1 - sum(mass.ok) / sum(num.peaks.ok)) * 100 | |
300 } | |
301 | |
302 # return the results | |
303 if (bs.cis){ | |
304 return(list(tab.des = tab1.m, tab.bs = tab2.m, valleys = vals)) | |
305 } else { | |
306 return(list(tab.des = tab1.m, valleys = vals)) | |
307 } | |
308 } | |
309 | |
310 | |
311 |