Mercurial > repos > galaxyp > mqppep_preproc
comparison mqppep_anova_script.Rmd @ 1:b76c75521d91 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author | galaxyp |
---|---|
date | Fri, 28 Oct 2022 18:26:42 +0000 |
parents | 8dfd5d2b5903 |
children | a5e7469dfdfa |
comparison
equal
deleted
inserted
replaced
0:8dfd5d2b5903 | 1:b76c75521d91 |
---|---|
5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" | 5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" |
6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" | 6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" |
7 date: | 7 date: |
8 - "May 28, 2018" | 8 - "May 28, 2018" |
9 - "; revised June 23, 2022" | 9 - "; revised June 23, 2022" |
10 lot: true | |
10 output: | 11 output: |
11 pdf_document: | 12 pdf_document: |
12 toc: true | 13 toc: true |
13 toc_depth: 3 | 14 toc_depth: 2 |
14 keep_tex: true | 15 keep_tex: true |
15 header-includes: | 16 dev: pdf |
16 - \usepackage{longtable} | 17 includes: |
17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut | 18 in_header: mqppep_anova_preamble.tex |
18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut | 19 latex_macros: false |
20 raw_tex: true | |
21 urlcolor: blue | |
19 params: | 22 params: |
20 alphaFile: "test-data/alpha_levels.tabular" | 23 alphaFile: "test-data/alpha_levels.tabular" |
21 inputFile: "test-data/test_input_for_anova.tabular" | 24 inputFile: "test-data/test_input_for_anova.tabular" |
22 preprocDb: "test-data/test_input_for_anova.sqlite" | 25 preprocDb: "test-data/test_input_for_anova.sqlite" |
23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | 26 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] |
24 show_toc: true | |
25 firstDataColumn: "^Intensity[^_]" | |
26 imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
27 meanPercentile: 1 | |
28 sdPercentile: 1.0 | |
29 regexSampleNames: "\\.\\d+[A-Z]$" | 27 regexSampleNames: "\\.\\d+[A-Z]$" |
30 regexSampleGrouping: "\\d+" | 28 regexSampleGrouping: "\\d+" |
29 groupFilterPatterns: ".+" | |
30 groupFilter: !r c("none", "exclude", "include")[1] | |
31 imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
32 kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] | |
33 #imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
34 | |
35 # how should sample groups be interpreted? | |
36 # - "f": fixed patterns (like `grep -F`) | |
37 # - "p": PERL-compatible (like `grep -P`) | |
38 # - "r": extended grep patterns (like `grep -E`) | |
39 # use what case sensitivity? | |
40 # - "i": case insensitive matching (like `grep -i`) | |
41 groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1] | |
42 # what pattern should be used for the first column | |
43 # (extended grep pattern, case sensitive) | |
44 firstDataColumn: "^Intensity[^_]" | |
45 # for small random value imputation, what percentile should be center? | |
46 meanPercentile: 50 | |
47 #meanPercentile: 1 | |
48 # for small random value imputation, what should `s / mean(x)` ratio be? | |
49 sdPercentile: 1.0 | |
50 # output path for imputed data file | |
31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" | 51 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" |
52 # output path for imputed/quantile-normalized/log-transformed data file | |
32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" | 53 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" |
54 # output path for contents of `stats_metadata_v` table | |
33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" | 55 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" |
56 # how to test one variable with > 2 categories (e.g., aov or kruskal.test) | |
34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] | 57 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] |
58 # how to test one variable with 2 categories (e.g., oneway.test) | |
35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] | 59 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] |
36 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | 60 # what should be the minimum quality for consideration in both |
37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2] | 61 minQuality: 0 |
38 kseaMinKinaseCount: 1 | 62 # correct KSEA with FDR (recommended) or raw p-value |
39 intensityHeatmapRows: 75 | 63 kseaCutoffStatistic: !r c("FDR", "p.value")[1] |
64 # correct KSEA threshold 0.05 (conventional) or higher (perhaps better) | |
65 # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test | |
66 #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1] | |
67 # minimum number of substrates required for a kinase to be considered in KSEA | |
68 kseaMinSubstrateCount: 1 | |
69 # Should KSEA be performed aggregating signed log2FC or absolute? | |
70 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores | |
71 # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a | |
72 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. | |
73 kseaUseAbsoluteLog2FC: TRUE | |
74 #kseaUseAbsoluteLog2FC: FALSE | |
75 # minimum number of observed values per sample-group | |
76 intensityMinValuesPerGroup: 1 | |
77 # maximum number of heatmap rows (result are poor when > 50) | |
78 intensityHeatmapRows: 50 | |
79 # what should be the primary criterion to eliminate excessive heatmap rows | |
80 intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1] | |
81 # should correlation among substrates be used (rather than covariance) | |
82 correlateSubstrates: TRUE | |
83 # only show covariance among variables having variance > 1 | |
84 filterCovVarGT1: TRUE | |
85 # maximum number of residues to display for ppeps in rownames or columnames | |
86 ppepTruncN: 10 | |
87 # maximum number of characters of subgenes to display in rownames or columnames | |
88 subgeneTruncN: 10 | |
89 # maximum number of characters for paste(subgene, ppep) for enrichment plots | |
90 substTruncN: 20 | |
91 # should boxplots use variable-width boxes to reflect # of samples | |
92 boxPlotVarWidth: TRUE | |
93 # should boxplots use notched boxes to reflect difference between samples | |
94 boxPlotNotch: TRUE | |
95 # look-up tables for kinase descriptions | |
96 kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2" | |
97 kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2" | |
98 # should debugging trace messages be printed? | |
99 showEnrichedSubstrates: FALSE | |
100 | |
101 # should debugging nb/nbe messages be printed? | |
102 printNBMsgs: FALSE | |
103 # should debugging trace messages be printed? | |
104 printTraceMsgs: FALSE | |
105 # when debugging files are needed, set debugFileBasePath to the path | |
106 # to the directory where they should be writtn | |
107 debugFileBasePath: !r if (TRUE) NULL else "test-data" | |
40 --- | 108 --- |
41 <!-- | 109 |
42 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | 110 ```{r setup, include = FALSE, results = 'asis'} |
43 kseaCutoffThreshold: !r c(0.05, 0.1)[1] | 111 |
44 | 112 # simple debug messaging |
45 alphaFile: "test-data/alpha_levels.tabular" | 113 print_nb_messages <- params$printNBMsgs |
46 inputFile: "test-data/test_input_for_anova.tabular" | 114 |
47 preprocDb: "test-data/test_input_for_anova.sqlite" | 115 nb <- if (!print_nb_messages) { |
48 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | 116 function(...) invisible() |
49 | 117 } else { |
50 alphaFile: "test-data/alpha_levels.tabular" | 118 function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...) |
51 inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | 119 } |
52 preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | 120 |
53 kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | 121 nbe <- if (!print_nb_messages) { |
54 | 122 function(...) invisible() |
55 alphaFile: "test-data/alpha_levels.tabular" | 123 } else { |
56 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" | 124 function(..., f = cat, file = stderr()) { |
57 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" | 125 cat( |
58 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | 126 stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"), |
59 | 127 ..., |
60 alphaFile: "test-data/alpha_levels.tabular" | 128 file = file |
61 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" | 129 ) |
62 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" | 130 } |
63 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | 131 } |
64 | 132 |
65 inputFile: "test-data/density_failure.preproc_tab.tabular" | |
66 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] | |
67 latex_document: default | |
68 --> | |
69 ```{r setup, include = FALSE} | |
70 #ref for debugging: https://yihui.org/tinytex/r/#debugging | 133 #ref for debugging: https://yihui.org/tinytex/r/#debugging |
71 options(tinytex.verbose = TRUE) | 134 options(tinytex.verbose = TRUE) |
72 | 135 |
73 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | 136 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 |
74 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | 137 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 |
75 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) | 138 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300) |
76 | 139 |
77 # freeze the random number generator so the same results will be produced | 140 # freeze the random number generator so the same results will be produced |
78 # from run to run | 141 # from run to run |
79 set.seed(28571) | 142 set.seed(28571) |
80 | 143 |
81 ### LIBRARIES | 144 ### LIBRARIES |
145 | |
146 if (print_nb_messages) nbe("library(gplots)") | |
82 library(gplots) | 147 library(gplots) |
148 if (print_nb_messages) nbe("library(caret)") | |
149 # load caret for nearZeroVar | |
150 if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n") | |
151 library(caret) | |
152 if (print_nb_messages) nbe("library(DBI)") | |
83 library(DBI) | 153 library(DBI) |
154 if (print_nb_messages) nbe("library(RSQLite)") | |
84 library(RSQLite) | 155 library(RSQLite) |
156 if (print_nb_messages) nbe("library(sqldf)\n") | |
85 # Suppress "Warning: no DISPLAY variable so Tk is not available" | 157 # Suppress "Warning: no DISPLAY variable so Tk is not available" |
86 suppressWarnings(suppressMessages(library(sqldf))) | 158 suppressWarnings(suppressMessages(library(sqldf))) |
87 | 159 |
88 # required but not added to search list: | 160 # required but not added to search list: |
89 # - DBI | 161 # - DBI |
95 # - reshape2 | 167 # - reshape2 |
96 # - vioplot | 168 # - vioplot |
97 | 169 |
98 ### CONSTANTS | 170 ### CONSTANTS |
99 | 171 |
100 const_parfin <- par("fin") | 172 const_boxplot_fill <- "grey94" |
101 const_boxplot_fill <- "grey94" | |
102 const_stripchart_cex <- 0.5 | |
103 const_stripsmall_cex <- | |
104 sqrt(const_stripchart_cex * const_stripchart_cex / 2) | |
105 const_stripchart_jitter <- 0.3 | |
106 const_write_debug_files <- FALSE | |
107 const_table_anchor_bp <- "bp" | |
108 const_table_anchor_ht <- "ht" | |
109 const_table_anchor_p <- "p" | |
110 const_table_anchor_tbp <- "tbp" | |
111 | |
112 | |
113 const_ksea_astrsk_kinases <- 1 | 173 const_ksea_astrsk_kinases <- 1 |
114 const_ksea_nonastrsk_kinases <- 2 | 174 const_ksea_nonastrsk_kinases <- 2 |
115 const_ksea_all_kinases <- 3 | 175 const_ksea_all_kinases <- 3 |
116 | 176 const_log10_e <- log10(exp(1)) |
117 const_log10_e <- log10(exp(1)) | 177 const_stripchart_cex <- 0.5 |
118 | 178 const_stripchart_jitter <- 0.3 |
119 ### FUNCTIONS | 179 const_table_anchor_bp <- "bp" |
120 | 180 const_table_anchor_ht <- "ht" |
121 # from `demo(error.catching)` | 181 const_table_anchor_p <- "p" |
182 const_table_anchor_t <- "t" | |
183 const_table_anchor_tbp <- "tbp" | |
184 | |
185 | |
186 ### GLOBAL VARIABLES (params) | |
187 | |
188 ## functions to process params | |
189 | |
190 is_string_null_or_empty <- function(x) { | |
191 # N. B. non-strings are intentionally treated as NULL | |
192 if (is.null(x)) | |
193 TRUE | |
194 else if (!is.character(x)) | |
195 TRUE | |
196 else x == "" | |
197 } | |
198 | |
122 ##' Catch *and* save both errors and warnings, and in the case of | 199 ##' Catch *and* save both errors and warnings, and in the case of |
123 ##' a warning, also keep the computed result. | 200 ##' a warning, also keep the computed result. |
201 ##' return result as list(value = ..., warning = ...) | |
202 ##' - value will be: | |
203 ##' - the result if no exception is thrown | |
204 ##' - the exception if an exception is caught | |
205 ##' - warning will be a string except perhaps when warning argument is not NULL | |
206 ##' | |
207 ##' adapted from `demo(error.catching)` | |
124 ##' | 208 ##' |
125 ##' @title tryCatch both warnings (with value) and errors | 209 ##' @title tryCatch both warnings (with value) and errors |
126 ##' @param expr an \R expression to evaluate | 210 ##' @param expr an \R expression to evaluate |
127 ##' @return a list with 'value' and 'warning', where | 211 ##' @return a list with 'value' and 'warning', where |
128 ##' 'value' may be an error caught. | 212 ##' 'value' may be an error caught. |
129 ##' @author Martin Maechler; | 213 ##' @author Martin Maechler; |
130 ##' Copyright (C) 2010-2012 The R Core Team | 214 ##' Copyright (C) 2010-2012 The R Core Team |
131 try_catch_w_e <- function(expr) { | 215 try_catch_w_e <- |
132 wrn <- NULL | 216 function(expr, error = function(e) e, warning = NULL) { |
133 # warning handler | 217 wrn <- NULL |
134 w_handler <- function(w) { | 218 # warning handler |
135 wrn <<- w | 219 w_handler <- |
136 invokeRestart("muffleWarning") | 220 if (is.function(warning)) |
137 } | 221 warning |
138 list( | 222 else |
139 value = withCallingHandlers( | 223 function(w) { |
140 tryCatch( | 224 wrn <<- w |
141 expr, | 225 invokeRestart("muffleWarning") |
142 error = function(e) e | 226 } |
227 e_handler <- | |
228 if (is.function(error)) | |
229 error | |
230 else | |
231 function(e) e | |
232 # return result as list(value = ..., warning = ...) | |
233 # - value will be: | |
234 # - the result if no exception is thrown | |
235 # - the exception if an exception is caught | |
236 list( | |
237 value = withCallingHandlers( | |
238 tryCatch( | |
239 expr, | |
240 error = e_handler | |
241 ), | |
242 warning = w_handler | |
143 ), | 243 ), |
144 warning = w_handler | 244 warning = wrn |
145 ), | 245 ) |
146 warning = wrn | 246 } |
147 ) | 247 |
148 } | 248 see_kvp <- |
149 | 249 function(format, key, value, suffix = "") { |
150 | 250 if ( |
151 write_debug_file <- function(s) { | 251 !all( |
152 if (const_write_debug_files) { | 252 is.character(format), |
153 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) | 253 is.character(key), |
154 print(sprintf("DEBUG writing file %s", spath)) | 254 is.character(value), |
255 is.character(suffix) | |
256 ) | |
257 ) { | |
258 cat("all arguments to see_kvp should be character") | |
259 knitr::knit_exit() | |
260 } | |
261 result <- sprintf(format, value) | |
262 if (length(result) > 1) { | |
263 sprintf( | |
264 "%s = c(%s)%s", | |
265 whack_underscores(key), | |
266 paste(result, collapse = ", "), | |
267 suffix | |
268 ) | |
269 } else { | |
270 sprintf( | |
271 "%s = %s%s", | |
272 key, | |
273 result, | |
274 suffix | |
275 ) | |
276 } | |
277 } | |
278 | |
279 see_logical <- | |
280 function(x, suffix = "", xprssn = deparse1(substitute(x))) { | |
281 result <- as.character(as.logical(x)) | |
282 # handle NAs and NaNs | |
283 result[is.na(result)] <- "NA" | |
284 see_kvp( | |
285 format = "%s", | |
286 key = xprssn, | |
287 value = result, | |
288 suffix = suffix | |
289 ) | |
290 } | |
291 | |
292 see_numeric <- | |
293 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { | |
294 if (is.numeric(digits) && is.numeric(x)) { | |
295 digits <- as.integer(digits) | |
296 digits <- min(16, max(0, digits)) | |
297 format <- paste0("%0.", as.character(digits), "g") | |
298 result <- sprintf(format, x) | |
299 see_kvp( | |
300 format = "%s", | |
301 key = xprssn, | |
302 value = result, | |
303 suffix = suffix | |
304 ) | |
305 } | |
306 } | |
307 | |
308 see_character <- | |
309 function(x, suffix = "", xprssn = deparse1(substitute(x))) { | |
310 if (is.character(x)) { | |
311 see_kvp( | |
312 format = "%s", | |
313 key = xprssn, | |
314 value = sprintf("\"%s\"", x), | |
315 suffix = suffix | |
316 ) | |
317 } | |
318 } | |
319 | |
320 see_variable <- | |
321 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { | |
322 if (is.character(x)) { | |
323 see_character(x, suffix, xprssn) | |
324 } else if (is.numeric(x)) { | |
325 see_numeric(x, suffix, digits, xprssn) | |
326 } else if (is.logical(x)) { | |
327 see_logical(x, suffix, xprssn) | |
328 } else { | |
329 f <- file("") | |
330 sink(f) | |
331 str(x) | |
332 msg <- paste(readLines(f), collapse = "\n") | |
333 sink() | |
334 close(f) | |
335 paste0( | |
336 "see_variable - str(", | |
337 xprssn, | |
338 "):\n", | |
339 msg, "\n" | |
340 ) | |
341 } | |
342 } | |
343 | |
344 # ref: https://tug.org/texinfohtml/latex2e.html | |
345 # LaTeX sets aside the following characters for special purposes. | |
346 # For example, the percent sign % is for comments. | |
347 # They are called reserved characters or special characters. | |
348 # They are all discussed elsewhere in this manual. | |
349 # | |
350 # $ % & { } _ ~ ^ \ # | |
351 # | |
352 # If you want a reserved character to be printed as itself, in the text body | |
353 # font, for all but the final three characters in that list simply put | |
354 # a backslash \ in front of the character. | |
355 # Thus, typing \$1.23 will produce $1.23 in your output. | |
356 # | |
357 # As to the last three characters, to get a tilde in the text body font, | |
358 # use \~{} (omitting the curly braces would result in the next character | |
359 # receiving a tilde accent). | |
360 # Similarly, to get a text body font circumflex use \^{}. | |
361 # To get a backslash in the font of the text body enter \textbackslash{}. | |
362 whack_math <- | |
363 function(v) { | |
364 v <- as.character(v) | |
365 w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE) | |
366 w <- Reduce( | |
367 f = function(l, r) { | |
368 gsub(r, paste0("\\", r), l, fixed = TRUE) | |
369 }, | |
370 x = c("#", "$", "%", "&", "{", "}", "_"), | |
371 init = w | |
372 ) | |
373 w <- gsub("^", "\\^{}", w, fixed = TRUE) | |
374 return(w) | |
375 if (all(v == w)) | |
376 v | |
377 else | |
378 paste0("\\texttt{", w, "}") | |
379 } | |
380 whack_underscores <- whack_math | |
381 | |
382 ## dump params to stderr (remove this eventually) | |
383 | |
384 if (FALSE) nbe(see_variable(params)) | |
385 | |
386 ## unlist params for eventual output | |
387 | |
388 param_unlist <- unlist(as.list(params)) | |
389 | |
390 # no need to whack underscores and dollars because this is verbatim | |
391 param_df <- data.frame( | |
392 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
393 value = paste0( | |
394 "\n\\begin{tiny}\n\\verb@", | |
395 param_unlist, | |
396 "@\n\\end{tiny}" | |
397 ) | |
398 ) | |
399 param_df <- data.frame( | |
400 parameter = names(param_unlist), | |
401 value = param_unlist | |
402 ) | |
403 param_df <- param_df[order(param_df$parameter), ] | |
404 | |
405 ## general output control | |
406 | |
407 debug_file_base_path <- params$debugFileBasePath | |
408 print_trace_messages <- params$printTraceMsgs | |
409 show_enriched_substrates <- params$showEnrichedSubstrates | |
410 boxplot_varwidth <- params$boxPlotVarWidth | |
411 boxplot_notch <- params$boxPlotNotch | |
412 | |
413 ## parameters for static data | |
414 | |
415 kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2 | |
416 kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2 | |
417 | |
418 ## parameters for input file | |
419 | |
420 preproc_db <- params$preprocDb | |
421 alpha_file <- params$alphaFile | |
422 input_file <- params$inputFile | |
423 | |
424 # First data column - ideally, this could be detected via | |
425 # regexSampleNames, but for now leave it as is. | |
426 first_data_column <- params$firstDataColumn | |
427 fdc_is_integer <- is.integer(first_data_column) | |
428 if (fdc_is_integer) { | |
429 first_data_column <- as.integer(params$firstDataColumn) | |
430 } | |
431 | |
432 ## parameters for output files | |
433 | |
434 ksea_app_prep_db <- params$kseaAppPrepDb | |
435 imputed_data_filename <- params$imputedDataFilename | |
436 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
437 anova_ksea_mtdt_file <- params$anovaKseaMetadata | |
438 | |
439 ## parameters for imputation | |
440 | |
441 # Imputation method, should be one of | |
442 # "random", "group-median", "median", or "mean" | |
443 imputation_method <- params$imputationMethod | |
444 | |
445 # Selection of percentile of logvalue data to set the mean for random number | |
446 # generation when using random imputation | |
447 mean_percentile <- params$meanPercentile / 100.0 | |
448 | |
449 # deviation adjustment-factor for random values; real number. | |
450 sd_percentile <- params$sdPercentile | |
451 | |
452 ## parameters for group parsing and filtering | |
453 | |
454 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
455 regex_sample_names <- params$regexSampleNames | |
456 # Regular expression to extract Sample Grouping from Sample Name; | |
457 # if error occurs, compare smpl_trt vs. sample_name_matches | |
458 # to see if groupings/pairs line up | |
459 # e.g., "(\\d+)" | |
460 | |
461 regex_sample_grouping <- params$regexSampleGrouping | |
462 # What are the patterns for filtering sample groups? | |
463 # How should sample groups be filtered? | |
464 # - none: do not filter | |
465 # - include: include sample groups matching filter | |
466 # - exclude: include sample groups not matching filter | |
467 | |
468 sample_group_filter <- params$groupFilter | |
469 if (grepl("f", params$groupFilterMode, fixed = TRUE)) { | |
470 sample_group_filter_perl <- FALSE | |
471 sample_group_filter_fixed <- TRUE | |
472 } else if (grepl("p", params$groupFilterMode, fixed = TRUE)) { | |
473 sample_group_filter_perl <- TRUE | |
474 sample_group_filter_fixed <- FALSE | |
475 } else { # normal regex | |
476 sample_group_filter_perl <- FALSE | |
477 sample_group_filter_fixed <- FALSE | |
478 } | |
479 | |
480 sample_group_filter_nocase <- | |
481 grepl("i", params$groupFilterMode, fixed = TRUE) | |
482 | |
483 # What PCRE patterns should be included or excluded | |
484 group_filter_patterns_csv <- params$groupFilterPatterns | |
485 sample_group_filter_patterns <- strsplit( | |
486 x = group_filter_patterns_csv, | |
487 split = ",", | |
488 fixed = TRUE | |
489 )[[1]] | |
490 | |
491 ## parameters for hypothesis testing | |
492 | |
493 one_way_all_categories_fname <- params$oneWayManyCategories | |
494 | |
495 one_way_all_categories <- try_catch_w_e( | |
496 match.fun(one_way_all_categories_fname)) | |
497 | |
498 if (!is.function(one_way_all_categories$value)) { | |
499 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
500 write(one_way_all_categories$value$message, stderr()) | |
501 if (sys.nframe() > 0) { | |
502 cat("Cannot continue and quit() failed. Goodbye.") | |
503 knitr::knit_exit() | |
504 quit(save = "no", status = 1) | |
505 } | |
506 } | |
507 | |
508 one_way_all_categories <- one_way_all_categories$value | |
509 | |
510 one_way_two_categories_fname <- params$oneWayManyCategories | |
511 one_way_two_categories <- try_catch_w_e( | |
512 match.fun(one_way_two_categories_fname)) | |
513 if (!is.function(one_way_two_categories$value)) { | |
514 cat("fatal error for parameter oneWayTwoCategories: \n") | |
515 cat(one_way_two_categories$value$message, fill = TRUE) | |
516 if (sys.nframe() > 0) { | |
517 cat("Cannot continue and quit() failed. Goodbye.") | |
518 knitr::knit_exit() | |
519 quit(save = "no", status = 1) | |
520 } | |
521 } | |
522 one_way_two_categories <- one_way_two_categories$value | |
523 | |
524 ## parameters for KSEA | |
525 | |
526 ksea_cutoff_statistic <- params$kseaCutoffStatistic | |
527 ksea_cutoff_threshold <- params$kseaCutoffThreshold | |
528 ksea_min_substrate_count <- params$kseaMinSubstrateCount | |
529 | |
530 ## parameters for global variables consumed by functions | |
531 | |
532 # intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1 | |
533 # TODO Validate within list | |
534 g_intensity_hm_criteria <- params$intensityHeatmapCriteria | |
535 if (is_string_null_or_empty(g_intensity_hm_criteria)) { | |
536 cat("invalid intensityHeatmapCriteria parameter (must be string)") | |
537 knitr::knit_exit() | |
538 } | |
539 switch( | |
540 g_intensity_hm_criteria, | |
541 "quality" = NULL, | |
542 "na_count" = NULL, | |
543 "p_value" = NULL, | |
544 { | |
545 with( | |
546 params, | |
547 cat( | |
548 sprintf( | |
549 "invalid %s (must be %s)", | |
550 see_variable(intensityHeatmapCriteria), | |
551 "one of quality or na_count or p_value" | |
552 ) | |
553 ) | |
554 ) | |
555 knitr::knit_exit() | |
556 } | |
557 ) | |
558 | |
559 # intensityHeatmapRows: 50 | |
560 # TODO Validate >> 0 < 75 | |
561 g_intensity_hm_rows <- params$intensityHeatmapRows | |
562 if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) { | |
563 cat("invalid intensityHeatmapRows (must be integer > 0)") | |
564 knitr::knit_exit() | |
565 } | |
566 | |
567 g_intensity_min_per_class <- params$intensityMinValuesPerGroup | |
568 if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) { | |
569 cat("invalid intensityMinValuesPerGroup (must be integer > -1") | |
570 knitr::knit_exit() | |
571 } | |
572 | |
573 if (is.na(as.logical(g_correlate_substrates <- params$correlateSubstrates))) { | |
574 cat("invalid correlateSubstrates (must be TRUE or FALSE)") | |
575 knitr::knit_exit() | |
576 } | |
577 | |
578 if (is.na(as.logical(g_filter_cov_var_gt_1 <- params$filterCovVarGT1))) { | |
579 cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)") | |
580 knitr::knit_exit() | |
581 } | |
582 | |
583 # TODO Validate >> 0 < 30 | |
584 g_ppep_trunc_n <- params$ppepTruncN | |
585 | |
586 # TODO Validate >> 0 < 30 | |
587 g_subgene_trunc_n <- params$subgeneTruncN | |
588 | |
589 # TODO Validate >> 0 < 30 | |
590 g_sbstr_trunc_n <- params$substTruncN | |
591 | |
592 | |
593 ### OPERATORS | |
594 | |
595 # Test for exclusion | |
596 # ref: https://www.reneshbedre.com/blog/in-operator-r.html | |
597 `%notin%` <- Negate(`%in%`) | |
598 | |
599 # Augmented assignment | |
600 # ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign | |
601 `%||:=%` <- function(lvalue, ...) { | |
602 pf <- parent.frame() | |
603 rvalue <- Reduce(paste0, x = ..., init = lvalue) | |
604 assign( | |
605 x = as.character(substitute(lvalue)), | |
606 value = rvalue, | |
607 pos = pf | |
608 ) | |
609 invisible(rvalue) | |
610 } | |
611 | |
612 ### FUNCTIONS | |
613 | |
614 no_op <- | |
615 function() { | |
616 } | |
617 # this function is not used in this file and should be removed while | |
618 # factoring out reusable code | |
619 all_apply <- function(f, v, na_rm = TRUE, ...) { | |
620 Reduce( | |
621 f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r, | |
622 x = sapply(X = v, FUN = f, ...), | |
623 init = TRUE | |
624 ) | |
625 } | |
626 | |
627 write_debug_file <- function(data_frame) { | |
628 if (!is.null(debug_file_base_path)) { | |
629 s_path <- | |
630 sprintf( | |
631 "%s/%s.txt", | |
632 debug_file_base_path, | |
633 deparse(substitute(data_frame)) | |
634 ) | |
155 write.table( | 635 write.table( |
156 s, | 636 data_frame, |
157 file = s_path, | 637 file = s_path, |
158 sep = "\t", | 638 sep = "\t", |
159 col.names = TRUE, | 639 col.names = TRUE, |
160 row.names = TRUE, | 640 row.names = TRUE, |
161 quote = FALSE | 641 quote = FALSE |
172 # Hence, `x <- 1; get("x", new_env())` fails by design. | 652 # Hence, `x <- 1; get("x", new_env())` fails by design. |
173 new_env <- function() { | 653 new_env <- function() { |
174 new.env(parent = emptyenv()) | 654 new.env(parent = emptyenv()) |
175 } | 655 } |
176 | 656 |
657 # make apply readable for rows | |
658 row_apply <- function(x, fun, ..., simplify = TRUE) { | |
659 apply(x, MARGIN = 1, fun, ..., simplify = TRUE) | |
660 } | |
661 | |
662 # make apply readable for columns | |
663 column_apply <- function(x, fun, ..., simplify = TRUE) { | |
664 apply(x, MARGIN = 2, fun, ..., simplify = TRUE) | |
665 } | |
666 | |
667 ##' Produce a vector of boolean values whose i-th value is TRUE when any | |
668 ##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s)) | |
669 ##' | |
670 ##' @title Search multiple strings for matches of multiple substrings | |
671 ##' @param v a vector of substrings to match | |
672 ##' @param s a vector of strings to search for matches | |
673 ##' @param ... additional arguments to grepl | |
674 ##' @return a list with keys in s and valuse that are vectors of elements of v | |
675 ##' @author Art Eschenlauer | |
676 ##' Copyright (C) 2022 Art Eschenlauer; | |
677 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms | |
678 mgrepl <- function(v, s, ...) { | |
679 grpl_rslt <- rep_len(0, length(s)) | |
680 for (vi in v) { | |
681 grpl_rslt_v <- sapply( | |
682 X = s, | |
683 FUN = function(t) { | |
684 Reduce( | |
685 f = function(l, r) if (is.null(l)) r else c(l, r), | |
686 x = sapply( | |
687 X = vi, | |
688 FUN = function(f) grepl(f, t, ...) | |
689 ), | |
690 init = c() | |
691 ) | |
692 }, | |
693 simplify = "array" | |
694 ) | |
695 grpl_rslt <- grpl_rslt + grpl_rslt_v | |
696 } | |
697 rslt <- unname(grpl_rslt > 0) | |
698 } | |
699 | |
700 ##' Produce positions in a vector where succeeding value != current valus | |
701 ##' | |
702 ##' @title Search vector for neighboring positions having different values | |
703 ##' @param v a vector of comparable numeric values (e.g. integers) | |
704 ##' @return a vector of positions i where v[i] != v[i + 1] | |
705 ##' @author Art Eschenlauer | |
706 ##' Copyright (C) 2022 Art Eschenlauer; | |
707 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms | |
708 transition_positions <- function(v) { | |
709 Reduce( | |
710 f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l, | |
711 x = seq_along(v)[-1:0], | |
712 init = c() | |
713 ) | |
714 } | |
715 | |
716 ### figure debug functions | |
717 | |
718 cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) { | |
719 cat( | |
720 sprintf( | |
721 "%spar(%s) = c(%s)%s", | |
722 lbl, | |
723 par_name, | |
724 paste(par(par_name), collapse = ", "), | |
725 if (newlines) "\n\n" else "" | |
726 ) | |
727 ) | |
728 } | |
729 | |
730 cat_margins <- function(lbl = NULL) { | |
731 for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma")) | |
732 cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL) | |
733 } | |
734 | |
735 cat_variable <- | |
736 function(x, suffix = "", digits = 3, force_str = FALSE) { | |
737 xprssn <- deparse1(substitute(x)) | |
738 if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) { | |
739 cat( | |
740 paste0( | |
741 "\n\\texttt{\\textbf{", | |
742 whack_underscores(xprssn), | |
743 "}} [", | |
744 typeof(x), | |
745 ",", | |
746 mode(x), | |
747 "] =\n" | |
748 ) | |
749 ) | |
750 cat("\n\\begin{verbatim}\n") | |
751 str(x) | |
752 cat("\n\\end{verbatim}\n") | |
753 } else { | |
754 cat("\n", see_variable(x, suffix, digits, xprssn)) | |
755 } | |
756 } | |
757 | |
758 ### structure helper functions | |
759 | |
760 # ref: staque.R - Icon-oriented stack and queue operations | |
761 # - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8 | |
762 # | |
763 # sq_get(v):x produces the leftmost element of v and removes it from v, | |
764 # but produces NA if v is empty | |
765 sq_get <- function(v) { | |
766 if (length(v) == 0) return(NA) | |
767 assign(as.character(substitute(v)), v[-1], parent.frame()) | |
768 return(v[1]) | |
769 } | |
770 # | |
771 # sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v, | |
772 # producing v. | |
773 # Values are pushed in order from left to right, | |
774 # so xn becomes the last (rightmost) value on v. | |
775 # sq_put(v) with no second argument does nothing. | |
776 sq_put <- function(v, x = NA, ...) { | |
777 pf <- parent.frame() | |
778 if (is.null(x)) return(pf$v) | |
779 if ( | |
780 !(length(x) > 1) && | |
781 !rlang::is_closure(x) && | |
782 is.na(x) | |
783 ) return(pf$v) | |
784 assign(as.character(substitute(v)), c(v, x, ...), pf) | |
785 pf[[as.character(substitute(v))]] | |
786 } | |
787 | |
177 ### numerical/statistical helper functions | 788 ### numerical/statistical helper functions |
178 | 789 |
179 any_nan <- function(x) { | 790 any_nan <- function(x) { |
180 !any(x == "NaN") | 791 !any(x == "NaN") |
181 } | 792 } |
184 sd_finite <- function(x) { | 795 sd_finite <- function(x) { |
185 ok <- is.finite(x) | 796 ok <- is.finite(x) |
186 sd(x[ok]) | 797 sd(x[ok]) |
187 } | 798 } |
188 | 799 |
800 # compute anova raw p-value | |
189 anova_func <- function(x, grouping_factor, one_way_f) { | 801 anova_func <- function(x, grouping_factor, one_way_f) { |
190 subject <- data.frame( | 802 subject <- data.frame( |
191 intensity = x | 803 intensity = x |
192 ) | 804 ) |
193 x_aov <- | 805 x_aov <- |
201 else | 813 else |
202 pvalue <- x_aov$p.value | 814 pvalue <- x_aov$p.value |
203 pvalue | 815 pvalue |
204 } | 816 } |
205 | 817 |
818 # This code adapted from matrixcalc::is.positive.definite | |
819 # Notably, this simply tests without calling stop() | |
820 is_positive_definite <- function(x, tol = 1e-08) { | |
821 if (!is.matrix(x)) | |
822 return(FALSE) | |
823 if (!is.numeric(x)) | |
824 return(FALSE) | |
825 if (nrow(x) < 1) | |
826 return(FALSE) | |
827 if (ncol(x) < 1) | |
828 return(FALSE) | |
829 if (nrow(x) != ncol(x)) | |
830 return(FALSE) | |
831 sum_symm <- sum(x == t(x), na.rm = TRUE) | |
832 value_count <- Reduce("*", dim(x)) | |
833 if (sum_symm != value_count) | |
834 return(FALSE) | |
835 eigenvalues <- eigen(x, only.values = TRUE)$values | |
836 n <- nrow(x) | |
837 for (i in 1:n) { | |
838 if (abs(eigenvalues[i]) < tol) { | |
839 eigenvalues[i] <- 0 | |
840 } | |
841 } | |
842 if (any(eigenvalues <= 0)) { | |
843 return(FALSE) | |
844 } | |
845 return(TRUE) | |
846 } | |
206 | 847 |
207 ### LaTeX functions | 848 ### LaTeX functions |
208 | 849 |
209 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { | |
210 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v | |
211 cat( | |
212 paste0( | |
213 v_sub, | |
214 collapse = collapse_string | |
215 ) | |
216 ) | |
217 } | |
218 | |
219 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
220 cat("\\begin{itemize}\n\\item ") | |
221 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
222 cat("\n\\end{itemize}\n") | |
223 } | |
224 | |
225 latex_itemized_list <- function(v, underscore_whack = TRUE) { | |
226 latex_itemized_collapsed("\n\\item ", v, underscore_whack) | |
227 } | |
228 | |
229 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
230 cat("\\begin{enumerate}\n\\item ") | |
231 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
232 cat("\n\\end{enumerate}\n") | |
233 } | |
234 | |
235 latex_enumerated_list <- function(v) { | |
236 latex_enumerated_collapsed("\n\\item ", v) | |
237 } | |
238 | |
239 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { | |
240 latex_collapsed_vector(" & ", v, underscore_whack) | |
241 cat(extra) | |
242 cat(" \\\\\n") | |
243 } | |
244 | |
245 # Use this like print.data.frame, from which it is adapted: | 850 # Use this like print.data.frame, from which it is adapted: |
246 data_frame_latex <- | 851 data_frame_table_latex <- |
247 function( | 852 function( |
248 x, | 853 x, |
249 ..., | |
250 # digits to pass to format.data.frame | 854 # digits to pass to format.data.frame |
251 digits = NULL, | 855 digits = NULL, |
252 # TRUE -> right-justify columns; FALSE -> left-justify | 856 # TRUE -> right-justify columns; FALSE -> left-justify |
253 right = TRUE, | 857 right = TRUE, |
254 # maximumn number of rows to print | 858 # maximumn number of rows to print |
260 # optional caption | 864 # optional caption |
261 caption = NULL, | 865 caption = NULL, |
262 # h(inline); b(bottom); t (top) or p (separate page) | 866 # h(inline); b(bottom); t (top) or p (separate page) |
263 anchor = "h", | 867 anchor = "h", |
264 # set underscore_whack to TRUE to escape underscores | 868 # set underscore_whack to TRUE to escape underscores |
265 underscore_whack = TRUE | 869 underscore_whack = TRUE, |
870 # how to emit results | |
871 emit = cat | |
266 ) { | 872 ) { |
267 if (is.null(justification)) | 873 if (is.null(justification)) |
268 justification <- | 874 justification <- |
269 Reduce( | 875 Reduce( |
270 f = paste, | 876 f = paste, |
271 x = rep_len(if (right) "r" else "l", length(colnames(x))) | 877 x = rep_len(if (right) "r" else "l", length(colnames(x))) |
272 ) | 878 ) |
879 n <- length(rownames(x)) | |
880 if (length(x) == 0L) { | |
881 emit( | |
882 sprintf( | |
883 # if n is one, use singular 'row', else use plural 'rows' | |
884 ngettext( | |
885 n, | |
886 "data frame with 0 columns and %d row", | |
887 "data frame with 0 columns and %d rows" | |
888 ), | |
889 n | |
890 ), | |
891 "\n", | |
892 sep = "" | |
893 ) | |
894 } else if (n == 0L) { | |
895 emit("0 rows for:\n") | |
896 latex_itemized_list( | |
897 v = names(x), | |
898 underscore_whack = underscore_whack | |
899 ) | |
900 } else { | |
901 if (is.null(max)) | |
902 max <- getOption("max.print", 99999L) | |
903 if (!is.finite(max)) { | |
904 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) | |
905 knitr::knit_exit() | |
906 } | |
907 omit <- (n0 <- max %/% length(x)) < n | |
908 m <- as.matrix( | |
909 format.data.frame( | |
910 if (omit) x[seq_len(n0), , drop = FALSE] else x, | |
911 digits = digits, | |
912 na.encode = FALSE | |
913 ) | |
914 ) | |
915 emit( | |
916 # h(inline); b(bottom); t (top) or p (separate page) | |
917 paste0("\\begin{table}[", anchor, "]"), | |
918 "\\leavevmode", | |
919 sep = "\n" | |
920 ) | |
921 if (!is.null(caption)) | |
922 emit(paste0(" \\caption{", caption, "}")) | |
923 if (centered) emit("\\centering\n") | |
924 emit( | |
925 paste( | |
926 " \\begin{tabular}{", | |
927 justification, | |
928 "}\n", | |
929 sep = "" | |
930 ) | |
931 ) | |
932 | |
933 # ref for top and bottom struts (\T and \B): | |
934 # https://tex.stackexchange.com/a/50355 | |
935 if (!is.null(caption)) | |
936 emit("\\B \\\\\n") | |
937 latex_table_row( | |
938 v = colnames(m), | |
939 extra = " \\T \\B", | |
940 underscore_whack = underscore_whack | |
941 ) | |
942 emit("\\hline \\\\\n") | |
943 for (i in seq_len(length(m[, 1]))) { | |
944 latex_table_row( | |
945 v = m[i, ], | |
946 underscore_whack = underscore_whack | |
947 ) | |
948 } | |
949 emit( | |
950 paste( | |
951 " \\end{tabular}", | |
952 "\\end{table}", | |
953 sep = "\n" | |
954 ) | |
955 ) | |
956 if (omit) | |
957 emit(" [ reached 'max' / getOption(\"max.print\") -- omitted", | |
958 n - n0, "rows ]\n") | |
959 } | |
960 invisible(x) | |
961 } | |
962 | |
963 # Use this like print.data.frame, from which it is adapted: | |
964 data_frame_tabbing_latex <- | |
965 function( | |
966 x, | |
967 # vector of tab stops, in inches | |
968 tabstops, | |
969 # vector of headings, registered with tab-stops | |
970 headings = colnames(x), | |
971 # digits to pass to format.data.frame | |
972 digits = NULL, | |
973 # maximumn number of rows to print | |
974 max = NULL, | |
975 # optional caption | |
976 caption = NULL, | |
977 # set underscore_whack to TRUE to escape underscores | |
978 underscore_whack = TRUE, | |
979 # flag for landscape mode | |
980 landscape = FALSE, | |
981 # flag indicating that subsubsection should be used for caption | |
982 # rather than subsection | |
983 use_subsubsection_header = TRUE, | |
984 # character-size indicator; for possible values, see: | |
985 # https://tug.org/texinfohtml/latex2e.html#Font-sizes | |
986 charactersize = "small", | |
987 # set verbatim to TRUE to debug output | |
988 verbatim = FALSE | |
989 ) { | |
990 | |
991 hlinport <- if (landscape) { | |
992 function() cat("\\hlinlscp \\\\\n") | |
993 } else { | |
994 function() cat("\\hlinport \\\\\n") | |
995 } | |
996 | |
997 tabstops_tex <- | |
998 Reduce( | |
999 f = function(l, r) paste0(l, r), | |
1000 x = sprintf("\\hspace{%0.2fin}\\=", tabstops), | |
1001 init = "" | |
1002 ) | |
1003 | |
273 n <- length(rownames(x)) | 1004 n <- length(rownames(x)) |
274 if (length(x) == 0L) { | 1005 if (length(x) == 0L) { |
275 cat( | 1006 cat( |
276 sprintf( | 1007 sprintf( |
277 # if n is one, use singular 'row', else use plural 'rows' | 1008 # if n is one, use singular 'row', else use plural 'rows' |
292 underscore_whack = underscore_whack | 1023 underscore_whack = underscore_whack |
293 ) | 1024 ) |
294 } else { | 1025 } else { |
295 if (is.null(max)) | 1026 if (is.null(max)) |
296 max <- getOption("max.print", 99999L) | 1027 max <- getOption("max.print", 99999L) |
297 if (!is.finite(max)) | 1028 if (!is.finite(max)) { |
298 stop("invalid 'max' / getOption(\"max.print\"): ", | 1029 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) |
299 max) | 1030 knitr::knit_exit() |
1031 } | |
300 omit <- (n0 <- max %/% length(x)) < n | 1032 omit <- (n0 <- max %/% length(x)) < n |
301 m <- as.matrix( | 1033 m <- as.matrix( |
302 format.data.frame( | 1034 format.data.frame( |
303 if (omit) x[seq_len(n0), , drop = FALSE] else x, | 1035 if (omit) x[seq_len(n0), , drop = FALSE] else x, |
304 digits = digits, | 1036 digits = digits, |
305 na.encode = FALSE | 1037 na.encode = FALSE |
306 ) | 1038 ) |
307 ) | 1039 ) |
308 cat( | 1040 if (landscape) |
309 # h(inline); b(bottom); t (top) or p (separate page) | 1041 cat("\n\\begin{landscape}") |
310 paste0("\\begin{table}[", anchor, "]\n") | 1042 tex_caption <- |
311 ) | 1043 if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption) |
312 if (!is.null(caption)) | 1044 else "\n" |
313 cat(paste0(" \\caption{", caption, "}")) | 1045 # build the column names, which have multiple lines when |
314 if (centered) cat("\\centering\n") | 1046 # length(headings) is a multiple of the number of columns |
315 cat( | 1047 column_names <- "" |
316 paste( | 1048 while (length(headings) > 0) { |
317 " \\begin{tabular}{", | 1049 my_row <- c() |
318 justification, | 1050 for (i in 1:(1 + length(tabstops))) { |
319 "}\n", | 1051 my_field <- sq_get(headings) |
320 sep = "" | 1052 sq_put(my_row, if (is.na(my_field)) "" else my_field) |
321 ) | 1053 } |
322 ) | 1054 column_names %||:=% latex_tabbing_row( |
323 # ref: https://tex.stackexchange.com/a/50353 | 1055 v = my_row, |
324 # Describes use of \rule{0pt}{3ex} | 1056 underscore_whack = underscore_whack, |
325 if (!is.null(caption)) | 1057 action = paste0 |
326 cat("\\B \\\\ \\hline\\hline\n") | |
327 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | |
328 latex_table_row( | |
329 v = colnames(m), | |
330 extra = "\\T\\B", | |
331 underscore_whack = underscore_whack | |
332 ) | |
333 cat("\\hline\n") | |
334 for (i in seq_len(length(m[, 1]))) { | |
335 latex_table_row( | |
336 v = m[i, ], | |
337 underscore_whack = underscore_whack | |
338 ) | 1058 ) |
339 } | 1059 } |
1060 | |
1061 # Begin tabbing environment after beginning charactersize environment | |
1062 if (verbatim) cat("\n\\begin{verbatim}") | |
340 cat( | 1063 cat( |
341 paste( | 1064 paste0( |
342 " \\end{tabular}", | 1065 "\n\\begin{", charactersize, "}", tex_caption, |
343 "\\end{table}", | 1066 "\\begin{tabwrap}{", tabstops_tex, "}\n" |
344 sep = "\n" | |
345 ) | |
346 ) | 1067 ) |
1068 ) | |
1069 # emit column names | |
1070 cat(column_names) | |
1071 # emit hline | |
1072 hlinport() | |
1073 for (i in seq_len(length(m[, 1]))) { | |
1074 my_row <- latex_tabbing_row( | |
1075 v = m[i, ], | |
1076 underscore_whack = underscore_whack, | |
1077 action = paste0 | |
1078 ) | |
1079 if (FALSE) | |
1080 cat(my_row) | |
1081 else | |
1082 cat(my_row) | |
1083 } | |
1084 hlinport() | |
347 if (omit) | 1085 if (omit) |
348 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", | 1086 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", |
349 n - n0, "rows ]\n") | 1087 n - n0, "rows ]\n") |
1088 # End charactersize environment after ending tabbing environment | |
1089 cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n")) | |
1090 if (verbatim) cat("\\end{verbatim}\n") | |
1091 if (landscape) | |
1092 cat("\\end{landscape}\n") | |
350 } | 1093 } |
351 invisible(x) | 1094 invisible(x) |
1095 } | |
1096 | |
1097 param_df_noexit <- | |
1098 function(e = NULL) { | |
1099 data_frame_tabbing_latex( | |
1100 x = param_df, | |
1101 tabstops = c(1.75), | |
1102 underscore_whack = TRUE, | |
1103 caption = "Input parameters", | |
1104 verbatim = FALSE | |
1105 ) | |
1106 if (!is.null(e)) { | |
1107 sink(stderr()) | |
1108 cat("Caught fatal error:\n\n") | |
1109 str(e) | |
1110 sink() | |
1111 } | |
1112 } | |
1113 | |
1114 param_df_exit <- | |
1115 function(e = NULL) { | |
1116 param_df_noexit(e) | |
1117 knitr::knit_exit() | |
1118 exit(-1) | |
1119 } | |
1120 | |
1121 # exit with exit code (default 0) and optional msg | |
1122 exit <- | |
1123 function(code = 0, msg = NULL, use_stderr = FALSE) { | |
1124 if (!is.null(msg)) { | |
1125 if (use_stderr) sink(stderr()) | |
1126 cat("\n\n", msg, "\n\n") | |
1127 if (use_stderr) sink() | |
1128 } | |
1129 q(save = "no", status = code) | |
1130 } | |
1131 | |
1132 # make control sequences into printable latex sequences | |
1133 latex_printable_control_seqs <- | |
1134 function(s) { | |
1135 s <- gsub("[\\]", "xyzzy_plugh", s) | |
1136 s <- gsub("[$]", "\\\\$", s) | |
1137 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | |
1138 return(s) | |
1139 } | |
1140 nolatex_verbatim <- | |
1141 function(expr) eval(expr) | |
1142 | |
1143 latex_verbatim <- | |
1144 function(expr) { | |
1145 arg_string <- deparse1(substitute(expr)) | |
1146 cat("\n\\begin{verbatim}\n___\n") | |
1147 tryCatch( | |
1148 expr = expr, | |
1149 error = param_df_exit, | |
1150 #ACE error = | |
1151 #ACE function(e) { | |
1152 #ACE cat("Caught error:\n\n") | |
1153 #ACE str(e) | |
1154 #ACE knitr::knit_exit() | |
1155 #ACE stop(e) | |
1156 #ACE }, | |
1157 finally = cat("...\n\\end{verbatim}\n") | |
1158 ) | |
1159 } | |
1160 | |
1161 latex_samepage <- | |
1162 function(expr) { | |
1163 arg_string <- deparse1(substitute(expr)) | |
1164 cat("\n\\begin{samepage}\n") | |
1165 tryCatch( | |
1166 expr = expr, | |
1167 error = param_df_exit, | |
1168 #ACE error = | |
1169 #ACE function(e) { | |
1170 #ACE cat("Caught error:\n\n") | |
1171 #ACE str(e) | |
1172 #ACE knitr::knit_exit() | |
1173 #ACE stop(e) | |
1174 #ACE }, | |
1175 finally = cat("\n\\end{samepage}\n") | |
1176 ) | |
1177 } | |
1178 | |
1179 # return the result of invocation after showing parameters | |
1180 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
1181 latex_show_invocation <- | |
1182 function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) { | |
1183 function(...) { | |
1184 my_env <- (as.list(environment())) | |
1185 va <- list(...) | |
1186 my_rslt <- new_env() | |
1187 my_rslt$rslt <- NULL | |
1188 latex_verbatim( | |
1189 expr = { | |
1190 cat(sprintf("\n .. Local variables for '%s':\n\n", f_name)) | |
1191 str(va) | |
1192 if (!head_patch) { | |
1193 # return this result | |
1194 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
1195 cat(sprintf("\n .. Invoking '%s'\n", f_name)) | |
1196 tryCatch( | |
1197 { | |
1198 cat("\n\\end{verbatim}\n") | |
1199 rslt <- do.call(f, va) | |
1200 }, | |
1201 error = param_df_exit, | |
1202 #ACE error = function(e) { | |
1203 #ACE cat("\n\\begin{verbatim}\n") | |
1204 #ACE str(e) | |
1205 #ACE cat("\n\\end{verbatim}\n") | |
1206 #ACE knitr::knit_exit() | |
1207 #ACE stop(e) | |
1208 #ACE }, | |
1209 finally = cat("\n\\begin{verbatim}\n") | |
1210 ) | |
1211 cat(sprintf("\n .. '%s' returned:\n", f_name)) | |
1212 str(rslt) | |
1213 my_rslt$rslt <- rslt | |
1214 } | |
1215 } | |
1216 ) | |
1217 # return the result of invocation with the shown parameters | |
1218 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
1219 if (head_patch) my_rslt$rslt <- do.call(f, va) | |
1220 (my_rslt$rslt) | |
1221 } | |
1222 } | |
1223 | |
1224 latex_collapsed_vector <- function( | |
1225 collapse_string, | |
1226 v, | |
1227 underscore_whack = TRUE, | |
1228 action = cat0 | |
1229 ) { | |
1230 v_sub <- | |
1231 if (underscore_whack) whack_underscores(v) else v | |
1232 action( | |
1233 paste0( | |
1234 v_sub, | |
1235 collapse = collapse_string | |
1236 ) | |
1237 ) | |
1238 } | |
1239 | |
1240 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
1241 cat("\\begin{itemize}\n\\item ") | |
1242 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
1243 cat("\n\\end{itemize}\n") | |
1244 } | |
1245 | |
1246 latex_itemized_list <- function(v, underscore_whack = TRUE) { | |
1247 latex_itemized_collapsed("\n\\item ", v, underscore_whack) | |
1248 } | |
1249 | |
1250 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
1251 cat("\\begin{enumerate}\n\\item ") | |
1252 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
1253 cat("\n\\end{enumerate}\n") | |
1254 } | |
1255 | |
1256 latex_enumerated_list <- function(v) { | |
1257 latex_enumerated_collapsed("\n\\item ", v) | |
1258 } | |
1259 | |
1260 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { | |
1261 latex_collapsed_vector(" & ", v, underscore_whack) | |
1262 cat(extra) | |
1263 cat(" \\\\\n") | |
1264 } | |
1265 | |
1266 latex_tabbing_row <- function( | |
1267 v, | |
1268 extra = "", | |
1269 underscore_whack = TRUE, | |
1270 action = cat0 | |
1271 ) { | |
1272 # latex_collapsed_vector applies action to result of paste0; | |
1273 # by default, action = cat; | |
1274 # hence, a scalar string is assigned to v_collapsed | |
1275 v_collapsed <- | |
1276 latex_collapsed_vector( | |
1277 "} \\> \\tabfill{", | |
1278 v, | |
1279 underscore_whack, | |
1280 action = paste0 | |
1281 ) | |
1282 action( | |
1283 "\\tabfill{", | |
1284 v_collapsed, | |
1285 "}", | |
1286 extra, | |
1287 " \\\\\n" | |
1288 ) | |
1289 } | |
1290 | |
1291 # N.B. use con = "" to emulate regular cat | |
1292 fcat0 <- | |
1293 function(..., sprtr = " ", cnnctn = file()) { | |
1294 cat0(..., sep = sprtr, file = cnnctn) | |
1295 invisible(cnnctn) | |
352 } | 1296 } |
353 | 1297 |
354 hypersub <- | 1298 hypersub <- |
355 function(s) { | 1299 function(s) { |
356 hyper <- tolower(s) | 1300 hyper <- tolower(s) |
357 hyper <- gsub("[^a-z0-9]+", "-", hyper) | 1301 hyper <- gsub("[^a-z0-9]+", "-", hyper) |
358 hyper <- gsub("[-]+", "-", hyper) | 1302 hyper <- gsub("[-]+", "-", hyper) |
1303 hyper <- gsub("[_]+", "-", hyper) | |
359 hyper <- sub("^[-]", "", hyper) | 1304 hyper <- sub("^[-]", "", hyper) |
360 hyper <- sub("[-]$", "", hyper) | 1305 hyper <- sub("[-]$", "", hyper) |
361 return(hyper) | 1306 return(hyper) |
362 } | 1307 } |
363 | 1308 |
364 subsection_header <- | 1309 table_href <- function(s = "offset", caption = "") { |
365 function(s) { | 1310 paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}") |
1311 } | |
1312 | |
1313 table_offset <- function(i = 0, s = "offset", new = FALSE) { | |
1314 paste0( | |
1315 if (new) paste0("\\newcounter{", s, "}\n") else "", | |
1316 "\\setcounter{", s, "}{\\value{table}}\n", | |
1317 paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n") | |
1318 ) | |
1319 } | |
1320 | |
1321 a_section_header <- | |
1322 function(s, prefix = "") { | |
366 hyper <- hypersub(s) | 1323 hyper <- hypersub(s) |
367 cat( | 1324 my_subsection_header <- sprintf( |
368 sprintf( | 1325 "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n", |
369 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", | 1326 hyper, |
370 hyper, s, hyper | 1327 prefix, |
371 ) | 1328 gsub("_", "\\_", s, fixed = TRUE), |
372 ) | 1329 hyper |
373 } | 1330 ) |
374 | 1331 my_subsection_header |
375 subsubsection_header <- | 1332 } |
376 function(s) { | 1333 section_header <- function(s) a_section_header(s, "") |
377 hyper <- hypersub(s) | 1334 subsection_header <- function(s) a_section_header(s, "sub") |
378 cat( | 1335 subsubsection_header <- function(s) a_section_header(s, "subsub") |
379 sprintf( | |
380 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", | |
381 hyper, s, hyper | |
382 ) | |
383 ) | |
384 } | |
385 | 1336 |
386 ### SQLite functions | 1337 ### SQLite functions |
387 | 1338 |
388 ddl_exec <- function(db, sql) { | 1339 ddl_exec <- function(db, sql) { |
389 discard <- DBI::dbExecute(conn = db, statement = sql) | 1340 discard <- DBI::dbExecute(conn = db, statement = sql) |
417 } | 1368 } |
418 } | 1369 } |
419 | 1370 |
420 ### KSEA functions and helpers | 1371 ### KSEA functions and helpers |
421 | 1372 |
422 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: | 1373 #' The KSEA App Analysis (KSEA Kinase Scores Only) |
423 # - maximum log2(FC) | 1374 #' |
1375 #' Compute KSEA kinase scores and statistics from phoshoproteomics data input | |
1376 #' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC) | |
1377 #' | |
1378 #' Result is an R data.frame with column names | |
1379 #' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | |
1380 #' "Please refer to the original Casado et al. publication for detailed | |
1381 #' description of these columns and what they represent: | |
1382 #' | |
1383 #' - Kinase.Gene indicates the gene name for each kinase. | |
1384 #' - mS represents the mean log2(fold change) of all the | |
1385 #' kinase's substrates. | |
1386 #' - Enrichment is the background-adjusted value of the kinase's mS. | |
1387 #' - m is the total number of detected substrates | |
1388 #' from the experimental dataset for each kinase. | |
1389 #' - z.score is the normalized score for each kinase, weighted by | |
1390 #' the number of identified substrates. | |
1391 #' - p.value represents the statistical assessment for the z.score. | |
1392 #' - FDR is the p-value adjusted for multiple hypothesis testing | |
1393 #' using the Benjamini & Hochberg method." | |
1394 #' | |
1395 #' @param ksdata the Kinase-Substrate dataset uploaded from the file | |
1396 #' prefaced with "PSP&NetworKIN_" | |
1397 #' available from github.com/casecpb/KSEA/ | |
1398 #' @param px the experimental data file formatted as described in | |
1399 #' the KSEA.Complete() documentation | |
1400 #' @param networkin a binary input of TRUE or FALSE, indicating whether | |
1401 #' or not to include NetworKIN predictions, where | |
1402 #' \code{NetworKIN = TRUE} | |
1403 #' means include NetworKIN predictions | |
1404 #' @param networkin_cutoff a numeric value between 1 and infinity setting | |
1405 #' the minimum NetworKIN score | |
1406 #' (this can be omitted if NetworKIN = FALSE) | |
1407 #' | |
1408 #' @return creates a new R data.frame with all the KSEA kinase | |
1409 #' scores, along with each one's statistical | |
1410 #' assessment, as described herein. | |
1411 #' | |
1412 #' @references | |
1413 #' | |
1414 #' Casado et al. (2013) Sci Signal. 6(268):rs6 | |
1415 #' | |
1416 #' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20 | |
1417 #' | |
1418 #' Horn et al. (2014) Nature Methods 11(6):603-4 | |
1419 #' | |
424 ksea_scores <- function( | 1420 ksea_scores <- function( |
425 | |
426 # For human data, typically, ksdata = KSEAapp::ksdata | 1421 # For human data, typically, ksdata = KSEAapp::ksdata |
427 ksdata, | 1422 ksdata, |
428 | 1423 |
429 # Input data file having columns: | 1424 # Input data file having columns: |
430 # - Protein : abbreviated protein name | 1425 # - Protein : abbreviated protein name |
442 # NetworKIN predictions | 1437 # NetworKIN predictions |
443 networkin, | 1438 networkin, |
444 | 1439 |
445 # A numeric value between 1 and infinity setting the minimum NetworKIN | 1440 # A numeric value between 1 and infinity setting the minimum NetworKIN |
446 # score (can be left out if networkin = FALSE) | 1441 # score (can be left out if networkin = FALSE) |
447 networkin_cutoff | 1442 networkin_cutoff, |
1443 | |
1444 # Minimum substrate count, necessary to adjust the p-value appropriately. | |
1445 minimum_substrate_count | |
448 | 1446 |
449 ) { | 1447 ) { |
1448 # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution. | |
450 if (length(grep(";", px$Residue.Both)) == 0) { | 1449 if (length(grep(";", px$Residue.Both)) == 0) { |
451 # There are no Residue.Both entries having semicolons, so new is | 1450 # There are no Residue.Both entries having semicolons, so new is |
452 # simply px except two columns are renamed and a column is added | 1451 # simply px except two columns are renamed and a column is added |
453 # for log2(abs(fold-change)) | 1452 # for log2(abs(fold-change)) |
454 new <- px | 1453 new <- px |
490 # Convert any illegal values from NaN to NA | 1489 # Convert any illegal values from NaN to NA |
491 new[is.nan(new$log2_fc), "log2_fc"] <- NA | 1490 new[is.nan(new$log2_fc), "log2_fc"] <- NA |
492 # Eliminate rows having missing values (e.g., non-imputed data) | 1491 # Eliminate rows having missing values (e.g., non-imputed data) |
493 new <- new[complete.cases(new$log2_fc), ] | 1492 new <- new[complete.cases(new$log2_fc), ] |
494 } | 1493 } |
495 if (networkin == TRUE) { | 1494 # At this point, new$log2_fc is signed according to which contrast has |
496 # When NetworKIN is true, filter on NetworKIN.cutoff which includes | 1495 # the greater intensity |
497 # PhosphoSitePlus data *because its networkin_score is set to Inf* | 1496 # To take the magnitude into account without taking the direction into |
498 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] | 1497 # account, set params$kseaUseAbsoluteLog2FC to TRUE |
499 ksdata_filtered <- ksdata_filtered[ | 1498 # |
500 (ksdata_filtered$networkin_score >= networkin_cutoff), ] | 1499 # Should KSEA be performed aggregating signed log2FC or absolute? |
501 } else { | 1500 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores |
502 # Otherwise, simply use PhosphSitePlus rows | 1501 if (params$kseaUseAbsoluteLog2FC) { |
503 ksdata_filtered <- ksdata[ | 1502 # TRUE use abs(log2FC) for KSEA as Justin requested; this is a |
504 grep("PhosphoSitePlus", ksdata$Source), ] | 1503 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. |
505 } | 1504 new$log2_fc <- abs(new$log2_fc) |
506 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | 1505 } |
1506 | |
1507 monitor_filtration_on_stderr <- TRUE | |
1508 if (monitor_filtration_on_stderr) { | |
1509 # set to TRUE to monitor filtration on stderr | |
1510 sink(stderr()) | |
1511 cat(see_variable(networkin, "\n")) | |
1512 } | |
1513 ksdata_filtered <- | |
1514 sqldf( | |
1515 sprintf("%s %s", | |
1516 "select * from ksdata where not Source = 'NetworKIN'", | |
1517 if (networkin) | |
1518 sprintf("or networkin_score >= %d", networkin_cutoff) | |
1519 else | |
1520 "" | |
1521 ) | |
1522 ) | |
1523 if (monitor_filtration_on_stderr) { | |
1524 cat(see_variable(sqldf( | |
1525 "select count(*), Source from ksdata group by Source"), "\n")) | |
1526 cat(see_variable(sqldf( | |
1527 "select count(*), Source from ksdata_filtered group by Source"), "\n")) | |
1528 sink() | |
1529 } | |
1530 | |
1531 ############################################################################ | |
1532 # Line numbers below refer to lines of: | |
1533 # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R | |
1534 # I would put the original line in a comment but then lint would complain... | |
1535 # - Indeed, I had to rename all the variables because lint didn't like names | |
1536 # containing periods or capital letters. | |
1537 # ACE | |
1538 ############################################################################ | |
1539 # | |
1540 # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | |
507 # colnames of ksdata_filtered: | 1541 # colnames of ksdata_filtered: |
508 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" | 1542 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" |
509 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" | 1543 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" |
510 # "SITE_...7_AA" "networkin_score" "Source" | 1544 # "SITE_...7_AA" "networkin_score" "Source" |
511 # colnames of new: | 1545 # colnames of new: |
514 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc | 1548 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc |
515 # FROM ksdata_filtered a | 1549 # FROM ksdata_filtered a |
516 # INNER JOIN new b | 1550 # INNER JOIN new b |
517 # ON a.SUB_GENE = b.SUB_GENE | 1551 # ON a.SUB_GENE = b.SUB_GENE |
518 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD | 1552 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD |
1553 # (KSEA.Scores.R line # 105) | |
1554 # "Extract KSData.filtered annotations that are only found in new" | |
519 ksdata_dataset <- base::merge(ksdata_filtered, new) | 1555 ksdata_dataset <- base::merge(ksdata_filtered, new) |
520 # colnames of ksdata_dataset: | 1556 # colnames of ksdata_dataset: |
521 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" | 1557 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" |
522 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" | 1558 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" |
523 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" | 1559 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" |
524 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) | 1560 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) |
525 # Re-order dataset; prior to accounting for isoforms | 1561 # Re-order dataset; prior to accounting for isoforms |
1562 # (KSEA.Scores.R line # 106) | |
526 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] | 1563 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] |
527 # Extract non-isoform accession in UniProtKB | 1564 # Extract non-isoform accession in UniProtKB |
1565 # (KSEA.Scores.R line # 107) | |
528 ksdata_dataset$uniprot_no_isoform <- sapply( | 1566 ksdata_dataset$uniprot_no_isoform <- sapply( |
529 ksdata_dataset$KIN_ACC_ID, | 1567 ksdata_dataset$KIN_ACC_ID, |
530 function(x) unlist(strsplit(as.character(x), split = "-"))[1] | 1568 function(x) unlist(strsplit(as.character(x), split = "-"))[1] |
531 ) | 1569 ) |
1570 # "last expression collapses isoforms ... for easy processing" | |
532 # Discard previous results while selecting interesting columns ... | 1571 # Discard previous results while selecting interesting columns ... |
1572 # (KSEA.Scores.R line # 110) | |
533 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] | 1573 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] |
534 # Column names are now: | 1574 # Column names are now: |
535 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" | 1575 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" |
536 # "FC" "log2_fc" "Source" | 1576 # "FC" "log2_fc" "Source" |
537 # Make column names human-readable | 1577 # Make column names human-readable |
1578 # (KSEA.Scores.R line # 111) | |
538 colnames(ksdata_dataset_abbrev) <- c( | 1579 colnames(ksdata_dataset_abbrev) <- c( |
539 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", | 1580 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", |
540 "FC", "log2FC", "Source" | 1581 "FC", "log2FC", "Source" |
541 ) | 1582 ) |
542 # SELECT * FROM ksdata_dataset_abbrev | 1583 # SELECT * FROM ksdata_dataset_abbrev |
543 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p | 1584 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p |
1585 # (KSEA.Scores.R line # 112) | |
1586 # "Extract KSData.filtered annotations that are only found in new" | |
544 ksdata_dataset_abbrev <- | 1587 ksdata_dataset_abbrev <- |
545 ksdata_dataset_abbrev[ | 1588 ksdata_dataset_abbrev[ |
546 order( | 1589 order( |
547 ksdata_dataset_abbrev$Kinase.Gene, | 1590 ksdata_dataset_abbrev$Kinase.Gene, |
548 ksdata_dataset_abbrev$Substrate.Gene, | 1591 ksdata_dataset_abbrev$Substrate.Gene, |
549 ksdata_dataset_abbrev$Substrate.Mod, | 1592 ksdata_dataset_abbrev$Substrate.Mod, |
550 ksdata_dataset_abbrev$p), | 1593 ksdata_dataset_abbrev$p), |
551 ] | 1594 ] |
1595 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) | |
552 # First aggregation step to account for multiply phosphorylated peptides | 1596 # First aggregation step to account for multiply phosphorylated peptides |
553 # and differing peptide sequences; the goal here is to combine results | 1597 # and differing peptide sequences; the goal here is to combine results |
554 # for all measurements of the same substrate. | 1598 # for all measurements of the same substrate. |
555 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | 1599 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, |
556 # `Source`, avg(log2FC) AS log2FC | 1600 # `Source`, avg(log2FC) AS log2FC |
558 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | 1602 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, |
559 # `Source` | 1603 # `Source` |
560 # ORDER BY `Kinase.Gene`; | 1604 # ORDER BY `Kinase.Gene`; |
561 # in two steps: | 1605 # in two steps: |
562 # (1) compute average log_2(fold-change) | 1606 # (1) compute average log_2(fold-change) |
1607 # "take the mean of the log2FC amongst phosphosite duplicates" | |
1608 # (KSEA.Scores.R line # 115) | |
563 ksdata_dataset_abbrev <- aggregate( | 1609 ksdata_dataset_abbrev <- aggregate( |
564 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, | 1610 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, |
565 data = ksdata_dataset_abbrev, | 1611 data = ksdata_dataset_abbrev, |
566 FUN = mean | 1612 FUN = mean |
567 ) | 1613 ) |
1614 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) | |
568 # (2) order by Kinase.Gene | 1615 # (2) order by Kinase.Gene |
1616 # (KSEA.Scores.R line # 117) | |
569 ksdata_dataset_abbrev <- | 1617 ksdata_dataset_abbrev <- |
570 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] | 1618 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] |
571 # SELECT `Kinase.Gene`, count(*) | 1619 # SELECT `Kinase.Gene`, count(*) |
572 # FROM ksdata_dataset_abbrev | 1620 # FROM ksdata_dataset_abbrev |
573 # GROUP BY `Kinase.Gene`; | 1621 # GROUP BY `Kinase.Gene`; |
574 # in two steps: | 1622 # in two steps: |
575 # (1) Extract the list of Kinase.Gene names | 1623 # (1) Extract the list of Kinase.Gene names |
1624 # "@@@@@@@@@@@@@@@@@@@@" | |
1625 # "Do analysis for KSEA" | |
1626 # "@@@@@@@@@@@@@@@@@@@@" | |
1627 # (KSEA.Scores.R line # 124) | |
576 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) | 1628 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) |
577 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, | 1629 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, |
578 # named by Kinase.Gene | 1630 # named by Kinase.Gene |
1631 # (KSEA.Scores.R line # 125) | |
579 kinase_list <- as.matrix(table(kinase_list)) | 1632 kinase_list <- as.matrix(table(kinase_list)) |
580 # Second aggregation step to account for all substrates per kinase | 1633 # Second aggregation step to account for all substrates per kinase |
581 # CREATE TABLE mean_fc | 1634 # CREATE TABLE mean_fc |
582 # AS | 1635 # AS |
583 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC | 1636 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC |
584 # FROM ksdata_dataset_abbrev | 1637 # FROM ksdata_dataset_abbrev |
585 # GROUP BY `Kinase.Gene` | 1638 # GROUP BY `Kinase.Gene` |
586 mean_fc <- aggregate( | 1639 # (KSEA.Scores.R line # 127) |
587 log2FC ~ Kinase.Gene, | 1640 if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n") |
588 data = ksdata_dataset_abbrev, | 1641 mean_fc <- |
589 FUN = mean | 1642 aggregate( |
590 ) | |
591 # mean_fc columns: "Kinase.Gene", "log2FC" | |
592 if (FALSE) { | |
593 # I need to re-think this; I was trying to find the most-represented | |
594 # peptide, but that horse has already left the barn | |
595 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC | |
596 # FROM ksdata_dataset_abbrev | |
597 # GROUP BY `Kinase.Gene` | |
598 max_fc <- aggregate( | |
599 log2FC ~ Kinase.Gene, | 1643 log2FC ~ Kinase.Gene, |
600 data = ksdata_dataset_abbrev, | 1644 data = ksdata_dataset_abbrev, |
601 FUN = function(r) max(abs(r)) | 1645 FUN = mean |
602 ) | 1646 ) |
603 } | 1647 if (print_nb_messages) nbe(see_variable(mean_fc), "\n") |
1648 | |
1649 # for contrast j | |
1650 # for each kinase i | |
1651 # extract log2 of fold-change (from `new` above) | |
1652 # (used in KSEA.Scores.R lines # 130 & 132) | |
1653 log2_fc_j_each_i <- | |
1654 new$log2_fc | |
1655 | |
1656 # for contrast j | |
1657 # for all kinases i | |
1658 # compute mean of abs(log2 of fold-change) | |
1659 # (used in KSEA.Scores.R lines # 130) | |
1660 mean_abs_log2_fc_j_all_i <- | |
1661 mean(abs(log2_fc_j_each_i), na.rm = TRUE) | |
1662 | |
1663 # for contrast j | |
1664 # for all kinases i | |
1665 # compute mean of log2 of fold-change | |
1666 # (used in KSEA.Scores.R lines # 132) | |
1667 mean_log2_fc_j_all_i <- | |
1668 mean(log2_fc_j_each_i, na.rm = TRUE) | |
1669 | |
1670 # Reorder mean_fc, although I don't see why | |
1671 # (KSEA.Scores.R line 128 | |
1672 mean_fc <- mean_fc[order(mean_fc[, 1]), ] | |
1673 # mean_fc columns so far: "Kinase.Gene", "log2FC" | |
1674 # - Kinase.Gene | |
1675 # indicates the gene name for each kinase. | |
604 | 1676 |
605 # Create column 3: mS | 1677 # Create column 3: mS |
606 mean_fc$m_s <- mean_fc[, 2] | 1678 # - mS |
1679 # represents the mean log2(fold change) of all the | |
1680 # kinase's substrates. | |
1681 # (KSEA.Scores.R line # 129) | |
1682 mean_fc$m_s <- | |
1683 mean_fc_m_s <- mean_fc[, 2] | |
1684 | |
607 # Create column 4: Enrichment | 1685 # Create column 4: Enrichment |
608 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) | 1686 # - Enrichment |
609 # Create column 5: m, count of substrates | 1687 # is the background-adjusted value of the kinase's mS. |
610 mean_fc$m <- kinase_list | 1688 # (KSEA.Scores.R line # 130) |
1689 mean_fc$enrichment <- | |
1690 mean_fc_m_s / mean_abs_log2_fc_j_all_i | |
1691 | |
1692 # Create column 5: m, count of substrates of kinase (count of j for i) | |
1693 # - m | |
1694 # is the total number of detected substrates | |
1695 # from the experimental dataset for each kinase. | |
1696 # (KSEA.Scores.R line # 131) | |
1697 mean_fc$m <- | |
1698 mean_fc_m <- kinase_list | |
1699 | |
1700 | |
611 # Create column 6: z-score | 1701 # Create column 6: z-score |
612 mean_fc$z_score <- ( | 1702 # - z.score |
613 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * | 1703 # is the normalized score for each kinase, weighted by |
614 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) | 1704 # the number of identified substrates. |
1705 # (KSEA.Scores.R line # 132) | |
1706 mean_fc$z_score <- | |
1707 (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) / | |
1708 sd(log2_fc_j_each_i, na.rm = TRUE) | |
1709 | |
615 # Create column 7: p-value, deduced from z-score | 1710 # Create column 7: p-value, deduced from z-score |
616 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) | 1711 # - p.value |
1712 # represents the statistical assessment for the z.score. | |
1713 # (KSEA.Scores.R line # 133) | |
1714 # "one-tailed p-value" | |
1715 mean_fc$p_value <- | |
1716 pnorm(-abs(mean_fc$z_score)) | |
1717 | |
1718 # zap excluded kinases; this must be done before adjusting p-value | |
1719 if (TRUE) { | |
1720 mean_fc <- | |
1721 mean_fc[ | |
1722 mean_fc$m >= minimum_substrate_count, | |
1723 , | |
1724 drop = FALSE | |
1725 ] | |
1726 } | |
1727 | |
1728 #ACE nb(see_variable(nrow(mean_fc)), "\n") | |
617 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value | 1729 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value |
618 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") | 1730 # - FDR |
619 | 1731 # is the p-value adjusted for multiple hypothesis testing |
620 # Remove log2FC column, which is duplicated as mS | 1732 # using the Benjamini & Hochberg method." |
621 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] | 1733 # (KSEA.Scores.R line # 134) |
1734 mean_fc$fdr <- | |
1735 p.adjust(mean_fc$p_value, method = "fdr") | |
1736 | |
1737 # It makes no sense to leave Z-scores negative when using | |
1738 # absolute value of fold-change | |
1739 if (params$kseaUseAbsoluteLog2FC) { | |
1740 mean_fc$z_score <- abs(mean_fc$z_score) | |
1741 } | |
1742 | |
1743 # Remove second column (log2FC), which is duplicated as mS | |
1744 # (KSEA.Scores.R line # 136) | |
1745 mean_fc <- | |
1746 mean_fc[order(mean_fc$Kinase.Gene), -2] | |
622 # Correct the column names which we had to hack because of the linter... | 1747 # Correct the column names which we had to hack because of the linter... |
623 colnames(mean_fc) <- c( | 1748 colnames(mean_fc) <- c( |
624 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | 1749 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" |
625 ) | 1750 ) |
1751 # (KSEA.Scores.R line # 138) | |
626 return(mean_fc) | 1752 return(mean_fc) |
627 } | 1753 } |
628 | 1754 |
629 low_fdr_barplot <- function( | 1755 ksea_low_fdr_barplot_factory <- function( |
630 rslt, | 1756 rslt, |
631 i_cntrst, | 1757 i_cntrst, |
632 i, | 1758 i, |
633 a_level, | 1759 a_level, |
634 b_level, | 1760 b_level, |
656 k$fdr | 1782 k$fdr |
657 }, | 1783 }, |
658 "p.value" = { | 1784 "p.value" = { |
659 k$p_value | 1785 k$p_value |
660 }, | 1786 }, |
661 stop( | 1787 { |
662 sprintf( | 1788 cat( |
663 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 1789 sprintf( |
664 ksea_cutoff_statistic | 1790 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", |
1791 ksea_cutoff_statistic | |
1792 ) | |
665 ) | 1793 ) |
666 ) | 1794 param_df_exit() |
667 ) | 1795 knitr::knit_exit() |
1796 } | |
1797 ) | |
668 | 1798 |
669 k <- k[selector < ksea_cutoff_threshold, ] | 1799 k <- k[selector < ksea_cutoff_threshold, ] |
670 | 1800 nrow_k <- nrow(k) |
671 if (nrow(k) > 1) { | 1801 |
672 op <- par(mai = c(1, 1.5, 0.4, 0.4)) | 1802 #ACE nbe(see_variable(fdr_barplot_dataframe <- k)) |
1803 | |
1804 if (nrow_k > 0) { | |
1805 max_nchar_rowname <- max(nchar(rownames(k))) | |
1806 my_cex_names <- 1.0 / (1 + nrow_k / 50) | |
1807 | |
1808 if (print_trace_messages) cat_margins("Initially") | |
1809 if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0) | |
1810 if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0) | |
1811 if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0) | |
1812 | |
1813 # fin: The figure region dimensions, (width, height), in inches. | |
1814 # mar: A numerical vector of the form c(bottom, left, top, right) | |
1815 # that gives the number of lines of margin to be specified | |
1816 # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1 | |
1817 | |
1818 # mar: The figure region dimensions, (width, height), in inches. | |
673 numeric_z_score <- as.numeric(k$z_score) | 1819 numeric_z_score <- as.numeric(k$z_score) |
674 z_score_order <- order(numeric_z_score) | 1820 bar_order <- order(-as.numeric(k$p_value)) |
675 kinase_name <- k$kinase_gene | 1821 kinase_name <- k$kinase_gene |
676 long_caption <- | 1822 long_caption <- |
677 sprintf( | 1823 sprintf( |
678 "Kinase z-score, %s < %s, %s", | 1824 "Kinase z-score, %s, KSEA %s < %s", |
1825 caption, | |
679 ksea_cutoff_statistic, | 1826 ksea_cutoff_statistic, |
680 ksea_cutoff_threshold, | 1827 ksea_cutoff_threshold |
681 caption | |
682 ) | 1828 ) |
683 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) | 1829 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) |
684 cat("\n\\clearpage\n") | 1830 # return a function that draws the plot |
685 barplot( | 1831 function() { |
686 height = numeric_z_score[z_score_order], | 1832 par_fin <- par("fin") # vector of width_in_inches and height_in_inches) |
687 border = NA, | 1833 op <- par( |
688 xpd = FALSE, | 1834 bg = if (print_trace_messages) "yellow" else "white", |
689 cex.names = 1.0, | 1835 fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)), |
690 cex.axis = 1.0, | 1836 mar = par("mar") + |
691 main = long_caption, | 1837 c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0) |
692 cex.main = my_cex_caption, | 1838 # bottom, left, top, right |
693 names.arg = kinase_name[z_score_order], | 1839 ) |
694 horiz = TRUE, | 1840 on.exit(par(op)) |
695 srt = 45, | 1841 if (print_trace_messages) cat_margins("Eventually") |
696 las = 1) | 1842 |
697 par(op) | 1843 barplot( |
1844 height = numeric_z_score[bar_order], | |
1845 border = NA, | |
1846 xpd = FALSE, | |
1847 cex.names = my_cex_names, | |
1848 main = long_caption, | |
1849 cex.main = my_cex_caption, | |
1850 names.arg = kinase_name[bar_order], | |
1851 horiz = TRUE, | |
1852 srt = 45, | |
1853 las = 1, | |
1854 cex.axis = 0.9 | |
1855 ) | |
1856 } | |
698 } | 1857 } |
1858 } else { | |
1859 no_op | |
699 } | 1860 } |
700 } | 1861 } |
701 | 1862 |
702 # note that this adds elements to the global variable `ksea_asterisk_hash` | 1863 # note that this adds elements to the global variable `ksea_asterisk_hash` |
703 | 1864 |
704 low_fdr_print <- function( | 1865 ksea_low_fdr_print <- function( |
705 rslt, | 1866 rslt, |
706 i_cntrst, | 1867 i_cntrst, |
707 i, | 1868 i, |
708 a_level, | 1869 a_level, |
709 b_level, | 1870 b_level, |
710 fold_change, | 1871 fold_change, |
711 caption | 1872 caption, |
1873 write_db = TRUE, # if TRUE, write to DB, else print table | |
1874 anchor = c(const_table_anchor_p, const_table_anchor_t) | |
712 ) { | 1875 ) { |
713 rslt_score_list_i <- rslt$score_list[[i]] | 1876 rslt_score_list_i <- rslt$score_list[[i]] |
714 if (!is.null(rslt_score_list_i)) { | 1877 if (!is.null(rslt_score_list_i)) { |
715 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | 1878 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) |
716 k <- contrast_ksea_scores <- data.frame( | 1879 k <- contrast_ksea_scores <- data.frame( |
732 k$fdr | 1895 k$fdr |
733 }, | 1896 }, |
734 "p.value" = { | 1897 "p.value" = { |
735 k$p_value | 1898 k$p_value |
736 }, | 1899 }, |
737 stop( | 1900 { |
738 sprintf( | 1901 cat( |
739 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 1902 sprintf( |
740 ksea_cutoff_statistic | 1903 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", |
1904 ksea_cutoff_statistic | |
1905 ) | |
741 ) | 1906 ) |
742 ) | 1907 param_df_exit() |
743 ) | 1908 knitr::knit_exit() |
1909 } | |
1910 ) | |
744 | 1911 |
745 k <- k[selector < ksea_cutoff_threshold, ] | 1912 k <- k[selector < ksea_cutoff_threshold, ] |
746 # save kinase names to ksea_asterisk_hash | 1913 # save kinase names to ksea_asterisk_hash |
747 for (kinase_name in k$kinase_gene) { | 1914 for (kinase_name in k$kinase_gene) { |
748 ksea_asterisk_hash[[kinase_name]] <- 1 | 1915 ksea_asterisk_hash[[kinase_name]] <- 1 |
749 } | 1916 } |
750 | 1917 |
751 db_write_table_overwrite <- (i_cntrst < 2) | 1918 if (write_db) { |
752 db_write_table_append <- !db_write_table_overwrite | 1919 db_write_table_overwrite <- (i_cntrst < 2) |
753 RSQLite::dbWriteTable( | 1920 db_write_table_append <- !db_write_table_overwrite |
754 conn = db, | 1921 RSQLite::dbWriteTable( |
755 name = "contrast_ksea_scores", | 1922 conn = db, |
756 value = contrast_ksea_scores, | 1923 name = "contrast_ksea_scores", |
757 append = db_write_table_append | 1924 value = contrast_ksea_scores, |
758 ) | 1925 append = db_write_table_append |
759 selector <- switch( | 1926 ) |
760 ksea_cutoff_statistic, | 1927 "" |
761 "FDR" = { | 1928 } else { |
762 contrast_ksea_scores$fdr | 1929 selector <- switch( |
763 }, | 1930 ksea_cutoff_statistic, |
764 "p.value" = { | 1931 "FDR" = { |
765 contrast_ksea_scores$p_value | 1932 contrast_ksea_scores$fdr |
766 }, | 1933 }, |
767 stop( | 1934 "p.value" = { |
768 sprintf( | 1935 contrast_ksea_scores$p_value |
769 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 1936 }, |
770 ksea_cutoff_statistic | 1937 { |
1938 cat( | |
1939 sprintf( | |
1940 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
1941 ksea_cutoff_statistic | |
1942 ) | |
1943 ) | |
1944 param_df_exit() | |
1945 knitr::knit_exit() | |
1946 } | |
1947 ) | |
1948 if (print_nb_messages) nbe(see_variable(contrast_ksea_scores)) | |
1949 output_df <- contrast_ksea_scores[ | |
1950 selector < ksea_cutoff_threshold, | |
1951 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
1952 "z_score", "p_value", "fdr") | |
1953 ] | |
1954 output_df$kinase_gene <- | |
1955 gsub( | |
1956 "_", | |
1957 "\\\\_", | |
1958 output_df$kinase_gene | |
1959 ) | |
1960 colnames(output_df) <- | |
1961 c( | |
1962 colnames(output_df)[1], | |
1963 colnames(output_df)[2], | |
1964 "enrichment", | |
1965 "m_s", | |
1966 "z_score", | |
1967 "p_value", | |
1968 "fdr" | |
1969 ) | |
1970 #ACE output_order <- with(output_df, order(fdr)) | |
1971 output_order <- with(output_df, order(p_value)) | |
1972 output_df <- output_df[output_order, ] | |
1973 | |
1974 output_df[, 2] <- sprintf("%0.3g", output_df[, 2]) | |
1975 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
1976 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
1977 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
1978 output_df$m_s <- sprintf("%d", output_df$m_s) | |
1979 output_df$enrichment <- sprintf("%0.3g", output_df$enrichment) | |
1980 output_ncol <- ncol(output_df) | |
1981 colnames(output_df) <- | |
1982 c( | |
1983 "Kinase", | |
1984 "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)", | |
1985 "Enrichment", | |
1986 "Substrates", | |
1987 "z-score", | |
1988 "p-value", | |
1989 "FDR" | |
1990 ) | |
1991 selector <- switch( | |
1992 ksea_cutoff_statistic, | |
1993 "FDR" = { | |
1994 rslt$score_list[[i]]$FDR | |
1995 }, | |
1996 "p.value" = { | |
1997 rslt$score_list[[i]]$p.value | |
1998 }, | |
1999 { | |
2000 cat( | |
2001 sprintf( | |
2002 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
2003 ksea_cutoff_statistic | |
2004 ) | |
2005 ) | |
2006 param_df_exit() | |
2007 knitr::knit_exit() | |
2008 } | |
2009 ) | |
2010 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
2011 if (print_nb_messages) nbe(see_variable(output_df)) | |
2012 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
2013 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
2014 # with ( | |
2015 # output_df, | |
2016 # ) | |
2017 if (TRUE) { | |
2018 output_df$Kinase <- whack_underscores(output_df$Kinase) | |
2019 data_frame_tabbing_latex( | |
2020 x = output_df, | |
2021 # vector of tab stops, in inches | |
2022 tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0), | |
2023 # vector of headings, registered with tab-stops | |
2024 headings = colnames(output_df), | |
2025 # digits to pass to format.data.frame | |
2026 digits = NULL, | |
2027 # maximumn number of rows to print | |
2028 max = NULL, | |
2029 # optional caption | |
2030 caption = sprintf( | |
2031 "\\text{%s}, KSEA %s < %s", | |
2032 math_caption, | |
2033 ksea_cutoff_statistic, | |
2034 ksea_cutoff_threshold | |
2035 ), | |
2036 # set underscore_whack to TRUE to escape underscores | |
2037 underscore_whack = FALSE, | |
2038 # flag for landscape mode | |
2039 landscape = FALSE, | |
2040 # flag indicating that subsubsection should be used for caption | |
2041 # rather than subsection | |
2042 use_subsubsection_header = TRUE, | |
2043 # character-size indicator; for possible values, see: | |
2044 # https://tug.org/texinfohtml/latex2e.html#Font-sizes | |
2045 charactersize = "small", | |
2046 # set verbatim to TRUE to debug output | |
2047 verbatim = FALSE | |
2048 ) | |
2049 } else { | |
2050 data_frame_table_latex( | |
2051 x = output_df, | |
2052 justification = "l c c c c c c", | |
2053 centered = TRUE, | |
2054 caption = sprintf( | |
2055 "\\text{%s}, KSEA %s < %s", | |
2056 math_caption, | |
2057 ksea_cutoff_statistic, | |
2058 ksea_cutoff_threshold | |
2059 ), | |
2060 anchor = anchor, | |
2061 underscore_whack = FALSE | |
2062 ) | |
2063 } | |
2064 } else { | |
2065 cat( | |
2066 sprintf( | |
2067 "\\break | |
2068 No kinases had | |
2069 \\(\\text{KSEA %s}_\\text{enrichment} < %s\\) | |
2070 for contrast %s\\hfill\\break\n", | |
2071 ksea_cutoff_statistic, | |
2072 ksea_cutoff_threshold, | |
2073 caption | |
771 ) | 2074 ) |
772 ) | 2075 ) |
773 ) | 2076 } |
774 output_df <- contrast_ksea_scores[ | |
775 selector < ksea_cutoff_threshold, | |
776 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
777 "z_score", "p_value", "fdr") | |
778 ] | |
779 output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) | |
780 output_df <- output_df[output_order, ] | |
781 colnames(output_df) <- | |
782 c( | |
783 colnames(output_df)[1], | |
784 colnames(output_df)[2], | |
785 "enrichment", | |
786 "m_s", | |
787 "z_score", | |
788 "p_value", | |
789 "fdr" | |
790 ) | |
791 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
792 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
793 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
794 output_df$m_s <- sprintf("%d", output_df$m_s) | |
795 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) | |
796 output_ncol <- ncol(output_df) | |
797 colnames(output_df) <- | |
798 c( | |
799 "Kinase", | |
800 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", | |
801 "Enrichment", | |
802 "Substrates", | |
803 "z-score", | |
804 "p-value", | |
805 "FDR" | |
806 ) | |
807 selector <- switch( | |
808 ksea_cutoff_statistic, | |
809 "FDR" = { | |
810 rslt$score_list[[i]]$FDR | |
811 }, | |
812 "p.value" = { | |
813 rslt$score_list[[i]]$p.value | |
814 }, | |
815 stop( | |
816 sprintf( | |
817 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
818 ksea_cutoff_statistic | |
819 ) | |
820 ) | |
821 ) | |
822 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
823 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
824 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
825 data_frame_latex( | |
826 x = output_df, | |
827 justification = "l c c c c c c", | |
828 centered = TRUE, | |
829 caption = sprintf( | |
830 "\\text{%s}, %s < %s", | |
831 math_caption, | |
832 ksea_cutoff_statistic, | |
833 ksea_cutoff_threshold | |
834 ), | |
835 anchor = const_table_anchor_p | |
836 ) | |
837 } else { | |
838 cat( | |
839 sprintf( | |
840 "\\break | |
841 No kinases had | |
842 \\(\\text{%s}_\\text{enrichment} < %s\\) | |
843 for contrast %s\\hfill\\break\n", | |
844 ksea_cutoff_statistic, | |
845 ksea_cutoff_threshold, | |
846 caption | |
847 ) | |
848 ) | |
849 } | 2077 } |
2078 } else { | |
2079 "" | |
850 } | 2080 } |
851 } | 2081 } |
852 | 2082 |
853 # create_breaks is a helper for ksea_heatmap | 2083 # create_breaks is a helper for ksea_heatmap |
854 create_breaks <- function(merged_scores) { | 2084 create_breaks <- function(merged_scores) { |
2085 if (sum(!is.na(merged_scores)) < 2) | |
2086 return(NULL) | |
855 if (min(merged_scores, na.rm = TRUE) < -1.6) { | 2087 if (min(merged_scores, na.rm = TRUE) < -1.6) { |
856 breaks_neg <- seq(-1.6, 0, length.out = 30) | 2088 breaks_neg <- seq(-1.6, 0, length.out = 30) |
857 breaks_neg <- | 2089 breaks_neg <- |
858 append( | 2090 append( |
859 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), | 2091 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), |
887 mycol <- unique(append(mycol_neg, mycol_pos)) | 2119 mycol <- unique(append(mycol_neg, mycol_pos)) |
888 color_breaks <- list(breaks_all, mycol) | 2120 color_breaks <- list(breaks_all, mycol) |
889 return(color_breaks) | 2121 return(color_breaks) |
890 } | 2122 } |
891 | 2123 |
2124 hm2plus <- function( | |
2125 x, | |
2126 mat = matrix( | |
2127 c( | |
2128 c(0, 4, 0), | |
2129 c(0, 3, 3), | |
2130 c(2, 1, 1) | |
2131 ), | |
2132 nrow = 3, | |
2133 ncol = 3, | |
2134 byrow = TRUE | |
2135 ), | |
2136 denwid = 0.5, | |
2137 denhgt = 0.15, | |
2138 widths = c(0.5, 2.5, 1.5), | |
2139 heights = c(0.4, 0.15, 3.95), | |
2140 divergent = FALSE, | |
2141 notecol = "grey50", | |
2142 trace = "none", | |
2143 margins = c(6, 20), | |
2144 srtcol = 90, | |
2145 srtrow = 0, | |
2146 density_info = "none", | |
2147 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
2148 key_par = list(), | |
2149 hclustfun = hclust, | |
2150 ... | |
2151 ) { | |
2152 | |
2153 varargs <- list(...) | |
2154 if (FALSE) # this is to avoid commenting out code to pass linting... | |
2155 my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE) | |
2156 else | |
2157 my_hm2 <- heatmap.2 | |
2158 | |
2159 x <- as.matrix(x) | |
2160 if (sum(!is.na(x)) < 1) | |
2161 return(NULL) | |
2162 color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough | |
2163 break_count <- 1 + color_count | |
2164 min_nonax <- min(x, na.rm = TRUE) | |
2165 max_nonax <- max(x, na.rm = TRUE) | |
2166 if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n") | |
2167 if (divergent) { | |
2168 zlim <- max(abs(min_nonax), abs(max_nonax)) | |
2169 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) | |
2170 breaks <- (zlim) / (break_count:1) | |
2171 if (print_nb_messages) nb(see_variable(breaks, "\n")) | |
2172 breaks <- breaks - median(breaks) | |
2173 zlim <- c(-zlim, zlim) | |
2174 if (print_nb_messages) nb(see_variable(zlim, "\n")) | |
2175 } else { | |
2176 zlim <- max(abs(min_nonax), abs(max_nonax)) | |
2177 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) | |
2178 breaks <- zlim / (break_count:1) | |
2179 if (print_nb_messages) nb(see_variable(breaks, "\n")) | |
2180 if (max_nonax < 0) { | |
2181 breaks <- breaks - zlim | |
2182 zlim <- c(-zlim, 0) | |
2183 } else { | |
2184 zlim <- c(0, zlim) | |
2185 } | |
2186 if (print_nb_messages) nb(see_variable(zlim, "\n")) | |
2187 } | |
2188 nonax <- x | |
2189 nonax[is.na(x)] <- min_nonax | |
2190 if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5) | |
2191 if (is.null(heights)) heights <- c(0.4, denhgt, 4.0) | |
2192 colors <- | |
2193 if (divergent && min_nonax < 0) { | |
2194 # divergent colors on both sides of zero | |
2195 colorRampPalette(c("red", "white", "blue"))(color_count) | |
2196 } else if (divergent && min_nonax > 0) { | |
2197 # "divergent" colors > zero | |
2198 colorRampPalette(c("white", "blue"))(color_count) | |
2199 } else if (divergent && max_nonax < 0) { | |
2200 # "divergent" colors < zero | |
2201 colorRampPalette(c("red", "white"))(color_count) | |
2202 } else { | |
2203 # "non-divergent" colors including zero | |
2204 hcl.colors(color_count, "YlOrRd", rev = TRUE) | |
2205 } | |
2206 | |
2207 #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n") | |
2208 #ACE if (print_nb_messages) nb(see_variable(colors, "\n")) | |
2209 #ACE key_par$col = colors | |
2210 #ACE key_par$breaks = breaks | |
2211 | |
2212 if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me | |
2213 if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me | |
2214 suppressWarnings( | |
2215 my_hm2( | |
2216 x = x, | |
2217 col = colors, | |
2218 #ACE symkey = FALSE, | |
2219 density.info = density_info, | |
2220 srtCol = srtcol, | |
2221 srtRow = srtrow, | |
2222 margins = margins, | |
2223 lwid = widths, | |
2224 lhei = heights, | |
2225 key.title = NA, | |
2226 key.xlab = key_xlab, | |
2227 key.par = key_par, | |
2228 lmat = mat, | |
2229 notecol = notecol, | |
2230 trace = trace, | |
2231 bg = "yellow", | |
2232 hclustfun = hclustfun, | |
2233 #ACE breaks = breaks, | |
2234 oldstyle = FALSE, | |
2235 ... # varargs | |
2236 ) | |
2237 ) | |
2238 # implicitly returning value returned by heatmap.2 | |
2239 } | |
2240 | |
892 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap | 2241 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap |
893 draw_kseaapp_summary_heatmap <- function( | 2242 draw_kseaapp_summary_heatmap <- function( |
894 x, | 2243 x, # matrix with row/col names already formatted |
895 sample_cluster, | 2244 sample_cluster, # a binary input of TRUE or FALSE, |
896 merged_asterisk, | 2245 # indicating whether or not to perform |
897 my_cex_row, | 2246 # hierarchical clustering of the sample columns |
898 color_breaks, | 2247 merged_asterisk, # matrix having dimensions of x, values "*" or "" |
899 margins, | 2248 color_breaks, # breaks for color gradation, from create_breaks |
2249 # passed to `breaks` argument of `image` | |
2250 margins = c(8, 15), # two integers setting the bottom and right margins | |
2251 # to accommodate row and column labels | |
2252 master_cex = 0.7, # basis for text sizes | |
900 ... | 2253 ... |
901 ) { | 2254 ) { |
902 merged_scores <- x | 2255 merged_scores <- x |
903 if (!is.matrix(x)) { | 2256 if (!is.matrix(x)) { |
904 cat( | 2257 cat( |
906 "No plot because \\texttt{typeof(x)} is '", | 2259 "No plot because \\texttt{typeof(x)} is '", |
907 typeof(x), | 2260 typeof(x), |
908 "' rather than 'matrix'.\n\n" | 2261 "' rather than 'matrix'.\n\n" |
909 ) | 2262 ) |
910 ) | 2263 ) |
911 } else if (nrow(x) < 2) { | 2264 cat_variable(x) |
912 cat("No plot because matrix x has ", nrow(x), " rows.\n\n") | 2265 return(FALSE) |
913 cat("\\begin{verbatim}\n") | 2266 } |
914 str(x) | 2267 if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex)) |
915 cat("\\end{verbatim}\n") | 2268 nrow_x <- nrow(x) |
916 } else if (ncol(x) < 2) { | 2269 ncol_x <- ncol(x) |
917 cat("No plot because matrix x has ", ncol(x), " columns.\n\n") | 2270 #if (nrow_x < 2) { |
918 cat("\\begin{verbatim}\n") | 2271 if (nrow_x < 1) { |
919 str(x) | 2272 cat("No plot because matrix has no rows.\n\n") |
920 cat("\\end{verbatim}\n") | 2273 return(FALSE) |
921 } else { | 2274 } else if (nrow_x < 2) { |
922 gplots::heatmap.2( | 2275 cat("No plot because matrix has one row. Matrix looks like this:\n\n") |
923 x = merged_scores, | 2276 cat("\n\\begin{verbatim}\n") |
924 Colv = sample_cluster, | 2277 print(x) |
925 scale = "none", | 2278 cat("\n\\end{verbatim}\n") |
926 cellnote = merged_asterisk, | 2279 return(FALSE) |
927 notecol = "white", | 2280 } else if (ncol_x < 2) { |
928 cexCol = 0.9, | 2281 cat("No plot because matrix x has ", ncol_x, " columns.\n\n") |
929 # Heuristically assign size of row labels | 2282 cat_variable(x) |
930 cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), | 2283 return(FALSE) |
931 srtCol = 45, | 2284 } |
932 srtRow = 45, | 2285 max_nchar_rowname <- max(nchar(rownames(x))) |
933 notecex = 3 * my_cex_row, | 2286 max_nchar_colname <- max(nchar(colnames(x))) |
934 col = color_breaks[[2]], | 2287 my_limit <- g_intensity_hm_rows |
935 density.info = "none", | 2288 |
936 trace = "none", | 2289 my_row_cex_scale <- master_cex * 150 / nrow_x |
937 breaks = color_breaks[[1]], | 2290 my_col_cex_scale <- 3.0 |
938 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), | 2291 my_asterisk_scale <- 0.4 * my_row_cex_scale |
939 lhei = c(0.4, 8.0, 1.1), | 2292 my_row_warp <- 1 |
940 lwid = c(0.5, 3), | 2293 my_note_warp <- 2 |
941 key = FALSE, | 2294 my_row_warp <- 1 |
942 margins = margins, | 2295 my_row_cex_asterisk <- |
943 ... | 2296 master_cex * my_row_warp * my_asterisk_scale |
944 ) | 2297 |
945 } | 2298 |
946 } | 2299 my_col_cex <- my_col_cex_scale * master_cex |
947 | 2300 my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex) |
948 # Adapted from KSEAapp::KSEA.Heatmap | 2301 my_key_cex <- 1.286 |
2302 my_hm2_cex <- 1 * master_cex | |
2303 my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4 | |
2304 if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x)) | |
2305 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) | |
2306 my_offset <- 0.05 | |
2307 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) | |
2308 my_scale <- 3.0 | |
2309 if (ncol_x < 10 && nrow_x < 10) | |
2310 my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x) | |
2311 | |
2312 my_heights <- c( | |
2313 0.15, | |
2314 3.85 - my_offset, | |
2315 0.5 + my_offset | |
2316 ) | |
2317 my_margins <- c(1, 1) + | |
2318 c( | |
2319 margins[1] * 0.08 * max_nchar_colname * my_col_cex, | |
2320 margins[2] * 0.04 * max_nchar_rowname * my_row_cex | |
2321 ) | |
2322 | |
2323 my_notecex <- | |
2324 my_scale * | |
2325 min( | |
2326 1.1, | |
2327 my_row_cex_asterisk * my_note_warp, | |
2328 my_col_cex * my_note_warp | |
2329 ) | |
2330 | |
2331 if (print_trace_messages) { | |
2332 cat_variable(my_heights, suffix = "; ") | |
2333 cat_variable(my_margins, suffix = "\n\n") | |
2334 cat_variable(my_row_cex_scale, suffix = "; ") | |
2335 cat_variable(my_col_cex_scale, suffix = "\n\n") | |
2336 cat_variable(my_row_cex_asterisk, suffix = "\n\n") | |
2337 cat_variable(my_row_cex, suffix = "; ") | |
2338 cat_variable(my_col_cex, suffix = "\n\n") | |
2339 cat_variable(my_row_cex, suffix = "; ") | |
2340 cat_variable(my_col_cex, suffix = "\n\n") | |
2341 } | |
2342 | |
2343 hm2plus( | |
2344 x = merged_scores, | |
2345 Colv = sample_cluster, | |
2346 cellnote = merged_asterisk, | |
2347 cex = my_hm2_cex, | |
2348 cexCol = my_col_cex, | |
2349 cexRow = my_row_cex, | |
2350 denhgt = 0.15, | |
2351 density_info = "none", | |
2352 denwid = 0.5, | |
2353 divergent = TRUE, | |
2354 key_par = list(cex = my_key_cex), | |
2355 key_xlab = "Z-score", | |
2356 margins = my_margins, | |
2357 notecex = my_scale * min( | |
2358 1.5, | |
2359 my_row_cex_asterisk * my_note_warp, | |
2360 my_col_cex * my_note_warp | |
2361 ), | |
2362 notecol = "white", | |
2363 scale = "none", | |
2364 srtcol = 90, | |
2365 srtrow = 0, | |
2366 trace = "none", | |
2367 mat = matrix( | |
2368 c( | |
2369 c(0, 3, 3), | |
2370 c(2, 1, 1), | |
2371 c(0, 4, 0) | |
2372 ), | |
2373 nrow = 3, | |
2374 ncol = 3, | |
2375 byrow = TRUE | |
2376 ), | |
2377 widths = c(0.5, 3.1, 0.9), | |
2378 heights = my_heights, | |
2379 ... | |
2380 ) | |
2381 return(TRUE) | |
2382 } | |
2383 | |
2384 # function drawing heatmap of contrast fold-change for each kinase, | |
2385 # adapted from KSEAapp::KSEA.Heatmap | |
949 ksea_heatmap <- function( | 2386 ksea_heatmap <- function( |
950 # the data frame outputs from the KSEA.Scores() function, in list format | 2387 # the data frame outputs from the KSEA.Scores() function, in list format |
951 score_list, | 2388 score_list, |
952 # a character vector of all the sample names for heatmap annotation: | 2389 # a character vector of all the sample names for heatmap annotation: |
953 # - the names must be in the same order as the data in score_list | 2390 # - the names must be in the same order as the data in score_list |
959 # a numeric value between 0 and infinity indicating the min. number of | 2396 # a numeric value between 0 and infinity indicating the min. number of |
960 # substrates a kinase must have to be included in the heatmap | 2397 # substrates a kinase must have to be included in the heatmap |
961 m_cutoff, | 2398 m_cutoff, |
962 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | 2399 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff |
963 # for indicating significant kinases in the heatmap | 2400 # for indicating significant kinases in the heatmap |
964 p_cutoff = | 2401 p_cutoff = { |
965 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), | 2402 cat("argument 'p_cutoff' is required for function 'ksea_heatmap'") |
2403 param_df_exit() | |
2404 knitr::knit_exit() | |
2405 }, | |
966 # a binary input of TRUE or FALSE, indicating whether or not to perform | 2406 # a binary input of TRUE or FALSE, indicating whether or not to perform |
967 # hierarchical clustering of the sample columns | 2407 # hierarchical clustering of the sample columns |
968 sample_cluster, | 2408 sample_cluster, |
969 # a binary input of TRUE or FALSE, indicating whether or not to export | 2409 # a binary input of TRUE or FALSE, indicating whether or not to export |
970 # the heatmap as a .png image into the working directory | 2410 # the heatmap as a .png image into the working directory |
971 export = FALSE, | 2411 export = FALSE, |
972 # bottom and right margins; adjust as needed if contrast names are too long | 2412 # bottom and right margins; adjust as needehttps://tex.stackexchange.com/a/56795d if contrast names are too long |
973 margins = c(6, 20), | 2413 margins = c(6, 6), |
974 # print which kinases? | 2414 # print which kinases? |
975 # - Mandatory argument, must be one of const_ksea_.*_kinases | 2415 # - Mandatory argument, must be one of const_ksea_.*_kinases |
976 which_kinases, | 2416 which_kinases, |
977 # additional arguments to gplots::heatmap.2, such as: | 2417 # additional arguments to gplots::heatmap.2, such as: |
978 # - main: main title of plot | 2418 # - main: main title of plot |
991 paste(names, i, sep = ".") | 2431 paste(names, i, sep = ".") |
992 } | 2432 } |
993 master <- | 2433 master <- |
994 Reduce( | 2434 Reduce( |
995 f = function(...) { | 2435 f = function(...) { |
996 base::merge(..., by = "Kinase.Gene", all = FALSE) | 2436 base::merge(..., by = "Kinase.Gene", all = TRUE) |
997 }, | 2437 }, |
998 x = score_list_m | 2438 x = score_list_m |
999 ) | 2439 ) |
1000 | 2440 |
1001 row.names(master) <- master$Kinase.Gene | 2441 row.names(master) <- master$Kinase.Gene |
1017 } | 2457 } |
1018 return(new) | 2458 return(new) |
1019 } | 2459 } |
1020 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) | 2460 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) |
1021 | 2461 |
1022 # begin hack to print only significant rows | |
1023 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 | 2462 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 |
1024 all_rows <- rownames(merged_stats) | 2463 all_rows <- rownames(merged_stats) |
1025 names(asterisk_rows) <- all_rows | 2464 names(asterisk_rows) <- all_rows |
1026 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) | 2465 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) |
1027 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) | 2466 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) |
1028 merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] | 2467 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] |
1029 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] | 2468 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] |
1030 # end hack to print only significant rows | |
1031 | 2469 |
1032 row_list <- list() | 2470 row_list <- list() |
1033 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows | 2471 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows |
1034 row_list[[const_ksea_all_kinases]] <- all_rows | 2472 row_list[[const_ksea_all_kinases]] <- all_rows |
1035 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows | 2473 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows |
1036 | 2474 |
1037 i <- which_kinases | 2475 i <- which_kinases |
1038 my_row_names <- row_list[[i]] | 2476 my_row_names <- row_list[[i]] |
1039 scrs <- merged_scores[my_row_names, ] | 2477 scrs <- merged_scores[my_row_names, , drop = FALSE] |
1040 stts <- merged_stats[my_row_names, ] | 2478 stts <- merged_stats[my_row_names, , drop = FALSE] |
1041 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) | 2479 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) |
1042 | 2480 |
1043 color_breaks <- create_breaks(scrs) | 2481 color_breaks <- create_breaks(scrs) |
2482 if (is.null(color_breaks)) { | |
2483 cat("No plot because matrix has too few rows.\n\n") | |
2484 return(NULL) | |
2485 } | |
1044 plot_height <- nrow(scrs) ^ 0.55 | 2486 plot_height <- nrow(scrs) ^ 0.55 |
1045 plot_width <- ncol(scrs) ^ 0.7 | 2487 plot_width <- ncol(scrs) ^ 0.7 |
1046 my_cex_row <- 0.25 * 16 / plot_height | |
1047 if (export == "TRUE") { | 2488 if (export == "TRUE") { |
1048 png( | 2489 png( |
1049 "KSEA.Merged.Heatmap.png", | 2490 "KSEA.Merged.Heatmap.png", |
1050 width = plot_width * 300, | 2491 width = plot_width * 300, |
1051 height = 2 * plot_height * 300, | 2492 height = 2 * plot_height * 300, |
1052 res = 300, | 2493 res = 300, |
1053 pointsize = 14 | 2494 pointsize = 14 |
1054 ) | 2495 ) |
1055 } | 2496 } |
1056 draw_kseaapp_summary_heatmap( | 2497 did_draw <- draw_kseaapp_summary_heatmap( |
1057 x = scrs, | 2498 x = scrs, |
1058 sample_cluster = sample_cluster, | 2499 sample_cluster = sample_cluster, |
1059 merged_asterisk = merged_asterisk, | 2500 merged_asterisk = merged_asterisk, |
1060 my_cex_row = my_cex_row, | |
1061 color_breaks = color_breaks, | 2501 color_breaks = color_breaks, |
1062 margins = margins | 2502 margins = margins |
1063 ) | 2503 ) |
1064 if (export == "TRUE") { | 2504 if (export == "TRUE") { |
1065 dev.off() | 2505 dev.off() |
1066 } | 2506 } |
2507 if (!did_draw) | |
2508 return(NULL) | |
1067 return(my_row_names) | 2509 return(my_row_names) |
1068 } | 2510 } |
1069 | 2511 |
1070 # helper for heatmaps of phosphopeptide intensities | 2512 # helpers for heatmaps of phosphopeptide intensities |
1071 | 2513 |
1072 draw_intensity_heatmap <- | 2514 # factory producing function to truncate string after n characters |
2515 trunc_n <- function(n) { | |
2516 function(x) { | |
2517 sapply( | |
2518 X = x, | |
2519 FUN = function(s) { | |
2520 if (is.na(s)) | |
2521 return("NA") | |
2522 cond <- try_catch_w_e(nchar(s) > n) | |
2523 if (!is.logical(cond$value)) { | |
2524 return(cond$value$message) | |
2525 } else if (cond$value) { | |
2526 paste0( | |
2527 strtrim(s, n), | |
2528 "..." | |
2529 ) | |
2530 } else { | |
2531 s | |
2532 } | |
2533 }, | |
2534 USE.NAMES = FALSE | |
2535 ) | |
2536 } | |
2537 } | |
2538 trunc_long_ppep <- function(x) trunc_n(40)(x) | |
2539 trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x) | |
2540 trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x) | |
2541 trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x) | |
2542 | |
2543 # factory producing a function that returns a covariance | |
2544 # matrix's rows (and columns) having variance > v_min | |
2545 keep_cov_w_var_gtr_min <- function(v_min) { | |
2546 function(x) { | |
2547 if (!is.matrix(x)) | |
2548 return(NULL) | |
2549 keepers <- sapply( | |
2550 X = seq_len(nrow(x)), | |
2551 FUN = function(i) { | |
2552 if (x[i, i] < v_min) | |
2553 NA | |
2554 else | |
2555 x[i, i] | |
2556 } | |
2557 ) | |
2558 names(keepers) <- rownames(x) | |
2559 keepers <- keepers[!is.na(keepers)] | |
2560 keepers <- names(keepers) | |
2561 if (length(keepers) == 0) | |
2562 return(NULL) | |
2563 x[keepers, keepers] | |
2564 } | |
2565 } | |
2566 # function that returns a matrix's rows having variance > 1 | |
2567 keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1) | |
2568 | |
2569 # factory producing a function that returns | |
2570 # - either a matrix's rows (rows = TRUE) | |
2571 # - or a matrix's columns (rows = FALSE) | |
2572 # having variance > v_min | |
2573 keep_var_gtr_min <- function(v_min) { | |
2574 function(x, rows = TRUE) { | |
2575 nrowcol <- if (rows) nrow else ncol | |
2576 if (!is.matrix(x)) | |
2577 return(NULL) | |
2578 keepers <- sapply( | |
2579 X = seq_len(nrowcol(x)), | |
2580 FUN = function(i) { | |
2581 row_var <- var( | |
2582 if (rows) x[i, ] else x[, i], | |
2583 na.rm = TRUE | |
2584 ) | |
2585 if (is.na(row_var) || row_var <= v_min) NA else i | |
2586 } | |
2587 ) | |
2588 keepers <- keepers[!is.na(keepers)] | |
2589 if (rows) x[keepers, ] else x[, keepers] | |
2590 } | |
2591 } | |
2592 | |
2593 keep_var_gtr_0 <- keep_var_gtr_min(0) | |
2594 | |
2595 # function drawing heatmap of phosphopeptide intensities | |
2596 ppep_heatmap <- | |
1073 function( | 2597 function( |
1074 m, # matrix with rownames already formatted | 2598 m, # matrix with rownames already formatted |
1075 cutoff, # cutoff used by hm_heading_function | 2599 cutoff, # cutoff used by hm_heading_function |
1076 hm_heading_function, # construct and cat heading from m and cutoff | 2600 hm_heading_function, # construct $ cat heading from m and cutoff |
1077 hm_main_title, # main title for plot (drawn below heading) | 2601 hm_main_title, # main title for plot (drawn below heading) |
1078 suppress_row_dendrogram = TRUE, # set to false to show dendrogram | 2602 suppress_row_dendrogram = TRUE, # set to false to show dendrogram |
1079 max_peptide_count # experimental: | 2603 max_peptide_count = # experimental: |
1080 = intensity_hm_rows, # values of 50 and 75 worked well | 2604 g_intensity_hm_rows, # values of 50 and 75 worked well |
1081 ... # passthru parameters for heatmap | 2605 master_cex = 1.0, # basis for text sizes |
2606 margins = NULL, # optional margins (bottom, right) | |
2607 cellnote = NULL, # optional matrix of character; dim = dim(m) | |
2608 adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right | |
2609 ... # passthru to hm2plus or heatmap.2 | |
1082 ) { | 2610 ) { |
2611 use_heatmap_1 <- FALSE | |
1083 peptide_count <- 0 | 2612 peptide_count <- 0 |
1084 # emit the heading for the heatmap | 2613 # emit the heading for the heatmap |
1085 if (hm_heading_function(m, cutoff)) { | 2614 if (hm_heading_function(m, cutoff)) { |
1086 peptide_count <- min(max_peptide_count, nrow(m)) | 2615 nrow_m <- nrow(m) |
1087 if (nrow(m) > 1) { | 2616 peptide_count <- min(max_peptide_count, nrow_m) |
2617 if (nrow_m > 1) { | |
1088 m_margin <- m[peptide_count:1, ] | 2618 m_margin <- m[peptide_count:1, ] |
1089 # Margin setting was heuristically derived | 2619 # Margin was heuristically derived to accommodate the widest label |
1090 margins <- | 2620 row_mchar_max <- max(nchar(rownames(m_margin))) |
1091 c(0.5, # col | 2621 col_mchar_max <- max(nchar(colnames(m_margin))) |
1092 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row | 2622 row_margin <- master_cex * row_mchar_max * 2.6 |
1093 ) | 2623 col_margin <- master_cex * col_mchar_max * 2.6 |
1094 } | 2624 if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin)) |
1095 if (nrow(m) > 1) { | 2625 if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin)) |
2626 hm_call <- NULL | |
1096 tryCatch( | 2627 tryCatch( |
1097 { | 2628 { |
1098 old_oma <- par("oma") | 2629 # set non-argument parameters for hm_call inner function |
1099 par(cex.main = 0.6) | 2630 my_row_cex <- |
1100 # Heuristically determined character size adjustment formula | 2631 master_cex * 200000 / ( |
1101 char_contractor <- | 2632 (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows |
1102 250000 / ( | |
1103 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows | |
1104 ) | 2633 ) |
1105 heatmap( | 2634 m_hm <- m[peptide_count:1, , drop = FALSE] |
1106 m[peptide_count:1, ], | 2635 if (is.null(cellnote)) { |
1107 Rowv = if (suppress_row_dendrogram) NA else NULL, | 2636 cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm)) |
1108 Colv = NA, | 2637 cellnote[is.na(m_hm)] <- "NA" |
1109 cexRow = char_contractor, | 2638 } else { |
1110 cexCol = char_contractor * 50 / max_peptide_count, | 2639 cellnote <- cellnote[peptide_count:1, , drop = FALSE] |
1111 scale = "row", | 2640 } |
1112 margins = margins, | 2641 m_hm[is.na(m_hm)] <- 0 |
1113 main = | 2642 nrow_m_hm <- nrow(m_hm) |
1114 "Unimputed, unnormalized log(intensities)", | 2643 ncol_m_hm <- ncol(m_hm) |
1115 xlab = "", | 2644 if (nrow_m_hm < 1 || ncol_m_hm < 1) |
1116 las = 1, | 2645 return(peptide_count) # return zero as initialized above |
1117 ... | 2646 my_limit <- g_intensity_hm_rows |
2647 | |
2648 | |
2649 my_row_cex <- master_cex * (100 / (2 + row_mchar_max)) | |
2650 my_col_cex <- master_cex * 6 * row_margin / col_margin | |
2651 my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex | |
2652 my_col_cex <- min(my_col_cex, my_row_cex) | |
2653 col_margin <- sqrt(my_col_adj) * col_margin | |
2654 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) | |
2655 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) | |
2656 if (is.null(margins)) my_margins <- | |
2657 c( | |
2658 (my_col_cex + col_margin), # col | |
2659 (my_row_cex + row_margin) / my_row_cex # row | |
1118 ) | 2660 ) |
2661 else | |
2662 my_margins <- margins | |
2663 | |
2664 if (print_trace_messages) cat( | |
2665 sprintf( | |
2666 "my_margins = c(%s)\n\n", | |
2667 paste(my_margins, collapse = ", ") | |
2668 ) | |
2669 ) | |
2670 my_hm2_cex <- 2 * master_cex | |
2671 my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows | |
2672 my_key_warp <- 1.5 * 22.75 / row_margin | |
2673 my_key_cex <- min(1.10, my_key_warp * my_key_cex) | |
2674 my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1) | |
2675 my_hgt_scale <- 3.75 # 3.615 | |
2676 my_hgt_scale <- 3.60 # 3.615 | |
2677 if (print_trace_messages) | |
2678 cat_variable(my_hgt_scale, "\n\n", 3) | |
2679 my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows) | |
2680 if (print_trace_messages) | |
2681 cat_variable(my_warp, "\n\n", 3) | |
2682 # added 0.9 heuristically... | |
2683 my_plot_height <- | |
2684 (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) * | |
2685 min(my_hgt_scale, my_hgt_scale * my_warp) | |
2686 my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50) | |
2687 my_heights <- c( | |
2688 0.3, # title and top dendrogram | |
2689 my_plot_height, # plot and bottom margin | |
2690 4.15 - my_hgt_scale, # legend | |
2691 0.05 + my_hgt_scale - my_plot_height # whitespace below legend | |
2692 ) | |
2693 my_note_cex <- min(0.8, my_row_cex, my_col_cex) | |
2694 if (print_trace_messages) { | |
2695 cat_variable(my_plot_height, "\n\n", 3) | |
2696 cat_variable(4.19 - my_hgt_scale, "\n\n", 3) | |
2697 cat_variable(nrow_m_hm, "; ", 0) | |
2698 cat_variable(ncol_m_hm, "; ", 0) | |
2699 cat_variable(my_row_cex, "; ", 3) | |
2700 cat_variable(my_col_cex, "; ", 3) | |
2701 cat_variable(my_note_cex, "; ", 3) | |
2702 cat_variable(my_key_cex, "\n\n", 3) | |
2703 cat_variable(my_hgt_scale, "; ", 3) | |
2704 cat_variable(my_plot_height, "; ", 3) | |
2705 cat_variable(my_warp, "\n\n", 3) | |
2706 cat_variable(my_heights, "; ", 2) | |
2707 cat_variable(sum(my_heights), "\n\n", 3) | |
2708 } | |
2709 | |
2710 # define hm_call inner function | |
2711 hm_call <- function(x, scaling, title) { | |
2712 my_cex_main <- min(5.0, 220 / nchar(title)) | |
2713 op <- par( | |
2714 cex.main = my_cex_main * master_cex, | |
2715 adj = adj | |
2716 ) | |
2717 if ( | |
2718 !is.null( | |
2719 hm2plus( | |
2720 x, | |
2721 Colv = NA, | |
2722 Rowv = TRUE, | |
2723 cexRow = my_row_cex, | |
2724 cexCol = my_col_cex, | |
2725 dendrogram = "row", | |
2726 las = 1, | |
2727 main = title, | |
2728 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
2729 cex = my_hm2_cex, | |
2730 key_par = list(cex = my_key_cex), | |
2731 margins = my_margins, | |
2732 widths = c(0.4, 2.6, 1.5), | |
2733 heights = my_heights, | |
2734 mat = matrix( | |
2735 c( | |
2736 c(0, 3, 3), | |
2737 c(2, 1, 1), | |
2738 c(0, 4, 0), | |
2739 c(0, 0, 0) | |
2740 ), | |
2741 nrow = 4, | |
2742 ncol = 3, | |
2743 byrow = TRUE | |
2744 ), | |
2745 na.rm = TRUE, | |
2746 scale = scaling, | |
2747 srtcol = 90, | |
2748 srtrow = 0, | |
2749 xlab = "", | |
2750 cellnote = cellnote, | |
2751 notecex = my_note_cex, | |
2752 ... | |
2753 ) | |
2754 ) | |
2755 ) { | |
2756 if (print_trace_messages) cat( | |
2757 sprintf( | |
2758 "my_heights = c(%s); sum = %0.3f\n\n", | |
2759 paste( | |
2760 sprintf("%0.3f", my_heights), | |
2761 collapse = ", " | |
2762 ), | |
2763 sum(my_heights) | |
2764 ) | |
2765 ) | |
2766 if (print_trace_messages) cat( | |
2767 sprintf("my_key_cex = %0.3f\n\n", | |
2768 my_key_cex) | |
2769 ) | |
2770 if (print_trace_messages) cat( | |
2771 sprintf("my_key_cex/my_heights[3] = %0.3f\n\n", | |
2772 my_key_cex / my_heights[3]) | |
2773 ) | |
2774 if (print_trace_messages) cat( | |
2775 sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n", | |
2776 my_heights[2] / my_heights[3]) | |
2777 ) | |
2778 } | |
2779 par(op) | |
2780 } | |
2781 | |
2782 # invoke hm_call inner function | |
2783 if (sum(rowSums(!is.na(m_hm)) < 2)) | |
2784 hm_call( | |
2785 m_hm, | |
2786 "none", | |
2787 "log(intensities), unscaled, unimputed, and unnormalized" | |
2788 ) | |
2789 else | |
2790 hm_call( | |
2791 m_hm, | |
2792 "row", | |
2793 "log(intensities), row-scaled, unimputed, and unnormalized" | |
2794 ) | |
1119 }, | 2795 }, |
1120 error = function(e) { | 2796 error = function(e) { |
1121 cat( | 2797 if (!is.null(hm_call)) { |
1122 sprintf( | 2798 m_hm[is.na(m_hm)] <- 0 |
1123 "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", | 2799 tryCatch( |
1124 e$message | 2800 { |
2801 if (nrow(m_hm) > 1) | |
2802 hm_call( | |
2803 m_hm, | |
2804 "none", | |
2805 paste( | |
2806 "log(intensities), unscaled,", | |
2807 "zero-imputed, unnormalized" | |
2808 ) | |
2809 ) | |
2810 else | |
2811 cat("\nThere are too few peptides to produce a heatmap.\n") | |
2812 }, | |
2813 error = function(r) { | |
2814 cat( | |
2815 sprintf( | |
2816 "\n%s %s Internal message: %s\n\\newline\n\n", | |
2817 "Failure drawing heatmap,", | |
2818 "possibly because of too many missing values.\n\\newline\n\n", | |
2819 r$message | |
2820 ) | |
2821 ) | |
2822 cat_margins() | |
2823 } | |
2824 ) | |
2825 } else { | |
2826 cat( | |
2827 "\nFailure drawing heatmap, possibly because of too many missing values.\n" | |
1125 ) | 2828 ) |
1126 ) | 2829 } |
1127 }, | 2830 } |
1128 finally = par(old_oma) | |
1129 ) | 2831 ) |
1130 } | 2832 } |
1131 } | 2833 } |
1132 return(peptide_count) | 2834 # return value: |
1133 } | 2835 peptide_count |
2836 } | |
2837 | |
2838 # function drawing heatmap of correlations if they exist, else covariances | |
2839 cov_heatmap <- | |
2840 function( | |
2841 m, # matrix with rownames already formatted | |
2842 top_substrates = FALSE, | |
2843 ... # passthru to hm2plus or heatmap.2 | |
2844 ) { | |
2845 if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n") | |
2846 #ACE nb(rowSums(m, na.rm = TRUE)) | |
2847 #ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0) | |
2848 #ACE nb(see_variable(bad_rows)) | |
2849 #ACE m <- m[-bad_rows, , drop = FALSE] | |
2850 colnames_m <- colnames(m) | |
2851 is_na_m <- is.na(m) | |
2852 tmp <- m | |
2853 tmp[is_na_m] <- 0 | |
2854 | |
2855 tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE | |
2856 | |
2857 colnames_tmp <- colnames(tmp) | |
2858 | |
2859 my_low_p_seq <- seq( | |
2860 from = min(g_intensity_hm_rows, nrow(m)), | |
2861 to = 1, | |
2862 by = -1 | |
2863 ) | |
2864 | |
2865 if (g_correlate_substrates) { | |
2866 # zap samples having zero or near-zero variance | |
2867 tmp[is.na(tmp)] <- 0 | |
2868 nzv <- caret::nearZeroVar( | |
2869 tmp, # matrix of values, samples x variables | |
2870 freqCut = 1.01, # min(freq most prevalent value / | |
2871 # freq second most prevalent) | |
2872 uniqueCut = 99 # max(number of unique values / | |
2873 # total number of samples) | |
2874 ) | |
2875 tmp <- if (length(nzv) > 0) { | |
2876 m[, -nzv, drop = FALSE] | |
2877 } else { | |
2878 m | |
2879 } | |
2880 } else { | |
2881 tmp <- m[my_low_p_seq, , drop = FALSE] | |
2882 } | |
2883 | |
2884 | |
2885 t_m <- t(tmp) | |
2886 t_m[is.na(t_m)] <- 0 | |
2887 prefiltered_nrow <- ncol(t_m) | |
2888 | |
2889 my_corcov <- cov(t_m) | |
2890 did_filter_rows <- did_filter_cols <- FALSE | |
2891 if (g_correlate_substrates && !is_positive_definite(my_corcov)) { | |
2892 my_correlate_substrates <- FALSE | |
2893 t_m <- t(m[my_low_p_seq, , drop = FALSE]) | |
2894 t_m[is.na(t_m)] <- 0 | |
2895 unfiltered_row_count <- ncol(t_m) | |
2896 unfiltered_col_count <- nrow(t_m) | |
2897 | |
2898 # zap empty samples | |
2899 t_m <- t_m[0 < rowSums(x = t_m), ] | |
2900 # zap substrates present in fewer than two samples | |
2901 foo <- t_m > 0 | |
2902 foo <- colSums(x = foo) > 1 | |
2903 t_m <- t_m[, foo] | |
2904 | |
2905 did_filter_rows <- unfiltered_row_count > ncol(t_m) | |
2906 did_filter_cols <- unfiltered_col_count > nrow(t_m) | |
2907 | |
2908 colnames_tmp <- rownames(t_m) | |
2909 my_corcov <- cov(t_m) | |
2910 if (g_filter_cov_var_gt_1) { | |
2911 my_corcov <- keep_cov_w_var_gtr_1(my_corcov) | |
2912 } | |
2913 } else if (g_correlate_substrates) { | |
2914 my_corcov <- cov2cor(my_corcov) | |
2915 my_correlate_substrates <- TRUE | |
2916 } else { | |
2917 my_correlate_substrates <- FALSE | |
2918 if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov) | |
2919 } | |
2920 | |
2921 omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp] | |
2922 suffix <- if (length(omitted_samples) > 1) "s" else "" | |
2923 | |
2924 f_omissions <- | |
2925 function(is_corr) { | |
2926 cat( | |
2927 sprintf( | |
2928 "Below is the %s plot for %s substrates", | |
2929 if (is_corr) "correlation" else "covariance", | |
2930 sprintf( | |
2931 if (top_substrates) | |
2932 "%0.0f \"highest-quality\"" | |
2933 else | |
2934 "%0.0f", | |
2935 ncol(t_m) | |
2936 ) | |
2937 ) | |
2938 ) | |
2939 if (did_filter_cols) { | |
2940 cat(sprintf(", omitting sample%s ", suffix)) | |
2941 latex_collapsed_vector(", ", omitted_samples) | |
2942 } | |
2943 cat(".\n\n") | |
2944 } | |
2945 | |
2946 if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) { | |
2947 cat( | |
2948 sprintf( | |
2949 "\\newline\n%s %s plot.\n", | |
2950 "Insufficient covariance to produce", | |
2951 if (my_correlate_substrates) | |
2952 "correlation" | |
2953 else | |
2954 "covariance" | |
2955 ), | |
2956 "\\newpage\n" | |
2957 ) | |
2958 return(NULL) | |
2959 } | |
2960 | |
2961 cat("\\leavevmode\n", "\\newpage\n") | |
2962 f_omissions(my_correlate_substrates) | |
2963 | |
2964 master_cex <- 0.4 | |
2965 max_nchar <- max(nchar(rownames(t_m))) | |
2966 my_limit <- g_intensity_hm_rows | |
2967 diminution <- sqrt(my_limit / (my_limit + ncol(t_m))) | |
2968 my_row_cex <- | |
2969 my_col_cex <- | |
2970 min(1.75, master_cex * 9 * diminution ^ 1.5) | |
2971 my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar) | |
2972 my_key_cex <- 1.4 | |
2973 my_hm2_cex <- 1.0 * master_cex | |
2974 my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4) | |
2975 my_hgt_scale <- 2.7 | |
2976 | |
2977 my_legend_height <- 4.0 - my_hgt_scale | |
2978 my_legend_height <- 0.5 * my_key_cex | |
2979 my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit | |
2980 my_warp <- 0.8 | |
2981 my_legend_height <- 0.77 | |
2982 my_legend_height <- 0.67 | |
2983 my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height | |
2984 my_legend_height <- my_warp * my_legend_height | |
2985 | |
2986 parjust <- par(adj = 0.5) | |
2987 on.exit(par(parjust)) | |
2988 my_corcov <- my_corcov[order(rownames(my_corcov)), ] | |
2989 my_main <- | |
2990 sprintf("%s among %s substrates %s", | |
2991 if (my_correlate_substrates) "Correlation" | |
2992 else "Covariance", | |
2993 kinase_name, | |
2994 if (!my_correlate_substrates && | |
2995 g_filter_cov_var_gt_1 && | |
2996 did_filter_rows | |
2997 ) | |
2998 "having variance > 1" | |
2999 else "" | |
3000 ) | |
3001 my_main_nchar <- nchar(my_main) | |
3002 my_heights <- c( | |
3003 0.3, | |
3004 my_plot_height, | |
3005 my_legend_height # was 4.0 - my_hgt_scale # was 4.19 | |
3006 ) | |
3007 if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar)) | |
3008 if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin)) | |
3009 if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height)) | |
3010 if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex)) | |
3011 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) | |
3012 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) | |
3013 if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex)) | |
3014 if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale)) | |
3015 if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height)) | |
3016 if (print_trace_messages) cat( | |
3017 sprintf( | |
3018 "my_heights = c(%s); sum = %0.3f\n\n", | |
3019 paste( | |
3020 sprintf("%0.3f", my_heights), | |
3021 collapse = ", " | |
3022 ), | |
3023 sum(my_heights) | |
3024 ) | |
3025 ) | |
3026 op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar) | |
3027 on.exit(par(op)) | |
3028 hm2plus( | |
3029 x = my_corcov, | |
3030 cex = my_hm2_cex, | |
3031 cexCol = my_col_cex, | |
3032 cexRow = my_row_cex, | |
3033 density_info = "none", | |
3034 denhgt = 0.15, | |
3035 denwid = 0.5, | |
3036 divergent = TRUE, | |
3037 key_par = list(cex = my_key_cex), | |
3038 key_xlab = if (my_correlate_substrates) "Correlation" | |
3039 else "Covariance", | |
3040 main = my_main, | |
3041 mat = matrix( | |
3042 c( | |
3043 c(0, 3, 3), | |
3044 c(2, 1, 1), | |
3045 c(0, 4, 0) | |
3046 ), | |
3047 nrow = 3, | |
3048 ncol = 3, | |
3049 byrow = TRUE | |
3050 ), | |
3051 heights = my_heights, | |
3052 margins = c(my_margin, my_margin), | |
3053 widths = c(0.5, 3.1, 0.9), | |
3054 scale = "none", | |
3055 symkey = TRUE, | |
3056 symbreaks = TRUE, | |
3057 symm = FALSE #TODO evaluate TRUE | |
3058 # ... | |
3059 ) | |
3060 } # end cov_heatmap | |
3061 | |
3062 ### FILE IMPORT | |
3063 | |
3064 # function reading bzipped file to data.frame | |
3065 bzip2df <- function(d, f, ctor = bzfile) { | |
3066 # read.delim file (by default, compressed by bzip2) | |
3067 if (file.exists(f)) { | |
3068 conn <- NULL | |
3069 pf <- parent.frame() | |
3070 tryCatch( | |
3071 assign( | |
3072 as.character(substitute(d)), | |
3073 read.delim(conn <- bzfile(f, open = "r")), | |
3074 pf | |
3075 ), | |
3076 finally = if (!is.null(conn)) close(conn) | |
3077 ) | |
3078 } | |
3079 } | |
3080 | |
1134 ``` | 3081 ``` |
1135 | 3082 |
1136 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
1137 cat("\\listoftables\n") | |
1138 ``` | |
1139 # Purpose | 3083 # Purpose |
1140 | 3084 |
1141 To perform for phosphopeptides: | 3085 The purpose of this analysis is to perform for phosphopeptides: |
1142 | 3086 |
1143 - imputation of missing values, | 3087 - imputation of missing values, |
1144 - quantile normalization, | 3088 - quantile normalization, |
1145 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and | 3089 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), |
3090 - assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and | |
1146 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: | 3091 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: |
1147 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) | 3092 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) |
1148 - The Human Proteome Database [http://hprd.org](http://hprd.org) | 3093 - The Human Proteome Database [http://hprd.org](http://hprd.org) |
1149 - NetworKIN [http://networkin.science/](http://networkin.science/) | 3094 - NetworKIN [http://networkin.science/](http://networkin.science/) |
1150 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) | 3095 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) |
1151 | 3096 |
1152 ```{r include = FALSE} | 3097 ```{r include = FALSE} |
1153 | 3098 |
1154 ### GLOBAL VARIABLES | 3099 if (params$kseaUseAbsoluteLog2FC) { |
1155 | 3100 sfc <- "|s|" |
1156 # parameters for KSEA | 3101 pfc <- "|p|" |
1157 | 3102 pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$" |
1158 ksea_cutoff_statistic <- params$kseaCutoffStatistic | 3103 } else { |
1159 ksea_cutoff_threshold <- params$kseaCutoffThreshold | 3104 sfc <- "s" |
1160 ksea_min_kinase_count <- params$kseaMinKinaseCount | 3105 pfc <- "p" |
3106 pfc_txt <- "${\\log_2 (\\text{fold-change}})$" | |
3107 } | |
1161 | 3108 |
1162 ksea_heatmap_titles <- list() | 3109 ksea_heatmap_titles <- list() |
1163 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- | 3110 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- |
1164 sprintf( | 3111 sprintf( |
1165 "Summary for all kinases enriched in one or more contrasts at %s < %s", | 3112 "Summary for all kinases enriched in one or more contrasts at %s < %s", |
1175 ksea_cutoff_threshold | 3122 ksea_cutoff_threshold |
1176 ) | 3123 ) |
1177 # hash to hold names of significantly enriched kinases | 3124 # hash to hold names of significantly enriched kinases |
1178 ksea_asterisk_hash <- new_env() | 3125 ksea_asterisk_hash <- new_env() |
1179 | 3126 |
1180 # READ PARAMETERS (mostly) | 3127 # PROCESS (mostly read) PARAMETERS |
1181 | |
1182 intensity_hm_rows <- params$intensityHeatmapRows | |
1183 # Input Filename | |
1184 input_file <- params$inputFile | |
1185 | |
1186 # First data column - ideally, this could be detected via regexSampleNames, | |
1187 # but for now leave it as is. | |
1188 first_data_column <- params$firstDataColumn | |
1189 fdc_is_integer <- is.integer(first_data_column) | |
1190 if (fdc_is_integer) { | |
1191 first_data_column <- as.integer(params$firstDataColumn) | |
1192 } | |
1193 | 3128 |
1194 # False discovery rate adjustment for ANOVA | 3129 # False discovery rate adjustment for ANOVA |
1195 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 | 3130 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 |
1196 val_fdr <- | 3131 val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "") |
1197 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") | |
1198 | 3132 |
1199 if ( | 3133 if ( |
1200 ncol(val_fdr) != 1 || | 3134 ncol(val_fdr) != 1 || |
1201 sum(!is.numeric(val_fdr[, 1])) || | 3135 sum(!is.numeric(val_fdr[, 1])) || |
1202 sum(val_fdr[, 1] < 0) || | 3136 sum(val_fdr[, 1] < 0) || |
1203 sum(val_fdr[, 1] > 1) | 3137 sum(val_fdr[, 1] > 1) |
1204 ) { | 3138 ) { |
1205 stop("alphaFile should be one column of numbers within the range [0.0,1.0]") | 3139 cat("alphaFile should be one column of numbers within the range [0.0,1.0]") |
3140 param_df_exit() | |
3141 knitr::knit_exit() | |
1206 } | 3142 } |
1207 val_fdr <- val_fdr[, 1] | 3143 val_fdr <- val_fdr[, 1] |
1208 | 3144 |
1209 #Imputed Data filename | |
1210 imputed_data_filename <- params$imputedDataFilename | |
1211 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
1212 anova_ksea_mtdt_file <- params$anovaKseaMetadata | |
1213 | |
1214 ``` | 3145 ``` |
1215 | 3146 |
1216 ```{r echo = FALSE} | 3147 ```{r echo = FALSE, results = 'asis'} |
1217 # Imputation method, should be one of | 3148 |
1218 # "random", "group-median", "median", or "mean" | 3149 |
1219 imputation_method <- params$imputationMethod | |
1220 | |
1221 # Selection of percentile of logvalue data to set the mean for random number | |
1222 # generation when using random imputation | |
1223 mean_percentile <- params$meanPercentile / 100.0 | |
1224 | |
1225 # deviation adjustment-factor for random values; real number. | |
1226 sd_percentile <- params$sdPercentile | |
1227 | |
1228 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
1229 regex_sample_names <- params$regexSampleNames | |
1230 | |
1231 # Regular expression to extract Sample Grouping from Sample Name; | |
1232 # if error occurs, compare sample_treatment_levels vs. sample_name_matches | |
1233 # to see if groupings/pairs line up | |
1234 # e.g., "(\\d+)" | |
1235 regex_sample_grouping <- params$regexSampleGrouping | |
1236 | |
1237 one_way_all_categories_fname <- params$oneWayManyCategories | |
1238 one_way_all_categories <- try_catch_w_e( | |
1239 match.fun(one_way_all_categories_fname)) | |
1240 if (!is.function(one_way_all_categories$value)) { | |
1241 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
1242 write(one_way_all_categories$value$message, stderr()) | |
1243 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1244 stop("Cannot continue. Goodbye.") | |
1245 } | |
1246 one_way_all_categories <- one_way_all_categories$value | |
1247 | |
1248 one_way_two_categories_fname <- params$oneWayManyCategories | |
1249 one_way_two_categories <- try_catch_w_e( | |
1250 match.fun(one_way_two_categories_fname)) | |
1251 if (!is.function(one_way_two_categories$value)) { | |
1252 cat("fatal error for parameter oneWayTwoCategories: \n") | |
1253 cat(one_way_two_categories$value$message, fill = TRUE) | |
1254 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1255 stop("Cannot continue. Goodbye.") | |
1256 } | |
1257 one_way_two_categories <- one_way_two_categories$value | |
1258 | |
1259 preproc_db <- params$preprocDb | |
1260 ksea_app_prep_db <- params$kseaAppPrepDb | |
1261 result <- file.copy( | 3150 result <- file.copy( |
1262 from = preproc_db, | 3151 from = preproc_db, |
1263 to = ksea_app_prep_db, | 3152 to = ksea_app_prep_db, |
1264 overwrite = TRUE | 3153 overwrite = TRUE |
1265 ) | 3154 ) |
1270 preproc_db, | 3159 preproc_db, |
1271 ksea_app_prep_db, | 3160 ksea_app_prep_db, |
1272 ), | 3161 ), |
1273 stderr() | 3162 stderr() |
1274 ) | 3163 ) |
1275 if (sys.nframe() > 0) quit(save = "no", status = 1) | 3164 if (sys.nframe() > 0) { |
1276 stop("Cannot continue. Goodbye.") | 3165 cat("Cannot continue and quit() failed. Goodbye.") |
1277 } | 3166 param_df_exit() |
3167 knitr::knit_exit() | |
3168 # in case knit_exit doesn't exit | |
3169 quit(save = "no", status = 1) | |
3170 } | |
3171 } | |
3172 | |
3173 if (FALSE) { | |
3174 write.table(x = param_df, file = "test-data/params.txt") | |
3175 } | |
3176 | |
1278 ``` | 3177 ``` |
1279 | 3178 |
1280 ```{r echo = FALSE} | 3179 ```{r echo = FALSE} |
1281 ### READ DATA | 3180 ### READ DATA |
1282 | 3181 |
1287 sep = "\t", | 3186 sep = "\t", |
1288 header = TRUE, | 3187 header = TRUE, |
1289 quote = "", | 3188 quote = "", |
1290 check.names = FALSE | 3189 check.names = FALSE |
1291 ) | 3190 ) |
3191 | |
1292 ``` | 3192 ``` |
1293 | 3193 |
1294 # Extract Sample Names and Treatment Levels | 3194 |
1295 | 3195 # Extraction of Sample Classes and Names from Input Data |
1296 Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2. | |
1297 | 3196 |
1298 ```{r echo = FALSE, results = 'asis'} | 3197 ```{r echo = FALSE, results = 'asis'} |
1299 | 3198 |
1300 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) | 3199 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) |
3200 my_column_names <- names(full_data) | |
1301 | 3201 |
1302 if (!fdc_is_integer) { | 3202 if (!fdc_is_integer) { |
1303 if (length(data_column_indices) > 0) { | 3203 if (length(data_column_indices) > 0) { |
1304 first_data_column <- data_column_indices[1] | 3204 first_data_column <- data_column_indices[1] |
1305 } else { | 3205 } else { |
1306 stop(paste("failed to convert firstDataColumn:", first_data_column)) | 3206 cat(paste("failed to convert firstDataColumn:", first_data_column)) |
3207 param_df_exit() | |
3208 knitr::knit_exit() | |
1307 } | 3209 } |
1308 } | 3210 } |
1309 | 3211 |
1310 cat( | 3212 cat( |
1311 sprintf( | 3213 sprintf( |
1312 paste( | 3214 paste( |
1313 "\n\nThe input data file has peptide-intensity data for each sample", | 3215 "\n\nThe input data file has peptide-intensity data", |
1314 "in one of columns %d through %d.\n\n" | 3216 "in columns %d (\"%s\") through %d (\"%s\")." |
1315 ), | 3217 ), |
1316 min(data_column_indices), | 3218 tmp <- min(data_column_indices), |
1317 max(data_column_indices) | 3219 my_column_names[tmp], |
1318 ) | 3220 tmp <- max(data_column_indices), |
1319 ) | 3221 my_column_names[tmp] |
1320 | 3222 ) |
1321 # Write column names as a LaTeX enumerated list. | 3223 ) |
1322 column_name_df <- data.frame( | 3224 |
1323 column = seq_len(length(colnames(full_data))), | 3225 if (TRUE) { |
1324 name = paste0("\\verb@", colnames(full_data), "@") | 3226 cat0( |
1325 ) | 3227 table_offset(i = 1, new = TRUE), |
1326 data_frame_latex( | 3228 "Sample classes and names are shown in ", |
1327 x = column_name_df, | 3229 table_href(), |
1328 justification = "l l", | 3230 ".\n\n" |
1329 centered = TRUE, | 3231 ) |
1330 caption = "Input data column names", | 3232 } else { |
1331 anchor = const_table_anchor_bp, | 3233 cat0( |
1332 underscore_whack = FALSE | 3234 "\\newcounter{offset}\n", |
1333 ) | 3235 "\\setcounter{offset}{\\value{table}}\n", |
3236 "\\stepcounter{offset}\n", | |
3237 "Sample classes and names are shown in ", | |
3238 table_href(), | |
3239 ".\n\n" | |
3240 ) | |
3241 } | |
3242 | |
3243 #TODO remove this unused variable and assignment | |
3244 if (FALSE) { | |
3245 # Write column names as a LaTeX enumerated list. | |
3246 column_name_df <- data.frame( | |
3247 column = seq_len(length(colnames(full_data))), | |
3248 name = paste0("\\verb@", colnames(full_data), "@") | |
3249 ) | |
3250 } | |
1334 | 3251 |
1335 ``` | 3252 ``` |
1336 | 3253 |
1337 ```{r echo = FALSE, results = 'asis'} | 3254 ```{r echo = FALSE, results = 'asis'} |
3255 # extract intensity columns from full_data to quant_data | |
1338 quant_data <- full_data[first_data_column:length(full_data)] | 3256 quant_data <- full_data[first_data_column:length(full_data)] |
1339 quant_data[quant_data == 0] <- NA | 3257 quant_data[quant_data == 0] <- NA |
1340 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide | 3258 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide |
3259 full_data_names <- colnames(quant_data) | |
1341 # Extract factors and trt-replicates using regular expressions. | 3260 # Extract factors and trt-replicates using regular expressions. |
1342 # Typically: | 3261 # Typically: |
1343 # regex_sample_names is "\\.\\d+[A-Z]$" | 3262 # regex_sample_names is "\\.\\d+[A-Z]$" |
1344 # regex_sample_grouping is "\\d+" | 3263 # regex_sample_grouping is "\\d+" |
1345 # This would distinguish trt-replicates by terminal letter [A-Z] | 3264 # This would distinguish trt-replicates by terminal letter [A-Z] |
1353 colnames(quant_data) <- sample_name_matches | 3272 colnames(quant_data) <- sample_name_matches |
1354 | 3273 |
1355 write_debug_file(quant_data) | 3274 write_debug_file(quant_data) |
1356 | 3275 |
1357 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | 3276 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) |
1358 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) | 3277 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) |
3278 | |
3279 if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n")) | |
3280 if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n")) | |
3281 if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n")) | |
3282 | |
3283 sample_treatment_df <- | |
3284 save_sample_treatment_df <- | |
3285 data.frame( | |
3286 class = smpl_trt, | |
3287 sample = sample_name_matches, | |
3288 full_sample_names = full_data_names | |
3289 ) | |
3290 | |
3291 if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n")) | |
3292 | |
3293 # reorder data | |
3294 my_order <- with(sample_treatment_df, order(class, sample)) | |
3295 quant_data <- quant_data[, my_order] | |
3296 sample_name_matches <- sample_name_matches[my_order] | |
3297 smpl_trt <- smpl_trt[my_order] | |
3298 sample_treatment_df <- data.frame( | |
3299 class = smpl_trt, | |
3300 sample = sample_name_matches | |
3301 ) | |
3302 | |
3303 # filter smpl_trt as appropriate | |
3304 if (sample_group_filter %in% c("include", "exclude")) { | |
3305 include_sample <- | |
3306 mgrepl( | |
3307 v = sample_group_filter_patterns, | |
3308 s = as.character(smpl_trt), | |
3309 fixed = sample_group_filter_fixed, | |
3310 perl = sample_group_filter_perl, | |
3311 ignore.case = sample_group_filter_nocase | |
3312 ) | |
3313 if (sum(include_sample) < 2) { | |
3314 errmsg <- | |
3315 paste( | |
3316 "ERROR:", | |
3317 sum(include_sample), | |
3318 "samples are too few for analysis;", | |
3319 "check input parameters for sample-name parsing" | |
3320 ) | |
3321 cat0( | |
3322 errmsg, | |
3323 "\\stepcounter{offset}\n", | |
3324 " in ", | |
3325 table_href(), | |
3326 ".\n\n" | |
3327 ) | |
3328 data_frame_tabbing_latex( | |
3329 x = save_sample_treatment_df, | |
3330 tabstops = c(1.25, 1.25), | |
3331 caption = "Sample classes", | |
3332 use_subsubsection_header = FALSE | |
3333 ) | |
3334 data_frame_tabbing_latex( | |
3335 x = | |
3336 param_df[ | |
3337 c("regexSampleNames", | |
3338 "regexSampleGrouping", | |
3339 "groupFilterPatterns", | |
3340 "groupFilter", | |
3341 "groupFilterMode" | |
3342 ), | |
3343 ], | |
3344 tabstops = c(1.75), | |
3345 underscore_whack = TRUE, | |
3346 caption = "Input parameters for sample-name parsing", | |
3347 verbatim = FALSE | |
3348 ) | |
3349 param_df_exit() | |
3350 knitr::knit_exit() | |
3351 return(invisible(-1)) | |
3352 } | |
3353 sample_treatment_df <- | |
3354 if (sample_group_filter == "include") | |
3355 sample_treatment_df[include_sample, ] | |
3356 else | |
3357 sample_treatment_df[!include_sample, ] | |
3358 } else { | |
3359 include_sample <- rep.int(TRUE, length(smpl_trt)) | |
3360 } | |
3361 sample_name_matches <- sample_treatment_df$sample | |
3362 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | |
3363 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) | |
3364 sample_treatment_df$class <- smpl_trt | |
3365 | |
3366 # filter quant_data as appropriate | |
1359 number_of_samples <- length(sample_name_matches) | 3367 number_of_samples <- length(sample_name_matches) |
1360 sample_treatment_df <- data.frame( | 3368 quant_data <- quant_data[, sample_name_matches] |
1361 level = sample_treatment_levels, | 3369 |
1362 sample = sample_name_matches | 3370 sample_level_integers <- as.integer(smpl_trt) |
1363 ) | 3371 sample_treatment_levels <- levels(smpl_trt) |
1364 data_frame_latex( | 3372 count_of_treatment_levels <- length(sample_treatment_levels) |
3373 | |
3374 # for each phosphopeptide, across treatment levels, compute minimum | |
3375 # count of observed (i.e., non-missing) values | |
3376 my_env <- new_env() | |
3377 for (l in sample_treatment_levels) | |
3378 my_env[[as.character(l)]] <- | |
3379 as.vector(rowSums(!is.na(quant_data[l == smpl_trt]))) | |
3380 min_group_obs_count <- row_apply( | |
3381 x = Reduce( | |
3382 f = function(l, r) cbind(l, my_env[[r]]), | |
3383 x = sample_treatment_levels, | |
3384 init = c() | |
3385 ), | |
3386 fun = min | |
3387 ) | |
3388 names(min_group_obs_count) <- rownames(quant_data) | |
3389 rm(my_env) | |
3390 | |
3391 # display (possibly-filtered) results | |
3392 cat("\\newpage\n") | |
3393 | |
3394 if (sum(include_sample) > 1) { | |
3395 data_frame_tabbing_latex( | |
1365 x = sample_treatment_df, | 3396 x = sample_treatment_df, |
1366 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", | 3397 tabstops = c(1.25), |
1367 centered = TRUE, | 3398 caption = "Sample classes", |
1368 caption = "Treatment levels", | 3399 use_subsubsection_header = FALSE |
1369 anchor = const_table_anchor_tbp, | 3400 ) |
1370 underscore_whack = FALSE | 3401 } |
1371 ) | 3402 sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6)) |
3403 sample_colsep <- transition_positions(as.integer(sample_treatment_df$class)) | |
1372 ``` | 3404 ``` |
1373 | 3405 |
1374 ```{r echo = FALSE, results = 'asis'} | 3406 ```{r echo = FALSE, results = 'asis'} |
1375 cat("\\newpage\n") | 3407 cat("\\newpage\n") |
1376 ``` | 3408 ``` |
1377 | 3409 |
1378 ## Are the log-transformed sample distributions similar? | 3410 ## Are the log-transformed sample distributions similar? |
1379 | 3411 |
1380 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 3412 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
1381 | 3413 |
1382 quant_data[quant_data == 0] <- NA #replace 0 with NA | 3414 quant_data[quant_data == 0] <- NA #replace 0 with NA |
1383 quant_data_log <- log10(quant_data) | 3415 quant_data_log <- log10(quant_data) |
1384 | 3416 |
1385 rownames(quant_data_log) <- rownames(quant_data) | 3417 rownames(quant_data_log) <- rownames(quant_data) |
1386 colnames(quant_data_log) <- sample_name_matches | 3418 colnames(quant_data_log) <- sample_name_matches |
1387 | 3419 |
1388 write_debug_file(quant_data_log) | 3420 write_debug_file(quant_data_log) |
1389 | 3421 |
1390 # data visualization | 3422 g_ppep_distrib_ctl <- new_env() |
1391 old_par <- par( | 3423 g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow) |
1392 mai = par("mai") + c(0.5, 0, 0, 0) | 3424 g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow)) |
1393 ) | 3425 g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow)) |
1394 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 3426 |
1395 # Vertical plot | 3427 my_ppep_distrib_bxp <- function( |
1396 boxplot( | 3428 x |
1397 quant_data_log | 3429 , sample_name_grow = sample_name_grow |
1398 , las = 1 | 3430 , main |
1399 , col = const_boxplot_fill | 3431 , varwidth = FALSE |
1400 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 3432 , sub = NULL |
1401 , xlab = "Sample" | 3433 , xlab |
1402 ) | 3434 , ylab |
1403 par(old_par) | 3435 , col = const_boxplot_fill |
1404 | 3436 , notch = FALSE |
1405 | 3437 , ppep_distrib_ctl = g_ppep_distrib_ctl |
1406 | 3438 , ... |
1407 cat("\n\n\n") | 3439 ) { |
1408 cat("\n\n\n") | 3440 my_xlab_line <- g_ppep_distrib_ctl$xlab_line |
3441 my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom | |
3442 my_axis <- g_ppep_distrib_ctl$axis | |
3443 | |
3444 if (print_trace_messages) { | |
3445 cat_variable(my_xlab_line, suffix = "; ") | |
3446 cat_variable(my_mai_bottom, suffix = "; ") | |
3447 cat_variable(my_axis, suffix = "\n\n") | |
3448 } | |
3449 | |
3450 old_par <- par( | |
3451 mai = par("mai") + c(my_mai_bottom, 0, 0, 0), | |
3452 cex.axis = my_axis, | |
3453 cex.lab = 1.2 | |
3454 ) | |
3455 tryCatch( | |
3456 { | |
3457 # Vertical plot | |
3458 boxplot( | |
3459 x | |
3460 , las = 2 | |
3461 , col = col | |
3462 , main = main | |
3463 , sub = NULL | |
3464 , ylab = ylab | |
3465 , xlab = NULL | |
3466 , notch = notch | |
3467 , varwidth = varwidth | |
3468 , ... | |
3469 ) | |
3470 title( | |
3471 sub = sub | |
3472 , cex.sub = 1.0 | |
3473 , line = my_xlab_line + 1 | |
3474 ) | |
3475 title( | |
3476 xlab = xlab | |
3477 , line = my_xlab_line | |
3478 ) | |
3479 }, | |
3480 finally = par(old_par) | |
3481 ) | |
3482 } | |
3483 | |
3484 my_ppep_distrib_bxp( | |
3485 x = quant_data_log | |
3486 , sample_name_grow = sample_name_grow | |
3487 , main = "Peptide intensities for each sample" | |
3488 , varwidth = boxplot_varwidth | |
3489 , sub = "Box widths reflect number of peptides for sample" | |
3490 , xlab = "Sample" | |
3491 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
3492 , col = const_boxplot_fill | |
3493 , notch = FALSE | |
3494 ) | |
3495 | |
3496 cat("\n\n\n\n") | |
1409 | 3497 |
1410 ``` | 3498 ``` |
1411 | 3499 |
1412 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | 3500 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} |
1413 if (nrow(quant_data_log) > 1) { | 3501 if (nrow(quant_data_log) > 1) { |
1452 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), | 3540 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), |
1453 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 3541 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
1454 ) | 3542 ) |
1455 ``` | 3543 ``` |
1456 | 3544 |
3545 # Characterization of Input Data | |
3546 | |
1457 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values | 3547 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values |
1458 | 3548 |
1459 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} | 3549 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} |
1460 # determine quantile | 3550 # determine quantile |
1461 q1 <- quantile(logvalues, probs = mean_percentile)[1] | 3551 q1 <- quantile(logvalues, probs = mean_percentile)[1] |
1462 | 3552 |
1463 # 1 = row of matrix (ie, phosphopeptide) | 3553 # 1 = row of matrix (ie, phosphopeptide) |
1464 sds <- apply(quant_data_log, 1, sd_finite) | 3554 sds <- row_apply(quant_data_log, sd_finite) |
1465 if (sum(!is.na(sds)) > 2) { | 3555 if (sum(!is.na(sds)) > 2) { |
1466 plot( | 3556 plot( |
1467 density(sds, na.rm = TRUE) | 3557 density(sds, na.rm = TRUE) |
1468 , main = "Smoothed estimated probability density vs. std. deviation" | 3558 , main = "Smoothed estimated probability density vs. std. deviation" |
1469 , sub = "(probability estimation made with Gaussian smoothing)" | 3559 , sub = "(probability estimation made with Gaussian smoothing)" |
1493 ```{r echo = FALSE} | 3583 ```{r echo = FALSE} |
1494 | 3584 |
1495 # prep for trt-median based imputation | 3585 # prep for trt-median based imputation |
1496 | 3586 |
1497 ``` | 3587 ``` |
1498 # Impute Missing Values | 3588 # Imputation of Missing Values |
1499 | 3589 |
1500 ```{r echo = FALSE} | 3590 ```{r echo = FALSE} |
1501 | 3591 |
1502 imp_smry_pot_peptides_before <- nrow(quant_data_log) | 3592 imp_smry_pot_peptides_before <- nrow(quant_data_log) |
1503 imp_smry_missing_values_before <- number_to_impute | 3593 imp_smry_missing_values_before <- number_to_impute |
1524 quant_data_imp <- quant_data | 3614 quant_data_imp <- quant_data |
1525 imputation_method_description <- | 3615 imputation_method_description <- |
1526 paste("Substitute missing value with", | 3616 paste("Substitute missing value with", |
1527 "median peptide-intensity for sample group.\n" | 3617 "median peptide-intensity for sample group.\n" |
1528 ) | 3618 ) |
1529 sample_level_integers <- as.integer(sample_treatment_levels) | |
1530 # Take the accurate ln(x+1) because the data are log-normally distributed | 3619 # Take the accurate ln(x+1) because the data are log-normally distributed |
1531 # and because median can involve an average of two measurements. | 3620 # and because median can involve an average of two measurements. |
1532 quant_data_imp <- log1p(quant_data_imp) | 3621 quant_data_imp <- log1p(quant_data_imp) |
1533 for (i in seq_len(length(levels(sample_treatment_levels)))) { | 3622 for (i in seq_len(count_of_treatment_levels)) { |
1534 # Determine the columns for this factor-level | 3623 # Determine the columns for this factor-level |
1535 level_cols <- i == sample_level_integers | 3624 level_cols <- i == sample_level_integers |
1536 # Extract those columns | 3625 # Extract those columns |
1537 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] | 3626 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] |
1538 # assign to ind the row-column pairs corresponding to each NA | 3627 # assign to ind the row-column pairs corresponding to each NA |
1539 ind <- which(is.na(lvlsbst), arr.ind = TRUE) | 3628 ind <- which(is.na(lvlsbst), arr.ind = TRUE) |
1540 # No group-median exists if there is only one sample | 3629 # No group-median exists if there is only one sample |
1541 # a given ppep has no measurement; otherwise, proceed. | 3630 # a given ppep has no measurement; otherwise, proceed. |
1542 if (ncol(lvlsbst) > 1) { | 3631 if (ncol(lvlsbst) > 1) { |
1543 the_centers <- | 3632 the_centers <- |
1544 apply(lvlsbst, 1, median, na.rm = TRUE) | 3633 row_apply(lvlsbst, median, na.rm = TRUE) |
1545 for (j in seq_len(nrow(lvlsbst))) { | 3634 for (j in seq_len(nrow(lvlsbst))) { |
1546 for (k in seq_len(ncol(lvlsbst))) { | 3635 for (k in seq_len(ncol(lvlsbst))) { |
1547 if (is.na(lvlsbst[j, k])) { | 3636 if (is.na(lvlsbst[j, k])) { |
1548 lvlsbst[j, k] <- the_centers[j] | 3637 lvlsbst[j, k] <- the_centers[j] |
1549 } | 3638 } |
1563 "median peptide-intensity across all sample classes.\n" | 3652 "median peptide-intensity across all sample classes.\n" |
1564 ) | 3653 ) |
1565 # Take the accurate ln(x+1) because the data are log-normally distributed | 3654 # Take the accurate ln(x+1) because the data are log-normally distributed |
1566 # and because median can involve an average of two measurements. | 3655 # and because median can involve an average of two measurements. |
1567 quant_data_imp <- log1p(quant_data_imp) | 3656 quant_data_imp <- log1p(quant_data_imp) |
1568 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] | 3657 quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]] |
1569 # Take the accurate e^x - 1 to match scaling of original input. | 3658 # Take the accurate e^x - 1 to match scaling of original input. |
1570 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | 3659 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) |
1571 good_rows <- !is.nan(rowMeans(quant_data_imp)) | 3660 good_rows <- !is.nan(rowMeans(quant_data_imp)) |
1572 } | 3661 } |
1573 , "mean" = { | 3662 , "mean" = { |
1579 # Take the accurate ln(x+1) because the data are log-normally distributed, | 3668 # Take the accurate ln(x+1) because the data are log-normally distributed, |
1580 # so arguments to mean should be previously transformed. | 3669 # so arguments to mean should be previously transformed. |
1581 # this will have to be | 3670 # this will have to be |
1582 quant_data_imp <- log1p(quant_data_imp) | 3671 quant_data_imp <- log1p(quant_data_imp) |
1583 # Assign to NA cells the mean for the row | 3672 # Assign to NA cells the mean for the row |
1584 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] | 3673 quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]] |
1585 # Take the accurate e^x - 1 to match scaling of original input. | 3674 # Take the accurate e^x - 1 to match scaling of original input. |
1586 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | 3675 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) |
1587 good_rows <- !is.nan(rowMeans(quant_data_imp)) | 3676 good_rows <- !is.nan(rowMeans(quant_data_imp)) |
1588 } | 3677 } |
1589 , "random" = { | 3678 , "random" = { |
1623 ```{r echo = FALSE} | 3712 ```{r echo = FALSE} |
1624 | 3713 |
1625 imp_smry_pot_peptides_after <- sum(good_rows) | 3714 imp_smry_pot_peptides_after <- sum(good_rows) |
1626 imp_smry_rejected_after <- sum(!good_rows) | 3715 imp_smry_rejected_after <- sum(!good_rows) |
1627 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) | 3716 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) |
3717 | |
3718 # From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0 | |
3719 | |
3720 sink(stderr()) | |
3721 print("`%in%`:") | |
3722 print(`%in%`) | |
3723 sink() | |
3724 | |
3725 stock_in <- | |
3726 names(good_rows) %in% | |
3727 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
3728 if (print_nb_messages) nbe(see_variable(stock_in), "\n") | |
3729 | |
3730 explicit_in <- | |
3731 0 < match( | |
3732 names(good_rows), | |
3733 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
3734 ) | |
3735 if (print_nb_messages) nbe(see_variable(explicit_in), "\n") | |
3736 | |
3737 great_enough_row_names <- good_rows[ | |
3738 names(good_rows) %in% | |
3739 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
3740 ] | |
3741 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") | |
3742 great_enough_row_names <- great_enough_row_names[great_enough_row_names] | |
3743 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") | |
1628 ``` | 3744 ``` |
3745 | |
1629 ```{r echo = FALSE, results = 'asis'} | 3746 ```{r echo = FALSE, results = 'asis'} |
1630 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf | 3747 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf |
1631 tabular_lines_fmt <- paste( | 3748 tabular_lines_fmt <- paste( |
1632 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) | 3749 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) |
3750 " \\leavevmode", | |
1633 " \\caption{Imputation Results}", | 3751 " \\caption{Imputation Results}", |
1634 " \\centering", # \centering centers the table on the page | 3752 " \\centering", # \centering centers the table on the page |
1635 " \\begin{tabular}{l c c c}", | 3753 " \\begin{tabular}{l c c c}", |
1636 " \\hline\\hline", | 3754 " \\hline\\hline", |
1637 " \\ & potential peptides & missing values & rejected", | 3755 " \\ & potential peptides & missing values & rejected", |
1638 " peptides \\\\ [0.5ex]", | 3756 " peptides \\\\ [0.5ex]", |
1639 " \\hline", | 3757 " \\hline", |
1640 " before imputation & %d & %d (%d\\%s) & \\\\", | 3758 " before imputation & %d & %d (%d\\%s) & \\\\", |
1641 " after imputation & %d & %d & %d \\\\ [1ex]", | 3759 " after imputation & %d & %d & %d \\\\", |
3760 " after keep comparable & %d & & %d \\\\ [1ex]", | |
1642 " \\hline", | 3761 " \\hline", |
1643 " \\end{tabular}", | 3762 " \\end{tabular}", |
1644 #" \\label{table:nonlin}", # may be used to refer this table in the text | 3763 #" \\label{table:nonlin}", # may be used to refer this table in the text |
1645 "\\end{table}", | 3764 "\\end{table}", |
1646 sep = "\n" | 3765 sep = "\n" |
1652 imp_smry_missing_values_before, | 3771 imp_smry_missing_values_before, |
1653 imp_smry_pct_missing, | 3772 imp_smry_pct_missing, |
1654 "%", | 3773 "%", |
1655 imp_smry_pot_peptides_after, | 3774 imp_smry_pot_peptides_after, |
1656 imp_smry_missing_values_after, | 3775 imp_smry_missing_values_after, |
1657 imp_smry_rejected_after | 3776 imp_smry_rejected_after, |
3777 length(great_enough_row_names), | |
3778 imp_smry_pot_peptides_before - | |
3779 length(great_enough_row_names) | |
1658 ) | 3780 ) |
1659 cat(tabular_lines) | 3781 cat(tabular_lines) |
1660 ``` | 3782 ``` |
1661 ```{r echo = FALSE} | 3783 |
1662 | 3784 ```{r filter_good_rows, echo = FALSE} |
1663 | 3785 |
1664 # Zap rows where imputation was ineffective | 3786 if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n") |
3787 good_rows <- names(good_rows[names(great_enough_row_names)]) | |
3788 if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n") | |
3789 | |
3790 #ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)] | |
3791 #ACE nbe("good_rows") | |
3792 #ACE nbe(see_variable(good_rows)) | |
3793 #ACE nbe("names(min_group_obs_count) before filter for good rows") | |
3794 #ACE nbe(see_variable(names(min_group_obs_count))) | |
3795 min_group_obs_count <- min_group_obs_count[good_rows] | |
3796 #ACE nbe("min_group_obs_count after filter for good rows") | |
3797 #ACE nbe(see_variable(names(min_group_obs_count))) | |
3798 | |
3799 # Zap rows where imputation was insufficiently effective | |
1665 full_data <- full_data [good_rows, ] | 3800 full_data <- full_data [good_rows, ] |
1666 quant_data <- quant_data [good_rows, ] | 3801 quant_data <- quant_data [good_rows, ] |
1667 | 3802 quant_data_log <- quant_data_log [good_rows, ] |
3803 | |
3804 if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n") | |
1668 quant_data_imp <- quant_data_imp[good_rows, ] | 3805 quant_data_imp <- quant_data_imp[good_rows, ] |
3806 if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n") | |
1669 write_debug_file(quant_data_imp) | 3807 write_debug_file(quant_data_imp) |
1670 quant_data_imp_good_rows <- quant_data_imp | 3808 quant_data_imp_good_rows <- quant_data_imp |
1671 | 3809 |
1672 write_debug_file(quant_data_imp_good_rows) | 3810 write_debug_file(quant_data_imp_good_rows) |
1673 ``` | 3811 ``` |
1717 d_imputed <- d_combined | 3855 d_imputed <- d_combined |
1718 } | 3856 } |
1719 | 3857 |
1720 ``` | 3858 ``` |
1721 | 3859 |
1722 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 3860 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
1723 zero_sd_rownames <- | 3861 zero_sd_rownames <- |
1724 rownames(quant_data_imp)[ | 3862 rownames(quant_data_imp)[ |
1725 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) | 3863 is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0) |
1726 ] | 3864 ] |
1727 | 3865 |
1728 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { | 3866 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { |
1729 stop("All peptides have zero standard deviation. Cannot continue.") | 3867 cat("All peptides have zero standard deviation. Cannot continue.") |
3868 param_df_exit() | |
3869 knitr::knit_exit() | |
1730 } | 3870 } |
1731 if (length(zero_sd_rownames) > 0) { | 3871 if (length(zero_sd_rownames) > 0) { |
1732 cat( | 3872 cat( |
1733 sprintf("%d peptides with zero variance were removed from statistical consideration", | 3873 sprintf( |
1734 length(zero_sd_rownames) | 3874 "%d %s %s", |
3875 length(zero_sd_rownames), | |
3876 "peptides with zero variance", | |
3877 "were removed from statistical consideration" | |
1735 ) | 3878 ) |
1736 ) | 3879 ) |
1737 zap_named_rows <- function(df, nms) { | 3880 zap_named_rows <- function(df, nms) { |
1738 return(df[!(row.names(df) %in% nms), ]) | 3881 return(df[!(row.names(df) %in% nms), ]) |
1739 } | 3882 } |
1740 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) | 3883 quant_data_imp <- |
1741 quant_data <- zap_named_rows(quant_data, zero_sd_rownames) | 3884 zap_named_rows(quant_data_imp, zero_sd_rownames) |
1742 full_data <- zap_named_rows(full_data, zero_sd_rownames) | 3885 quant_data <- |
3886 zap_named_rows(quant_data, zero_sd_rownames) | |
3887 full_data <- | |
3888 zap_named_rows(full_data, zero_sd_rownames) | |
3889 min_group_obs_count <- | |
3890 min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames] | |
1743 } | 3891 } |
1744 | 3892 |
1745 if (sum(is.na(quant_data)) > 0) { | 3893 if (sum(is.na(quant_data)) > 0) { |
1746 cat("\\leavevmode\\newpage\n") | 3894 cat("\\leavevmode\\newpage\n") |
1747 # data visualization | |
1748 old_par <- par( | |
1749 mai = par("mai") + c(0.5, 0, 0, 0) | |
1750 ) | |
1751 # Copy quant data to x | 3895 # Copy quant data to x |
1752 x <- quant_data | 3896 x <- quant_data |
1753 # x gets to have values of: | 3897 # x gets to have values of: |
1754 # - NA for observed values | 3898 # - NA for observed values |
1755 # - 1 for missing values | 3899 # - 1 for missing values |
1774 max(red_dots, blue_dots, na.rm = TRUE) | 3918 max(red_dots, blue_dots, na.rm = TRUE) |
1775 ) | 3919 ) |
1776 show_stripchart <- | 3920 show_stripchart <- |
1777 50 > (count_red + count_blue) / length(sample_name_matches) | 3921 50 > (count_red + count_blue) / length(sample_name_matches) |
1778 if (show_stripchart) { | 3922 if (show_stripchart) { |
1779 boxplot_sub <- "Light blue = data before imputation; Red = imputed data" | 3923 boxplot_sub <- "Light blue = data before imputation; Red = imputed data;" |
1780 } else { | 3924 } else { |
1781 boxplot_sub <- "" | 3925 boxplot_sub <- "" |
1782 } | 3926 } |
1783 | 3927 |
1784 # Vertical plot | 3928 # Vertical plot |
1785 colnames(blue_dots) <- sample_name_matches | 3929 colnames(blue_dots) <- sample_name_matches |
1786 boxplot( | 3930 my_ppep_distrib_bxp( |
1787 blue_dots | 3931 x = blue_dots |
1788 , las = 1 # "always horizontal" | 3932 , sample_name_grow = sample_name_grow |
1789 , col = const_boxplot_fill | |
1790 , ylim = ylim | |
1791 , main = "Peptide intensities after eliminating unusable peptides" | 3933 , main = "Peptide intensities after eliminating unusable peptides" |
1792 , sub = boxplot_sub | 3934 , varwidth = boxplot_varwidth |
3935 , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample") | |
1793 , xlab = "Sample" | 3936 , xlab = "Sample" |
1794 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 3937 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
3938 , col = const_boxplot_fill | |
3939 , notch = FALSE | |
3940 , ylim = ylim | |
1795 ) | 3941 ) |
1796 | 3942 |
1797 if (show_stripchart) { | 3943 if (show_stripchart) { |
1798 # Points | 3944 # Points |
1799 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 3945 # ref: https://r-charts.com/distribution/add-points-boxplot/ |
1801 stripchart( | 3947 stripchart( |
1802 blue_dots, # Data | 3948 blue_dots, # Data |
1803 method = "jitter", # Random noise | 3949 method = "jitter", # Random noise |
1804 jitter = const_stripchart_jitter, | 3950 jitter = const_stripchart_jitter, |
1805 pch = 19, # Pch symbols | 3951 pch = 19, # Pch symbols |
1806 cex = const_stripsmall_cex, # Size of symbols reduced | 3952 cex = const_stripchart_cex, # Size of symbols reduced |
1807 col = "lightblue", # Color of the symbol | 3953 col = "lightblue", # Color of the symbol |
1808 vertical = TRUE, # Vertical mode | 3954 vertical = TRUE, # Vertical mode |
1809 add = TRUE # Add it over | 3955 add = TRUE # Add it over |
1810 ) | 3956 ) |
1811 stripchart( | 3957 stripchart( |
1812 red_dots, # Data | 3958 red_dots, # Data |
1813 method = "jitter", # Random noise | 3959 method = "jitter", # Random noise |
1814 jitter = const_stripchart_jitter, | 3960 jitter = const_stripchart_jitter, |
1815 pch = 19, # Pch symbols | 3961 pch = 19, # Pch symbols |
1816 cex = const_stripsmall_cex, # Size of symbols reduced | 3962 cex = const_stripchart_cex, # Size of symbols reduced |
1817 col = "red", # Color of the symbol | 3963 col = "red", # Color of the symbol |
1818 vertical = TRUE, # Vertical mode | 3964 vertical = TRUE, # Vertical mode |
1819 add = TRUE # Add it over | 3965 add = TRUE # Add it over |
1820 ) | 3966 ) |
1821 | 3967 |
1822 } | 3968 } |
1823 if (TRUE) { | 3969 } |
1824 # show measured values in blue on left half-violin plot | 3970 ``` |
1825 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") | 3971 |
1826 vioplot::vioplot( | 3972 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
1827 x = lapply(blue_dots, function(x) x[!is.na(x)]), | 3973 if (sum(is.na(quant_data)) > 0) { |
1828 col = "lightblue1", | 3974 # show measured values in blue on left half-violin plot |
1829 side = "left", | 3975 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") |
1830 plotCentre = "line", | 3976 old_par <- par( |
1831 ylim = ylim_save, | 3977 mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0), |
1832 main = "Distributions of observed and imputed data", | 3978 cex.axis = g_ppep_distrib_ctl$axis, |
1833 sub = "Light blue = observed data; Pink = imputed data", | 3979 cex.lab = 1.2 |
1834 xlab = "Sample", | 3980 ) |
1835 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 3981 tryCatch( |
1836 ) | 3982 { |
1837 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | 3983 vioplot::vioplot( |
1838 cols_to_delete <- c() | 3984 x = lapply(blue_dots, function(x) x[!is.na(x)]), |
1839 for (ix in seq_len(length(red_violins))) { | 3985 col = "lightblue1", |
1840 if (length(red_violins[[ix]]) < 1) { | 3986 side = "left", |
1841 cols_to_delete <- c(cols_to_delete, ix) | 3987 plotCentre = "line", |
3988 ylim = ylim_save, | |
3989 main = "Distributions of observed and imputed data", | |
3990 sub = NULL, | |
3991 las = 2, | |
3992 xlab = NULL, | |
3993 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
3994 ) | |
3995 title( | |
3996 sub = "Light blue = observed data; Pink = imputed data", | |
3997 cex.sub = 1.0, | |
3998 line = g_ppep_distrib_ctl$xlab_line + 1 | |
3999 ) | |
4000 title( | |
4001 xlab = "Sample", | |
4002 line = g_ppep_distrib_ctl$xlab_line | |
4003 ) | |
4004 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | |
4005 cols_to_delete <- c() | |
4006 for (ix in seq_len(length(red_violins))) { | |
4007 if (length(red_violins[[ix]]) < 1) { | |
4008 cols_to_delete <- c(cols_to_delete, ix) | |
4009 } | |
1842 } | 4010 } |
1843 } | 4011 # destroy any unimputable columns |
1844 # destroy any unimputable columns | 4012 if (!is.null(cols_to_delete)) { |
1845 if (!is.null(cols_to_delete)) { | 4013 red_violins <- red_violins[-cols_to_delete] |
1846 red_violins <- red_violins[-cols_to_delete] | 4014 } |
1847 } | 4015 # plot imputed values in red on right half-violin plot |
1848 # plot imputed values in red on right half-violin plot | 4016 vioplot::vioplot( |
1849 vioplot::vioplot( | 4017 x = red_violins, |
1850 x = red_violins, | 4018 col = "lightpink1", |
1851 col = "lightpink1", | 4019 side = "right", |
1852 side = "right", | 4020 plotCentre = "line", |
1853 plotCentre = "line", | 4021 add = TRUE |
1854 add = TRUE | 4022 ) |
1855 ) | 4023 |
1856 } | 4024 }, |
1857 | 4025 finally = par(old_par) |
1858 par(old_par) | 4026 ) |
1859 | 4027 |
1860 # density plot | 4028 # density plot |
1861 cat("\\leavevmode\n\n\n\n\n\n\n") | 4029 cat("\\leavevmode\n\n\n\n\n\n\n") |
1862 if (can_plot_before_after_imp) { | 4030 if (can_plot_before_after_imp) { |
1863 ylim <- c( | 4031 ylim <- c( |
1891 } | 4059 } |
1892 cat("\\leavevmode\\newpage\n") | 4060 cat("\\leavevmode\\newpage\n") |
1893 } | 4061 } |
1894 ``` | 4062 ``` |
1895 | 4063 |
1896 # Perform Quantile Normalization | 4064 # Quantile Normalization |
1897 | 4065 |
1898 The excellent `normalize.quantiles` function from | 4066 The excellent `normalize.quantiles` function from |
1899 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* | 4067 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* |
1900 performs "quantile normalization" as described Bolstad *et al.* (2003), | 4068 performs "quantile normalization" as described Bolstad *et al.* (2003), |
1901 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* | 4069 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* |
1902 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, | 4070 and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html), |
1903 i.e., it assumes that the goal is to detect | 4071 i.e., it assumes that the goal is to detect |
1904 subtle differences among grossly similar samples (having similar distributions) | 4072 subtle differences among grossly similar samples (having similar distributions) |
1905 by equailzing intra-quantile quantitations. | 4073 by equalizing intra-quantile quantitations^[Unfortunately, |
1906 Unfortunately, one software library upon which it depends | 4074 one software library upon which `preprocessCore` depends |
1907 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* | 4075 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* |
1908 that requires that a specific, non-concurrent version of the library be | 4076 that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be |
1909 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: | 4077 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: |
1910 ``` | 4078 \linebreak |
1911 conda install bioconductor-preprocesscore openblas=0.3.3 | 4079 ` conda install bioconductor-preprocesscore openblas=0.3.3`]. |
1912 ``` | |
1913 | 4080 |
1914 | 4081 |
1915 <!-- | 4082 <!-- |
1916 # Apply quantile normalization using preprocessCore::normalize.quantiles | 4083 # Apply quantile normalization using preprocessCore::normalize.quantiles |
1917 # --- | 4084 # --- |
1918 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html | 4085 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html |
1919 # except this: https://support.bioconductor.org/p/122925/#9135989 | 4086 # except this: https://support.bioconductor.org/p/122925/#9135989 |
1920 # says to install it like this: | 4087 # says to install it like this: |
1921 # ``` | 4088 # ``` |
1922 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) | 4089 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) |
1923 # ``` | 4090 # ``` |
1924 # conda installation (necessary because of a bug in recent openblas): | 4091 # conda installation (necessary because of a bug in recent openblas): |
1925 # conda install bioconductor-preprocesscore openblas=0.3.3 | 4092 # conda install bioconductor-preprocesscore openblas=0.3.3 |
1926 # ... | 4093 # ... |
1927 # --- | 4094 # --- |
1928 # normalize.quantiles {preprocessCore} -- Quantile Normalization | 4095 # normalize.quantiles {preprocessCore} -- Quantile Normalization |
1929 # | 4096 # |
1930 # Description: | 4097 # Description: |
1931 # Using a normalization based upon quantiles, this function normalizes a | 4098 # Using a normalization based upon quantiles, this function normalizes a |
1932 # matrix of probe level intensities. | 4099 # matrix of probe level intensities. |
1933 # | 4100 # |
1934 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the | 4101 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the |
1935 # assumption that the data is missing at random. | 4102 # assumption that the data is missing at random. |
1936 # | 4103 # |
1937 # Usage: | 4104 # Usage: |
1938 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) | 4105 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) |
1939 # | 4106 # |
1940 # Arguments: | 4107 # Arguments: |
1941 # | 4108 # |
1942 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. | 4109 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. |
1943 # | 4110 # |
1944 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, | 4111 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, |
1945 # but in certain situations not making a copy of the matrix, but instead normalizing | 4112 # but in certain situations not making a copy of the matrix, but instead normalizing |
1946 # it in place will be more memory friendly. | 4113 # it in place will be more memory friendly. |
1947 # | 4114 # |
1948 # - keep.names: Boolean option to preserve matrix row and column names in output. | 4115 # - keep.names: Boolean option to preserve matrix row and column names in output. |
1949 # | 4116 # |
1950 # Details: | 4117 # Details: |
1951 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. | 4118 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. |
1952 # No special allowances are made for outliers. If you make use of quantile normalization | 4119 # No special allowances are made for outliers. If you make use of quantile normalization |
1953 # please cite Bolstad et al, Bioinformatics (2003). | 4120 # please cite Bolstad et al, Bioinformatics (2003). |
1954 # | 4121 # |
1955 # This functions will handle missing data (ie NA values), based on | 4122 # This functions will handle missing data (ie NA values), based on |
1956 # the assumption that the data is missing at random. | 4123 # the assumption that the data is missing at random. |
1957 # | 4124 # |
1958 # Note that the current implementation optimizes for better memory usage | 4125 # Note that the current implementation optimizes for better memory usage |
1959 # at the cost of some additional run-time. | 4126 # at the cost of some additional run-time. |
1960 # | 4127 # |
1961 # Value: A normalized matrix. | 4128 # Value: A normalized matrix. |
1962 # | 4129 # |
1963 # Author: Ben Bolstad, bmbolstad.com | 4130 # Author: Ben Bolstad, bmbolstad.com |
1964 # | 4131 # |
1965 # References | 4132 # References |
1966 # | 4133 # |
1967 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide | 4134 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide |
1968 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf | 4135 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf |
1969 # | 4136 # |
1970 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of | 4137 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of |
1971 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias | 4138 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias |
1972 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | 4139 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 |
1973 # http://bmbolstad.com/misc/normalize/normalize.html | 4140 # http://bmbolstad.com/misc/normalize/normalize.html |
1974 # ... | 4141 # ... |
1975 --> | 4142 --> |
1976 ```{r echo = FALSE, results = 'asis'} | 4143 ```{r echo = FALSE, results = 'asis'} |
1977 | 4144 |
4145 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n") | |
1978 if (nrow(quant_data_imp) > 0) { | 4146 if (nrow(quant_data_imp) > 0) { |
1979 quant_data_imp_qn <- preprocessCore::normalize.quantiles( | 4147 quant_data_imp_qn <- preprocessCore::normalize.quantiles( |
1980 as.matrix(quant_data_imp), keep.names = TRUE | 4148 as.matrix(quant_data_imp), keep.names = TRUE |
1981 ) | 4149 ) |
1982 } else { | 4150 } else { |
1983 quant_data_imp_qn <- as.matrix(quant_data_imp) | 4151 quant_data_imp_qn <- as.matrix(quant_data_imp) |
1984 } | 4152 } |
1985 | 4153 |
4154 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n") | |
4155 | |
1986 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) | 4156 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) |
1987 | |
1988 write_debug_file(quant_data_imp_qn) | 4157 write_debug_file(quant_data_imp_qn) |
1989 | 4158 |
1990 quant_data_imp_qn_log <- log10(quant_data_imp_qn) | 4159 quant_data_imp_qn_log <- log10(quant_data_imp_qn) |
1991 | |
1992 write_debug_file(quant_data_imp_qn_log) | 4160 write_debug_file(quant_data_imp_qn_log) |
1993 | 4161 |
4162 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") | |
4163 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") | |
4164 | |
1994 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) | 4165 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) |
1995 | 4166 |
1996 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) | 4167 sel <- row_apply(quant_data_imp_qn_ls, any_nan) |
1997 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls | 4168 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls |
1998 | 4169 |
1999 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] | 4170 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] |
2000 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) | 4171 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) |
2001 | 4172 |
2006 | 4177 |
2007 # Create data.frame used by ANOVA analysis | 4178 # Create data.frame used by ANOVA analysis |
2008 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) | 4179 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) |
2009 ``` | 4180 ``` |
2010 | 4181 |
2011 <!-- ACE insertion begin --> | |
2012 ## Are normalized, imputed, log-transformed sample distributions similar? | 4182 ## Are normalized, imputed, log-transformed sample distributions similar? |
2013 | 4183 |
2014 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 4184 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
2015 | 4185 |
2016 # Save unimputed quant_data_log for plotting below | 4186 # Save unimputed quant_data_log for plotting below |
2017 unimputed_quant_data_log <- quant_data_log | 4187 unimputed_quant_data_log <- quant_data_log |
2018 | 4188 |
2019 # log10 transform (after preparing for zero values, | 4189 # log10 transform (after preparing for zero values, |
2032 ) | 4202 ) |
2033 cat("\n\n\n") | 4203 cat("\n\n\n") |
2034 | 4204 |
2035 | 4205 |
2036 # data visualization | 4206 # data visualization |
4207 if (TRUE) { | |
4208 | |
4209 my_ppep_distrib_bxp( | |
4210 x = quant_data_log | |
4211 , sample_name_grow = sample_name_grow | |
4212 , main = "Peptide intensities for each sample" | |
4213 , varwidth = boxplot_varwidth | |
4214 , sub = NULL | |
4215 , xlab = "Sample" | |
4216 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
4217 , col = const_boxplot_fill | |
4218 , notch = boxplot_notch | |
4219 ) | |
4220 | |
4221 } else { | |
4222 | |
2037 old_par <- par( | 4223 old_par <- par( |
2038 mai = par("mai") + c(0.5, 0, 0, 0) | 4224 mai = par("mai") + c(0.5, 0, 0, 0) |
2039 , oma = par("oma") + c(0.5, 0, 0, 0) | 4225 , oma = par("oma") + c(0.5, 0, 0, 0) |
2040 ) | 4226 ) |
2041 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 4227 # ref: https://r-charts.com/distribution/add-points-boxplot/ |
2042 # Vertical plot | 4228 # Vertical plot |
2043 colnames(quant_data_log) <- sample_name_matches | 4229 colnames(quant_data_log) <- sample_name_matches |
2044 boxplot( | 4230 boxplot( |
2045 quant_data_log | 4231 quant_data_log |
2046 , las = 1 | 4232 , las = 2 |
4233 , cex.axis = 0.9 * sample_name_grow | |
2047 , col = const_boxplot_fill | 4234 , col = const_boxplot_fill |
2048 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4235 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
2049 , xlab = "Sample" | 4236 , xlab = "Sample" |
4237 , notch = boxplot_notch | |
4238 , varwidth = boxplot_varwidth | |
2050 ) | 4239 ) |
2051 par(old_par) | 4240 par(old_par) |
4241 } | |
2052 } else { | 4242 } else { |
2053 cat("There are no peptides to plot\n") | 4243 cat("There are no peptides to plot\n") |
2054 } | 4244 } |
2055 | 4245 |
2056 cat("\n\n\n") | 4246 cat("\n\n\n") |
2072 cat("\\leavevmode\\newpage\n") | 4262 cat("\\leavevmode\\newpage\n") |
2073 ``` | 4263 ``` |
2074 | 4264 |
2075 # ANOVA Analysis | 4265 # ANOVA Analysis |
2076 | 4266 |
2077 ```{r, echo = FALSE} | 4267 ## Assignment of $p$-value and quality score |
4268 | |
4269 For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups. | |
4270 | |
4271 However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x). | |
4272 | |
4273 Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as: | |
4274 | |
4275 $$ | |
4276 \textit{quality}_j | |
4277 = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005} | |
4278 $$ | |
4279 | |
4280 where: | |
4281 | |
4282 - $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and | |
4283 - $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$. | |
4284 | |
4285 | |
4286 ```{r, echo = FALSE, results = 'asis'} | |
4287 cat("\\newpage\n") | |
4288 | |
2078 # Make new data frame containing only Phosphopeptides | 4289 # Make new data frame containing only Phosphopeptides |
2079 # to connect preANOVA to ANOVA (connect_df) | 4290 # to connect preANOVA to ANOVA (connect_df) |
2080 connect_df <- data.frame( | 4291 connect_df <- data.frame( |
2081 data_table_imp_qn_lt$Phosphopeptide | 4292 data_table_imp_qn_lt$Phosphopeptide |
2082 , data_table_imp_qn_lt[, first_data_column] | 4293 , data_table_imp_qn_lt[, first_data_column] |
2083 ) | 4294 ) |
2084 colnames(connect_df) <- c("Phosphopeptide", "Intensity") | 4295 colnames(connect_df) <- c("Phosphopeptide", "Intensity") |
2085 ``` | 4296 ``` |
2086 | 4297 |
2087 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 4298 ```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'} |
2088 count_of_treatment_levels <- length(levels(sample_treatment_levels)) | 4299 g_can_run_ksea <- FALSE |
4300 old_oma <- par("oma") | |
2089 if (count_of_treatment_levels < 2) { | 4301 if (count_of_treatment_levels < 2) { |
2090 nuke_control_sequences <- | |
2091 function(s) { | |
2092 s <- gsub("[\\]", "xyzzy_plugh", s) | |
2093 s <- gsub("[$]", "\\\\$", s) | |
2094 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | |
2095 return(s) | |
2096 } | |
2097 cat( | 4302 cat( |
2098 "ERROR!!!! Cannot perform ANOVA analysis", | 4303 "ERROR!!!! Cannot perform ANOVA analysis", |
2099 "(see next page)\\newpage\n" | 4304 "(see next page)\\newpage\n" |
2100 ) | 4305 ) |
2101 cat( | 4306 cat( |
2107 cat("Unparsed sample names are:\n\n\n", | 4312 cat("Unparsed sample names are:\n\n\n", |
2108 "\\begin{quote}\n", | 4313 "\\begin{quote}\n", |
2109 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), | 4314 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), |
2110 "\n\\end{quote}\n\n") | 4315 "\n\\end{quote}\n\n") |
2111 | 4316 |
2112 regex_sample_names <- nuke_control_sequences(regex_sample_names) | 4317 regex_sample_names <- latex_printable_control_seqs(regex_sample_names) |
2113 | 4318 |
2114 cat("\\leavevmode\n\n\n") | 4319 cat("\\leavevmode\n\n\n") |
2115 cat("Parsing rule for SampleNames is", | 4320 cat("Parsing rule for SampleNames is", |
2116 "\n\n\n", | 4321 "\n\n\n", |
2117 "\\text{'", | 4322 "\\text{'", |
2123 cat("\nParsed sample names are:\n", | 4328 cat("\nParsed sample names are:\n", |
2124 "\\begin{quote}\n", | 4329 "\\begin{quote}\n", |
2125 paste(sample_name_matches, collapse = "\n\n\n"), | 4330 paste(sample_name_matches, collapse = "\n\n\n"), |
2126 "\n\\end{quote}\n\n") | 4331 "\n\\end{quote}\n\n") |
2127 | 4332 |
2128 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) | 4333 regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping) |
2129 | 4334 |
2130 cat("\\leavevmode\n\n\n") | 4335 cat("\\leavevmode\n\n\n") |
2131 cat("Parsing rule for SampleGrouping is", | 4336 cat("Parsing rule for SampleGrouping is", |
2132 "\n\n\n", | 4337 "\n\n\n", |
2133 "\\text{'", | 4338 "\\text{'", |
2142 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), | 4347 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), |
2143 "\n\\end{quote}\n\n") | 4348 "\n\\end{quote}\n\n") |
2144 | 4349 |
2145 } else { | 4350 } else { |
2146 | 4351 |
2147 p_value_data_anova_ps <- | 4352 if (print_nb_messages) nbe("computing p_value_data_anova_ps\n") |
2148 apply( | 4353 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") |
2149 quant_data_imp_qn_log, | 4354 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") |
2150 1, | 4355 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n") |
2151 anova_func, | 4356 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n") |
2152 grouping_factor = sample_treatment_levels, | 4357 if (print_nb_messages) nbe(see_variable(anova_func), "\n") |
2153 one_way_f = one_way_all_categories | 4358 if (print_nb_messages) nbe(see_variable(smpl_trt), "\n") |
2154 ) | 4359 if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n") |
4360 tryCatch( | |
4361 { | |
4362 p_value_data_anova_ps <- | |
4363 row_apply( | |
4364 quant_data_imp_qn_log, | |
4365 anova_func, | |
4366 grouping_factor = smpl_trt, | |
4367 one_way_f = one_way_all_categories | |
4368 ) | |
4369 }, | |
4370 error = function(e) { | |
4371 mesg <- paste("Could not compute ANOVA because", e$message) | |
4372 cat("\n\n", mesg, "\n\n") | |
4373 param_df_noexit(e) | |
4374 sink(stderr()) | |
4375 cat("\n\n", mesg, "\n\n") | |
4376 values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n") | |
4377 cat(values) | |
4378 sink() | |
4379 knitr::knit_exit() | |
4380 exit(code = 1) | |
4381 } | |
4382 ) | |
4383 if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n") | |
2155 | 4384 |
2156 p_value_data_anova_ps_fdr <- | 4385 p_value_data_anova_ps_fdr <- |
2157 p.adjust(p_value_data_anova_ps, method = "fdr") | 4386 p.adjust(p_value_data_anova_ps, method = "fdr") |
4387 my_ppep_v <- full_data[, 1] | |
4388 p_value_data <- list( | |
4389 phosphopeptide = my_ppep_v, | |
4390 raw_anova_p = p_value_data_anova_ps, | |
4391 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, | |
4392 missing_values = rowSums(is.na(quant_data)), | |
4393 min_group_obs_count = min_group_obs_count | |
4394 ) | |
2158 p_value_data <- data.frame( | 4395 p_value_data <- data.frame( |
2159 phosphopeptide = full_data[, 1] | 4396 phosphopeptide = my_ppep_v, |
2160 , | 4397 raw_anova_p = p_value_data_anova_ps, |
2161 raw_anova_p = p_value_data_anova_ps | 4398 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, |
2162 , | 4399 missing_values = rowSums(is.na(quant_data)), |
2163 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr | 4400 min_group_obs_count = min_group_obs_count |
2164 ) | 4401 ) |
4402 p_value_data$quality <- 1.0 / with( | |
4403 p_value_data, | |
4404 fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count) | |
4405 ) | |
4406 | |
4407 p_value_data$ranking <- | |
4408 with( | |
4409 p_value_data, | |
4410 switch( | |
4411 g_intensity_hm_criteria, | |
4412 "quality" = order(-quality), | |
4413 "na_count" = order(missing_values, fdr_adjusted_anova_p), | |
4414 # the default is "p_value" | |
4415 order(fdr_adjusted_anova_p) | |
4416 ) | |
4417 ) | |
4418 p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE] | |
4419 | |
4420 write.table( | |
4421 p_value_data, | |
4422 file = "p_value_data.txt", | |
4423 sep = "\t", | |
4424 col.names = TRUE, | |
4425 row.names = FALSE, | |
4426 quote = FALSE | |
4427 ) | |
4428 | |
2165 | 4429 |
2166 # output ANOVA file to constructed filename, | 4430 # output ANOVA file to constructed filename, |
2167 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | 4431 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" |
2168 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | 4432 # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt" |
2169 | 4433 |
2170 # Re-output datasets to include p-values | 4434 # Re-output datasets to include p-values |
2171 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) | 4435 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)]) |
2172 write.table( | 4436 write.table( |
2173 cbind(metadata_plus_p, quant_data_imp), | 4437 cbind(metadata_plus_p, quant_data_imp), |
2174 file = imputed_data_filename, | 4438 file = imputed_data_filename, |
2175 sep = "\t", | 4439 sep = "\t", |
2176 col.names = TRUE, | 4440 col.names = TRUE, |
2186 row.names = FALSE, | 4450 row.names = FALSE, |
2187 quote = FALSE | 4451 quote = FALSE |
2188 ) | 4452 ) |
2189 | 4453 |
2190 | 4454 |
2191 p_value_data <- | |
2192 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] | |
2193 | |
2194 first_page_suppress <- 1 | 4455 first_page_suppress <- 1 |
2195 number_of_peptides_found <- 0 | 4456 number_of_peptides_found <- 0 |
2196 cutoff <- val_fdr[1] | 4457 cutoff <- val_fdr[1] |
2197 for (cutoff in val_fdr) { | 4458 for (cutoff in val_fdr) { |
2198 if (number_of_peptides_found > 49) { | 4459 #loop through FDR cutoffs |
4460 if (number_of_peptides_found > g_intensity_hm_rows - 1) { | |
2199 cat("\\leavevmode\n\n\n") | 4461 cat("\\leavevmode\n\n\n") |
2200 break | 4462 break |
2201 } | 4463 } |
2202 | 4464 |
2203 #loop through FDR cutoffs | 4465 bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff) |
4466 bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class) | |
4467 g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0) | |
4468 bool_4 <- (p_value_data$quality >= params$minQuality) | |
4469 bool_3 <- as.logical( | |
4470 as.integer(bool_1) * | |
4471 as.integer(bool_2) * | |
4472 as.integer(bool_4) | |
4473 ) | |
4474 if (print_trace_messages) { | |
4475 if (length(bool_1) > 30) { | |
4476 cat_variable(bool_1, force_str = TRUE) | |
4477 cat_variable(bool_2, force_str = TRUE) | |
4478 cat_variable(bool_3, force_str = TRUE) | |
4479 } else { | |
4480 cat_variable(bool_1, suffix = "\n\n") | |
4481 cat_variable(bool_2, suffix = "\n\n") | |
4482 cat_variable(bool_3, suffix = "\n\n") | |
4483 } | |
4484 cat_variable(length(bool_3), digits = 0, suffix = "; ") | |
4485 cat_variable(sum(bool_3), digits = 0, suffix = "\n\n") | |
4486 } | |
2204 | 4487 |
2205 filtered_p <- | 4488 filtered_p <- |
2206 p_value_data[ | 4489 p_value_data[bool_3, , drop = FALSE] |
2207 which(p_value_data$fdr_adjusted_anova_p < cutoff), | 4490 filtered_p <- |
2208 , drop = FALSE | 4491 filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE] |
2209 ] | 4492 |
4493 if (print_trace_messages) | |
4494 cat_variable(filtered_p, force_str = TRUE) | |
4495 | |
4496 have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0 | |
4497 filter_result_string <- | |
4498 sprintf( | |
4499 "%s, %s of %0.0f peptides remained having both %s and %s.\n\n", | |
4500 "After filtering for ANOVA results", | |
4501 if (have_remaining_peptides) | |
4502 as.character(sum(bool_3, na.rm = TRUE)) | |
4503 else | |
4504 "none", | |
4505 length(bool_3), | |
4506 sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))), | |
4507 sprintf( | |
4508 "more than %0.0f observations in some groups", | |
4509 max(0, g_intensity_min_per_class - 1) | |
4510 ) | |
4511 ) | |
4512 | |
2210 filtered_data_filtered <- | 4513 filtered_data_filtered <- |
2211 quant_data_imp_qn_log[ | 4514 quant_data_imp_qn_log[ |
2212 rownames(filtered_p), | 4515 rownames(filtered_p), |
2213 , drop = FALSE | 4516 , drop = FALSE |
2214 ] | 4517 ] |
4518 # order by p-value | |
2215 filtered_data_filtered <- | 4519 filtered_data_filtered <- |
2216 filtered_data_filtered[ | 4520 filtered_data_filtered[ |
2217 order(filtered_p$fdr_adjusted_anova_p), | 4521 order(filtered_p$fdr_adjusted_anova_p), |
2218 , drop = FALSE | 4522 , drop = FALSE |
2219 ] | 4523 ] |
2220 | 4524 |
2221 # <!-- ACE insertion start --> | 4525 if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) { |
2222 | |
2223 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { | |
2224 if (first_page_suppress == 1) { | 4526 if (first_page_suppress == 1) { |
2225 first_page_suppress <- 0 | 4527 first_page_suppress <- 0 |
2226 } else { | 4528 } else { |
2227 cat("\\newpage\n") | 4529 cat("\\newpage\n") |
2228 } | 4530 } |
2229 if (nrow(filtered_data_filtered) > 1) { | 4531 latex_samepage({ |
2230 subsection_header(sprintf( | 4532 cat(filter_result_string) |
2231 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", | 4533 if (nrow(filtered_data_filtered) > 1) { |
2232 nrow(filtered_data_filtered), | 4534 cat(subsection_header(sprintf( |
2233 cutoff | 4535 "Intensity distributions for %d phosphopeptides\n", |
2234 )) | 4536 nrow(filtered_data_filtered) |
2235 } else { | 4537 ))) |
2236 subsection_header(sprintf( | 4538 } else { |
2237 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", | 4539 cat(subsection_header(sprintf( |
2238 rownames(filtered_data_filtered)[1], | 4540 "Intensity distribution for one phosphopeptide (%s)\n", |
2239 cutoff | 4541 rownames(filtered_data_filtered)[1] |
2240 )) | 4542 ))) |
2241 } | 4543 } |
2242 cat("\n\n\n") | 4544 }) # end latex_samepage |
2243 cat("\n\n\n") | 4545 |
2244 | |
2245 old_oma <- par("oma") | |
2246 old_par <- par( | 4546 old_par <- par( |
2247 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), | 4547 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), |
2248 oma = old_oma * c(1, 1, 0.3, 1), | 4548 oma = old_oma * c(1, 1, 0.3, 1), |
2249 cex.main = 0.9, | 4549 cex.main = 0.9, |
2250 cex.axis = 0.7, | 4550 cex.axis = 0.7, |
2251 fin = c(9, 7.25) | 4551 fin = c(9, 7.25) |
2252 ) | 4552 ) |
2253 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
2254 # Vertical plot | 4553 # Vertical plot |
2255 colnames(filtered_data_filtered) <- sample_name_matches | 4554 colnames(filtered_data_filtered) <- sample_name_matches |
2256 tryCatch( | 4555 tryCatch( |
2257 boxplot( | 4556 boxplot( |
2258 filtered_data_filtered, | 4557 filtered_data_filtered, |
2259 main = "Imputed, normalized intensities", # no line plot | 4558 main = "Imputed, normalized intensities", # no line plot |
2260 las = 1, | 4559 las = 2, |
4560 cex.axis = 0.9 * sample_name_grow, | |
2261 col = const_boxplot_fill, | 4561 col = const_boxplot_fill, |
2262 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4562 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"), |
4563 notch = FALSE, | |
4564 varwidth = boxplot_varwidth | |
2263 ), | 4565 ), |
2264 error = function(e) print(e) | 4566 error = function(e) { |
4567 print(e) | |
4568 cat_margins() | |
4569 } | |
4570 | |
2265 ) | 4571 ) |
2266 par(old_par) | 4572 par(old_par) |
2267 } else { | 4573 } else { |
2268 cat(sprintf( | 4574 cat(sprintf( |
2269 "%s < %0.2f\n\n\n\n\n", | 4575 "%s < %0.2f\n\n\n\n\n", |
2270 "No peptides were found to have cutoff adjusted p-value", | 4576 "No peptides were found to have cutoff adjusted p-value", |
2271 cutoff | 4577 cutoff |
2272 )) | 4578 )) |
2273 } | 4579 } |
2274 | 4580 |
2275 if (nrow(filtered_data_filtered) > 0) { | 4581 if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) { |
2276 # Add Phosphopeptide column to anova_filtered table | 4582 # Add Phosphopeptide column to anova_filtered table |
2277 # The assumption here is that the first intensity is unique; | 4583 # The assumption here is that the first intensity is unique; |
2278 # this is a hokey assumption but almost definitely will | 4584 # this is a hokey assumption but almost definitely will |
2279 # be true in the real world unless there is a computation | 4585 # be true in the real world unless there is a computation |
2280 # error upstream. | 4586 # error upstream. |
2303 by.y = "Phosphopeptide" | 4609 by.y = "Phosphopeptide" |
2304 ) | 4610 ) |
2305 | 4611 |
2306 # Produce heatmap to visualize significance and the effect of imputation | 4612 # Produce heatmap to visualize significance and the effect of imputation |
2307 | 4613 |
2308 anova_filtered_merge_format <- sapply( | |
2309 X = filtered_p$fdr_adjusted_anova_p | |
2310 , | |
2311 FUN = function(x) { | |
2312 if (x > 0.0001) | |
2313 paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") | |
2314 else | |
2315 paste0("(%0.4e) %s") | |
2316 } | |
2317 ) | |
2318 | |
2319 cat_hm_heading <- function(m, cutoff) { | 4614 cat_hm_heading <- function(m, cutoff) { |
2320 cat("\\newpage\n") | 4615 if (nrow(m) > g_intensity_hm_rows) { |
2321 if (nrow(m) > intensity_hm_rows) { | 4616 cat("\\clearpage\n") |
2322 subsection_header( | 4617 cat(subsection_header( |
2323 paste( | 4618 paste( |
2324 sprintf("Heatmap for the %d most-significant peptides", | 4619 sprintf("Heatmap for the %d most-significant peptides", |
2325 intensity_hm_rows), | 4620 g_intensity_hm_rows), |
2326 sprintf("whose adjusted p-value < %0.2f\n", cutoff) | 4621 sprintf("whose adjusted p-value < %0.2f\n", cutoff) |
2327 ) | 4622 ) |
2328 ) | 4623 )) |
2329 } else { | 4624 } else { |
2330 if (nrow(m) == 1) { | 4625 if (nrow(m) == 0) { |
2331 return(FALSE) | 4626 return(FALSE) |
2332 } else { | 4627 } else { |
2333 subsection_header( | 4628 cat(subsection_header( |
2334 paste( | 4629 paste( |
2335 sprintf("Heatmap for %d usable peptides whose", nrow(m)), | 4630 sprintf("Heatmap for %d usable peptide genes whose", nrow(m)), |
2336 sprintf("adjusted p-value < %0.2f\n", cutoff) | 4631 sprintf("adjusted p-value < %0.2f\n", cutoff) |
2337 ) | 4632 ) |
2338 ) | 4633 )) |
2339 } | 4634 } |
2340 } | 4635 } |
2341 cat("\n\n\n") | 4636 cat("\n\n\n") |
2342 cat("\n\n\n") | 4637 cat("\n\n\n") |
2343 return(TRUE) | 4638 return(TRUE) |
2344 } | 4639 } |
2345 | 4640 |
2346 # construct matrix with appropriate rownames | 4641 # construct matrix with appropriate rownames |
2347 m <- | 4642 m <- |
2348 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) | 4643 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) |
2349 if (nrow(m) > 0) { | 4644 nrow_m <- nrow(m) |
4645 ncol_m <- ncol(m) | |
4646 if (nrow_m > 0) { | |
2350 rownames_m <- rownames(m) | 4647 rownames_m <- rownames(m) |
2351 rownames(m) <- sapply( | 4648 q <- data.frame(pepname = rownames_m) |
2352 X = seq_len(nrow(m)) | 4649 g <- sqldf(" |
2353 , | 4650 SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name |
4651 FROM q, metadata_plus_p AS met | |
4652 WHERE q.pepname = met.Phosphopeptide | |
4653 ORDER BY q.rowid | |
4654 ") | |
4655 tmp <- sapply( | |
4656 X = seq_len(nrow(g)), | |
4657 FUN = function(i) { | |
4658 pre <- strsplit(g$gene_name[i], "; ")[[1]] | |
4659 rslt <- paste(unique(pre), sep = "; ") | |
4660 return(rslt) | |
4661 } | |
4662 ) | |
4663 tmp <- unlist(tmp) | |
4664 tmp <- | |
4665 make.names(tmp, unique = TRUE) | |
4666 tmp <- sub( | |
4667 "No_Gene_Name", | |
4668 "not_found", | |
4669 tmp, | |
4670 fixed = TRUE | |
4671 ) | |
4672 ten_trunc_names <- trunc_ppep(rownames_m) | |
4673 tmp <- sapply( | |
4674 X = seq_len(nrow_m), | |
2354 FUN = function(i) { | 4675 FUN = function(i) { |
2355 sprintf( | 4676 sprintf( |
2356 anova_filtered_merge_format[i], | 4677 "(%s) %s", |
2357 filtered_p$fdr_adjusted_anova_p[i], | 4678 tmp[i], |
2358 rownames_m[i] | 4679 ten_trunc_names[i] |
2359 ) | 4680 ) |
2360 } | 4681 } |
2361 ) | 4682 ) |
4683 rownames(m) <- tmp | |
2362 } | 4684 } |
2363 # draw the heading and heatmap | 4685 # draw the heading and heatmap |
2364 if (nrow(m) > 0) { | 4686 if (nrow_m > 0) { |
2365 number_of_peptides_found <- | 4687 number_of_peptides_found <- |
2366 draw_intensity_heatmap( | 4688 ppep_heatmap( |
2367 m = m, | 4689 m = m, |
2368 cutoff = cutoff, | 4690 cutoff = cutoff, |
2369 hm_heading_function = cat_hm_heading, | 4691 hm_heading_function = cat_hm_heading, |
2370 hm_main_title = "Unimputed, unnormalized log(intensities)", | 4692 hm_main_title = |
2371 suppress_row_dendrogram = FALSE | 4693 "log(intensities), row-scaled, unimputed, unnormalized", |
4694 suppress_row_dendrogram = FALSE, | |
4695 master_cex = 0.35, | |
4696 sepcolor = "black", | |
4697 colsep = sample_colsep | |
2372 ) | 4698 ) |
4699 if (number_of_peptides_found > 1) { | |
4700 cat("\\leavevmode\n") | |
4701 } | |
2373 } | 4702 } |
2374 } | 4703 } |
2375 } | 4704 } |
2376 } | 4705 } |
2377 cat("\\leavevmode\n\n\n") | 4706 cat(filter_result_string) |
4707 cat("\\leavevmode\n") | |
4708 | |
4709 if (!g_can_run_ksea) { | |
4710 errmsg <- paste("Cannot proceed with KSEA analysis", | |
4711 "because too many values are missing.") | |
4712 if (FALSE) cat0( | |
4713 errmsg, | |
4714 "\\stepcounter{offset}\n", | |
4715 "\\stepcounter{offset}\n", | |
4716 "\\stepcounter{offset}\n", | |
4717 " in ", | |
4718 table_href(), | |
4719 ".\n\n" | |
4720 ) | |
4721 if (FALSE) { | |
4722 if (print_nb_messages) nbe(see_variable(p_value_data)) | |
4723 } else { | |
4724 if (print_nb_messages) nbe(see_variable(p_value_data)) | |
4725 | |
4726 display_p_value_data <- p_value_data | |
4727 display_p_value_data$raw_anova_p <- | |
4728 sprintf("%0.3g", display_p_value_data$raw_anova_p) | |
4729 display_p_value_data$fdr_adjusted_anova_p <- | |
4730 sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p) | |
4731 display_p_value_data$quality <- | |
4732 sprintf("%0.3g", display_p_value_data$quality) | |
4733 | |
4734 headers_1st_line <- | |
4735 c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "") | |
4736 headers_2nd_line <- | |
4737 c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking") | |
4738 data_frame_tabbing_latex( | |
4739 x = display_p_value_data, | |
4740 tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60), | |
4741 use_subsubsection_header = FALSE, | |
4742 headings = c(headers_1st_line, headers_2nd_line), | |
4743 caption = "ANOVA results" | |
4744 | |
4745 ) | |
4746 } | |
4747 data_frame_tabbing_latex( | |
4748 x = save_sample_treatment_df, | |
4749 tabstops = c(1.25, 1.25), | |
4750 caption = "Sample classes", | |
4751 use_subsubsection_header = FALSE | |
4752 ) | |
4753 param_df_exit() | |
4754 knitr::knit_exit() | |
4755 return(invisible(-1)) | |
4756 } | |
4757 | |
2378 ``` | 4758 ``` |
2379 | 4759 |
2380 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 4760 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
2381 | 4761 |
2382 if (count_of_treatment_levels > 1) { | 4762 if (g_can_run_ksea && count_of_treatment_levels > 1) { |
2383 # Prepare two-way contrasts with adjusted p-values | 4763 # Prepare two-way contrasts with adjusted p-values |
2384 # Strategy: | 4764 # Strategy: |
2385 # - use imputed, log-transformed data: | 4765 # - use imputed, log-transformed data: |
2386 # - remember this when computing log2(fold-change) | 4766 # - remember this when computing log2(fold-change) |
2387 # - each contrast is between a combination of trt levels | 4767 # - each contrast is between a combination of trt levels |
2393 # - adjust p-value, assuming that | 4773 # - adjust p-value, assuming that |
2394 # (# of pppeps)*(# of contrasts) tests were performed | 4774 # (# of pppeps)*(# of contrasts) tests were performed |
2395 | 4775 |
2396 # Each contrast is between a combination of trt levels | 4776 # Each contrast is between a combination of trt levels |
2397 m2 <- combn( | 4777 m2 <- combn( |
2398 x = seq_len(length(levels(sample_treatment_levels))), | 4778 x = seq_len(length(levels(smpl_trt))), |
2399 m = 2, | 4779 m = 2, |
2400 simplify = TRUE | 4780 simplify = TRUE |
2401 ) | 4781 ) |
2402 contrast_count <- ncol(m2) | 4782 contrast_count <- ncol(m2) |
2403 | 4783 |
2407 f_m2 <- | 4787 f_m2 <- |
2408 function(cntrst, lvl1, lvl2) { | 4788 function(cntrst, lvl1, lvl2) { |
2409 return( | 4789 return( |
2410 data.frame( | 4790 data.frame( |
2411 contrast = cntrst, | 4791 contrast = cntrst, |
2412 level = sample_treatment_levels[ | 4792 level = smpl_trt[ |
2413 sample_treatment_levels %in% | 4793 smpl_trt %in% |
2414 levels(sample_treatment_levels)[c(lvl1, lvl2)] | 4794 levels(smpl_trt)[c(lvl1, lvl2)] |
2415 ], | 4795 ], |
2416 label = sample_name_matches[ | 4796 label = sample_name_matches[ |
2417 sample_treatment_levels %in% | 4797 smpl_trt %in% |
2418 levels(sample_treatment_levels)[c(lvl1, lvl2)] | 4798 levels(smpl_trt)[c(lvl1, lvl2)] |
2419 ] | 4799 ] |
2420 ) | 4800 ) |
2421 ) | 4801 ) |
2422 } | 4802 } |
2423 # - compute a df for each contrast | 4803 # - compute a df for each contrast |
2593 ; | 4973 ; |
2594 " | 4974 " |
2595 ) | 4975 ) |
2596 | 4976 |
2597 # - create contrast-metadata table | 4977 # - create contrast-metadata table |
4978 if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata") | |
2598 dml_no_rows_exec(db, " | 4979 dml_no_rows_exec(db, " |
2599 CREATE TABLE contrast_lvl_lvl_metadata | 4980 CREATE TABLE contrast_lvl_lvl_metadata |
2600 AS | 4981 AS |
2601 SELECT DISTINCT | 4982 SELECT DISTINCT |
2602 a.contrast AS ab_contrast, | 4983 a.contrast AS ab_contrast, |
2703 rownames(grouping_factor) <- grouping_factor$sample | 5084 rownames(grouping_factor) <- grouping_factor$sample |
2704 grouping_factor <- grouping_factor[, "level", drop = FALSE] | 5085 grouping_factor <- grouping_factor[, "level", drop = FALSE] |
2705 | 5086 |
2706 # - run the two-level (one-way) test | 5087 # - run the two-level (one-way) test |
2707 p_value_data_contrast_ps <- | 5088 p_value_data_contrast_ps <- |
2708 apply( | 5089 row_apply( |
2709 X = contrast_cast_data, | 5090 x = contrast_cast_data, |
2710 MARGIN = 1, # apply to rows | 5091 fun = anova_func, |
2711 FUN = anova_func, | |
2712 grouping_factor = | 5092 grouping_factor = |
2713 as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 | 5093 as.factor(grouping_factor$level), # anova_func arg2 |
2714 one_way_f = one_way_two_categories, # anova_func arg3 | 5094 one_way_f = one_way_two_categories, # anova_func arg3 |
2715 simplify = TRUE # TRUE is the default for simplify | 5095 simplify = TRUE # TRUE is the default for simplify |
2716 ) | 5096 ) |
2717 contrast_data_adj_p_values <- p.adjust( | 5097 contrast_data_adj_p_values <- p.adjust( |
2718 p = p_value_data_contrast_ps, | 5098 p = p_value_data_contrast_ps, |
2920 AND NOT m.`Gene` = 'No_Gene_Name' | 5300 AND NOT m.`Gene` = 'No_Gene_Name' |
2921 AND NOT v.log2_fc = 0 | 5301 AND NOT v.log2_fc = 0 |
2922 ; | 5302 ; |
2923 " | 5303 " |
2924 ) | 5304 ) |
5305 # We are done with DDL and insertion | |
5306 RSQLite::dbDisconnect(db) | |
2925 } | 5307 } |
2926 ``` | 5308 ``` |
2927 | 5309 |
2928 ```{r echo = FALSE, results = 'asis'} | 5310 ```{r echo = FALSE, results = 'asis'} |
2929 cat("\\newpage\n") | 5311 cat("\\newpage\n") |
2930 ``` | 5312 ``` |
2931 | 5313 |
2932 # KSEA Analysis | 5314 # KSEA Analysis Summaries |
2933 | 5315 |
2934 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: | 5316 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: |
2935 | 5317 |
2936 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp | 5318 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp |
2937 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). | 5319 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). |
2938 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). | 5320 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). |
2939 | 5321 |
2940 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: | 5322 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: |
2941 | 5323 |
2942 $$ | 5324 $$ |
2943 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} | 5325 \text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j} |
2944 $$ | 5326 $$ |
2945 | 5327 |
2946 and fold-enrichment is computed as: | 5328 and fold-enrichment is computed as: |
2947 | 5329 |
2948 $$ | 5330 $$ |
2949 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} | 5331 \text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j} |
2950 $$ | 5332 $$ |
2951 | 5333 |
2952 where: | 5334 where: |
2953 | 5335 |
2954 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, | 5336 - $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$, |
2955 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and | 5337 - $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and |
2956 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, | 5338 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, |
2957 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. | 5339 - $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset. |
2958 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. | 5340 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. |
2959 | 5341 |
2960 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) | 5342 $\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score. |
2961 | 5343 |
2962 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). | 5344 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). |
2963 | 5345 |
2964 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. | 5346 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. |
2965 | 5347 |
2966 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. | 5348 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. |
2967 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). | 5349 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). |
2968 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). | 5350 - Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). |
2969 | 5351 |
2970 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 5352 For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality". |
5353 | |
5354 Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1. | |
5355 | |
5356 ```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'} | |
5357 cat("\\clearpage\n") | |
2971 | 5358 |
2972 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | 5359 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) |
2973 | 5360 |
2974 # -- eliminate the table that's about to be defined | 5361 # -- eliminate the table that's about to be defined |
2975 ddl_exec(db, " | 5362 ddl_exec(db, " |
3044 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | 5431 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] |
3045 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | 5432 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] |
3046 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | 5433 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) |
3047 contrast_longlabel <- ( | 5434 contrast_longlabel <- ( |
3048 sprintf( | 5435 sprintf( |
3049 "Trt %s {%s} -> Trt %s {%s}", | 5436 "Class %s -> Class %s", |
3050 contrast_metadata_df[i_cntrst, "b_level"], | 5437 contrast_metadata_df[i_cntrst, "b_level"], |
3051 gsub( | 5438 contrast_metadata_df[i_cntrst, "a_level"] |
3052 pattern = ";", | |
3053 replacement = ", ", | |
3054 x = contrast_metadata_df[i_cntrst, "b_samples"], | |
3055 fixed = TRUE | |
3056 ), | |
3057 contrast_metadata_df[i_cntrst, "a_level"], | |
3058 gsub( | |
3059 pattern = ";", | |
3060 replacement = ", ", | |
3061 x = contrast_metadata_df[i_cntrst, "a_samples"], | |
3062 fixed = TRUE | |
3063 ) | |
3064 ) | 5439 ) |
3065 ) | 5440 ) |
3066 kseaapp_input <- | 5441 kseaapp_input <- |
3067 sqldf::sqldf( | 5442 sqldf::sqldf( |
3068 x = sprintf(" | 5443 x = sprintf(" |
3089 sub_title <- contrast_longlabel | 5464 sub_title <- contrast_longlabel |
3090 tryCatch( | 5465 tryCatch( |
3091 expr = { | 5466 expr = { |
3092 ksea_scores_rslt <- | 5467 ksea_scores_rslt <- |
3093 ksea_scores( | 5468 ksea_scores( |
3094 ksdata = pseudo_ksdata, # KSEAapp::KSData, | 5469 ksdata = pseudo_ksdata, |
3095 px = kseaapp_input, | 5470 px = kseaapp_input, |
3096 networkin = TRUE, | 5471 networkin = TRUE, |
3097 networkin_cutoff = 2 | 5472 networkin_cutoff = 2, |
5473 minimum_substrate_count = ksea_min_substrate_count | |
3098 ) | 5474 ) |
5475 | |
5476 if (FALSE) { | |
5477 ksea_scores_rslt <- | |
5478 ksea_scores_rslt[ | |
5479 ksea_scores_rslt$m >= ksea_min_substrate_count, | |
5480 , | |
5481 drop = FALSE | |
5482 ] | |
5483 } | |
5484 | |
5485 if (FALSE) { | |
5486 data_frame_tabbing_latex( | |
5487 x = ksea_scores_rslt, | |
5488 tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8), | |
5489 caption = paste("KSEA scores for contrast ", | |
5490 cntrst_b_level, "to", cntrst_a_level), | |
5491 use_subsubsection_header = FALSE | |
5492 ) | |
5493 } | |
5494 | |
5495 if (FALSE) { | |
5496 if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n") | |
5497 cat_variable(ksea_scores_rslt) | |
5498 cat("\n\\clearpage\n") | |
5499 } | |
3099 | 5500 |
3100 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | 5501 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { |
3101 next_index <- 1 + next_index | 5502 next_index <- 1 + next_index |
3102 rslt$score_list[[next_index]] <- ksea_scores_rslt | 5503 rslt$score_list[[next_index]] <- ksea_scores_rslt |
3103 rslt$name_list[[next_index]] <- contrast_label | 5504 rslt$name_list[[next_index]] <- contrast_label |
3104 rslt$longname_list[[next_index]] <- contrast_longlabel | 5505 rslt$longname_list[[next_index]] <- contrast_longlabel |
3105 low_fdr_print( | 5506 ksea_low_fdr_print( |
3106 rslt = rslt, | 5507 rslt = rslt, |
3107 i_cntrst = i_cntrst, | 5508 i_cntrst = i_cntrst, |
3108 i = next_index, | 5509 i = next_index, |
3109 a_level = cntrst_a_level, | 5510 a_level = cntrst_a_level, |
3110 b_level = cntrst_b_level, | 5511 b_level = cntrst_b_level, |
3111 fold_change = cntrst_fold_change, | 5512 fold_change = cntrst_fold_change, |
3112 caption = contrast_longlabel | 5513 caption = contrast_longlabel |
3113 ) | 5514 ) |
3114 } | 5515 } |
3115 }, | 5516 }, |
3116 error = function(e) str(e) | 5517 error = function(e) { |
5518 str(e) | |
5519 cat_margins() | |
5520 } | |
3117 ) | 5521 ) |
3118 } | 5522 } |
3119 | 5523 |
3120 plotted_kinases <- NULL | 5524 plotted_kinases <- NULL |
3121 if (length(rslt$score_list) > 1) { | 5525 if (g_can_run_ksea && length(rslt$score_list) > 1) { |
3122 for (i in seq_len(length(ksea_heatmap_titles))) { | 5526 for (i in seq_len(length(ksea_heatmap_titles))) { |
3123 hdr <- ksea_heatmap_titles[[i]] | 5527 hdr <- ksea_heatmap_titles[[i]] |
3124 which_kinases <- i | 5528 which_kinases <- i |
3125 | 5529 |
3126 cat("\\clearpage\n\\begin{center}\n") | 5530 cat("\\clearpage\n\\begin{center}\n") |
3127 if (i == const_ksea_astrsk_kinases) { | 5531 if (i == const_ksea_astrsk_kinases) { |
3128 subsection_header(hdr) | 5532 cat(subsection_header(hdr)) |
3129 } else { | 5533 } else { |
3130 subsection_header(hdr) | 5534 cat(subsection_header(hdr)) |
3131 } | 5535 } |
3132 cat("\\end{center}\n") | 5536 cat("\\end{center}\n") |
3133 | 5537 |
3134 plotted_kinases <- ksea_heatmap( | 5538 plotted_kinases <- ksea_heatmap( |
3135 # the data frame outputs from the KSEA.Scores() function, in list format | 5539 # the data frame outputs from the KSEA.Scores() function, in list format |
3144 # a numeric value between 0 and infinity indicating the min. number of | 5548 # a numeric value between 0 and infinity indicating the min. number of |
3145 # substrates a kinase must have to be included in the heatmap | 5549 # substrates a kinase must have to be included in the heatmap |
3146 m_cutoff = 1, | 5550 m_cutoff = 1, |
3147 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | 5551 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff |
3148 # for indicating significant kinases in the heatmap | 5552 # for indicating significant kinases in the heatmap |
3149 p_cutoff = 0.05, | 5553 p_cutoff = params$kseaCutoffThreshold, |
3150 # a binary input of TRUE or FALSE, indicating whether or not to perform | 5554 # a binary input of TRUE or FALSE, indicating whether or not to perform |
3151 # hierarchical clustering of the sample columns | 5555 # hierarchical clustering of the sample columns |
3152 sample_cluster = TRUE, | 5556 sample_cluster = TRUE, |
3153 # a binary input of TRUE or FALSE, indicating whether or not to export | 5557 # a binary input of TRUE or FALSE, indicating whether or not to export |
3154 # the heatmap as a .png image into the working directory | 5558 # the heatmap as a .png image into the working directory |
3161 ylab = "Kinase", | 5565 ylab = "Kinase", |
3162 # print which kinases: | 5566 # print which kinases: |
3163 # - 1 : all kinases | 5567 # - 1 : all kinases |
3164 # - 2 : significant kinases | 5568 # - 2 : significant kinases |
3165 # - 3 : non-significant kinases | 5569 # - 3 : non-significant kinases |
3166 which_kinases = which_kinases | 5570 which_kinases = which_kinases, |
3167 ) | 5571 margins = c(7, 15) |
3168 cat("\\begin{center}\n") | 5572 ) |
3169 cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") | 5573 if (!is.null(plotted_kinases)) { |
3170 cat("\\end{center}\n") | 5574 cat("\\begin{center}\n") |
5575 if (which_kinases != const_ksea_nonastrsk_kinases) | |
5576 cat("Asterisks reflect significance.\n") | |
5577 cat("\\end{center}\n") | |
5578 } | |
3171 } # end for (i in ... | 5579 } # end for (i in ... |
3172 } # end if (length ... | 5580 } # end if (length ... |
3173 | 5581 ``` |
3174 for (i_cntrst in seq_len(length(rslt$score_list))) { | 5582 |
3175 next_index <- i_cntrst | 5583 ```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'} |
3176 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | 5584 ksea_prints <- list() |
3177 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | 5585 ksea_barplots <- list() |
3178 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | 5586 for (i_cntrst in seq_len(length(rslt$score_list))) { |
3179 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | 5587 next_index <- i_cntrst |
3180 contrast_longlabel <- ( | 5588 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] |
5589 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
5590 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
5591 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
5592 contrast_longlabel <- ( | |
5593 sprintf( | |
5594 "Class %s -> Class %s", | |
5595 contrast_metadata_df[i_cntrst, "b_level"], | |
5596 contrast_metadata_df[i_cntrst, "a_level"] | |
5597 ) | |
5598 ) | |
5599 main_title <- ( | |
5600 sprintf( | |
5601 "Change from treatment %s to treatment %s", | |
5602 contrast_metadata_df[i_cntrst, "b_level"], | |
5603 contrast_metadata_df[i_cntrst, "a_level"] | |
5604 ) | |
5605 ) | |
5606 sub_title <- contrast_longlabel | |
5607 tryCatch( | |
5608 expr = { | |
5609 ksea_scores_rslt <- rslt$score_list[[next_index]] | |
5610 if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE | |
5611 | |
5612 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
5613 sink(deferred <- file()) | |
5614 ksea_low_fdr_print( | |
5615 rslt = rslt, | |
5616 i_cntrst = i_cntrst, | |
5617 i = next_index, | |
5618 a_level = cntrst_a_level, | |
5619 b_level = cntrst_b_level, | |
5620 fold_change = cntrst_fold_change, | |
5621 caption = contrast_longlabel, | |
5622 write_db = FALSE, | |
5623 anchor = const_table_anchor_t | |
5624 ) | |
5625 cat("\n") | |
5626 sink() | |
5627 lines <- | |
5628 paste( | |
5629 readLines(deferred, warn = FALSE), | |
5630 collapse = "\n" | |
5631 ) | |
5632 close(deferred) | |
5633 sq_put(ksea_prints, lines) | |
5634 sink(stderr()) | |
5635 cat("\n---\n") | |
5636 cat_variable(ksea_prints) | |
5637 barplot_closure <- | |
5638 ksea_low_fdr_barplot_factory( | |
5639 rslt = rslt, | |
5640 i_cntrst = i_cntrst, | |
5641 i = next_index, | |
5642 a_level = cntrst_a_level, | |
5643 b_level = cntrst_b_level, | |
5644 fold_change = cntrst_fold_change, | |
5645 caption = contrast_longlabel | |
5646 ) | |
5647 if (rlang::is_closure(barplot_closure)) | |
5648 sq_put(ksea_barplots, barplot_closure) | |
5649 else | |
5650 sq_put(ksea_barplots, no_op) | |
5651 str(ksea_barplots) | |
5652 cat("\n...\n") | |
5653 sink() | |
5654 } | |
5655 }, | |
5656 error = function(e) { | |
5657 str(e) | |
5658 cat_margins() | |
5659 } | |
5660 ) | |
5661 } | |
5662 ``` | |
5663 | |
5664 ```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} | |
5665 | |
5666 have_kinase_descriptions <- | |
5667 if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) && | |
5668 !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2)) | |
5669 ) { | |
5670 rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID | |
5671 kinase_name_to_desc_uprt <- function(s) { | |
5672 rslt <- NULL | |
5673 tryCatch( | |
5674 { | |
5675 which_rows <- eval(s == kinase_name_uprt_lut$kinase) | |
5676 kinase_uprtid <- | |
5677 kinase_name_uprt_lut[which_rows, 2] | |
5678 # filter for first _HUMAN match if any | |
5679 grepl_human <- grepl("_HUMAN$", kinase_uprtid) | |
5680 if (0 < sum(grepl_human)) | |
5681 kinase_uprtid <- kinase_uprtid[grepl_human] | |
5682 # filter for first match if any | |
5683 if (0 < length(kinase_uprtid)) { | |
5684 kinase_uprtid <- kinase_uprtid[1] | |
5685 kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2] | |
5686 if (!is.na(kinase_desc)) | |
5687 rslt <- c(kinase_desc, kinase_uprtid) | |
5688 else | |
5689 rslt <- c(kinase_desc, "") | |
5690 } | |
5691 }, | |
5692 warning = str | |
5693 ) | |
5694 rslt | |
5695 } | |
5696 TRUE | |
5697 } else { | |
5698 kinase_name_to_desc_uprt <- function(s) NULL | |
5699 FALSE | |
5700 } | |
5701 ``` | |
5702 | |
5703 ```{r write_params, echo = FALSE, results = 'asis'} | |
5704 # perhaps this should be moved into the functions section, eventually ... | |
5705 write_params <- function(db) { | |
5706 # write parameters to report | |
5707 | |
5708 # write parameters to SQLite output | |
5709 | |
5710 mqppep_anova_script_param_df <- data.frame( | |
5711 script = "mqppep_anova_script.Rmd", | |
5712 parameter = names(param_unlist), | |
5713 value = param_unlist | |
5714 ) | |
5715 ddl_exec(db, " | |
5716 DROP TABLE IF EXISTS script_parameter; | |
5717 " | |
5718 ) | |
5719 ddl_exec(db, " | |
5720 CREATE TABLE IF NOT EXISTS script_parameter( | |
5721 script TEXT, | |
5722 parameter TEXT, | |
5723 value ANY, | |
5724 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
5725 ) | |
5726 ; | |
5727 " | |
5728 ) | |
5729 RSQLite::dbWriteTable( | |
5730 conn = db, | |
5731 name = "script_parameter", | |
5732 value = mqppep_anova_script_param_df, | |
5733 append = TRUE | |
5734 ) | |
5735 | |
5736 loaded_packages_df <- sessioninfo::package_info("loaded") | |
5737 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
5738 loaded_packages_df <- data.frame( | |
5739 package = loaded_packages_df$package, | |
5740 version = loaded_packages_df$loadedversion, | |
5741 date = loaded_packages_df$date | |
5742 ) | |
5743 #ACE cat("\\clearpage\n\\section{R package versions}\n") | |
5744 #ACE data_frame_tabbing_latex( | |
5745 #ACE x = loaded_packages_df, | |
5746 #ACE tabstops = c(2.5, 1.25), | |
5747 #ACE caption = "R package versions" | |
5748 #ACE ) | |
5749 cat("\\clearpage\n\\section{Input parameter settings}\n") | |
5750 data_frame_tabbing_latex( | |
5751 x = param_df, | |
5752 tabstops = c(1.75), | |
5753 underscore_whack = TRUE, | |
5754 caption = "Input parameters", | |
5755 verbatim = FALSE | |
5756 ) | |
5757 } | |
5758 | |
5759 if (!have_kinase_descriptions) { | |
5760 write_params(db) | |
5761 # We are done with output | |
5762 RSQLite::dbDisconnect(db) | |
5763 param_df_exit() | |
5764 knitr::knit_exit() | |
5765 } | |
5766 ``` | |
5767 | |
5768 ```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'} | |
5769 if (have_kinase_descriptions) { | |
5770 my_section_header <- | |
3181 sprintf( | 5771 sprintf( |
3182 "Trt %s {%s} -> Trt %s {%s}", | 5772 "inases whose KSEA %s < %s\n", |
3183 contrast_metadata_df[i_cntrst, "b_level"], | 5773 ksea_cutoff_statistic, |
3184 gsub( | 5774 signif(ksea_cutoff_threshold, 2) |
3185 pattern = ";", | 5775 ) |
3186 replacement = ", ", | 5776 |
3187 x = contrast_metadata_df[i_cntrst, "b_samples"], | 5777 # Use enriched kinases to find enriched kinase-substrate pairs |
3188 fixed = TRUE | 5778 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) |
3189 ), | 5779 |
3190 contrast_metadata_df[i_cntrst, "a_level"], | 5780 enriched_kinase_descs <- |
3191 gsub( | 5781 Reduce( |
3192 pattern = ";", | 5782 f = function(l, r) { |
3193 replacement = ", ", | 5783 lkup <- kinase_name_to_desc_uprt(r) |
3194 x = contrast_metadata_df[i_cntrst, "a_samples"], | 5784 if (is.null(lkup)) l |
3195 fixed = TRUE | 5785 else r2 <- rbind( |
3196 ) | 5786 l, |
3197 ) | 5787 data.frame( |
3198 ) | 5788 kinase = r, |
3199 main_title <- ( | 5789 uniprot_id = lkup[2], |
5790 description = lkup[1] | |
5791 ) | |
5792 ) | |
5793 }, | |
5794 x = enriched_kinases$kinase, | |
5795 init = NULL | |
5796 ) | |
5797 | |
5798 if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) { | |
5799 cat("\n\\clearpage\n") | |
5800 data_frame_tabbing_latex( | |
5801 x = enriched_kinase_descs, | |
5802 tabstops = c(0.9, 1.3), | |
5803 headings = c("Kinase", "UniProt ID", "Description"), | |
5804 caption = paste0("Descriptions of k", my_section_header) | |
5805 ) | |
5806 } | |
5807 | |
5808 if (FALSE) { | |
5809 cat_variable(sqldf("SELECT kinase FROM enriched_kinases")) | |
5810 cat_variable(sqldf(" | |
5811 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
5812 FROM pseudo_ksdata | |
5813 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
5814 ")) | |
5815 data_frame_table_latex( | |
5816 x = sqldf(" | |
5817 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
5818 FROM pseudo_ksdata | |
5819 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
5820 "), | |
5821 justification = "l l l", | |
5822 centered = TRUE, | |
5823 caption = "substrates of enriched kinases", | |
5824 anchor = c(const_table_anchor_p, const_table_anchor_t), | |
5825 underscore_whack = TRUE | |
5826 ) | |
5827 data_frame_table_latex( | |
5828 x = sqldf(" | |
5829 SELECT | |
5830 gene AS kinase, | |
5831 ppep, | |
5832 sub_gene, | |
5833 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, | |
5834 fdr_adjusted_anova_p, | |
5835 quality, | |
5836 min_group_obs_count | |
5837 FROM ( | |
5838 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
5839 FROM pseudo_ksdata | |
5840 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
5841 ), | |
5842 p_value_data | |
5843 WHERE ppep = phosphopeptide | |
5844 GROUP BY kinase, ppep | |
5845 ORDER BY kinase, ppep, p_value_data.quality DESC | |
5846 "), | |
5847 justification = "l l l l l l l", | |
5848 centered = TRUE, | |
5849 caption = "labeled substrates of enriched kinases", | |
5850 anchor = c(const_table_anchor_p, const_table_anchor_t), | |
5851 underscore_whack = TRUE | |
5852 ) | |
5853 } | |
5854 all_enriched_substrates <- sqldf(" | |
5855 SELECT | |
5856 gene AS kinase, | |
5857 ppep, | |
5858 sub_gene, | |
5859 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, | |
5860 fdr_adjusted_anova_p, | |
5861 quality, | |
5862 min_group_obs_count | |
5863 FROM ( | |
5864 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
5865 FROM pseudo_ksdata | |
5866 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
5867 ), | |
5868 p_value_data | |
5869 WHERE ppep = phosphopeptide | |
5870 GROUP BY kinase, ppep | |
5871 ORDER BY kinase, ppep, p_value_data.quality DESC | |
5872 ") | |
5873 | |
5874 all_enriched_substrates <- | |
5875 all_enriched_substrates[ | |
5876 all_enriched_substrates$quality >= params$minQuality, | |
5877 , | |
5878 drop = FALSE | |
5879 ] | |
5880 | |
5881 all_enriched_substrates$sub_gene <- | |
5882 sub( | |
5883 " ///.*", | |
5884 " ...", | |
5885 all_enriched_substrates$sub_gene | |
5886 ) | |
5887 | |
5888 all_enriched_substrates$label <- | |
5889 with( | |
5890 all_enriched_substrates, | |
3200 sprintf( | 5891 sprintf( |
3201 "Change from treatment %s to treatment %s", | 5892 "(%s-%s) %s", |
3202 contrast_metadata_df[i_cntrst, "b_level"], | 5893 kinase, |
3203 contrast_metadata_df[i_cntrst, "a_level"] | 5894 trunc_subgene(sub_gene), |
3204 ) | 5895 ppep |
3205 ) | 5896 ) |
3206 sub_title <- contrast_longlabel | 5897 ) |
3207 tryCatch( | 5898 |
3208 expr = { | 5899 # this global is set to TRUE by cat_enriched_heading immediately below |
3209 ksea_scores_rslt <- rslt$score_list[[next_index]] | 5900 g_neednewpage <- FALSE |
3210 | 5901 |
3211 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | 5902 # helper used to label per-kinase substrate enrichment figure |
3212 low_fdr_barplot( | 5903 cat_enriched_heading <- function(m, cut_args) { |
3213 rslt = rslt, | 5904 cutoff <- cut_args$cutoff |
3214 i_cntrst = i_cntrst, | 5905 kinase <- cut_args$kinase |
3215 i = next_index, | 5906 if (g_neednewpage) cat("\\newpage\n") |
3216 a_level = cntrst_a_level, | 5907 g_neednewpage <- TRUE |
3217 b_level = cntrst_b_level, | 5908 if (nrow(m) > g_intensity_hm_rows) { |
3218 fold_change = cntrst_fold_change, | 5909 cat(subsection_header( |
3219 caption = contrast_longlabel | |
3220 ) | |
3221 } | |
3222 }, | |
3223 error = function(e) str(e) | |
3224 ) | |
3225 } | |
3226 ``` | |
3227 | |
3228 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
3229 | |
3230 # Use enriched kinases to find enriched kinase-substrate pairs | |
3231 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) | |
3232 all_enriched_substrates <- sqldf(" | |
3233 SELECT | |
3234 gene AS kinase, | |
3235 ppep, | |
3236 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label | |
3237 FROM ( | |
3238 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
3239 FROM pseudo_ksdata | |
3240 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3241 ) | |
3242 GROUP BY ppep | |
3243 ") | |
3244 | |
3245 # helper used to label per-kinase substrate enrichment figure | |
3246 cat_enriched_heading <- function(m, cut_args) { | |
3247 cutoff <- cut_args$cutoff | |
3248 kinase <- cut_args$kinase | |
3249 statistic <- cut_args$statistic | |
3250 threshold <- cut_args$threshold | |
3251 cat("\\newpage\n") | |
3252 if (nrow(m) > intensity_hm_rows) { | |
3253 subsection_header( | |
3254 paste( | |
3255 sprintf( | 5910 sprintf( |
3256 "Lowest p-valued %d (of %d) enriched %s-substrates,", | 5911 "Highest-quality %d (of %d) enriched %s-substrates", |
3257 intensity_hm_rows, | 5912 g_intensity_hm_rows, |
3258 nrow(m), | 5913 nrow(m), |
3259 kinase | 5914 kinase |
3260 ), | 5915 ) |
3261 sprintf(" KSEA %s < %0.2f\n", statistic, threshold) | 5916 )) |
3262 ) | |
3263 ) | |
3264 } else { | |
3265 if (nrow(m) == 1) { | |
3266 return(FALSE) | |
3267 } else { | 5917 } else { |
3268 subsection_header( | 5918 if (nrow(m) == 0) { |
3269 paste( | 5919 return(FALSE) |
5920 } else { | |
5921 nrow_m <- nrow(m) | |
5922 cat(subsection_header( | |
3270 sprintf( | 5923 sprintf( |
3271 "%d enriched %s-substrates,", | 5924 "%d enriched %s-substrate%s", |
3272 nrow(m), | 5925 nrow_m, |
3273 kinase | 5926 kinase, |
3274 ), | 5927 if (nrow_m > 1) "s" else "" |
3275 sprintf( | |
3276 " KSEA %s < %0.2f\n", | |
3277 statistic, | |
3278 threshold | |
3279 ) | 5928 ) |
5929 )) | |
5930 } | |
5931 } | |
5932 cat("\n\n\n") | |
5933 cat("\n\n\n") | |
5934 return(TRUE) | |
5935 } | |
5936 | |
5937 # -------------------------------- | |
5938 # hack begin - show all substrates | |
5939 enriched_substrates <- all_enriched_substrates | |
5940 # add "FALSE &&" to prevent listing of substrates | |
5941 if (show_enriched_substrates && nrow(enriched_substrates) > 0) { | |
5942 short_row_names <- sub( | |
5943 "$FAILED_MATCH_GENE_NAME", | |
5944 "not_found", | |
5945 enriched_substrates$sub_gene, | |
5946 fixed = TRUE | |
5947 ) | |
5948 | |
5949 if (print_nb_messages) nbe(see_variable(enriched_substrates)) | |
5950 substrates_df <- with( | |
5951 enriched_substrates, | |
5952 data.frame( | |
5953 kinase = kinase, | |
5954 substrate = sub(" ///*", "...", short_row_names), | |
5955 anova_p_value = signif(fdr_adjusted_anova_p, 2), | |
5956 min_group_obs_count = signif(min_group_obs_count, 0), | |
5957 quality = signif(quality, 3), | |
5958 sequence = trunc_n(30)(ppep) | |
3280 ) | 5959 ) |
3281 ) | 5960 ) |
3282 } | 5961 |
3283 } | 5962 substrates_df <- substrates_df[ |
3284 cat("\n\n\n") | 5963 with(substrates_df, order(kinase, -quality)), |
3285 cat("\n\n\n") | 5964 , |
3286 return(TRUE) | 5965 drop = FALSE |
3287 } | 5966 ] |
3288 | 5967 |
3289 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether | 5968 if (print_nb_messages) nbe(see_variable(substrates_df)) |
3290 if (FALSE) | 5969 if (nrow(substrates_df) < 1) |
5970 substrates_df$sequence <- c() | |
5971 if (print_nb_messages) nbe(see_variable(substrates_df)) | |
5972 names(substrates_df) <- headers_2nd_line <- | |
5973 c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence") | |
5974 headers_1st_line <- c("", "", "ANOVA", "min(values", "", "") | |
5975 data_frame_tabbing_latex( | |
5976 x = substrates_df, | |
5977 tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5), | |
5978 headings = c(headers_1st_line, headers_2nd_line), | |
5979 caption = "Details for all enriched substrates of enriched kinases" | |
5980 ) | |
5981 rm( | |
5982 enriched_substrates, | |
5983 substrates_df, | |
5984 short_row_names, | |
5985 headers_1st_line, | |
5986 headers_2nd_line | |
5987 ) | |
5988 } | |
5989 cat("\\clearpage\n") | |
5990 # hack end - show all substrates | |
5991 # -------------------------------- | |
5992 | |
5993 # print deferred tables and graphs for kinases from contrasts | |
5994 for (i_cntrst in seq_len(length(ksea_prints))) { | |
5995 #latex_samepage({ | |
5996 cat(ksea_prints[[i_cntrst]]) | |
5997 cat("\n") | |
5998 ksea_barplots[[i_cntrst]]() | |
5999 cat("\n") | |
6000 cat("\\clearpage\n") | |
6001 #}) | |
6002 } | |
6003 | |
6004 } | |
6005 ``` | |
6006 | |
6007 ```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} | |
6008 if (g_can_run_ksea) { | |
6009 g_did_enriched_header <- FALSE | |
3291 for (kinase_name in sort(enriched_kinases$kinase)) { | 6010 for (kinase_name in sort(enriched_kinases$kinase)) { |
3292 enriched_substrates <- | 6011 enriched_substrates <- |
3293 all_enriched_substrates[ | 6012 all_enriched_substrates[ |
3294 all_enriched_substrates$kinase == kinase_name, | 6013 all_enriched_substrates$kinase == kinase_name, |
3295 , | 6014 , |
3296 drop = FALSE | 6015 drop = FALSE |
3297 ] | 6016 ] |
6017 ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep) | |
6018 enriched_substrates$label <- with( | |
6019 enriched_substrates, | |
6020 sprintf( | |
6021 "(%s) %s", | |
6022 make.names( | |
6023 sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE), | |
6024 unique = TRUE | |
6025 ), | |
6026 ten_trunc_ppep | |
6027 ) | |
6028 ) | |
3298 # Get the intensity values for the heatmap | 6029 # Get the intensity values for the heatmap |
3299 enriched_intensities <- | 6030 enriched_intensities <- |
3300 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) | 6031 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) |
6032 | |
3301 # Remove rows having too many NA values to be relevant | 6033 # Remove rows having too many NA values to be relevant |
3302 na_counter <- is.na(enriched_intensities) | 6034 good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0) |
3303 na_counts <- apply(na_counter, 1, sum) | 6035 #ACE nbe(see_variable(good_rows), "\n") |
3304 enriched_intensities <- | 6036 enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE] |
3305 enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] | 6037 enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE] |
6038 | |
3306 # Rename the rows with the display-name for the heatmap | 6039 # Rename the rows with the display-name for the heatmap |
3307 rownames(enriched_intensities) <- | 6040 short_row_names <- sub( |
6041 "$FAILED_MATCH_GENE_NAME", | |
6042 "not_found", | |
6043 enriched_substrates$sub_gene, | |
6044 fixed = TRUE | |
6045 ) | |
6046 short_row_names <- | |
6047 make.names(short_row_names, unique = TRUE) | |
6048 long_row_names <- | |
3308 sapply( | 6049 sapply( |
3309 X = rownames(enriched_intensities), | 6050 X = rownames(enriched_intensities), |
3310 FUN = function(rn) { | 6051 FUN = function(rn) { |
3311 enriched_substrates[enriched_substrates$ppep == rn, "label"] | 6052 enriched_substrates[enriched_substrates$ppep == rn, "label"] |
3312 } | 6053 } |
3313 ) | 6054 ) |
6055 rownames(enriched_intensities) <- long_row_names | |
3314 # Format as matrix for heatmap | 6056 # Format as matrix for heatmap |
3315 m <- as.matrix(enriched_intensities) | 6057 m <- as.matrix(enriched_intensities) |
6058 rownames(m) <- trunc_enriched_substrate(rownames(m)) | |
6059 | |
6060 #ACE nb("m with bad rows: ", see_variable(m), "\n") | |
6061 #ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0) | |
6062 #ACE nb(see_variable(good_rows), "\n") | |
6063 #ACE m <- m[good_rows, , drop = FALSE] | |
6064 #ACE nb("m without(?) bad rows: ", see_variable(m), "\n") | |
6065 #ACE nb(see_variable(short_row_names), "\n") | |
6066 #ACE local_short_row_names <- short_row_names[good_rows] | |
6067 #ACE local_long_row_names <- long_row_names[good_rows] | |
6068 #ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ] | |
6069 | |
3316 # Draw the heading and heatmap | 6070 # Draw the heading and heatmap |
3317 if (nrow(m) > 0) { | 6071 nrow_m <- nrow(m) |
6072 if (nrow_m > 0) { | |
6073 if (!g_did_enriched_header) { | |
6074 cat("\n\\clearpage\n") | |
6075 cat(section_header(paste0("K", my_section_header))) | |
6076 g_did_enriched_header <- TRUE | |
6077 } | |
6078 is_na_m <- is.na(m) | |
6079 cellnote_m <- is_na_m | |
6080 cellnote_m[!is_na_m] <- "" | |
6081 cellnote_m[is_na_m] <- "NA" | |
3318 cut_args <- new_env() | 6082 cut_args <- new_env() |
3319 cut_args$cutoff <- cutoff | 6083 cut_args$cutoff <- cutoff |
3320 cut_args$kinase <- kinase_name | 6084 cut_args$kinase <- kinase_name |
3321 cut_args$statistic <- ksea_cutoff_statistic | 6085 cut_args$statistic <- ksea_cutoff_statistic |
3322 cut_args$threshold <- ksea_cutoff_threshold | 6086 cut_args$threshold <- ksea_cutoff_threshold |
3323 number_of_peptides_found <- | 6087 number_of_peptides_found <- |
3324 draw_intensity_heatmap( | 6088 ppep_heatmap( |
3325 m = m, | 6089 m = m, |
6090 cellnote = cellnote_m, | |
3326 cutoff = cut_args, | 6091 cutoff = cut_args, |
3327 hm_heading_function = cat_enriched_heading, | 6092 hm_heading_function = cat_enriched_heading, |
3328 hm_main_title | 6093 hm_main_title |
3329 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", | 6094 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", |
3330 suppress_row_dendrogram = FALSE | 6095 suppress_row_dendrogram = FALSE, |
6096 master_cex = 0.35, | |
6097 sepcolor = "black", | |
6098 colsep = sample_colsep | |
3331 ) | 6099 ) |
6100 if (number_of_peptides_found > 1) { | |
6101 | |
6102 tryCatch( | |
6103 { | |
6104 rownames(m) <- short_row_names | |
6105 cov_heatmap(m, nrow_m > g_intensity_hm_rows) | |
6106 }, | |
6107 error = function(e) { | |
6108 cat( | |
6109 sprintf( | |
6110 "ERROR: %s\n\\newline\n", | |
6111 mget("e") | |
6112 ) | |
6113 ) | |
6114 cat( | |
6115 sprintf( | |
6116 "message: %s\n\\newline\n", | |
6117 e$message | |
6118 ) | |
6119 ) | |
6120 cat_margins() | |
6121 } | |
6122 ) | |
6123 } | |
6124 substrates_df <- with( | |
6125 enriched_substrates, | |
6126 data.frame( | |
6127 substrate = sub(" ///*", "...", short_row_names), | |
6128 sequence = trunc_long_ppep(ppep), | |
6129 anova_p_value = signif(fdr_adjusted_anova_p, 2), | |
6130 min_group_obs_count = signif(min_group_obs_count, 0), | |
6131 quality = signif(quality, 3) | |
6132 ) | |
6133 ) | |
6134 excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows | |
6135 if (excess_substrates) | |
6136 substrates_df <- substrates_df[1:g_intensity_hm_rows, ] | |
6137 names(substrates_df) <- headers_2nd_line <- | |
6138 c("Substrate", "Sequence", "p-value", "per group)", "quality") | |
6139 headers_1st_line <- c("", "", "ANOVA", "min(values", "") | |
6140 if (1 < nrow(enriched_substrates)) | |
6141 cat("\n\\newpage\n") | |
6142 cat(subsubsection_header( | |
6143 sprintf( | |
6144 "Details for %s%s-substrates", | |
6145 if (excess_substrates) | |
6146 sprintf( | |
6147 "%s \"highest quality\" ", | |
6148 g_intensity_hm_rows | |
6149 ) | |
6150 else "", | |
6151 kinase_name | |
6152 ) | |
6153 )) | |
6154 substrates_df <- substrates_df[order(-substrates_df$quality), ] | |
6155 data_frame_tabbing_latex( | |
6156 x = substrates_df, | |
6157 tabstops = c(0.8, 3.8, 0.6, 0.8), | |
6158 headings = c(headers_1st_line, headers_2nd_line) | |
6159 ) | |
6160 } else { | |
6161 if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n") | |
3332 } | 6162 } |
3333 } | 6163 if (print_nb_messages) nb("end kinase ", kinase_name, "\n") |
3334 | 6164 } |
3335 # Write output tabular files | 6165 |
3336 | 6166 # Write output tabular files |
3337 # get kinase, ppep, concat(kinase) tuples for enriched kinases | 6167 |
3338 | 6168 # get kinase, ppep, concat(kinase) tuples for enriched kinases |
3339 kinase_ppep_label <- sqldf(" | 6169 |
3340 WITH | 6170 if (print_nb_messages) nb("kinase_ppep_label <- ...\n") |
3341 t(ppep, label) AS | 6171 if (print_nb_messages) nbe("kinase_ppep_label <- ...\n") |
3342 ( | 6172 kinase_ppep_label <- sqldf(" |
3343 SELECT DISTINCT | 6173 WITH |
3344 SUB_MOD_RSD AS ppep, | 6174 t(ppep, label) AS |
3345 group_concat(gene, '; ') AS label | 6175 ( |
6176 SELECT DISTINCT | |
6177 SUB_MOD_RSD AS ppep, | |
6178 group_concat(gene, '; ') AS label | |
6179 FROM pseudo_ksdata | |
6180 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
6181 GROUP BY ppep | |
6182 ), | |
6183 k(kinase, ppep_join) AS | |
6184 ( | |
6185 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | |
3346 FROM pseudo_ksdata | 6186 FROM pseudo_ksdata |
3347 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6187 WHERE GENE IN (SELECT kinase FROM enriched_kinases) |
3348 GROUP BY ppep | 6188 ) |
3349 ), | 6189 SELECT k.kinase, t.ppep, t.label |
3350 k(kinase, ppep_join) AS | 6190 FROM t, k |
3351 ( | 6191 WHERE t.ppep = k.ppep_join |
3352 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | 6192 ORDER BY k.kinase, t.ppep |
3353 FROM pseudo_ksdata | 6193 ") |
3354 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6194 |
3355 ) | 6195 |
3356 SELECT k.kinase, t.ppep, t.label | 6196 # extract what we need from full_data |
3357 FROM t, k | 6197 impish <- cbind(rownames(quant_data_imp), quant_data_imp) |
3358 WHERE t.ppep = k.ppep_join | 6198 colnames(impish)[1] <- "Phosphopeptide" |
3359 ORDER BY k.kinase, t.ppep | 6199 data_table_imputed_sql <- " |
3360 ") | 6200 SELECT |
3361 | 6201 f.*, |
3362 # extract what we need from full_data | 6202 k.label AS KSEA_enrichments, |
3363 impish <- cbind(rownames(quant_data_imp), quant_data_imp) | 6203 q.* |
3364 colnames(impish)[1] <- "Phosphopeptide" | 6204 FROM |
3365 data_table_imputed_sql <- " | 6205 metadata_plus_p f |
3366 SELECT | 6206 LEFT JOIN kinase_ppep_label k |
3367 f.*, | 6207 ON f.Phosphopeptide = k.ppep, |
3368 k.label AS KSEA_enrichments, | 6208 impish q |
3369 q.* | 6209 WHERE |
3370 FROM | 6210 f.Phosphopeptide = q.Phosphopeptide |
3371 metadata_plus_p f | 6211 " |
3372 LEFT JOIN kinase_ppep_label k | 6212 data_table_imputed <- sqldf(data_table_imputed_sql) |
3373 ON f.Phosphopeptide = k.ppep, | 6213 # Zap the duplicated 'Phosphopeptide' column named 'ppep' |
3374 impish q | 6214 data_table_imputed <- |
3375 WHERE | 6215 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] |
3376 f.Phosphopeptide = q.Phosphopeptide | 6216 |
3377 " | 6217 # Output imputed, un-normalized data |
3378 data_table_imputed <- sqldf(data_table_imputed_sql) | 6218 if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n") |
3379 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | 6219 if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n") |
3380 data_table_imputed <- | 6220 write.table( |
3381 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | 6221 data_table_imputed |
3382 | 6222 , file = imputed_data_filename |
3383 # Output with imputed, un-normalized data | 6223 , sep = "\t" |
3384 | 6224 , col.names = TRUE |
3385 write.table( | 6225 , row.names = FALSE |
3386 data_table_imputed | 6226 , quote = FALSE |
3387 , file = imputed_data_filename | 6227 ) |
3388 , sep = "\t" | 6228 |
3389 , col.names = TRUE | 6229 |
3390 , row.names = FALSE | 6230 #output quantile normalized data |
3391 , quote = FALSE | 6231 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) |
3392 ) | 6232 colnames(impish)[1] <- "Phosphopeptide" |
3393 | 6233 data_table_imputed <- sqldf(data_table_imputed_sql) |
3394 | 6234 # Zap the duplicated 'Phosphopeptide' column named 'ppep' |
3395 #output quantile normalized data | 6235 data_table_imputed <- |
3396 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) | 6236 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] |
3397 colnames(impish)[1] <- "Phosphopeptide" | 6237 if (print_nb_messages) nb("Output quantile normalized data tabular file\n") |
3398 data_table_imputed <- sqldf(data_table_imputed_sql) | 6238 if (print_nb_messages) nbe("Output quantile normalized data tabular file\n") |
3399 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | 6239 write.table( |
3400 data_table_imputed <- | 6240 data_table_imputed, |
3401 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | 6241 file = imp_qn_lt_data_filenm, |
3402 write.table( | 6242 sep = "\t", |
3403 data_table_imputed, | 6243 col.names = TRUE, |
3404 file = imp_qn_lt_data_filenm, | 6244 row.names = FALSE, |
3405 sep = "\t", | 6245 quote = FALSE |
3406 col.names = TRUE, | 6246 ) |
3407 row.names = FALSE, | 6247 |
3408 quote = FALSE | 6248 ppep_kinase <- sqldf(" |
3409 ) | 6249 SELECT DISTINCT k.ppep, k.kinase |
3410 | 6250 FROM ( |
3411 ppep_kinase <- sqldf(" | 6251 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep |
3412 SELECT DISTINCT k.ppep, k.kinase | 6252 FROM pseudo_ksdata |
3413 FROM ( | 6253 WHERE GENE IN (SELECT kinase FROM enriched_kinases) |
3414 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep | 6254 ) k |
3415 FROM pseudo_ksdata | 6255 ORDER BY k.ppep, k.kinase |
3416 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6256 ") |
3417 ) k | 6257 |
3418 ORDER BY k.ppep, k.kinase | 6258 RSQLite::dbWriteTable( |
3419 ") | 6259 conn = db, |
3420 | 6260 name = "ksea_enriched_ks", |
3421 RSQLite::dbWriteTable( | 6261 value = ppep_kinase, |
3422 conn = db, | 6262 append = FALSE |
3423 name = "ksea_enriched_ks", | 6263 ) |
3424 value = ppep_kinase, | 6264 } |
3425 append = FALSE | 6265 |
3426 ) | 6266 if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n") |
3427 | 6267 |
3428 RSQLite::dbWriteTable( | 6268 RSQLite::dbWriteTable( |
3429 conn = db, | 6269 conn = db, |
3430 name = "anova_signif", | 6270 name = "anova_signif", |
3431 value = p_value_data, | 6271 value = p_value_data, |
3451 ON m.phospho_peptide = kek.ppep | 6291 ON m.phospho_peptide = kek.ppep |
3452 ; | 6292 ; |
3453 " | 6293 " |
3454 ) | 6294 ) |
3455 | 6295 |
6296 if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n") | |
6297 if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n") | |
3456 write.table( | 6298 write.table( |
3457 dbReadTable(db, "stats_metadata_v"), | 6299 dbReadTable(db, "stats_metadata_v"), |
3458 file = anova_ksea_mtdt_file, | 6300 file = anova_ksea_mtdt_file, |
3459 sep = "\t", | 6301 sep = "\t", |
3460 col.names = TRUE, | 6302 col.names = TRUE, |
3461 row.names = FALSE, | 6303 row.names = FALSE, |
3462 quote = FALSE | 6304 quote = FALSE |
3463 ) | 6305 ) |
3464 | 6306 |
6307 cat("\n\\clearpage\n") | |
3465 | 6308 |
3466 ``` | 6309 ``` |
6310 | |
6311 # Data-processing summary flowchart | |
6312 | |
6313 ![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf) | |
3467 | 6314 |
3468 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 6315 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
3469 cat("\\leavevmode\n\n\n") | 6316 cat("\\leavevmode\n\n\n") |
3470 | 6317 |
3471 # write parameters to report | 6318 write_params(db) |
3472 | |
3473 param_unlist <- unlist(as.list(params)) | |
3474 param_df <- data.frame( | |
3475 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
3476 value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") | |
3477 ) | |
3478 | |
3479 data_frame_latex( | |
3480 x = param_df, | |
3481 justification = "p{0.35\\linewidth} p{0.6\\linewidth}", | |
3482 centered = TRUE, | |
3483 caption = "Input parameters", | |
3484 anchor = const_table_anchor_bp, | |
3485 underscore_whack = FALSE | |
3486 ) | |
3487 | |
3488 # write parameters to SQLite output | |
3489 | |
3490 mqppep_anova_script_param_df <- data.frame( | |
3491 script = "mqppep_anova_script.Rmd", | |
3492 parameter = names(param_unlist), | |
3493 value = param_unlist | |
3494 ) | |
3495 ddl_exec(db, " | |
3496 DROP TABLE IF EXISTS script_parameter; | |
3497 " | |
3498 ) | |
3499 ddl_exec(db, " | |
3500 CREATE TABLE IF NOT EXISTS script_parameter( | |
3501 script TEXT, | |
3502 parameter TEXT, | |
3503 value ANY, | |
3504 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
3505 ) | |
3506 ; | |
3507 " | |
3508 ) | |
3509 RSQLite::dbWriteTable( | |
3510 conn = db, | |
3511 name = "script_parameter", | |
3512 value = mqppep_anova_script_param_df, | |
3513 append = TRUE | |
3514 ) | |
3515 | |
3516 # We are done with output | 6319 # We are done with output |
3517 RSQLite::dbDisconnect(db) | 6320 RSQLite::dbDisconnect(db) |
6321 | |
6322 cat("\\clearpage\n\\section{R package versions}\n") | |
6323 utils::toLatex(utils::sessionInfo()) | |
3518 ``` | 6324 ``` |
3519 <!-- | |
3520 There's gotta be a better way... | |
3521 | |
3522 loaded_packages_df <- sessioninfo::package_info("loaded") | |
3523 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
3524 loaded_packages_df <- data.frame( | |
3525 package = loaded_packages_df$package, | |
3526 version = loaded_packages_df$loadedversion, | |
3527 date = loaded_packages_df$date | |
3528 ) | |
3529 data_frame_latex( | |
3530 x = loaded_packages_df, | |
3531 justification = "l | l l", | |
3532 centered = FALSE, | |
3533 caption = "Loaded R packages", | |
3534 anchor = const_table_anchor_bp | |
3535 ) | |
3536 --> |