comparison PMD_FDR_package_for_Galaxy.R @ 0:5cc0c32d05a2 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pmd_fdr commit 00f85eca73cd8afedfefbeec94a4462455ac1a9a"
author galaxyp
date Mon, 07 Oct 2019 11:59:37 -0400
parents
children 460edeedeb7d
comparison
equal deleted inserted replaced
-1:000000000000 0:5cc0c32d05a2
1 ###############################################################################
2 # PMD_FDR_package_for_Galaxy.R #
3 # #
4 # Project 021 - PMD-FDR for Galaxy-P #
5 # #
6 # Description: Computes iFDR and gFDR on PSMs as a script designed for Galaxy #
7 # Note that plotting code has been left in that is not used #
8 # in this file; this is the code I used to create figures for #
9 # publication. I left it in for potential development of views. #
10 # #
11 # This file was created by concatenating the following files: #
12 # #
13 # A - 005 - Parser - ArgParser.R #
14 # B - 019 - PMD-FDR - functions.R #
15 # C - 021 - PMD-FDR Wrapper - functions.R #
16 # D - 021 - PMD-FDR Main.R #
17 # #
18 # Required packages: argparser #
19 # stringr #
20 # RUnit #
21 # #
22 # Release date: 2019-10-05 #
23 # Version: 1.4 #
24 # #
25 ###############################################################################
26 # Package currently supports the following parameters:
27 #
28 # --psm_report full name and path to the PSM report
29 # --psm_report_1_percent full name and path to the PSM report for 1% FDR
30 # --output_i_fdr full name and path to the i-FDR output file
31 # --output_g_fdr full name and path to the g-FDR output file
32 # --output_densities full name and path to the densities output file
33 #
34 ###############################################################################
35 # A - 005 - Parser - ArgParser.R #
36 # #
37 # Description: Wrapper for argparser package, using RefClass #
38 # #
39 ###############################################################################
40
41 #install.packages("argparser")
42 library(argparser)
43
44 # Class definition
45
46 ArgParser <- setRefClass("ArgParser",
47 fields = c("parser"))
48 ArgParser$methods(
49 initialize = function(...){
50 parser <<- arg_parser(...)
51 },
52 local_add_argument = function(...){
53 parser <<- add_argument(parser, ...)
54 },
55 parse_arguments = function(...){
56 result = parse_args(parser, ...)
57 return(result)
58 }
59 )
60
61 ###############################################################################
62 # B - 019 - PMD-FDR - functions.R #
63 # #
64 # Primary work-horse for PMD-FDR #
65 # #
66 ###############################################################################
67 ###############################################################################
68 ####### Load libraries etc.
69 ###############################################################################
70 library(stringr)
71 library(RUnit)
72
73 #############################################################
74 ####### Global values (should be parameters to module but aren't yet)
75 #############################################################
76
77 MIN_GOOD_PEPTIDE_LENGTH <- 11
78 MIN_ACCEPTABLE_POINTS_IN_DENSITY <- 10
79
80 #############################################################
81 ####### General purpose functions
82 #############################################################
83 # Creates a more useful error report when file is not reasonable
84 safe_file_exists <- function(file_path){ # Still not particularly useful in cases where it is a valid directory
85 tryCatch(
86 return(file_test(op = "-f", x=file_path)),
87 error=function(e) {simpleError(sprintf("file path is not valid: '%s'", file_path))}
88 )
89 }
90 # My standard way of loading data into data.frames
91 load_standard_df <- function(file_path=NULL){
92 clean_field_names = function(field_names){
93 result <- field_names
94 idx_blank <- which(result == "")
95 result[idx_blank] <- sprintf("<Field %d>", idx_blank)
96 return(result)
97 }
98 if (safe_file_exists(file_path)){
99 field_names <- read_field_names(file_path, sep = "\t")
100 field_names <- clean_field_names(field_names)
101
102 if (length(field_names) == 0){
103 return(data.frame())
104 }
105 data <- read.table(file = file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE, blank.lines.skip = TRUE)#, check.names = FALSE)
106 colnames(data) = field_names
107 } else {
108 stop(sprintf("File path does not exist: '%s'", file_path))
109 }
110 return(data)
111 }
112 save_standard_df <- function(x=NULL, file_path=NULL){
113 if (file_path != ""){
114 write.table(x = x, file = file_path, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
115 }
116 }
117 rename_column <- function(df=NULL, name_before=NULL, name_after=NULL, suppressWarnings=FALSE){
118 if (is.null(df)){
119 stop("Dataframe (df) does not exist - unable to rename column")
120 }
121 if (name_before %in% colnames(df)){
122 df[,name_after] <- df[,name_before]
123 df[,name_before] <- NULL
124 } else if (!suppressWarnings){
125 warning(sprintf("'%s' is not a field in the data frame and so has not been renamed", name_before))
126 }
127 return(df)
128 }
129 rename_columns <- function(df=NULL, names_before=NULL, names_after=NULL){
130 for (i in safe_iterator(length(names_before))){
131 df <- rename_column(df, names_before[i], names_after[i])
132 }
133 return(df)
134 }
135 round_to_tolerance <- function(x=NULL, tolerance=NULL, ...){
136 return(function_to_tolerance(x=x, tolerance=tolerance, FUN=round, ...))
137 }
138 function_to_tolerance <- function(x=NULL, tolerance=NULL, FUN=NULL, ...){
139 return(FUN(x/tolerance, ...) * tolerance)
140 }
141 safe_median <- function(x) median(x, na.rm=TRUE)
142 normalize_density <- function(d){
143 # Normalizes y-values in density function
144 # so that the integral under the curve is 1
145 # (uses rectangles to approximate area)
146 delta_x <- diff(range(d$x)) / length(d$x)
147 unnormalized_integral <- delta_x * sum(d$y)
148 new_d <- d
149 new_d$y <- with(new_d, y )
150
151 return(new_d)
152 }
153 if_null <- function(cond=NULL, null_result=NULL, not_null_result=NULL){
154 return(switch(1+is.null(cond),
155 not_null_result,
156 null_result))
157 }
158 rainbow_with_fixed_intensity <- function(n=NULL, goal_intensity_0_1=NULL, alpha=NULL){
159 goal_intensity <- 255*goal_intensity_0_1
160 hex_colors <- rainbow(n)
161 rgb_colors <- col2rgb(hex_colors)
162 df_colors <- data.frame(t(rgb_colors))
163 df_colors$intensity <- with(df_colors, 0.2989*red + 0.5870*green + 0.1140*blue)
164
165 df_colors$white_black <- with(df_colors, ifelse(intensity < goal_intensity, 255, 0))
166 df_colors$mix_level <- with(df_colors, (white_black - goal_intensity) / (white_black - intensity ) )
167 df_colors$new_red <- with(df_colors, mix_level*red + (1-mix_level)*white_black)
168 df_colors$new_green <- with(df_colors, mix_level*green + (1-mix_level)*white_black)
169 df_colors$new_blue <- with(df_colors, mix_level*blue + (1-mix_level)*white_black)
170 names_pref_new <- c("new_red", "new_green", "new_blue")
171 names_no_pref <- c("red", "green", "blue")
172 df_colors <- df_colors[,names_pref_new]
173 df_colors <- rename_columns(df_colors, names_before = names_pref_new, names_after = names_no_pref)
174 rgb_colors <-as.matrix(df_colors/255 )
175
176 return(rgb(rgb_colors, alpha=alpha))
177 }
178 safe_iterator <- function(n_steps = NULL){
179 if (n_steps < 1){
180 result = numeric(0)
181 } else {
182 result = 1:n_steps
183 }
184 return(result)
185 }
186 col2hex <- function(cols=NULL, col_alpha=255){
187 if (all(col_alpha<=1)){
188 col_alpha <- round(col_alpha*255)
189 }
190 col_matrix <- t(col2rgb(cols))
191 results <- rgb(col_matrix, alpha=col_alpha, maxColorValue = 255)
192 return(results)
193 }
194 credible_interval <- function(x=NULL, N=NULL, precision=0.001, alpha=0.05){
195 # Approximates "highest posterior density interval"
196 # Uses exact binomial but with a finite list of potential values (1/precision)
197
198 p <- seq(from=0, to=1, by=precision)
199 d <- dbinom(x = x, size = N, prob = p)
200 d <- d / sum(d)
201 df <- data.frame(p=p, d=d)
202 df <- df[order(-df$d),]
203 df$cumsum <- cumsum(df$d)
204 max_idx <- sum(df$cumsum < (1-alpha)) + 1
205 max_idx <- min(max_idx, nrow(df))
206
207 lower <- min(df$p[1:max_idx])
208 upper <- max(df$p[1:max_idx])
209
210 return(c(lower,upper))
211 }
212 verified_element_of_list <- function(parent_list=NULL, element_name=NULL, object_name=NULL){
213 if (is.null(parent_list[[element_name]])){
214 if (is.null(object_name)){
215 object_name = "the list"
216 }
217 stop(sprintf("Element '%s' does not yet exist in %s", element_name, object_name))
218 }
219 return(parent_list[[element_name]])
220 }
221 read_field_names = function(file_path=NULL, sep = "\t"){
222 con = file(file_path,"r")
223 fields = readLines(con, n=1)
224 close(con)
225
226 if (length(fields) == 0){
227 return(c())
228 }
229 fields = strsplit(x = fields, split = sep)[[1]]
230 return(fields)
231 }
232 check_field_name = function(input_df = NULL, name_of_input_df=NULL, field_name=NULL){
233 test_succeeded <- field_name %in% colnames(input_df)
234 current_columns <- paste0(colnames(input_df), collapse=", ")
235 checkTrue(test_succeeded,
236 msg = sprintf("Expected fieldname '%s' in %s (but did not find it among %s)",
237 field_name, name_of_input_df, current_columns))
238 }
239
240 #############################################################
241 ####### Classes for Data
242 #############################################################
243
244 ###############################################################################
245 # Class: Data_Object
246 ###############################################################################
247 Data_Object <- setRefClass("Data_Object",
248 fields =list(m_is_dirty = "logical",
249 parents = "list",
250 children = "list",
251 class_name = "character"))
252 Data_Object$methods(
253 initialize = function(){
254 m_is_dirty <<- TRUE
255 class_name <<- "Data_Object <abstract class - class_name needs to be set in subclass>"
256 },
257 load_data = function(){
258 #print(sprintf("Calling %s$load_data()", class_name)) # Useful for debugging
259 ensure_parents()
260 verify()
261 m_load_data()
262 set_dirty(new_value = FALSE)
263 },
264 ensure = function(){
265 if (m_is_dirty){
266 load_data()
267 }
268 },
269 set_dirty = function(new_value){
270 if (new_value != m_is_dirty){
271 m_is_dirty <<- new_value
272 set_children_dirty()
273 }
274 },
275 verify = function(){
276 stop(sprintf("verify() is an abstract method - define it in %s before calling load_data()", class_name))
277 },
278 m_load_data = function(){
279 stop(sprintf("m_load_data() is an abstract method - define it in %s before calling load_data()", class_name))
280 },
281 append_parent = function(parent=NULL){
282 parents <<- append(parents, parent)
283 },
284 append_child = function(child=NULL){
285 children <<- append(children, child)
286 },
287 ensure_parents = function(){
288 for (parent in parents){
289 # print(sprintf("Calling %s$ensure()", parent$class_name)) # Useful for debugging
290 parent$ensure()
291 }
292 },
293 set_children_dirty = function(){
294 for (child in children){
295 child$set_dirty(TRUE)
296 }
297 }
298 )
299 ###############################################################################
300 # Class: Data_Object_Info
301 ###############################################################################
302 Data_Object_Info <- setRefClass("Data_Object_Info",
303 contains = "Data_Object",
304 fields =list(
305 data_file_name_1_percent_FDR = "character",
306 data_file_name = "character",
307 data_path_name = "character",
308 experiment_name = "character",
309 designation = "character",
310
311 input_file_type = "character"
312
313 #score_field_name = "character"
314 #collection_name="character",
315 #dir_results="character",
316 #dir_dataset="character",
317 #dataset_designation="character",
318 #file_name_dataset="character",
319 #file_name_dataset_1_percent="character",
320 #experiment_name="character"
321 ) )
322 Data_Object_Info$methods(
323 initialize = function(){
324 callSuper()
325 class_name <<- "Data_Object_Info - <Abstract class - class_name needs to be set in subclass>"
326 },
327 verify = function(){
328 checkFieldExists = function(field_name=NULL){
329 field_value <- .self[[field_name]]
330 checkTrue(length(field_value) > 0,
331 sprintf("Field %s$%s has not been set (and should have been)", class_name, field_name))
332 checkTrue(length(field_value) == 1,
333 sprintf("Field %s$%s has been set to multiple values (and should be a single value)", class_name, field_name))
334 checkTrue(field_value != "",
335 sprintf("Field %s$%s has been set to an empty string (and should not have been)", class_name, field_name))
336 }
337 checkFieldExists("data_file_name")
338 checkFieldExists("data_path_name")
339 checkFieldExists("experiment_name")
340 checkFieldExists("designation")
341 checkFieldExists("input_file_type")
342 #checkFieldExists("score_field_name")
343 },
344 m_load_data = function(){
345 # Nothing to do - this is really a data class
346 },
347 file_path = function(){
348 result <- file.path(data_path_name, data_file_name)
349 if (length(result) == 0){
350 stop("Unable to validate file path - one or both of path name and file name are missing")
351 }
352 return(result)
353 },
354 file_path_1_percent_FDR = function(){
355 local_file_name <- get_data_file_name_1_percent_FDR()
356 if (length(local_file_name) == 0){
357 result <- ""
358 } else {
359 result <- file.path(data_path_name, local_file_name)
360 }
361
362 # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream
363
364 # if (length(result) == 0){
365 # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing")
366 # }
367 return(result)
368 },
369 get_data_file_name_1_percent_FDR = function(){
370 return(data_file_name_1_percent_FDR)
371 },
372 collection_name = function(){
373 result <- sprintf("%s_%s", experiment_name, designation)
374 return(result)
375 }
376 )
377 ###############################################################################
378 # Class: Data_Object_Info_737_two_step
379 ###############################################################################
380 Data_Object_Info_737_two_step <- setRefClass("Data_Object_Info_737_two_step",
381 contains = "Data_Object_Info",
382 fields =list())
383 Data_Object_Info_737_two_step$methods(
384 initialize = function(){
385 callSuper()
386 class_name <<- "Data_Object_Info_737_two_step"
387 #score_field_name <<- "Confidence [%]"
388 data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular"
389 data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_Multi_Stage_Two_Step.tabular.tabular"
390 data_path_name <<- file.path(".", "Data")
391 experiment_name <<- "Oral_737_NS"
392 designation <<- "two_step"
393
394 input_file_type <<- "PSM_Report"
395
396 #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular"
397 #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular"
398
399 }
400 )
401
402 ###############################################################################
403 # Class: Data_Object_Info_737_combined
404 ###############################################################################
405 Data_Object_Info_737_combined <- setRefClass("Data_Object_Info_737_combined",
406 contains = "Data_Object_Info",
407 fields =list())
408 Data_Object_Info_737_combined$methods(
409 initialize = function(){
410 callSuper()
411 class_name <<- "Data_Object_Info_737_combined"
412 #score_field_name <<- "Confidence [%]"
413 data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular"
414 data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_CombinedDB.tabular"
415 data_path_name <<- file.path(".", "Data")
416 experiment_name <<- "Oral_737_NS"
417 designation <<- "two_step"
418
419 input_file_type <<- "PSM_Report"
420
421 #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular"
422 #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular"
423
424 }
425 )
426
427 ###############################################################################
428 # Class: Data_Object_Pyrococcus_tr
429 ###############################################################################
430 Data_Object_Pyrococcus_tr <- setRefClass("Data_Object_Pyrococcus_tr",
431 contains = "Data_Object_Info",
432 fields =list())
433 Data_Object_Pyrococcus_tr$methods(
434 initialize = function(){
435 callSuper()
436 class_name <<- "Data_Object_Pyrococcus_tr"
437 #score_field_name <<- "Confidence [%]"
438 data_file_name_1_percent_FDR <<- ""
439 data_file_name <<- "Pfu_traditional_Extended_PSM_Report.tabular"
440 data_path_name <<- file.path(".", "Data")
441 experiment_name <<- "Pyrococcus"
442 designation <<- "tr"
443
444 input_file_type <<- "PSM_Report"
445
446 }
447 )
448 ###############################################################################
449 # Class: Data_Object_Mouse_Mutations
450 ###############################################################################
451 Data_Object_Mouse_Mutations <- setRefClass("Data_Object_Mouse_Mutations",
452 contains = "Data_Object_Info",
453 fields =list())
454 Data_Object_Mouse_Mutations$methods(
455 initialize = function(){
456 callSuper()
457 class_name <<- "Data_Object_Mouse_Mutations"
458 #score_field_name <<- "Confidence [%]"
459 data_file_name_1_percent_FDR <<- ""
460 data_file_name <<- "Combined_DB_Mouse_5PTM.tabular"
461 data_path_name <<- file.path(".", "Data")
462 experiment_name <<- "Mouse Mutations"
463 designation <<- "combined_05"
464
465 input_file_type <<- "PSM_Report"
466
467 }
468 )
469 ###############################################################################
470 # Class: Data_Object_Raw_Data
471 ###############################################################################
472 Data_Object_Raw_Data <- setRefClass("Data_Object_Raw_Data",
473 contains = "Data_Object",
474 fields =list(df = "data.frame"))
475 Data_Object_Raw_Data$methods(
476 initialize = function(){
477 callSuper()
478 class_name <<- "Data_Object_Raw_Data"
479 },
480 verify = function(){
481 # Check that file exists before using it
482 file_path <- get_info()$file_path()
483 if (! safe_file_exists(file_path)){
484 stop(sprintf("Raw data file does not exist (%s)", file_path))
485 }
486 # BUGBUG: Needs to also check the following:
487 # - file is tab-delimited
488 # - first row is a list of column names
489 },
490 set_info = function(info){
491 parents[["info"]] <<- info
492 },
493 get_info = function(){
494 return(verified_element_of_list(parents, "info", "Data_Object_Raw_Data$parents"))
495 },
496 m_load_data = function(){
497 info <- get_info()
498 df <<- load_standard_df(info$file_path())
499 }
500 )
501 ###############################################################################
502 # Class: Data_Object_Raw_1_Percent
503 ###############################################################################
504 Data_Object_Raw_1_Percent <- setRefClass("Data_Object_Raw_1_Percent",
505 contains = "Data_Object",
506 fields =list(df = "data.frame"))
507 Data_Object_Raw_1_Percent$methods(
508 initialize = function(){
509 callSuper()
510 class_name <<- "Data_Object_Raw_1_Percent"
511 },
512 set_info = function(info){
513 parents[["info"]] <<- info
514 },
515 verify = function(){
516 # Do nothing - a missing file name is acceptable for this module and is dealt with in load()
517 },
518 get_info = function(){
519 return(verified_element_of_list(parents, "info", "Data_Object_Raw_1_Percent$parents"))
520 },
521 m_load_data = function(){
522
523 info <- get_info()
524 file_path <- info$file_path_1_percent_FDR()
525 if (exists()){
526 df <<- load_standard_df(info$file_path_1_percent_FDR())
527 } # Note that failing to load is a valid state for this file, leading to not is_dirty. BUGBUG: this could lead to problems if a good file appears later
528 },
529 exists = function(){
530
531 info <- get_info()
532 local_file_name <- info$get_data_file_name_1_percent_FDR() # Check file name not file path
533
534 if (length(local_file_name) == 0 ){ # variable not set
535 result = FALSE
536 } else if (local_file_name == ""){ # variable set to empty string
537 result = FALSE
538 } else {
539 result = safe_file_exists(info$file_path_1_percent_FDR())
540 }
541
542 return(result)
543 }
544 )
545 ###############################################################################
546 # Class: Data_Converter
547 ###############################################################################
548 Data_Converter <- setRefClass("Data_Converter",
549 fields =list(class_name = "character",
550 file_type = "character"
551 ) )
552 Data_Converter$methods(
553 initialize = function(){
554 class_name <<- "Data_Converter <abstract class - class_name needs to be set in subclass>"
555 file_type <<- "file_type has not been set before being used <needs to be set in initialize() of subclass>"
556 },
557 check_raw_fields = function(info=NULL, raw_data=NULL){
558 stop(sprintf("check_raw_fields() is an abstract method - define it in %s before calling Data_Object_Data_Converter$load_data()", class_name))
559 },
560 convert_data = function(){
561 stop(sprintf("convert_data() is an abstract method - define it in %s before calling Data_Object_Data_Converter$load_data()", class_name))
562 }
563 )
564 ###############################################################################
565 # Class: Data_Converter_PMD_FDR_input_file
566 ###############################################################################
567 Data_Converter_PMD_FDR_input_file <- setRefClass("Data_Converter_PMD_FDR_input_file",
568 contains = "Data_Converter",
569 fields =list(
570
571 ) )
572 Data_Converter_PMD_FDR_input_file$methods(
573 initialize = function(){
574 callSuper()
575
576 class_name <<- "Data_Converter_PMD_FDR_input_file"
577 file_type <<- "PMD_FDR_file_type"
578 },
579 check_raw_fields = function(info=NULL, raw_data=NULL){
580 data_original <- raw_data$df
581 check_field_name(data_original, "raw_data", "PMD_FDR_input_score")
582 check_field_name(data_original, "raw_data", "PMD_FDR_pmd")
583 check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_file")
584 check_field_name(data_original, "raw_data", "PMD_FDR_proteins")
585 check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_title")
586 check_field_name(data_original, "raw_data", "PMD_FDR_sequence")
587 check_field_name(data_original, "raw_data", "PMD_FDR_decoy")
588 },
589 convert_data = function(info=NULL, raw_data=NULL){
590 data_new <- raw_data$df
591
592 return(data_new) # Pass through - everything should be in order
593 }
594 )
595 ###############################################################################
596 # Class: Data_Converter_PSM_Report
597 ###############################################################################
598 Data_Converter_PSM_Report <- setRefClass("Data_Converter_PSM_Report",
599 contains = "Data_Converter",
600 fields =list(
601
602 ) )
603 Data_Converter_PSM_Report$methods(
604 initialize = function(){
605 callSuper()
606
607 class_name <<- "Data_Converter_PSM_Report"
608 file_type <<- "PSM_Report"
609 },
610 check_raw_fields = function(info=NULL, raw_data=NULL){
611 data_original <- raw_data$df
612 check_field_name(data_original, "raw_data", "Confidence [%]")
613 check_field_name(data_original, "raw_data", "Precursor m/z Error [ppm]")
614 check_field_name(data_original, "raw_data", "Spectrum File")
615 check_field_name(data_original, "raw_data", "Protein(s)")
616 check_field_name(data_original, "raw_data", "Spectrum Title")
617 check_field_name(data_original, "raw_data", "Decoy")
618 check_field_name(data_original, "raw_data", "Sequence")
619
620 },
621 convert_data = function(info=NULL, raw_data=NULL){
622 data_new <- raw_data$df
623
624 data_new$PMD_FDR_input_score <- data_new[, "Confidence [%]" ]
625 data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"]
626 data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ]
627 data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ]
628 data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ]
629 data_new$PMD_FDR_sequence <- data_new[, "Sequence" ]
630 data_new$PMD_FDR_decoy <- data_new[, "Decoy" ]
631
632 return(data_new)
633 }
634 )
635 ###############################################################################
636 # Class: Data_Converter_MaxQuant_Evidence
637 ###############################################################################
638 Data_Converter_MaxQuant_Evidence <- setRefClass("Data_Converter_MaxQuant_Evidence",
639 contains = "Data_Converter",
640 fields =list(
641
642 ) )
643 Data_Converter_MaxQuant_Evidence$methods(
644 initialize = function(){
645 callSuper()
646
647 class_name <<- "Data_Converter_MaxQuant_Evidence"
648 file_type <<- "MaxQuant_Evidence"
649 },
650 check_raw_fields = function(info=NULL, raw_data=NULL){
651 data_original <- raw_data$df
652
653 check_field_name(data_original, "raw_data", "PEP")
654 check_field_name(data_original, "raw_data", "Mass error [ppm]")
655 check_field_name(data_original, "raw_data", "Proteins")
656 check_field_name(data_original, "raw_data", "Retention time")
657 check_field_name(data_original, "raw_data", "Sequence")
658 check_field_name(data_original, "raw_data", "Reverse")
659 },
660 convert_data = function(info=NULL, raw_data=NULL){
661 data_new <- raw_data$df
662
663 data_new$PMD_FDR_input_score <- 100 * (1 - data_new[, "PEP" ])
664 data_new$PMD_FDR_pmd <- data_new[, "Mass error [ppm]"]
665 data_new$PMD_FDR_spectrum_file <- "<place_holder - assumes a single spectra file>"
666 data_new$PMD_FDR_proteins <- data_new[, "Proteins" ]
667 data_new$PMD_FDR_spectrum_title <- data_new[, "Retention time" ] # Used for ordering peptides - not important in MaxQuant since PMD has already been normalized effectively
668 data_new$PMD_FDR_sequence <- data_new[, "Sequence" ]
669 data_new$PMD_FDR_decoy <- ifelse( data_new[, "Reverse" ] == "+", 1, 0)
670
671 return(data_new)
672 }
673 )
674
675 ###############################################################################
676 # Class: Data_Object_Data_Converter
677 ###############################################################################
678 Data_Object_Data_Converter <- setRefClass("Data_Object_Data_Converter",
679 contains = "Data_Object",
680 fields =list(df = "data.frame",
681 data_converter = "Data_Converter"))
682 Data_Object_Data_Converter$methods(
683 initialize = function(){
684 callSuper()
685 class_name <<- "Data_Object_Data_Converter"
686 },
687 currently_supported_file_types = function(){
688 return(c("PSM_Report", "PMD_FDR_input_file"))
689 },
690 verify = function(){
691 info <- get_info()
692 raw_data <- get_raw_data()
693 file_type <- get_info()$input_file_type
694
695 set_file_type(file_type)
696 data_converter$check_raw_fields(info=info, raw_data=raw_data)
697
698 },
699 m_load_data = function(){
700
701 info <- get_info()
702 raw_data <- get_raw_data()
703 file_type <- get_info()$input_file_type
704
705 df <<- data_converter$convert_data(info=info, raw_data=raw_data)
706
707 },
708 set_file_type = function(file_type = NULL){
709 if (file_type == "PSM_Report" ){
710 data_converter <<- Data_Converter_PSM_Report $new()
711 } else if (file_type == "PMD_FDR_input_file"){
712 data_converter <<- Data_Converter_PMD_FDR_input_file$new()
713 } else if (file_type == "MaxQuant_Evidence"){
714 data_converter <<- Data_Converter_MaxQuant_Evidence $new()
715 } else {
716 stop(sprintf("File type '%s' is not currently supported by PMD-FDR module", file_type))
717 }
718 },
719 set_info = function(info){
720 parents[["info"]] <<- info
721 },
722 get_info = function(){
723 return(verified_element_of_list(parents, "info", "Data_Object_Data_Converter$parents"))
724 },
725 set_raw_data = function(raw_data){
726 parents[["raw_data"]] <<- raw_data
727 },
728 get_raw_data = function(){
729 return(verified_element_of_list(parents, "raw_data", "Data_Object_Data_Converter$parents"))
730 }
731 )
732 ###############################################################################
733 # Class: Data_Object_Groupings
734 ###############################################################################
735 Data_Object_Groupings <- setRefClass("Data_Object_Groupings",
736 contains = "Data_Object",
737 fields =list(df = "data.frame"))
738 Data_Object_Groupings$methods(
739 initialize = function(){
740 callSuper()
741 class_name <<- "Data_Object_Groupings"
742 },
743 simplify_field_name = function(x=NULL){
744 result <- gsub(pattern = "PMD_FDR_", replacement = "", x = x)
745 return(result)
746 },
747 verify = function(){
748 data_original <- get_data_converter()$df
749
750 check_field_name(data_original, "data_converter", "PMD_FDR_input_score")
751 check_field_name(data_original, "data_converter", "PMD_FDR_pmd")
752 check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_file")
753 check_field_name(data_original, "data_converter", "PMD_FDR_proteins")
754 check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_title")
755 check_field_name(data_original, "data_converter", "PMD_FDR_sequence")
756 check_field_name(data_original, "data_converter", "PMD_FDR_decoy")
757
758 },
759 m_load_data = function(){
760 make_data_groups <- function(data_original=NULL){
761
762 # Functions supporting make_data_groups()
763
764 standardize_fields <- function(data=NULL){
765 data_new <- data
766
767 info <- get_info()
768 info$ensure()
769 #field_name_of_score <- info$get_field_name_of_score()
770
771 # #data_new <- rename_column(data_new, "Variable Modifications" , "ptm_list")
772 # data_new <- rename_column(data_new, field_name_of_score , "PMD_FDR_input_score")
773 # data_new <- rename_column(data_new, "Precursor m/z Error [ppm]", "PMD_FDR_pmd")
774 # #data_new <- rename_column(data_new, "Isotope Number" , "isotope_number")
775 # #data_new <- rename_column(data_new, "m/z" , "m_z")
776 # #data_new <- rename_column(data_new, "Measured Charge" , "charge")
777 # data_new <- rename_column(data_new, "Spectrum File" , "PMD_FDR_spectrum_file")
778 # data_new <- rename_column(data_new, "Protein(s)" , "PMD_FDR_proteins")
779 # data_new <- rename_column(data_new, "Spectrum Title" , "PMD_FDR_spectrum_title")
780 # data_new <- manage_decoy_column(data_new)
781
782 # Now managed in Data_Converter
783 # data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ]
784 # data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"]
785 # data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ]
786 # data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ]
787 # data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ]
788
789 data_new$value <- data_new$PMD_FDR_pmd
790 data_new$PMD_FDR_peptide_length <- str_length(data_new$PMD_FDR_sequence)
791 #data_new$charge_value <- with(data_new, as.numeric(substr(charge, start=1, stop=str_length(charge)-1)))
792 #data_new$measured_mass <- with(data_new, m_z*charge_value)
793 data_new$PMD_FDR_spectrum_index <- NA
794 data_new$PMD_FDR_spectrum_index[order(data_new$PMD_FDR_spectrum_title, na.last = TRUE)] <- 1:nrow(data_new)
795
796 return(data_new)
797 }
798 add_grouped_variable <- function(data_groups = data_groups, field_name_to_group = NULL, vec.length.out = NULL, vec.tolerance = NULL, value_format = NULL){
799
800 # Support functions for add_grouped_variable()
801 find_interval_vec <- function(x=NULL, length.out = NULL, tolerance = NULL){
802 q <- quantile(x = x, probs = seq(from=0, to=1, length.out = length.out), na.rm=TRUE)
803 q <- round_to_tolerance(q, tolerance = tolerance)
804 return(q)
805 }
806 get_group_data_frame <- function(vec=NULL, value_format = NULL){
807 n <- length(vec)
808 a <- vec[-n]
809 b <- vec[-1]
810
811 lower <- ifelse(a == b , "eq", NA)
812 lower <- ifelse(is.na(lower ), "ge", lower)
813 upper <- ifelse(a == b , "eq", NA)
814 upper[n-1] <- ifelse(is.na(upper[n-1]), "le", "eq")
815 upper <- ifelse(is.na(upper ), "lt", upper)
816 group <- data.frame(list(idx=1:(n-1), a=a, b=b, lower=lower, upper=upper))
817
818 name_format <- sprintf("%%%s_%%%s_%%s_%%s", value_format, value_format)
819 group$new_var <- with(group, sprintf(name_format, a, b, lower, upper))
820
821 return(group)
822 }
823 merge_group_with_data <- function(data_groups = NULL, group = NULL, vec = NULL, field_name_to_group = NULL){
824 field_name_new <- sprintf("group_%s", simplify_field_name(field_name_to_group))
825 group_idx <- findInterval(x = data_groups[,field_name_to_group],
826 vec = vec,
827 all.inside=TRUE)
828 data_groups$new_var <- group$new_var[group_idx]
829 data_groups <- rename_column(data_groups, "new_var", field_name_new)
830 }
831 # Body of add_grouped_variable()
832
833 vec <- find_interval_vec(x = data_groups[[field_name_to_group]],
834 length.out = vec.length.out,
835 tolerance = vec.tolerance )
836 group <- get_group_data_frame(vec = vec,
837 value_format = value_format)
838 df_new <- merge_group_with_data(data_groups = data_groups,
839 group = group,
840 vec = vec,
841 field_name_to_group = field_name_to_group)
842 df_new <- add_group_decoy(df_new, field_name_to_group)
843
844 return(df_new)
845 }
846 add_already_grouped_variable <- function(field_name_to_group = NULL, data_groups = NULL ){
847 old_name <- field_name_to_group
848 new_name <- sprintf("group_%s", simplify_field_name(old_name))
849 df_new <- data_groups
850 df_new[[new_name]] <- data_groups[[old_name]]
851
852 df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group)
853
854 return(df_new)
855 }
856 add_value_norm <- function(data_groups = NULL){
857
858 df_new <- data_groups
859 df_new$value_norm <- with(df_new, value - median_of_group_index)
860
861 return(df_new)
862 }
863 add_protein_group <-function(data_groups = NULL){
864 data_new <- data_groups
865 df_group_def <- data.frame(stringsAsFactors = FALSE,
866 list(pattern = c("" , "pfu_" , "cRAP"),
867 group_name = c("human", "pyrococcus", "contaminant")))
868 for (i in 1:nrow(df_group_def)){
869 idx <- grepl(pattern = df_group_def$pattern[i],
870 x = data_new$PMD_FDR_proteins)
871 data_new$group_proteins[idx] <- df_group_def$group_name[i]
872 }
873
874 data_new <- add_group_decoy(data_groups = data_new, field_name_to_group = "PMD_FDR_proteins")
875 return(data_new)
876 }
877 add_group_decoy <- function(data_groups=NULL, field_name_to_group=NULL){
878 simple_field_name <- simplify_field_name(field_name_to_group)
879 field_name_decoy <- sprintf("group_decoy_%s", simple_field_name)
880 field_name_group <- sprintf("group_%s", simple_field_name)
881
882 data_groups[[field_name_decoy]] <- with(data_groups, ifelse(PMD_FDR_decoy, "decoy", data_groups[[field_name_group]]))
883
884 return(data_groups)
885 }
886 add_group_training_class <- function(data_groups = NULL){
887 df_new <- data_groups
888
889 lowest_confidence_group <- min(data_groups$group_input_score)
890
891 is_long_enough <- with(df_new, (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) )
892 is_good <- with(df_new, (PMD_FDR_decoy == 0) & (PMD_FDR_input_score == 100) )
893 is_bad <- with(df_new, (PMD_FDR_decoy == 1) )
894 #is_used_to_train <- with(df_new, used_to_find_middle) # BUGBUG: circular definition
895
896 idx_good <- which(is_good ) # & is_long_enough)
897 n_good <- length(idx_good)
898 idx_testing <- idx_good[c(TRUE,FALSE)] # Selects every other item
899 idx_training <- setdiff(idx_good, idx_testing)
900
901 #is_good_short <- with(df_new, is_good & !is_long_enough )
902 #is_good_long <- with(df_new, is_good & is_long_enough )
903 is_bad_short <- with(df_new, is_bad & !is_long_enough )
904 is_bad_long <- with(df_new, is_bad & is_long_enough )
905 #is_good_training <- with(df_new, is_good_long & (used_to_find_middle == TRUE ) )
906 #is_good_testing <- with(df_new, is_good_long & (used_to_find_middle == FALSE) )
907
908 df_new$group_training_class <- "other_short" # Default
909 df_new$group_training_class[is_long_enough ] <- "other_long" # Default (if long enough)
910 df_new$group_training_class[idx_training ] <- "good_training" # Length does not matter (anymore)
911 df_new$group_training_class[idx_testing ] <- "good_testing" # Ditto
912 #df_new$group_training_class[is_good_short ] <- "good_short"
913 df_new$group_training_class[is_bad_long ] <- "bad_long" # ...except for "bad"
914 df_new$group_training_class[is_bad_short ] <- "bad_short"
915
916 df_new <- add_used_to_find_middle( data_groups = df_new ) # Guarantees consistency between duplicated definitions
917
918 return(df_new)
919 }
920 add_used_to_find_middle <- function(data_groups = NULL){
921 df_new <- data_groups
922 idx_used <- which(data_groups$group_training_class == "good_training")
923
924 df_new$used_to_find_middle <- FALSE
925 df_new$used_to_find_middle[idx_used] <- TRUE
926
927 return(df_new)
928 }
929 add_group_spectrum_index <- function(data_groups = NULL){
930
931 # Supporting functions for add_group_spectrum_index()
932
933 get_breaks_all <- function(df_new){
934 # Supporting function(s) for get_breaks_all()
935
936 get_cut_points <- function(data_subset){
937
938 # Supporting function(s) for get_cut_points()
939
940 cut_values <- function(data=NULL, minimum_segment_length=NULL){
941 # using cpt.mean -- Appears to have a memory leak
942 #results_cpt <- cpt.mean(data=data, method="PELT", minimum_segment_length=minimum_segment_length)
943 #results <- results_cpt@cpts
944
945 # Just look at the end
946 #results <- c(length(data))
947
948 # regularly spaced, slightly larger than minimum_segment_length
949 n_points <- length(data)
950 n_regions <- floor(n_points / minimum_segment_length)
951 n_regions <- ifelse(n_regions == 0, 1, n_regions)
952 results <- round(seq(1, n_points, length.out = n_regions + 1))
953 results <- results[-1]
954 return(results)
955 }
956 remove_last <- function(x){
957 return(x[-length(x)] )
958 }
959
960 # Main code of for get_cut_points()
961 max_idx = max(data_subset$PMD_FDR_spectrum_index)
962 data_sub_sub <- subset(data_subset, group_training_class == "good_training") #(PMD_FDR_input_score==100) & (PMD_FDR_decoy==0))
963 minimum_segment_length = 50
964
965 values <- data_sub_sub$value
966 n_values <- length(values)
967 local_to_global_idx <- data_sub_sub$PMD_FDR_spectrum_index
968 if (n_values <= minimum_segment_length){
969 result <- c()
970 } else {
971 local_idx <- cut_values(data=values, minimum_segment_length=minimum_segment_length)
972 result <- local_to_global_idx[local_idx]
973 result <- remove_last(result)
974 }
975 result <- c(result, max_idx)
976 return(result)
977 }
978 remove_last <- function(vec) {
979 return(vec[-length(vec)])
980 }
981
982 # Main code of get_breaks_all()
983
984 breaks <- 1
985
986 files <- unique(df_new$PMD_FDR_spectrum_file)
987
988 for (local_file in files){
989 data_subset <- subset(df_new, (PMD_FDR_spectrum_file==local_file))
990 if (nrow(data_subset) > 0){
991 breaks <- c(breaks, get_cut_points(data_subset))
992 }
993 }
994 breaks <- sort(unique(breaks))
995 breaks <- remove_last(breaks)
996 breaks <- c(breaks, max(df_new$PMD_FDR_spectrum_index + 1))
997
998 return(breaks)
999 }
1000
1001 # Main code of add_group_spectrum_index()
1002
1003 field_name_to_group <- "PMD_FDR_spectrum_index"
1004
1005 df_new <- data_groups[order(data_groups[[field_name_to_group]]),]
1006 breaks <- get_breaks_all(df_new)
1007
1008 df_new$group_spectrum_index <- cut(x = df_new[[field_name_to_group]], breaks = breaks, right = FALSE, dig.lab = 6)
1009 df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group)
1010
1011 return(df_new)
1012 }
1013 add_median_of_group_index <-function(data_groups = NULL){
1014 field_median <- "median_of_group_index"
1015 data_good <- subset(data_groups, used_to_find_middle )
1016 med <- aggregate(value~group_spectrum_index, data=data_good, FUN=safe_median)
1017 med <- rename_column(med, "value", field_median)
1018
1019 data_groups[[field_median]] <- NULL
1020 df_new <- merge(data_groups, med)
1021
1022 return(df_new)
1023 }
1024 add_1_percent_to_data_groups <- function(data_groups=NULL){
1025
1026 data_new <- data_groups
1027
1028 if (get_raw_1_percent()$exists()){
1029 # Load 1 percent file
1030 df_1_percent <- get_raw_1_percent()$df
1031
1032 # Get relevant fields
1033 df_1_percent$is_in_1percent <- TRUE
1034 df_1_percent <- rename_column(df_1_percent, "Spectrum Title", "PMD_FDR_spectrum_title")
1035 df_1_percent <- df_1_percent[,c("PMD_FDR_spectrum_title", "is_in_1percent")]
1036
1037 # Merge with data_groups
1038 data_new <- merge(data_new, df_1_percent, all.x=TRUE)
1039 data_new$is_in_1percent[is.na(data_new$is_in_1percent)] <- FALSE
1040 }
1041
1042 # Save results
1043 return(data_new)
1044
1045 }
1046
1047
1048 # Main code of make_data_groups()
1049 data_groups <- standardize_fields(data_original)
1050
1051 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_input_score",
1052 data_groups = data_groups,
1053 vec.length.out = 14,
1054 vec.tolerance = 1,
1055 value_format = "03d")
1056
1057 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_pmd",
1058 data_groups = data_groups,
1059 vec.length.out = 21,
1060 vec.tolerance = 0.1,
1061 value_format = "+05.1f")
1062
1063 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_peptide_length",
1064 data_groups = data_groups,
1065 vec.length.out = 11,
1066 vec.tolerance = 1,
1067 value_format = "02d")
1068
1069 # data_groups <- add_grouped_variable(field_name_to_group = "m_z",
1070 # data_groups = data_groups,
1071 # vec.length.out = 11,
1072 # vec.tolerance = 10,
1073 # value_format = "04.0f")
1074 #
1075 # data_groups <- add_grouped_variable(field_name_to_group = "measured_mass",
1076 # data_groups = data_groups,
1077 # vec.length.out = 11,
1078 # vec.tolerance = 1,
1079 # value_format = "04.0f")
1080 #
1081 # data_groups <- add_already_grouped_variable(field_name_to_group = "isotope_number",
1082 # data_groups = data_groups )
1083 #
1084 # data_groups <- add_already_grouped_variable(field_name_to_group = "charge",
1085 # data_groups = data_groups )
1086 #
1087 data_groups <- add_already_grouped_variable(field_name_to_group = "PMD_FDR_spectrum_file",
1088 data_groups = data_groups )
1089 data_groups <- add_protein_group(data_groups = data_groups)
1090 data_groups <- add_group_training_class( data_groups = data_groups)
1091 data_groups <- add_group_spectrum_index( data_groups = data_groups)
1092 data_groups <- add_median_of_group_index( data_groups = data_groups)
1093 data_groups <- add_value_norm( data_groups = data_groups)
1094
1095 # fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "m_z", "PMD_FDR_peptide_length", "isotope_number", "charge", "PMD_FDR_spectrum_file", "measured_mass", "PMD_FDR_spectrum_index", "PMD_FDR_proteins")
1096 # fields_of_interest <- c("value",
1097 # "PMD_FDR_decoy",
1098 # "PMD_FDR_spectrum_title",
1099 # "median_of_group_index",
1100 # "value_norm",
1101 # "used_to_find_middle",
1102 # "group_training_class",
1103 # fields_of_interest,
1104 # sprintf("group_%s" , fields_of_interest),
1105 # sprintf("group_decoy_%s", fields_of_interest))
1106
1107 fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "PMD_FDR_peptide_length", "PMD_FDR_spectrum_file", "PMD_FDR_spectrum_index", "PMD_FDR_proteins")
1108 fields_of_interest <- c("value",
1109 "PMD_FDR_decoy",
1110 "PMD_FDR_spectrum_title",
1111 "median_of_group_index",
1112 "value_norm",
1113 "used_to_find_middle",
1114 "group_training_class",
1115 fields_of_interest,
1116 sprintf("group_%s" , simplify_field_name(fields_of_interest)),
1117 sprintf("group_decoy_%s", simplify_field_name(fields_of_interest)))
1118
1119 data_groups <- data_groups[,fields_of_interest]
1120 data_groups <- add_1_percent_to_data_groups(data_groups)
1121
1122 return(data_groups)
1123 }
1124
1125 data_original <- get_data_converter()$df #parents[[INDEX_OF_ORIGINAL_DATA]]$df
1126 df <<- make_data_groups(data_original)
1127 },
1128 set_info = function(info){
1129 parents[["info"]] <<- info
1130 },
1131 get_info = function(){
1132 return(verified_element_of_list(parents, "info", "Data_Object_Groupings$parents"))
1133 },
1134 set_data_converter = function(data_converter){
1135 parents[["data_converter"]] <<- data_converter
1136 },
1137 get_data_converter = function(){
1138 return(verified_element_of_list(parents, "data_converter", "Data_Object_Groupings$parents"))
1139 },
1140 set_raw_1_percent = function(raw_1_percent){ ############## BUGBUG: the 1% file should be using the same file type format as the standard data (but isn't)
1141 parents[["raw_1_percent"]] <<- raw_1_percent
1142 },
1143 get_raw_1_percent = function(){
1144 return(verified_element_of_list(parents, "raw_1_percent", "Data_Object_Groupings$parents"))
1145 }
1146 )
1147 ###############################################################################
1148 # Class: Data_Object_Individual_FDR
1149 ###############################################################################
1150 Data_Object_Individual_FDR <- setRefClass("Data_Object_Individual_FDR",
1151 contains = "Data_Object",
1152 fields =list(df = "data.frame"))
1153 Data_Object_Individual_FDR$methods(
1154 initialize = function(){
1155 callSuper()
1156 class_name <<- "Data_Object_Individual_FDR"
1157 },
1158 verify = function(){
1159 data_groups = get_data_groups()$df
1160 densities = get_densities()$df
1161 alpha = get_alpha()$df
1162
1163 check_field_name(data_groups, "data_groups", "value_norm")
1164 check_field_name(data_groups, "data_groups", "group_decoy_input_score")
1165 check_field_name(data_groups, "data_groups", "PMD_FDR_peptide_length")
1166 check_field_name(data_groups, "data_groups", "PMD_FDR_input_score")
1167 check_field_name(alpha, "alpha", "alpha") # BUGBUG: I'm missing a field here...
1168 check_field_name(densities, "densities", "x")
1169 check_field_name(densities, "densities", "t")
1170 check_field_name(densities, "densities", "f")
1171
1172 },
1173 set_data_groups = function(parent){
1174 parents[["data_groups"]] <<- parent
1175 },
1176 get_data_groups = function(){
1177 return(verified_element_of_list(parents, "data_groups", "Data_Object_Individual_FDR$parents"))
1178 },
1179 set_densities = function(parent){
1180 parents[["densities"]] <<- parent
1181 },
1182 get_densities = function(){
1183 return(verified_element_of_list(parents, "densities", "Data_Object_Individual_FDR$parents"))
1184 },
1185 set_alpha = function(parent){
1186 parents[["alpha"]] <<- parent
1187 },
1188 get_alpha = function(){
1189 return(verified_element_of_list(parents, "alpha", "Data_Object_Individual_FDR$parents"))
1190 },
1191 m_load_data = function(){
1192 add_FDR_to_data_groups <- function(data_groups=NULL, densities=NULL, alpha=NULL, field_value=NULL, field_decoy_group=NULL, set_decoy_to_1=FALSE){
1193 # Support functions for add_FDR_to_data_groups()
1194 get_group_fdr <- function(group_stats = NULL, data_groups = NULL, densities=NULL){
1195 group_fdr <- apply(X = densities, MARGIN = 2, FUN = max)
1196 df_group_fdr <- data.frame(group_fdr)
1197 df_group_fdr <- rename_column(df_group_fdr, "group_fdr", "v")
1198 df_group_fdr$group_of_interest <- names(group_fdr)
1199 t <- df_group_fdr[df_group_fdr$group_of_interest == "t", "v"]
1200 f <- df_group_fdr[df_group_fdr$group_of_interest == "f", "v"]
1201 df_group_fdr <- subset(df_group_fdr, !(group_of_interest %in% c("x", "t", "f")))
1202 df_group_fdr$group_fdr <-(df_group_fdr$v - t) / (f - t)
1203
1204 return(df_group_fdr)
1205 }
1206
1207 get_mode <- function(x){
1208 d <- density(x)
1209 return(d$x[which.max(d$y)])
1210 }
1211
1212 # Main code for add_FDR_to_data_groups()
1213
1214 # Set up analysis
1215 data_new <- data_groups
1216 data_new$value_of_interest <- data_new[,field_value]
1217 data_new$group_of_interest <- data_new[,field_decoy_group]
1218
1219 data_subset <- subset(data_new, PMD_FDR_peptide_length >= 11)
1220
1221 # Identify mean PMD_FDR_input_score per group
1222
1223 group_input_score <- aggregate(PMD_FDR_input_score~group_of_interest, data=data_subset, FUN=mean)
1224 group_input_score <- rename_column(group_input_score, "PMD_FDR_input_score", "group_input_score")
1225
1226 #group_fdr <- get_group_fdr(data_groups = data_subset, densities=densities)
1227 group_stats <- merge(alpha, group_input_score)
1228 group_stats <- subset(group_stats, group_of_interest != "PMD_FDR_decoy")
1229
1230 x=c(0,group_stats$group_input_score)
1231 y=c(1,group_stats$alpha)
1232 FUN_interp <- approxfun(x=x,y=y)
1233
1234 data_new$interpolated_groupwise_FDR <- FUN_interp(data_new$PMD_FDR_input_score)
1235 if (set_decoy_to_1){
1236 data_new$interpolated_groupwise_FDR[data_new$PMD_FDR_decoy == 1] <- 1
1237 }
1238
1239 return(data_new)
1240 }
1241
1242 data_groups = get_data_groups()$df
1243 densities = get_densities()$df
1244 alpha = get_alpha()$df
1245
1246 d_true <- densities[,c("x", "t")]
1247 d_false <- densities[,c("x", "f")]
1248
1249 i_fdr <- add_FDR_to_data_groups(data_groups = data_groups,
1250 densities = densities,
1251 alpha = alpha,
1252 field_value ="value_norm",
1253 field_decoy_group = "group_decoy_input_score")
1254 # Derive local t
1255 interp_t <- splinefun(x=d_true$x, y=d_true$t) #approxfun(x=d_true$x, y=d_true$y)
1256
1257 # Derive local f
1258 interp_f <- splinefun(x=d_false$x, y=d_false$f) #approxfun(x=d_true$x, y=d_true$y)
1259
1260 # Derive local FDR
1261 i_fdr$t <- interp_t(i_fdr$value_of_interest)
1262 i_fdr$f <- interp_f(i_fdr$value_of_interest)
1263 i_fdr$alpha <- i_fdr$interpolated_groupwise_FDR
1264 i_fdr$i_fdr <- with(i_fdr, (alpha*f) / (alpha*f + (1-alpha)*t))
1265
1266 df <<- i_fdr
1267
1268 }
1269 )
1270 ###############################################################################
1271 # Class: Data_Object_Densities
1272 ###############################################################################
1273 Data_Object_Densities <- setRefClass("Data_Object_Densities",
1274 contains = "Data_Object",
1275 fields =list(df = "data.frame"))
1276 Data_Object_Densities$methods(
1277 initialize = function(){
1278 callSuper()
1279 class_name <<- "Data_Object_Densities"
1280 },
1281 verify = function(){
1282 df_data_groups <- get_data_groups()$df
1283
1284 checkTrue(nrow(df_data_groups) > 0,
1285 msg = "data_groups data frame was empty (and should not have been)")
1286
1287 check_field_name(df_data_groups, "data_groups", "value_norm")
1288 check_field_name(df_data_groups, "data_groups", "group_decoy_input_score")
1289 check_field_name(df_data_groups, "data_groups", "group_training_class")
1290 },
1291 set_data_groups = function(parent=NULL){
1292 parents[["data_groups"]] <<- parent
1293 },
1294 get_data_groups = function(){
1295 return(verified_element_of_list(parent_list = parents, element_name = "data_groups", object_name = "Data_Object_Densities$parents"))
1296 },
1297 m_load_data = function(){
1298
1299 # Support functions for make_densities()
1300 set_values_of_interest <- function(df_data_groups=NULL, field_group = NULL){
1301 field_value = "value_norm"
1302
1303 new_data_groups <- get_data_groups()$df
1304 new_data_groups$value_of_interest <- new_data_groups[,field_value]
1305 new_data_groups$group_of_interest <- new_data_groups[,field_group]
1306 #groups <- sort(unique(new_data_groups$group_of_interest))
1307
1308 return(new_data_groups)
1309 }
1310 get_ylim <- function(data_groups=NULL){
1311 ylim <- range(data_groups$value_of_interest, na.rm = TRUE)
1312 return(ylim)
1313 }
1314 make_hit_density <- function(data_subset=NULL, descr_of_df=NULL, ylim=NULL){
1315 #stop("Data_Object_Densities$make_hit_density() is untested beyond here")
1316 verify_density = function(data_subset=NULL, value_field=NULL, descr_of_df=NULL, ylim=NULL){
1317 values <- data_subset[value_field]
1318 values <- values[! is.na(values)]
1319 if (length(values) < MIN_ACCEPTABLE_POINTS_IN_DENSITY){
1320 stop (sprintf("There are too few valid %s (%d < %d) in %s to be used for calculating a density function",
1321 value_field,
1322 length(values),
1323 MIN_ACCEPTABLE_POINTS_IN_DENSITY,
1324 descr_of_df))
1325 }
1326 d <- density(values, from = ylim[1], to = ylim[2])
1327
1328 return(d)
1329 }
1330 uniformalize_density <- function(d){
1331 # Reorganizes y-values of density function so that
1332 # function is monotone increasing to mode
1333 # and monotone decreasing afterwards
1334 idx_mode <- which.max(d$y)
1335 idx_lower <- 1:(idx_mode-1)
1336 idx_upper <- idx_mode:length(d$y)
1337
1338 values_lower <- d$y[idx_lower]
1339 values_upper <- d$y[idx_upper]
1340
1341 new_d <- d
1342 new_d$y <- c(sort(values_lower, decreasing = FALSE),
1343 sort(values_upper, decreasing = TRUE))
1344
1345 return(new_d)
1346 }
1347
1348 local_df <- subset(data_subset,
1349 (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) &
1350 (used_to_find_middle == FALSE))
1351 d <- verify_density (data_subset=local_df, value_field = "value_of_interest", descr_of_df = descr_of_df, ylim=ylim)
1352 d <- normalize_density (d)
1353 d <- uniformalize_density(d)
1354
1355 return(d)
1356 }
1357 make_true_hit_density <- function(data_groups=NULL){
1358 d_true <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "good_testing") ),
1359 descr_of_df = "Good-testing dataset",
1360 ylim = get_ylim(data_groups))
1361 return(d_true)
1362 }
1363 make_false_hit_density <- function(data_groups=NULL){
1364 d_false <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "bad_long") ),
1365 descr_of_df = "Bad-long dataset",
1366 ylim = get_ylim(data_groups))
1367
1368 return(d_false)
1369 }
1370 add_v_densities <- function(data_groups=NULL, densities=NULL, field_group = NULL){
1371 groups <- sort(unique(data_groups$group_of_interest))
1372
1373 new_densities <- densities
1374
1375 for (local_group in groups){
1376 d_v <- make_hit_density(data_subset = subset(data_groups, (group_of_interest == local_group)),
1377 descr_of_df = sprintf("subset of data (where %s is '%s')",
1378 field_group,
1379 local_group),
1380 ylim = get_ylim(data_groups))
1381 new_densities[local_group] <- d_v$y
1382 }
1383
1384 return(new_densities)
1385 }
1386
1387 # Main section of make_densities()
1388 df_data_groups <- get_data_groups()$df
1389 new_data_groups <- set_values_of_interest(df_data_groups, field_group = "group_decoy_input_score")
1390 d_true <- make_true_hit_density( new_data_groups)
1391 d_false <- make_false_hit_density(new_data_groups)
1392
1393 densities <- data.frame(x=d_true$x,
1394 t=d_true$y,
1395 f=d_false$y)
1396 densities <- add_v_densities(data_groups=new_data_groups, densities=densities, field_group = "group_decoy_input_score")
1397 df <<- densities
1398 }
1399 )
1400 ###############################################################################
1401 # Class: Data_Object_Alpha
1402 ###############################################################################
1403 Data_Object_Alpha <- setRefClass("Data_Object_Alpha",
1404 contains = "Data_Object",
1405 fields =list(df = "data.frame"))
1406 Data_Object_Alpha$methods(
1407 initialize = function(){
1408 callSuper()
1409 class_name <<- "Data_Object_Alpha"
1410 },
1411 verify = function(){
1412 densities <- get_densities()$df
1413
1414 checkTrue(nrow(densities) > 0,
1415 msg = "Densities data.frame was empty (and should not have been)")
1416 },
1417 set_densities = function(parent=NULL){
1418 parents[["densities"]] <<- parent
1419 },
1420 get_densities = function(){
1421 return(verified_element_of_list(parent_list = parents, element_name = "densities", object_name = "Data_Object_Alpha"))
1422 },
1423 m_load_data = function(){
1424
1425 densities <- get_densities()$df
1426
1427 max_of_density = apply(X = densities, MARGIN = 2, FUN = max)
1428 df_alpha <- data.frame(stringsAsFactors = FALSE,
1429 list(v = max_of_density,
1430 group_of_interest = names(max_of_density)))
1431 df_alpha <- subset(df_alpha, group_of_interest != "x")
1432 t <- with(subset(df_alpha, group_of_interest=="t"), v)
1433 f <- with(subset(df_alpha, group_of_interest=="f"), v)
1434 df_alpha$alpha <- with(df_alpha, (t-v)/(t-f))
1435
1436 alpha <- df_alpha[,c("group_of_interest", "alpha")]
1437 alpha <- subset(alpha, (group_of_interest != "t") & (group_of_interest != "f"))
1438
1439 df <<- alpha
1440 }
1441 )
1442 ###############################################################################
1443 # Class: Data_Processor
1444 ###############################################################################
1445 Data_Processor <- setRefClass("Data_Processor",
1446 fields =list(info = "Data_Object_Info",
1447 raw_data = "Data_Object_Raw_Data",
1448 raw_1_percent = "Data_Object_Raw_1_Percent",
1449 data_converter = "Data_Object_Data_Converter",
1450 data_groups = "Data_Object_Groupings",
1451 densities = "Data_Object_Densities",
1452 alpha = "Data_Object_Alpha",
1453 i_fdr = "Data_Object_Individual_FDR"))
1454 Data_Processor$methods(
1455 initialize = function(p_info=NULL){
1456 if (! is.null(p_info)){
1457 set_info(p_info)
1458 }
1459 },
1460 set_info = function(p_info=NULL){
1461 # This initialization defines all of the dependencies between the various components
1462
1463 info <<- p_info
1464
1465 # raw_data
1466 raw_data$set_info(info)
1467 info$append_child(raw_data)
1468
1469 # raw_1_percent
1470 raw_1_percent$set_info(info)
1471 info$append_child(raw_1_percent)
1472
1473 # data_converter
1474 data_converter$set_info (info)
1475 data_converter$set_raw_data(raw_data)
1476 info $append_child (data_converter)
1477 raw_data $append_child (data_converter)
1478
1479 # data_groups
1480 data_groups$set_info (info)
1481 data_groups$set_data_converter(data_converter)
1482 data_groups$set_raw_1_percent (raw_1_percent)
1483 info $append_child (data_groups)
1484 data_converter$append_child (data_groups)
1485 raw_1_percent $append_child (data_groups)
1486
1487 # densities
1488 densities $set_data_groups(data_groups)
1489 data_groups$append_child (densities)
1490
1491 # alpha
1492 alpha $set_densities(densities)
1493 densities$append_child (alpha)
1494
1495 # i_fdr
1496 i_fdr$set_data_groups(data_groups)
1497 i_fdr$set_densities (densities)
1498 i_fdr$set_alpha (alpha)
1499 data_groups $append_child(i_fdr)
1500 densities $append_child(i_fdr)
1501 alpha $append_child(i_fdr)
1502 }
1503 )
1504
1505
1506 #############################################################
1507 ####### Classes for Plotting
1508 #############################################################
1509
1510 ###############################################################################
1511 # Class: Plot_Image
1512 ###############################################################################
1513 Plot_Image = setRefClass("Plot_Image",
1514 fields = list(data_processors = "list",
1515 plot_title = "character",
1516 include_text = "logical",
1517 include_main = "logical",
1518 x.intersp = "numeric",
1519 y.intersp = "numeric",
1520 scale = "numeric",
1521 main = "character",
1522 is_image_container = "logical"))
1523 Plot_Image$methods(
1524 initialize = function(p_data_processors = list(),
1525 p_include_main = TRUE,
1526 p_include_text = TRUE,
1527 p_is_image_container = FALSE){
1528 include_main <<- p_include_main
1529 include_text <<- p_include_text
1530 data_processors <<- p_data_processors
1531 is_image_container <<- p_is_image_container
1532 },
1533 plot_image = function(){
1534 plot(main="Define plot_image() for subclass") # Abstract function
1535 },
1536 get_n = function(){
1537 stop("Need to define function get_n() for subclass") #Abstract function
1538 },
1539 create_standard_main = function(){
1540 needs_main <- function(){
1541 return(include_text & include_main & !is_image_container)
1542 }
1543 if (needs_main()){
1544 collection_name <- data_processors[[1]]$info$collection_name()
1545 main <<- sprintf("%s\n(Dataset: %s; n=%s)", plot_title, collection_name, format(get_n(), big.mark = ","))
1546 }
1547 },
1548 plot_image_in_window = function(p_scale=NULL, window_height=NULL, window_width=NULL){
1549 scale <<- p_scale
1550 SIZE_AXIS <- 2.5 * scale # in the units used by mar
1551 SIZE_MAIN <- 2.5 * scale
1552 SIZE_NO_MARGIN <- 0.1 * scale
1553 FONT_SIZE <- 8 * scale
1554 WINDOW_WIDTH <- window_width * scale
1555 WINDOW_HEIGHT <- window_height * scale
1556 X_INTERSP <- 0.5 * scale + 0.4 # manages legend text spacing
1557 Y_INTERSP <- 0.5 * scale + 0.4 # manages
1558
1559 if (include_main){
1560 mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_MAIN , SIZE_NO_MARGIN)
1561 } else {
1562 mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_NO_MARGIN, SIZE_NO_MARGIN)
1563 }
1564 mgp = c(SIZE_AXIS/2, SIZE_AXIS/4, 0) # Margin line (mex units) for axis title, axis labels, axis lines
1565 ps = FONT_SIZE
1566 x.intersp <<- X_INTERSP
1567 y.intersp <<- Y_INTERSP
1568
1569 windows(width = WINDOW_WIDTH, height=WINDOW_HEIGHT)
1570
1571 old_par <- par(mar=mar, ps=ps, mgp=mgp)
1572 create_standard_main()
1573
1574 plot_image()
1575 if (!is_image_container){
1576 axis(side=1, labels=include_text, tcl=-0.5, lwd=scale)
1577 axis(side=2, labels=include_text, tcl=-0.5, lwd=scale)
1578 box(lwd=scale)
1579 }
1580 par(old_par)
1581 },
1582 plot_image_in_small_window = function(p_scale=1){
1583 plot_image_in_window(p_scale=p_scale, window_height=2, window_width=3.25)
1584 },
1585 plot_image_in_large_window = function(p_scale=1, window_height=NULL){
1586 plot_image_in_window(p_scale=p_scale, window_height=window_height, window_width=7)
1587 }
1588 )
1589 ###############################################################################
1590 # Class: Legend_Object
1591 ###############################################################################
1592 Legend_Object = setRefClass("Legend_Object",
1593 contains = "Plot_Image",
1594 fields = list(user_params = "list",
1595 scale = "numeric"))
1596 Legend_Object$methods(
1597 initialize = function(p_user_params = NULL, p_scale = NULL){
1598 if (is.null(p_user_params)){
1599 user_params <<- list()
1600 } else {
1601 user_params <<- p_user_params
1602 }
1603 if (is.null(p_scale)){
1604 stop("Legend_Object must have a valid scale")
1605 } else {
1606 scale <<- p_scale
1607 }
1608 user_params$x <<- if_null(user_params$x , "topleft", user_params$x)
1609 user_params$y <<- if_null(user_params$y , NULL, user_params$y)
1610 user_params$bty <<- if_null(user_params$bty , "o", user_params$bty)
1611 user_params$lwd <<- if_null(user_params$lwd , NULL, user_params$lwd * scale) # Because we allow NULL, scale must be inside parens
1612 user_params$seg.len <<- if_null(user_params$seg.len , 3, user_params$seg.len ) * scale
1613 user_params$box.lwd <<- if_null(user_params$box.lwd , 1, user_params$box.lwd ) * scale
1614 user_params$x.intersp <<- if_null(user_params$x.intersp, 0.6, user_params$x.intersp) * scale
1615 user_params$y.intersp <<- if_null(user_params$y.intersp, 0.4, user_params$y.intersp) * scale + 0.2
1616 },
1617 show = function(){
1618 first_legend = legend(x = user_params$x,
1619 y = user_params$y,
1620 title = "",
1621 legend = user_params$leg,
1622 col = user_params$col,
1623 bty = user_params$bty,
1624 lty = user_params$lty,
1625 lwd = user_params$lwd,
1626 seg.len = user_params$seg.len,
1627 box.lwd = user_params$box.lwd,
1628 x.intersp = user_params$x.intersp,
1629 y.intersp = user_params$y.intersp)
1630 new_x = first_legend$rect$left
1631 new_y = first_legend$rect$top + first_legend$rect$h * ifelse(scale==1, 0.07, 0.03 - (scale * 0.02)) #switch(scale, 0.01, -0.01, -0.03, -0.05)# (0.07 - 0.09 * ((scale-1)^2))#(0.15 - 0.08*scale)#.07 * (2 - scale)
1632 legend(x=new_x, y=new_y, title = user_params$title, legend = "", cex=1.15, bty="n")
1633
1634 }
1635 )
1636 ###############################################################################
1637 # Class: Plot_Multiple_Images
1638 ###############################################################################
1639 Plot_Multiple_Images = setRefClass("Plot_Multiple_Images",
1640 contains = "Plot_Image",
1641 fields = list(n_images_wide = "numeric",
1642 n_images_tall = "numeric",
1643 image_list = "list"))
1644 Plot_Multiple_Images$methods(
1645 initialize = function(p_n_images_wide=1, p_n_images_tall=2, p_image_list=NULL, ...){
1646 n_images_wide <<- p_n_images_wide
1647 n_images_tall <<- p_n_images_tall
1648 image_list <<- p_image_list
1649 #plot_title <<- "True Hit and False Hit Distributions"
1650
1651 callSuper(p_is_image_container=TRUE, ...)
1652 },
1653 plot_image = function(){
1654 # Support functions
1655 apply_mtext <- function(letter=NULL){
1656 line=1.3*scale
1657 mtext(letter, side=1, line=line, adj=0)
1658 }
1659 # main code
1660 old_par <- par(mfrow=c(n_images_tall, n_images_wide))
1661 i=0
1662 n_images <- length(image_list)
1663
1664 for (i in 1:n_images){
1665 image <- image_list[[i]]
1666 image$create_standard_main()
1667 image$scale <- scale
1668 image$plot_image()
1669 axis(side=1, labels=include_text, tcl=-0.5, lwd=scale)
1670 axis(side=2, labels=include_text, tcl=-0.5, lwd=scale)
1671 box(lwd=scale)
1672 apply_mtext(letter=sprintf("(%s)", letters[i]))
1673
1674 }
1675 par(old_par)
1676
1677 }
1678 )
1679 ###############################################################################
1680 # Class: Plot_Compare_PMD_and_Norm_Density
1681 ###############################################################################
1682 Plot_Compare_PMD_and_Norm_Density = setRefClass("Plot_Compare_PMD_and_Norm_Density",
1683 contains = "Plot_Image",
1684 fields = list(show_norm = "logical",
1685 display_n_psms = "logical"))
1686 Plot_Compare_PMD_and_Norm_Density$methods(
1687 initialize = function(p_show_norm=TRUE, p_display_n_psms=TRUE, ...){
1688 show_norm <<- p_show_norm
1689 display_n_psms <<- p_display_n_psms
1690 plot_title <<- "True Hit and False Hit Distributions"
1691
1692 callSuper(...)
1693 },
1694 plot_image = function(){
1695 # Support functions for plot_compare_PMD_and_norm_density()
1696
1697 get_densities <- function(data_subset = NULL, var_value = NULL){
1698 data_subset$value_of_interest <- data_subset[,var_value]
1699 from <- min(data_subset$value_of_interest, na.rm = TRUE)
1700 to <- max(data_subset$value_of_interest, na.rm = TRUE)
1701 xlim = range(data_subset$value_of_interest)
1702 data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100))
1703 data_false <- subset(data_subset, (PMD_FDR_decoy==1))
1704
1705 d_true <- with(data_true , density(value_of_interest, from = from, to = to, na.rm = TRUE))
1706 d_false <- with(data_false, density(value_of_interest, from = from, to = to, na.rm = TRUE))
1707 d_true <- normalize_density(d_true)
1708 d_false <- normalize_density(d_false)
1709
1710 densities <- list(d_true=d_true, d_false=d_false, var_value = var_value, n_true = nrow(data_true), n_false = nrow(data_false))
1711
1712 return(densities)
1713 }
1714 get_xlim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){
1715 xlim <- range(c( densities_a$d_true$x, densities_a$d_false$y))
1716 if (show_norm){
1717 xlim <- range(c(xlim, densities_b$d_true$x, densities_b$d_false$y))
1718 }
1719 return(xlim)
1720 }
1721 get_ylim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){
1722 ylim <- range(c( densities_a$d_true$y, densities_a$d_false$y))
1723 if (show_norm){
1724 ylim <- range(c(ylim, densities_b$d_true$y, densities_b$d_false$y))
1725 }
1726 return(ylim)
1727 }
1728 plot_distributions <- function(densities = NULL, var_value= NULL, dataset_name = NULL, ...){
1729 leg = list()
1730 leg$leg = c("Good", "Bad")
1731 if (display_n_psms){
1732 leg$leg = sprintf("%s (%d PSMs)",
1733 leg$leg,
1734 c(densities$n_true, densities$n_false))
1735
1736 }
1737 leg$col = c("black", "red")
1738 leg$lwd = c(3 , 3)
1739 leg$lty = c(1 , 2)
1740 leg$title = "Hit Category"
1741 xlab = ifelse(var_value == "value",
1742 "PMD (ppm)",
1743 "PMD - normalized (ppm)")
1744 ylab = "Density"
1745 if (!include_text){
1746 xlab = ""
1747 ylab = ""
1748 }
1749 plot( densities$d_true , col=leg$col[1], lwd=leg$lwd[1] * scale, lty=leg$lty[1], xaxt = "n", yaxt = "n", main=main, xlab = xlab, ylab=ylab, ...)
1750 lines(densities$d_false, col=leg$col[2], lwd=leg$lwd[2] * scale, lty=leg$lty[2])
1751 abline(v=0, h=0, col="gray", lwd=1*scale)
1752 if (include_text){
1753 legend_object <- Legend_Object$new(leg, scale)
1754 legend_object$show()
1755 #legend("topleft", legend=leg.leg, col=leg.col, lwd=leg.lwd, lty=leg.lty, x.intersp = x.intersp, y.intersp = y.intersp)
1756 }
1757 }
1758
1759 # Main code block for plot_compare_PMD_and_norm_density
1760 data_processor <- data_processors[[1]]
1761 data_processor$data_groups$ensure()
1762 data_groups <- data_processor$data_groups$df
1763
1764 data_subset_a <- subset(data_groups , used_to_find_middle == FALSE)
1765 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
1766
1767 densities_a <- get_densities(data_subset = data_subset_a, var_value = "value")
1768 densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm")
1769
1770 xlim=get_xlim(densities_a, densities_b, show_norm = show_norm)
1771 ylim=get_ylim(densities_a, densities_b, show_norm = show_norm)
1772
1773 dataset_name <- data_processor$info$collection_name
1774 plot_distributions( densities=densities_a, var_value = "value" , dataset_name = dataset_name, xlim=xlim, ylim=ylim)
1775 if (show_norm){
1776 plot_distributions(densities=densities_b, var_value = "value_norm", dataset_name = dataset_name, xlim=xlim, ylim=ylim)
1777 }
1778 },
1779 get_n = function(){
1780 data_processor <- data_processors[[1]]
1781 data_processor$data_groups$ensure()
1782 data_subset_a <- subset(data_processor$data_groups$df , used_to_find_middle == FALSE)
1783 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
1784
1785 if (show_norm){
1786 data_subset <- data_subset_a
1787 } else {
1788 data_subset <- data_subset_b
1789 }
1790
1791 data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100))
1792 data_false <- subset(data_subset, (PMD_FDR_decoy==1))
1793
1794 return(nrow(data_true) + nrow(data_false))
1795 }
1796 )
1797
1798 ###############################################################################
1799 # Class: Plot_Time_Invariance_Alt
1800 ###############################################################################
1801 Plot_Time_Invariance_Alt = setRefClass("Plot_Time_Invariance_Alt",
1802 contains = "Plot_Image",
1803 fields = list(show_norm = "logical",
1804 display_n_psms = "logical",
1805 training_class = "character",
1806 ylim = "numeric",
1807 field_of_interest = "character"))
1808 Plot_Time_Invariance_Alt$methods(
1809 initialize = function(p_ylim=NULL, p_training_class=NULL, p_field_of_interest="value_norm", ...){
1810 get_subset_title <- function(training_class=NULL){
1811 if (training_class == "bad_long"){
1812 subset_title="bad only"
1813 } else if (training_class == "good_testing"){
1814 subset_title="good-testing only"
1815 } else if (training_class == "good_training"){
1816 subset_title="good-training only"
1817 } else if (training_class == "other"){
1818 subset_title="other only"
1819 } else {
1820 stop("Unexpected training_class in plot_time_invariance")
1821 }
1822 return(subset_title)
1823 }
1824
1825 ylim <<- p_ylim
1826 training_class <<- p_training_class
1827 field_of_interest <<- p_field_of_interest
1828 subset_title <- get_subset_title(training_class=training_class)
1829 backup_title <- sprintf("Middle 25%% PMD for spectra sorted by index%s",
1830 ifelse(is.null(subset_title),
1831 "",
1832 sprintf(" - %s", subset_title)))
1833 #plot_title <<- get_main(main_title=main, backup_title=backup_title, data_collection = data_collection)
1834 plot_title <<- backup_title
1835
1836 callSuper(...)
1837 },
1838 plot_image = function(){
1839 # Support functions for plot_time_invariance()
1840
1841 # Main code of plot_time_invariance()
1842 data_subset = get_data_subset()
1843 plot_group_spectrum_index_from_subset_boxes(data_subset = data_subset)
1844 abline(h=0, col="blue", lwd=scale)
1845 },
1846 get_data_subset = function(){
1847 data_processor <- data_processors[[1]]
1848 data_processor$data_groups$ensure()
1849 return(subset(data_processor$data_groups$df, (group_training_class==training_class)))
1850 },
1851 get_n = function(){
1852 return(nrow(get_data_subset()))
1853 },
1854 plot_group_spectrum_index_from_subset_boxes = function(data_subset = NULL){
1855 n_plot_groups <- 100
1856
1857 field_name_text <- ifelse(field_of_interest=="value", "PMD", "Translated PMD")
1858 new_subset <- data_subset
1859 new_subset$value_of_interest <- new_subset[,field_of_interest]
1860 new_subset <- new_subset[order(new_subset$PMD_FDR_spectrum_index),]
1861
1862 idxs <- round_to_tolerance(seq(from=1, to=nrow(new_subset), length.out = n_plot_groups+1), 1)
1863 idxs_left <- idxs[-(n_plot_groups+1)]
1864 idxs_right <- idxs[-1] - 1
1865 idxs_right[n_plot_groups] <- idxs_right[n_plot_groups] + 1
1866
1867 new_subset$plot_group <- NA
1868 for (i in 1:n_plot_groups){
1869 new_subset$plot_group[idxs_left[i]:idxs_right[i]] <- i
1870 }
1871 xleft <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=min)
1872 xright <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=max)
1873 ybottom <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 - (0.25/2))})
1874 ytop <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 + (0.25/2))})
1875 boxes <- merge( rename_column(xleft , "PMD_FDR_spectrum_index" , "xleft"),
1876 merge( rename_column(xright , "PMD_FDR_spectrum_index" , "xright"),
1877 merge(rename_column(ybottom, "value_of_interest", "ybottom"),
1878 rename_column(ytop , "value_of_interest", "ytop"))))
1879
1880 xlab <- "Spectrum Index"
1881 ylab <- sprintf("%s (ppm)", field_name_text )
1882 if (is.null(ylim)){
1883 ylim <<- range(new_subset$value_of_interest)
1884 }
1885 if (!include_text){
1886 xlab=""
1887 ylab=""
1888 }
1889 plot(value_of_interest~PMD_FDR_spectrum_index, data=new_subset, type="n", ylim=ylim, xlab = xlab, ylab=ylab, main=main, xaxt="n", yaxt="n")
1890 with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale))
1891 #points(median_of_group_index~PMD_FDR_spectrum_index, data=data_subset, cex=.5, pch=15)
1892 axis(1, labels=include_text, lwd=scale)
1893 axis(2, labels=include_text, lwd=scale)
1894 box(lwd=scale) #box around plot area
1895 }
1896
1897 )
1898 ###############################################################################
1899 # Class: Plot_Time_Invariance_Alt_Before_and_After
1900 ###############################################################################
1901 Plot_Time_Invariance_Alt_Before_and_After = setRefClass("Plot_Time_Invariance_Alt_Before_and_After",
1902 contains = "Plot_Multiple_Images",
1903 fields = list())
1904 Plot_Time_Invariance_Alt_Before_and_After$methods(
1905 initialize = function(p_data_processors = NULL,
1906 p_include_text=TRUE,
1907 p_include_main=FALSE,
1908 p_ylim = c(-4,4), ...){
1909 plot_object1 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors,
1910 p_include_text=p_include_text,
1911 p_include_main=p_include_main,
1912 p_training_class = "good_testing",
1913 p_field_of_interest = "value",
1914 p_ylim = p_ylim)
1915
1916 plot_object2 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors,
1917 p_include_text=p_include_text,
1918 p_include_main=p_include_main,
1919 p_training_class = "good_testing",
1920 p_field_of_interest = "value_norm",
1921 p_ylim = p_ylim)
1922
1923 callSuper(p_n_images_wide=1,
1924 p_n_images_tall=2,
1925 p_include_text=p_include_text,
1926 p_include_main=p_include_main,
1927 p_image_list = list(plot_object1, plot_object2), ...)
1928 }
1929 )
1930
1931 ###############################################################################
1932 # Class: Plot_Density_PMD_and_Norm_Decoy_by_AA_Length
1933 ###############################################################################
1934 Plot_Density_PMD_and_Norm_Decoy_by_AA_Length = setRefClass("Plot_Density_PMD_and_Norm_Decoy_by_AA_Length",
1935 contains = "Plot_Image",
1936 fields = list(show_norm = "logical"))
1937 Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$methods(
1938 initialize = function(p_show_norm=FALSE, ...){
1939 plot_title <<- "The Decoy Bump: PMD Distribution of Decoy matches by peptide length"
1940 show_norm <<- p_show_norm
1941 callSuper(...)
1942 },
1943 get_n = function(){
1944 data_processor <- data_processors[[1]]
1945 data_processor$data_groups$ensure()
1946 data_subset <- subset(data_processor$data_groups$df, (PMD_FDR_decoy == 1))
1947 return(nrow(data_subset))
1948 },
1949 plot_image = function(){
1950
1951 # Support functions for plot_density_PMD_and_norm_decoy_by_aa_length()
1952
1953 add_group_peptide_length_special <- function(){
1954 data_processor <- data_processors[[1]]
1955 data_processor$data_groups$ensure()
1956 data_groups <- data_processor$data_groups$df # the name data_groups is a data.frame instead of a Data_Object
1957 data_groups <- subset(data_groups, used_to_find_middle == FALSE)
1958
1959 df_group_definition <- data.frame(stringsAsFactors = FALSE,
1960 list(group_peptide_length_special = c("06-08", "09-10", "11-12", "13-15", "16-20", "21-50"),
1961 min = c( 6 , 9 , 11 , 13 , 16 , 21 ),
1962 max = c( 8 , 10 , 12 , 15 , 20 , 50 ) ))
1963 group_peptide_length_special <- data.frame(list(PMD_FDR_peptide_length = 6:50))
1964 group_peptide_length_special$min <- with(group_peptide_length_special, sapply(PMD_FDR_peptide_length, FUN = function(i) max(df_group_definition$min[df_group_definition$min <= i])))
1965 group_peptide_length_special <- merge(group_peptide_length_special, df_group_definition)
1966
1967 data_groups$group_peptide_length_special <- NULL
1968 new_data_groups <- (merge(data_groups,
1969 group_peptide_length_special[,c("PMD_FDR_peptide_length",
1970 "group_peptide_length_special")]))
1971 return(new_data_groups)
1972 }
1973 get_densities <- function(data_subset = NULL, field_value = NULL, field_group=NULL){
1974 get_density_from_subset <- function(data_subset=NULL, xlim=NULL){
1975
1976 d_group <- with(data_subset , density(value_of_interest, from = xlim[1], to = xlim[2], na.rm=TRUE))
1977 d_group <- normalize_density(d_group)
1978
1979 return(d_group)
1980 }
1981
1982 data_temp <- data_subset
1983 data_temp$value_of_interest <- data_temp[[field_value]]
1984 data_temp$group_of_interest <- data_temp[[field_group]]
1985
1986 xlim = range(data_temp$value_of_interest, na.rm=TRUE)
1987
1988 groups <- sort(unique(data_temp$group_of_interest))
1989 n_groups <- length(groups)
1990
1991 d_group <- get_density_from_subset( data_subset=data_temp, xlim = xlim )
1992 densities <- list("All decoys" = d_group)
1993 for (i in 1:n_groups){
1994 group <- groups[i]
1995
1996 d_group <- get_density_from_subset( data_subset=subset(data_temp, (group_of_interest == group)),
1997 xlim = xlim )
1998 densities[[group]] <- d_group
1999 }
2000
2001 return(densities)
2002 }
2003 get_limits <- function(densities_a = NULL, densities_b = NULL){
2004 xlim = c()
2005 ylim = c(0)
2006 for (single_density in densities_a){
2007 xlim=range(c(xlim, single_density$x))
2008 ylim=range(c(ylim, single_density$y))
2009 }
2010 for (single_density in densities_b){
2011 xlim=range(c(xlim, single_density$x))
2012 ylim=range(c(ylim, single_density$y))
2013 }
2014
2015 return(list(xlim=xlim, ylim=ylim))
2016 }
2017 plot_distributions <- function(data_groups = NULL, xlim=NULL, ylim=NULL, densities = NULL, field_group= NULL, field_value = "value", xlab_modifier = "", var_value= NULL, include_peak_dots=TRUE, dataset_name = NULL, ...){
2018 data_groups$group_of_interest <- data_groups[[field_group]]
2019 data_groups$value_of_interest <- data_groups[[field_value]]
2020
2021 # Main body of plot_decoy_distribution_by_field_of_interest()
2022 FIXED_LWD=3
2023
2024 groups <- sort(unique(data_groups$group_of_interest))
2025 n <- length(groups)
2026
2027 df_leg <- data.frame(stringsAsFactors = FALSE,
2028 list(leg = groups,
2029 col = rainbow_with_fixed_intensity(n = n, goal_intensity_0_1 = 0.4),
2030 lty = rep(1:6, length.out=n),
2031 lwd = rep(FIXED_LWD , n)) )
2032
2033 d <- densities[["All decoys"]]
2034
2035 xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", xlab_modifier)
2036 ylab = "Density"
2037
2038 if (!include_text){
2039 xlab=""
2040 ylab=""
2041 }
2042 plot(d, lwd=FIXED_LWD * scale, main=main, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n")
2043
2044 ave_peak <- max(d$y)
2045 max_peak <- 0
2046
2047 for (local_group in groups){
2048 data_subset <- subset(data_groups, group_of_interest == local_group)
2049 data_info <- subset(df_leg , leg == local_group)
2050 col <- data_info$col[1]
2051 lty <- data_info$lty[1]
2052 lwd <- data_info$lwd[1]
2053 if (nrow(data_subset) > 100){
2054 d <- densities[[local_group]] #density(data_subset[[field_value]])
2055 lines(d, col=col, lty=lty, lwd=lwd * scale)
2056 peak <- max(d$y)
2057 max_peak <- max(max_peak, peak)
2058 }
2059 }
2060 abline(v=0, h=0, lwd=scale)
2061 leg <- list(title = "Peptide length (aa)",
2062 leg = c("All decoys" , df_leg$leg),
2063 col = c(col2hex("black") , df_leg$col),
2064 lty = c(1 , df_leg$lty),
2065 lwd = c(FIXED_LWD , df_leg$lwd)
2066 )
2067 if (include_text){
2068 legend_object = Legend_Object$new(leg, scale)
2069 legend_object$show()
2070 #first_legend = legend(x="topleft", title = "", legend = leg$leg, col = leg$col, lty = leg$lty, lwd = leg$lwd, seg.len=leg$seg.len, box.lwd=leg$box.lwd, x.intersp = leg$x.intersp, y.intersp = leg$y.intersp)
2071 #new_x = first_legend$rect$left
2072 #new_y = first_legend$rect$top + first_legend$rect$h * .07 * (2 - scale)
2073 #legend(x=new_x, y=new_y, title = leg$title, legend = "", cex=1.15, bty="n")
2074 }
2075
2076 box(lwd=scale) #box around plot area
2077
2078 }
2079
2080 # Main body for plot_density_PMD_and_norm_decoy_by_aa_length()
2081
2082 data_mod <- add_group_peptide_length_special()
2083 data_mod <- subset(data_mod, PMD_FDR_decoy==1)
2084
2085 densities_a <- get_densities(data_subset = data_mod, field_value = "value" , field_group = "group_peptide_length_special")
2086 densities_b <- get_densities(data_subset = data_mod, field_value = "value_norm", field_group = "group_peptide_length_special")
2087
2088 data_processor <- data_processors[[1]]
2089 dataset_name <- data_processor$info$collection_name()
2090
2091 limits <- get_limits(densities_a, densities_b)
2092 xlim <- limits$xlim
2093 ylim <- limits$ylim
2094
2095 if (show_norm){
2096 plot_distributions(data_groups = data_mod, densities=densities_b, field_value = "value_norm", xlab_modifier = " - normalized", field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim)
2097 } else {
2098 plot_distributions(data_groups = data_mod, densities=densities_a, field_value = "value" , xlab_modifier = "" , field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim)
2099 }
2100 }
2101
2102 )
2103
2104 ###############################################################################
2105 # Class: Plot_Bad_CI
2106 ###############################################################################
2107 Plot_Bad_CI = setRefClass("Plot_Bad_CI",
2108 contains = "Plot_Image",
2109 fields = list(breaks = "numeric",
2110 ylim = "numeric"))
2111 Plot_Bad_CI$methods(
2112 initialize = function(p_breaks=20, p_ylim=NULL, ...){
2113 if (is.null(p_ylim)){
2114 ylim <<- numeric(0)
2115 } else {
2116 ylim <<- p_ylim
2117 }
2118 breaks <<- p_breaks
2119 plot_title <<- "Credible Intervals for proportion within range - bad"
2120 callSuper(...)
2121 },
2122 data_processor = function(){
2123 return(data_processors[[1]])
2124 },
2125 get_n = function(){
2126 data_processor()$data_groups$ensure()
2127 return(nrow(subset(data_processor()$data_groups$df, (PMD_FDR_decoy == 1))))
2128 },
2129 plot_image = function(){
2130 data_processor()$data_groups$ensure()
2131 data_groups <- data_processor()$data_groups$df
2132 data_decoy <- subset(data_groups, data_groups$group_training_class == "bad_long")
2133 data_decoy$region <- cut(x = data_decoy$value, breaks = breaks)
2134 table(data_decoy$region)
2135 regions <- unique(data_decoy$region)
2136
2137 N = nrow(data_decoy)
2138 find_lower_ci_bound <- function(x){
2139 ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05)
2140 return(ci[1])
2141 }
2142 find_upper_ci_bound <- function(x){
2143 ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05)
2144 return(ci[2])
2145 }
2146 xleft <- aggregate(value~region, data=data_decoy, FUN=min)
2147 xright <- aggregate(value~region, data=data_decoy, FUN=max)
2148 ytop <- aggregate(value~region, data=data_decoy, FUN=find_upper_ci_bound)
2149 ybottom <- aggregate(value~region, data=data_decoy, FUN=find_lower_ci_bound)
2150
2151 xleft <- rename_column(xleft , "value", "xleft" )
2152 xright <- rename_column(xright , "value", "xright" )
2153 ytop <- rename_column(ytop , "value", "ytop" )
2154 ybottom <- rename_column(ybottom, "value", "ybottom")
2155
2156 boxes <- merge(merge(xleft, xright), merge(ytop, ybottom))
2157
2158
2159 xlab <- "Precursor Mass Discrepancy (ppm)"
2160 ylab <- "Proportion of PSMs\nin subgroup"
2161 xlim=range(data_decoy$value, na.rm = TRUE)
2162 get_ylim(boxes=boxes)
2163 if (!include_text){
2164 xlab=""
2165 ylab=""
2166 }
2167
2168 plot(x=c(-10,10), y=c(0,1), type="n", ylim=ylim, xlim=xlim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n")
2169
2170 with(boxes, rect(xleft=xleft, xright=xright, ytop=ytop, ybottom=ybottom, lwd=scale))
2171
2172 abline(h=1/breaks, col="blue", lwd=scale)
2173 },
2174 get_ylim = function(boxes=NULL){
2175 is_valid_range <- function(r=NULL){
2176 return(length(r) == 2)
2177 }
2178 if (! is_valid_range(ylim)){
2179 ylim <<- range(c(0,boxes$ytop, boxes$ybottom))
2180 }
2181 }
2182
2183 )
2184 ###############################################################################
2185 # Class: Plot_Selective_Loss
2186 ###############################################################################
2187 Plot_Selective_Loss = setRefClass("Plot_Selective_Loss",
2188 contains = "Plot_Image",
2189 fields = list())
2190 Plot_Selective_Loss$methods(
2191 initialize = function( ...){
2192 plot_title <<- "PMD-FDR Selectively removes Bad Hits"
2193 callSuper(...)
2194 },
2195 data_processor = function(){
2196 return(data_processors[[1]])
2197 },
2198 get_n = function(){
2199 data_processor()$i_fdr$ensure()
2200 data_subset <- data_processor()$i_fdr$df
2201 return(nrow(data_subset))
2202 },
2203 plot_image = function(){
2204 # Support functions for plot_selective_loss()
2205
2206 samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){
2207 data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold)
2208 tbl <- with(updated_i_fdr,
2209 table(PMD_FDR_input_score >= score_threshold,
2210 new_confidence < score_threshold,
2211 group_decoy_proteins))
2212 df <- data.frame(tbl)
2213 df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum)
2214 df_n <- rename_column(df_n, name_before = "Freq", "n")
2215 df <- merge(df, df_n)
2216 df$rate_of_loss <- with(df, Freq/n)
2217 df <- subset(df, (Var1==TRUE) & (Var2==TRUE))
2218 df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")]
2219 if (nrow(df) > 0){
2220 df$score_threshold <- score_threshold
2221 }
2222 return(df)
2223 }
2224 get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){
2225 df=data.frame()
2226 for (score_threshold in score_thresholds){
2227 df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold)
2228 df <- rbind(df, df_new_loss)
2229 }
2230 return(df)
2231 }
2232
2233 # Main code for plot_selective_loss()
2234
2235 updated_i_fdr <- data_processor()$i_fdr$df
2236 updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100)))
2237 loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100)
2238 xlim <- with(loss_record, range(score_threshold))
2239 ylim <- c(0,1)
2240 xlab <- "Fixed Confidence threshold (PeptideShaker score)"
2241 ylab <- "Rate of PSM disqualification from PMD-FDR"
2242 lwd <- 4
2243 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab)
2244
2245 groups <- sort(unique(loss_record$group_decoy_proteins))
2246 n_g <- length(groups)
2247
2248 cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1)
2249 ltys <- rep(1:6, length.out=n_g)
2250
2251 leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, title="Species/Category")
2252
2253 for (i in 1:n_g){
2254 lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$leg[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i])
2255 }
2256 abline(h=0, v=100, lwd=scale)
2257 abline(h=c(0.1, 0.8), col="gray", lwd=scale)
2258
2259 #leg = list(leg=group, col=col, lty=lty, lwd=lwd)
2260 #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len))
2261 legend_object <- Legend_Object$new(leg, scale)
2262 legend_object$show()
2263 }
2264
2265 )
2266 ###############################################################################
2267 # Class: Plot_Selective_Loss_for_TOC
2268 ###############################################################################
2269 Plot_Selective_Loss_for_TOC = setRefClass("Plot_Selective_Loss_for_TOC",
2270 contains = "Plot_Image",
2271 fields = list(xlab="character",
2272 ylab="character",
2273 title_x="numeric",
2274 title_y="numeric",
2275 legend_border="logical",
2276 legend_x = "numeric",
2277 legend_y = "numeric",
2278 legend_title="character",
2279 legend_location = "character",
2280 name_contaminant = "character",
2281 name_decoy = "character",
2282 name_human = "character",
2283 name_pyro = "character"))
2284 Plot_Selective_Loss_for_TOC$methods(
2285 initialize = function( ...){
2286 plot_title <<- "PMD-FDR selectively removes bad hits"
2287 callSuper(...)
2288 xlab <<- "Confidence threshold (PeptideShaker)"
2289 ylab <<- "PMD Disqualifiction Rate"
2290 legend_border <<- FALSE
2291 #legend_title <<- "Species/Category"
2292 title_x <<- 50
2293 title_y <<- 0.9
2294 legend_x <<- 10
2295 legend_y <<- 0.75
2296 name_contaminant <<- "signal - contaminant"
2297 name_decoy <<- "decoy - reversed"
2298 name_human <<- "decoy - human"
2299 name_pyro <<- "signal - pyrococcus"
2300 },
2301 data_processor = function(){
2302 return(data_processors[[1]])
2303 },
2304 get_n = function(){
2305 data_processor()$i_fdr$ensure()
2306 data_subset <- data_processor()$i_fdr$df
2307 return(nrow(data_subset))
2308 },
2309 plot_image = function(){
2310 # Support functions for plot_selective_loss()
2311
2312 samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){
2313 data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold)
2314 tbl <- with(updated_i_fdr,
2315 table(PMD_FDR_input_score >= score_threshold,
2316 new_confidence < score_threshold,
2317 group_decoy_proteins))
2318 df <- data.frame(tbl)
2319 df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum)
2320 df_n <- rename_column(df_n, name_before = "Freq", "n")
2321 df <- merge(df, df_n)
2322 df$rate_of_loss <- with(df, Freq/n)
2323 df <- subset(df, (Var1==TRUE) & (Var2==TRUE))
2324 df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")]
2325 if (nrow(df) > 0){
2326 df$score_threshold <- score_threshold
2327 }
2328 return(df)
2329 }
2330 get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){
2331 df=data.frame()
2332 for (score_threshold in score_thresholds){
2333 df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold)
2334 df <- rbind(df, df_new_loss)
2335 }
2336 return(df)
2337 }
2338 convert_groups <- function(groups=NULL){
2339 new_groups <- groups
2340 new_groups <- gsub(pattern = "contaminant", replacement = name_contaminant, x = new_groups)
2341 new_groups <- gsub(pattern = "decoy" , replacement = name_decoy , x = new_groups)
2342 new_groups <- gsub(pattern = "human" , replacement = name_human , x = new_groups)
2343 new_groups <- gsub(pattern = "pyrococcus" , replacement = name_pyro , x = new_groups)
2344
2345 return(new_groups)
2346 }
2347
2348 # Main code for plot_selective_loss()
2349
2350 updated_i_fdr <- data_processor()$i_fdr$df
2351 updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100)))
2352 loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100)
2353 xlim <- with(loss_record, range(score_threshold))
2354 ylim <- c(0,1)
2355 #xlab <- "Fixed Confidence threshold (PeptideShaker score)"
2356 #ylab <- "Rate of PSM disqualification from PMD-FDR"
2357 lwd <- 4
2358 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab)
2359
2360 groups <- sort(unique(loss_record$group_decoy_proteins))
2361 n_g <- length(groups)
2362
2363 cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1)
2364 ltys <- rep(1:6, length.out=n_g)
2365 bty <- ifelse(legend_border, "o", "n")
2366
2367 leg <- list(leg=convert_groups(groups), var_name=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y)
2368 #leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y)
2369
2370 for (i in 1:n_g){
2371 lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$var_name[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i])
2372 }
2373 abline(h=0, v=100, lwd=scale)
2374 abline(h=c(0.1, 0.8), col="gray", lwd=scale)
2375
2376 #leg = list(leg=group, col=col, lty=lty, lwd=lwd)
2377 #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len))
2378 legend_object <- Legend_Object$new(leg, scale)
2379 legend_object$show()
2380 text(x=title_x, y=title_y, labels = plot_title)
2381 }
2382
2383 )
2384 ###############################################################################
2385 # Class: Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR
2386 ###############################################################################
2387 Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR = setRefClass("Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR",
2388 contains = "Plot_Image",
2389 fields = list())
2390 Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$methods(
2391 initialize = function( ...){
2392 plot_title <<- "Precursor Mass Discrepance i-FDR for 1% Target-Decoy FDR PSMs"
2393 callSuper(...)
2394 },
2395 data_processor = function(){
2396 return(data_processors[[1]])
2397 },
2398 get_n = function(){
2399 data_processor()$i_fdr$ensure()
2400 if (one_percent_calculation_exists()){
2401 i_fdr <- data_processor()$i_fdr$df
2402 data_subset <- subset(i_fdr, is_in_1percent==TRUE)
2403 n <- nrow(data_subset)
2404 } else {
2405 n <- 0
2406 }
2407
2408 return (n)
2409 },
2410 plot_image = function(){
2411 if (one_percent_calculation_exists()){
2412 i_fdr <- get_modified_fdr()
2413 report_good_discrepancies(i_fdr)
2414 data_TD_good <- get_data_TD_good(i_fdr)
2415 mean_results <- get_mean_results(data_TD_good)
2416 boxes <- mean_results
2417 boxes <- rename_columns(df = boxes,
2418 names_before = c("min_conf", "max_conf", "lower" , "upper"),
2419 names_after = c("xleft" , "xright" , "ybottom", "ytop" ))
2420 xlim <- range(boxes[,c("xleft", "xright")])
2421 ylim <- range(boxes[,c("ybottom", "ytop")])
2422
2423 #head(mean_results)
2424
2425 xlab = "Confidence Score (Peptide Shaker)"
2426 ylab = "Mean PMD i-FDR"
2427
2428 if (!include_text){
2429 xlab=""
2430 ylab=""
2431 }
2432
2433 plot(mean_i_fdr~mean_conf, data=mean_results, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n", cex=scale, lwd=scale)
2434 with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale))
2435 abline(b=-1, a=100, lwd=4*scale, col="dark gray")
2436 abline(h=0, v=100, lwd=1*scale)
2437
2438 } else {
2439 stop(sprintf("Dataset '%s' does not include 1%% FDR data", data_processor()$info$collection_name()))
2440 }
2441 },
2442 get_mean_results = function(data_TD_good = NULL){
2443 mean_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=mean)
2444 mean_i_fdr <- rename_column(mean_i_fdr, "i_fdr", "mean_i_fdr")
2445 sd_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=sd)
2446 sd_i_fdr <- rename_column(sd_i_fdr, "i_fdr", "sd_i_fdr")
2447 n_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=length)
2448 n_i_fdr <- rename_column(n_i_fdr, "i_fdr", "n")
2449 mean_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=mean)
2450 mean_conf <- rename_column(mean_conf, "PMD_FDR_input_score", "mean_conf")
2451 min_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=min)
2452 min_conf <- rename_column(min_conf, "PMD_FDR_input_score", "min_conf")
2453 max_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=max)
2454 max_conf <- rename_column(max_conf, "PMD_FDR_input_score", "max_conf")
2455
2456 mean_results <- mean_i_fdr
2457 mean_results <- merge(mean_results, sd_i_fdr)
2458 mean_results <- merge(mean_results, n_i_fdr)
2459 mean_results <- merge(mean_results, mean_conf)
2460 mean_results <- merge(mean_results, min_conf)
2461 mean_results <- merge(mean_results, max_conf)
2462
2463 mean_results$se <- with(mean_results, sd_i_fdr / sqrt(n - 1))
2464 mean_results$lower <- with(mean_results, mean_i_fdr - 2*se)
2465 mean_results$upper <- with(mean_results, mean_i_fdr + 2*se)
2466 return(mean_results)
2467 },
2468 get_data_TD_good = function(i_fdr=NULL){
2469 data_TD_good <- subset(i_fdr, TD_good==TRUE)
2470 data_TD_good <- data_TD_good[order(data_TD_good$PMD_FDR_input_score),]
2471 n <- nrow(data_TD_good)
2472 data_TD_good$conf_group <- cut(1:n, breaks=floor(n/100))
2473 data_TD_good$i_fdr <- 100 * data_TD_good$i_fdr
2474 return(data_TD_good)
2475 },
2476 get_modified_fdr = function(){
2477 i_fdr <- data_processor()$i_fdr$df
2478 i_fdr$PMD_good <- i_fdr$i_fdr < 0.01
2479 i_fdr$TD_good <- i_fdr$is_in_1percent == TRUE
2480 i_fdr$conf_good <- i_fdr$PMD_FDR_input_score == 100
2481 return(i_fdr)
2482 },
2483 one_percent_calculation_exists = function(){
2484 data_processor()$raw_1_percent$ensure()
2485 return(data_processor()$raw_1_percent$exists())# "is_in_1percent" %in% colnames(data_processor()$i_fdr))
2486 },
2487 report_good_discrepancies = function(i_fdr=NULL){
2488 with(subset(i_fdr, (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2489 with(subset(i_fdr, (PMD_FDR_input_score==100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2490 with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2491 with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2492 with(subset(i_fdr, (PMD_FDR_input_score>= 90) & (PMD_FDR_input_score< 99) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2493 }
2494
2495 )
2496
2497 ###############################################################################
2498 # Class: Plot_Density_PMD_by_Score
2499 ###############################################################################
2500 Plot_Density_PMD_by_Score = setRefClass("Plot_Density_PMD_by_Score",
2501 contains = "Plot_Image",
2502 fields = list(show_norm = "logical"))
2503 Plot_Density_PMD_by_Score$methods(
2504 initialize = function(p_show_norm=FALSE, ...){
2505 show_norm <<- p_show_norm
2506 plot_title <<- "PMD distribution, by Confidence ranges"
2507 callSuper(...)
2508
2509 },
2510 data_processor = function(){
2511 return(data_processors[[1]])
2512 },
2513 get_n = function(){
2514 return(nrow(data_processor()$data_groups$df))
2515 #data_subset <- data_collection$i_fdr
2516 #return(nrow(data_subset))
2517 },
2518 get_modified_data_groups = function(var_value = NULL){
2519 # Note: Filters out used_to_find_middle
2520 # Note: Creates "value_of_interest" field
2521 # Note: Remakes "group_decoy_input_score" field
2522 data_new <- data_processor()$data_groups$df
2523 data_new <- subset(data_new, !used_to_find_middle )
2524 data_new$value_of_interest <- data_new[, var_value]
2525
2526 cutoff_points <- c(100, 100, 95, 80, 50, 0, 0)
2527 n <- length(cutoff_points)
2528 uppers <- cutoff_points[-n]
2529 lowers <- cutoff_points[-1]
2530
2531 for (i in 1:(n-1)){
2532 upper <- uppers[i]
2533 lower <- lowers[i]
2534
2535
2536 if (lower==upper){
2537 idx <- with(data_new, which( (PMD_FDR_input_score == upper) & (PMD_FDR_decoy == 0)))
2538 cat_name <- sprintf("%d", upper)
2539 } else {
2540 idx <- with(data_new, which((PMD_FDR_input_score >= lower) & (PMD_FDR_input_score < upper) & (PMD_FDR_decoy == 0)))
2541 cat_name <- sprintf("%02d - %2d", lower, upper)
2542 }
2543 data_new$group_decoy_input_score[idx] <- cat_name
2544 }
2545
2546 return(data_new)
2547 },
2548 plot_image = function(){
2549
2550 # Support functions for plot_density_PMD_by_score()
2551
2552 get_densities <- function(data_subset = NULL, var_value = NULL){
2553
2554 # Support functions for get_densities()
2555
2556 # New version
2557
2558 # Main body of get_densities()
2559
2560 data_subset <- get_modified_data_groups(var_value=var_value)
2561 #data_subset$value_of_interest <- data_subset[,var_value]
2562 from <- min(data_subset$value_of_interest, na.rm=TRUE)
2563 to <- max(data_subset$value_of_interest, na.rm=TRUE)
2564 xlim = range(data_subset$value_of_interest, na.rm=TRUE)
2565
2566 groups <- sort(unique(data_subset$group_decoy_input_score), decreasing = TRUE)
2567 n_groups <- length(groups)
2568
2569 densities <- list(var_value = var_value, groups=groups)
2570
2571 for (i in 1:n_groups){
2572 group <- groups[i]
2573
2574 data_group_single <- subset(data_subset, (group_decoy_input_score == group))
2575 d_group <- with(data_group_single , density(value_of_interest, from = from, to = to, na.rm = TRUE))
2576 d_group <- normalize_density(d_group)
2577
2578 densities[[group]] <- d_group
2579 }
2580
2581 return(densities)
2582
2583 }
2584 get_xlim <- function(densities_a = NULL, densities_b = NULL){
2585 groups <- densities_a$groups
2586
2587 xlim <- 0
2588 for (group in groups){
2589 xlim <- range(xlim, densities_a[[group]]$x, densities_b[[group]]$x)
2590 }
2591
2592 return(xlim)
2593
2594 }
2595 get_ylim <- function(densities_a = NULL, densities_b = NULL){
2596 groups <- densities_a$groups
2597
2598 ylim <- 0
2599 for (group in groups){
2600 ylim <- range(ylim, densities_a[[group]]$y, densities_b[[group]]$y)
2601 }
2602
2603 return(ylim)
2604
2605 }
2606 plot_distributions <- function(densities = NULL, var_value= NULL,include_peak_dots=TRUE, xlab_modifier="", xlim=NULL, ylim=NULL, ...){
2607 data_groups <- get_modified_data_groups(var_value=var_value)
2608 groups <- sort(unique(data_groups$group_decoy_input_score))
2609 n_groups <- length(groups)
2610
2611 groups_std <- setdiff(groups, c("100", "decoy", "0") )
2612 groups_std <- sort(groups_std, decreasing = TRUE)
2613 groups_std <- c(groups_std, "0")
2614 n_std <- length(groups_std)
2615 cols <- rainbow_with_fixed_intensity(n = n_std, goal_intensity_0_1 = 0.5, alpha=0.5)
2616
2617 leg <- list(group = c("100" , groups_std , "decoy" ),
2618 leg = c("100" , groups_std , "All Decoys" ),
2619 col = c(col2hex("black") , cols , col2hex("purple", col_alpha = 0.5)),
2620 lwd = c(4 , rep(2, n_std), 4 ),
2621 title = "Confidence Score")
2622
2623 xlab = sprintf("Precursor Mass Discrepancy%s (ppm)",
2624 xlab_modifier)
2625 ylab = "Density"
2626 if (!include_text){
2627 xlab=""
2628 ylab=""
2629 }
2630
2631
2632 plot( x=xlim, y=ylim, col=leg$col[1], lwd=leg$lwd[1] * scale, main=main, xlab=xlab, ylab=ylab, xaxt="n", yaxt="n", cex=scale, type="n")#, lty=leg.lty[1], ...)
2633
2634 include_peak_dots = FALSE # BUGBUG: Disabling this for now. Need to move this to class parameter
2635
2636 for (i in 1:length(leg$group)){
2637 group <- leg$group[i]
2638 d <- densities[[group]]
2639 lines(d, col=leg$col[i], lwd=leg$lwd[i] * scale)
2640 if (include_peak_dots){
2641 x=d$x[which.max(d$y)]
2642 y=max(d$y)
2643 points(x=c(x,x), y=c(0,y), pch=19, col=leg$col[i], cex=scale)
2644 }
2645 }
2646
2647 abline(v=0, lwd=scale)
2648
2649 if (include_text){
2650 legend_object = Legend_Object$new(leg, scale)
2651 legend_object$show()
2652 }
2653
2654 }
2655
2656 # Main body for plot_density_PMD_by_score()
2657
2658 data_groups <- data_processor()$data_groups$df
2659
2660 data_subset_a <- subset(data_groups , used_to_find_middle == FALSE)
2661 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
2662
2663 densities_a <- get_densities(data_subset = data_subset_a, var_value = "value")
2664 densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm")
2665
2666 xlim=get_xlim(densities_a, densities_b)
2667 ylim=get_ylim(densities_a, densities_b)
2668
2669 dataset_name <- data_processor()$info$collection_name()
2670 if (show_norm){
2671 plot_distributions(densities=densities_b, var_value = "value_norm", xlab_modifier = " - normalized", xlim=xlim, ylim=ylim)
2672 } else {
2673 plot_distributions(densities=densities_a, var_value = "value" , xlab_modifier = "" , xlim=xlim, ylim=ylim)
2674 }
2675 }
2676 )
2677 ###############################################################################
2678 # Class: Plot_Dataset_Description
2679 ###############################################################################
2680 Plot_Dataset_Description = setRefClass("Plot_Dataset_Description",
2681 contains = "Plot_Multiple_Images",
2682 fields = list(ylim_time_invariance = "numeric"))
2683 Plot_Dataset_Description$methods(
2684 initialize = function(p_data_processors = NULL,
2685 p_include_text=TRUE,
2686 p_include_main=FALSE,
2687 p_ylim_time_invariance = c(-4,4), ...){
2688 plot_object_r1_c1 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors,
2689 p_include_text=p_include_text,
2690 p_include_main=p_include_main,
2691 p_training_class = "good_testing",
2692 p_field_of_interest = "value",
2693 p_ylim = p_ylim_time_invariance)
2694
2695 plot_object_r1_c2 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors,
2696 p_include_text=p_include_text,
2697 p_include_main=p_include_main,
2698 p_training_class = "good_testing",
2699 p_field_of_interest = "value_norm",
2700 p_ylim = p_ylim_time_invariance)
2701 plot_object_r2_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors,
2702 p_show_norm=FALSE,
2703 p_include_text=p_include_text,
2704 p_include_main=p_include_main)
2705
2706 plot_object_r2_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors,
2707 p_show_norm=FALSE,
2708 p_include_text=p_include_text,
2709 p_include_main=p_include_main)
2710
2711 plot_object_r3_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors,
2712 p_show_norm=TRUE,
2713 p_include_text=p_include_text,
2714 p_include_main=p_include_main)
2715 plot_object_r3_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors,
2716 p_show_norm=TRUE,
2717 p_include_text=p_include_text,
2718 p_include_main=p_include_main)
2719 callSuper(p_n_images_wide=2,
2720 p_n_images_tall=3,
2721 p_include_text=p_include_text,
2722 p_include_main=p_include_main,
2723 p_image_list = list(plot_object_r1_c1, plot_object_r1_c2,
2724 plot_object_r2_c1, plot_object_r2_c2,
2725 plot_object_r3_c1, plot_object_r3_c2), ...)
2726
2727 }
2728 )
2729 ###############################################################################
2730 # Class: Plots_for_Paper
2731 ###############################################################################
2732 Plots_for_Paper <- setRefClass("Plots_for_Paper", fields =list(data_processor_a = "Data_Processor",
2733 data_processor_b = "Data_Processor",
2734 data_processor_c = "Data_Processor",
2735 data_processor_d = "Data_Processor",
2736 include_text = "logical",
2737 include_main = "logical",
2738 mai = "numeric"))
2739 Plots_for_Paper$methods(
2740 initialize = function(){
2741 data_processor_a <<- Data_Processor$new(p_info = Data_Object_Info_737_two_step$new())
2742 data_processor_b <<- Data_Processor$new(p_info = Data_Object_Info_737_combined$new())
2743 data_processor_c <<- Data_Processor$new(p_info = Data_Object_Pyrococcus_tr $new())
2744 data_processor_d <<- Data_Processor$new(p_info = Data_Object_Mouse_Mutations $new())
2745 },
2746 create_plots_for_paper = function(include_main=TRUE, finalize=TRUE){
2747 print_table_4_data()
2748 print_figure_2_data()
2749 plot_figure_D(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2750 plot_figure_C(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2751 plot_figure_B(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2752 plot_figure_A(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2753 plot_figure_8(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2754 plot_figure_7(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2755 plot_figure_6(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main)
2756 plot_figure_5(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2757 plot_figure_4(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2758 plot_figure_3(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main)
2759 },
2760 print_figure_2_data = function(){
2761 print(create_stats_for_grouping_figure(list(data_processor_a)))
2762 },
2763 print_table_4_data = function(){
2764 report_ranges_of_comparisons(processors = list(data_processor_a))
2765 report_ranges_of_comparisons(processors = list(data_processor_c))
2766 },
2767 plot_figure_3 = function(p_scale=NULL, p_include_main=NULL){
2768 plot_object <- Plot_Compare_PMD_and_Norm_Density$new(p_data_processor = list(data_processor_a),
2769 p_show_norm = FALSE,
2770 p_include_text = TRUE,
2771 p_include_main = p_include_main,
2772 p_display_n_psms = FALSE)
2773 plot_object$plot_image_in_small_window(p_scale=p_scale)
2774 },
2775 plot_figure_4 = function(p_scale=NULL, p_include_main=NULL){
2776 plot_object <- Plot_Time_Invariance_Alt_Before_and_After$new(p_data_processors = list(data_processor_a),
2777 p_include_text=TRUE,
2778 p_include_main=p_include_main,
2779 p_ylim = c(-4,4))
2780 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2781
2782 },
2783 plot_figure_5 = function(p_scale=NULL, p_include_main=NULL){
2784 plot_object <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors = list(data_processor_a),
2785 p_include_text=TRUE,
2786 p_include_main=p_include_main)
2787 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2788 },
2789 plot_figure_6 = function(p_scale=NULL, p_include_main=NULL){
2790 plot_object <- Plot_Bad_CI$new(p_data_processors = list(data_processor_a),
2791 p_include_text=TRUE,
2792 p_include_main=p_include_main)
2793 plot_object$plot_image_in_small_window(p_scale=p_scale)
2794 },
2795 plot_figure_7 = function(p_scale=NULL, p_include_main=NULL){
2796 plot_object <- Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$new(p_data_processors = list(data_processor_a),
2797 p_include_text=TRUE,
2798 p_include_main=p_include_main)
2799 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2800 },
2801 plot_figure_8 = function(p_scale=NULL, p_include_main=NULL){
2802 plot_object <- Plot_Selective_Loss$new(p_data_processors = list(data_processor_c),
2803 p_include_text=TRUE,
2804 p_include_main=p_include_main)
2805 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2806 },
2807 plot_figure_A = function(p_scale=NULL, p_include_main=NULL){
2808 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_a),
2809 p_include_text=TRUE,
2810 p_include_main=p_include_main,
2811 p_ylim_time_invariance=c(-4,4) )
2812 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2813 },
2814 plot_figure_B = function(p_scale=NULL, p_include_main=NULL){
2815 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_b),
2816 p_include_text=TRUE,
2817 p_include_main=p_include_main,
2818 p_ylim_time_invariance=c(-4,4) )
2819 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2820 },
2821 plot_figure_C = function(p_scale=NULL, p_include_main=NULL){
2822 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_c),
2823 p_include_text=TRUE,
2824 p_include_main=p_include_main,
2825 p_ylim_time_invariance=c(-4,4) )
2826 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2827 },
2828 plot_figure_D = function(p_scale=NULL, p_include_main=NULL){
2829 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_d),
2830 p_include_text=TRUE,
2831 p_include_main=p_include_main,
2832 p_ylim_time_invariance=c(-4,4) )
2833 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2834 },
2835 create_stats_for_grouping_figure = function(processors=NULL){
2836 processor <- processors[[1]]
2837 processor$i_fdr$ensure()
2838 aug_i_fdr <- processor$i_fdr$df
2839 aug_i_fdr$group_good_bad_other <- gsub("_.*", "", aug_i_fdr$group_training_class)
2840 aug_i_fdr$group_null <- "all"
2841 table(aug_i_fdr$group_training_class)
2842 table(aug_i_fdr$group_good_bad_other)
2843 table(aug_i_fdr$group_null)
2844
2845 create_agg_fdr_stats <- function(i_fdr=NULL, grouping_var_name = NULL){
2846 formula_fdr <- as.formula(sprintf("%s~%s", "i_fdr", grouping_var_name))
2847 formula_len <- as.formula(sprintf("%s~%s", "PMD_FDR_peptide_length", grouping_var_name))
2848 agg_fdr <- aggregate(formula=formula_fdr, data=i_fdr, FUN=mean)
2849 agg_n <- aggregate(formula=formula_fdr, data=i_fdr, FUN=length)
2850 agg_len <- aggregate(formula=formula_len, data=i_fdr, FUN=mean)
2851 agg_fdr <- rename_columns(df = agg_fdr,
2852 names_before = c(grouping_var_name, "i_fdr"),
2853 names_after = c("group" , "fdr"))
2854 agg_n <- rename_columns(df = agg_n,
2855 names_before = c(grouping_var_name, "i_fdr"),
2856 names_after = c("group" , "n"))
2857 agg_len <- rename_columns(df = agg_len,
2858 names_before = c(grouping_var_name),
2859 names_after = c("group" ))
2860 agg <- merge(agg_fdr, agg_n)
2861 agg <- merge(agg , agg_len)
2862
2863 return(agg)
2864 }
2865
2866 agg_detail <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_training_class")
2867 agg_grouped <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_good_bad_other")
2868 agg_all <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_null")
2869
2870 agg <- rbind(agg_detail, agg_grouped)
2871 agg <- rbind(agg, agg_all)
2872
2873 agg$fdr <- ifelse(agg$fdr < 1, agg$fdr, 1)
2874
2875 linear_combo <- function(x=NULL, a0=NULL, a1=NULL){
2876 result <- (a0 * (1-x) + a1 * x)
2877 return(result)
2878 }
2879
2880 agg$r <- linear_combo(agg$fdr, a0=197, a1= 47)
2881 agg$g <- linear_combo(agg$fdr, a0= 90, a1= 85)
2882 agg$b <- linear_combo(agg$fdr, a0= 17, a1=151)
2883
2884 return(agg)
2885 },
2886 report_ranges_of_comparisons = function(processors=NULL){
2887 report_comparison_of_Confidence_and_PMD = function (i_fdr = NULL, min_conf=NULL, max_conf=NULL, include_max=FALSE){
2888 report_PMD_confidence_comparison_from_subset = function(data_subset=NULL, group_name=NULL){
2889 print(group_name)
2890 print(sprintf(" Number of PSMs: %d", nrow(data_subset)))
2891 mean_confidence <- mean(data_subset$PMD_FDR_input_score)
2892 print(sprintf(" Mean Confidence Score: %3.1f", mean_confidence))
2893 print(sprintf(" PeptideShaker g-FDR: %3.1f", 100-mean_confidence))
2894 mean_PMD_FDR = mean(data_subset$i_fdr)
2895 print(sprintf(" PMD g-FDR: %3.1f", 100*mean_PMD_FDR))
2896 #col <- col2hex("black", 0.2)
2897 #plot(data_subset$i_fdr, pch=".", cex=2, col=col)
2898 #abline(h=0)
2899 }
2900
2901 if (is.null(max_conf)) {
2902 data_subset <- subset(i_fdr, PMD_FDR_input_score == min_conf)
2903 group_name <- sprintf("Group %d", min_conf)
2904 } else if (include_max){
2905 data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score <= max_conf))
2906 group_name <- sprintf("Group %d through %d", min_conf, max_conf)
2907 } else {
2908 data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score < max_conf))
2909 group_name <- sprintf("Group %d to %d", min_conf, max_conf)
2910 }
2911
2912 report_PMD_confidence_comparison_from_subset(data_subset=data_subset, group_name=group_name)
2913 }
2914
2915 processor <- processors[[1]]
2916 processor$i_fdr$ensure()
2917 i_fdr <- processor$i_fdr$df
2918 info <- processor$info
2919 print(sprintf("PMD and Confidence comparison for -- %s", info$collection_name()))
2920 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf=100, max_conf=NULL, include_max=TRUE)
2921 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 99, max_conf=100 , include_max=FALSE)
2922 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 90, max_conf= 99 , include_max=FALSE)
2923 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 0, max_conf=100 , include_max=TRUE)
2924 }
2925 )
2926 ###############################################################################
2927 # C - 021 - PMD-FDR Wrapper - functions.R #
2928 # #
2929 # Creates the necessary structure to convert the PMD-FDR code into one that #
2930 # can run as a batch file #
2931 # #
2932 ###############################################################################
2933 ###############################################################################
2934 # Class: ModuleArgParser_PMD_FDR
2935 ###############################################################################
2936 ModuleArgParser_PMD_FDR <- setRefClass("ModuleArgParser_PMD_FDR",
2937 contains = c("ArgParser"),
2938 fields =list(args = "character") )
2939 ModuleArgParser_PMD_FDR$methods(
2940 initialize = function(description = "Computes individual and global FDR using Precursor Mass Discrepancy (PMD-FDR)", ...){
2941 callSuper(description=description, ...)
2942 local_add_argument("--psm_report" , help="full name and path to the PSM report")
2943 local_add_argument("--psm_report_1_percent", default = "" , help="full name and path to the PSM report for 1% FDR")
2944 local_add_argument("--output_i_fdr" , default = "" , help="full name and path to the i-FDR output file ")
2945 local_add_argument("--output_g_fdr" , default = "" , help="full name and path to the g-FDR output file ")
2946 local_add_argument("--output_densities" , default = "" , help="full name and path to the densities output file ")
2947 #local_add_argument("--score_field_name" , default = "" , help="name of score field (in R format)")
2948 local_add_argument("--input_file_type" , default = "PMD_FDR_input_file", help="type of input file (currently supports: PSM_Report)")
2949 }
2950 )
2951 ###############################################################################
2952 # Class: Data_Object_Parser
2953 ###############################################################################
2954 Data_Object_Parser <- setRefClass("Data_Object_Parser",
2955 contains = c("Data_Object"),
2956 fields =list(parser = "ModuleArgParser_PMD_FDR",
2957 args = "character",
2958 parsing_results = "list") )
2959 Data_Object_Parser$methods(
2960 initialize = function(){
2961 callSuper()
2962 class_name <<- "Data_Object_Parser"
2963 },
2964 verify = function(){
2965 # Nothing to do here - parser handles verification during load
2966 },
2967 m_load_data = function(){
2968 if (length(args) == 0){
2969 parsing_results <<- parser$parse_arguments(NULL)
2970 } else {
2971 parsing_results <<- parser$parse_arguments(args)
2972 }
2973
2974 },
2975 set_args = function(p_args=NULL){
2976 # This is primarily used for testing. In operation arguments will be passed automatically (through use of commandArgs)
2977 args <<- p_args
2978 set_dirty(TRUE)
2979 }
2980 )
2981 ###############################################################################
2982 # Class: Data_Object_Info_Parser
2983 ###############################################################################
2984 Data_Object_Info_Parser <- setRefClass("Data_Object_Info_Parser",
2985 contains = c("Data_Object_Info"),
2986 fields =list(
2987 output_i_fdr = "character",
2988 output_g_fdr = "character",
2989 output_densities = "character"
2990 ) )
2991 Data_Object_Info_Parser$methods(
2992 initialize = function(){
2993 callSuper()
2994 class_name <<- "Data_Object_Info_Parser"
2995 },
2996 verify = function(){
2997 check_field_exists = function(field_name=NULL, check_empty = TRUE){
2998 field_value <- get_parser()$parsing_results[field_name]
2999 checkTrue(! is.null(field_value),
3000 msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value))
3001 if (check_empty){
3002 checkTrue(! is.null(field_value),
3003 msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value))
3004 }
3005 }
3006 # Check parameters passed in
3007 check_field_exists("junk")
3008 check_field_exists("psm_report")
3009 check_field_exists("psm_report_1_percent", check_empty = FALSE)
3010 check_field_exists("output_i_fdr" , check_empty = FALSE)
3011 check_field_exists("output_g_fdr" , check_empty = FALSE)
3012 check_field_exists("output_densities" , check_empty = FALSE)
3013 #check_field_exists("score_field_name")
3014 check_field_exists("input_file_type")
3015 },
3016 m_load_data = function(){
3017 parsing_results <- get_parser()$parsing_results
3018
3019 data_file_name <<- as.character(parsing_results["psm_report"])
3020 data_file_name_1_percent_FDR <<- as.character(parsing_results["psm_report_1_percent"])
3021 data_path_name <<- as.character(parsing_results[""])
3022 #experiment_name <<- data_file_name
3023 #designation <<- ""
3024 output_i_fdr <<- as.character(parsing_results["output_i_fdr"])
3025 output_g_fdr <<- as.character(parsing_results["output_g_fdr"])
3026 output_densities <<- as.character(parsing_results["output_densities"])
3027
3028 input_file_type <<- as.character(parsing_results["input_file_type"])
3029 #score_field_name <<- as.character(parsing_results["score_field_name"])
3030 },
3031 set_parser = function(parser){
3032 parents[["parser"]] <<- parser
3033 },
3034 get_parser = function(){
3035 return(verified_element_of_list(parents, "parser", "Data_Object_Info_Parser$parents"))
3036 },
3037 file_path = function(){
3038 result <- data_file_name # Now assumes that full path is provided
3039 if (length(result) == 0){
3040 stop("Unable to validate file path - file name is missing")
3041 }
3042 return(result)
3043 },
3044 file_path_1_percent_FDR = function(){
3045 local_file_name <- get_data_file_name_1_percent_FDR()
3046 if (length(local_file_name) == 0){
3047 result <- ""
3048 } else {
3049 result <- local_file_name # path name is no longer relevant
3050 }
3051
3052 # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream
3053
3054 # if (length(result) == 0){
3055 # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing")
3056 # }
3057 return(result)
3058 },
3059 get_data_file_name_1_percent_FDR = function(){
3060 return(data_file_name_1_percent_FDR)
3061 },
3062 collection_name = function(){
3063 result <- ""
3064 return(result)
3065 }
3066
3067 )
3068 ###############################################################################
3069 # Class: Processor_PMD_FDR_for_Galaxy
3070 # Purpose: Wrapper on tools from Project 019 to enable a Galaxy-based interface
3071 ###############################################################################
3072 Processor_PMD_FDR_for_Galaxy <- setRefClass("Processor_PMD_FDR_for_Galaxy",
3073 fields = list(
3074 parser = "Data_Object_Parser",
3075 info = "Data_Object_Info_Parser",
3076 raw_data = "Data_Object_Raw_Data",
3077 raw_1_percent = "Data_Object_Raw_1_Percent",
3078 data_converter = "Data_Object_Data_Converter",
3079 data_groups = "Data_Object_Groupings",
3080 densities = "Data_Object_Densities",
3081 alpha = "Data_Object_Alpha",
3082 i_fdr = "Data_Object_Individual_FDR"
3083 ))
3084 Processor_PMD_FDR_for_Galaxy$methods(
3085 initialize = function(){
3086 # This initialization defines all of the dependencies between the various components
3087 # (Unfortunately, inheriting from Data_Processor leads to issues - I had to reimplement it here with a change to "info")
3088
3089 # info
3090 info$set_parser(parser)
3091 parser$append_child(info)
3092
3093 # raw_data
3094 raw_data$set_info(info)
3095 info$append_child(raw_data)
3096
3097 # raw_1_percent
3098 raw_1_percent$set_info(info)
3099 info$append_child(raw_1_percent)
3100
3101 # data_converter
3102 data_converter$set_info (info)
3103 data_converter$set_raw_data(raw_data)
3104 info $append_child (data_converter)
3105 raw_data $append_child (data_converter)
3106
3107 # data_groups
3108 data_groups$set_info (info)
3109 data_groups$set_data_converter(data_converter)
3110 data_groups$set_raw_1_percent (raw_1_percent)
3111 info $append_child (data_groups)
3112 data_converter$append_child (data_groups)
3113 raw_1_percent $append_child (data_groups)
3114
3115 # densities
3116 densities $set_data_groups(data_groups)
3117 data_groups$append_child (densities)
3118
3119 # alpha
3120 alpha $set_densities(densities)
3121 densities$append_child (alpha)
3122
3123 # i_fdr
3124 i_fdr$set_data_groups(data_groups)
3125 i_fdr$set_densities (densities)
3126 i_fdr$set_alpha (alpha)
3127 data_groups $append_child(i_fdr)
3128 densities $append_child(i_fdr)
3129 alpha $append_child(i_fdr)
3130
3131 },
3132 compute = function(){
3133 #i_fdr is currently the lowest level object - it ultimately depends on everything else.
3134 i_fdr$ensure() # All pieces on which i_fdr depends are automatically verified and computed (through their verify() and ensure())
3135
3136 save_standard_df(x = densities$df, file_path = info$output_densities)
3137 save_standard_df(x = alpha$df, file_path = info$output_g_fdr)
3138 save_standard_df(x = i_fdr$df, file_path = info$output_i_fdr)
3139 }
3140 )
3141 ###############################################################################
3142 # D - 021 - PMD-FDR Main.R #
3143 # #
3144 # File Description: Contains the base code that interprets the parameters #
3145 # and computes i-FDR and g-FDR for a mass spec project #
3146 # #
3147 ###############################################################################
3148 argv <- commandArgs(TRUE) # Saves the parameters (command code)
3149
3150 processor <- Processor_PMD_FDR_for_Galaxy$new()
3151 processor$parser$set_args(argv)
3152 processor$compute()
3153