comparison mqppep_anova.R @ 1:08678c931f5d draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author galaxyp
date Fri, 28 Oct 2022 18:27:21 +0000
parents dbff53e6f75f
children dda27b9273a8
comparison
equal deleted inserted replaced
0:dbff53e6f75f 1:08678c931f5d
1 #!/usr/bin/env Rscript 1 #!/usr/bin/env Rscript
2 # libraries 2 # libraries
3 library(optparse) 3 library(optparse)
4 library(data.table)
5 library(stringr) 4 library(stringr)
5 library(tinytex)
6 6
7 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 7 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
8 8
9 # parse options 9 # parse options
10 option_list <- list( 10 option_list <- list(
11 make_option( 11
12 c("-i", "--inputFile"), 12 # files
13 action = "store",
14 default = NA,
15 type = "character",
16 help = "Phosphopeptide Intensities sparse input file path"
17 ),
18 make_option( 13 make_option(
19 c("-a", "--alphaFile"), 14 c("-a", "--alphaFile"),
20 action = "store", 15 action = "store",
21 default = NA, 16 default = NA,
22 type = "character", 17 type = "character",
23 help = paste0("List of alpha cutoff values for significance testing;", 18 help = paste0("List of alpha cutoff values for significance testing;",
24 " path to text file having one column and no header") 19 " path to text file having one column and no header")
25 ), 20 ),
26 make_option( 21 make_option(
27 c("-S", "--preproc_sqlite"), 22 c("-M", "--anova_ksea_metadata"),
28 action = "store", 23 action = "store",
29 default = NA, 24 default = "anova_ksea_metadata.tsv",
30 type = "character", 25 type = "character",
31 help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" 26 help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"
32 ),
33 make_option(
34 c("-K", "--ksea_sqlite"),
35 action = "store",
36 default = NA,
37 type = "character",
38 help = "Path to 'ksea_sqlite' output produced by this tool"
39 ),
40 make_option(
41 c("-f", "--firstDataColumn"),
42 action = "store",
43 default = "^Intensity[^_]",
44 type = "character",
45 help = "First column of intensity values"
46 ),
47 make_option(
48 c("-m", "--imputationMethod"),
49 action = "store",
50 default = "random",
51 type = "character",
52 help = paste0("Method for missing-value imputation,",
53 " one of c('group-median','median','mean','random')")
54 ),
55 make_option(
56 c("-p", "--meanPercentile"),
57 action = "store",
58 default = 3,
59 type = "integer",
60 help = paste0("Mean percentile for randomly generated imputed values;",
61 ", range [1,99]")
62 ),
63 make_option(
64 c("-d", "--sdPercentile"),
65 action = "store",
66 default = 3,
67 type = "double",
68 help = paste0("Adjustment value for standard deviation of",
69 " randomly generated imputed values; real")
70 ),
71 make_option(
72 c("-s", "--regexSampleNames"),
73 action = "store",
74 default = "\\.(\\d+)[A-Z]$",
75 type = "character",
76 help = "Regular expression extracting sample-names"
77 ),
78 make_option(
79 c("-g", "--regexSampleGrouping"),
80 action = "store",
81 default = "(\\d+)",
82 type = "character",
83 help = paste0("Regular expression extracting sample-group",
84 " from an extracted sample-name")
85 ), 27 ),
86 make_option( 28 make_option(
87 c("-o", "--imputedDataFile"), 29 c("-o", "--imputedDataFile"),
88 action = "store", 30 action = "store",
89 default = "output_imputed.tsv", 31 default = "output_imputed.tsv",
100 "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide", 42 "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide",
101 "Intensities output file path" 43 "Intensities output file path"
102 ) 44 )
103 ), 45 ),
104 make_option( 46 make_option(
47 c("-i", "--inputFile"),
48 action = "store",
49 default = NA,
50 type = "character",
51 help = "Phosphopeptide Intensities sparse input file path"
52 ),
53 make_option(
54 c("-K", "--ksea_sqlite"),
55 action = "store",
56 default = NA,
57 type = "character",
58 help = "Path to 'ksea_sqlite' output produced by this tool"
59 ),
60 make_option(
61 c("-S", "--preproc_sqlite"),
62 action = "store",
63 default = NA,
64 type = "character",
65 help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
66 ),
67 make_option(
105 c("-r", "--reportFile"), 68 c("-r", "--reportFile"),
106 action = "store", 69 action = "store",
107 default = "QuantDataProcessingScript.html", 70 default = "mqppep_anova.pdf",
108 type = "character", 71 type = "character",
109 help = "HTML report file path" 72 help = "PDF report file path"
73 ),
74
75 # parameters
76 make_option(
77 c("-f", "--firstDataColumn"),
78 action = "store",
79 default = "^Intensity[^_]",
80 type = "character",
81 help = "First column of intensity values"
82 ),
83 make_option(
84 c("-m", "--imputationMethod"),
85 action = "store",
86 default = "random",
87 type = "character",
88 help = paste0("Method for missing-value imputation,",
89 " one of c('group-median','median','mean','random')")
90 ),
91 make_option(
92 c("-C", "--intensityMinValuesPerClass"),
93 action = "store",
94 default = "0",
95 type = "integer",
96 help = "Minimum number of observed values per class"
110 ), 97 ),
111 make_option( 98 make_option(
112 c("-k", "--ksea_cutoff_statistic"), 99 c("-k", "--ksea_cutoff_statistic"),
113 action = "store", 100 action = "store",
114 default = "FDR", 101 default = "FDR",
122 default = 0.05, 109 default = 0.05,
123 type = "double", 110 type = "double",
124 help = paste0("Maximum score to be used to score a kinase enrichment as significant") 111 help = paste0("Maximum score to be used to score a kinase enrichment as significant")
125 ), 112 ),
126 make_option( 113 make_option(
127 c("-M", "--anova_ksea_metadata"), 114 c("-c", "--kseaMinSubstrateCount"),
128 action = "store", 115 action = "store",
129 default = "anova_ksea_metadata.tsv", 116 default = "1",
130 type = "character", 117 type = "integer",
131 help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" 118 help = "Minimum number of substrates to consider any kinase for KSEA"
119 ),
120 make_option(
121 c("--kseaUseAbsoluteLog2FC"),
122 action = "store_true",
123 default = "FALSE",
124 type = "logical",
125 help = paste0("Should abs(log2(fold-change)) be used for KSEA?",
126 " (TRUE may alter number of hits.)")
127 ),
128 make_option(
129 c("-p", "--meanPercentile"),
130 action = "store",
131 default = 3,
132 type = "integer",
133 help = paste0("Mean percentile for randomly generated imputed values;",
134 ", range [1,99]")
135 ),
136 make_option(
137 c("--minQuality"),
138 action = "store",
139 default = 0,
140 type = "integer",
141 help = paste0("Minimum quality (higher value reduces number of substrates",
142 " accepted; you may want to keep below 100), range [0,infinity]")
143 ),
144 make_option(
145 c("--oneWayManyCategories"),
146 action = "store",
147 default = "aov",
148 type = "character",
149 help = "Name of R function for one-way tests among more than two categories"
150 ),
151 make_option(
152 c("--oneWayTwoCategories"),
153 action = "store",
154 default = "two.way",
155 type = "character",
156 help = "Name of R function for one-way tests between two categories"
157 ),
158 make_option(
159 c("-s", "--regexSampleNames"),
160 action = "store",
161 default = "\\.(\\d+)[A-Z]$",
162 type = "character",
163 help = "Regular expression extracting sample-names"
164 ),
165 make_option(
166 c("-g", "--regexSampleGrouping"),
167 action = "store",
168 default = "(\\d+)",
169 type = "character",
170 help = paste0("Regular expression extracting sample-group",
171 " from an extracted sample-name")
172 ),
173 make_option(
174 c("-d", "--sdPercentile"),
175 action = "store",
176 default = 3,
177 type = "double",
178 help = paste0("Adjustment value for standard deviation of",
179 " randomly generated imputed values; real")
180 ),
181 make_option(
182 c("-F", "--sampleGroupFilter"),
183 action = "store",
184 default = "none",
185 type = "character",
186 help = paste0("Should no filter be applied to sample group names (none)",
187 " or should the filter specify samples to include or exclude?")
188 ),
189 make_option(
190 c("--sampleGroupFilterMode"),
191 action = "store",
192 default = "r",
193 type = "character",
194 help = paste0("First character ('f', 'p', or 'r') indicating regular",
195 "expression matching mode ('fixed', 'perl', or 'grep'; ",
196 "see https://rdrr.io/r/base/grep.html). Second character may be 'i;",
197 "to make search ignore case.")
198 ),
199 make_option(
200 c("-G", "--sampleGroupFilterPatterns"),
201 action = "store",
202 default = ".*",
203 type = "character",
204 help = paste0("Regular expression extracting sample-group",
205 " from an extracted sample-name")
132 ) 206 )
133 ) 207 )
134 args <- parse_args(OptionParser(option_list = option_list)) 208
209 tryCatch(
210 args <- parse_args(
211 OptionParser(
212 option_list = option_list,
213 add_help_option = TRUE
214 ),
215 print_help_and_exit = TRUE
216 ),
217 error = function(e) {
218 parse_args(
219 OptionParser(
220 option_list = option_list,
221 add_help_option = TRUE
222 ),
223 print_help_and_exit = TRUE
224 )
225 stop(as.character(e))
226 }
227 )
135 print("args is:") 228 print("args is:")
136 cat(str(args)) 229 cat(str(args))
137 230
138 # Check parameter values 231 # Check parameter values
139 232
140 if (! file.exists(args$inputFile)) { 233 if (! file.exists(args$inputFile)) {
141 stop((paste("Input file", args$inputFile, "does not exist"))) 234 stop((paste("Input file", args$inputFile, "does not exist")))
142 } 235 }
143 input_file <- args$inputFile 236
144 alpha_file <- args$alphaFile 237 # files
145 preproc_sqlite <- args$preproc_sqlite 238 alpha_file <- args$alphaFile
146 imputed_data_file_name <- args$imputedDataFile 239 anova_ksea_metadata_file <- args$anova_ksea_metadata
147 imp_qn_lt_data_filenm <- args$imputedQNLTDataFile 240 imp_qn_lt_data_file <- args$imputedQNLTDataFile
148 anova_ksea_metadata <- args$anova_ksea_metadata 241 imputed_data_file <- args$imputedDataFile
149 report_file_name <- args$reportFile 242 input_file <- args$inputFile
150 ksea_sqlite <- args$ksea_sqlite 243 ksea_sqlite_file <- args$ksea_sqlite
151 ksea_cutoff_statistic <- args$ksea_cutoff_statistic 244 preproc_sqlite_file <- args$preproc_sqlite
152 ksea_cutoff_threshold <- args$ksea_cutoff_threshold 245 report_file_name <- args$reportFile
246
247 # parameters
248 # firstDataColumn - see below
249 group_filter <- args$sampleGroupFilter
250 group_filter_mode <- args$sampleGroupFilterMode
251 # imputationMethod - see below
252 intensity_min_values_per_class <- args$intensityMinValuesPerClass
253 ksea_cutoff_statistic <- args$ksea_cutoff_statistic
254 ksea_cutoff_threshold <- args$ksea_cutoff_threshold
255 ksea_min_substrate_count <- args$kseaMinSubstrateCount
256 ksea_use_absolute_log2_fc <- args$kseaUseAbsoluteLog2FC
257 # mean_percentile - see below
258 min_quality <- args$minQuality
259 # regexSampleNames - see below
260 # regexSampleGrouping - see below
261 # sampleGroupFilterPatterns - see below (becomes group_filter_patterns)
262 # sd_percentile - see below
263
153 if ( 264 if (
154 sum( 265 sum(
155 grepl( 266 grepl(
156 pattern = ksea_cutoff_statistic, 267 pattern = ksea_cutoff_statistic,
157 x = c("FDR", "p.value") 268 x = c("FDR", "p.value")
190 # convert string parameters that are passed in via config files: 301 # convert string parameters that are passed in via config files:
191 # - firstDataColumn 302 # - firstDataColumn
192 # - regexSampleNames 303 # - regexSampleNames
193 # - regexSampleGrouping 304 # - regexSampleGrouping
194 read_config_file_string <- function(fname, limit) { 305 read_config_file_string <- function(fname, limit) {
306 cat(sprintf("read_config_file_string: fname = '%s'\n", fname))
307 cat(sprintf("length(fname) = '%s'\n", length(fname)))
308 result <-
309 if (file.exists(fname)) {
310 cat(sprintf("reading '%s' ...\n", fname))
311 readChar(fname, limit)
312 } else {
313 cat(sprintf("not a file: '%s'\n", fname))
314 fname
315 }
316 #AC print(paste0("read_config_file_string: opening file '", as.character(fname), "'"))
195 # eliminate any leading whitespace 317 # eliminate any leading whitespace
196 result <- gsub("^[ \t\n]*", "", readChar(fname, limit)) 318 result <- gsub("^[ \t\n]*", "", result)
197 # eliminate any trailing whitespace 319 # eliminate any trailing whitespace
198 result <- gsub("[ \t\n]*$", "", result) 320 result <- gsub("[ \t\n]*$", "", result)
199 # substitute characters escaped by Galaxy sanitizer 321 # substitute characters escaped by Galaxy sanitizer
200 result <- gsub("__lt__", "<", result) 322 result <- gsub("__lt__", "<", result)
201 result <- gsub("__le__", "<=", result) 323 result <- gsub("__le__", "<=", result)
202 result <- gsub("__eq__", "==", result) 324 result <- gsub("__eq__", "==", result)
203 result <- gsub("__ne__", "!=", result) 325 result <- gsub("__ne__", "!=", result)
204 result <- gsub("__gt__", ">", result) 326 result <- gsub("__gt__", ">", result)
205 result <- gsub("__ge__", ">=", result) 327 result <- gsub("__ge__", ">=", result)
206 result <- gsub("__sq__", "'", result) 328 result <- gsub("__sq__", "'", result)
207 result <- gsub("__dq__", '"', result) 329 result <- gsub("__dq__", '"', result)
208 result <- gsub("__ob__", "[", result) 330 result <- gsub("__ob__", "[", result)
209 result <- gsub("__cb__", "]", result) 331 result <- gsub("__cb__", "]", result)
210 } 332 }
333 nc <- 1000
334
335 sink(stderr())
336
211 cat(paste0("first_data_column file: ", args$firstDataColumn, "\n")) 337 cat(paste0("first_data_column file: ", args$firstDataColumn, "\n"))
212 cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n"))
213 cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n"))
214 nc <- 1000
215 regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)
216 regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
217 first_data_column <- read_config_file_string(args$firstDataColumn, nc) 338 first_data_column <- read_config_file_string(args$firstDataColumn, nc)
218 cat(paste0("first_data_column: ", first_data_column, "\n")) 339 cat(paste0("first_data_column: ", first_data_column, "\n"))
340
341 cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n"))
342 regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
343 cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n"))
344
345 cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n"))
346 regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)
219 cat(paste0("regex_sample_names: ", regex_sample_names, "\n")) 347 cat(paste0("regex_sample_names: ", regex_sample_names, "\n"))
220 cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n")) 348
349 if (group_filter != "none") {
350 cat(paste0("group_filter_patterns file: '", args$sampleGroupFilterPatterns, "'\n"))
351 group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc)
352 } else {
353 group_filter_patterns <- ".*"
354 }
355 cat(paste0("group_filter_patterns: ", group_filter_patterns, "\n"))
356
357 sink()
358
221 359
222 # from: https://github.com/molgenis/molgenis-pipelines/wiki/ 360 # from: https://github.com/molgenis/molgenis-pipelines/wiki/
223 # How-to-source-another_file.R-from-within-your-R-script 361 # How-to-source-another_file.R-from-within-your-R-script
224 # Function location_of_this_script returns the location of this .R script 362 # Function location_of_this_script returns the location of this .R script
225 # (may be needed to source other files in same dir) 363 # (may be needed to source other files in same dir)
251 389
252 # Both are not the case. Maybe we are in an R GUI? 390 # Both are not the case. Maybe we are in an R GUI?
253 return(NULL) 391 return(NULL)
254 } 392 }
255 393
256 script_dir <- location_of_this_script() 394 # validation of input parameters is complete; it is now justifiable to
395 # install LaTeX tools to render markdown as PDF; this involves a big
396 # download from GitHub
397 if (!tinytex::is_tinytex()) tinytex::install_tinytex()
257 398
258 rmarkdown_params <- list( 399 rmarkdown_params <- list(
259 inputFile = input_file 400
260 , alphaFile = alpha_file 401 # files
261 , preprocDb = preproc_sqlite 402 alphaFile = alpha_file
403 , anovaKseaMetadata = anova_ksea_metadata_file
404 , imputedDataFilename = imputed_data_file
405 , imputedQNLTDataFile = imp_qn_lt_data_file
406 , inputFile = input_file
407 , kseaAppPrepDb = ksea_sqlite_file
408 , preprocDb = preproc_sqlite_file
409
410 # parameters
262 , firstDataColumn = first_data_column 411 , firstDataColumn = first_data_column
412 , groupFilter = group_filter
413 , groupFilterMode = group_filter_mode # arg sampleGroupFilterMode
414 , groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns
263 , imputationMethod = imputation_method 415 , imputationMethod = imputation_method
416 , intensityMinValuesPerGroup = intensity_min_values_per_class
417 , kseaCutoffStatistic = ksea_cutoff_statistic
418 , kseaCutoffThreshold = ksea_cutoff_threshold
419 , kseaMinSubstrateCount = ksea_min_substrate_count
420 , kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add
264 , meanPercentile = mean_percentile 421 , meanPercentile = mean_percentile
422 , minQuality = min_quality # add
423 , regexSampleGrouping = regex_sample_grouping
424 , regexSampleNames = regex_sample_names
265 , sdPercentile = sd_percentile 425 , sdPercentile = sd_percentile
266 , regexSampleNames = regex_sample_names
267 , regexSampleGrouping = regex_sample_grouping
268 , imputedDataFilename = imputed_data_file_name
269 , imputedQNLTDataFile = imp_qn_lt_data_filenm
270 , anovaKseaMetadata = anova_ksea_metadata
271 , kseaAppPrepDb = ksea_sqlite
272 , kseaCutoffThreshold = ksea_cutoff_threshold
273 , kseaCutoffStatistic = ksea_cutoff_statistic
274 ) 426 )
275 427
276 print("rmarkdown_params") 428 print("rmarkdown_params")
277 str(rmarkdown_params) 429 print(rmarkdown_params)
430 print(
431 lapply(
432 X = rmarkdown_params,
433 FUN = function(x) {
434 paste0(
435 nchar(as.character(x)),
436 ": '",
437 as.character(x),
438 "'"
439 )
440 }
441 )
442 )
443
278 444
279 # freeze the random number generator so the same results will be produced 445 # freeze the random number generator so the same results will be produced
280 # from run to run 446 # from run to run
281 set.seed(28571) 447 set.seed(28571)
282 448
283 # BUG (or "opportunity") 449 script_dir <- location_of_this_script()
284 # To render as PDF for the time being requires installing the conda 450
285 # package `r-texlive` until this issue in `texlive-core` is resolved:
286 # https://github.com/conda-forge/texlive-core-feedstock/issues/19
287 # This workaround is detailed in the fourth comment of:
288 # https://github.com/conda-forge/texlive-core-feedstock/issues/61
289
290 library(tinytex)
291 tinytex::install_tinytex()
292 rmarkdown::render( 451 rmarkdown::render(
293 input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/") 452 input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")
294 , output_format = rmarkdown::pdf_document(toc = TRUE)
295 , output_file = report_file_name 453 , output_file = report_file_name
296 , params = rmarkdown_params 454 , params = rmarkdown_params
455 , output_format = rmarkdown::pdf_document(
456 includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex")
457 , dev = "pdf"
458 , toc = TRUE
459 , toc_depth = 2
460 , number_sections = FALSE
461 )
297 ) 462 )