Mercurial > repos > galaxyp > mqppep_preproc
comparison mqppep_anova.R @ 1:b76c75521d91 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author | galaxyp |
---|---|
date | Fri, 28 Oct 2022 18:26:42 +0000 |
parents | 8dfd5d2b5903 |
children | bae3a23461c9 |
comparison
equal
deleted
inserted
replaced
0:8dfd5d2b5903 | 1:b76c75521d91 |
---|---|
1 #!/usr/bin/env Rscript | 1 #!/usr/bin/env Rscript |
2 # libraries | 2 # libraries |
3 library(optparse) | 3 library(optparse) |
4 library(data.table) | |
5 library(stringr) | 4 library(stringr) |
5 library(tinytex) | |
6 | 6 |
7 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | 7 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 |
8 | 8 |
9 # parse options | 9 # parse options |
10 option_list <- list( | 10 option_list <- list( |
11 make_option( | 11 |
12 c("-i", "--inputFile"), | 12 # files |
13 action = "store", | |
14 default = NA, | |
15 type = "character", | |
16 help = "Phosphopeptide Intensities sparse input file path" | |
17 ), | |
18 make_option( | 13 make_option( |
19 c("-a", "--alphaFile"), | 14 c("-a", "--alphaFile"), |
20 action = "store", | 15 action = "store", |
21 default = NA, | 16 default = NA, |
22 type = "character", | 17 type = "character", |
23 help = paste0("List of alpha cutoff values for significance testing;", | 18 help = paste0("List of alpha cutoff values for significance testing;", |
24 " path to text file having one column and no header") | 19 " path to text file having one column and no header") |
25 ), | 20 ), |
26 make_option( | 21 make_option( |
27 c("-S", "--preproc_sqlite"), | 22 c("-M", "--anova_ksea_metadata"), |
28 action = "store", | 23 action = "store", |
29 default = NA, | 24 default = "anova_ksea_metadata.tsv", |
30 type = "character", | 25 type = "character", |
31 help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" | 26 help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" |
32 ), | |
33 make_option( | |
34 c("-K", "--ksea_sqlite"), | |
35 action = "store", | |
36 default = NA, | |
37 type = "character", | |
38 help = "Path to 'ksea_sqlite' output produced by this tool" | |
39 ), | |
40 make_option( | |
41 c("-f", "--firstDataColumn"), | |
42 action = "store", | |
43 default = "^Intensity[^_]", | |
44 type = "character", | |
45 help = "First column of intensity values" | |
46 ), | |
47 make_option( | |
48 c("-m", "--imputationMethod"), | |
49 action = "store", | |
50 default = "random", | |
51 type = "character", | |
52 help = paste0("Method for missing-value imputation,", | |
53 " one of c('group-median','median','mean','random')") | |
54 ), | |
55 make_option( | |
56 c("-p", "--meanPercentile"), | |
57 action = "store", | |
58 default = 3, | |
59 type = "integer", | |
60 help = paste0("Mean percentile for randomly generated imputed values;", | |
61 ", range [1,99]") | |
62 ), | |
63 make_option( | |
64 c("-d", "--sdPercentile"), | |
65 action = "store", | |
66 default = 3, | |
67 type = "double", | |
68 help = paste0("Adjustment value for standard deviation of", | |
69 " randomly generated imputed values; real") | |
70 ), | |
71 make_option( | |
72 c("-s", "--regexSampleNames"), | |
73 action = "store", | |
74 default = "\\.(\\d+)[A-Z]$", | |
75 type = "character", | |
76 help = "Regular expression extracting sample-names" | |
77 ), | |
78 make_option( | |
79 c("-g", "--regexSampleGrouping"), | |
80 action = "store", | |
81 default = "(\\d+)", | |
82 type = "character", | |
83 help = paste0("Regular expression extracting sample-group", | |
84 " from an extracted sample-name") | |
85 ), | 27 ), |
86 make_option( | 28 make_option( |
87 c("-o", "--imputedDataFile"), | 29 c("-o", "--imputedDataFile"), |
88 action = "store", | 30 action = "store", |
89 default = "output_imputed.tsv", | 31 default = "output_imputed.tsv", |
100 "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide", | 42 "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide", |
101 "Intensities output file path" | 43 "Intensities output file path" |
102 ) | 44 ) |
103 ), | 45 ), |
104 make_option( | 46 make_option( |
47 c("-i", "--inputFile"), | |
48 action = "store", | |
49 default = NA, | |
50 type = "character", | |
51 help = "Phosphopeptide Intensities sparse input file path" | |
52 ), | |
53 make_option( | |
54 c("-K", "--ksea_sqlite"), | |
55 action = "store", | |
56 default = NA, | |
57 type = "character", | |
58 help = "Path to 'ksea_sqlite' output produced by this tool" | |
59 ), | |
60 make_option( | |
61 c("-S", "--preproc_sqlite"), | |
62 action = "store", | |
63 default = NA, | |
64 type = "character", | |
65 help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" | |
66 ), | |
67 make_option( | |
105 c("-r", "--reportFile"), | 68 c("-r", "--reportFile"), |
106 action = "store", | 69 action = "store", |
107 default = "QuantDataProcessingScript.html", | 70 default = "mqppep_anova.pdf", |
108 type = "character", | 71 type = "character", |
109 help = "HTML report file path" | 72 help = "PDF report file path" |
73 ), | |
74 | |
75 # parameters | |
76 make_option( | |
77 c("-f", "--firstDataColumn"), | |
78 action = "store", | |
79 default = "^Intensity[^_]", | |
80 type = "character", | |
81 help = "First column of intensity values" | |
82 ), | |
83 make_option( | |
84 c("-m", "--imputationMethod"), | |
85 action = "store", | |
86 default = "random", | |
87 type = "character", | |
88 help = paste0("Method for missing-value imputation,", | |
89 " one of c('group-median','median','mean','random')") | |
90 ), | |
91 make_option( | |
92 c("-C", "--intensityMinValuesPerClass"), | |
93 action = "store", | |
94 default = "0", | |
95 type = "integer", | |
96 help = "Minimum number of observed values per class" | |
110 ), | 97 ), |
111 make_option( | 98 make_option( |
112 c("-k", "--ksea_cutoff_statistic"), | 99 c("-k", "--ksea_cutoff_statistic"), |
113 action = "store", | 100 action = "store", |
114 default = "FDR", | 101 default = "FDR", |
122 default = 0.05, | 109 default = 0.05, |
123 type = "double", | 110 type = "double", |
124 help = paste0("Maximum score to be used to score a kinase enrichment as significant") | 111 help = paste0("Maximum score to be used to score a kinase enrichment as significant") |
125 ), | 112 ), |
126 make_option( | 113 make_option( |
127 c("-M", "--anova_ksea_metadata"), | 114 c("-c", "--kseaMinSubstrateCount"), |
128 action = "store", | 115 action = "store", |
129 default = "anova_ksea_metadata.tsv", | 116 default = "1", |
130 type = "character", | 117 type = "integer", |
131 help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" | 118 help = "Minimum number of substrates to consider any kinase for KSEA" |
119 ), | |
120 make_option( | |
121 c("--kseaUseAbsoluteLog2FC"), | |
122 action = "store_true", | |
123 default = "FALSE", | |
124 type = "logical", | |
125 help = paste0("Should abs(log2(fold-change)) be used for KSEA?", | |
126 " (TRUE may alter number of hits.)") | |
127 ), | |
128 make_option( | |
129 c("-p", "--meanPercentile"), | |
130 action = "store", | |
131 default = 3, | |
132 type = "integer", | |
133 help = paste0("Mean percentile for randomly generated imputed values;", | |
134 ", range [1,99]") | |
135 ), | |
136 make_option( | |
137 c("--minQuality"), | |
138 action = "store", | |
139 default = 0, | |
140 type = "integer", | |
141 help = paste0("Minimum quality (higher value reduces number of substrates", | |
142 " accepted; you may want to keep below 100), range [0,infinity]") | |
143 ), | |
144 make_option( | |
145 c("--oneWayManyCategories"), | |
146 action = "store", | |
147 default = "aov", | |
148 type = "character", | |
149 help = "Name of R function for one-way tests among more than two categories" | |
150 ), | |
151 make_option( | |
152 c("--oneWayTwoCategories"), | |
153 action = "store", | |
154 default = "two.way", | |
155 type = "character", | |
156 help = "Name of R function for one-way tests between two categories" | |
157 ), | |
158 make_option( | |
159 c("-s", "--regexSampleNames"), | |
160 action = "store", | |
161 default = "\\.(\\d+)[A-Z]$", | |
162 type = "character", | |
163 help = "Regular expression extracting sample-names" | |
164 ), | |
165 make_option( | |
166 c("-g", "--regexSampleGrouping"), | |
167 action = "store", | |
168 default = "(\\d+)", | |
169 type = "character", | |
170 help = paste0("Regular expression extracting sample-group", | |
171 " from an extracted sample-name") | |
172 ), | |
173 make_option( | |
174 c("-d", "--sdPercentile"), | |
175 action = "store", | |
176 default = 3, | |
177 type = "double", | |
178 help = paste0("Adjustment value for standard deviation of", | |
179 " randomly generated imputed values; real") | |
180 ), | |
181 make_option( | |
182 c("-F", "--sampleGroupFilter"), | |
183 action = "store", | |
184 default = "none", | |
185 type = "character", | |
186 help = paste0("Should no filter be applied to sample group names (none)", | |
187 " or should the filter specify samples to include or exclude?") | |
188 ), | |
189 make_option( | |
190 c("--sampleGroupFilterMode"), | |
191 action = "store", | |
192 default = "r", | |
193 type = "character", | |
194 help = paste0("First character ('f', 'p', or 'r') indicating regular", | |
195 "expression matching mode ('fixed', 'perl', or 'grep'; ", | |
196 "see https://rdrr.io/r/base/grep.html). Second character may be 'i;", | |
197 "to make search ignore case.") | |
198 ), | |
199 make_option( | |
200 c("-G", "--sampleGroupFilterPatterns"), | |
201 action = "store", | |
202 default = ".*", | |
203 type = "character", | |
204 help = paste0("Regular expression extracting sample-group", | |
205 " from an extracted sample-name") | |
132 ) | 206 ) |
133 ) | 207 ) |
134 args <- parse_args(OptionParser(option_list = option_list)) | 208 |
209 tryCatch( | |
210 args <- parse_args( | |
211 OptionParser( | |
212 option_list = option_list, | |
213 add_help_option = TRUE | |
214 ), | |
215 print_help_and_exit = TRUE | |
216 ), | |
217 error = function(e) { | |
218 parse_args( | |
219 OptionParser( | |
220 option_list = option_list, | |
221 add_help_option = TRUE | |
222 ), | |
223 print_help_and_exit = TRUE | |
224 ) | |
225 stop(as.character(e)) | |
226 } | |
227 ) | |
135 print("args is:") | 228 print("args is:") |
136 cat(str(args)) | 229 cat(str(args)) |
137 | 230 |
138 # Check parameter values | 231 # Check parameter values |
139 | 232 |
140 if (! file.exists(args$inputFile)) { | 233 if (! file.exists(args$inputFile)) { |
141 stop((paste("Input file", args$inputFile, "does not exist"))) | 234 stop((paste("Input file", args$inputFile, "does not exist"))) |
142 } | 235 } |
143 input_file <- args$inputFile | 236 |
144 alpha_file <- args$alphaFile | 237 # files |
145 preproc_sqlite <- args$preproc_sqlite | 238 alpha_file <- args$alphaFile |
146 imputed_data_file_name <- args$imputedDataFile | 239 anova_ksea_metadata_file <- args$anova_ksea_metadata |
147 imp_qn_lt_data_filenm <- args$imputedQNLTDataFile | 240 imp_qn_lt_data_file <- args$imputedQNLTDataFile |
148 anova_ksea_metadata <- args$anova_ksea_metadata | 241 imputed_data_file <- args$imputedDataFile |
149 report_file_name <- args$reportFile | 242 input_file <- args$inputFile |
150 ksea_sqlite <- args$ksea_sqlite | 243 ksea_sqlite_file <- args$ksea_sqlite |
151 ksea_cutoff_statistic <- args$ksea_cutoff_statistic | 244 preproc_sqlite_file <- args$preproc_sqlite |
152 ksea_cutoff_threshold <- args$ksea_cutoff_threshold | 245 report_file_name <- args$reportFile |
246 | |
247 # parameters | |
248 # firstDataColumn - see below | |
249 group_filter <- args$sampleGroupFilter | |
250 group_filter_mode <- args$sampleGroupFilterMode | |
251 # imputationMethod - see below | |
252 intensity_min_values_per_class <- args$intensityMinValuesPerClass | |
253 ksea_cutoff_statistic <- args$ksea_cutoff_statistic | |
254 ksea_cutoff_threshold <- args$ksea_cutoff_threshold | |
255 ksea_min_substrate_count <- args$kseaMinSubstrateCount | |
256 ksea_use_absolute_log2_fc <- args$kseaUseAbsoluteLog2FC | |
257 # mean_percentile - see below | |
258 min_quality <- args$minQuality | |
259 # regexSampleNames - see below | |
260 # regexSampleGrouping - see below | |
261 # sampleGroupFilterPatterns - see below (becomes group_filter_patterns) | |
262 # sd_percentile - see below | |
263 | |
153 if ( | 264 if ( |
154 sum( | 265 sum( |
155 grepl( | 266 grepl( |
156 pattern = ksea_cutoff_statistic, | 267 pattern = ksea_cutoff_statistic, |
157 x = c("FDR", "p.value") | 268 x = c("FDR", "p.value") |
190 # convert string parameters that are passed in via config files: | 301 # convert string parameters that are passed in via config files: |
191 # - firstDataColumn | 302 # - firstDataColumn |
192 # - regexSampleNames | 303 # - regexSampleNames |
193 # - regexSampleGrouping | 304 # - regexSampleGrouping |
194 read_config_file_string <- function(fname, limit) { | 305 read_config_file_string <- function(fname, limit) { |
306 cat(sprintf("read_config_file_string: fname = '%s'\n", fname)) | |
307 cat(sprintf("length(fname) = '%s'\n", length(fname))) | |
308 result <- | |
309 if (file.exists(fname)) { | |
310 cat(sprintf("reading '%s' ...\n", fname)) | |
311 readChar(fname, limit) | |
312 } else { | |
313 cat(sprintf("not a file: '%s'\n", fname)) | |
314 fname | |
315 } | |
316 #AC print(paste0("read_config_file_string: opening file '", as.character(fname), "'")) | |
195 # eliminate any leading whitespace | 317 # eliminate any leading whitespace |
196 result <- gsub("^[ \t\n]*", "", readChar(fname, limit)) | 318 result <- gsub("^[ \t\n]*", "", result) |
197 # eliminate any trailing whitespace | 319 # eliminate any trailing whitespace |
198 result <- gsub("[ \t\n]*$", "", result) | 320 result <- gsub("[ \t\n]*$", "", result) |
199 # substitute characters escaped by Galaxy sanitizer | 321 # substitute characters escaped by Galaxy sanitizer |
200 result <- gsub("__lt__", "<", result) | 322 result <- gsub("__lt__", "<", result) |
201 result <- gsub("__le__", "<=", result) | 323 result <- gsub("__le__", "<=", result) |
202 result <- gsub("__eq__", "==", result) | 324 result <- gsub("__eq__", "==", result) |
203 result <- gsub("__ne__", "!=", result) | 325 result <- gsub("__ne__", "!=", result) |
204 result <- gsub("__gt__", ">", result) | 326 result <- gsub("__gt__", ">", result) |
205 result <- gsub("__ge__", ">=", result) | 327 result <- gsub("__ge__", ">=", result) |
206 result <- gsub("__sq__", "'", result) | 328 result <- gsub("__sq__", "'", result) |
207 result <- gsub("__dq__", '"', result) | 329 result <- gsub("__dq__", '"', result) |
208 result <- gsub("__ob__", "[", result) | 330 result <- gsub("__ob__", "[", result) |
209 result <- gsub("__cb__", "]", result) | 331 result <- gsub("__cb__", "]", result) |
210 } | 332 } |
333 nc <- 1000 | |
334 | |
335 sink(stderr()) | |
336 | |
211 cat(paste0("first_data_column file: ", args$firstDataColumn, "\n")) | 337 cat(paste0("first_data_column file: ", args$firstDataColumn, "\n")) |
212 cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n")) | |
213 cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n")) | |
214 nc <- 1000 | |
215 regex_sample_names <- read_config_file_string(args$regexSampleNames, nc) | |
216 regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc) | |
217 first_data_column <- read_config_file_string(args$firstDataColumn, nc) | 338 first_data_column <- read_config_file_string(args$firstDataColumn, nc) |
218 cat(paste0("first_data_column: ", first_data_column, "\n")) | 339 cat(paste0("first_data_column: ", first_data_column, "\n")) |
340 | |
341 cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n")) | |
342 regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc) | |
343 cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n")) | |
344 | |
345 cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n")) | |
346 regex_sample_names <- read_config_file_string(args$regexSampleNames, nc) | |
219 cat(paste0("regex_sample_names: ", regex_sample_names, "\n")) | 347 cat(paste0("regex_sample_names: ", regex_sample_names, "\n")) |
220 cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n")) | 348 |
349 if (group_filter != "none") { | |
350 cat(paste0("group_filter_patterns file: '", args$sampleGroupFilterPatterns, "'\n")) | |
351 group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc) | |
352 } else { | |
353 group_filter_patterns <- ".*" | |
354 } | |
355 cat(paste0("group_filter_patterns: ", group_filter_patterns, "\n")) | |
356 | |
357 sink() | |
358 | |
221 | 359 |
222 # from: https://github.com/molgenis/molgenis-pipelines/wiki/ | 360 # from: https://github.com/molgenis/molgenis-pipelines/wiki/ |
223 # How-to-source-another_file.R-from-within-your-R-script | 361 # How-to-source-another_file.R-from-within-your-R-script |
224 # Function location_of_this_script returns the location of this .R script | 362 # Function location_of_this_script returns the location of this .R script |
225 # (may be needed to source other files in same dir) | 363 # (may be needed to source other files in same dir) |
251 | 389 |
252 # Both are not the case. Maybe we are in an R GUI? | 390 # Both are not the case. Maybe we are in an R GUI? |
253 return(NULL) | 391 return(NULL) |
254 } | 392 } |
255 | 393 |
256 script_dir <- location_of_this_script() | 394 # validation of input parameters is complete; it is now justifiable to |
395 # install LaTeX tools to render markdown as PDF; this involves a big | |
396 # download from GitHub | |
397 if (!tinytex::is_tinytex()) tinytex::install_tinytex() | |
257 | 398 |
258 rmarkdown_params <- list( | 399 rmarkdown_params <- list( |
259 inputFile = input_file | 400 |
260 , alphaFile = alpha_file | 401 # files |
261 , preprocDb = preproc_sqlite | 402 alphaFile = alpha_file |
403 , anovaKseaMetadata = anova_ksea_metadata_file | |
404 , imputedDataFilename = imputed_data_file | |
405 , imputedQNLTDataFile = imp_qn_lt_data_file | |
406 , inputFile = input_file | |
407 , kseaAppPrepDb = ksea_sqlite_file | |
408 , preprocDb = preproc_sqlite_file | |
409 | |
410 # parameters | |
262 , firstDataColumn = first_data_column | 411 , firstDataColumn = first_data_column |
412 , groupFilter = group_filter | |
413 , groupFilterMode = group_filter_mode # arg sampleGroupFilterMode | |
414 , groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns | |
263 , imputationMethod = imputation_method | 415 , imputationMethod = imputation_method |
416 , intensityMinValuesPerGroup = intensity_min_values_per_class | |
417 , kseaCutoffStatistic = ksea_cutoff_statistic | |
418 , kseaCutoffThreshold = ksea_cutoff_threshold | |
419 , kseaMinSubstrateCount = ksea_min_substrate_count | |
420 , kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add | |
264 , meanPercentile = mean_percentile | 421 , meanPercentile = mean_percentile |
422 , minQuality = min_quality # add | |
423 , regexSampleGrouping = regex_sample_grouping | |
424 , regexSampleNames = regex_sample_names | |
265 , sdPercentile = sd_percentile | 425 , sdPercentile = sd_percentile |
266 , regexSampleNames = regex_sample_names | |
267 , regexSampleGrouping = regex_sample_grouping | |
268 , imputedDataFilename = imputed_data_file_name | |
269 , imputedQNLTDataFile = imp_qn_lt_data_filenm | |
270 , anovaKseaMetadata = anova_ksea_metadata | |
271 , kseaAppPrepDb = ksea_sqlite | |
272 , kseaCutoffThreshold = ksea_cutoff_threshold | |
273 , kseaCutoffStatistic = ksea_cutoff_statistic | |
274 ) | 426 ) |
275 | 427 |
276 print("rmarkdown_params") | 428 print("rmarkdown_params") |
277 str(rmarkdown_params) | 429 print(rmarkdown_params) |
430 print( | |
431 lapply( | |
432 X = rmarkdown_params, | |
433 FUN = function(x) { | |
434 paste0( | |
435 nchar(as.character(x)), | |
436 ": '", | |
437 as.character(x), | |
438 "'" | |
439 ) | |
440 } | |
441 ) | |
442 ) | |
443 | |
278 | 444 |
279 # freeze the random number generator so the same results will be produced | 445 # freeze the random number generator so the same results will be produced |
280 # from run to run | 446 # from run to run |
281 set.seed(28571) | 447 set.seed(28571) |
282 | 448 |
283 # BUG (or "opportunity") | 449 script_dir <- location_of_this_script() |
284 # To render as PDF for the time being requires installing the conda | 450 |
285 # package `r-texlive` until this issue in `texlive-core` is resolved: | |
286 # https://github.com/conda-forge/texlive-core-feedstock/issues/19 | |
287 # This workaround is detailed in the fourth comment of: | |
288 # https://github.com/conda-forge/texlive-core-feedstock/issues/61 | |
289 | |
290 library(tinytex) | |
291 tinytex::install_tinytex() | |
292 rmarkdown::render( | 451 rmarkdown::render( |
293 input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/") | 452 input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/") |
294 , output_format = rmarkdown::pdf_document(toc = TRUE) | |
295 , output_file = report_file_name | 453 , output_file = report_file_name |
296 , params = rmarkdown_params | 454 , params = rmarkdown_params |
455 , output_format = rmarkdown::pdf_document( | |
456 includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex") | |
457 , dev = "pdf" | |
458 , toc = TRUE | |
459 , toc_depth = 2 | |
460 , number_sections = FALSE | |
461 ) | |
297 ) | 462 ) |