Mercurial > repos > galaxyp > mqppep_preproc
comparison mqppep_anova.R @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:54 +0000 |
parents | |
children | b76c75521d91 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8dfd5d2b5903 |
---|---|
1 #!/usr/bin/env Rscript | |
2 # libraries | |
3 library(optparse) | |
4 library(data.table) | |
5 library(stringr) | |
6 | |
7 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | |
8 | |
9 # parse options | |
# Command-line options accepted by this script (parsed with optparse).
# NOTE(review): later in this script the values of --firstDataColumn,
#   --regexSampleNames, and --regexSampleGrouping are passed to readChar()
#   as file names, although their defaults here look like literal regular
#   expressions - confirm which is intended.
option_list <- list(
  make_option(
    c("-i", "--inputFile"),
    action = "store",
    default = NA,
    type = "character",
    help = "Phosphopeptide Intensities sparse input file path"
  ),
  make_option(
    c("-a", "--alphaFile"),
    action = "store",
    default = NA,
    type = "character",
    help = paste0("List of alpha cutoff values for significance testing;",
             " path to text file having one column and no header")
  ),
  make_option(
    c("-S", "--preproc_sqlite"),
    action = "store",
    default = NA,
    type = "character",
    help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
  ),
  make_option(
    c("-K", "--ksea_sqlite"),
    action = "store",
    default = NA,
    type = "character",
    help = "Path to 'ksea_sqlite' output produced by this tool"
  ),
  make_option(
    c("-f", "--firstDataColumn"),
    action = "store",
    default = "^Intensity[^_]",
    type = "character",
    help = "First column of intensity values"
  ),
  make_option(
    c("-m", "--imputationMethod"),
    action = "store",
    default = "random",
    type = "character",
    help = paste0("Method for missing-value imputation,",
             " one of c('group-median','median','mean','random')")
  ),
  make_option(
    c("-p", "--meanPercentile"),
    action = "store",
    default = 3,
    type = "integer",
    help = paste0("Mean percentile for randomly generated imputed values;",
             " range [1,99]")
  ),
  make_option(
    c("-d", "--sdPercentile"),
    action = "store",
    default = 3,
    type = "double",
    help = paste0("Adjustment value for standard deviation of",
             " randomly generated imputed values; real")
  ),
  make_option(
    c("-s", "--regexSampleNames"),
    action = "store",
    default = "\\.(\\d+)[A-Z]$",
    type = "character",
    help = "Regular expression extracting sample-names"
  ),
  make_option(
    c("-g", "--regexSampleGrouping"),
    action = "store",
    default = "(\\d+)",
    type = "character",
    help = paste0("Regular expression extracting sample-group",
             " from an extracted sample-name")
  ),
  make_option(
    c("-o", "--imputedDataFile"),
    action = "store",
    default = "output_imputed.tsv",
    type = "character",
    help = "Imputed Phosphopeptide Intensities output file path"
  ),
  make_option(
    c("-n", "--imputedQNLTDataFile"),
    action = "store",
    default = "output_imp_qn_lt.tsv",
    type = "character",
    help =
      paste(
        "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide",
        "Intensities output file path"
      )
  ),
  make_option(
    c("-r", "--reportFile"),
    action = "store",
    default = "QuantDataProcessingScript.html",
    type = "character",
    help = "HTML report file path"
  ),
  make_option(
    c("-k", "--ksea_cutoff_statistic"),
    action = "store",
    default = "FDR",
    type = "character",
    # this text previously described missing-value imputation (copy-paste)
    help = paste0("Statistic used as the KSEA significance cutoff,",
             " one of c('FDR','p.value'),",
             " but don't expect 'p.value' to work well.")
  ),
  make_option(
    c("-t", "--ksea_cutoff_threshold"),
    action = "store",
    default = 0.05,
    type = "double",
    help = paste0("Maximum score to be used to score",
             " a kinase enrichment as significant")
  ),
  make_option(
    c("-M", "--anova_ksea_metadata"),
    action = "store",
    default = "anova_ksea_metadata.tsv",
    type = "character",
    help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enrichments"
  )
)
# Parse the command line and echo the parsed values to the log.
args <- parse_args(OptionParser(option_list = option_list))
print("args is:")
# str() prints its report itself, so no cat() wrapper is needed
str(args)

# Check parameter values

# --inputFile defaults to NA; test for that before asking the file system
if (is.na(args$inputFile) || !file.exists(args$inputFile)) {
  stop(paste("Input file", args$inputFile, "does not exist"))
}
input_file <- args$inputFile
alpha_file <- args$alphaFile
preproc_sqlite <- args$preproc_sqlite
imputed_data_file_name <- args$imputedDataFile
imp_qn_lt_data_filenm <- args$imputedQNLTDataFile
anova_ksea_metadata <- args$anova_ksea_metadata
report_file_name <- args$reportFile
ksea_sqlite <- args$ksea_sqlite
ksea_cutoff_statistic <- args$ksea_cutoff_statistic
ksea_cutoff_threshold <- args$ksea_cutoff_threshold

# Require an exact match against the permitted values.  (Using the argument
# as a grepl() pattern would accept substrings and regular expressions such
# as "F" or ".".)  Abort with stop(): return() is not valid outside of a
# function.
if (!(ksea_cutoff_statistic %in% c("FDR", "p.value"))) {
  stop(
    sprintf("bad ksea_cutoff_statistic argument: %s", ksea_cutoff_statistic)
  )
}

imputation_method <- args$imputationMethod
if (!(imputation_method %in% c("group-median", "median", "mean", "random"))) {
  stop(sprintf("bad imputationMethod argument: %s", imputation_method))
}

# read with default values, when applicable
mean_percentile <- args$meanPercentile
sd_percentile <- args$sdPercentile
# These two values are reported only for the "random" imputation method.
# NOTE(review): presumably the client script ignores them for the other
#   methods - confirm against mqppep_anova_script.Rmd.
if (imputation_method == "random") {
  print("mean_percentile is:")
  str(mean_percentile)

  print("sd_percentile is:")
  str(sd_percentile)
}
189 | |
190 # convert string parameters that are passed in via config files: | |
191 # - firstDataColumn | |
192 # - regexSampleNames | |
193 # - regexSampleGrouping | |
# Read a short configuration string from a file and undo Galaxy sanitization.
#
# Args:
#   fname: file to read; passed as the first argument of readChar()
#   limit: maximum number of characters to read; passed to readChar()
# Returns:
#   The text read, with leading and trailing spaces, tabs, and newlines
#   removed and each Galaxy escape token (e.g. "__lt__") replaced by the
#   character(s) it stands for.
read_config_file_string <- function(fname, limit) {
  result <- readChar(fname, limit)
  # eliminate any leading whitespace
  result <- gsub("^[ \t\n]*", "", result)
  # eliminate any trailing whitespace
  result <- gsub("[ \t\n]*$", "", result)
  # substitute characters escaped by Galaxy sanitizer;
  #   substitutions are applied sequentially in the order listed here
  galaxy_escapes <- c(
    "__lt__" = "<",
    "__le__" = "<=",
    "__eq__" = "==",
    "__ne__" = "!=",
    "__gt__" = ">",
    "__ge__" = ">=",
    "__sq__" = "'",
    "__dq__" = '"',
    "__ob__" = "[",
    "__cb__" = "]"
  )
  for (token in names(galaxy_escapes)) {
    # tokens are literal text, so match them as fixed strings
    result <- gsub(token, galaxy_escapes[[token]], result, fixed = TRUE)
  }
  # return the value explicitly (and visibly) rather than relying on the
  #   invisible value of a trailing assignment
  result
}
# Log the file given for each config-file-backed parameter
cat(sprintf("first_data_column file: %s\n", args$firstDataColumn))
cat(sprintf("regex_sample_names file: %s\n", args$regexSampleNames))
cat(sprintf("regex_sample_grouping file: %s\n", args$regexSampleGrouping))

# Read at most `nc` characters from each of those files
nc <- 1000
regex_sample_names <-
  read_config_file_string(fname = args$regexSampleNames, limit = nc)
regex_sample_grouping <-
  read_config_file_string(fname = args$regexSampleGrouping, limit = nc)
first_data_column <-
  read_config_file_string(fname = args$firstDataColumn, limit = nc)

# Log the strings that were read
cat(sprintf("first_data_column: %s\n", first_data_column))
cat(sprintf("regex_sample_names: %s\n", regex_sample_names))
cat(sprintf("regex_sample_grouping: %s\n", regex_sample_grouping))
221 | |
222 # from: https://github.com/molgenis/molgenis-pipelines/wiki/ | |
223 # How-to-source-another_file.R-from-within-your-R-script | |
224 # Function location_of_this_script returns the location of this .R script | |
225 # (may be needed to source other files in same dir) | |
# Determine the directory that holds this script.
#
# Returns:
#   - when a call to base::source() is found among the calling frames, the
#     directory of the normalized `ofile` path recorded in that frame;
#   - otherwise, the directory named by the last `--file=` argument among
#     the leading (non-trailing) command-line arguments;
#   - otherwise NULL.
location_of_this_script <- function() {
  sourced_path <- NULL
  # Walk outward through the calling frames looking for source(); when more
  # than one frame matches, the last one visited is kept
  for (frame_offset in - (1:sys.nframe())) {
    if (identical(sys.function(frame_offset), base::source)) {
      sourced_path <- normalizePath(sys.frame(frame_offset)$ofile)
    }
  }
  if (!is.null(sourced_path)) {
    return(dirname(sourced_path))
  }

  # Not sourced: inspect the arguments that precede the trailing
  # (user-supplied) ones
  all_args <- commandArgs(trailingOnly = FALSE)
  leading_count <- length(all_args) - length(commandArgs(trailingOnly = TRUE))
  leading_args <- all_args[seq_len(leading_count)]

  # Keep the value of each --file=... argument; anything else becomes ""
  file_values <- gsub("^(?:--file=(.*)|.*)$", "\\1", leading_args)
  file_values <- file_values[file_values != ""]

  # If multiple --file arguments are given, R uses the last one
  last_file <- tail(file_values, 1)
  if (length(last_file) > 0) {
    return(dirname(last_file))
  }

  # Neither applies (e.g., perhaps running in an R GUI)
  NULL
}
255 | |
# Directory holding this script; the Rmd template is looked up there
script_dir <- location_of_this_script()

# Parameters handed to the R Markdown document
rmarkdown_params <- list(
  inputFile = input_file,
  alphaFile = alpha_file,
  preprocDb = preproc_sqlite,
  firstDataColumn = first_data_column,
  imputationMethod = imputation_method,
  meanPercentile = mean_percentile,
  sdPercentile = sd_percentile,
  regexSampleNames = regex_sample_names,
  regexSampleGrouping = regex_sample_grouping,
  imputedDataFilename = imputed_data_file_name,
  imputedQNLTDataFile = imp_qn_lt_data_filenm,
  anovaKseaMetadata = anova_ksea_metadata,
  kseaAppPrepDb = ksea_sqlite,
  kseaCutoffThreshold = ksea_cutoff_threshold,
  kseaCutoffStatistic = ksea_cutoff_statistic
)

print("rmarkdown_params")
str(rmarkdown_params)

# Seed the random number generator with a fixed value so that results
# are reproducible from run to run
set.seed(28571)

# BUG (or "opportunity")
# To render as PDF for the time being requires installing the conda
# package `r-texlive` until this issue in `texlive-core` is resolved:
#   https://github.com/conda-forge/texlive-core-feedstock/issues/19
# This workaround is detailed in the fourth comment of:
#   https://github.com/conda-forge/texlive-core-feedstock/issues/61

library(tinytex)
tinytex::install_tinytex()

# Render the report as a PDF (with table of contents) to `report_file_name`
rmarkdown::render(
  input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/"),
  output_format = rmarkdown::pdf_document(toc = TRUE),
  output_file = report_file_name,
  params = rmarkdown_params
)