Mercurial > repos > workflow4metabolomics > intensity_check
comparison wrapper_intensity_check.R @ 0:860907bf0e0f draft default tip
planemo upload for repository https://github.com/your_org_or_user/tools-metabolomics commit 3dd90b60510fa72eb8351464ec5d5883b0b6222a
author | workflow4metabolomics |
---|---|
date | Fri, 04 Jul 2025 16:31:19 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:860907bf0e0f |
---|---|
1 suppressPackageStartupMessages(library(argparse)) | |
2 | |
3 check.err <- function(err.stock) { | |
4 # err.stock = vector of results returned by check functions | |
5 if (length(err.stock) != 0) { | |
6 stop("\n- - - - - - - - -\n", err.stock, "\n- - - - - - - - -\n") | |
7 } | |
8 } | |
9 | |
10 | |
11 # To check if the 3 standard tables match regarding identifiers | |
12 table_match <- function(dataMatrix, sampleMetadata, variableMetadata) { | |
13 err.stock <- character() | |
14 | |
15 # Sample identifiers | |
16 dm_samples <- colnames(dataMatrix)[-1] | |
17 sm_samples <- as.character(sampleMetadata[[1]]) | |
18 # Variable identifiers | |
19 dm_vars <- as.character(dataMatrix[[1]]) | |
20 vm_vars <- as.character(variableMetadata[[1]]) | |
21 | |
22 # Check sample IDs | |
23 missing_in_sm <- setdiff(dm_samples, sm_samples) | |
24 missing_in_dm <- setdiff(sm_samples, dm_samples) | |
25 if (length(missing_in_sm) > 0 || length(missing_in_dm) > 0) { | |
26 err.stock <- c( | |
27 err.stock, | |
28 "\nData matrix and sample metadata do not match regarding sample identifiers." | |
29 ) | |
30 if (length(missing_in_sm) > 0) { | |
31 prefix <- if (length(missing_in_sm) < 4) { | |
32 "\n The " | |
33 } else { | |
34 "\n For example, the " | |
35 } | |
36 err.stock <- c( | |
37 err.stock, | |
38 prefix, | |
39 "following identifiers found in the data matrix\n", | |
40 " do not appear in the sample metadata file:\n", | |
41 " ", | |
42 paste(head(missing_in_sm, 3), collapse = "\n "), | |
43 "\n" | |
44 ) | |
45 } | |
46 if (length(missing_in_dm) > 0) { | |
47 prefix <- if (length(missing_in_dm) < 4) { | |
48 "\n The " | |
49 } else { | |
50 "\n For example, the " | |
51 } | |
52 err.stock <- c( | |
53 err.stock, | |
54 prefix, | |
55 "following identifiers found in the sample metadata file\n", | |
56 " do not appear in the data matrix:\n", | |
57 " ", | |
58 paste(head(missing_in_dm, 3), collapse = "\n "), | |
59 "\n" | |
60 ) | |
61 } | |
62 } | |
63 | |
64 # Check variable IDs | |
65 missing_in_vm <- setdiff(dm_vars, vm_vars) | |
66 missing_in_dm_var <- setdiff(vm_vars, dm_vars) | |
67 if (length(missing_in_vm) > 0 || length(missing_in_dm_var) > 0) { | |
68 err.stock <- c( | |
69 err.stock, | |
70 "\nData matrix and variable metadata do not match regarding variable identifiers." | |
71 ) | |
72 if (length(missing_in_vm) > 0) { | |
73 prefix <- if (length(missing_in_vm) < 4) { | |
74 "\n The " | |
75 } else { | |
76 "\n For example, the " | |
77 } | |
78 err.stock <- c( | |
79 err.stock, | |
80 prefix, | |
81 "following identifiers found in the data matrix\n", | |
82 " do not appear in the variable metadata file:\n", | |
83 " ", | |
84 paste(head(missing_in_vm, 3), collapse = "\n "), | |
85 "\n" | |
86 ) | |
87 } | |
88 if (length(missing_in_dm_var) > 0) { | |
89 prefix <- if (length(missing_in_dm_var) < 4) { | |
90 "\n The " | |
91 } else { | |
92 "\n For example, the " | |
93 } | |
94 err.stock <- c( | |
95 err.stock, | |
96 prefix, | |
97 "following identifiers found in the variable metadata file\n", | |
98 " do not appear in the data matrix:\n", | |
99 " ", | |
100 paste(head(missing_in_dm_var, 3), collapse = "\n "), | |
101 "\n" | |
102 ) | |
103 } | |
104 } | |
105 | |
106 if (length(err.stock) > 0) { | |
107 err.stock <- c(err.stock, "\nPlease check your data.\n") | |
108 } | |
109 | |
110 return(err.stock) | |
111 } | |
112 | |
113 | |
114 intens_check <- function( | |
115 DM.name, | |
116 SM.name, | |
117 VM.name, | |
118 method, | |
119 chosen.stat, | |
120 class.col, | |
121 test.fold, | |
122 class1, | |
123 fold.frac, | |
124 logarithm, | |
125 VM.output, | |
126 graphs.output) { | |
127 # Read input tables | |
128 DM <- read.table(DM.name, header = TRUE, sep = "\t", check.names = FALSE) | |
129 SM <- read.table(SM.name, header = TRUE, sep = "\t", check.names = FALSE) | |
130 VM <- read.table(VM.name, header = TRUE, sep = "\t", check.names = FALSE) | |
131 | |
132 # Table match check | |
133 table.check <- table_match(DM, SM, VM) | |
134 check.err(table.check) | |
135 | |
136 # Transpose and align DM | |
137 rownames(DM) <- DM[, 1] | |
138 DM <- data.frame(t(DM[, -1])) | |
139 DM <- merge(x = cbind(1:nrow(SM), SM), y = DM, by.x = 2, by.y = 0) | |
140 DM <- DM[order(DM[, 2]), ] | |
141 rownames(DM) <- DM[, 1] | |
142 DM <- DM[, -c(1:(ncol(SM) + 1))] | |
143 | |
144 stat.list <- strsplit(chosen.stat, ",")[[1]] | |
145 | |
146 # Class assignment | |
147 if (method == "no_class") { | |
148 c_class <- rep("global", nrow(DM)) | |
149 classnames <- "global" | |
150 nb_class <- 1 | |
151 test.fold <- "No" | |
152 } else { | |
153 class.col <- colnames(SM)[as.numeric(class.col)] | |
154 if (!(class.col %in% colnames(SM))) { | |
155 stop("The column ", class.col, " is not in sample metadata") | |
156 } | |
157 c_class <- as.factor(SM[, class.col]) | |
158 nb_class <- nlevels(c_class) | |
159 classnames <- levels(c_class) | |
160 if (nb_class < 2 && test.fold == "Yes") { | |
161 cat( | |
162 "The '", | |
163 class.col, | |
164 "' column contains only one class, fold calculation could not be executed.\n" | |
165 ) | |
166 } | |
167 if (nb_class > (nrow(SM)) / 3 && method == "each_class") { | |
168 cat("There are too many classes, consider reducing the number.\n") | |
169 } | |
170 if (method == "one_class") { | |
171 if (!(class1 %in% classnames)) { | |
172 stop("The '", class1, "' class does not appear in the '", class.col, "'column.") | |
173 } | |
174 c_class <- factor( | |
175 ifelse(c_class == class1, class1, "Other"), | |
176 levels = c(class1, "Other") | |
177 ) | |
178 nb_class <- nlevels(c_class) | |
179 classnames <- levels(c_class) | |
180 } | |
181 } | |
182 | |
183 # Check numeric | |
184 if (!is.numeric(as.matrix(DM))) { | |
185 findchar <- function(myval) { | |
186 ifelse( | |
187 is.na(myval), | |
188 "ok", | |
189 ifelse(is.na(as.numeric(as.character(myval))), "char", "ok") | |
190 ) | |
191 } | |
192 chardiag <- suppressWarnings(apply(DM, 2, vapply, findchar, "character")) | |
193 charlist <- which(chardiag == "char") | |
194 err.stock <- paste( | |
195 "\n- - - - - - - - -\nYour dataMatrix contains", | |
196 length(charlist), | |
197 "non-numeric value(s).\n" | |
198 ) | |
199 stop(c( | |
200 err.stock, | |
201 "The dataMatrix file is supposed to contain only numeric values.\n- - - - - - - - -\n" | |
202 )) | |
203 } | |
204 | |
205 DM <- cbind(c_class, DM) | |
206 stat.res <- t(DM[0, -1, drop = FALSE]) | |
207 names <- NULL | |
208 | |
209 mean.res <- sd.res <- med.res <- quart.res <- dec.res <- NA.res <- pct_NA.res <- fold.res <- NULL | |
210 mean.names <- sd.names <- med.names <- quart.names <- dec.names <- NA.names <- pct_NA.names <- fold.names <- NULL | |
211 graphs <- if (("NA" %in% stat.list) || (test.fold == "Yes")) 1 else 0 | |
212 data_bp <- data.frame() | |
213 | |
214 for (j in 1:nb_class) { | |
215 idx <- which(DM$c_class == classnames[j]) | |
216 # Mean | |
217 if ("mean" %in% stat.list) { | |
218 mean.res <- cbind(mean.res, colMeans(DM[idx, -1], na.rm = TRUE)) | |
219 mean.names <- cbind(mean.names, paste("Mean", classnames[j], sep = "_")) | |
220 if (j == nb_class) { | |
221 stat.res <- cbind(stat.res, mean.res) | |
222 names <- cbind(names, mean.names) | |
223 } | |
224 } | |
225 # SD | |
226 if ("sd" %in% stat.list) { | |
227 sd.res <- cbind(sd.res, apply(DM[idx, -1], 2, sd, na.rm = TRUE)) | |
228 sd.names <- cbind(sd.names, paste("Sd", classnames[j], sep = "_")) | |
229 if (j == nb_class) { | |
230 stat.res <- cbind(stat.res, sd.res) | |
231 names <- cbind(names, sd.names) | |
232 } | |
233 } | |
234 # Median | |
235 if (("median" %in% stat.list) && (!("quartile" %in% stat.list))) { | |
236 med.res <- cbind(med.res, apply(DM[idx, -1], 2, median, na.rm = TRUE)) | |
237 med.names <- cbind(med.names, paste("Median", classnames[j], sep = "_")) | |
238 if (j == nb_class) { | |
239 stat.res <- cbind(stat.res, med.res) | |
240 names <- cbind(names, med.names) | |
241 } | |
242 } | |
243 # Quartiles | |
244 if ("quartile" %in% stat.list) { | |
245 quart.res <- cbind( | |
246 quart.res, | |
247 t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE)) | |
248 ) | |
249 quart.names <- cbind( | |
250 quart.names, | |
251 paste("Min", classnames[j], sep = "_"), | |
252 paste("Q1", classnames[j], sep = "_"), | |
253 paste("Median", classnames[j], sep = "_"), | |
254 paste("Q3", classnames[j], sep = "_"), | |
255 paste("Max", classnames[j], sep = "_") | |
256 ) | |
257 if (j == nb_class) { | |
258 stat.res <- cbind(stat.res, quart.res) | |
259 names <- cbind(names, quart.names) | |
260 } | |
261 } | |
262 # Deciles | |
263 if ("decile" %in% stat.list) { | |
264 dec.res <- cbind( | |
265 dec.res, | |
266 t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE, seq(0, 1, 0.1))) | |
267 ) | |
268 dec.names <- cbind( | |
269 dec.names, | |
270 t(matrix(paste("D", seq(0, 10, 1), sep = "_", classnames[j]))) | |
271 ) | |
272 if (j == nb_class) { | |
273 stat.res <- cbind(stat.res, dec.res) | |
274 names <- cbind(names, dec.names) | |
275 } | |
276 } | |
277 # Missing values | |
278 if ("NA" %in% stat.list) { | |
279 nb_NA <- apply(DM[idx, -1], 2, function(x) sum(is.na(x))) | |
280 pct_NA <- round(nb_NA / nrow(DM[idx, -1]) * 100, 4) | |
281 NA.res <- cbind(NA.res, nb_NA) | |
282 pct_NA.res <- cbind(pct_NA.res, pct_NA) | |
283 NA.names <- cbind(NA.names, paste("NA", classnames[j], sep = "_")) | |
284 pct_NA.names <- cbind( | |
285 pct_NA.names, | |
286 paste("Pct_NA", classnames[j], sep = "_") | |
287 ) | |
288 if (j == nb_class) { | |
289 stat.res <- cbind(stat.res, NA.res, pct_NA.res) | |
290 names <- cbind(names, NA.names, pct_NA.names) | |
291 } | |
292 # Barplot data | |
293 bins <- cut(pct_NA, breaks = c(-1, 20, 40, 60, 80, 100), labels = FALSE) | |
294 for (b in 1:5) data_bp[b, j] <- sum(bins == b) | |
295 rownames(data_bp) <- c( | |
296 "0%-20%", | |
297 "20%-40%", | |
298 "40%-60%", | |
299 "60%-80%", | |
300 "80%-100%" | |
301 ) | |
302 if (j == nb_class) { | |
303 if (sum(NA.res) == 0) cat("Data Matrix contains no NA.\n") | |
304 colnames(data_bp) <- classnames | |
305 data_bp <- as.matrix(data_bp) | |
306 } | |
307 } | |
308 # Mean fold change | |
309 if (test.fold == "Yes" && nb_class >= 2 && j != nb_class) { | |
310 if (method == "each_class") fold.frac <- "Top" | |
311 for (k in (j + 1):nb_class) { | |
312 ratio1 <- if (fold.frac == "Bottom") classnames[k] else classnames[j] | |
313 ratio2 <- if (fold.frac == "Bottom") classnames[j] else classnames[k] | |
314 fold <- colMeans(DM[which(DM$c_class == ratio1), -1], na.rm = TRUE) / | |
315 colMeans(DM[which(DM$c_class == ratio2), -1], na.rm = TRUE) | |
316 if (logarithm == "log2") { | |
317 fold <- log2(fold) | |
318 } else if ( | |
319 logarithm == "log10" | |
320 ) { | |
321 fold <- log10(fold) | |
322 } | |
323 fold.res <- cbind(fold.res, fold) | |
324 fname <- if (logarithm == "none") { | |
325 paste("fold", ratio1, "VS", ratio2, sep = "_") | |
326 } else { | |
327 paste(logarithm, "fold", ratio1, "VS", ratio2, sep = "_") | |
328 } | |
329 fold.names <- cbind(fold.names, fname) | |
330 } | |
331 if (j == nb_class) { | |
332 stat.res <- cbind(stat.res, fold.res) | |
333 names <- cbind(names, fold.names) | |
334 } | |
335 } | |
336 } | |
337 | |
338 # Ensure unique column names | |
339 VM.names <- colnames(VM) | |
340 for (i in seq_along(VM.names)) { | |
341 for (j in seq_along(names)) { | |
342 if (VM.names[i] == names[j]) names[j] <- paste(names[j], "2", sep = "_") | |
343 } | |
344 } | |
345 colnames(stat.res) <- names | |
346 | |
347 # Output | |
348 VM <- cbind(VM, stat.res) | |
349 write.table(VM, VM.output, sep = "\t", quote = FALSE, row.names = FALSE) | |
350 | |
351 # Graphics | |
352 pdf(graphs.output) | |
353 if (graphs == 1) { | |
354 if ("NA" %in% stat.list) { | |
355 graph.colors <- c("green3", "palegreen3", "lightblue", "orangered", "red") | |
356 par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE) | |
357 bp <- barplot( | |
358 data_bp, | |
359 col = graph.colors, | |
360 main = "Proportion of NA", | |
361 xlab = "Classes", | |
362 ylab = "Variables" | |
363 ) | |
364 legend( | |
365 "topright", | |
366 fill = graph.colors, | |
367 rownames(data_bp), | |
368 inset = c(-0.3, 0) | |
369 ) | |
370 stock <- 0 | |
371 for (i in 1:nrow(data_bp)) { | |
372 text( | |
373 bp, | |
374 stock + data_bp[i, ] / 2, | |
375 data_bp[i, ], | |
376 col = "white", | |
377 cex = 0.7 | |
378 ) | |
379 stock <- stock + data_bp[i, ] | |
380 } | |
381 } | |
382 if ((test.fold == "Yes") && (nb_class >= 2)) { | |
383 clean_fold <- fold.res | |
384 clean_fold[is.infinite(clean_fold)] <- NA | |
385 for (j in 1:ncol(clean_fold)) { | |
386 boxplot(clean_fold[, j], main = fold.names[j]) | |
387 } | |
388 } | |
389 } else { | |
390 plot.new() | |
391 legend("center", "You did not select any option with graphical output.") | |
392 } | |
393 dev.off() | |
394 } | |
395 | |
396 | |
397 parser <- ArgumentParser(description = "Intensity Check Tool") | |
398 | |
399 parser$add_argument( | |
400 "--dataMatrix_in", | |
401 required = TRUE, | |
402 help = "Input data matrix file" | |
403 ) | |
404 parser$add_argument( | |
405 "--sampleMetadata_in", | |
406 required = TRUE, | |
407 help = "Input sample metadata file" | |
408 ) | |
409 parser$add_argument( | |
410 "--variableMetadata_in", | |
411 required = TRUE, | |
412 help = "Input variable metadata file" | |
413 ) | |
414 parser$add_argument("--method", required = TRUE, help = "Computation method") | |
415 parser$add_argument( | |
416 "--chosen_stat", | |
417 required = TRUE, | |
418 help = "Statistics to compute" | |
419 ) | |
420 parser$add_argument( | |
421 "--class_col", | |
422 default = NULL, | |
423 help = "Class column (optional)" | |
424 ) | |
425 parser$add_argument( | |
426 "--test_fold", | |
427 default = NULL, | |
428 help = "Test fold (optional)" | |
429 ) | |
430 parser$add_argument("--class1", default = NULL, help = "Class1 (optional)") | |
431 parser$add_argument( | |
432 "--fold_frac", | |
433 default = NULL, | |
434 help = "Fold fraction (optional)" | |
435 ) | |
436 parser$add_argument( | |
437 "--logarithm", | |
438 default = NULL, | |
439 help = "Logarithm (optional)" | |
440 ) | |
441 parser$add_argument( | |
442 "--variableMetadata_out", | |
443 required = TRUE, | |
444 help = "Output variable metadata file" | |
445 ) | |
446 parser$add_argument( | |
447 "--graphs_out", | |
448 required = TRUE, | |
449 help = "Output graphs file" | |
450 ) | |
451 | |
452 args <- parser$parse_args() | |
453 | |
454 print(args) | |
455 | |
456 if (length(args) < 7) { | |
457 stop("NOT enough argument !!!") | |
458 } | |
459 | |
460 cat( | |
461 "\nJob starting time:\n", | |
462 format(Sys.time(), "%a %d %b %Y %X"), | |
463 "\n\n--------------------------------------------------------------------", | |
464 "\nIntensity Check parameters:\n\n" | |
465 ) | |
466 print(args) | |
467 cat("--------------------------------------------------------------------\n\n") | |
468 | |
469 class_col <- NULL | |
470 test_fold <- NULL | |
471 class1 <- NULL | |
472 fold_frac <- NULL | |
473 logarithm <- NULL | |
474 if (args$method == "each_class") { | |
475 class_col <- args$class_col | |
476 test_fold <- args$test_fold | |
477 if (args$test_fold == "Yes") { | |
478 logarithm <- args$logarithm | |
479 } | |
480 } | |
481 if (args$method == "one_class") { | |
482 class_col <- args$class_col | |
483 class1 <- args$class1 | |
484 test_fold <- args$test_fold | |
485 if (args$test_fold == "Yes") { | |
486 fold_frac <- args$fold_frac | |
487 logarithm <- args$logarithm | |
488 } | |
489 } | |
490 | |
491 err_no_option <- NULL | |
492 | |
493 if ( | |
494 ((args$method == "no_class") && (args$chosen_stat == "None")) || | |
495 ((args$method != "no_class") && | |
496 (args$chosen_stat == "None") && | |
497 (test_fold == "No")) | |
498 ) { | |
499 err_no_option <- "You did not select any computational option. Program can not be executed." | |
500 stop("\n- - - - - - - - -\n", err_no_option, "\n- - - - - - - - -\n") | |
501 } | |
502 | |
503 | |
504 if (is.null(err_no_option)) { | |
505 intens_check( | |
506 args$dataMatrix_in, | |
507 args$sampleMetadata_in, | |
508 args$variableMetadata_in, | |
509 args$method, | |
510 args$chosen_stat, | |
511 class_col, | |
512 test_fold, | |
513 class1, | |
514 fold_frac, | |
515 logarithm, | |
516 args$variableMetadata_out, | |
517 args$graphs_out | |
518 ) | |
519 } | |
520 | |
521 | |
522 cat( | |
523 "\n--------------------------------------------------------------------", | |
524 "\nInformation about R (version, Operating System, attached or loaded packages):\n\n" | |
525 ) | |
526 sessionInfo() | |
527 cat( | |
528 "--------------------------------------------------------------------\n", | |
529 "\nJob ending time:\n", | |
530 format(Sys.time(), "%a %d %b %Y %X") | |
531 ) | |
532 | |
533 | |
534 # delete the parameters to avoid the passage to the next tool in .RData image | |
535 rm(args) |