comparison wrapper_intensity_check.R @ 0:860907bf0e0f draft default tip

planemo upload for repository https://github.com/your_org_or_user/tools-metabolomics commit 3dd90b60510fa72eb8351464ec5d5883b0b6222a
author workflow4metabolomics
date Fri, 04 Jul 2025 16:31:19 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:860907bf0e0f
1 suppressPackageStartupMessages(library(argparse))
2
3 check.err <- function(err.stock) {
4 # err.stock = vector of results returned by check functions
5 if (length(err.stock) != 0) {
6 stop("\n- - - - - - - - -\n", err.stock, "\n- - - - - - - - -\n")
7 }
8 }
9
10
11 # To check if the 3 standard tables match regarding identifiers
12 table_match <- function(dataMatrix, sampleMetadata, variableMetadata) {
13 err.stock <- character()
14
15 # Sample identifiers
16 dm_samples <- colnames(dataMatrix)[-1]
17 sm_samples <- as.character(sampleMetadata[[1]])
18 # Variable identifiers
19 dm_vars <- as.character(dataMatrix[[1]])
20 vm_vars <- as.character(variableMetadata[[1]])
21
22 # Check sample IDs
23 missing_in_sm <- setdiff(dm_samples, sm_samples)
24 missing_in_dm <- setdiff(sm_samples, dm_samples)
25 if (length(missing_in_sm) > 0 || length(missing_in_dm) > 0) {
26 err.stock <- c(
27 err.stock,
28 "\nData matrix and sample metadata do not match regarding sample identifiers."
29 )
30 if (length(missing_in_sm) > 0) {
31 prefix <- if (length(missing_in_sm) < 4) {
32 "\n The "
33 } else {
34 "\n For example, the "
35 }
36 err.stock <- c(
37 err.stock,
38 prefix,
39 "following identifiers found in the data matrix\n",
40 " do not appear in the sample metadata file:\n",
41 " ",
42 paste(head(missing_in_sm, 3), collapse = "\n "),
43 "\n"
44 )
45 }
46 if (length(missing_in_dm) > 0) {
47 prefix <- if (length(missing_in_dm) < 4) {
48 "\n The "
49 } else {
50 "\n For example, the "
51 }
52 err.stock <- c(
53 err.stock,
54 prefix,
55 "following identifiers found in the sample metadata file\n",
56 " do not appear in the data matrix:\n",
57 " ",
58 paste(head(missing_in_dm, 3), collapse = "\n "),
59 "\n"
60 )
61 }
62 }
63
64 # Check variable IDs
65 missing_in_vm <- setdiff(dm_vars, vm_vars)
66 missing_in_dm_var <- setdiff(vm_vars, dm_vars)
67 if (length(missing_in_vm) > 0 || length(missing_in_dm_var) > 0) {
68 err.stock <- c(
69 err.stock,
70 "\nData matrix and variable metadata do not match regarding variable identifiers."
71 )
72 if (length(missing_in_vm) > 0) {
73 prefix <- if (length(missing_in_vm) < 4) {
74 "\n The "
75 } else {
76 "\n For example, the "
77 }
78 err.stock <- c(
79 err.stock,
80 prefix,
81 "following identifiers found in the data matrix\n",
82 " do not appear in the variable metadata file:\n",
83 " ",
84 paste(head(missing_in_vm, 3), collapse = "\n "),
85 "\n"
86 )
87 }
88 if (length(missing_in_dm_var) > 0) {
89 prefix <- if (length(missing_in_dm_var) < 4) {
90 "\n The "
91 } else {
92 "\n For example, the "
93 }
94 err.stock <- c(
95 err.stock,
96 prefix,
97 "following identifiers found in the variable metadata file\n",
98 " do not appear in the data matrix:\n",
99 " ",
100 paste(head(missing_in_dm_var, 3), collapse = "\n "),
101 "\n"
102 )
103 }
104 }
105
106 if (length(err.stock) > 0) {
107 err.stock <- c(err.stock, "\nPlease check your data.\n")
108 }
109
110 return(err.stock)
111 }
112
113
114 intens_check <- function(
115 DM.name,
116 SM.name,
117 VM.name,
118 method,
119 chosen.stat,
120 class.col,
121 test.fold,
122 class1,
123 fold.frac,
124 logarithm,
125 VM.output,
126 graphs.output) {
127 # Read input tables
128 DM <- read.table(DM.name, header = TRUE, sep = "\t", check.names = FALSE)
129 SM <- read.table(SM.name, header = TRUE, sep = "\t", check.names = FALSE)
130 VM <- read.table(VM.name, header = TRUE, sep = "\t", check.names = FALSE)
131
132 # Table match check
133 table.check <- table_match(DM, SM, VM)
134 check.err(table.check)
135
136 # Transpose and align DM
137 rownames(DM) <- DM[, 1]
138 DM <- data.frame(t(DM[, -1]))
139 DM <- merge(x = cbind(1:nrow(SM), SM), y = DM, by.x = 2, by.y = 0)
140 DM <- DM[order(DM[, 2]), ]
141 rownames(DM) <- DM[, 1]
142 DM <- DM[, -c(1:(ncol(SM) + 1))]
143
144 stat.list <- strsplit(chosen.stat, ",")[[1]]
145
146 # Class assignment
147 if (method == "no_class") {
148 c_class <- rep("global", nrow(DM))
149 classnames <- "global"
150 nb_class <- 1
151 test.fold <- "No"
152 } else {
153 class.col <- colnames(SM)[as.numeric(class.col)]
154 if (!(class.col %in% colnames(SM))) {
155 stop("The column ", class.col, " is not in sample metadata")
156 }
157 c_class <- as.factor(SM[, class.col])
158 nb_class <- nlevels(c_class)
159 classnames <- levels(c_class)
160 if (nb_class < 2 && test.fold == "Yes") {
161 cat(
162 "The '",
163 class.col,
164 "' column contains only one class, fold calculation could not be executed.\n"
165 )
166 }
167 if (nb_class > (nrow(SM)) / 3 && method == "each_class") {
168 cat("There are too many classes, consider reducing the number.\n")
169 }
170 if (method == "one_class") {
171 if (!(class1 %in% classnames)) {
172 stop("The '", class1, "' class does not appear in the '", class.col, "'column.")
173 }
174 c_class <- factor(
175 ifelse(c_class == class1, class1, "Other"),
176 levels = c(class1, "Other")
177 )
178 nb_class <- nlevels(c_class)
179 classnames <- levels(c_class)
180 }
181 }
182
183 # Check numeric
184 if (!is.numeric(as.matrix(DM))) {
185 findchar <- function(myval) {
186 ifelse(
187 is.na(myval),
188 "ok",
189 ifelse(is.na(as.numeric(as.character(myval))), "char", "ok")
190 )
191 }
192 chardiag <- suppressWarnings(apply(DM, 2, vapply, findchar, "character"))
193 charlist <- which(chardiag == "char")
194 err.stock <- paste(
195 "\n- - - - - - - - -\nYour dataMatrix contains",
196 length(charlist),
197 "non-numeric value(s).\n"
198 )
199 stop(c(
200 err.stock,
201 "The dataMatrix file is supposed to contain only numeric values.\n- - - - - - - - -\n"
202 ))
203 }
204
205 DM <- cbind(c_class, DM)
206 stat.res <- t(DM[0, -1, drop = FALSE])
207 names <- NULL
208
209 mean.res <- sd.res <- med.res <- quart.res <- dec.res <- NA.res <- pct_NA.res <- fold.res <- NULL
210 mean.names <- sd.names <- med.names <- quart.names <- dec.names <- NA.names <- pct_NA.names <- fold.names <- NULL
211 graphs <- if (("NA" %in% stat.list) || (test.fold == "Yes")) 1 else 0
212 data_bp <- data.frame()
213
214 for (j in 1:nb_class) {
215 idx <- which(DM$c_class == classnames[j])
216 # Mean
217 if ("mean" %in% stat.list) {
218 mean.res <- cbind(mean.res, colMeans(DM[idx, -1], na.rm = TRUE))
219 mean.names <- cbind(mean.names, paste("Mean", classnames[j], sep = "_"))
220 if (j == nb_class) {
221 stat.res <- cbind(stat.res, mean.res)
222 names <- cbind(names, mean.names)
223 }
224 }
225 # SD
226 if ("sd" %in% stat.list) {
227 sd.res <- cbind(sd.res, apply(DM[idx, -1], 2, sd, na.rm = TRUE))
228 sd.names <- cbind(sd.names, paste("Sd", classnames[j], sep = "_"))
229 if (j == nb_class) {
230 stat.res <- cbind(stat.res, sd.res)
231 names <- cbind(names, sd.names)
232 }
233 }
234 # Median
235 if (("median" %in% stat.list) && (!("quartile" %in% stat.list))) {
236 med.res <- cbind(med.res, apply(DM[idx, -1], 2, median, na.rm = TRUE))
237 med.names <- cbind(med.names, paste("Median", classnames[j], sep = "_"))
238 if (j == nb_class) {
239 stat.res <- cbind(stat.res, med.res)
240 names <- cbind(names, med.names)
241 }
242 }
243 # Quartiles
244 if ("quartile" %in% stat.list) {
245 quart.res <- cbind(
246 quart.res,
247 t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE))
248 )
249 quart.names <- cbind(
250 quart.names,
251 paste("Min", classnames[j], sep = "_"),
252 paste("Q1", classnames[j], sep = "_"),
253 paste("Median", classnames[j], sep = "_"),
254 paste("Q3", classnames[j], sep = "_"),
255 paste("Max", classnames[j], sep = "_")
256 )
257 if (j == nb_class) {
258 stat.res <- cbind(stat.res, quart.res)
259 names <- cbind(names, quart.names)
260 }
261 }
262 # Deciles
263 if ("decile" %in% stat.list) {
264 dec.res <- cbind(
265 dec.res,
266 t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE, seq(0, 1, 0.1)))
267 )
268 dec.names <- cbind(
269 dec.names,
270 t(matrix(paste("D", seq(0, 10, 1), sep = "_", classnames[j])))
271 )
272 if (j == nb_class) {
273 stat.res <- cbind(stat.res, dec.res)
274 names <- cbind(names, dec.names)
275 }
276 }
277 # Missing values
278 if ("NA" %in% stat.list) {
279 nb_NA <- apply(DM[idx, -1], 2, function(x) sum(is.na(x)))
280 pct_NA <- round(nb_NA / nrow(DM[idx, -1]) * 100, 4)
281 NA.res <- cbind(NA.res, nb_NA)
282 pct_NA.res <- cbind(pct_NA.res, pct_NA)
283 NA.names <- cbind(NA.names, paste("NA", classnames[j], sep = "_"))
284 pct_NA.names <- cbind(
285 pct_NA.names,
286 paste("Pct_NA", classnames[j], sep = "_")
287 )
288 if (j == nb_class) {
289 stat.res <- cbind(stat.res, NA.res, pct_NA.res)
290 names <- cbind(names, NA.names, pct_NA.names)
291 }
292 # Barplot data
293 bins <- cut(pct_NA, breaks = c(-1, 20, 40, 60, 80, 100), labels = FALSE)
294 for (b in 1:5) data_bp[b, j] <- sum(bins == b)
295 rownames(data_bp) <- c(
296 "0%-20%",
297 "20%-40%",
298 "40%-60%",
299 "60%-80%",
300 "80%-100%"
301 )
302 if (j == nb_class) {
303 if (sum(NA.res) == 0) cat("Data Matrix contains no NA.\n")
304 colnames(data_bp) <- classnames
305 data_bp <- as.matrix(data_bp)
306 }
307 }
308 # Mean fold change
309 if (test.fold == "Yes" && nb_class >= 2 && j != nb_class) {
310 if (method == "each_class") fold.frac <- "Top"
311 for (k in (j + 1):nb_class) {
312 ratio1 <- if (fold.frac == "Bottom") classnames[k] else classnames[j]
313 ratio2 <- if (fold.frac == "Bottom") classnames[j] else classnames[k]
314 fold <- colMeans(DM[which(DM$c_class == ratio1), -1], na.rm = TRUE) /
315 colMeans(DM[which(DM$c_class == ratio2), -1], na.rm = TRUE)
316 if (logarithm == "log2") {
317 fold <- log2(fold)
318 } else if (
319 logarithm == "log10"
320 ) {
321 fold <- log10(fold)
322 }
323 fold.res <- cbind(fold.res, fold)
324 fname <- if (logarithm == "none") {
325 paste("fold", ratio1, "VS", ratio2, sep = "_")
326 } else {
327 paste(logarithm, "fold", ratio1, "VS", ratio2, sep = "_")
328 }
329 fold.names <- cbind(fold.names, fname)
330 }
331 if (j == nb_class) {
332 stat.res <- cbind(stat.res, fold.res)
333 names <- cbind(names, fold.names)
334 }
335 }
336 }
337
338 # Ensure unique column names
339 VM.names <- colnames(VM)
340 for (i in seq_along(VM.names)) {
341 for (j in seq_along(names)) {
342 if (VM.names[i] == names[j]) names[j] <- paste(names[j], "2", sep = "_")
343 }
344 }
345 colnames(stat.res) <- names
346
347 # Output
348 VM <- cbind(VM, stat.res)
349 write.table(VM, VM.output, sep = "\t", quote = FALSE, row.names = FALSE)
350
351 # Graphics
352 pdf(graphs.output)
353 if (graphs == 1) {
354 if ("NA" %in% stat.list) {
355 graph.colors <- c("green3", "palegreen3", "lightblue", "orangered", "red")
356 par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
357 bp <- barplot(
358 data_bp,
359 col = graph.colors,
360 main = "Proportion of NA",
361 xlab = "Classes",
362 ylab = "Variables"
363 )
364 legend(
365 "topright",
366 fill = graph.colors,
367 rownames(data_bp),
368 inset = c(-0.3, 0)
369 )
370 stock <- 0
371 for (i in 1:nrow(data_bp)) {
372 text(
373 bp,
374 stock + data_bp[i, ] / 2,
375 data_bp[i, ],
376 col = "white",
377 cex = 0.7
378 )
379 stock <- stock + data_bp[i, ]
380 }
381 }
382 if ((test.fold == "Yes") && (nb_class >= 2)) {
383 clean_fold <- fold.res
384 clean_fold[is.infinite(clean_fold)] <- NA
385 for (j in 1:ncol(clean_fold)) {
386 boxplot(clean_fold[, j], main = fold.names[j])
387 }
388 }
389 } else {
390 plot.new()
391 legend("center", "You did not select any option with graphical output.")
392 }
393 dev.off()
394 }
395
396
397 parser <- ArgumentParser(description = "Intensity Check Tool")
398
399 parser$add_argument(
400 "--dataMatrix_in",
401 required = TRUE,
402 help = "Input data matrix file"
403 )
404 parser$add_argument(
405 "--sampleMetadata_in",
406 required = TRUE,
407 help = "Input sample metadata file"
408 )
409 parser$add_argument(
410 "--variableMetadata_in",
411 required = TRUE,
412 help = "Input variable metadata file"
413 )
414 parser$add_argument("--method", required = TRUE, help = "Computation method")
415 parser$add_argument(
416 "--chosen_stat",
417 required = TRUE,
418 help = "Statistics to compute"
419 )
420 parser$add_argument(
421 "--class_col",
422 default = NULL,
423 help = "Class column (optional)"
424 )
425 parser$add_argument(
426 "--test_fold",
427 default = NULL,
428 help = "Test fold (optional)"
429 )
430 parser$add_argument("--class1", default = NULL, help = "Class1 (optional)")
431 parser$add_argument(
432 "--fold_frac",
433 default = NULL,
434 help = "Fold fraction (optional)"
435 )
436 parser$add_argument(
437 "--logarithm",
438 default = NULL,
439 help = "Logarithm (optional)"
440 )
441 parser$add_argument(
442 "--variableMetadata_out",
443 required = TRUE,
444 help = "Output variable metadata file"
445 )
446 parser$add_argument(
447 "--graphs_out",
448 required = TRUE,
449 help = "Output graphs file"
450 )
451
452 args <- parser$parse_args()
453
454 print(args)
455
456 if (length(args) < 7) {
457 stop("NOT enough argument !!!")
458 }
459
460 cat(
461 "\nJob starting time:\n",
462 format(Sys.time(), "%a %d %b %Y %X"),
463 "\n\n--------------------------------------------------------------------",
464 "\nIntensity Check parameters:\n\n"
465 )
466 print(args)
467 cat("--------------------------------------------------------------------\n\n")
468
469 class_col <- NULL
470 test_fold <- NULL
471 class1 <- NULL
472 fold_frac <- NULL
473 logarithm <- NULL
474 if (args$method == "each_class") {
475 class_col <- args$class_col
476 test_fold <- args$test_fold
477 if (args$test_fold == "Yes") {
478 logarithm <- args$logarithm
479 }
480 }
481 if (args$method == "one_class") {
482 class_col <- args$class_col
483 class1 <- args$class1
484 test_fold <- args$test_fold
485 if (args$test_fold == "Yes") {
486 fold_frac <- args$fold_frac
487 logarithm <- args$logarithm
488 }
489 }
490
491 err_no_option <- NULL
492
493 if (
494 ((args$method == "no_class") && (args$chosen_stat == "None")) ||
495 ((args$method != "no_class") &&
496 (args$chosen_stat == "None") &&
497 (test_fold == "No"))
498 ) {
499 err_no_option <- "You did not select any computational option. Program can not be executed."
500 stop("\n- - - - - - - - -\n", err_no_option, "\n- - - - - - - - -\n")
501 }
502
503
504 if (is.null(err_no_option)) {
505 intens_check(
506 args$dataMatrix_in,
507 args$sampleMetadata_in,
508 args$variableMetadata_in,
509 args$method,
510 args$chosen_stat,
511 class_col,
512 test_fold,
513 class1,
514 fold_frac,
515 logarithm,
516 args$variableMetadata_out,
517 args$graphs_out
518 )
519 }
520
521
522 cat(
523 "\n--------------------------------------------------------------------",
524 "\nInformation about R (version, Operating System, attached or loaded packages):\n\n"
525 )
526 sessionInfo()
527 cat(
528 "--------------------------------------------------------------------\n",
529 "\nJob ending time:\n",
530 format(Sys.time(), "%a %d %b %Y %X")
531 )
532
533
534 # delete the parameters to avoid the passage to the next tool in .RData image
535 rm(args)