Mercurial > repos > melpetera > intensity_checks
comparison wrapper_intensity_check.R @ 7:00181e66c50f draft
planemo upload for repository https://github.com/workflow4metabolomics/tools-metabolomics commit 689ab2a3db06eeed6a190f3cfa1618b9d9b7b07b
| author | workflow4metabolomics |
|---|---|
| date | Wed, 09 Jul 2025 15:43:29 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 6:ec75de7f1e08 | 7:00181e66c50f |
|---|---|
| 1 suppressPackageStartupMessages(library(argparse)) | |
| 2 | |
| 3 check.err <- function(err.stock) { | |
| 4 # err.stock = vector of results returned by check functions | |
| 5 if (length(err.stock) != 0) { | |
| 6 stop("\n- - - - - - - - -\n", err.stock, "\n- - - - - - - - -\n") | |
| 7 } | |
| 8 } | |
| 9 | |
| 10 | |
| 11 # To check if the 3 standard tables match regarding identifiers | |
| 12 table_match <- function(dataMatrix, sampleMetadata, variableMetadata) { | |
| 13 err.stock <- character() | |
| 14 | |
| 15 # Sample identifiers | |
| 16 dm_samples <- colnames(dataMatrix)[-1] | |
| 17 sm_samples <- as.character(sampleMetadata[[1]]) | |
| 18 # Variable identifiers | |
| 19 dm_vars <- as.character(dataMatrix[[1]]) | |
| 20 vm_vars <- as.character(variableMetadata[[1]]) | |
| 21 | |
| 22 # Check sample IDs | |
| 23 missing_in_sm <- setdiff(dm_samples, sm_samples) | |
| 24 missing_in_dm <- setdiff(sm_samples, dm_samples) | |
| 25 if (length(missing_in_sm) > 0 || length(missing_in_dm) > 0) { | |
| 26 err.stock <- c( | |
| 27 err.stock, | |
| 28 "\nData matrix and sample metadata do not match regarding sample identifiers." | |
| 29 ) | |
| 30 if (length(missing_in_sm) > 0) { | |
| 31 prefix <- if (length(missing_in_sm) < 4) { | |
| 32 "\n The " | |
| 33 } else { | |
| 34 "\n For example, the " | |
| 35 } | |
| 36 err.stock <- c( | |
| 37 err.stock, | |
| 38 prefix, | |
| 39 "following identifiers found in the data matrix\n", | |
| 40 " do not appear in the sample metadata file:\n", | |
| 41 " ", | |
| 42 paste(head(missing_in_sm, 3), collapse = "\n "), | |
| 43 "\n" | |
| 44 ) | |
| 45 } | |
| 46 if (length(missing_in_dm) > 0) { | |
| 47 prefix <- if (length(missing_in_dm) < 4) { | |
| 48 "\n The " | |
| 49 } else { | |
| 50 "\n For example, the " | |
| 51 } | |
| 52 err.stock <- c( | |
| 53 err.stock, | |
| 54 prefix, | |
| 55 "following identifiers found in the sample metadata file\n", | |
| 56 " do not appear in the data matrix:\n", | |
| 57 " ", | |
| 58 paste(head(missing_in_dm, 3), collapse = "\n "), | |
| 59 "\n" | |
| 60 ) | |
| 61 } | |
| 62 } | |
| 63 | |
| 64 # Check variable IDs | |
| 65 missing_in_vm <- setdiff(dm_vars, vm_vars) | |
| 66 missing_in_dm_var <- setdiff(vm_vars, dm_vars) | |
| 67 if (length(missing_in_vm) > 0 || length(missing_in_dm_var) > 0) { | |
| 68 err.stock <- c( | |
| 69 err.stock, | |
| 70 "\nData matrix and variable metadata do not match regarding variable identifiers." | |
| 71 ) | |
| 72 if (length(missing_in_vm) > 0) { | |
| 73 prefix <- if (length(missing_in_vm) < 4) { | |
| 74 "\n The " | |
| 75 } else { | |
| 76 "\n For example, the " | |
| 77 } | |
| 78 err.stock <- c( | |
| 79 err.stock, | |
| 80 prefix, | |
| 81 "following identifiers found in the data matrix\n", | |
| 82 " do not appear in the variable metadata file:\n", | |
| 83 " ", | |
| 84 paste(head(missing_in_vm, 3), collapse = "\n "), | |
| 85 "\n" | |
| 86 ) | |
| 87 } | |
| 88 if (length(missing_in_dm_var) > 0) { | |
| 89 prefix <- if (length(missing_in_dm_var) < 4) { | |
| 90 "\n The " | |
| 91 } else { | |
| 92 "\n For example, the " | |
| 93 } | |
| 94 err.stock <- c( | |
| 95 err.stock, | |
| 96 prefix, | |
| 97 "following identifiers found in the variable metadata file\n", | |
| 98 " do not appear in the data matrix:\n", | |
| 99 " ", | |
| 100 paste(head(missing_in_dm_var, 3), collapse = "\n "), | |
| 101 "\n" | |
| 102 ) | |
| 103 } | |
| 104 } | |
| 105 | |
| 106 if (length(err.stock) > 0) { | |
| 107 err.stock <- c(err.stock, "\nPlease check your data.\n") | |
| 108 } | |
| 109 | |
| 110 return(err.stock) | |
| 111 } | |
| 112 | |
| 113 | |
| 114 intens_check <- function( | |
| 115 DM.name, | |
| 116 SM.name, | |
| 117 VM.name, | |
| 118 method, | |
| 119 chosen.stat, | |
| 120 class.col, | |
| 121 test.fold, | |
| 122 class1, | |
| 123 fold.frac, | |
| 124 logarithm, | |
| 125 VM.output, | |
| 126 graphs.output) { | |
| 127 # Read input tables | |
| 128 DM <- read.table(DM.name, header = TRUE, sep = "\t", check.names = FALSE) | |
| 129 SM <- read.table(SM.name, header = TRUE, sep = "\t", check.names = FALSE) | |
| 130 VM <- read.table(VM.name, header = TRUE, sep = "\t", check.names = FALSE) | |
| 131 | |
| 132 # Table match check | |
| 133 table.check <- table_match(DM, SM, VM) | |
| 134 check.err(table.check) | |
| 135 | |
| 136 # Transpose and align DM | |
| 137 rownames(DM) <- DM[, 1] | |
| 138 DM <- data.frame(t(DM[, -1])) | |
| 139 DM <- merge(x = cbind(1:nrow(SM), SM), y = DM, by.x = 2, by.y = 0) | |
| 140 DM <- DM[order(DM[, 2]), ] | |
| 141 rownames(DM) <- DM[, 1] | |
| 142 DM <- DM[, -c(1:(ncol(SM) + 1))] | |
| 143 | |
| 144 stat.list <- strsplit(chosen.stat, ",")[[1]] | |
| 145 | |
| 146 # Class assignment | |
| 147 if (method == "no_class") { | |
| 148 c_class <- rep("global", nrow(DM)) | |
| 149 classnames <- "global" | |
| 150 nb_class <- 1 | |
| 151 test.fold <- "No" | |
| 152 } else { | |
| 153 class.col <- colnames(SM)[as.numeric(class.col)] | |
| 154 if (!(class.col %in% colnames(SM))) { | |
| 155 stop("The column ", class.col, " is not in sample metadata") | |
| 156 } | |
| 157 c_class <- as.factor(SM[, class.col]) | |
| 158 nb_class <- nlevels(c_class) | |
| 159 classnames <- levels(c_class) | |
| 160 if (nb_class < 2 && test.fold == "Yes") { | |
| 161 cat( | |
| 162 "The '", | |
| 163 class.col, | |
| 164 "' column contains only one class, fold calculation could not be executed.\n" | |
| 165 ) | |
| 166 } | |
| 167 if (nb_class > (nrow(SM)) / 3 && method == "each_class") { | |
| 168 cat("There are too many classes, consider reducing the number.\n") | |
| 169 } | |
| 170 if (method == "one_class") { | |
| 171 if (!(class1 %in% classnames)) { | |
| 172 stop("The '", class1, "' class does not appear in the '", class.col, "'column.") | |
| 173 } | |
| 174 c_class <- factor( | |
| 175 ifelse(c_class == class1, class1, "Other"), | |
| 176 levels = c(class1, "Other") | |
| 177 ) | |
| 178 nb_class <- nlevels(c_class) | |
| 179 classnames <- levels(c_class) | |
| 180 } | |
| 181 } | |
| 182 | |
| 183 # Check numeric | |
| 184 if (!is.numeric(as.matrix(DM))) { | |
| 185 findchar <- function(myval) { | |
| 186 ifelse( | |
| 187 is.na(myval), | |
| 188 "ok", | |
| 189 ifelse(is.na(as.numeric(as.character(myval))), "char", "ok") | |
| 190 ) | |
| 191 } | |
| 192 chardiag <- suppressWarnings(apply(DM, 2, vapply, findchar, "character")) | |
| 193 charlist <- which(chardiag == "char") | |
| 194 err.stock <- paste( | |
| 195 "\n- - - - - - - - -\nYour dataMatrix contains", | |
| 196 length(charlist), | |
| 197 "non-numeric value(s).\n" | |
| 198 ) | |
| 199 stop(c( | |
| 200 err.stock, | |
| 201 "The dataMatrix file is supposed to contain only numeric values.\n- - - - - - - - -\n" | |
| 202 )) | |
| 203 } | |
| 204 | |
| 205 DM <- cbind(c_class, DM) | |
| 206 stat.res <- t(DM[0, -1, drop = FALSE]) | |
| 207 names <- NULL | |
| 208 | |
| 209 mean.res <- sd.res <- med.res <- quart.res <- dec.res <- NA.res <- pct_NA.res <- fold.res <- NULL | |
| 210 mean.names <- sd.names <- med.names <- quart.names <- dec.names <- NA.names <- pct_NA.names <- fold.names <- NULL | |
| 211 graphs <- if (("NA" %in% stat.list) || (test.fold == "Yes")) 1 else 0 | |
| 212 data_bp <- data.frame() | |
| 213 | |
| 214 for (j in 1:nb_class) { | |
| 215 idx <- which(DM$c_class == classnames[j]) | |
| 216 # Mean | |
| 217 if ("mean" %in% stat.list) { | |
| 218 mean.res <- cbind(mean.res, colMeans(DM[idx, -1], na.rm = TRUE)) | |
| 219 mean.names <- cbind(mean.names, paste("Mean", classnames[j], sep = "_")) | |
| 220 if (j == nb_class) { | |
| 221 stat.res <- cbind(stat.res, mean.res) | |
| 222 names <- cbind(names, mean.names) | |
| 223 } | |
| 224 } | |
| 225 # SD | |
| 226 if ("sd" %in% stat.list) { | |
| 227 sd.res <- cbind(sd.res, apply(DM[idx, -1], 2, sd, na.rm = TRUE)) | |
| 228 sd.names <- cbind(sd.names, paste("Sd", classnames[j], sep = "_")) | |
| 229 if (j == nb_class) { | |
| 230 stat.res <- cbind(stat.res, sd.res) | |
| 231 names <- cbind(names, sd.names) | |
| 232 } | |
| 233 } | |
| 234 # Median | |
| 235 if (("median" %in% stat.list) && (!("quartile" %in% stat.list))) { | |
| 236 med.res <- cbind(med.res, apply(DM[idx, -1], 2, median, na.rm = TRUE)) | |
| 237 med.names <- cbind(med.names, paste("Median", classnames[j], sep = "_")) | |
| 238 if (j == nb_class) { | |
| 239 stat.res <- cbind(stat.res, med.res) | |
| 240 names <- cbind(names, med.names) | |
| 241 } | |
| 242 } | |
| 243 # Quartiles | |
| 244 if ("quartile" %in% stat.list) { | |
| 245 quart.res <- cbind( | |
| 246 quart.res, | |
| 247 t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE)) | |
| 248 ) | |
| 249 quart.names <- cbind( | |
| 250 quart.names, | |
| 251 paste("Min", classnames[j], sep = "_"), | |
| 252 paste("Q1", classnames[j], sep = "_"), | |
| 253 paste("Median", classnames[j], sep = "_"), | |
| 254 paste("Q3", classnames[j], sep = "_"), | |
| 255 paste("Max", classnames[j], sep = "_") | |
| 256 ) | |
| 257 if (j == nb_class) { | |
| 258 stat.res <- cbind(stat.res, quart.res) | |
| 259 names <- cbind(names, quart.names) | |
| 260 } | |
| 261 } | |
| 262 # Deciles | |
| 263 if ("decile" %in% stat.list) { | |
| 264 dec.res <- cbind( | |
| 265 dec.res, | |
| 266 t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE, seq(0, 1, 0.1))) | |
| 267 ) | |
| 268 dec.names <- cbind( | |
| 269 dec.names, | |
| 270 t(matrix(paste("D", seq(0, 10, 1), sep = "_", classnames[j]))) | |
| 271 ) | |
| 272 if (j == nb_class) { | |
| 273 stat.res <- cbind(stat.res, dec.res) | |
| 274 names <- cbind(names, dec.names) | |
| 275 } | |
| 276 } | |
| 277 # Missing values | |
| 278 if ("NA" %in% stat.list) { | |
| 279 nb_NA <- apply(DM[idx, -1], 2, function(x) sum(is.na(x))) | |
| 280 pct_NA <- round(nb_NA / nrow(DM[idx, -1]) * 100, 4) | |
| 281 NA.res <- cbind(NA.res, nb_NA) | |
| 282 pct_NA.res <- cbind(pct_NA.res, pct_NA) | |
| 283 NA.names <- cbind(NA.names, paste("NA", classnames[j], sep = "_")) | |
| 284 pct_NA.names <- cbind( | |
| 285 pct_NA.names, | |
| 286 paste("Pct_NA", classnames[j], sep = "_") | |
| 287 ) | |
| 288 if (j == nb_class) { | |
| 289 stat.res <- cbind(stat.res, NA.res, pct_NA.res) | |
| 290 names <- cbind(names, NA.names, pct_NA.names) | |
| 291 } | |
| 292 # Barplot data | |
| 293 bins <- cut(pct_NA, breaks = c(-1, 20, 40, 60, 80, 100), labels = FALSE) | |
| 294 for (b in 1:5) data_bp[b, j] <- sum(bins == b) | |
| 295 rownames(data_bp) <- c( | |
| 296 "0%-20%", | |
| 297 "20%-40%", | |
| 298 "40%-60%", | |
| 299 "60%-80%", | |
| 300 "80%-100%" | |
| 301 ) | |
| 302 if (j == nb_class) { | |
| 303 if (sum(NA.res) == 0) cat("Data Matrix contains no NA.\n") | |
| 304 colnames(data_bp) <- classnames | |
| 305 data_bp <- as.matrix(data_bp) | |
| 306 } | |
| 307 } | |
| 308 # Mean fold change | |
| 309 if (test.fold == "Yes" && nb_class >= 2 && j != nb_class) { | |
| 310 if (method == "each_class") fold.frac <- "Top" | |
| 311 for (k in (j + 1):nb_class) { | |
| 312 ratio1 <- if (fold.frac == "Bottom") classnames[k] else classnames[j] | |
| 313 ratio2 <- if (fold.frac == "Bottom") classnames[j] else classnames[k] | |
| 314 fold <- colMeans(DM[which(DM$c_class == ratio1), -1], na.rm = TRUE) / | |
| 315 colMeans(DM[which(DM$c_class == ratio2), -1], na.rm = TRUE) | |
| 316 if (logarithm == "log2") { | |
| 317 fold <- log2(fold) | |
| 318 } else if ( | |
| 319 logarithm == "log10" | |
| 320 ) { | |
| 321 fold <- log10(fold) | |
| 322 } | |
| 323 fold.res <- cbind(fold.res, fold) | |
| 324 fname <- if (logarithm == "none") { | |
| 325 paste("fold", ratio1, "VS", ratio2, sep = "_") | |
| 326 } else { | |
| 327 paste(logarithm, "fold", ratio1, "VS", ratio2, sep = "_") | |
| 328 } | |
| 329 fold.names <- cbind(fold.names, fname) | |
| 330 } | |
| 331 if (j == nb_class) { | |
| 332 stat.res <- cbind(stat.res, fold.res) | |
| 333 names <- cbind(names, fold.names) | |
| 334 } | |
| 335 } | |
| 336 } | |
| 337 | |
| 338 # Ensure unique column names | |
| 339 VM.names <- colnames(VM) | |
| 340 for (i in seq_along(VM.names)) { | |
| 341 for (j in seq_along(names)) { | |
| 342 if (VM.names[i] == names[j]) names[j] <- paste(names[j], "2", sep = "_") | |
| 343 } | |
| 344 } | |
| 345 colnames(stat.res) <- names | |
| 346 | |
| 347 # Output | |
| 348 VM <- cbind(VM, stat.res) | |
| 349 write.table(VM, VM.output, sep = "\t", quote = FALSE, row.names = FALSE) | |
| 350 | |
| 351 # Graphics | |
| 352 pdf(graphs.output) | |
| 353 if (graphs == 1) { | |
| 354 if ("NA" %in% stat.list) { | |
| 355 graph.colors <- c("green3", "palegreen3", "lightblue", "orangered", "red") | |
| 356 par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE) | |
| 357 bp <- barplot( | |
| 358 data_bp, | |
| 359 col = graph.colors, | |
| 360 main = "Proportion of NA", | |
| 361 xlab = "Classes", | |
| 362 ylab = "Variables" | |
| 363 ) | |
| 364 legend( | |
| 365 "topright", | |
| 366 fill = graph.colors, | |
| 367 rownames(data_bp), | |
| 368 inset = c(-0.3, 0) | |
| 369 ) | |
| 370 stock <- 0 | |
| 371 for (i in 1:nrow(data_bp)) { | |
| 372 text( | |
| 373 bp, | |
| 374 stock + data_bp[i, ] / 2, | |
| 375 data_bp[i, ], | |
| 376 col = "white", | |
| 377 cex = 0.7 | |
| 378 ) | |
| 379 stock <- stock + data_bp[i, ] | |
| 380 } | |
| 381 } | |
| 382 if ((test.fold == "Yes") && (nb_class >= 2)) { | |
| 383 clean_fold <- fold.res | |
| 384 clean_fold[is.infinite(clean_fold)] <- NA | |
| 385 for (j in 1:ncol(clean_fold)) { | |
| 386 boxplot(clean_fold[, j], main = fold.names[j]) | |
| 387 } | |
| 388 } | |
| 389 } else { | |
| 390 plot.new() | |
| 391 legend("center", "You did not select any option with graphical output.") | |
| 392 } | |
| 393 dev.off() | |
| 394 } | |
| 395 | |
| 396 | |
| 397 parser <- ArgumentParser(description = "Intensity Check Tool") | |
| 398 | |
| 399 parser$add_argument( | |
| 400 "--dataMatrix_in", | |
| 401 required = TRUE, | |
| 402 help = "Input data matrix file" | |
| 403 ) | |
| 404 parser$add_argument( | |
| 405 "--sampleMetadata_in", | |
| 406 required = TRUE, | |
| 407 help = "Input sample metadata file" | |
| 408 ) | |
| 409 parser$add_argument( | |
| 410 "--variableMetadata_in", | |
| 411 required = TRUE, | |
| 412 help = "Input variable metadata file" | |
| 413 ) | |
| 414 parser$add_argument("--method", required = TRUE, help = "Computation method") | |
| 415 parser$add_argument( | |
| 416 "--chosen_stat", | |
| 417 required = TRUE, | |
| 418 help = "Statistics to compute" | |
| 419 ) | |
| 420 parser$add_argument( | |
| 421 "--class_col", | |
| 422 default = NULL, | |
| 423 help = "Class column (optional)" | |
| 424 ) | |
| 425 parser$add_argument( | |
| 426 "--test_fold", | |
| 427 default = NULL, | |
| 428 help = "Test fold (optional)" | |
| 429 ) | |
| 430 parser$add_argument("--class1", default = NULL, help = "Class1 (optional)") | |
| 431 parser$add_argument( | |
| 432 "--fold_frac", | |
| 433 default = NULL, | |
| 434 help = "Fold fraction (optional)" | |
| 435 ) | |
| 436 parser$add_argument( | |
| 437 "--logarithm", | |
| 438 default = NULL, | |
| 439 help = "Logarithm (optional)" | |
| 440 ) | |
| 441 parser$add_argument( | |
| 442 "--variableMetadata_out", | |
| 443 required = TRUE, | |
| 444 help = "Output variable metadata file" | |
| 445 ) | |
| 446 parser$add_argument( | |
| 447 "--graphs_out", | |
| 448 required = TRUE, | |
| 449 help = "Output graphs file" | |
| 450 ) | |
| 451 | |
| 452 args <- parser$parse_args() | |
| 453 | |
| 454 print(args) | |
| 455 | |
| 456 if (length(args) < 7) { | |
| 457 stop("NOT enough argument !!!") | |
| 458 } | |
| 459 | |
| 460 cat( | |
| 461 "\nJob starting time:\n", | |
| 462 format(Sys.time(), "%a %d %b %Y %X"), | |
| 463 "\n\n--------------------------------------------------------------------", | |
| 464 "\nIntensity Check parameters:\n\n" | |
| 465 ) | |
| 466 print(args) | |
| 467 cat("--------------------------------------------------------------------\n\n") | |
| 468 | |
| 469 class_col <- NULL | |
| 470 test_fold <- NULL | |
| 471 class1 <- NULL | |
| 472 fold_frac <- NULL | |
| 473 logarithm <- NULL | |
| 474 if (args$method == "each_class") { | |
| 475 class_col <- args$class_col | |
| 476 test_fold <- args$test_fold | |
| 477 if (args$test_fold == "Yes") { | |
| 478 logarithm <- args$logarithm | |
| 479 } | |
| 480 } | |
| 481 if (args$method == "one_class") { | |
| 482 class_col <- args$class_col | |
| 483 class1 <- args$class1 | |
| 484 test_fold <- args$test_fold | |
| 485 if (args$test_fold == "Yes") { | |
| 486 fold_frac <- args$fold_frac | |
| 487 logarithm <- args$logarithm | |
| 488 } | |
| 489 } | |
| 490 | |
| 491 err_no_option <- NULL | |
| 492 | |
| 493 if ( | |
| 494 ((args$method == "no_class") && (args$chosen_stat == "None")) || | |
| 495 ((args$method != "no_class") && | |
| 496 (args$chosen_stat == "None") && | |
| 497 (test_fold == "No")) | |
| 498 ) { | |
| 499 err_no_option <- "You did not select any computational option. Program can not be executed." | |
| 500 stop("\n- - - - - - - - -\n", err_no_option, "\n- - - - - - - - -\n") | |
| 501 } | |
| 502 | |
| 503 | |
| 504 if (is.null(err_no_option)) { | |
| 505 intens_check( | |
| 506 args$dataMatrix_in, | |
| 507 args$sampleMetadata_in, | |
| 508 args$variableMetadata_in, | |
| 509 args$method, | |
| 510 args$chosen_stat, | |
| 511 class_col, | |
| 512 test_fold, | |
| 513 class1, | |
| 514 fold_frac, | |
| 515 logarithm, | |
| 516 args$variableMetadata_out, | |
| 517 args$graphs_out | |
| 518 ) | |
| 519 } | |
| 520 | |
| 521 | |
| 522 cat( | |
| 523 "\n--------------------------------------------------------------------", | |
| 524 "\nInformation about R (version, Operating System, attached or loaded packages):\n\n" | |
| 525 ) | |
| 526 sessionInfo() | |
| 527 cat( | |
| 528 "--------------------------------------------------------------------\n", | |
| 529 "\nJob ending time:\n", | |
| 530 format(Sys.time(), "%a %d %b %Y %X") | |
| 531 ) | |
| 532 | |
| 533 | |
| 534 # delete the parameters to avoid the passage to the next tool in .RData image | |
| 535 rm(args) |
