Mercurial > repos > workflow4metabolomics > ms2snoop
diff MS2snoop.R @ 6:77abacd33c31 draft
planemo upload commit 1f791337b9b8f08326c01bf4799f50493ef54f80
author | workflow4metabolomics |
---|---|
date | Fri, 30 Sep 2022 16:18:56 +0000 |
parents | 78d5a12406c2 |
children | 2a1f120a6874 |
line wrap: on
line diff
--- a/MS2snoop.R Fri Aug 05 17:25:45 2022 +0000 +++ b/MS2snoop.R Fri Sep 30 16:18:56 2022 +0000 @@ -1,1236 +1,1233 @@ -#' -#' read and process mspurity W4M files -#' create a summary of fragment for each precursor and a graphics of peseudo -#' spectra + correlation on which checking of fragment is based on -#' V3 try to identify and process multiple files for 1 precursor which may -#' occur if different collision energy are used -#' V4 elimination of correlation = NA. Correlation is done with precursor, if -#' precursor is not present correlation with most intense peak -#' author: Jean-Francois Martin -#' V5 is versionned, lintR-compliant, packaged, unit-tested, documented and -#' tested against data from other labs. -#' new maintainer: Lain Pavot - lain.pavot@inrae.fr -#' -#' @import optparse -#' - - -get_version <- function() { - cmd <- commandArgs(trailingOnly = FALSE) - root <- dirname(gsub("--file=", "", cmd[grep("--file=", cmd)])) - readme <- readLines(file.path(root, "README.md")) - version_line <- readme[grepl(" * **@version**: ", readme, fixed = TRUE)] - return(gsub(".*: ", "", version_line)) -} - -defaults <- list( - MS2SNOOP_VERSION = get_version(), - MISSING_PARAMETER_ERROR = 1, - BAD_PARAMETER_VALUE_ERROR = 2, - MISSING_INPUT_FILE_ERROR = 3, - NO_ANY_RESULT_ERROR = 255, - DEFAULT_PRECURSOR_PATH = NULL, - DEFAULT_FRAGMENTS_PATH = NULL, - DEFAULT_COMPOUNDS_PATH = NULL, - DEFAULT_OUTPUT_PATH = "compound_fragments_result.txt", - DEFAULT_TOLMZ = 0.01, - DEFAULT_TOLRT = 20, - DEFAULT_MZDECIMAL = 3, - DEFAULT_R_THRESHOLD = 0.85, - DEFAULT_MINNUMBERSCAN = 8, - DEFAULT_SEUIL_RA = 0.05, - DEFAULT_FRAGMENTS_MATCH_DELTA = 10, - DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT = "ppm", - DEFAULT_PDF_PATH = "" -) -env <- globalenv() -for (default in names(defaults)) { - assign(default, defaults[[default]], envir = env) - lockBinding(default, env) -} - -######################################################################## - -get_formulas <- function( - mzref, - spectra, - nominal_mz_list, - processing_parameters, - background = !TRUE -) { - if (is.vector(mzref) && length(mzref) > 1) { - return(lapply( - mzref, - function(mz) { - return(get_formulas( - mzref = mz, - spectra = spectra, - nominal_mz_list = nominal_mz_list, - processing_parameters = processing_parameters, - background = background - )) - } - )) - } - input <- sprintf( - "%s-%s.ms", - gsub("[[:space:]]", "_", processing_parameters$c_name), - mzref - ) - create_ms_file(input, mzref, spectra, processing_parameters) - output <- sprintf( - "out/%s-%s.out", - gsub("[[:space:]]", "_", processing_parameters$c_name), - mzref - ) - command <- sprintf( - paste( - "sirius", - "--noCite", - "--noSummaries", - "--loglevel=WARNING", - "-i='%s'", - "-o='%s'", - "tree", - ## loglevel is not working taken into account during - ## sirius startup, so we filter outputs... - "2>&1 | grep '^(WARNING|SEVERE)'" - ), - input, - output - ) - verbose_catf( - ">> Sirius is running %swith the command: %s\n", - if (background) "in the background " else "", - command - ) - system( - command, - wait = !background, - ignore.stdout = background, - ignore.stderr = background - ) - return(extract_sirius_results(output, spectra$mz, processing_parameters)) -} - -create_ms_file <- function( - path, - mzref, - spectra, - processing_parameters -) { - file_content <- paste( - sprintf(">compound %s", processing_parameters$c_name), - sprintf(">ionization %s", processing_parameters$ionization), - sprintf(">parentmass %s", mzref), - sprintf(">formula %s", processing_parameters$elemcomposition), - sep = "\n" - ) - displayed_file_content <- sprintf( - "%s\n>collision\n%s", - file_content, - paste( - sprintf( - "%s %s", - spectra[1:3, "mz"], - spectra[1:3, "intensities"] - ), - collapse = "\n" - ) - ) - if (nrow(spectra) > 3) { - displayed_file_content <- sprintf( - "%s\n[... %s more rows of mz and intensities ...]", - displayed_file_content, - nrow(spectra) - 3 - ) - } - catf( - ">> MS file created for %s with content:\n%s\n", - processing_parameters$c_name, - displayed_file_content - ) - file_content <- sprintf( - "%s\n\n>collision\n%s", - file_content, - paste( - sprintf("%s %s", spectra$mz, spectra$intensities), - collapse = "\n" - ) - ) - cat(file_content, file = path, append = FALSE) -} - -extract_sirius_results <- function( - output, - mz_list, - processing_parameters -) { - - delta <- processing_parameters$fragment_match_delta - delta_unit <- tolower(processing_parameters$fragment_match_delta_unit) - - output <- list.dirs(output, recursive = FALSE)[[1]] - - spectra_out_dir <- sprintf("%s/spectra", output) - spectra_filename <- sprintf( - "%s/%s", - spectra_out_dir, - list.files(spectra_out_dir)[[1]] - ) - - trees_out_dir <- sprintf("%s/trees", output) - trees_filename <- sprintf( - "%s/%s", - trees_out_dir, - list.files(trees_out_dir)[[1]] - ) - - if (!is.null(spectra_filename)) { - sirius_results <- get_csv_or_tsv(spectra_filename) - } else { - return(rep(NA, length(mz_list))) - } - if (!is.null(trees_filename)) { - sirius_results <- cbind(sirius_results, extract_sirius_ppm(trees_filename)) - } else { - return(rep(NA, length(mz_list))) - } - - fragment_matchings <- data.frame( - formula = NA, - ppm = NA, - mz = mz_list, - error = NA - ) - - sirius_results <- filter_sirius_with_delta( - sirius_results = sirius_results, - original_mz = fragment_matchings$mz, - delta = delta, - delta_unit = delta_unit - ) - - for (index in seq_len(nrow(sirius_results))) { - result <- sirius_results[index, ] - filter <- which(order(abs(fragment_matchings$mz - result$mz)) == 1) - fragment_matchings[filter, "formula"] <- result$formula - fragment_matchings[filter, "ppm"] <- result$ppm - catf( - "[OK] Fragment with m/z=%s matches %s with a difference of %s ppm\n", - fragment_matchings[filter, "mz"], result$formula, result$ppm - ) - } - return(fragment_matchings) -} - -filter_sirius_with_delta <- function( - sirius_results, - original_mz, - delta, - delta_unit -) { - if (is.numeric(delta) && !is.na(delta) && delta > 0) { - if (delta_unit == "ppm") { - filter <- abs(sirius_results$ppm) <= delta - fine <- which(filter) - not_fine <- which(!filter) - catf( - paste("[KO] fragment %s (m/z=%s) eleminated because ppm=%s is greater", - "than delta=%s\n" - ), - sirius_results[not_fine, ]$formula, - sirius_results[not_fine, ]$mz, - sirius_results[not_fine, ]$ppm, - delta - ) - sirius_results <- sirius_results[fine, ] - } else if (delta_unit == "mz") { - differences <- sapply( - sirius_results$mz, - function(mz) min(abs(original_mz - mz)) - ) - fine <- which(sapply( - sirius_results$mz, - function(mz) any(abs(original_mz - mz) <= delta) - )) - not_fine <- which(sapply( - sirius_results$mz, - function(mz) all(abs(original_mz - mz) > delta) - )) - catf( - paste( - "[KO] fragment %s eleminated because mz difference=%s is", - "greater than delta=%s\n" - ), - sirius_results[not_fine, ]$formula, - differences[not_fine], - delta - ) - sirius_results <- sirius_results[fine, ] - } - } - return(sirius_results) -} - -extract_sirius_ppm <- function(path) { - json <- file(path, "r") - suppressWarnings(json_lines <- readLines(json)) - close(json) - json_lines <- json_lines[ - grepl("\\s+\"(massDeviation|recalibratedMass)\" :", json_lines) - ] - json_lines <- gsub("^\\s+\"[^\"]+\" : \"?", "", json_lines) - ppms <- json_lines[seq(1, length(json_lines), 2)] - mz <- json_lines[seq(2, length(json_lines), 2)] - ppms <- as.numeric(gsub(" ppm .*", "", ppms)) - mz <- as.numeric(gsub(",$", "", mz)) - ordered <- order(mz) - return(list(ppm = ppms[ordered], recalibrated_mz = mz[ordered])) -} - -#' @title plot_pseudo_spectra -#' @param x -#' @param fid -#' @param sum_int -#' @param vmz -#' @param cor_abs_int -#' @param refcol -#' @param c_name -#' @description plot_pseudo_spectra -#' function to compute sum of intensities among scans for all -#' m/z kept (cor > r_threshold & minimum number of scans) -#' and plot pseudo spectra -#' x dataframe scan X fragments with scans number in the 1st column and -#' ions in next with intensities -#' fid file id when several a precursor has been detected in several files -plot_pseudo_spectra <- function( - x, - fid, - sum_int, - vmz, - cor_abs_int, - refcol, - meaned_mz, - processing_parameters -) { - ## du fait de la difference de nombre de colonne entre la dataframe qui - ## inclue les scans en 1ere col, mzRef se decale de 1 - refcol <- refcol - 1 - ## compute relative intensities max=100% - rel_int <- sum_int[-1] - rel_int <- rel_int / max(rel_int) - - if (processing_parameters$do_pdf) { - ## define max value on vertical axis (need to increase in order to plot the - ## label of fragments) - ymax <- max(rel_int) + 0.2 * max(rel_int) - - par(mfrow = c(2, 1)) - plot(vmz, rel_int, type = "h", ylim = c(0, ymax), - main = processing_parameters$c_name - ) - ## low correl coef. will be display in grey - cor_low <- which(round(cor_abs_int, 2) < processing_parameters$r_threshold) - - lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2)) - - if (length(cor_low) > 0) { - text( - vmz[cor_low], - rel_int[cor_low], - lbmzcor[cor_low], - cex = 0.5, - col = "grey", - srt = 90, - adj = 0 - ) - if (length(vmz) - length(cor_low) > 1) { - text( - vmz[-c(refcol, cor_low)], - rel_int[-c(refcol, cor_low)], - lbmzcor[-c(refcol, cor_low)], - cex = 0.6, - col = 1, - srt = 90, - adj = 0 - ) - } - } else { - if (length(vmz) > 1) { - text( - vmz[-c(refcol)], - rel_int[-c(refcol)], - lbmzcor[-c(refcol)], - cex = 0.6, - col = 1, - srt = 90, - adj = 0 - ) - } - } - - text( - vmz[refcol], - rel_int[refcol], - lbmzcor[refcol], - cex = 0.8, - col = 2, - srt = 90, - adj = 0 - ) - } - - ## prepare result file - cor_valid <- (round(cor_abs_int, 2) >= processing_parameters$r_threshold) - - do_sirius <- TRUE - verbose_catf("Checking sirius parameters...\n") - if (is.null(processing_parameters$ionization)) { - do_sirius <- FALSE - verbose_catf("[KO] No ionization passed in parameter.\n") - } else { - verbose_catf("[OK] Ionization=%s.\n", processing_parameters$ionization) - } - if (is.na(processing_parameters$elemcomposition)) { - do_sirius <- FALSE - verbose_catf("[KO] Elemental composition is NA.\n") - } else if (length(processing_parameters$elemcomposition) < 1) { - do_sirius <- FALSE - verbose_catf("[KO] No elemental composition is provided.\n") - } else if (processing_parameters$elemcomposition == "") { - do_sirius <- FALSE - verbose_catf("[KO] Elemental composition is an empty string.\n") - } else { - verbose_catf( - "[OK] Elemental composition=%s.\n", - processing_parameters$elemcomposition - ) - } - - cp_res_length <- length(vmz) - ppm <- rep(NA, cp_res_length) - formulas <- rep(NA, cp_res_length) - if (do_sirius) { - verbose_catf("Everything is ok, preparing for sirius.\n") - formulas <- get_formulas( - mzref = processing_parameters$mzref, - spectra = data.frame(mz = meaned_mz, intensities = sum_int[-1]), - nominal_mz_list = vmz, - processing_parameters = processing_parameters - ) - if (nrow(formulas) == 0) { - catf("No formula found.\n") - } else { - ppm <- formulas$ppm - formulas <- formulas$formula - catf( - "Found %s formula for %s fragments\n", - length(formulas[which(!(is.na(formulas)))]), - cp_res_length - ) - } - } else { - verbose_catf("Sirius cannot be run.\n") - } - cp_res <- data.frame( - rep(processing_parameters$c_name, cp_res_length), - rep(processing_parameters$inchikey, cp_res_length), - rep(processing_parameters$elemcomposition, cp_res_length), - formulas, - vmz, - ppm, - rep(fid, cp_res_length), - cor_abs_int, - sum_int[-1], - rel_int, - cor_valid - ) - - colnames(cp_res) <- c( - "compoundName", - "inchikey", - "elemcomposition", - "fragment", - "fragment_mz", - "ppm", - "fileid", - "CorWithPrecursor", - "AbsoluteIntensity", - "relativeIntensity", - "corValid" - ) - return(cp_res) -} - -#' -#' @title extract_fragments -#' -#' @param precursors the precursor list from mspurity -#' @param fragments the fragments list from ms purity -# ' @param mzref -# ' @param rtref -# ' @param c_name -# ' @param inchikey -# ' @param elemcomposition -#' @param processing_parameters -#' @returns -#' -#' @description -#' function for extraction of fragments corresponding to precursors -#' detected by MSPurity -extract_fragments <- function( ## nolint cyclocomp_linter - precursors, - fragments, - processing_parameters -) { - ## filter precursor in the precursors file based on mz and rt in the - ## compound list - catf("processing %s\n", processing_parameters$c_name) - verbose_catf("===\n") - param <- processing_parameters - selected_precursors <- which( - (abs(precursors$precurMtchMZ - param$mzref) <= param$tolmz) - & (abs(precursors$precurMtchRT - param$rtref) <= param$tolrt) - ) - rm(param) - - verbose_catf( - "> %s precursors selected with mz=%s±%s and rt=%s±%s\n", - length(selected_precursors), - processing_parameters$mzref, - processing_parameters$tolmz, - processing_parameters$rtref, - processing_parameters$tolrt - ) - - ## check if there is the precursor in the file - - if (length(selected_precursors) < 1) { - cat("> non detected in precursor file\n") - show_end_processing() - return(NULL) - } - - precursors <- precursors[selected_precursors, ] - - ## check if fragments corresponding to precursor are found in several - ## files (collision energy) - ## this lead to a processing for each fileid - file_ids <- as.character(sort(unique(precursors$fileid))) - if (length(file_ids) > 1) { - catf("> several files detected for this compounds :\n") - } else if (length(file_ids) < 1 || nrow(precursors) < 1) { - return(data.frame()) - } - - res_comp <- data.frame() - for (curent_file_id in file_ids) { - curent_precursors <- precursors[precursors$fileid == curent_file_id, ] - selected_fragments <- fragments[ - fragments$grpid %in% as.character(curent_precursors$grpid) - & fragments$fileid == curent_file_id, - ] - filtered_fragments <- selected_fragments[ - selected_fragments$ra > processing_parameters$seuil_ra, - ] - if (nrow(filtered_fragments) != 0) { - res_comp_by_file <- process_file( - curent_file_id = curent_file_id, - precursor_mz = curent_precursors$mz, - filtered_fragments = filtered_fragments, - processing_parameters = processing_parameters - ) - if (!is.null(res_comp_by_file)) { - res_comp <- rbind(res_comp, res_comp_by_file) - } - } else { - catf("No fragment found for in fragment file\n") - } - } - return(unique(res_comp)) -} - -process_file <- function( - curent_file_id, - precursor_mz, - filtered_fragments, - processing_parameters -) { - mznominal <- round(x = filtered_fragments$mz, digits = 0) - meaned_mz <- round( - aggregate( - data.frame( - mz = filtered_fragments$mz, - mznominal = mznominal - ), - list(mznominal), - FUN = mean - )$mz, - digits = processing_parameters$mzdecimal - ) - filtered_fragments <- data.frame(filtered_fragments, mznominal) - - ## creation of cross table row=scan col=mz X=ra - - vmz <- as.character(sort(unique(filtered_fragments$mznominal))) - - ds_abs_int <- create_ds_abs_int(vmz, filtered_fragments) - - if (global_debug) { - print(ds_abs_int) - } - - ## elimination of mz with less than min_number_scan scans (user defined - ## parameter) - xmz <- rep(NA, ncol(ds_abs_int) - 1) - sum_int <- rep(NA, ncol(ds_abs_int)) - nbxmz <- 0 - nb_scan_check <- min(nrow(ds_abs_int), processing_parameters$min_number_scan) - - for (j in 2:ncol(ds_abs_int)) { - sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE) - if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) { - nbxmz <- nbxmz + 1 - xmz[nbxmz] <- j - } - } - - xmz <- xmz[-which(is.na(xmz))] - if (length(xmz) > 0) { - ds_abs_int <- ds_abs_int[, -c(xmz)] - sum_int <- sum_int[-c(xmz)] - ## liste des mz keeped decale de 1 avec ds_abs_int - vmz <- as.numeric(vmz[-c(xmz - 1)]) - meaned_mz <- meaned_mz[-c(xmz - 1)] - } - - ## mz of precursor in data precursor to check correlation with - mz_prec <- paste0( - "mz", - round(mean(precursor_mz), processing_parameters$mzdecimal) - ) - ## reference ion for correlation computing = precursor OR maximum - ## intensity ion in precursor is not present - refcol <- which(colnames(ds_abs_int) == mz_prec) - if (length(refcol) == 0) { - refcol <- which(sum_int == max(sum_int, na.rm = TRUE)) - } - - if (processing_parameters$do_pdf) { - start_pdf(processing_parameters, curent_file_id) - } - - ## Pearson correlations between absolute intensities computing - cor_abs_int <- rep(NA, length(vmz)) - - if (length(refcol) > 0) { - for (i in 2:length(ds_abs_int)) { - cor_abs_int[i - 1] <- stats::cor( - x = ds_abs_int[[refcol]], - y = ds_abs_int[[i]], - use = "pairwise.complete.obs", - method = "pearson" - ) - debug_catf( - "Correlation between %s and %s: %s\n", - paste(ds_abs_int[[refcol]], collapse = ";"), - paste(ds_abs_int[[i]], collapse = ";"), - paste(cor_abs_int[i - 1], collapse = ";") - ) - if (processing_parameters$do_pdf) { - pdf_plot_ds_abs_int( - processing_parameters$c_name, - ds_abs_int, - refcol, - i, - round(cor_abs_int[i - 1], 2) - ) - } - } - ## plot pseudo spectra - res_comp_by_file <- plot_pseudo_spectra( - x = ds_abs_int, - fid = curent_file_id, - sum_int = sum_int, - vmz = vmz, - cor_abs_int = cor_abs_int, - refcol = refcol, - meaned_mz = meaned_mz, - processing_parameters = processing_parameters - ) - catf( - "%s has been processed and %s fragments have been found.\n", - processing_parameters$c_name, - nrow(res_comp_by_file) - ) - } else { - res_comp_by_file <- NULL - cat(">> non detected in fragments file \n") - } - show_end_processing() - if (processing_parameters$do_pdf) { - end_pdf() - } - return(res_comp_by_file) -} - -create_ds_abs_int <- function(vmz, filtered_fragments) { - verbose_catf( - ">> fragments: %s\n", - paste(vmz, collapse = " ") - ) - ds_abs_int <- create_int_mz(vmz[1], filtered_fragments) - for (mz in vmz[-1]) { - int_mz <- create_int_mz(mz, filtered_fragments) - ds_abs_int <- merge( - x = ds_abs_int, - y = int_mz, - by.x = 1, - by.y = 1, - all.x = TRUE, - all.y = TRUE - ) - } - return(ds_abs_int) -} - -create_int_mz <- function(mz, filtered_fragments) { - ## absolute intensity - int_mz <- filtered_fragments[ - filtered_fragments$mznominal == mz, - c("acquisitionNum", "i") - ] - colnames(int_mz)[2] <- paste0("mz", mz) - ## average intensities of mass in duplicate scans - comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean) - return(comp_scans[, -1]) -} - -show_end_processing <- function() { - verbose_catf("==========\n") - cat("\n") -} - -start_pdf <- function(processing_parameters, curent_file_id) { - if (!dir.exists(processing_parameters$pdf_path)) { - dir.create(processing_parameters$pdf_path, recursive = TRUE) - } - pdf( - file = sprintf( - "%s/%s_processing_file%s.pdf", - processing_parameters$pdf_path, - processing_parameters$c_name, - curent_file_id - ), - width = 8, - height = 11 - ) - par(mfrow = c(3, 2)) -} - -pdf_plot_ds_abs_int <- function(c_name, ds_abs_int, refcol, i, r_coef) { - plot( - ds_abs_int[[refcol]], - ds_abs_int[[i]], - xlab = colnames(ds_abs_int)[refcol], - ylab = colnames(ds_abs_int)[i], - main = sprintf( - "%s corr coeff r=%s", c_name, r_coef - ) - ) -} -end_pdf <- function() { - dev.off() -} - -set_global <- function(var, value) { - assign(var, value, envir = globalenv()) -} - -set_debug <- function() { - set_global("global_debug", TRUE) -} - -unset_debug <- function() { - set_global("global_debug", FALSE) -} - -set_verbose <- function() { - set_global("global_verbose", TRUE) -} - -unset_verbose <- function() { - set_global("global_verbose", FALSE) -} - -verbose_catf <- function(...) { - if (global_verbose) { - cat(sprintf(...), sep = "") - } -} - - -debug_catf <- function(...) { - if (global_debug) { - cat(sprintf(...), sep = "") - } -} - -catf <- function(...) { - cat(sprintf(...), sep = "") -} - -create_parser <- function() { - parser <- optparse::OptionParser() - parser <- optparse::add_option( - parser, - c("-v", "--verbose"), - action = "store_true", - default = FALSE, - help = paste( - "[default %default]", - "Print extra output" - ) - ) - parser <- optparse::add_option( - parser, - c("-V", "--version"), - action = "store_true", - default = FALSE, - help = "Prints version and exits" - ) - parser <- optparse::add_option( - parser, - c("-d", "--debug"), - action = "store_true", - default = FALSE, - help = paste( - "[default %default]", - "Print debug outputs" - ) - ) - parser <- optparse::add_option( - parser, - c("-o", "--output"), - type = "character", - default = DEFAULT_OUTPUT_PATH, - action = "store", - help = "Path to the output file [default %default]" - ) - parser <- optparse::add_option( - parser, - c("-p", "--precursors"), - type = "character", - default = DEFAULT_PRECURSOR_PATH, - action = "store", - help = "Path to the precursors file [default %default]" - ) - parser <- optparse::add_option( - parser, - c("-f", "--fragments"), - type = "character", - default = DEFAULT_FRAGMENTS_PATH, - action = "store", - help = "Path to the fragments file [default %default]" - ) - parser <- optparse::add_option( - parser, - c("-c", "--compounds"), - type = "character", - default = DEFAULT_COMPOUNDS_PATH, - action = "store", - help = "Path to the compounds file [default %default]" - ) - parser <- optparse::add_option( - parser, - c("--tolmz"), - type = "numeric", - action = "store", - default = DEFAULT_TOLMZ, - metavar = "number", - help = paste( - "[default %default]", - "Tolerance for MZ (in Dalton) to match the standard in the compounds" - ) - ) - parser <- optparse::add_option( - parser, - c("--tolrt"), - type = "integer", - action = "store", - default = DEFAULT_TOLRT, - metavar = "number", - help = paste( - "[default %default]", - "RT (in seconds) to match the standard in the compounds" - ) - ) - parser <- optparse::add_option( - parser, - c("--seuil_ra"), - type = "numeric", - action = "store", - default = DEFAULT_SEUIL_RA, - metavar = "number", - help = paste( - "[default %default]", - "relative intensity threshold" - ), - ) - parser <- optparse::add_option( - parser, - c("--mzdecimal"), - type = "integer", - default = DEFAULT_MZDECIMAL, - action = "store", - help = paste( - "[default %default]", - "Number of decimal to write for MZ" - ), - metavar = "number" - ) - parser <- optparse::add_option( - parser, - c("--r_threshold"), - type = "integer", - default = DEFAULT_R_THRESHOLD, - action = "store", - help = paste( - "[default %default]", - "R-Pearson correlation threshold between precursor and fragment", - "absolute intensity" - ), - metavar = "number" - ) - parser <- optparse::add_option( - parser, - c("--min_number_scan"), - type = "numeric", - action = "store", - default = DEFAULT_MINNUMBERSCAN, - help = paste( - "[default %default]", - "Fragments are kept if there are found in a minimum number", - "of min_number_scan scans" - ), - metavar = "number" - ) - parser <- optparse::add_option( - parser, - c("--pdf_path"), - type = "character", - default = DEFAULT_PDF_PATH, - help = paste( - "[default %default]", - "PDF files output path" - ) - ) - parser <- optparse::add_option( - parser, - c("--ionization"), - type = "character", - action = "store", - default = "None", - help = paste( - "[default %default]", - "Which ionization to use for sirius" - ), - metavar = "character" - ) - parser <- optparse::add_option( - parser, - c("--fragment_match_delta"), - type = "numeric", - action = "store", - default = DEFAULT_FRAGMENTS_MATCH_DELTA, - help = paste( - "[default %default]", - "Fragment match delta" - ), - metavar = "numeric" - ) - parser <- optparse::add_option( - parser, - c("--fragment_match_delta_unit"), - type = "character", - action = "store", - default = DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT, - help = paste( - "[default %default]", - "Fragment match delta" - ), - metavar = "character" - ) - return(parser) -} - -stop_with_status <- function(msg, status) { - sink(stderr()) - message(sprintf("Error: %s", msg)) - message(sprintf("Error code: %s", status)) - sink(NULL) - base::quit(status = status) -} - -check_args_validity <- function(args) { ## nolint cyclocomp_linter - if (length(args$output) == 0 || nchar(args$output[1]) == 0) { - stop_with_status( - "Missing output parameters. Please set it with --output.", - MISSING_PARAMETER_ERROR - ) - } - if (length(args$precursors) == 0 || nchar(args$precursors[1]) == 0) { - stop_with_status( - "Missing precursors parameters. Please set it with --precursors.", - MISSING_PARAMETER_ERROR - ) - } - if (length(args$fragments) == 0 || nchar(args$fragments[1]) == 0) { - stop_with_status( - "Missing fragments parameters. Please set it with --fragments.", - MISSING_PARAMETER_ERROR - ) - } - if (length(args$compounds) == 0 || nchar(args$compounds[1]) == 0) { - stop_with_status( - "Missing compounds parameters. Please set it with --compounds.", - MISSING_PARAMETER_ERROR - ) - } - if (!file.exists(args$precursors)) { - stop_with_status( - sprintf( - "Precursors file %s does not exist or cannot be accessed.", - args$precursors - ), - MISSING_INPUT_FILE_ERROR - ) - } - if (!file.exists(args$fragments)) { - stop_with_status( - sprintf( - "Fragments file %s does not exist or cannot be accessed.", - args$fragments - ), - MISSING_INPUT_FILE_ERROR - ) - } - if (!file.exists(args$compounds)) { - stop_with_status( - sprintf( - "Compounds file %s does not exist or cannot be accessed.", - args$compounds - ), - MISSING_INPUT_FILE_ERROR - ) - } - if (in_galaxy_env()) { - check_galaxy_args_validity(args) - } -} - -in_galaxy_env <- function() { - sysvars <- Sys.getenv() - sysvarnames <- names(sysvars) - return( - "_GALAXY_JOB_HOME_DIR" %in% sysvarnames - || "_GALAXY_JOB_TMP_DIR" %in% sysvarnames - || "GALAXY_MEMORY_MB" %in% sysvarnames - || "GALAXY_MEMORY_MB_PER_SLOT" %in% sysvarnames - || "GALAXY_SLOTS" %in% sysvarnames - ) -} - -check_galaxy_args_validity <- function(args) { - if (!file.exists(args$output)) { - stop_with_status( - sprintf( - "Output file %s does not exist or cannot be accessed.", - args$output - ), - MISSING_INPUT_FILE_ERROR - ) - } -} - -get_csv_or_tsv <- function( - path, - sep_stack = c("\t", ",", ";"), - sep_names = c("tab", "comma", "semicolon"), - header = TRUE, - quote = "\"" -) { - sep <- determine_csv_or_tsv_sep( - path = path, - sep_stack = sep_stack, - header = header, - quote = quote - ) - verbose_catf( - "%s separator has been determined for %s.\n", - sep_names[sep_stack == sep], - path - ) - return(read.table( - file = path, - sep = sep, - header = header, - quote = quote - )) -} - -determine_csv_or_tsv_sep <- function( - path, - sep_stack = c("\t", ",", ";"), - header = TRUE, - quote = "\"" -) { - count <- -1 - best_sep <- sep_stack[1] - for (sep in sep_stack) { - tryCatch({ - table <- read.table( - file = path, - sep = sep, - header = header, - quote = quote, - nrows = 1 - ) - if (ncol(table) > count) { - count <- ncol(table) - best_sep <- sep - } - }) - } - return(best_sep) -} - -uniformize_columns <- function(df) { - cols <- colnames(df) - for (func in c(tolower)) { - cols <- func(cols) - } - colnames(df) <- cols - return(df) -} - -handle_galaxy_param <- function(args) { - for (param in names(args)) { - if (is.character(args[[param]])) { - args[[param]] <- gsub("__ob__", "[", args[[param]]) - args[[param]] <- gsub("__cb__", "]", args[[param]]) - } - } - return(args) -} - -zip_pdfs <- function(processing_parameters) { - if (processing_parameters$do_pdf) { - if (zip <- Sys.getenv("R_ZIPCMD", "zip") == "") { - catf("R could not fin the zip executable. Trying luck: zip = \"zip\"") - zip <- "zip" - } else { - catf("Found zip executable at %s .", zip) - } - utils::zip( - processing_parameters$pdf_zip_path, - processing_parameters$pdf_path, - zip = zip - ) - } -} - -main <- function(args) { - if (args$version) { - catf("%s\n", MS2SNOOP_VERSION) - base::quit(status = 0) - } - if (in_galaxy_env()) { - print(sessionInfo()) - cat("\n\n") - } - check_args_validity(args) - args <- handle_galaxy_param(args) - if (args$ionization == "None") { - args$ionization <- NULL - } - if (args$debug) { - set_debug() - } - if (args$verbose) { - set_verbose() - } - precursors <- get_csv_or_tsv(args$precursors) - fragments <- get_csv_or_tsv(args$fragments) - compounds <- get_csv_or_tsv(args$compounds) - - compounds <- uniformize_columns(compounds) - mandatory_columns <- c( - "compound_name", - "mz", - "rtsec", - "inchikey" - ) - presents <- mandatory_columns %in% colnames(compounds) - if (!all(presents)) { - stop_with_status( - sprintf( - "Some columns are missing: %s", - paste(mandatory_columns[which(!presents)], collapse = ", ") - ), - BAD_PARAMETER_VALUE_ERROR - ) - } - - res_all <- data.frame() - processing_parameters <- list( - min_number_scan = args$min_number_scan, - mzdecimal = args$mzdecimal, - r_threshold = args$r_threshold, - seuil_ra = args$seuil_ra, - tolmz = args$tolmz, - tolrt = args$tolrt, - ionization = args$ionization, - do_pdf = nchar(args$pdf_path) > 0, - pdf_zip_path = args$pdf_path, - pdf_path = tempdir(), - fragment_match_delta = args$fragment_match_delta, - fragment_match_delta_unit = args$fragment_match_delta_unit - ) - for (i in seq_len(nrow(compounds))) { - processing_parameters$mzref <- compounds[["mz"]][i] - processing_parameters$rtref <- compounds[["rtsec"]][i] - processing_parameters$c_name <- compounds[["compound_name"]][i] - processing_parameters$inchikey <- compounds[["inchikey"]][i] - processing_parameters$elemcomposition <- compounds[["elemcomposition"]][i] - res_cor <- extract_fragments( - precursors = precursors, - fragments = fragments, - processing_parameters = processing_parameters - ) - if (!is.null(res_cor)) { - res_all <- rbind(res_all, res_cor) - } - } - - if (nrow(res_all) == 0) { - stop_with_status("No result at all!", NO_ANY_RESULT_ERROR) - } - - write.table( - x = res_all, - file = args$output, - sep = "\t", - row.names = FALSE - ) - zip_pdfs(processing_parameters) - unlink(processing_parameters$pdf_path, recursive = TRUE) -} - -global_debug <- FALSE -global_verbose <- FALSE -args <- optparse::parse_args(create_parser()) -main(args) - -warnings() +#' read and process mspurity W4M files +#' create a summary of fragment for each precursor and a graphics of peseudo +#' spectra + correlation on which checking of fragment is based on +#' V3 try to identify and process multiple files for 1 precursor which may +#' occur if different collision energy are used +#' V4 elimination of correlation = NA. Correlation is done with precursor, if +#' precursor is not present correlation with most intense peak +#' author: Jean-Francois Martin +#' V5 is versionned, lintR-compliant, packaged, unit-tested, documented and +#' tested against data from other labs. +#' new maintainer: Lain Pavot - lain.pavot@inrae.fr +#' +#' @import optparse +#' + + +get_version <- function() { + cmd <- commandArgs(trailingOnly = FALSE) + root <- dirname(gsub("--file=", "", cmd[grep("--file=", cmd)])) + readme <- readLines(file.path(root, "README.md")) + version_line <- readme[grepl(" * **@version**: ", readme, fixed = TRUE)] + return(gsub(".*: ", "", version_line)) +} + +defaults <- list( + MS2SNOOP_VERSION = get_version(), + MISSING_PARAMETER_ERROR = 1, + BAD_PARAMETER_VALUE_ERROR = 2, + MISSING_INPUT_FILE_ERROR = 3, + NO_ANY_RESULT_ERROR = 255, + DEFAULT_PRECURSOR_PATH = NULL, + DEFAULT_FRAGMENTS_PATH = NULL, + DEFAULT_COMPOUNDS_PATH = NULL, + DEFAULT_OUTPUT_PATH = "compound_fragments_result.txt", + DEFAULT_TOLMZ = 0.01, + DEFAULT_TOLRT = 20, + DEFAULT_MZDECIMAL = 3, + DEFAULT_R_THRESHOLD = 0.85, + DEFAULT_MINNUMBERSCAN = 8, + DEFAULT_SEUIL_RA = 0.05, + DEFAULT_FRAGMENTS_MATCH_DELTA = 10, + DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT = "ppm", + DEFAULT_PDF_PATH = "" +) +env <- globalenv() +for (default in names(defaults)) { + assign(default, defaults[[default]], envir = env) + lockBinding(default, env) +} + +######################################################################## + +get_formulas <- function( + mzref, + spectra, + processing_parameters, + background = !TRUE +) { + if (is.vector(mzref) && length(mzref) > 1) { + return(lapply( + mzref, + function(mz) { + return(get_formulas( + mzref = mz, + spectra = spectra, + processing_parameters = processing_parameters, + background = background + )) + } + )) + } + input <- sprintf( + "%s-%s.ms", + gsub("[[:space:]]", "_", processing_parameters$c_name), + mzref + ) + create_ms_file(input, mzref, spectra, processing_parameters) + output <- sprintf( + "out/%s-%s.out", + gsub("[[:space:]]", "_", processing_parameters$c_name), + mzref + ) + command <- sprintf( + paste( + "sirius", + "--noCite", + "--noSummaries", + "--loglevel=WARNING", + "-i='%s'", + "-o='%s'", + "tree", + ## loglevel is not working taken into account during + ## sirius startup, so we filter outputs... + "2>&1 | grep '^(WARNING|SEVERE)'" + ), + input, + output + ) + verbose_catf( + ">> Sirius is running %swith the command: %s\n", + if (background) "in the background " else "", + command + ) + system( + command, + wait = !background, + ignore.stdout = background, + ignore.stderr = background + ) + return(extract_sirius_results(output, spectra$mz, processing_parameters)) +} + +create_ms_file <- function( + path, + mzref, + spectra, + processing_parameters +) { + file_content <- paste( + sprintf(">compound %s", processing_parameters$c_name), + sprintf(">ionization %s", processing_parameters$ionization), + sprintf(">parentmass %s", mzref), + sprintf(">formula %s", processing_parameters$elemcomposition), + sep = "\n" + ) + displayed_file_content <- sprintf( + "%s\n>collision\n%s", + file_content, + paste( + sprintf( + "%s %s", + spectra[1:3, "mz"], + spectra[1:3, "intensities"] + ), + collapse = "\n" + ) + ) + if (nrow(spectra) > 3) { + displayed_file_content <- sprintf( + "%s\n[... %s more rows of mz and intensities ...]", + displayed_file_content, + nrow(spectra) - 3 + ) + } + catf( + ">> MS file created for %s with content:\n%s\n", + processing_parameters$c_name, + displayed_file_content + ) + file_content <- sprintf( + "%s\n\n>collision\n%s", + file_content, + paste( + paste(spectra$mz, spectra$intensities), + collapse = "\n" + ) + ) + cat(file_content, file = path, append = FALSE) +} + +extract_sirius_results <- function( + output, + mz_list, + processing_parameters +) { + + delta <- processing_parameters$fragment_match_delta + delta_unit <- tolower(processing_parameters$fragment_match_delta_unit) + + output <- list.dirs(output, recursive = FALSE)[[1]] + + spectra_out_dir <- sprintf("%s/spectra", output) + spectra_filename <- sprintf( + "%s/%s", + spectra_out_dir, + list.files(spectra_out_dir)[[1]] + ) + + trees_out_dir <- sprintf("%s/trees", output) + trees_filename <- sprintf( + "%s/%s", + trees_out_dir, + list.files(trees_out_dir)[[1]] + ) + + if (!is.null(spectra_filename)) { + sirius_results <- get_csv_or_tsv(spectra_filename) + } else { + return(rep(NA, length(mz_list))) + } + if (!is.null(trees_filename)) { + sirius_results <- cbind(sirius_results, extract_sirius_ppm(trees_filename)) + } else { + return(rep(NA, length(mz_list))) + } + + fragment_matchings <- data.frame( + formula = NA, + ppm = NA, + mz = mz_list, + error = NA + ) + + sirius_results <- filter_sirius_with_delta( + sirius_results = sirius_results, + original_mz = fragment_matchings$mz, + delta = delta, + delta_unit = delta_unit + ) + + for (index in seq_len(nrow(sirius_results))) { + result <- sirius_results[index, ] + filter <- which(order(abs(fragment_matchings$mz - result$mz)) == 1) + fragment_matchings[filter, "formula"] <- result$formula + fragment_matchings[filter, "ppm"] <- result$ppm + catf( + "[OK] Fragment with m/z=%s matches %s with a difference of %s ppm\n", + fragment_matchings[filter, "mz"], result$formula, result$ppm + ) + } + return(fragment_matchings) +} + +filter_sirius_with_delta <- function( + sirius_results, + original_mz, + delta, + delta_unit +) { + if (is.numeric(delta) && !is.na(delta) && delta > 0) { + if (delta_unit == "ppm") { + filter <- abs(sirius_results$ppm) <= delta + fine <- which(filter) + not_fine <- which(!filter) + catf( + paste( + "[KO] fragment %s (m/z=%s) eleminated because ppm=%s is greater", + "than delta=%s\n" + ), + sirius_results[not_fine, ]$formula, + sirius_results[not_fine, ]$mz, + sirius_results[not_fine, ]$ppm, + delta + ) + sirius_results <- sirius_results[fine, ] + } else if (delta_unit == "mz") { + differences <- sapply( + sirius_results$mz, + function(mz) min(abs(original_mz - mz)) + ) + fine <- which(sapply( + sirius_results$mz, + function(mz) any(abs(original_mz - mz) <= delta) + )) + not_fine <- which(sapply( + sirius_results$mz, + function(mz) all(abs(original_mz - mz) > delta) + )) + catf( + paste( + "[KO] fragment %s eleminated because mz difference=%s is", + "greater than delta=%s\n" + ), + sirius_results[not_fine, ]$formula, + differences[not_fine], + delta + ) + sirius_results <- sirius_results[fine, ] + } + } + return(sirius_results) +} + +extract_sirius_ppm <- function(path) { + json <- file(path, "r") + suppressWarnings(json_lines <- readLines(json)) + close(json) + json_lines <- json_lines[ + grepl("\\s+\"(massDeviation|recalibratedMass)\" :", json_lines) + ] + json_lines <- gsub("^\\s+\"[^\"]+\" : \"?", "", json_lines) + ppms <- json_lines[seq(1, length(json_lines), 2)] + mz <- json_lines[seq(2, length(json_lines), 2)] + ppms <- as.numeric(gsub(" ppm .*", "", ppms)) + mz <- as.numeric(gsub(",$", "", mz)) + ordered <- order(mz) + return(list(ppm = ppms[ordered], recalibrated_mz = mz[ordered])) +} + +#' @title plot_pseudo_spectra +#' @param x +#' @param fid +#' @param sum_int +#' @param vmz +#' @param cor_abs_int +#' @param refcol +#' @param c_name +#' @description plot_pseudo_spectra +#' function to compute sum of intensities among scans for all +#' m/z kept (cor > r_threshold & minimum number of scans) +#' and plot pseudo spectra +#' x dataframe scan X fragments with scans number in the 1st column and +#' ions in next with intensities +#' fid file id when several a precursor has been detected in several files +plot_pseudo_spectra <- function( + x, + fid, + sum_int, + vmz, + cor_abs_int, + refcol, + meaned_mz, + processing_parameters +) { + ## du fait de la difference de nombre de colonne entre la dataframe qui + ## inclue les scans en 1ere col, mzRef se decale de 1 + refcol <- refcol - 1 + ## compute relative intensities max=100% + rel_int <- sum_int[-1] + rel_int <- rel_int / max(rel_int) + + if (processing_parameters$do_pdf) { + ## define max value on vertical axis (need to increase in order to plot the + ## label of fragments) + ymax <- max(rel_int) + 0.2 * max(rel_int) + + par(mfrow = c(2, 1)) + plot(vmz, rel_int, type = "h", ylim = c(0, ymax), + main = processing_parameters$c_name + ) + ## low correl coef. will be display in grey + cor_low <- which(round(cor_abs_int, 2) < processing_parameters$r_threshold) + + lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2)) + + if (length(cor_low) > 0) { + text( + vmz[cor_low], + rel_int[cor_low], + lbmzcor[cor_low], + cex = 0.5, + col = "grey", + srt = 90, + adj = 0 + ) + if (length(vmz) - length(cor_low) > 1) { + text( + vmz[-c(refcol, cor_low)], + rel_int[-c(refcol, cor_low)], + lbmzcor[-c(refcol, cor_low)], + cex = 0.6, + col = 1, + srt = 90, + adj = 0 + ) + } + } else { + if (length(vmz) > 1) { + text( + vmz[-c(refcol)], + rel_int[-c(refcol)], + lbmzcor[-c(refcol)], + cex = 0.6, + col = 1, + srt = 90, + adj = 0 + ) + } + } + + text( + vmz[refcol], + rel_int[refcol], + lbmzcor[refcol], + cex = 0.8, + col = 2, + srt = 90, + adj = 0 + ) + } + + ## prepare result file + cor_valid <- (round(cor_abs_int, 2) >= processing_parameters$r_threshold) + + do_sirius <- TRUE + verbose_catf("Checking sirius parameters...\n") + if (is.null(processing_parameters$ionization)) { + do_sirius <- FALSE + verbose_catf("[KO] No ionization passed in parameter.\n") + } else { + verbose_catf("[OK] Ionization=%s.\n", processing_parameters$ionization) + } + if (is.na(processing_parameters$elemcomposition)) { + do_sirius <- FALSE + verbose_catf("[KO] Elemental composition is NA.\n") + } else if (length(processing_parameters$elemcomposition) < 1) { + do_sirius <- FALSE + verbose_catf("[KO] No elemental composition is provided.\n") + } else if (processing_parameters$elemcomposition == "") { + do_sirius <- FALSE + verbose_catf("[KO] Elemental composition is an empty string.\n") + } else { + verbose_catf( + "[OK] Elemental composition=%s.\n", + processing_parameters$elemcomposition + ) + } + + cp_res_length <- length(vmz) + ppm <- rep(NA, cp_res_length) + formulas <- rep(NA, cp_res_length) + if (do_sirius) { + verbose_catf("Everything is ok, preparing for sirius.\n") + formulas <- get_formulas( + mzref = processing_parameters$mzref, + spectra = data.frame(mz = meaned_mz, intensities = sum_int[-1]), + processing_parameters = processing_parameters + ) + if (nrow(formulas) == 0) { + catf("No formula found.\n") + } else { + ppm <- formulas$ppm + formulas <- formulas$formula + catf( + "Found %s formula for %s fragments\n", + length(formulas[which(!(is.na(formulas)))]), + cp_res_length + ) + } + } else { + verbose_catf("Sirius cannot be run.\n") + } + cp_res <- data.frame( + rep(processing_parameters$c_name, cp_res_length), + rep(processing_parameters$inchikey, cp_res_length), + rep(processing_parameters$elemcomposition, cp_res_length), + formulas, + meaned_mz, + ppm, + rep(fid, cp_res_length), + cor_abs_int, + sum_int[-1], + rel_int, + cor_valid + ) + + colnames(cp_res) <- c( + "compoundName", + "inchikey", + "elemcomposition", + "fragment", + "fragment_mz", + "ppm", + "fileid", + "CorWithPrecursor", + "AbsoluteIntensity", + "relativeIntensity", + "corValid" + ) + return(cp_res) +} + +#' +#' @title extract_fragments +#' +#' @param precursors the precursor list from mspurity +#' @param fragments the fragments list from ms purity +# ' @param mzref +# ' @param rtref +# ' @param c_name +# ' @param inchikey +# ' @param elemcomposition +#' @param processing_parameters +#' @returns +#' +#' @description +#' function for extraction of fragments corresponding to precursors +#' detected by MSPurity +extract_fragments <- function( ## nolint cyclocomp_linter + precursors, + fragments, + processing_parameters +) { + ## filter precursor in the precursors file based on mz and rt in the + ## compound list + catf("processing %s\n", processing_parameters$c_name) + verbose_catf("===\n") + param <- processing_parameters + selected_precursors <- which( + (abs(precursors$precurMtchMZ - param$mzref) <= param$tolmz) + & (abs(precursors$precurMtchRT - param$rtref) <= param$tolrt) + ) + rm(param) + + verbose_catf( + "> %s precursors selected with mz=%s±%s and rt=%s±%s\n", + length(selected_precursors), + processing_parameters$mzref, + processing_parameters$tolmz, + processing_parameters$rtref, + processing_parameters$tolrt + ) + + ## check if there is the precursor in the file + + if (length(selected_precursors) < 1) { + cat("> non detected in precursor file\n") + show_end_processing() + return(NULL) + } + + precursors <- precursors[selected_precursors, ] + + ## check if fragments corresponding to precursor are found in several + ## files (collision energy) + ## this lead to a processing for each fileid + file_ids <- as.character(sort(unique(precursors$fileid))) + if (length(file_ids) > 1) { + catf("> several files detected for this compounds :\n") + } else if (length(file_ids) < 1 || nrow(precursors) < 1) { + return(data.frame()) + } + + res_comp <- data.frame() + for (curent_file_id in file_ids) { + curent_precursors <- precursors[precursors$fileid == curent_file_id, ] + selected_fragments <- fragments[ + fragments$grpid %in% as.character(curent_precursors$grpid) + & fragments$fileid == curent_file_id, + ] + filtered_fragments <- selected_fragments[ + selected_fragments$ra > processing_parameters$seuil_ra, + ] + if (nrow(filtered_fragments) != 0) { + res_comp_by_file <- process_file( + curent_file_id = curent_file_id, + precursor_mz = curent_precursors$mz, + filtered_fragments = filtered_fragments, + processing_parameters = processing_parameters + ) + if (!is.null(res_comp_by_file)) { + res_comp <- rbind(res_comp, res_comp_by_file) + } + } else { + catf("No fragment found for in fragment file\n") + } + } + return(unique(res_comp)) +} + +process_file <- function( + curent_file_id, + precursor_mz, + filtered_fragments, + processing_parameters +) { + mznominal <- round(x = filtered_fragments$mz, digits = 0) + meaned_mz <- round( + aggregate( + data.frame( + mz = filtered_fragments$mz, + mznominal = mznominal + ), + list(mznominal), + FUN = mean + )$mz, + digits = processing_parameters$mzdecimal + ) + filtered_fragments <- data.frame(filtered_fragments, mznominal) + + ## creation of cross table row=scan col=mz X=ra + + vmz <- as.character(sort(unique(filtered_fragments$mznominal))) + + ds_abs_int <- create_ds_abs_int(vmz, filtered_fragments) + + if (global_debug) { + print(ds_abs_int) + } + + ## elimination of mz with less than min_number_scan scans (user defined + ## parameter) + xmz <- rep(NA, ncol(ds_abs_int) - 1) + sum_int <- rep(NA, ncol(ds_abs_int)) + nbxmz <- 0 + nb_scan_check <- min(nrow(ds_abs_int), processing_parameters$min_number_scan) + + for (j in 2:ncol(ds_abs_int)) { + sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE) + if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) { + nbxmz <- nbxmz + 1 + xmz[nbxmz] <- j + } + } + + xmz <- xmz[-which(is.na(xmz))] + if (length(xmz) > 0) { + ds_abs_int <- ds_abs_int[, -c(xmz)] + sum_int <- sum_int[-c(xmz)] + ## liste des mz keeped decale de 1 avec ds_abs_int + vmz <- as.numeric(vmz[-c(xmz - 1)]) + meaned_mz <- meaned_mz[-c(xmz - 1)] + } + + ## mz of precursor in data precursor to check correlation with + mz_prec <- paste0( + "mz", + round(mean(precursor_mz), processing_parameters$mzdecimal) + ) + ## reference ion for correlation computing = precursor OR maximum + ## intensity ion in precursor is not present + refcol <- which(colnames(ds_abs_int) == mz_prec) + if (length(refcol) == 0) { + refcol <- which(sum_int == max(sum_int, na.rm = TRUE)) + } + + if (processing_parameters$do_pdf) { + start_pdf(processing_parameters, curent_file_id) + } + + ## Pearson correlations between absolute intensities computing + cor_abs_int <- rep(NA, length(vmz)) + + if (length(refcol) > 0) { + for (i in 2:length(ds_abs_int)) { + cor_abs_int[i - 1] <- stats::cor( + x = ds_abs_int[[refcol]], + y = ds_abs_int[[i]], + use = "pairwise.complete.obs", + method = "pearson" + ) + debug_catf( + "Correlation between %s and %s: %s\n", + paste(ds_abs_int[[refcol]], collapse = ";"), + paste(ds_abs_int[[i]], collapse = ";"), + paste(cor_abs_int[i - 1], collapse = ";") + ) + if (processing_parameters$do_pdf) { + pdf_plot_ds_abs_int( + processing_parameters$c_name, + ds_abs_int, + refcol, + i, + round(cor_abs_int[i - 1], 2) + ) + } + } + ## plot pseudo spectra + res_comp_by_file <- plot_pseudo_spectra( + x = ds_abs_int, + fid = curent_file_id, + sum_int = sum_int, + vmz = vmz, + cor_abs_int = cor_abs_int, + refcol = refcol, + meaned_mz = meaned_mz, + processing_parameters = processing_parameters + ) + catf( + "%s has been processed and %s fragments have been found.\n", + processing_parameters$c_name, + nrow(res_comp_by_file) + ) + } else { + res_comp_by_file <- NULL + cat(">> non detected in fragments file \n") + } + show_end_processing() + if (processing_parameters$do_pdf) { + end_pdf() + } + return(res_comp_by_file) +} + +create_ds_abs_int <- function(vmz, filtered_fragments) { + verbose_catf( + ">> fragments: %s\n", + paste(vmz, collapse = " ") + ) + ds_abs_int <- create_int_mz(vmz[1], filtered_fragments) + for (mz in vmz[-1]) { + int_mz <- create_int_mz(mz, filtered_fragments) + ds_abs_int <- merge( + x = ds_abs_int, + y = int_mz, + by.x = 1, + by.y = 1, + all.x = TRUE, + all.y = TRUE + ) + } + return(ds_abs_int) +} + +create_int_mz <- function(mz, filtered_fragments) { + ## absolute intensity + int_mz <- filtered_fragments[ + filtered_fragments$mznominal == mz, + c("acquisitionNum", "i") + ] + colnames(int_mz)[2] <- paste0("mz", mz) + ## average intensities of mass in duplicate scans + comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean) + return(comp_scans[, -1]) +} + +show_end_processing <- function() { + verbose_catf("==========\n") + cat("\n") +} + +start_pdf <- function(processing_parameters, curent_file_id) { + if (!dir.exists(processing_parameters$pdf_path)) { + dir.create(processing_parameters$pdf_path, recursive = TRUE) + } + pdf( + file = sprintf( + "%s/%s_processing_file%s.pdf", + processing_parameters$pdf_path, + processing_parameters$c_name, + curent_file_id + ), + width = 8, + height = 11 + ) + par(mfrow = c(3, 2)) +} + +pdf_plot_ds_abs_int <- function(c_name, ds_abs_int, refcol, i, r_coef) { + plot( + ds_abs_int[[refcol]], + ds_abs_int[[i]], + xlab = colnames(ds_abs_int)[refcol], + ylab = colnames(ds_abs_int)[i], + main = sprintf( + "%s corr coeff r=%s", c_name, r_coef + ) + ) +} +end_pdf <- function() { + dev.off() +} + +set_global <- function(var, value) { + assign(var, value, envir = globalenv()) +} + +set_debug <- function() { + set_global("global_debug", TRUE) +} + +unset_debug <- function() { + set_global("global_debug", FALSE) +} + +set_verbose <- function() { + set_global("global_verbose", TRUE) +} + +unset_verbose <- function() { + set_global("global_verbose", FALSE) +} + +verbose_catf <- function(...) { + if (global_verbose) { + cat(sprintf(...), sep = "") + } +} + + +debug_catf <- function(...) { + if (global_debug) { + cat(sprintf(...), sep = "") + } +} + +catf <- function(...) { + cat(sprintf(...), sep = "") +} + +create_parser <- function() { + parser <- optparse::OptionParser() + parser <- optparse::add_option( + parser, + c("-v", "--verbose"), + action = "store_true", + default = FALSE, + help = paste( + "[default %default]", + "Print extra output" + ) + ) + parser <- optparse::add_option( + parser, + c("-V", "--version"), + action = "store_true", + default = FALSE, + help = "Prints version and exits" + ) + parser <- optparse::add_option( + parser, + c("-d", "--debug"), + action = "store_true", + default = FALSE, + help = paste( + "[default %default]", + "Print debug outputs" + ) + ) + parser <- optparse::add_option( + parser, + c("-o", "--output"), + type = "character", + default = DEFAULT_OUTPUT_PATH, + action = "store", + help = "Path to the output file [default %default]" + ) + parser <- optparse::add_option( + parser, + c("-p", "--precursors"), + type = "character", + default = DEFAULT_PRECURSOR_PATH, + action = "store", + help = "Path to the precursors file [default %default]" + ) + parser <- optparse::add_option( + parser, + c("-f", "--fragments"), + type = "character", + default = DEFAULT_FRAGMENTS_PATH, + action = "store", + help = "Path to the fragments file [default %default]" + ) + parser <- optparse::add_option( + parser, + c("-c", "--compounds"), + type = "character", + default = DEFAULT_COMPOUNDS_PATH, + action = "store", + help = "Path to the compounds file [default %default]" + ) + parser <- optparse::add_option( + parser, + c("--tolmz"), + type = "numeric", + action = "store", + default = DEFAULT_TOLMZ, + metavar = "number", + help = paste( + "[default %default]", + "Tolerance for MZ (in Dalton) to match the standard in the compounds" + ) + ) + parser <- optparse::add_option( + parser, + c("--tolrt"), + type = "integer", + action = "store", + default = DEFAULT_TOLRT, + metavar = "number", + help = paste( + "[default %default]", + "RT (in seconds) to match the standard in the compounds" + ) + ) + parser <- optparse::add_option( + parser, + c("--seuil_ra"), + type = "numeric", + action = "store", + default = DEFAULT_SEUIL_RA, + metavar = "number", + help = paste( + "[default %default]", + "relative intensity threshold" + ), + ) + parser <- optparse::add_option( + parser, + c("--mzdecimal"), + type = "integer", + default = DEFAULT_MZDECIMAL, + action = "store", + help = paste( + "[default %default]", + "Number of decimal to write for MZ" + ), + metavar = "number" + ) + parser <- optparse::add_option( + parser, + c("--r_threshold"), + type = "integer", + default = DEFAULT_R_THRESHOLD, + action = "store", + help = paste( + "[default %default]", + "R-Pearson correlation threshold between precursor and fragment", + "absolute intensity" + ), + metavar = "number" + ) + parser <- optparse::add_option( + parser, + c("--min_number_scan"), + type = "numeric", + action = "store", + default = DEFAULT_MINNUMBERSCAN, + help = paste( + "[default %default]", + "Fragments are kept if there are found in a minimum number", + "of min_number_scan scans" + ), + metavar = "number" + ) + parser <- optparse::add_option( + parser, + c("--pdf_path"), + type = "character", + default = DEFAULT_PDF_PATH, + help = paste( + "[default %default]", + "PDF files output path" + ) + ) + parser <- optparse::add_option( + parser, + c("--ionization"), + type = "character", + action = "store", + default = "None", + help = paste( + "[default %default]", + "Which ionization to use for sirius" + ), + metavar = "character" + ) + parser <- optparse::add_option( + parser, + c("--fragment_match_delta"), + type = "numeric", + action = "store", + default = DEFAULT_FRAGMENTS_MATCH_DELTA, + help = paste( + "[default %default]", + "Fragment match delta" + ), + metavar = "numeric" + ) + parser <- optparse::add_option( + parser, + c("--fragment_match_delta_unit"), + type = "character", + action = "store", + default = DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT, + help = paste( + "[default %default]", + "Fragment match delta" + ), + metavar = "character" + ) + return(parser) +} + +stop_with_status <- function(msg, status) { + sink(stderr()) + message(sprintf("Error: %s", msg)) + message(sprintf("Error code: %s", status)) + sink(NULL) + base::quit(status = status) +} + +check_args_validity <- function(args) { ## nolint cyclocomp_linter + if (length(args$output) == 0 || nchar(args$output[1]) == 0) { + stop_with_status( + "Missing output parameters. Please set it with --output.", + MISSING_PARAMETER_ERROR + ) + } + if (length(args$precursors) == 0 || nchar(args$precursors[1]) == 0) { + stop_with_status( + "Missing precursors parameters. Please set it with --precursors.", + MISSING_PARAMETER_ERROR + ) + } + if (length(args$fragments) == 0 || nchar(args$fragments[1]) == 0) { + stop_with_status( + "Missing fragments parameters. Please set it with --fragments.", + MISSING_PARAMETER_ERROR + ) + } + if (length(args$compounds) == 0 || nchar(args$compounds[1]) == 0) { + stop_with_status( + "Missing compounds parameters. Please set it with --compounds.", + MISSING_PARAMETER_ERROR + ) + } + if (!file.exists(args$precursors)) { + stop_with_status( + sprintf( + "Precursors file %s does not exist or cannot be accessed.", + args$precursors + ), + MISSING_INPUT_FILE_ERROR + ) + } + if (!file.exists(args$fragments)) { + stop_with_status( + sprintf( + "Fragments file %s does not exist or cannot be accessed.", + args$fragments + ), + MISSING_INPUT_FILE_ERROR + ) + } + if (!file.exists(args$compounds)) { + stop_with_status( + sprintf( + "Compounds file %s does not exist or cannot be accessed.", + args$compounds + ), + MISSING_INPUT_FILE_ERROR + ) + } + if (in_galaxy_env()) { + check_galaxy_args_validity(args) + } +} + +in_galaxy_env <- function() { + sysvars <- Sys.getenv() + sysvarnames <- names(sysvars) + return( + "_GALAXY_JOB_HOME_DIR" %in% sysvarnames + || "_GALAXY_JOB_TMP_DIR" %in% sysvarnames + || "GALAXY_MEMORY_MB" %in% sysvarnames + || "GALAXY_MEMORY_MB_PER_SLOT" %in% sysvarnames + || "GALAXY_SLOTS" %in% sysvarnames + ) +} + +check_galaxy_args_validity <- function(args) { + if (!file.exists(args$output)) { + stop_with_status( + sprintf( + "Output file %s does not exist or cannot be accessed.", + args$output + ), + MISSING_INPUT_FILE_ERROR + ) + } +} + +get_csv_or_tsv <- function( + path, + sep_stack = c("\t", ",", ";"), + sep_names = c("tab", "comma", "semicolon"), + header = TRUE, + quote = "\"" +) { + sep <- determine_csv_or_tsv_sep( + path = path, + sep_stack = sep_stack, + header = header, + quote = quote + ) + verbose_catf( + "%s separator has been determined for %s.\n", + sep_names[sep_stack == sep], + path + ) + return(read.table( + file = path, + sep = sep, + header = header, + quote = quote + )) +} + +determine_csv_or_tsv_sep <- function( + path, + sep_stack = c("\t", ",", ";"), + header = TRUE, + quote = "\"" +) { + count <- -1 + best_sep <- sep_stack[1] + for (sep in sep_stack) { + tryCatch({ + table <- read.table( + file = path, + sep = sep, + header = header, + quote = quote, + nrows = 1 + ) + if (ncol(table) > count) { + count <- ncol(table) + best_sep <- sep + } + }) + } + return(best_sep) +} + +uniformize_columns <- function(df) { + cols <- colnames(df) + for (func in c(tolower)) { + cols <- func(cols) + } + colnames(df) <- cols + return(df) +} + +handle_galaxy_param <- function(args) { + for (param in names(args)) { + if (is.character(args[[param]])) { + args[[param]] <- gsub("__ob__", "[", args[[param]]) + args[[param]] <- gsub("__cb__", "]", args[[param]]) + } + } + return(args) +} + +zip_pdfs <- function(processing_parameters) { + if (processing_parameters$do_pdf) { + if (zip <- Sys.getenv("R_ZIPCMD", "zip") == "") { + catf("R could not fin the zip executable. Trying luck: zip = \"zip\"") + zip <- "zip" + } else { + catf("Found zip executable at %s .", zip) + } + utils::zip( + processing_parameters$pdf_zip_path, + processing_parameters$pdf_path, + zip = zip + ) + } +} + +main <- function(args) { + if (args$version) { + catf("%s\n", MS2SNOOP_VERSION) + base::quit(status = 0) + } + if (in_galaxy_env()) { + print(sessionInfo()) + cat("\n\n") + } + check_args_validity(args) + args <- handle_galaxy_param(args) + if (args$ionization == "None") { + args$ionization <- NULL + } + if (args$debug) { + set_debug() + } + if (args$verbose) { + set_verbose() + } + precursors <- get_csv_or_tsv(args$precursors) + fragments <- get_csv_or_tsv(args$fragments) + compounds <- get_csv_or_tsv(args$compounds) + + compounds <- uniformize_columns(compounds) + mandatory_columns <- c( + "compound_name", + "mz", + "rtsec", + "inchikey" + ) + presents <- mandatory_columns %in% colnames(compounds) + if (!all(presents)) { + stop_with_status( + sprintf( + "Some columns are missing: %s", + paste(mandatory_columns[which(!presents)], collapse = ", ") + ), + BAD_PARAMETER_VALUE_ERROR + ) + } + + res_all <- data.frame() + processing_parameters <- list( + min_number_scan = args$min_number_scan, + mzdecimal = args$mzdecimal, + r_threshold = args$r_threshold, + seuil_ra = args$seuil_ra, + tolmz = args$tolmz, + tolrt = args$tolrt, + ionization = args$ionization, + do_pdf = nchar(args$pdf_path) > 0, + pdf_zip_path = args$pdf_path, + pdf_path = tempdir(), + fragment_match_delta = args$fragment_match_delta, + fragment_match_delta_unit = args$fragment_match_delta_unit + ) + for (i in seq_len(nrow(compounds))) { + processing_parameters$mzref <- compounds[["mz"]][i] + processing_parameters$rtref <- compounds[["rtsec"]][i] + processing_parameters$c_name <- compounds[["compound_name"]][i] + processing_parameters$inchikey <- compounds[["inchikey"]][i] + processing_parameters$elemcomposition <- compounds[["elemcomposition"]][i] + res_cor <- extract_fragments( + precursors = precursors, + fragments = fragments, + processing_parameters = processing_parameters + ) + if (!is.null(res_cor)) { + res_all <- rbind(res_all, res_cor) + } + } + + if (nrow(res_all) == 0) { + stop_with_status("No result at all!", NO_ANY_RESULT_ERROR) + } + + write.table( + x = res_all, + file = args$output, + sep = "\t", + row.names = FALSE + ) + zip_pdfs(processing_parameters) + unlink(processing_parameters$pdf_path, recursive = TRUE) +} + +global_debug <- FALSE +global_verbose <- FALSE +args <- optparse::parse_args(create_parser()) +main(args) + +warnings()