diff MS2snoop.R @ 0:67733206be53 draft

" master branch Updating"
author lain
date Thu, 14 Apr 2022 10:23:15 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MS2snoop.R	Thu Apr 14 10:23:15 2022 +0000
@@ -0,0 +1,551 @@
+#'
+#' read and process mspurity W4M files
+#' create a summary of fragment for each precursor and a graphics of peseudo
+#' spectra + correlation on which checking of fragment is based on
+#' V3 try to identify and process multiple files for 1 precursor which may
+#' occur if different collision energy are used
+#' V4 elimination of correlation = NA. Correlation is done with precursor, if
+#' precursor is not present correlation with most intense peak
+#' author: Jean-Francois Martin
+#' V5 is versionned, lintR-compliant, packaged, unit-tested, documented and
+#' tested against data from other labs.
+#' new maintainer: Lain Pavot - lain.pavot@inrae.fr
+#'
+#' @import optparse
+#'
+NULL
+
+
+assign("DEFAULT_PRECURSOR_PATH", "peaklist_precursors.tsv")
+assign("DEFAULT_FRAGMENTS_PATH", "peaklist_fragments.tsv")
+assign("DEFAULT_COMPOUNDS_PATH", "compounds_pos.txt")
+assign("DEFAULT_OUTPUT_PATH", "compound_fragments_result.txt")
+assign("DEFAULT_TOLMZ", 0.01)
+assign("DEFAULT_TOLRT", 20)
+assign("DEFAULT_MZDECIMAL", 0)
+assign("DEFAULT_R_THRESHOLD", 0.85)
+assign("DEFAULT_MINNUMBERSCAN", 8)
+assign("DEFAULT_SEUIL_RA", 0.5)
+lockBinding("DEFAULT_PRECURSOR_PATH", globalenv())
+lockBinding("DEFAULT_FRAGMENTS_PATH", globalenv())
+lockBinding("DEFAULT_COMPOUNDS_PATH", globalenv())
+lockBinding("DEFAULT_OUTPUT_PATH", globalenv())
+lockBinding("DEFAULT_TOLMZ", globalenv())
+lockBinding("DEFAULT_TOLRT", globalenv())
+lockBinding("DEFAULT_MZDECIMAL", globalenv())
+lockBinding("DEFAULT_R_THRESHOLD", globalenv())
+lockBinding("DEFAULT_MINNUMBERSCAN", globalenv())
+lockBinding("DEFAULT_SEUIL_RA", globalenv())
+
+assign("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", 0.85)
+assign("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", 0.1)
+assign("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", 0.01)
+assign("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", 60)
+lockBinding("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", globalenv())
+lockBinding("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", globalenv())
+lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", globalenv())
+lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", globalenv())
+
+
+debug <- FALSE
+
+
+########################################################################
+
+#' @title plot_pseudo_spectra
+#' @param x
+#' @param r_threshold
+#' @param fid
+#' @param sum_int
+#' @param vmz
+#' @param cor_abs_int
+#' @param refcol
+#' @param c_name
+#' @description plot_pseudo_spectra
+#' function to compute sum of intensities among scans for all
+#' m/z kept (cor > r_threshold & minimum number of scans)
+#' and plot pseudo spectra
+#' x dataframe scan X fragments with scans number in the 1st column and
+#' ions in next with intensities
+#' fid file id when several a precursor has been detected in several files
+plot_pseudo_spectra <- function(
+  x,
+  r_threshold,
+  fid,
+  sum_int,
+  vmz,
+  cor_abs_int,
+  refcol,
+  c_name
+) {
+  ## du fait de la difference de nombre de colonne entre la dataframe qui
+  ## inclue les scans en 1ere col, mzRef se decale de 1
+  refcol <- refcol - 1
+  ## compute relative intensities max=100%
+  rel_int <- sum_int[-1]
+  rel_int <- rel_int / max(rel_int)
+
+  ## define max value on vertical axis (need to increase in order to plot the
+  ## label of fragments)
+  ymax <- max(rel_int) + 0.2 * max(rel_int)
+
+  par(mfrow = c(2, 1))
+  plot(vmz, rel_int, type = "h", ylim = c(0, ymax), main = c_name)
+  ## low correl coef. will be display in grey
+  cor_low <- which(round(cor_abs_int, 2) < r_threshold)
+
+  lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2))
+
+  if (length(cor_low) > 0) {
+    text(
+      vmz[cor_low],
+      rel_int[cor_low],
+      lbmzcor[cor_low],
+      cex = 0.5,
+      col = "grey",
+      srt = 90,
+      adj = 0
+    )
+    if (length(vmz) - length(cor_low) > 1) {
+      text(
+        vmz[-c(refcol, cor_low)],
+        rel_int[-c(refcol, cor_low)],
+        lbmzcor[-c(refcol, cor_low)],
+        cex = 0.6,
+        col = 1,
+        srt = 90,
+        adj = 0
+      )
+    }
+  } else {
+    if (length(vmz) > 1) {
+      text(
+        vmz[-c(refcol)],
+        rel_int[-c(refcol)],
+        lbmzcor[-c(refcol)],
+        cex = 0.6,
+        col = 1,
+        srt = 90,
+        adj = 0
+      )
+    }
+  }
+
+  text(
+    vmz[refcol],
+    rel_int[refcol],
+    lbmzcor[refcol],
+    cex = 0.8,
+    col = 2,
+    srt = 90,
+    adj = 0
+  )
+
+  ## prepare result file
+  corValid <- (round(cor_abs_int, 2) >= r_threshold) ##nolint object_name_linter
+  cp_res <- data.frame(
+    rep(c_name, length(vmz)),
+    rep(fid, length(vmz)),
+    vmz,
+    cor_abs_int,
+    sum_int[-1],
+    rel_int,
+    corValid
+  )
+
+  colnames(cp_res) <- c(
+    "compoundName",
+    "fileid",
+    "fragments_mz",
+    "CorWithPrecursor",
+    "AbsoluteIntensity",
+    "relativeIntensity",
+    "corValid"
+  )
+  return(cp_res)
+
+}
+
+#'
+#' @title extract_fragments
+#'
+#' @param precursors the precursor list from mspurity
+#' @param fragments the fragments list from ms purity
+#' @param mzref
+#' @param rtref
+#' @param c_name
+#' @param r_threshold default = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD
+#' @param seuil_ra default = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA
+#' @param tolmz default = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ
+#' @param tolrt default = DEFAULT_EXTRACT_FRAGMENTS_TOLRT
+#' @returns
+#'
+#' @description
+#' function for extraction of fragments corresponding to precursors
+#' detected by MSPurity
+extract_fragments <- function( ## nolint cyclocomp_linter
+  precursors,
+  fragments,
+  mzref,
+  rtref,
+  c_name,
+  min_number_scan,
+  mzdecimal,
+  r_threshold=DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD,
+  seuil_ra=DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA,
+  tolmz=DEFAULT_EXTRACT_FRAGMENTS_TOLMZ,
+  tolrt=DEFAULT_EXTRACT_FRAGMENTS_TOLRT
+) {
+  ## filter precursor in the precursors file based on mz and rt in the
+  ## compound list
+  cat("processing ", c_name, "\n")
+  selected_precursors <- which(
+    (abs(precursors$precurMtchMZ - mzref) <= tolmz)
+    & (abs(precursors$precurMtchRT - rtref) <= tolrt)
+  )
+
+  ## check if there is the precursor in the file
+  if (length(selected_precursors) > 0) {
+
+    sprecini <- precursors[selected_precursors, ]
+
+    ## check if fragments corresponding to precursor are found in several
+    ## files (collision energy)
+    ## this lead to a processing for each fileid
+    mf <- levels(as.factor(sprecini$fileid))
+    if (length(mf) > 1) {
+      cat(" several files detected for this compounds :\n")
+    }
+
+    for (f in seq_along(mf)) {
+
+      sprec <- sprecini[sprecini$fileid == mf[f], ]
+
+      ## selection of fragment in the fragments file with the grpid common in
+      ## both fragments and precursors
+      selfrgt <- levels(as.factor(sprec$grpid))
+      sfrgt <- fragments[
+        fragments$grpid %in% selfrgt
+        & fragments$fileid == mf[f],
+      ]
+
+      ## filter fragments on relative intensity seuil_ra = user defined
+      ## parameter (MSpurity flags could be used here)
+      sfrgtfil <- sfrgt[sfrgt$ra > seuil_ra, ]
+
+      mznominal <- round(x = sfrgtfil$mz, mzdecimal)
+      sfrgtfil <- data.frame(sfrgtfil, mznominal)
+
+      ## creation of cross table row=scan col=mz X=ra
+      vmz <- levels(as.factor(sfrgtfil$mznominal))
+
+      cat(" fragments :", vmz)
+
+      ## mz of precursor in data precursor to check correlation with
+      mz_prec <- paste0("mz", round(mean(sprec$mz), mzdecimal))
+
+      for (m in seq_along(vmz)) {
+
+        ## absolute intensity
+        cln <- c(
+          which(colnames(sfrgtfil) == "acquisitionNum"),
+          which(colnames(sfrgtfil) == "i")
+        )
+        int_mz <- sfrgtfil[sfrgtfil$mznominal == vmz[m], cln]
+        colnames(int_mz)[2] <- paste0("mz", vmz[m])
+
+        ## average intensities of mass in duplicate scans
+        comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean)
+        int_mz <- comp_scans[, -1]
+
+        if (m == 1) {
+          ds_abs_int <- int_mz
+        } else {
+          ds_abs_int <- merge(
+            x = ds_abs_int,
+            y = int_mz,
+            by.x = 1,
+            by.y = 1,
+            all.x = TRUE,
+            all.y = TRUE
+          )
+        }
+      }
+      if (debug) {
+        write.table(
+          x = ds_abs_int,
+          file = paste0(c_name, "ds_abs_int.txt"),
+          row.names = FALSE,
+          sep = "\t"
+        )
+      }
+
+      ## elimination of mz with less than min_number_scan scans (user defined
+      ## parameter)
+      xmz <- rep(NA, ncol(ds_abs_int) - 1)
+      sum_int <- rep(NA, ncol(ds_abs_int))
+      nbxmz <- 0
+      nb_scan_check <- min(nrow(ds_abs_int), min_number_scan)
+
+      for (j in 2:ncol(ds_abs_int)) {
+        sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE)
+        if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) {
+          nbxmz <- nbxmz + 1
+          xmz[nbxmz] <- j
+        }
+      }
+
+      xmz <- xmz[-which(is.na(xmz))]
+      if (length(xmz) > 0) {
+        ds_abs_int <- ds_abs_int[, -c(xmz)]
+        sum_int <- sum_int[-c(xmz)]
+        ## liste des mz keeped decale de 1 avec ds_abs_int
+        vmz <- as.numeric(vmz[-c(xmz - 1)])
+      }
+
+      ## reference ion for correlation computing = precursor OR maximum
+      ## intensity ion in precursor is not present
+      refcol <- which(colnames(ds_abs_int) == mz_prec)
+      if (length(refcol) == 0) {
+        refcol <- which(sum_int == max(sum_int, na.rm = TRUE))
+      }
+      pdf(
+        file = sprintf("%s_processing_file%s.pdf", c_name, mf[f]),
+        width = 8,
+        height = 11
+      )
+      par(mfrow = c(3, 2))
+
+      ## Pearson correlations between absolute intensities computing
+      cor_abs_int <- rep(NA, length(vmz))
+
+      if (length(refcol) > 0) {
+        for (i in 2:length(ds_abs_int)) {
+          cor_abs_int[i - 1] <- cor(
+            x = ds_abs_int[[refcol]],
+            y = ds_abs_int[[i]],
+            use = "pairwise.complete.obs",
+            method = "pearson"
+          )
+          plot(
+            ds_abs_int[[refcol]],
+            ds_abs_int[[i]],
+             xlab = colnames(ds_abs_int)[refcol],
+             ylab = colnames(ds_abs_int)[i],
+             main = sprintf(
+              "%s corr coeff r=%s", c_name, round(cor_abs_int[i - 1], 2)
+            )
+          )
+        }
+        ## plot pseudo spectra
+        res_comp_by_file <- plot_pseudo_spectra(
+          x = ds_abs_int,
+          r_threshold = r_threshold,
+          fid = mf[f],
+          sum_int = sum_int,
+          vmz = vmz,
+          cor_abs_int = cor_abs_int,
+          refcol = refcol,
+          c_name = c_name
+        )
+        if (f == 1) {
+          res_comp <- res_comp_by_file
+        }
+      } else {
+        res_comp_by_file <- NULL
+        cat(" non detected in fragments file \n")
+      }
+      if (!is.null(res_comp_by_file)) {
+        res_comp <- rbind(res_comp, res_comp_by_file)
+      }
+      cat("\n")
+      dev.off()
+    }
+  } else {
+    res_comp <- NULL
+    cat(" non detected in precursor file \n")
+  }
+  return(res_comp)
+}
+
+
+create_parser <- function() {
+  parser <- optparse::OptionParser()
+  parser <- optparse::add_option(
+    parser,
+    c("-v", "--verbose"),
+    action = "store_true",
+    default = FALSE,
+    help = "Print extra output [default %default]"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("-o", "--output"),
+    type = "character",
+    default = DEFAULT_OUTPUT_PATH,
+    action = "store",
+    help = "Path to the output file [default %default]"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("-p", "--precursors"),
+    type = "character",
+    default = DEFAULT_PRECURSOR_PATH,
+    action = "store",
+    help = "Path to the precursors file [default %default]"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("-f", "--fragments"),
+    type = "character",
+    default = DEFAULT_FRAGMENTS_PATH,
+    action = "store",
+    help = "Path to the fragments file [default %default]"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("-c", "--compounds"),
+    type = "character",
+    default = DEFAULT_COMPOUNDS_PATH,
+    action = "store",
+    help = "Path to the compounds file [default %default]"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("--tolmz"),
+    type = "numeric",
+    action = "store",
+    default = DEFAULT_TOLMZ,
+    metavar = "number"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("--tolrt"),
+    type = "integer",
+    action = "store",
+    default = DEFAULT_TOLRT,
+    metavar = "number"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("--seuil_ra"),
+    type = "numeric",
+    action = "store",
+    help = "relative intensity threshold",
+    default = DEFAULT_SEUIL_RA,
+    metavar = "number"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("--mzdecimal"),
+    type = "integer",
+    default = DEFAULT_MZDECIMAL,
+    action = "store",
+    help = "nb decimal for mz",
+    metavar = "number"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("--r_threshold"),
+    type = "integer",
+    default = DEFAULT_R_THRESHOLD,
+    action = "store",
+    help = paste0(
+      "r pearson correlation threshold between precursor and fragment ",
+      "absolute intensity"
+    ),
+    metavar = "number"
+  )
+  parser <- optparse::add_option(
+    parser,
+    c("--min_number_scan"),
+    type = "numeric",
+    action = "store",
+    default = DEFAULT_MINNUMBERSCAN,
+    help = paste0(
+      "fragments are kept if there are found in a minimum number ",
+      "of scans"
+    ),
+    metavar = "number"
+  )
+  return(parser)
+}
+
+main <- function(args) {
+  ## FOLDER AND FILES
+  ## MSpurity precursors file
+  precursors <- read.table(
+    file = args$precursors,
+    header = TRUE,
+    sep = "\t",
+    quote = "\""
+  )
+  ## MSpurity fragments file
+  fragments <- read.table(
+    file = args$fragments,
+    header = TRUE,
+    sep = "\t",
+    quote = "\""
+  )
+  ## list of compounds : col1=Name of molecule, col2=m/z, col3=retention time
+  compounds <- read.table(
+    file = args$compounds,
+    sep = "\t",
+    quote = "\"",
+    header = TRUE
+  )
+  ## PARAMETERS
+  ## tolerance for mz(dalton) rt(seconds) to match the standard in the compounds
+  ## list with the precursor MSpurity file
+  tolmz <- args$tolmz
+  tolrt <- args$tolrt
+
+  ##  relative intensity threshold
+  seuil_ra <- args$seuil_ra
+  ## nb decimal for mz
+  mzdecimal <- args$mzdecimal
+  ## r pearson correlation threshold between precursor and
+  # #fragment absolute intensity
+  r_threshold <- args$r_threshold
+  ## fragments are kept if there are found in a minimum number of scans
+  min_number_scan <- args$min_number_scan
+
+  for (i in seq_len(nrow(compounds))) {
+    ## loop execution for all compounds in the compounds file
+    res_cor <- NULL
+    res_cor <- extract_fragments(
+      precursors = precursors,
+      fragments = fragments,
+      mzref = compounds[[2]][i],
+      rtref = compounds[[3]][i] * 60,
+      c_name = compounds[[1]][i],
+      min_number_scan = min_number_scan,
+      mzdecimal = mzdecimal,
+      r_threshold = r_threshold,
+      seuil_ra = seuil_ra,
+      tolmz = tolmz,
+      tolrt = tolrt
+    )
+    if (i == 1 & !is.null(res_cor)) {
+      res_all <- res_cor
+    } else if (!is.null(res_cor)) {
+      res_all <- rbind(res_all, res_cor)
+    }
+  }
+
+  if (is.null(res_all)) {
+    stop("No result at all!")
+  }
+  write.table(
+    x = res_all,
+    file = args$output,
+    sep = "\t",
+    row.names = FALSE
+  )
+}
+
+args <- optparse::parse_args(create_parser())
+sessionInfo()
+main(args)
+
+warnings()