comparison MS2snoop.R @ 0:67733206be53 draft

" master branch Updating"
author lain
date Thu, 14 Apr 2022 10:23:15 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:67733206be53
1 #'
2 #' read and process mspurity W4M files
3 #' create a summary of fragment for each precursor and a graphics of peseudo
4 #' spectra + correlation on which checking of fragment is based on
5 #' V3 try to identify and process multiple files for 1 precursor which may
6 #' occur if different collision energy are used
7 #' V4 elimination of correlation = NA. Correlation is done with precursor, if
8 #' precursor is not present correlation with most intense peak
9 #' author: Jean-Francois Martin
10 #' V5 is versionned, lintR-compliant, packaged, unit-tested, documented and
11 #' tested against data from other labs.
12 #' new maintainer: Lain Pavot - lain.pavot@inrae.fr
13 #'
14 #' @import optparse
15 #'
16 NULL
17
18
19 assign("DEFAULT_PRECURSOR_PATH", "peaklist_precursors.tsv")
20 assign("DEFAULT_FRAGMENTS_PATH", "peaklist_fragments.tsv")
21 assign("DEFAULT_COMPOUNDS_PATH", "compounds_pos.txt")
22 assign("DEFAULT_OUTPUT_PATH", "compound_fragments_result.txt")
23 assign("DEFAULT_TOLMZ", 0.01)
24 assign("DEFAULT_TOLRT", 20)
25 assign("DEFAULT_MZDECIMAL", 0)
26 assign("DEFAULT_R_THRESHOLD", 0.85)
27 assign("DEFAULT_MINNUMBERSCAN", 8)
28 assign("DEFAULT_SEUIL_RA", 0.5)
29 lockBinding("DEFAULT_PRECURSOR_PATH", globalenv())
30 lockBinding("DEFAULT_FRAGMENTS_PATH", globalenv())
31 lockBinding("DEFAULT_COMPOUNDS_PATH", globalenv())
32 lockBinding("DEFAULT_OUTPUT_PATH", globalenv())
33 lockBinding("DEFAULT_TOLMZ", globalenv())
34 lockBinding("DEFAULT_TOLRT", globalenv())
35 lockBinding("DEFAULT_MZDECIMAL", globalenv())
36 lockBinding("DEFAULT_R_THRESHOLD", globalenv())
37 lockBinding("DEFAULT_MINNUMBERSCAN", globalenv())
38 lockBinding("DEFAULT_SEUIL_RA", globalenv())
39
40 assign("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", 0.85)
41 assign("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", 0.1)
42 assign("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", 0.01)
43 assign("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", 60)
44 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", globalenv())
45 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", globalenv())
46 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", globalenv())
47 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", globalenv())
48
49
50 debug <- FALSE
51
52
53 ########################################################################
54
55 #' @title plot_pseudo_spectra
56 #' @param x
57 #' @param r_threshold
58 #' @param fid
59 #' @param sum_int
60 #' @param vmz
61 #' @param cor_abs_int
62 #' @param refcol
63 #' @param c_name
64 #' @description plot_pseudo_spectra
65 #' function to compute sum of intensities among scans for all
66 #' m/z kept (cor > r_threshold & minimum number of scans)
67 #' and plot pseudo spectra
68 #' x dataframe scan X fragments with scans number in the 1st column and
69 #' ions in next with intensities
70 #' fid file id when several a precursor has been detected in several files
71 plot_pseudo_spectra <- function(
72 x,
73 r_threshold,
74 fid,
75 sum_int,
76 vmz,
77 cor_abs_int,
78 refcol,
79 c_name
80 ) {
81 ## du fait de la difference de nombre de colonne entre la dataframe qui
82 ## inclue les scans en 1ere col, mzRef se decale de 1
83 refcol <- refcol - 1
84 ## compute relative intensities max=100%
85 rel_int <- sum_int[-1]
86 rel_int <- rel_int / max(rel_int)
87
88 ## define max value on vertical axis (need to increase in order to plot the
89 ## label of fragments)
90 ymax <- max(rel_int) + 0.2 * max(rel_int)
91
92 par(mfrow = c(2, 1))
93 plot(vmz, rel_int, type = "h", ylim = c(0, ymax), main = c_name)
94 ## low correl coef. will be display in grey
95 cor_low <- which(round(cor_abs_int, 2) < r_threshold)
96
97 lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2))
98
99 if (length(cor_low) > 0) {
100 text(
101 vmz[cor_low],
102 rel_int[cor_low],
103 lbmzcor[cor_low],
104 cex = 0.5,
105 col = "grey",
106 srt = 90,
107 adj = 0
108 )
109 if (length(vmz) - length(cor_low) > 1) {
110 text(
111 vmz[-c(refcol, cor_low)],
112 rel_int[-c(refcol, cor_low)],
113 lbmzcor[-c(refcol, cor_low)],
114 cex = 0.6,
115 col = 1,
116 srt = 90,
117 adj = 0
118 )
119 }
120 } else {
121 if (length(vmz) > 1) {
122 text(
123 vmz[-c(refcol)],
124 rel_int[-c(refcol)],
125 lbmzcor[-c(refcol)],
126 cex = 0.6,
127 col = 1,
128 srt = 90,
129 adj = 0
130 )
131 }
132 }
133
134 text(
135 vmz[refcol],
136 rel_int[refcol],
137 lbmzcor[refcol],
138 cex = 0.8,
139 col = 2,
140 srt = 90,
141 adj = 0
142 )
143
144 ## prepare result file
145 corValid <- (round(cor_abs_int, 2) >= r_threshold) ##nolint object_name_linter
146 cp_res <- data.frame(
147 rep(c_name, length(vmz)),
148 rep(fid, length(vmz)),
149 vmz,
150 cor_abs_int,
151 sum_int[-1],
152 rel_int,
153 corValid
154 )
155
156 colnames(cp_res) <- c(
157 "compoundName",
158 "fileid",
159 "fragments_mz",
160 "CorWithPrecursor",
161 "AbsoluteIntensity",
162 "relativeIntensity",
163 "corValid"
164 )
165 return(cp_res)
166
167 }
168
169 #'
170 #' @title extract_fragments
171 #'
172 #' @param precursors the precursor list from mspurity
173 #' @param fragments the fragments list from ms purity
174 #' @param mzref
175 #' @param rtref
176 #' @param c_name
177 #' @param r_threshold default = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD
178 #' @param seuil_ra default = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA
179 #' @param tolmz default = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ
180 #' @param tolrt default = DEFAULT_EXTRACT_FRAGMENTS_TOLRT
181 #' @returns
182 #'
183 #' @description
184 #' function for extraction of fragments corresponding to precursors
185 #' detected by MSPurity
186 extract_fragments <- function( ## nolint cyclocomp_linter
187 precursors,
188 fragments,
189 mzref,
190 rtref,
191 c_name,
192 min_number_scan,
193 mzdecimal,
194 r_threshold=DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD,
195 seuil_ra=DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA,
196 tolmz=DEFAULT_EXTRACT_FRAGMENTS_TOLMZ,
197 tolrt=DEFAULT_EXTRACT_FRAGMENTS_TOLRT
198 ) {
199 ## filter precursor in the precursors file based on mz and rt in the
200 ## compound list
201 cat("processing ", c_name, "\n")
202 selected_precursors <- which(
203 (abs(precursors$precurMtchMZ - mzref) <= tolmz)
204 & (abs(precursors$precurMtchRT - rtref) <= tolrt)
205 )
206
207 ## check if there is the precursor in the file
208 if (length(selected_precursors) > 0) {
209
210 sprecini <- precursors[selected_precursors, ]
211
212 ## check if fragments corresponding to precursor are found in several
213 ## files (collision energy)
214 ## this lead to a processing for each fileid
215 mf <- levels(as.factor(sprecini$fileid))
216 if (length(mf) > 1) {
217 cat(" several files detected for this compounds :\n")
218 }
219
220 for (f in seq_along(mf)) {
221
222 sprec <- sprecini[sprecini$fileid == mf[f], ]
223
224 ## selection of fragment in the fragments file with the grpid common in
225 ## both fragments and precursors
226 selfrgt <- levels(as.factor(sprec$grpid))
227 sfrgt <- fragments[
228 fragments$grpid %in% selfrgt
229 & fragments$fileid == mf[f],
230 ]
231
232 ## filter fragments on relative intensity seuil_ra = user defined
233 ## parameter (MSpurity flags could be used here)
234 sfrgtfil <- sfrgt[sfrgt$ra > seuil_ra, ]
235
236 mznominal <- round(x = sfrgtfil$mz, mzdecimal)
237 sfrgtfil <- data.frame(sfrgtfil, mznominal)
238
239 ## creation of cross table row=scan col=mz X=ra
240 vmz <- levels(as.factor(sfrgtfil$mznominal))
241
242 cat(" fragments :", vmz)
243
244 ## mz of precursor in data precursor to check correlation with
245 mz_prec <- paste0("mz", round(mean(sprec$mz), mzdecimal))
246
247 for (m in seq_along(vmz)) {
248
249 ## absolute intensity
250 cln <- c(
251 which(colnames(sfrgtfil) == "acquisitionNum"),
252 which(colnames(sfrgtfil) == "i")
253 )
254 int_mz <- sfrgtfil[sfrgtfil$mznominal == vmz[m], cln]
255 colnames(int_mz)[2] <- paste0("mz", vmz[m])
256
257 ## average intensities of mass in duplicate scans
258 comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean)
259 int_mz <- comp_scans[, -1]
260
261 if (m == 1) {
262 ds_abs_int <- int_mz
263 } else {
264 ds_abs_int <- merge(
265 x = ds_abs_int,
266 y = int_mz,
267 by.x = 1,
268 by.y = 1,
269 all.x = TRUE,
270 all.y = TRUE
271 )
272 }
273 }
274 if (debug) {
275 write.table(
276 x = ds_abs_int,
277 file = paste0(c_name, "ds_abs_int.txt"),
278 row.names = FALSE,
279 sep = "\t"
280 )
281 }
282
283 ## elimination of mz with less than min_number_scan scans (user defined
284 ## parameter)
285 xmz <- rep(NA, ncol(ds_abs_int) - 1)
286 sum_int <- rep(NA, ncol(ds_abs_int))
287 nbxmz <- 0
288 nb_scan_check <- min(nrow(ds_abs_int), min_number_scan)
289
290 for (j in 2:ncol(ds_abs_int)) {
291 sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE)
292 if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) {
293 nbxmz <- nbxmz + 1
294 xmz[nbxmz] <- j
295 }
296 }
297
298 xmz <- xmz[-which(is.na(xmz))]
299 if (length(xmz) > 0) {
300 ds_abs_int <- ds_abs_int[, -c(xmz)]
301 sum_int <- sum_int[-c(xmz)]
302 ## liste des mz keeped decale de 1 avec ds_abs_int
303 vmz <- as.numeric(vmz[-c(xmz - 1)])
304 }
305
306 ## reference ion for correlation computing = precursor OR maximum
307 ## intensity ion in precursor is not present
308 refcol <- which(colnames(ds_abs_int) == mz_prec)
309 if (length(refcol) == 0) {
310 refcol <- which(sum_int == max(sum_int, na.rm = TRUE))
311 }
312 pdf(
313 file = sprintf("%s_processing_file%s.pdf", c_name, mf[f]),
314 width = 8,
315 height = 11
316 )
317 par(mfrow = c(3, 2))
318
319 ## Pearson correlations between absolute intensities computing
320 cor_abs_int <- rep(NA, length(vmz))
321
322 if (length(refcol) > 0) {
323 for (i in 2:length(ds_abs_int)) {
324 cor_abs_int[i - 1] <- cor(
325 x = ds_abs_int[[refcol]],
326 y = ds_abs_int[[i]],
327 use = "pairwise.complete.obs",
328 method = "pearson"
329 )
330 plot(
331 ds_abs_int[[refcol]],
332 ds_abs_int[[i]],
333 xlab = colnames(ds_abs_int)[refcol],
334 ylab = colnames(ds_abs_int)[i],
335 main = sprintf(
336 "%s corr coeff r=%s", c_name, round(cor_abs_int[i - 1], 2)
337 )
338 )
339 }
340 ## plot pseudo spectra
341 res_comp_by_file <- plot_pseudo_spectra(
342 x = ds_abs_int,
343 r_threshold = r_threshold,
344 fid = mf[f],
345 sum_int = sum_int,
346 vmz = vmz,
347 cor_abs_int = cor_abs_int,
348 refcol = refcol,
349 c_name = c_name
350 )
351 if (f == 1) {
352 res_comp <- res_comp_by_file
353 }
354 } else {
355 res_comp_by_file <- NULL
356 cat(" non detected in fragments file \n")
357 }
358 if (!is.null(res_comp_by_file)) {
359 res_comp <- rbind(res_comp, res_comp_by_file)
360 }
361 cat("\n")
362 dev.off()
363 }
364 } else {
365 res_comp <- NULL
366 cat(" non detected in precursor file \n")
367 }
368 return(res_comp)
369 }
370
371
372 create_parser <- function() {
373 parser <- optparse::OptionParser()
374 parser <- optparse::add_option(
375 parser,
376 c("-v", "--verbose"),
377 action = "store_true",
378 default = FALSE,
379 help = "Print extra output [default %default]"
380 )
381 parser <- optparse::add_option(
382 parser,
383 c("-o", "--output"),
384 type = "character",
385 default = DEFAULT_OUTPUT_PATH,
386 action = "store",
387 help = "Path to the output file [default %default]"
388 )
389 parser <- optparse::add_option(
390 parser,
391 c("-p", "--precursors"),
392 type = "character",
393 default = DEFAULT_PRECURSOR_PATH,
394 action = "store",
395 help = "Path to the precursors file [default %default]"
396 )
397 parser <- optparse::add_option(
398 parser,
399 c("-f", "--fragments"),
400 type = "character",
401 default = DEFAULT_FRAGMENTS_PATH,
402 action = "store",
403 help = "Path to the fragments file [default %default]"
404 )
405 parser <- optparse::add_option(
406 parser,
407 c("-c", "--compounds"),
408 type = "character",
409 default = DEFAULT_COMPOUNDS_PATH,
410 action = "store",
411 help = "Path to the compounds file [default %default]"
412 )
413 parser <- optparse::add_option(
414 parser,
415 c("--tolmz"),
416 type = "numeric",
417 action = "store",
418 default = DEFAULT_TOLMZ,
419 metavar = "number"
420 )
421 parser <- optparse::add_option(
422 parser,
423 c("--tolrt"),
424 type = "integer",
425 action = "store",
426 default = DEFAULT_TOLRT,
427 metavar = "number"
428 )
429 parser <- optparse::add_option(
430 parser,
431 c("--seuil_ra"),
432 type = "numeric",
433 action = "store",
434 help = "relative intensity threshold",
435 default = DEFAULT_SEUIL_RA,
436 metavar = "number"
437 )
438 parser <- optparse::add_option(
439 parser,
440 c("--mzdecimal"),
441 type = "integer",
442 default = DEFAULT_MZDECIMAL,
443 action = "store",
444 help = "nb decimal for mz",
445 metavar = "number"
446 )
447 parser <- optparse::add_option(
448 parser,
449 c("--r_threshold"),
450 type = "integer",
451 default = DEFAULT_R_THRESHOLD,
452 action = "store",
453 help = paste0(
454 "r pearson correlation threshold between precursor and fragment ",
455 "absolute intensity"
456 ),
457 metavar = "number"
458 )
459 parser <- optparse::add_option(
460 parser,
461 c("--min_number_scan"),
462 type = "numeric",
463 action = "store",
464 default = DEFAULT_MINNUMBERSCAN,
465 help = paste0(
466 "fragments are kept if there are found in a minimum number ",
467 "of scans"
468 ),
469 metavar = "number"
470 )
471 return(parser)
472 }
473
474 main <- function(args) {
475 ## FOLDER AND FILES
476 ## MSpurity precursors file
477 precursors <- read.table(
478 file = args$precursors,
479 header = TRUE,
480 sep = "\t",
481 quote = "\""
482 )
483 ## MSpurity fragments file
484 fragments <- read.table(
485 file = args$fragments,
486 header = TRUE,
487 sep = "\t",
488 quote = "\""
489 )
490 ## list of compounds : col1=Name of molecule, col2=m/z, col3=retention time
491 compounds <- read.table(
492 file = args$compounds,
493 sep = "\t",
494 quote = "\"",
495 header = TRUE
496 )
497 ## PARAMETERS
498 ## tolerance for mz(dalton) rt(seconds) to match the standard in the compounds
499 ## list with the precursor MSpurity file
500 tolmz <- args$tolmz
501 tolrt <- args$tolrt
502
503 ## relative intensity threshold
504 seuil_ra <- args$seuil_ra
505 ## nb decimal for mz
506 mzdecimal <- args$mzdecimal
507 ## r pearson correlation threshold between precursor and
508 # #fragment absolute intensity
509 r_threshold <- args$r_threshold
510 ## fragments are kept if there are found in a minimum number of scans
511 min_number_scan <- args$min_number_scan
512
513 for (i in seq_len(nrow(compounds))) {
514 ## loop execution for all compounds in the compounds file
515 res_cor <- NULL
516 res_cor <- extract_fragments(
517 precursors = precursors,
518 fragments = fragments,
519 mzref = compounds[[2]][i],
520 rtref = compounds[[3]][i] * 60,
521 c_name = compounds[[1]][i],
522 min_number_scan = min_number_scan,
523 mzdecimal = mzdecimal,
524 r_threshold = r_threshold,
525 seuil_ra = seuil_ra,
526 tolmz = tolmz,
527 tolrt = tolrt
528 )
529 if (i == 1 & !is.null(res_cor)) {
530 res_all <- res_cor
531 } else if (!is.null(res_cor)) {
532 res_all <- rbind(res_all, res_cor)
533 }
534 }
535
536 if (is.null(res_all)) {
537 stop("No result at all!")
538 }
539 write.table(
540 x = res_all,
541 file = args$output,
542 sep = "\t",
543 row.names = FALSE
544 )
545 }
546
547 args <- optparse::parse_args(create_parser())
548 sessionInfo()
549 main(args)
550
551 warnings()