0
|
1 #'
|
|
2 #' read and process mspurity W4M files
|
|
3 #' create a summary of fragment for each precursor and a graphics of peseudo
|
|
4 #' spectra + correlation on which checking of fragment is based on
|
|
5 #' V3 try to identify and process multiple files for 1 precursor which may
|
|
6 #' occur if different collision energy are used
|
|
7 #' V4 elimination of correlation = NA. Correlation is done with precursor, if
|
|
8 #' precursor is not present correlation with most intense peak
|
|
9 #' author: Jean-Francois Martin
|
|
10 #' V5 is versionned, lintR-compliant, packaged, unit-tested, documented and
|
|
11 #' tested against data from other labs.
|
|
12 #' new maintainer: Lain Pavot - lain.pavot@inrae.fr
|
|
13 #'
|
|
14 #' @import optparse
|
|
15 #'
|
|
16 NULL
|
|
17
|
|
18
|
|
19 assign("DEFAULT_PRECURSOR_PATH", "peaklist_precursors.tsv")
|
|
20 assign("DEFAULT_FRAGMENTS_PATH", "peaklist_fragments.tsv")
|
|
21 assign("DEFAULT_COMPOUNDS_PATH", "compounds_pos.txt")
|
|
22 assign("DEFAULT_OUTPUT_PATH", "compound_fragments_result.txt")
|
|
23 assign("DEFAULT_TOLMZ", 0.01)
|
|
24 assign("DEFAULT_TOLRT", 20)
|
|
25 assign("DEFAULT_MZDECIMAL", 0)
|
|
26 assign("DEFAULT_R_THRESHOLD", 0.85)
|
|
27 assign("DEFAULT_MINNUMBERSCAN", 8)
|
|
28 assign("DEFAULT_SEUIL_RA", 0.5)
|
|
29 lockBinding("DEFAULT_PRECURSOR_PATH", globalenv())
|
|
30 lockBinding("DEFAULT_FRAGMENTS_PATH", globalenv())
|
|
31 lockBinding("DEFAULT_COMPOUNDS_PATH", globalenv())
|
|
32 lockBinding("DEFAULT_OUTPUT_PATH", globalenv())
|
|
33 lockBinding("DEFAULT_TOLMZ", globalenv())
|
|
34 lockBinding("DEFAULT_TOLRT", globalenv())
|
|
35 lockBinding("DEFAULT_MZDECIMAL", globalenv())
|
|
36 lockBinding("DEFAULT_R_THRESHOLD", globalenv())
|
|
37 lockBinding("DEFAULT_MINNUMBERSCAN", globalenv())
|
|
38 lockBinding("DEFAULT_SEUIL_RA", globalenv())
|
|
39
|
|
40 assign("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", 0.85)
|
|
41 assign("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", 0.1)
|
|
42 assign("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", 0.01)
|
|
43 assign("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", 60)
|
|
44 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", globalenv())
|
|
45 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", globalenv())
|
|
46 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", globalenv())
|
|
47 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", globalenv())
|
|
48
|
|
49
|
|
50 debug <- FALSE
|
|
51
|
|
52
|
|
53 ########################################################################
|
|
54
|
|
55 #' @title plot_pseudo_spectra
|
|
56 #' @param x
|
|
57 #' @param r_threshold
|
|
58 #' @param fid
|
|
59 #' @param sum_int
|
|
60 #' @param vmz
|
|
61 #' @param cor_abs_int
|
|
62 #' @param refcol
|
|
63 #' @param c_name
|
|
64 #' @description plot_pseudo_spectra
|
|
65 #' function to compute sum of intensities among scans for all
|
|
66 #' m/z kept (cor > r_threshold & minimum number of scans)
|
|
67 #' and plot pseudo spectra
|
|
68 #' x dataframe scan X fragments with scans number in the 1st column and
|
|
69 #' ions in next with intensities
|
|
70 #' fid file id when several a precursor has been detected in several files
|
|
71 plot_pseudo_spectra <- function(
|
|
72 x,
|
|
73 r_threshold,
|
|
74 fid,
|
|
75 sum_int,
|
|
76 vmz,
|
|
77 cor_abs_int,
|
|
78 refcol,
|
|
79 c_name
|
|
80 ) {
|
|
81 ## du fait de la difference de nombre de colonne entre la dataframe qui
|
|
82 ## inclue les scans en 1ere col, mzRef se decale de 1
|
|
83 refcol <- refcol - 1
|
|
84 ## compute relative intensities max=100%
|
|
85 rel_int <- sum_int[-1]
|
|
86 rel_int <- rel_int / max(rel_int)
|
|
87
|
|
88 ## define max value on vertical axis (need to increase in order to plot the
|
|
89 ## label of fragments)
|
|
90 ymax <- max(rel_int) + 0.2 * max(rel_int)
|
|
91
|
|
92 par(mfrow = c(2, 1))
|
|
93 plot(vmz, rel_int, type = "h", ylim = c(0, ymax), main = c_name)
|
|
94 ## low correl coef. will be display in grey
|
|
95 cor_low <- which(round(cor_abs_int, 2) < r_threshold)
|
|
96
|
|
97 lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2))
|
|
98
|
|
99 if (length(cor_low) > 0) {
|
|
100 text(
|
|
101 vmz[cor_low],
|
|
102 rel_int[cor_low],
|
|
103 lbmzcor[cor_low],
|
|
104 cex = 0.5,
|
|
105 col = "grey",
|
|
106 srt = 90,
|
|
107 adj = 0
|
|
108 )
|
|
109 if (length(vmz) - length(cor_low) > 1) {
|
|
110 text(
|
|
111 vmz[-c(refcol, cor_low)],
|
|
112 rel_int[-c(refcol, cor_low)],
|
|
113 lbmzcor[-c(refcol, cor_low)],
|
|
114 cex = 0.6,
|
|
115 col = 1,
|
|
116 srt = 90,
|
|
117 adj = 0
|
|
118 )
|
|
119 }
|
|
120 } else {
|
|
121 if (length(vmz) > 1) {
|
|
122 text(
|
|
123 vmz[-c(refcol)],
|
|
124 rel_int[-c(refcol)],
|
|
125 lbmzcor[-c(refcol)],
|
|
126 cex = 0.6,
|
|
127 col = 1,
|
|
128 srt = 90,
|
|
129 adj = 0
|
|
130 )
|
|
131 }
|
|
132 }
|
|
133
|
|
134 text(
|
|
135 vmz[refcol],
|
|
136 rel_int[refcol],
|
|
137 lbmzcor[refcol],
|
|
138 cex = 0.8,
|
|
139 col = 2,
|
|
140 srt = 90,
|
|
141 adj = 0
|
|
142 )
|
|
143
|
|
144 ## prepare result file
|
|
145 corValid <- (round(cor_abs_int, 2) >= r_threshold) ##nolint object_name_linter
|
|
146 cp_res <- data.frame(
|
|
147 rep(c_name, length(vmz)),
|
|
148 rep(fid, length(vmz)),
|
|
149 vmz,
|
|
150 cor_abs_int,
|
|
151 sum_int[-1],
|
|
152 rel_int,
|
|
153 corValid
|
|
154 )
|
|
155
|
|
156 colnames(cp_res) <- c(
|
|
157 "compoundName",
|
|
158 "fileid",
|
|
159 "fragments_mz",
|
|
160 "CorWithPrecursor",
|
|
161 "AbsoluteIntensity",
|
|
162 "relativeIntensity",
|
|
163 "corValid"
|
|
164 )
|
|
165 return(cp_res)
|
|
166
|
|
167 }
|
|
168
|
|
169 #'
|
|
170 #' @title extract_fragments
|
|
171 #'
|
|
172 #' @param precursors the precursor list from mspurity
|
|
173 #' @param fragments the fragments list from ms purity
|
|
174 #' @param mzref
|
|
175 #' @param rtref
|
|
176 #' @param c_name
|
|
177 #' @param r_threshold default = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD
|
|
178 #' @param seuil_ra default = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA
|
|
179 #' @param tolmz default = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ
|
|
180 #' @param tolrt default = DEFAULT_EXTRACT_FRAGMENTS_TOLRT
|
|
181 #' @returns
|
|
182 #'
|
|
183 #' @description
|
|
184 #' function for extraction of fragments corresponding to precursors
|
|
185 #' detected by MSPurity
|
|
186 extract_fragments <- function( ## nolint cyclocomp_linter
|
|
187 precursors,
|
|
188 fragments,
|
|
189 mzref,
|
|
190 rtref,
|
|
191 c_name,
|
|
192 min_number_scan,
|
|
193 mzdecimal,
|
|
194 r_threshold=DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD,
|
|
195 seuil_ra=DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA,
|
|
196 tolmz=DEFAULT_EXTRACT_FRAGMENTS_TOLMZ,
|
|
197 tolrt=DEFAULT_EXTRACT_FRAGMENTS_TOLRT
|
|
198 ) {
|
|
199 ## filter precursor in the precursors file based on mz and rt in the
|
|
200 ## compound list
|
|
201 cat("processing ", c_name, "\n")
|
|
202 selected_precursors <- which(
|
|
203 (abs(precursors$precurMtchMZ - mzref) <= tolmz)
|
|
204 & (abs(precursors$precurMtchRT - rtref) <= tolrt)
|
|
205 )
|
|
206
|
|
207 ## check if there is the precursor in the file
|
|
208 if (length(selected_precursors) > 0) {
|
|
209
|
|
210 sprecini <- precursors[selected_precursors, ]
|
|
211
|
|
212 ## check if fragments corresponding to precursor are found in several
|
|
213 ## files (collision energy)
|
|
214 ## this lead to a processing for each fileid
|
|
215 mf <- levels(as.factor(sprecini$fileid))
|
|
216 if (length(mf) > 1) {
|
|
217 cat(" several files detected for this compounds :\n")
|
|
218 }
|
|
219
|
|
220 for (f in seq_along(mf)) {
|
|
221
|
|
222 sprec <- sprecini[sprecini$fileid == mf[f], ]
|
|
223
|
|
224 ## selection of fragment in the fragments file with the grpid common in
|
|
225 ## both fragments and precursors
|
|
226 selfrgt <- levels(as.factor(sprec$grpid))
|
|
227 sfrgt <- fragments[
|
|
228 fragments$grpid %in% selfrgt
|
|
229 & fragments$fileid == mf[f],
|
|
230 ]
|
|
231
|
|
232 ## filter fragments on relative intensity seuil_ra = user defined
|
|
233 ## parameter (MSpurity flags could be used here)
|
|
234 sfrgtfil <- sfrgt[sfrgt$ra > seuil_ra, ]
|
|
235
|
|
236 mznominal <- round(x = sfrgtfil$mz, mzdecimal)
|
|
237 sfrgtfil <- data.frame(sfrgtfil, mznominal)
|
|
238
|
|
239 ## creation of cross table row=scan col=mz X=ra
|
|
240 vmz <- levels(as.factor(sfrgtfil$mznominal))
|
|
241
|
|
242 cat(" fragments :", vmz)
|
|
243
|
|
244 ## mz of precursor in data precursor to check correlation with
|
|
245 mz_prec <- paste0("mz", round(mean(sprec$mz), mzdecimal))
|
|
246
|
|
247 for (m in seq_along(vmz)) {
|
|
248
|
|
249 ## absolute intensity
|
|
250 cln <- c(
|
|
251 which(colnames(sfrgtfil) == "acquisitionNum"),
|
|
252 which(colnames(sfrgtfil) == "i")
|
|
253 )
|
|
254 int_mz <- sfrgtfil[sfrgtfil$mznominal == vmz[m], cln]
|
|
255 colnames(int_mz)[2] <- paste0("mz", vmz[m])
|
|
256
|
|
257 ## average intensities of mass in duplicate scans
|
|
258 comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean)
|
|
259 int_mz <- comp_scans[, -1]
|
|
260
|
|
261 if (m == 1) {
|
|
262 ds_abs_int <- int_mz
|
|
263 } else {
|
|
264 ds_abs_int <- merge(
|
|
265 x = ds_abs_int,
|
|
266 y = int_mz,
|
|
267 by.x = 1,
|
|
268 by.y = 1,
|
|
269 all.x = TRUE,
|
|
270 all.y = TRUE
|
|
271 )
|
|
272 }
|
|
273 }
|
|
274 if (debug) {
|
|
275 write.table(
|
|
276 x = ds_abs_int,
|
|
277 file = paste0(c_name, "ds_abs_int.txt"),
|
|
278 row.names = FALSE,
|
|
279 sep = "\t"
|
|
280 )
|
|
281 }
|
|
282
|
|
283 ## elimination of mz with less than min_number_scan scans (user defined
|
|
284 ## parameter)
|
|
285 xmz <- rep(NA, ncol(ds_abs_int) - 1)
|
|
286 sum_int <- rep(NA, ncol(ds_abs_int))
|
|
287 nbxmz <- 0
|
|
288 nb_scan_check <- min(nrow(ds_abs_int), min_number_scan)
|
|
289
|
|
290 for (j in 2:ncol(ds_abs_int)) {
|
|
291 sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE)
|
|
292 if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) {
|
|
293 nbxmz <- nbxmz + 1
|
|
294 xmz[nbxmz] <- j
|
|
295 }
|
|
296 }
|
|
297
|
|
298 xmz <- xmz[-which(is.na(xmz))]
|
|
299 if (length(xmz) > 0) {
|
|
300 ds_abs_int <- ds_abs_int[, -c(xmz)]
|
|
301 sum_int <- sum_int[-c(xmz)]
|
|
302 ## liste des mz keeped decale de 1 avec ds_abs_int
|
|
303 vmz <- as.numeric(vmz[-c(xmz - 1)])
|
|
304 }
|
|
305
|
|
306 ## reference ion for correlation computing = precursor OR maximum
|
|
307 ## intensity ion in precursor is not present
|
|
308 refcol <- which(colnames(ds_abs_int) == mz_prec)
|
|
309 if (length(refcol) == 0) {
|
|
310 refcol <- which(sum_int == max(sum_int, na.rm = TRUE))
|
|
311 }
|
|
312 pdf(
|
|
313 file = sprintf("%s_processing_file%s.pdf", c_name, mf[f]),
|
|
314 width = 8,
|
|
315 height = 11
|
|
316 )
|
|
317 par(mfrow = c(3, 2))
|
|
318
|
|
319 ## Pearson correlations between absolute intensities computing
|
|
320 cor_abs_int <- rep(NA, length(vmz))
|
|
321
|
|
322 if (length(refcol) > 0) {
|
|
323 for (i in 2:length(ds_abs_int)) {
|
|
324 cor_abs_int[i - 1] <- cor(
|
|
325 x = ds_abs_int[[refcol]],
|
|
326 y = ds_abs_int[[i]],
|
|
327 use = "pairwise.complete.obs",
|
|
328 method = "pearson"
|
|
329 )
|
|
330 plot(
|
|
331 ds_abs_int[[refcol]],
|
|
332 ds_abs_int[[i]],
|
|
333 xlab = colnames(ds_abs_int)[refcol],
|
|
334 ylab = colnames(ds_abs_int)[i],
|
|
335 main = sprintf(
|
|
336 "%s corr coeff r=%s", c_name, round(cor_abs_int[i - 1], 2)
|
|
337 )
|
|
338 )
|
|
339 }
|
|
340 ## plot pseudo spectra
|
|
341 res_comp_by_file <- plot_pseudo_spectra(
|
|
342 x = ds_abs_int,
|
|
343 r_threshold = r_threshold,
|
|
344 fid = mf[f],
|
|
345 sum_int = sum_int,
|
|
346 vmz = vmz,
|
|
347 cor_abs_int = cor_abs_int,
|
|
348 refcol = refcol,
|
|
349 c_name = c_name
|
|
350 )
|
|
351 if (f == 1) {
|
|
352 res_comp <- res_comp_by_file
|
|
353 }
|
|
354 } else {
|
|
355 res_comp_by_file <- NULL
|
|
356 cat(" non detected in fragments file \n")
|
|
357 }
|
|
358 if (!is.null(res_comp_by_file)) {
|
|
359 res_comp <- rbind(res_comp, res_comp_by_file)
|
|
360 }
|
|
361 cat("\n")
|
|
362 dev.off()
|
|
363 }
|
|
364 } else {
|
|
365 res_comp <- NULL
|
|
366 cat(" non detected in precursor file \n")
|
|
367 }
|
|
368 return(res_comp)
|
|
369 }
|
|
370
|
|
371
|
|
372 create_parser <- function() {
|
|
373 parser <- optparse::OptionParser()
|
|
374 parser <- optparse::add_option(
|
|
375 parser,
|
|
376 c("-v", "--verbose"),
|
|
377 action = "store_true",
|
|
378 default = FALSE,
|
|
379 help = "Print extra output [default %default]"
|
|
380 )
|
|
381 parser <- optparse::add_option(
|
|
382 parser,
|
|
383 c("-o", "--output"),
|
|
384 type = "character",
|
|
385 default = DEFAULT_OUTPUT_PATH,
|
|
386 action = "store",
|
|
387 help = "Path to the output file [default %default]"
|
|
388 )
|
|
389 parser <- optparse::add_option(
|
|
390 parser,
|
|
391 c("-p", "--precursors"),
|
|
392 type = "character",
|
|
393 default = DEFAULT_PRECURSOR_PATH,
|
|
394 action = "store",
|
|
395 help = "Path to the precursors file [default %default]"
|
|
396 )
|
|
397 parser <- optparse::add_option(
|
|
398 parser,
|
|
399 c("-f", "--fragments"),
|
|
400 type = "character",
|
|
401 default = DEFAULT_FRAGMENTS_PATH,
|
|
402 action = "store",
|
|
403 help = "Path to the fragments file [default %default]"
|
|
404 )
|
|
405 parser <- optparse::add_option(
|
|
406 parser,
|
|
407 c("-c", "--compounds"),
|
|
408 type = "character",
|
|
409 default = DEFAULT_COMPOUNDS_PATH,
|
|
410 action = "store",
|
|
411 help = "Path to the compounds file [default %default]"
|
|
412 )
|
|
413 parser <- optparse::add_option(
|
|
414 parser,
|
|
415 c("--tolmz"),
|
|
416 type = "numeric",
|
|
417 action = "store",
|
|
418 default = DEFAULT_TOLMZ,
|
|
419 metavar = "number"
|
|
420 )
|
|
421 parser <- optparse::add_option(
|
|
422 parser,
|
|
423 c("--tolrt"),
|
|
424 type = "integer",
|
|
425 action = "store",
|
|
426 default = DEFAULT_TOLRT,
|
|
427 metavar = "number"
|
|
428 )
|
|
429 parser <- optparse::add_option(
|
|
430 parser,
|
|
431 c("--seuil_ra"),
|
|
432 type = "numeric",
|
|
433 action = "store",
|
|
434 help = "relative intensity threshold",
|
|
435 default = DEFAULT_SEUIL_RA,
|
|
436 metavar = "number"
|
|
437 )
|
|
438 parser <- optparse::add_option(
|
|
439 parser,
|
|
440 c("--mzdecimal"),
|
|
441 type = "integer",
|
|
442 default = DEFAULT_MZDECIMAL,
|
|
443 action = "store",
|
|
444 help = "nb decimal for mz",
|
|
445 metavar = "number"
|
|
446 )
|
|
447 parser <- optparse::add_option(
|
|
448 parser,
|
|
449 c("--r_threshold"),
|
|
450 type = "integer",
|
|
451 default = DEFAULT_R_THRESHOLD,
|
|
452 action = "store",
|
|
453 help = paste0(
|
|
454 "r pearson correlation threshold between precursor and fragment ",
|
|
455 "absolute intensity"
|
|
456 ),
|
|
457 metavar = "number"
|
|
458 )
|
|
459 parser <- optparse::add_option(
|
|
460 parser,
|
|
461 c("--min_number_scan"),
|
|
462 type = "numeric",
|
|
463 action = "store",
|
|
464 default = DEFAULT_MINNUMBERSCAN,
|
|
465 help = paste0(
|
|
466 "fragments are kept if there are found in a minimum number ",
|
|
467 "of scans"
|
|
468 ),
|
|
469 metavar = "number"
|
|
470 )
|
|
471 return(parser)
|
|
472 }
|
|
473
|
|
474 main <- function(args) {
|
|
475 ## FOLDER AND FILES
|
|
476 ## MSpurity precursors file
|
|
477 precursors <- read.table(
|
|
478 file = args$precursors,
|
|
479 header = TRUE,
|
|
480 sep = "\t",
|
|
481 quote = "\""
|
|
482 )
|
|
483 ## MSpurity fragments file
|
|
484 fragments <- read.table(
|
|
485 file = args$fragments,
|
|
486 header = TRUE,
|
|
487 sep = "\t",
|
|
488 quote = "\""
|
|
489 )
|
|
490 ## list of compounds : col1=Name of molecule, col2=m/z, col3=retention time
|
|
491 compounds <- read.table(
|
|
492 file = args$compounds,
|
|
493 sep = "\t",
|
|
494 quote = "\"",
|
|
495 header = TRUE
|
|
496 )
|
|
497 ## PARAMETERS
|
|
498 ## tolerance for mz(dalton) rt(seconds) to match the standard in the compounds
|
|
499 ## list with the precursor MSpurity file
|
|
500 tolmz <- args$tolmz
|
|
501 tolrt <- args$tolrt
|
|
502
|
|
503 ## relative intensity threshold
|
|
504 seuil_ra <- args$seuil_ra
|
|
505 ## nb decimal for mz
|
|
506 mzdecimal <- args$mzdecimal
|
|
507 ## r pearson correlation threshold between precursor and
|
|
508 # #fragment absolute intensity
|
|
509 r_threshold <- args$r_threshold
|
|
510 ## fragments are kept if there are found in a minimum number of scans
|
|
511 min_number_scan <- args$min_number_scan
|
|
512
|
|
513 for (i in seq_len(nrow(compounds))) {
|
|
514 ## loop execution for all compounds in the compounds file
|
|
515 res_cor <- NULL
|
|
516 res_cor <- extract_fragments(
|
|
517 precursors = precursors,
|
|
518 fragments = fragments,
|
|
519 mzref = compounds[[2]][i],
|
|
520 rtref = compounds[[3]][i] * 60,
|
|
521 c_name = compounds[[1]][i],
|
|
522 min_number_scan = min_number_scan,
|
|
523 mzdecimal = mzdecimal,
|
|
524 r_threshold = r_threshold,
|
|
525 seuil_ra = seuil_ra,
|
|
526 tolmz = tolmz,
|
|
527 tolrt = tolrt
|
|
528 )
|
|
529 if (i == 1 & !is.null(res_cor)) {
|
|
530 res_all <- res_cor
|
|
531 } else if (!is.null(res_cor)) {
|
|
532 res_all <- rbind(res_all, res_cor)
|
|
533 }
|
|
534 }
|
|
535
|
|
536 if (is.null(res_all)) {
|
|
537 stop("No result at all!")
|
|
538 }
|
|
539 write.table(
|
|
540 x = res_all,
|
|
541 file = args$output,
|
|
542 sep = "\t",
|
|
543 row.names = FALSE
|
|
544 )
|
|
545 }
|
|
546
|
|
547 args <- optparse::parse_args(create_parser())
|
|
548 sessionInfo()
|
|
549 main(args)
|
|
550
|
|
551 warnings()
|