Mercurial > repos > prog > lcmsmatching
comparison MsXlsDb.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 |
parents | fb9c0409d85c |
children |
comparison
equal
deleted
inserted
replaced
5:fb9c0409d85c | 6:f86fec07f392 |
---|---|
1 if ( ! exists('MsXlsDb')) { # Do not load again if already loaded | |
2 | |
3 library('methods') | |
4 library('stringr') | |
5 source('msdb-common.R') | |
6 source('MsDb.R') | |
7 source('strhlp.R', chdir = TRUE) | |
8 source('dfhlp.R', chdir = TRUE) | |
9 source('search.R', chdir = TRUE) | |
10 source('excelhlp.R', chdir = TRUE) | |
11 | |
12 ############# | |
13 # CONSTANTS # | |
14 ############# | |
15 | |
16 .THIS.FILE.PATH <- getwd() # We suppose that the file has been sourced with the option chdir = TRUE | |
17 | |
18 .XLS_PEAKS_ROW_OFFSET <- 8 | |
19 .XLS_PEAKS_RT_COL_START <- 11 | |
20 .XLS_MSPOS_TAB <- 'MS_POS' | |
21 .XLS_MSNEG_TAB <- 'MS_NEG' | |
22 .XLS_MZ_COL <- 1 | |
23 .XLS_INTENSITY_COL <- 2 | |
24 .XLS_RELATIVE_COL <- 3 | |
25 .XLS_THEORETICAL_MZ_COL <- 5 | |
26 .XLS_COMPOSITION_COL <- 8 | |
27 .XLS_ATTRIBUTION_COL <- 9 | |
28 | |
29 ##################### | |
30 # CLASS DECLARATION # | |
31 ##################### | |
32 | |
33 MsXlsDb <- setRefClass("MsXlsDb", contains = "MsDb", fields = list(.mz.index = "ANY", .name_index = "ANY", .db_dir = "character", .limit = "numeric", .files = "ANY", .cache_dir = "character", .db = "ANY")) | |
34 | |
35 ############### | |
36 # CONSTRUCTOR # | |
37 ############### | |
38 | |
39 MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) { | |
40 | |
41 # Initialize members | |
42 # TODO check that db_dir is not null neither na, and tests that it exists and is a directory. | |
43 .db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_ | |
44 .limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_ | |
45 cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir | |
46 .cache_dir <<- if ( cache || ! is.null(cache_dir)) cache_dir else NA_character_ | |
47 .files <<- NULL | |
48 .db <<- NULL | |
49 .mz.index <<- NULL | |
50 .name_index <<- NULL | |
51 | |
52 callSuper(...) | |
53 }) | |
54 | |
55 #################### | |
56 # GET MOLECULE IDS # | |
57 #################### | |
58 | |
59 MsXlsDb$methods( getMoleculeIds = function(max.results = NA_integer_) { | |
60 | |
61 # Init file list | |
62 .self$.init.file.list() | |
63 | |
64 # Get IDs | |
65 mol.ids <- as.integer(which( ! is.na(.self$.files))) | |
66 | |
67 # Cut | |
68 if ( ! is.na(max.results) && length(mol.ids) > max.results) | |
69 mol.ids <- mol.ids[max.results, ] | |
70 | |
71 return(mol.ids) | |
72 }) | |
73 | |
74 #################### | |
75 # GET NB MOLECULES # | |
76 #################### | |
77 | |
78 # Returns a list of all molecule names | |
79 MsXlsDb$methods( getNbMolecules = function() { | |
80 return(length(.self$getMoleculeIds())) | |
81 }) | |
82 | |
83 ##################### | |
84 # GET MOLECULE NAME # | |
85 ##################### | |
86 | |
87 MsXlsDb$methods( getMoleculeName = function(molid) { | |
88 return(vapply(molid, function(m) .self$.get.mol.name(m), FUN.VALUE = "")) | |
89 }) | |
90 | |
91 ############################### | |
92 # GET CHROMATOGRAPHIC COLUMNS # | |
93 ############################### | |
94 | |
95 # Returns a list of all chromatographic columns used | |
96 MsXlsDb$methods( getChromCol = function(molid = NULL) { | |
97 | |
98 cn <- character() | |
99 | |
100 # If no molecule IDs provided, then look at all molecules | |
101 if (is.null(molid)) | |
102 molid <- .self$getMoleculeIds() | |
103 | |
104 # Loop on molecules | |
105 for (mid in molid) { | |
106 | |
107 rt <- .self$getRetentionTimes(mid) | |
108 | |
109 if ( ! is.null(rt)) | |
110 cn <- c(cn, names(rt)) | |
111 } | |
112 | |
113 # Remove duplicates | |
114 cn <- cn[ ! duplicated(cn)] | |
115 | |
116 # Make data frame | |
117 cn <- data.frame(id = cn, title = cn, stringsAsFactors = FALSE) | |
118 | |
119 return(cn) | |
120 }) | |
121 | |
122 ################ | |
123 # FIND BY NAME # | |
124 ################ | |
125 | |
126 MsXlsDb$methods( findByName = function(name) { | |
127 | |
128 # NULL entry | |
129 if (is.null(name)) | |
130 return(NA_integer_) | |
131 | |
132 # Initialize output list | |
133 ids <- NULL | |
134 | |
135 for (n in name) { | |
136 | |
137 id <- NA_integer_ | |
138 | |
139 if ( ! is.na(n)) { | |
140 | |
141 # Get index | |
142 index <- .self$.get.name.index() | |
143 | |
144 # Search for name in index | |
145 i <- binary.search(toupper(n), index[['name']]) | |
146 | |
147 id <- if (is.na(i)) NA_integer_ else index[i, 'id'] | |
148 } | |
149 | |
150 ids <- c(ids, id) | |
151 } | |
152 | |
153 return(ids) | |
154 }) | |
155 | |
156 ####################### | |
157 # GET RETENTION TIMES # | |
158 ####################### | |
159 | |
160 MsXlsDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
161 | |
162 if (is.null(molid) || is.na(molid)) | |
163 return(NULL) | |
164 | |
165 # Find it in memory | |
166 rt <- .self$.mem.get(molid, 'rt') | |
167 | |
168 if (is.null(rt)) { | |
169 | |
170 # Call observers | |
171 if ( ! is.null(.self$.observers)) | |
172 for (obs in .self$.observers) | |
173 obs$progress(paste0("Loading retention times of file", .self$.get.file(molid), "."), level = 2) | |
174 | |
175 rt <- NULL | |
176 | |
177 # Load from cache file | |
178 cache_file <- NA_character_ | |
179 if ( ! is.na(.self$.get.cache.dir())) { | |
180 cache_file <- file.path(.self$.get.cache.dir(), paste0('rt-', molid, '.bin')) | |
181 if (file.exists(cache_file)) | |
182 load(file = cache_file) # load rt | |
183 } | |
184 | |
185 if (is.null(rt)) { | |
186 | |
187 # Get retention times of both positive and negative mode tabs | |
188 mspos_rt <- .self$.parse_retention_times(molid, .XLS_MSPOS_TAB) | |
189 msneg_rt <- .self$.parse_retention_times(molid, .XLS_MSNEG_TAB) | |
190 | |
191 # Retention times stored in negative and positive modes | |
192 if ( ! is.null(mspos_rt) && ! is.null(msneg_rt)) { | |
193 | |
194 # Warn observers when both retention time lists are not identical | |
195 if ( ! identical(mspos_rt, msneg_rt)) | |
196 for (obs in .self$.observers) | |
197 obs$warning(paste0("Retention times in negative and positive modes are different in file ", .self$.get.file(molid), ".")) | |
198 | |
199 # Merge both lists | |
200 rt <- mspos_rt | |
201 for (c in names(msneg_rt)) | |
202 if (c %in% names(rt)) { | |
203 v <- c(rt[[c]], msneg_rt[[c]]) | |
204 rt[[c]] <- v[ ! duplicated(v)] | |
205 } | |
206 else | |
207 rt[[c]] <- msneg_rt[[c]] | |
208 } | |
209 else | |
210 # Set retention times | |
211 rt <- if (is.null(mspos_rt)) msneg_rt else mspos_rt | |
212 | |
213 if (is.null(rt)) rt <- list() | |
214 | |
215 # Write in cache | |
216 if ( ! is.na(cache_file)) { | |
217 | |
218 # Call observers | |
219 if ( ! is.null(.self$.observers)) | |
220 for (obs in .self$.observers) | |
221 obs$progress(paste0("Caching retention times of file ", .self$.get.file(molid), ".")) | |
222 | |
223 save(rt, file = cache_file) | |
224 } | |
225 } | |
226 | |
227 # Store in memory | |
228 .self$.mem.set(rt, molid, 'rt') | |
229 } | |
230 | |
231 # Select only one column if asked | |
232 if ( ! is.na(col)) rt <- rt[[col]] | |
233 | |
234 return(rt) | |
235 }) | |
236 | |
237 ################# | |
238 # GET NB PEAKS # | |
239 ################# | |
240 | |
241 MsXlsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
242 | |
243 # Initialize parameters | |
244 if (is.null(molid) || (length(molid) == 1 && is.na(molid))) | |
245 molid <- .self$getMoleculeIds() | |
246 if (is.na(type)) | |
247 type <- c(MSDB.TAG.POS, MSDB.TAG.NEG) | |
248 | |
249 return(sum(vapply(molid, function(m) { if (is.na(m)) 0 else sum(vapply(type, function(t) { peaks <- .self$.get.peaks(m, t) ; if (is.null(peaks)) 0 else nrow(peaks) }, FUN.VALUE = 1)) }, FUN.VALUE = 1))) | |
250 }) | |
251 | |
252 ################## | |
253 # GET PEAK TABLE # | |
254 ################## | |
255 | |
256 MsXlsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) { | |
257 | |
258 peaks <- NULL | |
259 | |
260 # Set default molecule IDs | |
261 if (is.null(molid) || (length(molid) == 1 && is.na(molid))) | |
262 molid <- .self$getMoleculeIds() | |
263 | |
264 # Set default modes | |
265 if (is.null(mode) || (length(mode) == 1 && is.na(mode))) | |
266 mode <- c(MSDB.TAG.POS, MSDB.TAG.NEG) | |
267 | |
268 # Loop on all molecules | |
269 for (mol in molid) { | |
270 | |
271 # Loop on all modes | |
272 for (mod in mode) { | |
273 m.peaks <- .self$.get.peaks(mol, mod) | |
274 if ( ! is.null(m.peaks) && nrow(m.peaks) > 0) { | |
275 m.peaks[[MSDB.TAG.MOLID]] <- mol | |
276 m.peaks[[MSDB.TAG.MODE]] <- mod | |
277 peaks <- if (is.null(peaks)) m.peaks else rbind(peaks, m.peaks) | |
278 peaks <- df.move.col.first(peaks, c(MSDB.TAG.MOLID, MSDB.TAG.MODE)) | |
279 } | |
280 } | |
281 } | |
282 | |
283 return(peaks) | |
284 }) | |
285 | |
286 ################# | |
287 # GET MZ VALUES # | |
288 ################# | |
289 | |
290 # Returns a numeric vector of all masses stored inside the database. | |
291 MsXlsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { | |
292 | |
293 mz <- numeric() | |
294 | |
295 # Get all mz values of all molecules | |
296 for(molid in .self$getMoleculeIds()) | |
297 for (m in (if (is.null(mode) || is.na(mode)) c(MSDB.TAG.POS, MSDB.TAG.NEG) else mode)) | |
298 mz <- c(mz, .self$.get.peaks(molid, m)[[MSDB.TAG.MZTHEO]]) | |
299 | |
300 # Remove duplicated | |
301 mz <- mz[ ! duplicated(mz)] | |
302 | |
303 # Apply cut-off | |
304 if ( ! is.na(max.results)) | |
305 mz <- mz[1:max.results] | |
306 | |
307 return(mz) | |
308 }) | |
309 | |
310 ############# | |
311 # GET PEAKS # | |
312 ############# | |
313 | |
314 MsXlsDb$methods( .get.peaks = function(molid, mode) { | |
315 | |
316 tab <- if (mode == MSDB.TAG.POS) .XLS_MSPOS_TAB else .XLS_MSNEG_TAB | |
317 | |
318 # Find it in memory | |
319 peak_df <- .self$.mem.get(molid, 'peaks', mode) | |
320 | |
321 if (is.null(peak_df)) { | |
322 # Call observers | |
323 if ( ! is.null(.self$.observers)) | |
324 for (obs in .self$.observers) | |
325 obs$progress(paste0("Loading peaks of tab ", tab, " of file ", .self$.get.file(molid), "."), level = 2) | |
326 | |
327 peak_df <- NULL | |
328 | |
329 # Load from cache file | |
330 cache_file <- NA_character_ | |
331 if ( ! is.na(.self$.get.cache.dir())) { | |
332 cache_file <- file.path(.self$.get.cache.dir(), paste0('peaks-', molid, '-', tab, '.csv')) | |
333 if (file.exists(cache_file)) | |
334 peak_df <- read.csv(cache_file, header = TRUE, stringsAsFactors = FALSE) | |
335 } | |
336 | |
337 # Read from XLS file, if not in cache | |
338 if (is.null(peak_df)) { | |
339 | |
340 # Load tab (peaks start at row 8) | |
341 if (.self$.tab.exists(.self$.get.file(molid), tab)) { | |
342 | |
343 peaks <- read.excel(.self$.get.file(molid), tab, start.row = .XLS_PEAKS_ROW_OFFSET, stringsAsFactors = FALSE) | |
344 if ( ! is.null(peaks)) | |
345 peaks <- peaks[ ! is.na(peaks[.XLS_MZ_COL]), , drop = FALSE] # Remove rows where m/z is not defined. TODO maybe call observer for notify a line with non NA values but without m/z value. | |
346 | |
347 # Instantiate peaks | |
348 if ( ! is.null(peaks) && nrow(peaks) > 0) { | |
349 peak_df <- peaks[1:length(peaks[[.XLS_MZ_COL]]), c(.XLS_MZ_COL, .XLS_THEORETICAL_MZ_COL, .XLS_INTENSITY_COL, .XLS_RELATIVE_COL, .XLS_COMPOSITION_COL, .XLS_ATTRIBUTION_COL), drop = FALSE] | |
350 colnames(peak_df) <- c(MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.COMP, MSDB.TAG.ATTR) | |
351 } | |
352 | |
353 # Set default data frame (important for cache file writing, because we need a correct header to be written in order for loading) | |
354 else { | |
355 peak_df <- data.frame(stringsAsFactors = FALSE) | |
356 peak_df[MSDB.TAG.MZEXP] <- numeric() | |
357 peak_df[MSDB.TAG.MZTHEO] <- numeric() | |
358 peak_df[MSDB.TAG.INT] <- numeric() | |
359 peak_df[MSDB.TAG.REL] <- numeric() | |
360 peak_df[MSDB.TAG.COMP] <- character() | |
361 peak_df[MSDB.TAG.ATTR] <- character() | |
362 } | |
363 | |
364 if (is.null(peak_df)) peak_df <- data.frame() | |
365 | |
366 # Write in cache | |
367 if ( ! is.na(cache_file)) { | |
368 | |
369 # Call observers | |
370 if ( ! is.null(.self$.observers)) | |
371 for (obs in .self$.observers) | |
372 obs$progress(paste0("Caching peaks of tab ", tab, " of file ", .self$.get.file(molid), ".")) | |
373 | |
374 write.csv(peak_df, cache_file, row.names = FALSE) | |
375 } | |
376 } | |
377 } | |
378 | |
379 # Store in memory | |
380 .self$.mem.set(peak_df, molid, 'peaks', mode) | |
381 } | |
382 | |
383 return(peak_df) | |
384 }) | |
385 | |
386 ############################## | |
387 # GET FULL MS PEAK M/Z INDEX # | |
388 ############################## | |
389 | |
390 # Get mz index for full ions, creating it if necessary. | |
391 MsXlsDb$methods( .get.mz.index = function(mode) { | |
392 | |
393 if (is.null(.self$.mz.index[[mode]])) { | |
394 | |
395 # Initialize data frame | |
396 mzi <- data.frame(stringsAsFactors = FALSE) | |
397 mzi[MSDB.TAG.MZTHEO] <- numeric() | |
398 mzi[MSDB.TAG.MOLID] <- character() | |
399 mzi[MSDB.TAG.COMP] <- character() | |
400 mzi[MSDB.TAG.ATTR] <- character() | |
401 | |
402 # Loop on all molecules | |
403 for(molid in .self$getMoleculeIds()) { | |
404 | |
405 # Get all peaks of this molecule | |
406 peaks <- .self$.get.peaks(molid, mode) | |
407 | |
408 # Remove rows whose mz is NA. | |
409 peaks <- peaks[ ! is.na(peaks[[MSDB.TAG.MZTHEO]]), ] | |
410 | |
411 if (nrow(peaks) > 0) { | |
412 | |
413 # Add id column | |
414 peaks[MSDB.TAG.MOLID] <- molid | |
415 | |
416 # Append peaks | |
417 r <- nrow(mzi) + 1 | |
418 rows <- r:(r+nrow(peaks)-1) | |
419 mzi[rows, ] <- peaks[colnames(mzi)] | |
420 } | |
421 } | |
422 | |
423 # Sort by M/Z | |
424 sorted_indices <- order(mzi[[MSDB.TAG.MZTHEO]]) | |
425 | |
426 # Group in a data frame | |
427 .self$.mz.index[[mode]] <- mzi[sorted_indices, ] | |
428 } | |
429 | |
430 return(.self$.mz.index[[mode]]) | |
431 }) | |
432 | |
433 ###################### | |
434 # SEARCH FOR MZ & RT # | |
435 ###################### | |
436 | |
437 MsXlsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
438 | |
439 # Search for m/z | |
440 results <- .self$.do.search.for.mz(mode, mz.low, mz.high) | |
441 | |
442 # Filter on attributions | |
443 if ( ! is.null(attribs)) { | |
444 results <- results[results[[MSDB.TAG.ATTR]] %in% attribs, ] | |
445 } | |
446 | |
447 # Filer on molecule IDs | |
448 if ( ! is.null(molids)) { | |
449 results <- results[results[[MSDB.TAG.MOLID]] %in% molids, ] | |
450 } | |
451 | |
452 # Use retention time | |
453 if ( ! is.null(col) && ! is.null(rt.low) && ! is.null(rt.high)) { | |
454 | |
455 # Get list of unique IDs | |
456 ids <- results[[MSDB.TAG.MOLID]] | |
457 ids <- ids[ ! duplicated(ids)] | |
458 rt <- .self$.search.for.rt(mols = ids, rt.low = rt.low, rt.high = rt.high, col = col) | |
459 results <- results[results[[MSDB.TAG.MOLID]] %in% rt[[MSDB.TAG.MOLID]], ] | |
460 results <- merge(results, rt) | |
461 } | |
462 | |
463 return(results) | |
464 }) | |
465 | |
466 ############################## | |
467 # SEARCH FOR M/Z IN MS PEAKS # | |
468 ############################## | |
469 | |
470 MsXlsDb$methods( .do.search.for.mz = function(mode, mz.low, mz.high) { | |
471 | |
472 results <- data.frame(stringsAsFactors = FALSE) | |
473 results[MSDB.TAG.MZTHEO] <- numeric() | |
474 results[MSDB.TAG.MOLID] <- character() | |
475 results[MSDB.TAG.MOLNAMES] <- character() | |
476 results[MSDB.TAG.COMP] <- character() | |
477 results[MSDB.TAG.ATTR] <- character() | |
478 | |
479 # Create m/z index | |
480 mz_index <- .self$.get.mz.index(mode) | |
481 | |
482 # Find molecules | |
483 low_bound <- binary.search(mz.low, mz_index[[MSDB.TAG.MZTHEO]], lower = FALSE) | |
484 high_bound <- binary.search(mz.high, mz_index[[MSDB.TAG.MZTHEO]], lower = TRUE) | |
485 | |
486 # Get results | |
487 if ( ! is.na(high_bound) && ! is.na(low_bound) && low_bound <= high_bound) | |
488 results <- mz_index[low_bound:high_bound,] | |
489 | |
490 # Remove row names | |
491 rownames(results) <- NULL | |
492 | |
493 return(results) | |
494 }) | |
495 | |
496 ################ | |
497 # GET MOL NAME # | |
498 ################ | |
499 | |
500 MsXlsDb$methods( .get.mol.name = function(molid) { | |
501 | |
502 if (is.na(molid)) | |
503 return(NA_character_) | |
504 | |
505 # Find it in memory | |
506 name <- .self$.mem.get(molid, 'name') | |
507 | |
508 if (is.null(name)) { | |
509 | |
510 # Load molecule | |
511 mol <- .self$.load.molecule(molid) | |
512 | |
513 # Look for name in tabs | |
514 for (tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { | |
515 hdr <- mol[[tab]][['header']] | |
516 if ( ! is.null(hdr)) | |
517 name <- hdr[[1]] | |
518 if ( ! is.null(name) && ! is.na(name)) break | |
519 } | |
520 | |
521 # Store in memory | |
522 if (is.null(name)) name <- NA_character_ | |
523 .self$.mem.set(name, molid, 'name') | |
524 } | |
525 | |
526 return(name) | |
527 }) | |
528 | |
529 ################## | |
530 # GET NAME INDEX # | |
531 ################## | |
532 | |
533 # Get name index. | |
534 MsXlsDb$methods( .get.name.index = function() { | |
535 | |
536 if (is.null(.self$.name_index)) { | |
537 | |
538 # Get names | |
539 names <- vapply(.self$getMoleculeIds(), function(id) toupper(.self$getMoleculeName(id)), FUN.VALUE = "") | |
540 | |
541 # Get molecule IDs | |
542 id <- .self$getMoleculeIds() | |
543 | |
544 # Sort by names | |
545 sorted_indices <- order(names) | |
546 | |
547 # Group in a data frame | |
548 .self$.name_index <- data.frame(name = rbind(names)[, sorted_indices], | |
549 id = rbind(id)[, sorted_indices], | |
550 stringsAsFactors = FALSE) | |
551 } | |
552 | |
553 return(.self$.name_index) | |
554 }) | |
555 | |
556 ################## | |
557 # INIT FILE LIST # | |
558 ################## | |
559 | |
560 MsXlsDb$methods( .init.file.list = function() { | |
561 | |
562 if (is.null(.self$.files)) { | |
563 | |
564 # List all files | |
565 files <- Sys.glob(file.path(.self$.db_dir, '*.xls')) | |
566 | |
567 # Limit the size of the database | |
568 if ( ! is.na(.self$.limit)) | |
569 files <- head(files, .self$.limit) | |
570 | |
571 # Get IDs | |
572 ids <- vapply(files, function(f) .extract_molecule_id_from_filename(f), FUN.VALUE = 1) | |
573 | |
574 # Use ids as indices to build the vector of files | |
575 .files <<- rep(NA_character_, max(ids)) | |
576 .files[ids] <<- files | |
577 } | |
578 }) | |
579 | |
580 ################# | |
581 # GET CACHE DIR # | |
582 ################# | |
583 | |
584 MsXlsDb$methods( .get.cache.dir = function() { | |
585 | |
586 if ( ! is.na(.self$.cache_dir) && ! file.exists(.self$.cache_dir)) | |
587 dir.create(.self$.cache_dir) | |
588 | |
589 return(.self$.cache_dir) | |
590 }) | |
591 | |
592 ################# | |
593 # LOAD MOLECULE # | |
594 ################# | |
595 | |
596 MsXlsDb$methods( .load.molecule = function(molid) { | |
597 | |
598 # Init local variables | |
599 mol <- NULL | |
600 cache_file <- NA_character_ | |
601 excel_file <- .self$.get.file(molid) | |
602 | |
603 # Call observers | |
604 if ( ! is.null(.self$.observers)) | |
605 for (obs in .self$.observers) | |
606 obs$progress(paste0("Loading molecule ", molid, "."), level = 2) | |
607 | |
608 # Load from cache | |
609 if ( ! is.na(.self$.get.cache.dir())) { | |
610 cache_file <- file.path(.self$.get.cache.dir(), paste0(molid, '.bin')) | |
611 if (file.exists(cache_file)) | |
612 load(file = cache_file) # load mol variable | |
613 } | |
614 | |
615 # Load from Excel file & write to cache | |
616 if (is.null(mol) && ! is.na(excel_file)) { | |
617 | |
618 source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. | |
619 | |
620 # Load from Excel file | |
621 for(tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { | |
622 | |
623 # Test that tab exists | |
624 if (.self$.tab.exists(excel_file, tab)) { | |
625 header <- read.excel(excel_file, tab, start.row = 1, end.row = .XLS_PEAKS_ROW_OFFSET - 1, header = FALSE, stringsAsFactors = FALSE, trim.values = TRUE, col.index = c(1))[[1]] | |
626 peaks <- read.excel(excel_file, tab, start.row = .XLS_PEAKS_ROW_OFFSET) | |
627 mol[[tab]] <- list(header = header, peaks = peaks) | |
628 } | |
629 | |
630 # Missing tab | |
631 else { | |
632 for (obs in .self$.observers) | |
633 obs$warning(paste0("No excel tab ", tab, " in file ", excel_file, ".")) | |
634 } | |
635 } | |
636 | |
637 # Write in cache | |
638 if ( ! is.na(cache_file)) { | |
639 | |
640 # Call observers | |
641 if ( ! is.null(.self$.observers)) | |
642 for (obs in .self$.observers) | |
643 obs$progress(paste0("Caching file ", excel_file, ".")) | |
644 | |
645 save(mol, file = cache_file) | |
646 } | |
647 } | |
648 | |
649 return(mol) | |
650 }) | |
651 | |
652 ######################## | |
653 # DOES EXCEL TAB EXIST # | |
654 ######################## | |
655 | |
656 MsXlsDb$methods( .tab.exists = function(file, tab) { | |
657 | |
658 source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. | |
659 | |
660 if ( ! tab.exists(file, tab)) { | |
661 | |
662 # Warn observers | |
663 for (obs in .self$.observers) | |
664 obs$warning(paste0("No excel tab ", tab, " in file ", file, ".")) | |
665 | |
666 return(FALSE) | |
667 } | |
668 | |
669 return(TRUE) | |
670 }) | |
671 | |
672 ######################### | |
673 # PARSE RETENTION TIMES # | |
674 ######################### | |
675 | |
676 MsXlsDb$methods( .parse_retention_times = function(id, tab) { | |
677 | |
678 rt <- NULL | |
679 | |
680 if (.self$.tab.exists(.self$.get.file(id), tab)) { | |
681 peaks <- read.excel(.self$.get.file(id), tab, start.row = .XLS_PEAKS_ROW_OFFSET) | |
682 | |
683 # Get retention times | |
684 if ( ! is.null(peaks) && length(peaks) > 0 && ! is.na(peaks[[1]][[1]])) | |
685 for (c in .XLS_PEAKS_RT_COL_START:length(names(peaks))) | |
686 if ( ! is.na(peaks[[c]][[1]])) { | |
687 | |
688 # Check retention times of all different m/z peaks for the same column. | |
689 .self$.check_retention_times(id, tab, names(peaks)[[c]], peaks[[c]], sum( ! is.na(peaks[[1]]))) | |
690 | |
691 # Add retention time | |
692 # TODO The column names are transformed through the read.xlsx call. For instance: | |
693 # HPLC (C18) 25mn QTOF (Bis) --> HPLC..C18..25mn.QTOF..Bis. | |
694 # ZICpHILIC 150*5*2.1 Shimadzu-Exactive-42mn --> ZICpHILIC.150.5.2.1.Shimadzu.Exactive.42mn | |
695 # This can be an issue, since we loose the formating. | |
696 col_id <- names(peaks)[[c]] | |
697 time <- peaks[[c]][[1]] * 60 # Read and convert retention time in seconds. | |
698 if (is.null(rt) || ! col_id %in% names(rt)) | |
699 rt[[col_id]] <- list(time) | |
700 else | |
701 rt[[col_id]] <- c(rt[[col_id]], time) | |
702 } | |
703 } | |
704 | |
705 return(rt) | |
706 }) | |
707 | |
708 ######################### | |
709 # CHECK RETENTION TIMES # | |
710 ######################### | |
711 | |
712 MsXlsDb$methods( .check_retention_times = function(id, tab_name, column_name, rt, n) { | |
713 | |
714 if (n >= 1 && ! is.null(.self$.observers) && length(.self$.observers) > 0) | |
715 | |
716 # Check column only if there is at least one value inside | |
717 if (sum( ! is.na(rt)) > 0) | |
718 | |
719 # Loop on all values | |
720 for(i in 1:n) { | |
721 | |
722 # Check that it's defined | |
723 if (i > 1 && is.na(rt[[i]])) | |
724 for (obs in .self$.observers) | |
725 obs$warning(paste0("Retention times undefined for column ", column_name, " at row ", i + .XLS_PEAKS_ROW_OFFSET, " of tab ", tab_name, " in file ", .self$.get.file(id), ".")) | |
726 | |
727 else if (i > 1) | |
728 # Check the value (it must be constant) | |
729 if (rt[[i-1]] != rt[[i]]) | |
730 for (obs in .self$.observers) | |
731 obs$error(paste0("Retention times not constant for column ", column_name, " between row ", i - 1 + .XLS_PEAKS_ROW_OFFSET, " and row ", i + .XLS_PEAKS_ROW_OFFSET, "o tab", tab_name, "in file", .self$.get.file(id))) | |
732 } | |
733 }) | |
734 | |
735 #################### | |
736 # GET FILE FROM ID # | |
737 #################### | |
738 | |
739 MsXlsDb$methods( .get.file = function(id) { | |
740 | |
741 # List files | |
742 .self$.init.file.list() | |
743 | |
744 return( if (id > 0 && id <= length(.self$.files)) .self$.files[id] else NA_character_) | |
745 }) | |
746 | |
747 ########### | |
748 # MEM GET # | |
749 ########### | |
750 | |
751 # Get database data from memory | |
752 MsXlsDb$methods( .mem.get = function(molid, field, second.field = NA_character_) { | |
753 | |
754 data <- .self$.db[[as.character(molid)]][[field]] | |
755 | |
756 if ( ! is.na(second.field)) | |
757 data <- data[[second.field]] | |
758 | |
759 return(data) | |
760 }) | |
761 | |
762 ########### | |
763 # MEM SET # | |
764 ########### | |
765 | |
766 # Set database data into memory | |
767 MsXlsDb$methods( .mem.set = function(data, molid, field, second.field = NA_character_) { | |
768 | |
769 id <- as.character(molid) | |
770 | |
771 # Create db | |
772 if (is.null(.self$.db)) | |
773 .db <<- list() | |
774 | |
775 # Create first level | |
776 if (is.null(.self$.db[[id]])) | |
777 .self$.db[[id]] <- list() | |
778 | |
779 # Create second level | |
780 if ( ! is.na(second.field) && is.null(.self$.db[[id]][[field]])) | |
781 .self$.db[[id]][[field]] <- list() | |
782 | |
783 # Store data | |
784 if (is.na(second.field)) { | |
785 .self$.db[[id]][[field]] <- data | |
786 } else { | |
787 .self$.db[[id]][[field]][[second.field]] <- data | |
788 } | |
789 }) | |
790 | |
791 ################# | |
792 # SEARCH FOR RT # | |
793 ################# | |
794 | |
795 # Find molecules matching a certain retention time. | |
796 # col A list of chromatographic columns to use. | |
797 # rt.low The lower bound of the rt value. | |
798 # rt.high The higher bound of the rt value. | |
799 # mols A list of molecule IDs to process. If unset, then take all molecules. | |
800 # Return a data frame with the following columns: id, col, colrt. | |
801 MsXlsDb$methods( .search.for.rt = function(col, rt.low, rt.high, mols = NULL) { | |
802 | |
803 # Use all molecules if no list is provided | |
804 if (is.null(mols)) | |
805 mols <- .self$getMoleculeIds() | |
806 | |
807 results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE) | |
808 colnames(results) <- c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT) | |
809 | |
810 # Loop on all molecules | |
811 for (molid in mols) { | |
812 no.col <- TRUE | |
813 for (c in col) { | |
814 molrts <- .self$getRetentionTimes(molid, c) | |
815 if ( ! is.null(molrts)) { | |
816 no.col <- FALSE | |
817 for (molrt in molrts) { | |
818 if (molrt >= rt.low && molrt <= rt.high) { | |
819 r <- nrow(results) + 1 | |
820 results[r, ] <- c(id = molid, col = c, colrt = molrt) | |
821 } | |
822 } | |
823 } | |
824 } | |
825 | |
826 if (no.col) { | |
827 r <- nrow(results) + 1 | |
828 results[r, c(MSDB.TAG.MOLID)] <- c(id = molid) | |
829 } | |
830 } | |
831 | |
832 return(results) | |
833 }) | |
834 | |
835 ############################ | |
836 # EXTRACT ID FROM FILENAME # | |
837 ############################ | |
838 | |
839 .extract_molecule_id_from_filename <- function(filename) { | |
840 | |
841 id <- NA_integer_ | |
842 | |
843 if ( ! is.na(filename)) { | |
844 g <- str_match(filename, "N(\\d+)[._-]") | |
845 if ( ! is.na(g[1,1])) | |
846 id <- as.numeric(g[1,2]) | |
847 } | |
848 | |
849 return(id) | |
850 } | |
851 | |
852 } # end of load safe guard |