Mercurial > repos > prog > lcmsmatching
comparison MsXlsDb.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author | prog |
---|---|
date | Tue, 12 Jul 2016 12:02:37 -0400 |
parents | |
children | 20d69a062da3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e66bb061af06 |
---|---|
1 if ( ! exists('MsXlsDb')) { # Do not load again if already loaded | |
2 | |
3 library('methods') | |
4 library('stringr') | |
5 source('msdb-common.R') | |
6 source('MsDb.R') | |
7 source('strhlp.R', chdir = TRUE) | |
8 source('dfhlp.R', chdir = TRUE) | |
9 source('search.R', chdir = TRUE) | |
10 source('excelhlp.R', chdir = TRUE) | |
11 | |
12 ############# | |
13 # CONSTANTS # | |
14 ############# | |
15 | |
16 .THIS.FILE.PATH <- getwd() # We suppose that the file has been sourced with the option chdir = TRUE | |
17 | |
18 .XLS_PEAKS_ROW_OFFSET <- 8 | |
19 .XLS_PEAKS_RT_COL_START <- 11 | |
20 .XLS_MSPOS_TAB <- 'MS_POS' | |
21 .XLS_MSNEG_TAB <- 'MS_NEG' | |
22 .XLS_MZ_COL <- 1 | |
23 .XLS_INTENSITY_COL <- 2 | |
24 .XLS_RELATIVE_COL <- 3 | |
25 .XLS_THEORETICAL_MZ_COL <- 5 | |
26 .XLS_COMPOSITION_COL <- 8 | |
27 .XLS_ATTRIBUTION_COL <- 9 | |
28 | |
29 ##################### | |
30 # CLASS DECLARATION # | |
31 ##################### | |
32 | |
33 MsXlsDb <- setRefClass("MsXlsDb", contains = "MsDb", fields = list(.mz.index = "ANY", .name_index = "ANY", .db_dir = "character", .limit = "numeric", .files = "ANY", .cache_dir = "character", .db = "ANY")) | |
34 | |
35 ############### | |
36 # CONSTRUCTOR # | |
37 ############### | |
38 | |
39 MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) { | |
40 | |
41 # Initialize members | |
42 .db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_ | |
43 .limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_ | |
44 cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir | |
45 .cache_dir <<- if ( cache || ! is.null(cache_dir)) cache_dir else NA_character_ | |
46 .files <<- NULL | |
47 .db <<- NULL | |
48 .mz.index <<- NULL | |
49 .name_index <<- NULL | |
50 | |
51 callSuper(...) | |
52 }) | |
53 | |
54 #################### | |
55 # GET MOLECULE IDS # | |
56 #################### | |
57 | |
58 MsXlsDb$methods( getMoleculeIds = function() { | |
59 | |
60 # Init file list | |
61 .self$.init.file.list() | |
62 | |
63 # Get IDs | |
64 mol.ids <- as.integer(which( ! is.na(.self$.files))) | |
65 | |
66 return(mol.ids) | |
67 }) | |
68 | |
69 #################### | |
70 # GET NB MOLECULES # | |
71 #################### | |
72 | |
73 # Returns a list of all molecule names | |
74 MsXlsDb$methods( getNbMolecules = function() { | |
75 return(length(.self$getMoleculeIds())) | |
76 }) | |
77 | |
78 ##################### | |
79 # GET MOLECULE NAME # | |
80 ##################### | |
81 | |
82 MsXlsDb$methods( getMoleculeName = function(molid) { | |
83 return(vapply(molid, function(m) .self$.get.mol.name(m), FUN.VALUE = "")) | |
84 }) | |
85 | |
86 ############################### | |
87 # GET CHROMATOGRAPHIC COLUMNS # | |
88 ############################### | |
89 | |
90 # Returns a list of all chromatographic columns used | |
91 MsXlsDb$methods( getChromCol = function(molid = NULL) { | |
92 | |
93 cn <- character() | |
94 | |
95 # If no molecule IDs provided, then look at all molecules | |
96 if (is.null(molid)) | |
97 molid <- .self$getMoleculeIds() | |
98 | |
99 # Loop on molecules | |
100 for (mid in molid) { | |
101 | |
102 rt <- .self$getRetentionTimes(mid) | |
103 | |
104 if ( ! is.null(rt)) | |
105 cn <- c(cn, names(rt)) | |
106 } | |
107 | |
108 # Remove duplicates | |
109 cn <- cn[ ! duplicated(cn)] | |
110 | |
111 # Make data frame | |
112 cn <- data.frame(id = cn, title = cn, stringsAsFactors = FALSE) | |
113 | |
114 return(cn) | |
115 }) | |
116 | |
117 ################ | |
118 # FIND BY NAME # | |
119 ################ | |
120 | |
121 MsXlsDb$methods( findByName = function(name) { | |
122 | |
123 # NULL entry | |
124 if (is.null(name)) | |
125 return(NA_integer_) | |
126 | |
127 # Initialize output list | |
128 ids <- NULL | |
129 | |
130 for (n in name) { | |
131 | |
132 id <- NA_integer_ | |
133 | |
134 if ( ! is.na(n)) { | |
135 | |
136 # Get index | |
137 index <- .self$.get.name.index() | |
138 | |
139 # Search for name in index | |
140 i <- binary.search(toupper(n), index[['name']]) | |
141 | |
142 id <- if (is.na(i)) NA_integer_ else index[i, 'id'] | |
143 } | |
144 | |
145 ids <- c(ids, id) | |
146 } | |
147 | |
148 return(ids) | |
149 }) | |
150 | |
151 ####################### | |
152 # GET RETENTION TIMES # | |
153 ####################### | |
154 | |
155 MsXlsDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
156 | |
157 if (is.null(molid) || is.na(molid)) | |
158 return(NULL) | |
159 | |
160 # Find it in memory | |
161 rt <- .self$.mem.get(molid, 'rt') | |
162 | |
163 if (is.null(rt)) { | |
164 | |
165 # Call observers | |
166 if ( ! is.null(.self$.observers)) | |
167 for (obs in .self$.observers) | |
168 obs$progress(paste0("Loading retention times of file", .self$.get.file(molid), "."), level = 2) | |
169 | |
170 rt <- NULL | |
171 | |
172 # Load from cache file | |
173 cache_file <- NA_character_ | |
174 if ( ! is.na(.self$.get.cache.dir())) { | |
175 cache_file <- file.path(.self$.get.cache.dir(), paste0('rt-', molid, '.bin')) | |
176 if (file.exists(cache_file)) | |
177 load(file = cache_file) # load rt | |
178 } | |
179 | |
180 if (is.null(rt)) { | |
181 | |
182 # Get retention times of both positive and negative mode tabs | |
183 mspos_rt <- .self$.parse_retention_times(molid, .XLS_MSPOS_TAB) | |
184 msneg_rt <- .self$.parse_retention_times(molid, .XLS_MSNEG_TAB) | |
185 | |
186 # Retention times stored in negative and positive modes | |
187 if ( ! is.null(mspos_rt) && ! is.null(msneg_rt)) { | |
188 | |
189 # Warn observers when both retention time lists are not identical | |
190 if ( ! identical(mspos_rt, msneg_rt)) | |
191 for (obs in .self$.observers) | |
192 obs$warning(paste0("Retention times in negative and positive modes are different in file ", .self$.get.file(molid), ".")) | |
193 | |
194 # Merge both lists | |
195 rt <- mspos_rt | |
196 for (c in names(msneg_rt)) | |
197 if (c %in% names(rt)) { | |
198 v <- c(rt[[c]], msneg_rt[[c]]) | |
199 rt[[c]] <- v[ ! duplicated(v)] | |
200 } | |
201 else | |
202 rt[[c]] <- msneg_rt[[c]] | |
203 } | |
204 else | |
205 # Set retention times | |
206 rt <- if (is.null(mspos_rt)) msneg_rt else mspos_rt | |
207 | |
208 if (is.null(rt)) rt <- list() | |
209 | |
210 # Write in cache | |
211 if ( ! is.na(cache_file)) { | |
212 | |
213 # Call observers | |
214 if ( ! is.null(.self$.observers)) | |
215 for (obs in .self$.observers) | |
216 obs$progress(paste0("Caching retention times of file ", .self$.get.file(molid), ".")) | |
217 | |
218 save(rt, file = cache_file) | |
219 } | |
220 } | |
221 | |
222 # Store in memory | |
223 .self$.mem.set(rt, molid, 'rt') | |
224 } | |
225 | |
226 # Select only one column if asked | |
227 if ( ! is.na(col)) rt <- rt[[col]] | |
228 | |
229 return(rt) | |
230 }) | |
231 | |
232 ################# | |
233 # GET NB PEAKS # | |
234 ################# | |
235 | |
236 MsXlsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
237 | |
238 # Initialize parameters | |
239 if (is.null(molid) || (length(molid) == 1 && is.na(molid))) | |
240 molid <- .self$getMoleculeIds() | |
241 if (is.na(type)) | |
242 type <- c(MSDB.TAG.POS, MSDB.TAG.NEG) | |
243 | |
244 return(sum(vapply(molid, function(m) { if (is.na(m)) 0 else sum(vapply(type, function(t) { peaks <- .self$.get.peaks(m, t) ; if (is.null(peaks)) 0 else nrow(peaks) }, FUN.VALUE = 1)) }, FUN.VALUE = 1))) | |
245 }) | |
246 | |
247 ################## | |
248 # GET PEAK TABLE # | |
249 ################## | |
250 | |
251 MsXlsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) { | |
252 | |
253 peaks <- NULL | |
254 | |
255 # Set default molecule IDs | |
256 if (is.null(molid) || (length(molid) == 1 && is.na(molid))) | |
257 molid <- .self$getMoleculeIds() | |
258 | |
259 # Set default modes | |
260 if (is.null(mode) || (length(mode) == 1 && is.na(mode))) | |
261 mode <- c(MSDB.TAG.POS, MSDB.TAG.NEG) | |
262 | |
263 # Loop on all molecules | |
264 for (mol in molid) { | |
265 | |
266 # Loop on all modes | |
267 for (mod in mode) { | |
268 m.peaks <- .self$.get.peaks(mol, mod) | |
269 if ( ! is.null(m.peaks) && nrow(m.peaks) > 0) { | |
270 m.peaks[[MSDB.TAG.MOLID]] <- mol | |
271 m.peaks[[MSDB.TAG.MODE]] <- mod | |
272 peaks <- if (is.null(peaks)) m.peaks else rbind(peaks, m.peaks) | |
273 peaks <- df.move.col.first(peaks, c(MSDB.TAG.MOLID, MSDB.TAG.MODE)) | |
274 } | |
275 } | |
276 } | |
277 | |
278 return(peaks) | |
279 }) | |
280 | |
281 ################# | |
282 # GET MZ VALUES # | |
283 ################# | |
284 | |
285 # Returns a numeric vector of all masses stored inside the database. | |
286 MsXlsDb$methods( getMzValues = function(mode = NULL) { | |
287 | |
288 mz <- numeric() | |
289 | |
290 # Get all mz values of all molecules | |
291 for(molid in .self$getMoleculeIds()) | |
292 for (m in (if (is.null(mode) || is.na(mode)) c(MSDB.TAG.POS, MSDB.TAG.NEG) else mode)) | |
293 mz <- c(mz, .self$.get.peaks(molid, m)[[MSDB.TAG.MZTHEO]]) | |
294 | |
295 # Remove duplicated | |
296 mz <- mz[ ! duplicated(mz)] | |
297 | |
298 return(mz) | |
299 }) | |
300 | |
301 ############# | |
302 # GET PEAKS # | |
303 ############# | |
304 | |
305 MsXlsDb$methods( .get.peaks = function(molid, mode) { | |
306 | |
307 tab <- if (mode == MSDB.TAG.POS) .XLS_MSPOS_TAB else .XLS_MSNEG_TAB | |
308 | |
309 # Find it in memory | |
310 peak_df <- .self$.mem.get(molid, 'peaks', mode) | |
311 | |
312 if (is.null(peak_df)) { | |
313 # Call observers | |
314 if ( ! is.null(.self$.observers)) | |
315 for (obs in .self$.observers) | |
316 obs$progress(paste0("Loading peaks of tab ", tab, " of file ", .self$.get.file(molid), "."), level = 2) | |
317 | |
318 peak_df <- NULL | |
319 | |
320 # Load from cache file | |
321 cache_file <- NA_character_ | |
322 if ( ! is.na(.self$.get.cache.dir())) { | |
323 cache_file <- file.path(.self$.get.cache.dir(), paste0('peaks-', molid, '-', tab, '.csv')) | |
324 if (file.exists(cache_file)) | |
325 peak_df <- read.csv(cache_file, header = TRUE, stringsAsFactors = FALSE) | |
326 } | |
327 | |
328 # Read from XLS file, if not in cache | |
329 if (is.null(peak_df)) { | |
330 | |
331 # Load tab (peaks start at row 8) | |
332 if (.self$.tab.exists(.self$.get.file(molid), tab)) { | |
333 | |
334 peaks <- read.excel(.self$.get.file(molid), tab, start.row = .XLS_PEAKS_ROW_OFFSET, stringsAsFactors = FALSE) | |
335 if ( ! is.null(peaks)) | |
336 peaks <- peaks[ ! is.na(peaks[.XLS_MZ_COL]), , drop = FALSE] # Remove rows where m/z is not defined. TODO maybe call observer for notify a line with non NA values but without m/z value. | |
337 | |
338 # Instantiate peaks | |
339 if ( ! is.null(peaks) && nrow(peaks) > 0) { | |
340 peak_df <- peaks[1:length(peaks[[.XLS_MZ_COL]]), c(.XLS_MZ_COL, .XLS_THEORETICAL_MZ_COL, .XLS_INTENSITY_COL, .XLS_RELATIVE_COL, .XLS_COMPOSITION_COL, .XLS_ATTRIBUTION_COL), drop = FALSE] | |
341 colnames(peak_df) <- c(MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.COMP, MSDB.TAG.ATTR) | |
342 } | |
343 | |
344 # Set default data frame (important for cache file writing, because we need a correct header to be written in order for loading) | |
345 else { | |
346 peak_df <- data.frame(stringsAsFactors = FALSE) | |
347 peak_df[MSDB.TAG.MZEXP] <- numeric() | |
348 peak_df[MSDB.TAG.MZTHEO] <- numeric() | |
349 peak_df[MSDB.TAG.INT] <- numeric() | |
350 peak_df[MSDB.TAG.REL] <- numeric() | |
351 peak_df[MSDB.TAG.COMP] <- character() | |
352 peak_df[MSDB.TAG.ATTR] <- character() | |
353 } | |
354 | |
355 if (is.null(peak_df)) peak_df <- data.frame() | |
356 | |
357 # Write in cache | |
358 if ( ! is.na(cache_file)) { | |
359 | |
360 # Call observers | |
361 if ( ! is.null(.self$.observers)) | |
362 for (obs in .self$.observers) | |
363 obs$progress(paste0("Caching peaks of tab ", tab, " of file ", .self$.get.file(molid), ".")) | |
364 | |
365 write.csv(peak_df, cache_file, row.names = FALSE) | |
366 } | |
367 } | |
368 } | |
369 | |
370 # Store in memory | |
371 .self$.mem.set(peak_df, molid, 'peaks', mode) | |
372 } | |
373 | |
374 return(peak_df) | |
375 }) | |
376 | |
377 ############################## | |
378 # GET FULL MS PEAK M/Z INDEX # | |
379 ############################## | |
380 | |
381 # Get mz index for full ions, creating it if necessary. | |
382 MsXlsDb$methods( .get.mz.index = function(mode) { | |
383 | |
384 if (is.null(.self$.mz.index[[mode]])) { | |
385 | |
386 # Initialize data frame | |
387 mzi <- data.frame(stringsAsFactors = FALSE) | |
388 mzi[MSDB.TAG.MZTHEO] <- numeric() | |
389 mzi[MSDB.TAG.MOLID] <- character() | |
390 mzi[MSDB.TAG.COMP] <- character() | |
391 mzi[MSDB.TAG.ATTR] <- character() | |
392 | |
393 # Loop on all molecules | |
394 for(molid in .self$getMoleculeIds()) { | |
395 | |
396 # Get all peaks of this molecule | |
397 peaks <- .self$.get.peaks(molid, mode) | |
398 | |
399 # Remove rows whose mz is NA. | |
400 peaks <- peaks[ ! is.na(peaks[[MSDB.TAG.MZTHEO]]), ] | |
401 | |
402 if (nrow(peaks) > 0) { | |
403 | |
404 # Add id column | |
405 peaks[MSDB.TAG.MOLID] <- molid | |
406 | |
407 # Append peaks | |
408 r <- nrow(mzi) + 1 | |
409 rows <- r:(r+nrow(peaks)-1) | |
410 mzi[rows, ] <- peaks[colnames(mzi)] | |
411 } | |
412 } | |
413 | |
414 # Sort by M/Z | |
415 sorted_indices <- order(mzi[[MSDB.TAG.MZTHEO]]) | |
416 | |
417 # Group in a data frame | |
418 .self$.mz.index[[mode]] <- mzi[sorted_indices, ] | |
419 } | |
420 | |
421 return(.self$.mz.index[[mode]]) | |
422 }) | |
423 | |
424 ###################### | |
425 # SEARCH FOR MZ & RT # | |
426 ###################### | |
427 | |
428 MsXlsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
429 | |
430 # Search for m/z | |
431 results <- .self$.do.search.for.mz(mode, mz.low, mz.high) | |
432 | |
433 # Filter on attributions | |
434 if ( ! is.null(attribs)) { | |
435 results <- results[results[[MSDB.TAG.ATTR]] %in% attribs, ] | |
436 } | |
437 | |
438 # Filer on molecule IDs | |
439 if ( ! is.null(molids)) { | |
440 results <- results[results[[MSDB.TAG.MOLID]] %in% molids, ] | |
441 } | |
442 | |
443 # Use retention time | |
444 if ( ! is.null(col) && ! is.null(rt.low) && ! is.null(rt.high)) { | |
445 | |
446 # Get list of unique IDs | |
447 ids <- results[[MSDB.TAG.MOLID]] | |
448 ids <- ids[ ! duplicated(ids)] | |
449 rt <- .self$.search.for.rt(mols = ids, rt.low = rt.low, rt.high = rt.high, col = col) | |
450 results <- results[results[[MSDB.TAG.MOLID]] %in% rt[[MSDB.TAG.MOLID]], ] | |
451 results <- merge(results, rt) | |
452 } | |
453 | |
454 return(results) | |
455 }) | |
456 | |
457 ############################## | |
458 # SEARCH FOR M/Z IN MS PEAKS # | |
459 ############################## | |
460 | |
461 MsXlsDb$methods( .do.search.for.mz = function(mode, mz.low, mz.high) { | |
462 | |
463 results <- data.frame(stringsAsFactors = FALSE) | |
464 results[MSDB.TAG.MZTHEO] <- numeric() | |
465 results[MSDB.TAG.MOLID] <- character() | |
466 results[MSDB.TAG.MOLNAMES] <- character() | |
467 results[MSDB.TAG.COMP] <- character() | |
468 results[MSDB.TAG.ATTR] <- character() | |
469 | |
470 # Create m/z index | |
471 mz_index <- .self$.get.mz.index(mode) | |
472 | |
473 # Find molecules | |
474 low_bound <- binary.search(mz.low, mz_index[[MSDB.TAG.MZTHEO]], lower = FALSE) | |
475 high_bound <- binary.search(mz.high, mz_index[[MSDB.TAG.MZTHEO]], lower = TRUE) | |
476 | |
477 # Get results | |
478 if ( ! is.na(high_bound) && ! is.na(low_bound) && low_bound <= high_bound) | |
479 results <- mz_index[low_bound:high_bound,] | |
480 | |
481 # Remove row names | |
482 rownames(results) <- NULL | |
483 | |
484 return(results) | |
485 }) | |
486 | |
487 ################ | |
488 # GET MOL NAME # | |
489 ################ | |
490 | |
491 MsXlsDb$methods( .get.mol.name = function(molid) { | |
492 | |
493 if (is.na(molid)) | |
494 return(NA_character_) | |
495 | |
496 # Find it in memory | |
497 name <- .self$.mem.get(molid, 'name') | |
498 | |
499 if (is.null(name)) { | |
500 | |
501 # Load molecule | |
502 mol <- .self$.load.molecule(molid) | |
503 | |
504 # Look for name in tabs | |
505 for (tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { | |
506 hdr <- mol[[tab]][['header']] | |
507 if ( ! is.null(hdr)) | |
508 name <- hdr[[1]] | |
509 if ( ! is.null(name) && ! is.na(name)) break | |
510 } | |
511 | |
512 # Store in memory | |
513 if (is.null(name)) name <- NA_character_ | |
514 .self$.mem.set(name, molid, 'name') | |
515 } | |
516 | |
517 return(name) | |
518 }) | |
519 | |
520 ################## | |
521 # GET NAME INDEX # | |
522 ################## | |
523 | |
524 # Get name index. | |
525 MsXlsDb$methods( .get.name.index = function() { | |
526 | |
527 if (is.null(.self$.name_index)) { | |
528 | |
529 # Get names | |
530 names <- vapply(.self$getMoleculeIds(), function(id) toupper(.self$getMoleculeName(id)), FUN.VALUE = "") | |
531 | |
532 # Get molecule IDs | |
533 id <- .self$getMoleculeIds() | |
534 | |
535 # Sort by names | |
536 sorted_indices <- order(names) | |
537 | |
538 # Group in a data frame | |
539 .self$.name_index <- data.frame(name = rbind(names)[, sorted_indices], | |
540 id = rbind(id)[, sorted_indices], | |
541 stringsAsFactors = FALSE) | |
542 } | |
543 | |
544 return(.self$.name_index) | |
545 }) | |
546 | |
547 ################## | |
548 # INIT FILE LIST # | |
549 ################## | |
550 | |
551 MsXlsDb$methods( .init.file.list = function() { | |
552 | |
553 if (is.null(.self$.files)) { | |
554 | |
555 # List all files | |
556 files <- Sys.glob(file.path(.self$.db_dir, '*.xls')) | |
557 | |
558 # Limit the size of the database | |
559 if ( ! is.na(.self$.limit)) | |
560 files <- head(files, .self$.limit) | |
561 | |
562 # Get IDs | |
563 ids <- vapply(files, function(f) .extract_molecule_id_from_filename(f), FUN.VALUE = 1) | |
564 | |
565 # Use ids as indices to build the vector of files | |
566 .files <<- rep(NA_character_, max(ids)) | |
567 .files[ids] <<- files | |
568 } | |
569 }) | |
570 | |
571 ################# | |
572 # GET CACHE DIR # | |
573 ################# | |
574 | |
575 MsXlsDb$methods( .get.cache.dir = function() { | |
576 | |
577 if ( ! is.na(.self$.cache_dir) && ! file.exists(.self$.cache_dir)) | |
578 dir.create(.self$.cache_dir) | |
579 | |
580 return(.self$.cache_dir) | |
581 }) | |
582 | |
583 ################# | |
584 # LOAD MOLECULE # | |
585 ################# | |
586 | |
587 MsXlsDb$methods( .load.molecule = function(molid) { | |
588 | |
589 # Init local variables | |
590 mol <- NULL | |
591 cache_file <- NA_character_ | |
592 excel_file <- .self$.get.file(molid) | |
593 | |
594 # Call observers | |
595 if ( ! is.null(.self$.observers)) | |
596 for (obs in .self$.observers) | |
597 obs$progress(paste0("Loading molecule ", molid, "."), level = 2) | |
598 | |
599 # Load from cache | |
600 if ( ! is.na(.self$.get.cache.dir())) { | |
601 cache_file <- file.path(.self$.get.cache.dir(), paste0(molid, '.bin')) | |
602 if (file.exists(cache_file)) | |
603 load(file = cache_file) # load mol variable | |
604 } | |
605 | |
606 # Load from Excel file & write to cache | |
607 if (is.null(mol) && ! is.na(excel_file)) { | |
608 | |
609 source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. | |
610 | |
611 # Load from Excel file | |
612 for(tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { | |
613 | |
614 # Test that tab exists | |
615 if (.self$.tab.exists(excel_file, tab)) { | |
616 header <- read.excel(excel_file, tab, start.row = 1, end.row = .XLS_PEAKS_ROW_OFFSET - 1, header = FALSE, stringsAsFactors = FALSE, trim.values = TRUE, col.index = c(1))[[1]] | |
617 peaks <- read.excel(excel_file, tab, start.row = .XLS_PEAKS_ROW_OFFSET) | |
618 mol[[tab]] <- list(header = header, peaks = peaks) | |
619 } | |
620 | |
621 # Missing tab | |
622 else { | |
623 for (obs in .self$.observers) | |
624 obs$warning(paste0("No excel tab ", tab, " in file ", excel_file, ".")) | |
625 } | |
626 } | |
627 | |
628 # Write in cache | |
629 if ( ! is.na(cache_file)) { | |
630 | |
631 # Call observers | |
632 if ( ! is.null(.self$.observers)) | |
633 for (obs in .self$.observers) | |
634 obs$progress(paste0("Caching file ", excel_file, ".")) | |
635 | |
636 save(mol, file = cache_file) | |
637 } | |
638 } | |
639 | |
640 return(mol) | |
641 }) | |
642 | |
643 ######################## | |
644 # DOES EXCEL TAB EXIST # | |
645 ######################## | |
646 | |
647 MsXlsDb$methods( .tab.exists = function(file, tab) { | |
648 | |
649 source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. | |
650 | |
651 if ( ! tab.exists(file, tab)) { | |
652 | |
653 # Warn observers | |
654 for (obs in .self$.observers) | |
655 obs$warning(paste0("No excel tab ", tab, " in file ", file, ".")) | |
656 | |
657 return(FALSE) | |
658 } | |
659 | |
660 return(TRUE) | |
661 }) | |
662 | |
663 ######################### | |
664 # PARSE RETENTION TIMES # | |
665 ######################### | |
666 | |
667 MsXlsDb$methods( .parse_retention_times = function(id, tab) { | |
668 | |
669 rt <- NULL | |
670 | |
671 if (.self$.tab.exists(.self$.get.file(id), tab)) { | |
672 peaks <- read.excel(.self$.get.file(id), tab, start.row = .XLS_PEAKS_ROW_OFFSET) | |
673 | |
674 # Get retention times | |
675 if ( ! is.null(peaks) && length(peaks) > 0 && ! is.na(peaks[[1]][[1]])) | |
676 for (c in .XLS_PEAKS_RT_COL_START:length(names(peaks))) | |
677 if ( ! is.na(peaks[[c]][[1]])) { | |
678 | |
679 # Check retention times of all different m/z peaks for the same column. | |
680 .self$.check_retention_times(id, tab, names(peaks)[[c]], peaks[[c]], sum( ! is.na(peaks[[1]]))) | |
681 | |
682 # Add retention time | |
683 # TODO The column names are transformed through the read.xlsx call. For instance: | |
684 # HPLC (C18) 25mn QTOF (Bis) --> HPLC..C18..25mn.QTOF..Bis. | |
685 # ZICpHILIC 150*5*2.1 Shimadzu-Exactive-42mn --> ZICpHILIC.150.5.2.1.Shimadzu.Exactive.42mn | |
686 # This can be an issue, since we loose the formating. | |
687 col_id <- names(peaks)[[c]] | |
688 time <- peaks[[c]][[1]] * 60 # Read and convert retention time in seconds. | |
689 if (is.null(rt) || ! col_id %in% names(rt)) | |
690 rt[[col_id]] <- list(time) | |
691 else | |
692 rt[[col_id]] <- c(rt[[col_id]], time) | |
693 } | |
694 } | |
695 | |
696 return(rt) | |
697 }) | |
698 | |
699 ######################### | |
700 # CHECK RETENTION TIMES # | |
701 ######################### | |
702 | |
703 MsXlsDb$methods( .check_retention_times = function(id, tab_name, column_name, rt, n) { | |
704 | |
705 if (n >= 1 && ! is.null(.self$.observers) && length(.self$.observers) > 0) | |
706 | |
707 # Check column only if there is at least one value inside | |
708 if (sum( ! is.na(rt)) > 0) | |
709 | |
710 # Loop on all values | |
711 for(i in 1:n) { | |
712 | |
713 # Check that it's defined | |
714 if (i > 1 && is.na(rt[[i]])) | |
715 for (obs in .self$.observers) | |
716 obs$warning(paste0("Retention times undefined for column ", column_name, " at row ", i + .XLS_PEAKS_ROW_OFFSET, " of tab ", tab_name, " in file ", .self$.get.file(id), ".")) | |
717 | |
718 else if (i > 1) | |
719 # Check the value (it must be constant) | |
720 if (rt[[i-1]] != rt[[i]]) | |
721 for (obs in .self$.observers) | |
722 obs$error(paste0("Retention times not constant for column ", column_name, " between row ", i - 1 + .XLS_PEAKS_ROW_OFFSET, " and row ", i + .XLS_PEAKS_ROW_OFFSET, "o tab", tab_name, "in file", .self$.get.file(id))) | |
723 } | |
724 }) | |
725 | |
726 #################### | |
727 # GET FILE FROM ID # | |
728 #################### | |
729 | |
730 MsXlsDb$methods( .get.file = function(id) { | |
731 | |
732 # List files | |
733 .self$.init.file.list() | |
734 | |
735 return( if (id > 0 && id <= length(.self$.files)) .self$.files[id] else NA_character_) | |
736 }) | |
737 | |
738 ########### | |
739 # MEM GET # | |
740 ########### | |
741 | |
742 # Get database data from memory | |
743 MsXlsDb$methods( .mem.get = function(molid, field, second.field = NA_character_) { | |
744 | |
745 data <- .self$.db[[as.character(molid)]][[field]] | |
746 | |
747 if ( ! is.na(second.field)) | |
748 data <- data[[second.field]] | |
749 | |
750 return(data) | |
751 }) | |
752 | |
753 ########### | |
754 # MEM SET # | |
755 ########### | |
756 | |
757 # Set database data into memory | |
758 MsXlsDb$methods( .mem.set = function(data, molid, field, second.field = NA_character_) { | |
759 | |
760 id <- as.character(molid) | |
761 | |
762 # Create db | |
763 if (is.null(.self$.db)) | |
764 .db <<- list() | |
765 | |
766 # Create first level | |
767 if (is.null(.self$.db[[id]])) | |
768 .self$.db[[id]] <- list() | |
769 | |
770 # Create second level | |
771 if ( ! is.na(second.field) && is.null(.self$.db[[id]][[field]])) | |
772 .self$.db[[id]][[field]] <- list() | |
773 | |
774 # Store data | |
775 if (is.na(second.field)) { | |
776 .self$.db[[id]][[field]] <- data | |
777 } else { | |
778 .self$.db[[id]][[field]][[second.field]] <- data | |
779 } | |
780 }) | |
781 | |
782 ################# | |
783 # SEARCH FOR RT # | |
784 ################# | |
785 | |
786 # Find molecules matching a certain retention time. | |
787 # col A list of chromatographic columns to use. | |
788 # rt.low The lower bound of the rt value. | |
789 # rt.high The higher bound of the rt value. | |
790 # mols A list of molecule IDs to process. If unset, then take all molecules. | |
791 # Return a data frame with the following columns: id, col, colrt. | |
792 MsXlsDb$methods( .search.for.rt = function(col, rt.low, rt.high, mols = NULL) { | |
793 | |
794 # Use all molecules if no list is provided | |
795 if (is.null(mols)) | |
796 mols <- .self$getMoleculeIds() | |
797 | |
798 results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE) | |
799 | |
800 # Loop on all molecules | |
801 for (molid in mols) { | |
802 no.col <- TRUE | |
803 for (c in col) { | |
804 molrts <- .self$getRetentionTimes(molid, c) | |
805 if ( ! is.null(molrts)) { | |
806 no.col <- FALSE | |
807 for (molrt in molrts) { | |
808 if (molrt >= rt.low && molrt <= rt.high) { | |
809 r <- nrow(results) + 1 | |
810 results[r, ] <- c(id = molid, col = c, colrt = molrt) | |
811 } | |
812 } | |
813 } | |
814 } | |
815 | |
816 if (no.col) { | |
817 r <- nrow(results) + 1 | |
818 results[r, c(MSDB.TAG.MOLID)] <- c(id = molid) | |
819 } | |
820 } | |
821 | |
822 return(results) | |
823 }) | |
824 | |
825 ############################ | |
826 # EXTRACT ID FROM FILENAME # | |
827 ############################ | |
828 | |
829 .extract_molecule_id_from_filename <- function(filename) { | |
830 | |
831 id <- NA_integer_ | |
832 | |
833 if ( ! is.na(filename)) { | |
834 g <- str_match(filename, "N(\\d+)[._-]") | |
835 if ( ! is.na(g[1,1])) | |
836 id <- as.numeric(g[1,2]) | |
837 } | |
838 | |
839 return(id) | |
840 } | |
841 | |
842 } # end of load safe guard |