comparison MsDb.R @ 6:f86fec07f392 draft default tip

planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author prog
date Fri, 22 Feb 2019 16:04:22 -0500
parents fb9c0409d85c
children
comparison
equal deleted inserted replaced
5:fb9c0409d85c 6:f86fec07f392
1 if ( ! exists('MsDb')) { # Do not load again if already loaded
2
3 library('methods')
4 source('msdb-common.R')
5 source('MsDbObserver.R')
6 source('MsDbOutputStream.R')
7
8 #####################
9 # CLASS DECLARATION #
10 #####################
11
12 MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character", .rt.unit = "character"))
13
14 ###############
15 # CONSTRUCTOR #
16 ###############
17
18 MsDb$methods( initialize = function(...) {
19
20 callSuper(...)
21
22 .observers <<- NULL
23 .output.streams <<- NULL
24 .input.stream <<- NULL
25 .prec <<- MSDB.DFT.PREC
26 .mz.tol.unit <<- MSDB.DFT.MZTOLUNIT
27 .rt.unit <<- MSDB.RTUNIT.SEC
28 })
29
30 ####################
31 # SET INPUT STREAM #
32 ####################
33
34 MsDb$methods( setInputStream = function(stream) {
35
36 # Check types of input stream
37 if ( ! inherits(stream, "MsDbInputStream") && ! is.null(stream))
38 stop("Input stream must inherit from MsDbInputStream class.")
39
40 # Save current stream
41 cur.stream <- .self$.input.stream
42
43 # Set stream
44 .input.stream <<- stream
45
46 return(cur.stream)
47 })
48
49 ######################
50 # ADD OUTPUT STREAMS #
51 ######################
52
53 MsDb$methods( addOutputStreams = function(stream) {
54
55 # Check types of output streams
56 if ( ( ! is.list(stream) && ! inherits(stream, "MsDbOutputStream")) || (is.list(stream) && any( ! vapply(stream, function(s) inherits(s, "MsDbOutputStream"), FUN.VALUE = TRUE))))
57 stop("Output streams must inherit from MsDbOutputStream class.")
58
59 # Add streams to current list
60 .output.streams <<- if (is.null(.self$.output.streams)) c(stream) else c(.self$.output.streams, stream)
61 })
62
63 #########################
64 # REMOVE OUTPUT STREAMS #
65 #########################
66
67 MsDb$methods( removeOutputStreams = function(stream) {
68
69 # Check types of output streams
70 if ( ( ! is.list(stream) && ! inherits(stream, "MsDbOutputStream")) || (is.list(stream) && any( ! vapply(stream, function(s) inherits(s, "MsDbOutputStream"), FUN.VALUE = TRUE))))
71
72 # Remove streams from current list
73 .output.streams <<- .self$.output.streams[ ! stream %in% .self$.output.streams]
74 })
75
76 ########################
77 # RESET OUTPUT STREAMS #
78 ########################
79
80 MsDb$methods( resetOutputStreams = function(stream) {
81 .output.streams <<- NULL
82 })
83
84 #################
85 # ADD OBSERVERS #
86 #################
87
88 MsDb$methods( addObservers = function(obs) {
89
90 # Check types of observers
91 if ( ( ! is.list(obs) && ! inherits(obs, "MsDbObserver")) || (is.list(obs) && any( ! vapply(obs, function(o) inherits(o, "MsDbObserver"), FUN.VALUE = TRUE))))
92 stop("Observers must inherit from MsDbObserver class.")
93
94 # Add observers to current list
95 .observers <<- if (is.null(.self$.observers)) c(obs) else c(.self$.observers, obs)
96 })
97
98 ##################
99 # SET PRECURSORS #
100 ##################
101
102 MsDb$methods( setPrecursors = function(prec) {
103 .prec <<- prec
104 })
105
106 #################
107 # SET DB FIELDS #
108 #################
109
110 MsDb$methods( areDbFieldsSettable = function() {
111 return(FALSE)
112 })
113
114 MsDb$methods( setDbFields = function(fields) {
115 stop("Method setDbFields() not implemented in concrete class.")
116 })
117
118 ################
119 # SET MS MODES #
120 ################
121
122 MsDb$methods( areDbMsModesSettable = function() {
123 return(FALSE)
124 })
125
126 MsDb$methods( setDbMsModes = function(modes) {
127 stop("Method setDbMsModes() not implemented in concrete class.")
128 })
129
130 ###################
131 # SET MZ TOL UNIT #
132 ###################
133
134 MsDb$methods( setMzTolUnit = function(mztolunit) {
135
136 if ( ! mztolunit %in% MSDB.MZTOLUNIT.VALS)
137 stop(paste0("M/Z tolerance unit must be one of: ", paste(MSDB.MZTOLUNIT.VALS, collapse = ', '), "."))
138
139 .mz.tol.unit <<- mztolunit
140 })
141
142 ###############
143 # SET RT UNIT #
144 ###############
145
146 MsDb$methods( setRtUnit = function(unit) {
147
148 if ( ! unit %in% MSDB.RTUNIT.VALS)
149 stop(paste0("RT unit must be one of: ", paste(MSDB.RTUNIT.VALS, collapse = ', '), "."))
150
151 .rt.unit <<- unit
152 })
153
154 ###############
155 # GET RT UNIT #
156 ###############
157
158 MsDb$methods( getRtUnit = function(unit) {
159 return(.self$.rt.unit)
160 })
161
162 ####################
163 # HANDLE COMPOUNDS #
164 ####################
165
166 # Returns TRUE if this database handles compounds directly (by IDs)
167 MsDb$methods( handleCompounds = function() {
168 return(TRUE)
169 })
170
171 ####################
172 # GET MOLECULE IDS #
173 ####################
174
175 # Returns an integer vector of all molecule IDs stored inside the database.
176 MsDb$methods( getMoleculeIds = function(max.results = NA_integer_) {
177 stop("Method getMoleculeIds() not implemented in concrete class.")
178 })
179
180 ####################
181 # GET NB MOLECULES #
182 ####################
183
184 # Returns the number of molecules in the database.
185 MsDb$methods( getNbMolecules = function() {
186 stop("Method getNbMolecules() not implemented in concrete class.")
187 })
188
189 #################
190 # GET MZ VALUES #
191 #################
192
193 # Returns a numeric vector of all masses stored inside the database.
194 MsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
195 stop("Method getMzValues() not implemented in concrete class.")
196 })
197
198 #####################
199 # GET MOLECULE NAME #
200 #####################
201
202 # Get molecule names
203 # molid An integer vector of molecule IDs.
204 # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector.
205 MsDb$methods( getMoleculeName = function(molid) {
206 stop("Method getMoleculeName() not implemented in concrete class.")
207 })
208
209 ###############################
210 # GET CHROMATOGRAPHIC COLUMNS #
211 ###############################
212
213 # Get chromatographic columns.
214 # Returns a vector of character listing the chromatographic column names. The name must be formatted in lowercase as following: uplc(-c8)?(-20min)?.
215 MsDb$methods( getChromCol = function(molid = NULL) {
216 stop("Method getChromCol() not implemented in concrete class.")
217 })
218
219 ################
220 # FIND BY NAME #
221 ################
222
223 # Find a molecule by name
224 # name A vector of molecule names to search for.
225 # Return an integer vector of the same size as the name input vector, containing the found molecule IDs, in the same order.
226 MsDb$methods( findByName = function(name) {
227 stop("Method findByName() not implemented in concrete class.")
228 })
229
230 #######################
231 # GET RETENTION TIMES #
232 #######################
233
234 # Get the retention times of a molecule.
235 # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list.
236 MsDb$methods( getRetentionTimes = function(molid, col = NA_character_) {
237 stop("Method getRetentionTimes() not implemented in concrete class.")
238 })
239
240 ################
241 # GET NB PEAKS #
242 ################
243
244 # Get the total number of MS peaks stored inside the database.
245 # molid The ID of the molecule.
246 # type The MS type.
247 MsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {
248 stop("Method getNbPeaks() not implemented in concrete class.")
249 })
250
251 ##################
252 # GET PEAK TABLE #
253 ##################
254
255 MsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) {
256 stop("Method getPeakTable() not implemented in concrete class.")
257 })
258
259 ##########
260 # SEARCH #
261 ##########
262
263 # Find molecule MS peaks whose m/z matches the submitted m/z in the tolerance specified.
264 # mode The mode to use: either MSDB.TAG.POS or MSDB.TAG.NEG.
265 # shift The m/z shift to use, in ppm.
266 # prec The m/z precision to use, in ppm.
267 # col The chromatographic column used.
268 # rt.tol Simple retention tolerance parameter: rtinf = rt - rt.tol and rtsup = rt + rt.tol
269 # rt.tol.x Tolerance parameter for the equations : rtinf = rt - rt.tol.x - rt ^ rt.tol.y and rtsup = rt + rt.tol.x + rt ^ rt.tol.y
270 # rt.tol.y Tolerance parameter. See rt.tol.x parameter.
271 # attribs Only search for peaks whose attribution is among this set of attributions.
272 # molids Only search for peaks whose molecule ID is among this vector of integer molecule IDs. Can also be a data frame with a retention time column x.colnames$rt and a molecule ID column MSDB.TAG.MOLID.
273 # molids.rt.tol Retention time tolerance used when molids parameter is a data frame (rt, id)
274 # precursor.match Remove peaks whose molecule precursor peak has not also been matched.
275 # precursor.rt.tol
276 # Returns a data frame, listing m/z values provided in input. Several matches can be found for an m/z value, in which case several lines (the same number as the number of matches found) with the same m/z value repeated will be inserted. The m/z values will be listed in the same order as in the input. The columns of the data.frame are: mz, rt (only if present in the input), id, mztheo, col, colrt, composition, attribution.
277 MsDb$methods( searchForMzRtList = function(x = NULL, mode, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = NULL, molids.rt.tol = NULL, attribs = NULL, precursor.match = FALSE, precursor.rt.tol = NULL, same.cols = FALSE, same.rows = FALSE, peak.table = FALSE) {
278
279 # Use provided data frame
280 old.input <- NULL
281 tmp.output <- NULL
282 if ( ! is.null(x)) {
283 tmp.input <- MsDbInputDataFrameStream$new(df = x)
284 tmp.output <- MsDbOutputDataFrameStream$new()
285 old.input <- .self$setInputStream(tmp.input)
286 .self$addOutputStreams(tmp.output)
287 }
288
289 if (precursor.match) {
290 # Get IDs of all molecules whose precursor peak matches one of the mz in the list
291 precursors.df <- .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, attribs = .self$.prec[[mode]], output.to.stream = FALSE)
292 cols.to.keep <- if (is.null(col)) MSDB.TAG.MOLID else c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)
293 precursors.ids <- precursors.df[, cols.to.keep, drop = FALSE]
294 precursors.ids <- precursors.ids[ ! is.na(precursors.ids[[MSDB.TAG.MOLID]]), , drop = FALSE]
295 precursors.ids <- precursors.ids[ ! duplicated(precursors.ids), ]
296
297 # Get all matching peaks whose molecule is inside the previously obtained list of molecules
298 df <- .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = precursors.ids, molids.rt.tol = precursor.rt.tol, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table)
299 # TODO
300 #
301 # peaks <- if (peak.table) results[['peaks']] else results
302 #
303 # # Merge results with the column/rt found for precursors.
304 # if ( ! is.null(col) && ! is.null(peaks)) {
305 # precursors.ids <- precursors.df[, c(MSDB.TAG.MOLID, MSDB.TAG.col, MSDB.TAG.COLRT)]
306 # precursors.ids <- precursors.ids[ ! is.na(precursors.ids[[MSDB.TAG.MOLID]]), ]
307 #
308 # # Get rows where ID is NA
309 # peaks.na <- peaks[is.na(peaks[[MSDB.TAG.MOLID]]), ]
310 #
311 # # Get rows where ID is found (i.e.: not NA)
312 # peaks <- peaks[, !(colnames(peaks) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] # drop col and colrt columns
313 # peaks.not.na <- peaks[! is.na(peaks[[MSDB.TAG.MOLID]]), ]
314 #
315 # # Add col and colrt values to found peaks
316 # peaks <- merge(peaks.not.na, precursors.ids, by = MSDB.TAG.MOLID)
317 #
318 # # Put back unfound peaks
319 # peaks <- rbind(peaks, peaks.na)
320 #
321 # # Sort
322 # peaks <- peaks[order(peaks[[x.colnames$mz]], peaks[[x.colnames$rt]], peaks[[MSDB.TAG.MOLID]], peaks[[MSDB.TAG.COL]]), ]
323 #
324 # # Remove rownames
325 # rownames(peaks) <- NULL
326 #
327 # # Reorder columns
328 # peaks <- peaks[unlist(.self$.output.fields[names(.PEAK.TABLE.COLS)])]
329 # }
330 #
331 # # Remove duplicates
332 # if ( ! is.null(peaks))
333 # peaks <- peaks[ ! duplicated(peaks), ]
334 #
335 # if (peak.table)
336 # results[['peaks']] <- peaks
337 # else
338 # results <- peaks
339 #
340 # return(results)
341 }
342 else
343 .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, molids = molids, molids.rt.tol = molids.rt.tol, attribs = attribs, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table)
344
345 if ( ! is.null(x)) {
346 results <- tmp.output$getDataFrame()
347 .self$removeOutputStreams(tmp.output)
348 .self$setInputStream(old.input)
349 return(results)
350 }
351 })
352
353 MsDb$methods( .doSearchForMzRtList = function(mode, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = NULL, molids.rt.tol = NULL, attribs = NULL, same.cols = FALSE, same.rows = FALSE, peak.table = FALSE, output.to.stream = TRUE) {
354
355 # # Choose columns to keep from x
356 # x.cols <- if (same.cols) colnames(x) else intersect(if (is.null(col)) c(x.colnames$mz) else c(x.colnames$mz, x.colnames$rt), colnames(x))
357 #
358 # # Create a peak fake data frame for defining columns
359 # peaks.fake <- data.frame(stringsAsFactors = FALSE)
360 # for (field in names(.PEAK.TABLE.COLS))
361 # if ( ! is.null(col) || ! field %in% .RT.MATCHING.COLS)
362 # peaks.fake[.self$.output.fields[[field]]] <- vector(mode = .PEAK.TABLE.COLS[[field]], length = 0)
363 #
364 # # Initialize y data frame, so when x contains no rows an empty y data frame is returned with all the columns set with right type.
365 # if (same.rows) {
366 # y <- peaks.fake[, if (is.null(col)) c(MSDB.TAG.MZ) else c(MSDB.TAG.MZ, MSDB.TAG.RT), drop = FALSE]
367 # y[MSDB.TAG.MSMATCHING] <- character()
368 # }
369 # else
370 # y <- peaks.fake
371 # y <- cbind(y, x[NULL, ! x.cols %in% colnames(y), drop = FALSE])
372 # if (peak.table) {
373 # z <- peaks.fake
374 # z <- cbind(z, x[NULL, ! x.cols %in% colnames(z), drop = FALSE])
375 # }
376
377 # Loop on all lines of input
378 peaks <- NULL
379 .self$.input.stream$reset()
380 while (.self$.input.stream$hasNextValues()) {
381
382 .self$.input.stream$nextValues()
383
384 # Search for m/z
385 results <- .self$searchForMzRtTols(mode = mode, mz = .self$.input.stream$getMz(), shift = shift, prec = prec, rt = .self$.input.stream$getRt(), col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, attribs = attribs, molids = molids, molids.rt.tol = molids.rt.tol)
386
387 # Call output streams
388 if (output.to.stream && ! is.null(.self$.output.streams))
389 for (s in .self$.output.streams)
390 s$matchedPeaks(mz = .self$.input.stream$getMz(), rt = if (is.null(col)) NULL else .self$.input.stream$getRt(), peaks = results, unused = .self$.input.stream$getAll(but = if (is.null(col)) c(MSDB.TAG.MZ) else c(MSDB.TAG.MZ, MSDB.TAG.RT)))
391
392 # Append to peak list
393 peaks <- rbind(peaks, results)
394
395 # # Add results to output
396 # r <- nrow(y) + 1
397 # x.lines <- x[i, x.cols, drop = FALSE]
398 # x.lines <- rename.col(x.lines, unlist(x.colnames), unlist(.self$.output.fields[names(x.colnames)]))
399 # if (nrow(results) == 0) {
400 # y[r, colnames(x.lines)] <- x.lines
401 # }
402 # else {
403 # if (same.rows) {
404 # y[r, colnames(x.lines)] <- x.lines
405 # ids <- results[[MSDB.TAG.MOLID]]
406 # ids <- ids[ ! duplicated(ids)] # Remove duplicated values
407 # y[r, MSDB.TAG.msmatching] <- paste(ids, collapse = .self$.molids.sep)
408 # }
409 # if ( ! same.rows || peak.table) {
410 # new.rows <- cbind(x.lines, results, row.names = NULL)
411 # if ( ! same.rows) {
412 # rows <- r:(r+nrow(results)-1)
413 # y[rows, colnames(new.rows)] <- new.rows
414 # }
415 # if (peak.table) {
416 # zr <- nrow(z) + 1
417 # zrows <- zr:(zr+nrow(results)-1)
418 # z[zrows, colnames(new.rows)] <- new.rows
419 # }
420 # }
421 # }
422 }
423
424 # results <- if (peak.table) list(main = y, peaks = z) else y
425
426 # return(results)
427 return(peaks)
428 })
429
430 # rt Retention time in seconds.
431 # molids An option vector of molecule IDs, used to restrict the search.
432 MsDb$methods( searchForMzRtTols = function(mode, mz, rt = NULL, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, attribs = NULL, molids = NULL, molids.rt.tol = NULL, colnames = MSDB.DFT.INPUT.FIELDS) {
433
434 # Set M/Z bounds
435 if (.self$.mz.tol.unit == MSDB.MZTOLUNIT.PPM) {
436 mz.low <- mz * (1 + (- shift - prec) * 1e-6)
437 mz.high <- mz * (1 + (- shift + prec) * 1e-6)
438 }
439 else { # PLAIN
440 mz.low <- mz - shift - prec
441 mz.high <- mz - shift + prec
442 }
443
444 # Set retention time bounds
445 rt.low <- NULL
446 rt.high <- NULL
447 if ( ! is.null(rt.tol)) {
448 low <- rt - rt.tol
449 high <- rt + rt.tol
450 rt.low <- if (is.null(rt.low)) low else max(low, rt.low)
451 rt.high <- if (is.null(rt.high)) high else min(high, rt.high)
452 }
453 if ( ! is.null(rt.tol.x)) {
454 low <- rt - rt.tol.x - rt ^ rt.tol.y
455 high <- rt + rt.tol.x + rt ^ rt.tol.y
456 rt.low <- if (is.null(rt.low)) low else max(low, rt.low)
457 rt.high <- if (is.null(rt.high)) high else min(high, rt.high)
458 }
459
460 # List molecule IDs
461 if ( ! is.null(molids.rt.tol) && is.data.frame(molids)) {
462 ids <- molids[(rt >= molids[[MSDB.TAG.COLRT]] - molids.rt.tol) & (rt <= molids[[MSDB.TAG.COLRT]] + molids.rt.tol), MSDB.TAG.MOLID]
463 if (length(ids) == 0)
464 # No molecule ID match for this retention time
465 return(data.frame()) # return empty result set
466 } else {
467 ids <- molids
468 }
469
470 return(.self$searchForMzRtBounds(mode,
471 mz.low = mz * (1 + (- shift - prec) * 1e-6),
472 mz.high = mz * (1 + (- shift + prec) * 1e-6),
473 rt.low = rt.low,
474 rt.high = rt.high,
475 col = col,
476 attribs = attribs,
477 molids = ids))
478 })
479
480 # rt.low Lower bound of the retention time in seconds.
481 # rt.high Higher bound of the retention time in seconds.
482 MsDb$methods( searchForMzRtBounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {
483
484 results <- .self$.do.search.for.mz.rt.bounds(mode = mode, mz.low = mz.low, mz.high = mz.high, rt.low = rt.low, rt.high = rt.high, col = col, attribs = attribs, molids = molids)
485
486 return(results)
487 })
488
489 # TODO Write description of output: data frame with which columns ?
490 MsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {
491 stop("Method .do.search.for.mz.rt.bounds() not implemented in concrete class.")
492 })
493
494 # DEPRECATED
495 MsDb$methods( searchForMz = function(x, mode, tol = 5, col = NULL, rt.tol.x = 5, rt.tol.y = 0.80) {
496 warning("Method searchForMz() is deprecated. Use searchForMzRtList() instead.")
497 .self$searchForMzRtList(x = x, mode = mode, prec = tol, col = col, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y)
498 })
499
500 } # end of load safe guard