Mercurial > repos > prog > lcmsmatching
comparison MsDb.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author | prog |
---|---|
date | Tue, 12 Jul 2016 12:02:37 -0400 |
parents | |
children | 20d69a062da3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e66bb061af06 |
---|---|
1 if ( ! exists('MsDb')) { # Do not load again if already loaded | |
2 | |
3 library('methods') | |
4 source('msdb-common.R') | |
5 source('MsDbObserver.R') | |
6 source('MsDbOutputStream.R') | |
7 | |
8 ##################### | |
9 # CLASS DECLARATION # | |
10 ##################### | |
11 | |
12 MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character")) | |
13 | |
14 ############### | |
15 # CONSTRUCTOR # | |
16 ############### | |
17 | |
18 MsDb$methods( initialize = function(...) { | |
19 | |
20 .observers <<- NULL | |
21 .output.streams <<- NULL | |
22 .input.stream <<- NULL | |
23 .prec <<- MSDB.DFT.PREC | |
24 .mz.tol.unit <<- MSDB.DFT.MZTOLUNIT | |
25 | |
26 callSuper(...) | |
27 }) | |
28 | |
29 #################### | |
30 # SET INPUT STREAM # | |
31 #################### | |
32 | |
33 MsDb$methods( setInputStream = function(stream) { | |
34 | |
35 # Check types of input stream | |
36 if ( ! inherits(stream, "MsDbInputStream") && ! is.null(stream)) | |
37 stop("Input stream must inherit from MsDbInputStream class.") | |
38 | |
39 # Save current stream | |
40 cur.stream <- .self$.input.stream | |
41 | |
42 # Set stream | |
43 .input.stream <<- stream | |
44 | |
45 return(cur.stream) | |
46 }) | |
47 | |
48 ###################### | |
49 # ADD OUTPUT STREAMS # | |
50 ###################### | |
51 | |
52 MsDb$methods( addOutputStreams = function(stream) { | |
53 | |
54 # Check types of output streams | |
55 if ( ( ! is.list(stream) && ! inherits(stream, "MsDbOutputStream")) || (is.list(stream) && any( ! vapply(stream, function(s) inherits(s, "MsDbOutputStream"), FUN.VALUE = TRUE)))) | |
56 stop("Output streams must inherit from MsDbOutputStream class.") | |
57 | |
58 # Add streams to current list | |
59 .output.streams <<- if (is.null(.self$.output.streams)) c(stream) else c(.self$.output.streams, stream) | |
60 }) | |
61 | |
62 ######################### | |
63 # REMOVE OUTPUT STREAMS # | |
64 ######################### | |
65 | |
66 MsDb$methods( removeOutputStreams = function(stream) { | |
67 | |
68 # Check types of output streams | |
69 if ( ( ! is.list(stream) && ! inherits(stream, "MsDbOutputStream")) || (is.list(stream) && any( ! vapply(stream, function(s) inherits(s, "MsDbOutputStream"), FUN.VALUE = TRUE)))) | |
70 | |
71 # Remove streams from current list | |
72 .output.streams <<- .self$.output.streams[ ! stream %in% .self$.output.streams] | |
73 }) | |
74 | |
75 ######################## | |
76 # RESET OUTPUT STREAMS # | |
77 ######################## | |
78 | |
79 MsDb$methods( resetOutputStreams = function(stream) { | |
80 .output.streams <<- NULL | |
81 }) | |
82 | |
83 ################# | |
84 # ADD OBSERVERS # | |
85 ################# | |
86 | |
87 MsDb$methods( addObservers = function(obs) { | |
88 | |
89 # Check types of observers | |
90 if ( ( ! is.list(obs) && ! inherits(obs, "MsDbObserver")) || (is.list(obs) && any( ! vapply(obs, function(o) inherits(o, "MsDbObserver"), FUN.VALUE = TRUE)))) | |
91 stop("Observers must inherit from MsDbObserver class.") | |
92 | |
93 # Add observers to current list | |
94 .observers <<- if (is.null(.self$.observers)) c(obs) else c(.self$.observers, obs) | |
95 }) | |
96 | |
97 ################## | |
98 # SET PRECURSORS # | |
99 ################## | |
100 | |
101 MsDb$methods( setPrecursors = function(prec) { | |
102 .prec <<- prec | |
103 }) | |
104 | |
105 ################# | |
106 # SET DB FIELDS # | |
107 ################# | |
108 | |
109 MsDb$methods( areDbFieldsSettable = function() { | |
110 return(FALSE) | |
111 }) | |
112 | |
113 MsDb$methods( setDbFields = function(fields) { | |
114 stop("Method setDbFields() not implemented in concrete class.") | |
115 }) | |
116 | |
117 ################ | |
118 # SET MS MODES # | |
119 ################ | |
120 | |
121 MsDb$methods( areDbMsModesSettable = function() { | |
122 return(FALSE) | |
123 }) | |
124 | |
125 MsDb$methods( setDbMsModes = function(modes) { | |
126 stop("Method setDbMsModes() not implemented in concrete class.") | |
127 }) | |
128 | |
129 MsDb$methods( setMzTolUnit = function(mztolunit) { | |
130 | |
131 if ( ! mztolunit %in% MSDB.MZTOLUNIT.VALS) | |
132 stop(paste0("M/Z tolerance unit must be one of: ", paste(MSDB.MZTOLUNIT.VALS, collapse = ', '), ".")) | |
133 | |
134 .mz.tol.unit <<- mztolunit | |
135 }) | |
136 | |
137 #################### | |
138 # GET MOLECULE IDS # | |
139 #################### | |
140 | |
141 # Returns an integer vector of all molecule IDs stored inside the database. | |
142 MsDb$methods( getMoleculeIds = function() { | |
143 stop("Method getMoleculeIds() not implemented in concrete class.") | |
144 }) | |
145 | |
146 #################### | |
147 # GET NB MOLECULES # | |
148 #################### | |
149 | |
150 # Returns the number of molecules in the database. | |
151 MsDb$methods( getNbMolecules = function() { | |
152 stop("Method getNbMolecules() not implemented in concrete class.") | |
153 }) | |
154 | |
155 ################# | |
156 # GET MZ VALUES # | |
157 ################# | |
158 | |
159 # Returns a numeric vector of all masses stored inside the database. | |
160 MsDb$methods( getMzValues = function(mode = NULL) { | |
161 stop("Method getMzValues() not implemented in concrete class.") | |
162 }) | |
163 | |
164 ##################### | |
165 # GET MOLECULE NAME # | |
166 ##################### | |
167 | |
168 # Get molecule names | |
169 # molid An integer vector of molecule IDs. | |
170 # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector. | |
171 MsDb$methods( getMoleculeName = function(molid) { | |
172 stop("Method getMoleculeName() not implemented in concrete class.") | |
173 }) | |
174 | |
175 ############################### | |
176 # GET CHROMATOGRAPHIC COLUMNS # | |
177 ############################### | |
178 | |
179 # Get chromatographic columns. | |
180 # Returns a vector of character listing the chromatographic column names. The name must be formatted in lowercase as following: uplc(-c8)?(-20min)?. | |
181 MsDb$methods( getChromCol = function(molid = NULL) { | |
182 stop("Method getChromCol() not implemented in concrete class.") | |
183 }) | |
184 | |
185 ################ | |
186 # FIND BY NAME # | |
187 ################ | |
188 | |
189 # Find a molecule by name | |
190 # name A vector of molecule names to search for. | |
191 # Return an integer vector of the same size as the name input vector, containing the found molecule IDs, in the same order. | |
192 MsDb$methods( findByName = function(name) { | |
193 stop("Method findByName() not implemented in concrete class.") | |
194 }) | |
195 | |
196 ####################### | |
197 # GET RETENTION TIMES # | |
198 ####################### | |
199 | |
200 # Get the retention times of a molecule. | |
201 # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list. | |
202 MsDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
203 stop("Method getRetentionTimes() not implemented in concrete class.") | |
204 }) | |
205 | |
206 ################ | |
207 # GET NB PEAKS # | |
208 ################ | |
209 | |
210 # Get the total number of MS peaks stored inside the database. | |
211 # molid The ID of the molecule. | |
212 # type The MS type. | |
213 MsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
214 stop("Method getNbPeaks() not implemented in concrete class.") | |
215 }) | |
216 | |
217 ################## | |
218 # GET PEAK TABLE # | |
219 ################## | |
220 | |
221 MsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_){ | |
222 stop("Method getPeakTable() not implemented in concrete class.") | |
223 }) | |
224 | |
225 ########## | |
226 # SEARCH # | |
227 ########## | |
228 | |
229 # Find molecule MS peaks whose m/z matches the submitted m/z in the tolerance specified. | |
230 # mode The mode to use: either MSDB.TAG.POS or MSDB.TAG.NEG. | |
231 # shift The m/z shift to use, in ppm. | |
232 # prec The m/z precision to use, in ppm. | |
233 # col The chromatographic column used. | |
234 # rt.tol Simple retention tolerance parameter: rtinf = rt - rt.tol and rtsup = rt + rt.tol | |
235 # rt.tol.x Tolerance parameter for the equations : rtinf = rt - rt.tol.x - rt ^ rt.tol.y and rtsup = rt + rt.tol.x + rt ^ rt.tol.y | |
236 # rt.tol.y Tolerance parameter. See rt.tol.x parameter. | |
237 # attribs Only search for peaks whose attribution is among this set of attributions. | |
238 # molids Only search for peaks whose molecule ID is among this vector of integer molecule IDs. Can also be a data frame with a retention time column x.colnames$rt and a molecule ID column MSDB.TAG.molid. | |
239 # molids.rt.tol Retention time tolerance used when molids parameter is a data frame (rt, id) | |
240 # precursor.match Remove peaks whose molecule precursor peak has not also been matched. | |
241 # precursor.rt.tol | |
242 # Returns a data frame, listing m/z values provided in input. Several matches can be found for an m/z value, in which case several lines (the same number as the number of matches found) with the same m/z value repeated will be inserted. The m/z values will be listed in the same order as in the input. The columns of the data.frame are: mz, rt (only if present in the input), id, mztheo, col, colrt, composition, attribution. | |
243 MsDb$methods( searchForMzRtList = function(x = NULL, mode, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = NULL, molids.rt.tol = NULL, attribs = NULL, precursor.match = FALSE, precursor.rt.tol = NULL, same.cols = FALSE, same.rows = FALSE, peak.table = FALSE) { | |
244 | |
245 # Use provided data frame | |
246 old.input <- NULL | |
247 tmp.output <- NULL | |
248 if ( ! is.null(x)) { | |
249 tmp.input <- MsDbInputDataFrameStream$new(df = x) | |
250 tmp.output <- MsDbOutputDataFrameStream$new() | |
251 old.input <- .self$setInputStream(tmp.input) | |
252 .self$addOutputStreams(tmp.output) | |
253 } | |
254 | |
255 if (precursor.match) { | |
256 # Get IDs of all molecules whose precursor peak matches one of the mz in the list | |
257 precursors.df <- .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, attribs = .self$.prec[[mode]], output.to.stream = FALSE) | |
258 cols.to.keep <- if (is.null(col)) MSDB.TAG.MOLID else c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT) | |
259 precursors.ids <- precursors.df[, cols.to.keep, drop = FALSE] | |
260 precursors.ids <- precursors.ids[ ! is.na(precursors.ids[[MSDB.TAG.MOLID]]), , drop = FALSE] | |
261 precursors.ids <- precursors.ids[ ! duplicated(precursors.ids), ] | |
262 | |
263 # Get all matching peaks whose molecule is inside the previously obtained list of molecules | |
264 .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = precursors.ids, molids.rt.tol = precursor.rt.tol, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table) | |
265 # TODO | |
266 # | |
267 # peaks <- if (peak.table) results[['peaks']] else results | |
268 # | |
269 # # Merge results with the column/rt found for precursors. | |
270 # if ( ! is.null(col) && ! is.null(peaks)) { | |
271 # precursors.ids <- precursors.df[, c(MSDB.TAG.MOLID, MSDB.TAG.col, MSDB.TAG.COLRT)] | |
272 # precursors.ids <- precursors.ids[ ! is.na(precursors.ids[[MSDB.TAG.MOLID]]), ] | |
273 # | |
274 # # Get rows where ID is NA | |
275 # peaks.na <- peaks[is.na(peaks[[MSDB.TAG.MOLID]]), ] | |
276 # | |
277 # # Get rows where ID is found (i.e.: not NA) | |
278 # peaks <- peaks[, !(colnames(peaks) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] # drop col and colrt columns | |
279 # peaks.not.na <- peaks[! is.na(peaks[[MSDB.TAG.MOLID]]), ] | |
280 # | |
281 # # Add col and colrt values to found peaks | |
282 # peaks <- merge(peaks.not.na, precursors.ids, by = MSDB.TAG.MOLID) | |
283 # | |
284 # # Put back unfound peaks | |
285 # peaks <- rbind(peaks, peaks.na) | |
286 # | |
287 # # Sort | |
288 # print(colnames(peaks)) | |
289 # print(x.colnames) | |
290 # peaks <- peaks[order(peaks[[x.colnames$mz]], peaks[[x.colnames$rt]], peaks[[MSDB.TAG.MOLID]], peaks[[MSDB.TAG.COL]]), ] | |
291 # | |
292 # # Remove rownames | |
293 # rownames(peaks) <- NULL | |
294 # | |
295 # # Reorder columns | |
296 # peaks <- peaks[unlist(.self$.output.fields[names(.PEAK.TABLE.COLS)])] | |
297 # } | |
298 # | |
299 # # Remove duplicates | |
300 # if ( ! is.null(peaks)) | |
301 # peaks <- peaks[ ! duplicated(peaks), ] | |
302 # | |
303 # if (peak.table) | |
304 # results[['peaks']] <- peaks | |
305 # else | |
306 # results <- peaks | |
307 # | |
308 # return(results) | |
309 } | |
310 else | |
311 .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, molids = molids, molids.rt.tol = molids.rt.tol, attribs = attribs, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table) | |
312 | |
313 if ( ! is.null(x)) { | |
314 results <- tmp.output$getDataFrame() | |
315 .self$removeOutputStreams(tmp.output) | |
316 .self$setInputStream(old.input) | |
317 return(results) | |
318 } | |
319 }) | |
320 | |
321 MsDb$methods( .doSearchForMzRtList = function(mode, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = NULL, molids.rt.tol = NULL, attribs = NULL, same.cols = FALSE, same.rows = FALSE, peak.table = FALSE, output.to.stream = TRUE) { | |
322 | |
323 # # Choose columns to keep from x | |
324 # x.cols <- if (same.cols) colnames(x) else intersect(if (is.null(col)) c(x.colnames$mz) else c(x.colnames$mz, x.colnames$rt), colnames(x)) | |
325 # | |
326 # # Create a peak fake data frame for defining columns | |
327 # peaks.fake <- data.frame(stringsAsFactors = FALSE) | |
328 # for (field in names(.PEAK.TABLE.COLS)) | |
329 # if ( ! is.null(col) || ! field %in% .RT.MATCHING.COLS) | |
330 # peaks.fake[.self$.output.fields[[field]]] <- vector(mode = .PEAK.TABLE.COLS[[field]], length = 0) | |
331 # | |
332 # # Initialize y data frame, so when x contains no rows an empty y data frame is returned with all the columns set with right type. | |
333 # if (same.rows) { | |
334 # y <- peaks.fake[, if (is.null(col)) c(MSDB.TAG.MZ) else c(MSDB.TAG.MZ, MSDB.TAG.RT), drop = FALSE] | |
335 # y[MSDB.TAG.MSMATCHING] <- character() | |
336 # } | |
337 # else | |
338 # y <- peaks.fake | |
339 # y <- cbind(y, x[NULL, ! x.cols %in% colnames(y), drop = FALSE]) | |
340 # if (peak.table) { | |
341 # z <- peaks.fake | |
342 # z <- cbind(z, x[NULL, ! x.cols %in% colnames(z), drop = FALSE]) | |
343 # } | |
344 | |
345 # Loop on all lines of input | |
346 peaks <- NULL | |
347 while (.self$.input.stream$hasNextValues()) { | |
348 | |
349 .self$.input.stream$nextValues() | |
350 | |
351 # Search for m/z | |
352 results <- .self$searchForMzRtTols(mode = mode, mz = .self$.input.stream$getMz(), shift = shift, prec = prec, rt = .self$.input.stream$getRt(), col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, attribs = attribs, molids = molids, molids.rt.tol = molids.rt.tol) | |
353 | |
354 # Call output streams | |
355 if (output.to.stream && ! is.null(.self$.output.streams)) | |
356 for (s in .self$.output.streams) | |
357 s$matchedPeaks(mz = .self$.input.stream$getMz(), rt = if (is.null(col)) NULL else .self$.input.stream$getRt(), peaks = results, unused = .self$.input.stream$getAll(but = if (is.null(col)) c(MSDB.TAG.MZ) else c(MSDB.TAG.MZ, MSDB.TAG.RT))) | |
358 | |
359 # Append to peak list | |
360 peaks <- rbind(peaks, results) | |
361 | |
362 # # Add results to output | |
363 # r <- nrow(y) + 1 | |
364 # x.lines <- x[i, x.cols, drop = FALSE] | |
365 # x.lines <- rename.col(x.lines, unlist(x.colnames), unlist(.self$.output.fields[names(x.colnames)])) | |
366 # if (nrow(results) == 0) { | |
367 # y[r, colnames(x.lines)] <- x.lines | |
368 # } | |
369 # else { | |
370 # if (same.rows) { | |
371 # y[r, colnames(x.lines)] <- x.lines | |
372 # ids <- results[[MSDB.TAG.molid]] | |
373 # ids <- ids[ ! duplicated(ids)] # Remove duplicated values | |
374 # y[r, MSDB.TAG.msmatching] <- paste(ids, collapse = .self$.molids.sep) | |
375 # } | |
376 # if ( ! same.rows || peak.table) { | |
377 # new.rows <- cbind(x.lines, results, row.names = NULL) | |
378 # if ( ! same.rows) { | |
379 # rows <- r:(r+nrow(results)-1) | |
380 # y[rows, colnames(new.rows)] <- new.rows | |
381 # } | |
382 # if (peak.table) { | |
383 # zr <- nrow(z) + 1 | |
384 # zrows <- zr:(zr+nrow(results)-1) | |
385 # z[zrows, colnames(new.rows)] <- new.rows | |
386 # } | |
387 # } | |
388 # } | |
389 } | |
390 | |
391 # results <- if (peak.table) list(main = y, peaks = z) else y | |
392 | |
393 # return(results) | |
394 return(peaks) | |
395 }) | |
396 | |
397 # rt Retention time in seconds. | |
398 # molids An option vector of molecule IDs, used to restrict the search. | |
399 MsDb$methods( searchForMzRtTols = function(mode, mz, rt = NULL, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, attribs = NULL, molids = NULL, molids.rt.tol = NULL, colnames = MSDB.DFT.INPUT.FIELDS) { | |
400 | |
401 # Set M/Z bounds | |
402 if (.self$.mz.tol.unit == MSDB.MZTOLUNIT.PPM) { | |
403 mz.low <- mz * (1 + (- shift - prec) * 1e-6) | |
404 mz.high <- mz * (1 + (- shift + prec) * 1e-6) | |
405 } | |
406 else { # PLAIN | |
407 mz.low <- mz - shift - prec | |
408 mz.high <- mz - shift + prec | |
409 } | |
410 | |
411 # Set retention time bounds | |
412 rt.low <- NULL | |
413 rt.high <- NULL | |
414 if ( ! is.null(rt.tol)) { | |
415 low <- rt - rt.tol | |
416 high <- rt + rt.tol | |
417 rt.low <- if (is.null(rt.low)) low else max(low, rt.low) | |
418 rt.high <- if (is.null(rt.high)) high else min(high, rt.high) | |
419 } | |
420 if ( ! is.null(rt.tol.x)) { | |
421 low <- rt - rt.tol.x - rt ^ rt.tol.y | |
422 high <- rt + rt.tol.x + rt ^ rt.tol.y | |
423 rt.low <- if (is.null(rt.low)) low else max(low, rt.low) | |
424 rt.high <- if (is.null(rt.high)) high else min(high, rt.high) | |
425 } | |
426 | |
427 # List molecule IDs | |
428 if ( ! is.null(molids.rt.tol) && is.data.frame(molids)) { | |
429 ids <- molids[(rt >= molids[[MSDB.TAG.colrt]] - molids.rt.tol) & (rt <= molids[[MSDB.TAG.colrt]] + molids.rt.tol), MSDB.TAG.molid] | |
430 if (length(ids) == 0) | |
431 # No molecule ID match for this retention time | |
432 return(data.frame()) # return empty result set | |
433 } else { | |
434 ids <- molids | |
435 } | |
436 | |
437 return(.self$searchForMzRtBounds(mode, | |
438 mz.low = mz * (1 + (- shift - prec) * 1e-6), | |
439 mz.high = mz * (1 + (- shift + prec) * 1e-6), | |
440 rt.low = rt.low, | |
441 rt.high = rt.high, | |
442 col = col, | |
443 attribs = attribs, | |
444 molids = ids)) | |
445 }) | |
446 | |
447 # rt.low Lower bound of the retention time in seconds. | |
448 # rt.high Higher bound of the retention time in seconds. | |
449 MsDb$methods( searchForMzRtBounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
450 | |
451 results <- .self$.do.search.for.mz.rt.bounds(mode = mode, mz.low = mz.low, mz.high = mz.high, rt.low = rt.low, rt.high = rt.high, col = col, attribs = attribs, molids = molids) | |
452 | |
453 return(results) | |
454 }) | |
455 | |
456 # TODO Write description of output: data frame with which columns ? | |
457 MsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
458 stop("Method .do.search.for.mz.rt.bounds() not implemented in concrete class.") | |
459 }) | |
460 | |
461 # DEPRECATED | |
462 MsDb$methods( searchForMz = function(x, mode, tol = 5, col = NULL, rt.tol.x = 5, rt.tol.y = 0.80) { | |
463 warning("Method searchForMz() is deprecated. Use searchForMzRtList() instead.") | |
464 .self$searchForMzRtList(x = x, mode = mode, prec = tol, col = col, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y) | |
465 }) | |
466 | |
467 } # end of load safe guard |