comparison MsFileDb.R @ 6:f86fec07f392 draft default tip

planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author prog
date Fri, 22 Feb 2019 16:04:22 -0500
parents fb9c0409d85c
children
comparison
equal deleted inserted replaced
5:fb9c0409d85c 6:f86fec07f392
1 if ( ! exists('MsFileDb')) { # Do not load again if already loaded
2
3 library('methods')
4 source('MsDb.R')
5 source('msdb-common.R')
6 source('search.R', chdir = TRUE)
7
8 #####################
9 # CLASS DECLARATION #
10 #####################
11
12 MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY"))
13
14 ###############
15 # CONSTRUCTOR #
16 ###############
17
18 MsFileDb$methods( initialize = function(file = NA_character_, ...) {
19
20 # Initialize members
21 .file <<- if ( ! is.null(file)) file else NA_character_
22 .db <<- NULL
23 .fields <<- msdb.get.dft.db.fields()
24 .modes <<- MSDB.DFT.MODES
25 .name.to.id <<- NULL
26
27 callSuper(...)
28 })
29
30 #################
31 # SET DB FIELDS #
32 #################
33
34 MsFileDb$methods( areDbFieldsSettable = function() {
35 return(TRUE)
36 })
37
38 MsFileDb$methods( setDbFields = function(fields) {
39 .fields <<- as.list(fields)
40 })
41
42 ################
43 # CHECK FIELDS #
44 ################
45
46 MsFileDb$methods( .check.fields = function(fields) {
47
48 if (is.null(fields))
49 stop("No fields specified for .check.fields()")
50
51 # Check that fields are defined in the fields list
52 unknown <- fields[ ! fields %in% names(.self$.fields)]
53 if (length(unknown) > 0)
54 stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined."))
55
56 # Check that field values are real columns inside the database
57 .self$.init.db()
58 db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '')
59 unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)]
60 if (length(unknown.cols) > 0)
61 stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\"."))
62 })
63
64 ################
65 # SET MS MODES #
66 ################
67
68 MsFileDb$methods( areDbMsModesSettable = function() {
69 return(TRUE)
70 })
71
72 MsFileDb$methods( setDbMsModes = function(modes) {
73 .modes <<- as.list(modes)
74 })
75
76 ###########
77 # INIT DB #
78 ###########
79
80 MsFileDb$methods( .init.db = function() {
81
82 if (is.null(.self$.db)) {
83
84 # Load database
85 .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL, check.names = FALSE, comment.char = '')
86
87 # Check that colnames are unique
88 dupcol <- duplicated(colnames(.self$.db))
89 if (any(dupcol))
90 stop(paste("Database header contains duplicated names: ", paste(unique(colnames(.self$.db)[dupcol]), collapse = ', '), "."))
91
92 # Check that columns names supplied through field map are unique
93 dupfields <- duplicated(.self$.fields)
94 if (any(dupfields))
95 stop(paste("Some db column names supplied are duplicated: ", paste(unique(.self$.fields[dupfields]), collapse = ', '), "."))
96
97 # Rename columns
98 colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
99 }
100 })
101
102 ############
103 # GET DATA #
104 ############
105
106 MsFileDb$methods( .get = function(db = NULL, col = NULL) {
107
108 # Init db
109 if (is.null(db)) {
110 .self$.init.db()
111 db <- .self$.db
112 }
113
114 # Check fields
115 .self$.check.fields(col)
116
117 # Get database columns
118 # db.cols <- unlist(.self$.fields[col])
119
120 return(db[, col])
121 })
122
123 ###########
124 # GET ROW #
125 ###########
126
127 MsFileDb$methods( .get.row = function(row, cols = NULL) {
128
129 # Init db
130 .self$.init.db()
131
132 # Check fields
133 if ( ! is.null(cols))
134 .self$.check.fields(cols)
135
136 if ( ! is.null(cols)) {
137 #cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '')
138 return(.self$.db[row, cols])
139 }
140
141 return(.self$.db[row, ])
142 })
143
144 ###########
145 # GET COL #
146 ###########
147
148 MsFileDb$methods( .get.col = function(col) {
149
150 # Init db
151 .self$.init.db()
152
153 # Check fields
154 .self$.check.fields(col)
155
156 #return(.self$.db[[.self$.fields[[col]]]])
157 return(.self$.db[[col]])
158 })
159
160 ####################
161 # GET MOLECULE IDS #
162 ####################
163
164 MsFileDb$methods( getMoleculeIds = function(max.results = NA_integer_) {
165
166 # Init db
167 .self$.init.db()
168
169 # Get IDs
170 mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID))
171 mol.ids <- mol.ids[ ! duplicated(mol.ids)]
172 mol.ids <- sort(mol.ids)
173
174 # Cut results
175 if ( ! is.na(max.results) && length(mol.ids) > max.results)
176 mol.ids <- mol.ids[1:max.results]
177
178 return(mol.ids)
179 })
180
181 ####################
182 # GET NB MOLECULES #
183 ####################
184
185 # Returns the number of molecules in the database.
186 MsFileDb$methods( getNbMolecules = function() {
187
188 # Init db
189 .self$.init.db()
190
191 # Get IDs
192 mol.ids <- .self$.get.col(MSDB.TAG.MOLID)
193 mol.ids <- mol.ids[ ! duplicated(mol.ids)]
194
195 return(length(mol.ids))
196 })
197
198 #####################
199 # GET MOLECULE NAME #
200 #####################
201
202 MsFileDb$methods( .get.name.from.id = function(db, id) {
203
204 if(is.na(id))
205 return(NA_character_)
206
207 # Get names
208 names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES]
209 if (length(names) == 0)
210 return(NA_character_)
211
212 # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one.
213 name <- strsplit(names, ';')[[1]][[1]]
214
215 return(name)
216 })
217
218 # Get molecule names
219 # molid An integer vector of molecule IDs.
220 # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector.
221 MsFileDb$methods( getMoleculeName = function(molid) {
222
223 if (is.null(molid))
224 return(NA_character_)
225
226 # Init db
227 .self$.init.db()
228
229 # Get database
230 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)]
231
232 # Remove duplicates
233 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ]
234
235 # Look for ids
236 names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '')
237
238 return(names)
239 })
240
241 ###################
242 # INIT NAME TO ID #
243 ###################
244
245 MsFileDb$methods( .init.name.to.id = function() {
246
247 if (is.null(.self$.name.to.id)) {
248
249 # Create data frame
250 .name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE)
251
252 # Init db
253 .self$.init.db()
254
255 # Get database subset (columns name and id only).
256 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)]
257
258 # Remove duplicate IDs
259 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ]
260
261 # Loop on all
262 for(i in seq(db[[MSDB.TAG.MOLID]])) {
263 i.id <- db[i, MSDB.TAG.MOLID]
264 i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE)
265 .name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE))
266 }
267
268 # Order by name
269 .name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ]
270 }
271 })
272
273 ####################
274 # GET ID FROM NAME #
275 ####################
276
277 MsFileDb$methods( .get.id.from.name = function(name) {
278
279 # Initialize name.to.id search tree
280 .self$.init.name.to.id()
281
282 # Search for name
283 i <- binary.search(toupper(name), .self$.name.to.id[['name']])
284
285 # Get ID
286 id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id'])
287
288 return(id)
289 })
290
291 ################
292 # FIND BY NAME #
293 ################
294
295 # Find a molecule by name
296 # name A vector of molecule names to search for.
297 # Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order.
298 MsFileDb$methods( findByName = function(name) {
299
300 if (is.null(name))
301 return(NA_character_)
302
303 # Look for molecules with this name
304 ids <- list()
305 for (n in name)
306 ids <- c(ids, list(.self$.get.id.from.name(n)))
307
308 return(ids)
309 })
310
311 ###############################
312 # GET CHROMATOGRAPHIC COLUMNS #
313 ###############################
314
315 MsFileDb$methods( getChromCol = function(molid = NULL) {
316
317 # Init db
318 .self$.init.db()
319
320 # Get database
321 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)]
322
323 # Filter on molecule IDs
324 if ( ! is.null(molid))
325 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]
326
327 # Get column names
328 cols <- db[[MSDB.TAG.COL]]
329
330 # Remove duplicates
331 cols <- cols[ ! duplicated(cols)]
332
333 # Make data frame
334 cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE)
335
336 return(cols)
337 })
338
339 ################
340 # GET NB PEAKS #
341 ################
342
343 # Get the total number of MS peaks stored inside the database.
344 # molid The ID of the molecule.
345 # type The MS type.
346 MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {
347
348 # Init db
349 .self$.init.db()
350
351 # Get database
352 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)]
353
354 # Filter on mode
355 if ( ! is.null(type) && ! is.na(type))
356 db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ]
357
358 # Filter on molecule IDs
359 if ( ! is.null(molid) && ! is.na(molid))
360 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]
361
362 # Get mz values
363 mz <- db[[MSDB.TAG.MZTHEO]]
364
365 # Count number of unique values
366 n <- sum(as.integer(! duplicated(mz)))
367
368 return(n)
369 })
370
371 ##########
372 # SEARCH #
373 ##########
374
375 MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {
376
377 # Init db
378 .self$.init.db()
379 db <- .self$.db
380
381 # Filter on mode
382 if ( ! is.null(mode) && ! is.na(mode))
383 db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ]
384
385 # Filter on molecule IDs
386 if ( ! is.null(molids))
387 db <- db[db[[MSDB.TAG.MOLID]] %in% molids,]
388
389 # Filter on attributions
390 if ( ! is.null(attribs) && ! is.na(attribs))
391 db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,]
392
393 # Filter on columns
394 if ( ! is.null(col) && ! is.na(col))
395 db <- db[db[[MSDB.TAG.COL]] %in% col,]
396
397 # Filter on retention time
398 if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) {
399 scale <- if (.self$getRtUnit() == MSDB.RTUNIT.MIN) 60 else 1
400 db <- db[db[[MSDB.TAG.COLRT]] * scale >= rt.low & db[[MSDB.TAG.COLRT]] * scale <= rt.high, ]
401 }
402
403 # Remove retention times and column information
404 if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) {
405 db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))]
406
407 # Remove duplicates
408 db <- db[ ! duplicated(db), ]
409 }
410
411 # Filter on mz
412 db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ]
413
414 return(db)
415 })
416
417 #################
418 # GET MZ VALUES #
419 #################
420
421 # Returns a numeric vector of all masses stored inside the database.
422 MsFileDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
423
424 # Init db
425 .self$.init.db()
426 db <- .self$.db
427
428 # Filter on mode
429 if ( ! is.null(mode) && ! is.na(mode)) {
430 mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg
431 selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag)
432 db <- db[selected.lines, ]
433 }
434
435 # Get masses
436 mz <- .self$.get(db, col = MSDB.TAG.MZTHEO)
437
438 # Remove duplicates
439 mz <- mz[ ! duplicated(mz)]
440
441 # Apply cut-off
442 if ( ! is.na(max.results))
443 mz <- mz[1:max.results]
444
445 return(mz)
446 })
447
448 #######################
449 # GET RETENTION TIMES #
450 #######################
451
452 # Get the retention times of a molecule.
453 # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list.
454 MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) {
455
456 if (is.null(molid) || is.na(molid))
457 return(list())
458
459 # Init db
460 .self$.init.db()
461 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)]
462
463 # Filter on molecule ID
464 if ( ! is.null(molid) && ! is.na(molid))
465 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]
466
467 # Remove duplicates
468 db <- db[! duplicated(db), ]
469
470 # Build retention time list
471 rt <- list()
472 cols <- db[[MSDB.TAG.COL]]
473 cols <- cols[ ! duplicated(cols)]
474 for (col in cols) {
475 colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT]
476 rt[col] <- list(colrts)
477 }
478
479 if (.self$getRtUnit() == MSDB.RTUNIT.MIN)
480 rt <- 60 * rt
481
482 return(rt)
483 })
484
485 } # end of load safe guard