comparison MsFileDb.R @ 0:e66bb061af06 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author prog
date Tue, 12 Jul 2016 12:02:37 -0400
parents
children 20d69a062da3
comparison
equal deleted inserted replaced
-1:000000000000 0:e66bb061af06
1 if ( ! exists('MsFileDb')) { # Do not load again if already loaded
2
3 library('methods')
4 source('MsDb.R')
5 source('msdb-common.R')
6 source('search.R', chdir = TRUE)
7
8 #####################
9 # CLASS DECLARATION #
10 #####################
11
12 MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY"))
13
14 ###############
15 # CONSTRUCTOR #
16 ###############
17
18 MsFileDb$methods( initialize = function(file = NA_character_, ...) {
19
20 # Initialize members
21 .file <<- if ( ! is.null(file)) file else NA_character_
22 .db <<- NULL
23 .fields <<- msdb.get.dft.db.fields()
24 .modes <<- MSDB.DFT.MODES
25 .name.to.id <<- NULL
26
27 callSuper(...)
28 })
29
30 #################
31 # SET DB FIELDS #
32 #################
33
34 MsFileDb$methods( areDbFieldsSettable = function() {
35 return(TRUE)
36 })
37
38 MsFileDb$methods( setDbFields = function(fields) {
39 .fields <<- as.list(fields)
40 })
41
42 ################
43 # CHECK FIELDS #
44 ################
45
46 MsFileDb$methods( .check.fields = function(fields) {
47
48 if (is.null(fields))
49 stop("No fields specified for .check.fields()")
50
51 # Check that fields are defined in the fields list
52 unknown <- fields[ ! fields %in% names(.self$.fields)]
53 if (length(unknown) > 0)
54 stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined."))
55
56 # Check that field values are real columns inside the database
57 .self$.init.db()
58 db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '')
59 unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)]
60 if (length(unknown.cols) > 0)
61 stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\"."))
62 })
63
64 ################
65 # SET MS MODES #
66 ################
67
68 MsFileDb$methods( areDbMsModesSettable = function() {
69 return(TRUE)
70 })
71
72 MsFileDb$methods( setDbMsModes = function(modes) {
73 .modes <<- as.list(modes)
74 })
75
76 ###########
77 # INIT DB #
78 ###########
79
80 MsFileDb$methods( .init.db = function() {
81
82 if (is.null(.self$.db)) {
83
84 # Load database
85 .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL)
86
87 # Rename columns
88 colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
89 }
90 })
91
92 ############
93 # GET DATA #
94 ############
95
96 MsFileDb$methods( .get = function(db = NULL, col = NULL) {
97
98 # Init db
99 if (is.null(db)) {
100 .self$.init.db()
101 db <- .self$.db
102 }
103
104 # Check fields
105 .self$.check.fields(col)
106
107 # Get database columns
108 # db.cols <- unlist(.self$.fields[col])
109
110 return(db[, col])
111 })
112
113 ###########
114 # GET ROW #
115 ###########
116
117 MsFileDb$methods( .get.row = function(row, cols = NULL) {
118
119 # Init db
120 .self$.init.db()
121
122 # Check fields
123 if ( ! is.null(cols))
124 .self$.check.fields(cols)
125
126 if ( ! is.null(cols)) {
127 #cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '')
128 return(.self$.db[row, cols])
129 }
130
131 return(.self$.db[row, ])
132 })
133
134 ###########
135 # GET COL #
136 ###########
137
138 MsFileDb$methods( .get.col = function(col) {
139
140 # Init db
141 .self$.init.db()
142
143 # Check fields
144 .self$.check.fields(col)
145
146 #return(.self$.db[[.self$.fields[[col]]]])
147 return(.self$.db[[col]])
148 })
149
150 ####################
151 # GET MOLECULE IDS #
152 ####################
153
154 MsFileDb$methods( getMoleculeIds = function() {
155
156 # Init db
157 .self$.init.db()
158
159 # Get IDs
160 mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID))
161 mol.ids <- mol.ids[ ! duplicated(mol.ids)]
162 mol.ids <- sort(mol.ids)
163
164 return(mol.ids)
165 })
166
167 ####################
168 # GET NB MOLECULES #
169 ####################
170
171 # Returns the number of molecules in the database.
172 MsFileDb$methods( getNbMolecules = function() {
173
174 # Init db
175 .self$.init.db()
176
177 # Get IDs
178 mol.ids <- .self$.get.col(MSDB.TAG.MOLID)
179 mol.ids <- mol.ids[ ! duplicated(mol.ids)]
180
181 return(length(mol.ids))
182 })
183
184 #####################
185 # GET MOLECULE NAME #
186 #####################
187
188 MsFileDb$methods( .get.name.from.id = function(db, id) {
189
190 if(is.na(id))
191 return(NA_character_)
192
193 # Get names
194 names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES]
195 if (length(names) == 0)
196 return(NA_character_)
197
198 # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one.
199 name <- strsplit(names, ';')[[1]][[1]]
200
201 return(name)
202 })
203
204 # Get molecule names
205 # molid An integer vector of molecule IDs.
206 # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector.
207 MsFileDb$methods( getMoleculeName = function(molid) {
208
209 if (is.null(molid))
210 return(NA_character_)
211
212 # Init db
213 .self$.init.db()
214
215 # Get database
216 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)]
217
218 # Remove duplicates
219 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ]
220
221 # Look for ids
222 names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '')
223
224 return(names)
225 })
226
227 ###################
228 # INIT NAME TO ID #
229 ###################
230
231 MsFileDb$methods( .init.name.to.id = function() {
232
233 if (is.null(.self$.name.to.id)) {
234
235 # Create data frame
236 .name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE)
237
238 # Init db
239 .self$.init.db()
240
241 # Get database subset (columns name and id only).
242 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)]
243
244 # Remove duplicate IDs
245 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ]
246
247 # Loop on all
248 for(i in seq(db[[MSDB.TAG.MOLID]])) {
249 i.id <- db[i, MSDB.TAG.MOLID]
250 i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE)
251 .name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE))
252 }
253
254 # Order by name
255 .name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ]
256 }
257 })
258
259 ####################
260 # GET ID FROM NAME #
261 ####################
262
263 MsFileDb$methods( .get.id.from.name = function(name) {
264
265 # Initialize name.to.id search tree
266 .self$.init.name.to.id()
267
268 # Search for name
269 i <- binary.search(toupper(name), .self$.name.to.id[['name']])
270
271 # Get ID
272 id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id'])
273
274 return(id)
275 })
276
277 ################
278 # FIND BY NAME #
279 ################
280
281 # Find a molecule by name
282 # name A vector of molecule names to search for.
283 # Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order.
284 MsFileDb$methods( findByName = function(name) {
285
286 if (is.null(name))
287 return(NA_character_)
288
289 # Look for molecules with this name
290 ids <- list()
291 for (n in name)
292 ids <- c(ids, list(.self$.get.id.from.name(n)))
293
294 return(ids)
295 })
296
297 ###############################
298 # GET CHROMATOGRAPHIC COLUMNS #
299 ###############################
300
301 MsFileDb$methods( getChromCol = function(molid = NULL) {
302
303 # Init db
304 .self$.init.db()
305
306 # Get database
307 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)]
308
309 # Filter on molecule IDs
310 if ( ! is.null(molid))
311 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]
312
313 # Get column names
314 cols <- db[[MSDB.TAG.COL]]
315
316 # Remove duplicates
317 cols <- cols[ ! duplicated(cols)]
318
319 # Make data frame
320 cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE)
321
322 return(cols)
323 })
324
325 ################
326 # GET NB PEAKS #
327 ################
328
329 # Get the total number of MS peaks stored inside the database.
330 # molid The ID of the molecule.
331 # type The MS type.
332 MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {
333
334 # Init db
335 .self$.init.db()
336
337 # Get database
338 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)]
339
340 # Filter on mode
341 if ( ! is.null(type) && ! is.na(type))
342 db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ]
343
344 # Filter on molecule IDs
345 if ( ! is.null(molid) && ! is.na(molid))
346 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]
347
348 # Get mz values
349 mz <- db[[MSDB.TAG.MZTHEO]]
350
351 # Count number of unique values
352 n <- sum(as.integer(! duplicated(mz)))
353
354 return(n)
355 })
356
357 ##########
358 # SEARCH #
359 ##########
360
361 MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {
362
363 # Init db
364 .self$.init.db()
365 db <- .self$.db
366
367 # Filter on mode
368 if ( ! is.null(mode) && ! is.na(mode))
369 db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ]
370
371 # Filter on molecule IDs
372 if ( ! is.null(molids))
373 db <- db[db[[MSDB.TAG.MOLID]] %in% molids,]
374
375 # Filter on attributions
376 if ( ! is.null(attribs) && ! is.na(attribs))
377 db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,]
378
379 # Filter on columns
380 if ( ! is.null(col) && ! is.na(col))
381 db <- db[db[[MSDB.TAG.COL]] %in% col,]
382
383 # Filter on retention time
384 if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high))
385 db <- db[db[[MSDB.TAG.COLRT]] >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ]
386
387 # Remove retention times and column information
388 if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) {
389 db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))]
390
391 # Remove duplicates
392 db <- db[ ! duplicated(db), ]
393 }
394
395 # Filter on mz
396 db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ]
397
398 # Rename database fields
399 # conv <- c( mz = 'mztheo', rt = 'colrt') # solving mismatch of field names between database and output
400 # cols <- colnames(db)
401 # for (db.field in names(.self$.fields)) {
402 # output.field <- if (db.field %in% names(conv)) conv[[db.field]] else db.field
403 # if (.self$.fields[[db.field]] %in% cols && output.field %in% names(.self$.output.fields))
404 # cols[cols %in% .self$.fields[[db.field]]] <- .self$.output.fields[[output.field]]
405 # }
406 # colnames(db) <- cols
407
408 # Remove unwanted columns
409 # db <- db[, colnames(db) %in% .self$.output.fields]
410
411 return(db)
412 })
413
414 #################
415 # GET MZ VALUES #
416 #################
417
418 # Returns a numeric vector of all masses stored inside the database.
419 MsFileDb$methods( getMzValues = function(mode = NULL) {
420
421 # Init db
422 .self$.init.db()
423 db <- .self$.db
424
425 # Filter on mode
426 if ( ! is.null(mode) && ! is.na(mode)) {
427 mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg
428 selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag)
429 db <- db[selected.lines, ]
430 }
431
432 # Get masses
433 mz <- .self$.get(db, col = MSDB.TAG.MZTHEO)
434
435 # Remove duplicates
436 mz <- mz[ ! duplicated(mz)]
437
438 return(mz)
439 })
440
441 #######################
442 # GET RETENTION TIMES #
443 #######################
444
445 # Get the retention times of a molecule.
446 # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list.
447 MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) {
448
449 if (is.null(molid) || is.na(molid))
450 return(list())
451
452 # Init db
453 .self$.init.db()
454 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)]
455
456 # Filter on molecule ID
457 if ( ! is.null(molid) && ! is.na(molid))
458 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]
459
460 # Remove duplicates
461 db <- db[! duplicated(db), ]
462
463 # Build retention time list
464 rt <- list()
465 cols <- db[[MSDB.TAG.COL]]
466 cols <- cols[ ! duplicated(cols)]
467 for (col in cols) {
468 colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT]
469 rt[col] <- list(colrts)
470 }
471
472 return(rt)
473 })
474
475 } # end of load safe guard