Mercurial > repos > prog > lcmsmatching
comparison MsFileDb.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 |
parents | fb9c0409d85c |
children |
comparison
equal
deleted
inserted
replaced
5:fb9c0409d85c | 6:f86fec07f392 |
---|---|
1 if ( ! exists('MsFileDb')) { # Do not load again if already loaded | |
2 | |
3 library('methods') | |
4 source('MsDb.R') | |
5 source('msdb-common.R') | |
6 source('search.R', chdir = TRUE) | |
7 | |
8 ##################### | |
9 # CLASS DECLARATION # | |
10 ##################### | |
11 | |
12 MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY")) | |
13 | |
14 ############### | |
15 # CONSTRUCTOR # | |
16 ############### | |
17 | |
18 MsFileDb$methods( initialize = function(file = NA_character_, ...) { | |
19 | |
20 # Initialize members | |
21 .file <<- if ( ! is.null(file)) file else NA_character_ | |
22 .db <<- NULL | |
23 .fields <<- msdb.get.dft.db.fields() | |
24 .modes <<- MSDB.DFT.MODES | |
25 .name.to.id <<- NULL | |
26 | |
27 callSuper(...) | |
28 }) | |
29 | |
30 ################# | |
31 # SET DB FIELDS # | |
32 ################# | |
33 | |
34 MsFileDb$methods( areDbFieldsSettable = function() { | |
35 return(TRUE) | |
36 }) | |
37 | |
38 MsFileDb$methods( setDbFields = function(fields) { | |
39 .fields <<- as.list(fields) | |
40 }) | |
41 | |
42 ################ | |
43 # CHECK FIELDS # | |
44 ################ | |
45 | |
46 MsFileDb$methods( .check.fields = function(fields) { | |
47 | |
48 if (is.null(fields)) | |
49 stop("No fields specified for .check.fields()") | |
50 | |
51 # Check that fields are defined in the fields list | |
52 unknown <- fields[ ! fields %in% names(.self$.fields)] | |
53 if (length(unknown) > 0) | |
54 stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined.")) | |
55 | |
56 # Check that field values are real columns inside the database | |
57 .self$.init.db() | |
58 db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '') | |
59 unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)] | |
60 if (length(unknown.cols) > 0) | |
61 stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\".")) | |
62 }) | |
63 | |
64 ################ | |
65 # SET MS MODES # | |
66 ################ | |
67 | |
68 MsFileDb$methods( areDbMsModesSettable = function() { | |
69 return(TRUE) | |
70 }) | |
71 | |
72 MsFileDb$methods( setDbMsModes = function(modes) { | |
73 .modes <<- as.list(modes) | |
74 }) | |
75 | |
76 ########### | |
77 # INIT DB # | |
78 ########### | |
79 | |
80 MsFileDb$methods( .init.db = function() { | |
81 | |
82 if (is.null(.self$.db)) { | |
83 | |
84 # Load database | |
85 .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL, check.names = FALSE, comment.char = '') | |
86 | |
87 # Check that colnames are unique | |
88 dupcol <- duplicated(colnames(.self$.db)) | |
89 if (any(dupcol)) | |
90 stop(paste("Database header contains duplicated names: ", paste(unique(colnames(.self$.db)[dupcol]), collapse = ', '), ".")) | |
91 | |
92 # Check that columns names supplied through field map are unique | |
93 dupfields <- duplicated(.self$.fields) | |
94 if (any(dupfields)) | |
95 stop(paste("Some db column names supplied are duplicated: ", paste(unique(.self$.fields[dupfields]), collapse = ', '), ".")) | |
96 | |
97 # Rename columns | |
98 colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') | |
99 } | |
100 }) | |
101 | |
102 ############ | |
103 # GET DATA # | |
104 ############ | |
105 | |
106 MsFileDb$methods( .get = function(db = NULL, col = NULL) { | |
107 | |
108 # Init db | |
109 if (is.null(db)) { | |
110 .self$.init.db() | |
111 db <- .self$.db | |
112 } | |
113 | |
114 # Check fields | |
115 .self$.check.fields(col) | |
116 | |
117 # Get database columns | |
118 # db.cols <- unlist(.self$.fields[col]) | |
119 | |
120 return(db[, col]) | |
121 }) | |
122 | |
123 ########### | |
124 # GET ROW # | |
125 ########### | |
126 | |
127 MsFileDb$methods( .get.row = function(row, cols = NULL) { | |
128 | |
129 # Init db | |
130 .self$.init.db() | |
131 | |
132 # Check fields | |
133 if ( ! is.null(cols)) | |
134 .self$.check.fields(cols) | |
135 | |
136 if ( ! is.null(cols)) { | |
137 #cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '') | |
138 return(.self$.db[row, cols]) | |
139 } | |
140 | |
141 return(.self$.db[row, ]) | |
142 }) | |
143 | |
144 ########### | |
145 # GET COL # | |
146 ########### | |
147 | |
148 MsFileDb$methods( .get.col = function(col) { | |
149 | |
150 # Init db | |
151 .self$.init.db() | |
152 | |
153 # Check fields | |
154 .self$.check.fields(col) | |
155 | |
156 #return(.self$.db[[.self$.fields[[col]]]]) | |
157 return(.self$.db[[col]]) | |
158 }) | |
159 | |
160 #################### | |
161 # GET MOLECULE IDS # | |
162 #################### | |
163 | |
164 MsFileDb$methods( getMoleculeIds = function(max.results = NA_integer_) { | |
165 | |
166 # Init db | |
167 .self$.init.db() | |
168 | |
169 # Get IDs | |
170 mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID)) | |
171 mol.ids <- mol.ids[ ! duplicated(mol.ids)] | |
172 mol.ids <- sort(mol.ids) | |
173 | |
174 # Cut results | |
175 if ( ! is.na(max.results) && length(mol.ids) > max.results) | |
176 mol.ids <- mol.ids[1:max.results] | |
177 | |
178 return(mol.ids) | |
179 }) | |
180 | |
181 #################### | |
182 # GET NB MOLECULES # | |
183 #################### | |
184 | |
185 # Returns the number of molecules in the database. | |
186 MsFileDb$methods( getNbMolecules = function() { | |
187 | |
188 # Init db | |
189 .self$.init.db() | |
190 | |
191 # Get IDs | |
192 mol.ids <- .self$.get.col(MSDB.TAG.MOLID) | |
193 mol.ids <- mol.ids[ ! duplicated(mol.ids)] | |
194 | |
195 return(length(mol.ids)) | |
196 }) | |
197 | |
198 ##################### | |
199 # GET MOLECULE NAME # | |
200 ##################### | |
201 | |
202 MsFileDb$methods( .get.name.from.id = function(db, id) { | |
203 | |
204 if(is.na(id)) | |
205 return(NA_character_) | |
206 | |
207 # Get names | |
208 names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES] | |
209 if (length(names) == 0) | |
210 return(NA_character_) | |
211 | |
212 # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one. | |
213 name <- strsplit(names, ';')[[1]][[1]] | |
214 | |
215 return(name) | |
216 }) | |
217 | |
218 # Get molecule names | |
219 # molid An integer vector of molecule IDs. | |
220 # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector. | |
221 MsFileDb$methods( getMoleculeName = function(molid) { | |
222 | |
223 if (is.null(molid)) | |
224 return(NA_character_) | |
225 | |
226 # Init db | |
227 .self$.init.db() | |
228 | |
229 # Get database | |
230 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] | |
231 | |
232 # Remove duplicates | |
233 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] | |
234 | |
235 # Look for ids | |
236 names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '') | |
237 | |
238 return(names) | |
239 }) | |
240 | |
241 ################### | |
242 # INIT NAME TO ID # | |
243 ################### | |
244 | |
245 MsFileDb$methods( .init.name.to.id = function() { | |
246 | |
247 if (is.null(.self$.name.to.id)) { | |
248 | |
249 # Create data frame | |
250 .name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE) | |
251 | |
252 # Init db | |
253 .self$.init.db() | |
254 | |
255 # Get database subset (columns name and id only). | |
256 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] | |
257 | |
258 # Remove duplicate IDs | |
259 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] | |
260 | |
261 # Loop on all | |
262 for(i in seq(db[[MSDB.TAG.MOLID]])) { | |
263 i.id <- db[i, MSDB.TAG.MOLID] | |
264 i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE) | |
265 .name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE)) | |
266 } | |
267 | |
268 # Order by name | |
269 .name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ] | |
270 } | |
271 }) | |
272 | |
273 #################### | |
274 # GET ID FROM NAME # | |
275 #################### | |
276 | |
277 MsFileDb$methods( .get.id.from.name = function(name) { | |
278 | |
279 # Initialize name.to.id search tree | |
280 .self$.init.name.to.id() | |
281 | |
282 # Search for name | |
283 i <- binary.search(toupper(name), .self$.name.to.id[['name']]) | |
284 | |
285 # Get ID | |
286 id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id']) | |
287 | |
288 return(id) | |
289 }) | |
290 | |
291 ################ | |
292 # FIND BY NAME # | |
293 ################ | |
294 | |
295 # Find a molecule by name | |
296 # name A vector of molecule names to search for. | |
297 # Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order. | |
298 MsFileDb$methods( findByName = function(name) { | |
299 | |
300 if (is.null(name)) | |
301 return(NA_character_) | |
302 | |
303 # Look for molecules with this name | |
304 ids <- list() | |
305 for (n in name) | |
306 ids <- c(ids, list(.self$.get.id.from.name(n))) | |
307 | |
308 return(ids) | |
309 }) | |
310 | |
311 ############################### | |
312 # GET CHROMATOGRAPHIC COLUMNS # | |
313 ############################### | |
314 | |
315 MsFileDb$methods( getChromCol = function(molid = NULL) { | |
316 | |
317 # Init db | |
318 .self$.init.db() | |
319 | |
320 # Get database | |
321 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)] | |
322 | |
323 # Filter on molecule IDs | |
324 if ( ! is.null(molid)) | |
325 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
326 | |
327 # Get column names | |
328 cols <- db[[MSDB.TAG.COL]] | |
329 | |
330 # Remove duplicates | |
331 cols <- cols[ ! duplicated(cols)] | |
332 | |
333 # Make data frame | |
334 cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE) | |
335 | |
336 return(cols) | |
337 }) | |
338 | |
339 ################ | |
340 # GET NB PEAKS # | |
341 ################ | |
342 | |
343 # Get the total number of MS peaks stored inside the database. | |
344 # molid The ID of the molecule. | |
345 # type The MS type. | |
346 MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
347 | |
348 # Init db | |
349 .self$.init.db() | |
350 | |
351 # Get database | |
352 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)] | |
353 | |
354 # Filter on mode | |
355 if ( ! is.null(type) && ! is.na(type)) | |
356 db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] | |
357 | |
358 # Filter on molecule IDs | |
359 if ( ! is.null(molid) && ! is.na(molid)) | |
360 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
361 | |
362 # Get mz values | |
363 mz <- db[[MSDB.TAG.MZTHEO]] | |
364 | |
365 # Count number of unique values | |
366 n <- sum(as.integer(! duplicated(mz))) | |
367 | |
368 return(n) | |
369 }) | |
370 | |
371 ########## | |
372 # SEARCH # | |
373 ########## | |
374 | |
375 MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
376 | |
377 # Init db | |
378 .self$.init.db() | |
379 db <- .self$.db | |
380 | |
381 # Filter on mode | |
382 if ( ! is.null(mode) && ! is.na(mode)) | |
383 db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] | |
384 | |
385 # Filter on molecule IDs | |
386 if ( ! is.null(molids)) | |
387 db <- db[db[[MSDB.TAG.MOLID]] %in% molids,] | |
388 | |
389 # Filter on attributions | |
390 if ( ! is.null(attribs) && ! is.na(attribs)) | |
391 db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,] | |
392 | |
393 # Filter on columns | |
394 if ( ! is.null(col) && ! is.na(col)) | |
395 db <- db[db[[MSDB.TAG.COL]] %in% col,] | |
396 | |
397 # Filter on retention time | |
398 if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) { | |
399 scale <- if (.self$getRtUnit() == MSDB.RTUNIT.MIN) 60 else 1 | |
400 db <- db[db[[MSDB.TAG.COLRT]] * scale >= rt.low & db[[MSDB.TAG.COLRT]] * scale <= rt.high, ] | |
401 } | |
402 | |
403 # Remove retention times and column information | |
404 if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) { | |
405 db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] | |
406 | |
407 # Remove duplicates | |
408 db <- db[ ! duplicated(db), ] | |
409 } | |
410 | |
411 # Filter on mz | |
412 db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ] | |
413 | |
414 return(db) | |
415 }) | |
416 | |
417 ################# | |
418 # GET MZ VALUES # | |
419 ################# | |
420 | |
421 # Returns a numeric vector of all masses stored inside the database. | |
422 MsFileDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { | |
423 | |
424 # Init db | |
425 .self$.init.db() | |
426 db <- .self$.db | |
427 | |
428 # Filter on mode | |
429 if ( ! is.null(mode) && ! is.na(mode)) { | |
430 mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg | |
431 selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag) | |
432 db <- db[selected.lines, ] | |
433 } | |
434 | |
435 # Get masses | |
436 mz <- .self$.get(db, col = MSDB.TAG.MZTHEO) | |
437 | |
438 # Remove duplicates | |
439 mz <- mz[ ! duplicated(mz)] | |
440 | |
441 # Apply cut-off | |
442 if ( ! is.na(max.results)) | |
443 mz <- mz[1:max.results] | |
444 | |
445 return(mz) | |
446 }) | |
447 | |
448 ####################### | |
449 # GET RETENTION TIMES # | |
450 ####################### | |
451 | |
452 # Get the retention times of a molecule. | |
453 # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list. | |
454 MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
455 | |
456 if (is.null(molid) || is.na(molid)) | |
457 return(list()) | |
458 | |
459 # Init db | |
460 .self$.init.db() | |
461 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)] | |
462 | |
463 # Filter on molecule ID | |
464 if ( ! is.null(molid) && ! is.na(molid)) | |
465 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
466 | |
467 # Remove duplicates | |
468 db <- db[! duplicated(db), ] | |
469 | |
470 # Build retention time list | |
471 rt <- list() | |
472 cols <- db[[MSDB.TAG.COL]] | |
473 cols <- cols[ ! duplicated(cols)] | |
474 for (col in cols) { | |
475 colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT] | |
476 rt[col] <- list(colrts) | |
477 } | |
478 | |
479 if (.self$getRtUnit() == MSDB.RTUNIT.MIN) | |
480 rt <- 60 * rt | |
481 | |
482 return(rt) | |
483 }) | |
484 | |
485 } # end of load safe guard |