Mercurial > repos > prog > lcmsmatching
comparison MsFileDb.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author | prog |
---|---|
date | Tue, 12 Jul 2016 12:02:37 -0400 |
parents | |
children | 20d69a062da3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e66bb061af06 |
---|---|
1 if ( ! exists('MsFileDb')) { # Do not load again if already loaded | |
2 | |
3 library('methods') | |
4 source('MsDb.R') | |
5 source('msdb-common.R') | |
6 source('search.R', chdir = TRUE) | |
7 | |
8 ##################### | |
9 # CLASS DECLARATION # | |
10 ##################### | |
11 | |
12 MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY")) | |
13 | |
14 ############### | |
15 # CONSTRUCTOR # | |
16 ############### | |
17 | |
18 MsFileDb$methods( initialize = function(file = NA_character_, ...) { | |
19 | |
20 # Initialize members | |
21 .file <<- if ( ! is.null(file)) file else NA_character_ | |
22 .db <<- NULL | |
23 .fields <<- msdb.get.dft.db.fields() | |
24 .modes <<- MSDB.DFT.MODES | |
25 .name.to.id <<- NULL | |
26 | |
27 callSuper(...) | |
28 }) | |
29 | |
30 ################# | |
31 # SET DB FIELDS # | |
32 ################# | |
33 | |
34 MsFileDb$methods( areDbFieldsSettable = function() { | |
35 return(TRUE) | |
36 }) | |
37 | |
38 MsFileDb$methods( setDbFields = function(fields) { | |
39 .fields <<- as.list(fields) | |
40 }) | |
41 | |
42 ################ | |
43 # CHECK FIELDS # | |
44 ################ | |
45 | |
46 MsFileDb$methods( .check.fields = function(fields) { | |
47 | |
48 if (is.null(fields)) | |
49 stop("No fields specified for .check.fields()") | |
50 | |
51 # Check that fields are defined in the fields list | |
52 unknown <- fields[ ! fields %in% names(.self$.fields)] | |
53 if (length(unknown) > 0) | |
54 stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined.")) | |
55 | |
56 # Check that field values are real columns inside the database | |
57 .self$.init.db() | |
58 db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '') | |
59 unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)] | |
60 if (length(unknown.cols) > 0) | |
61 stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\".")) | |
62 }) | |
63 | |
64 ################ | |
65 # SET MS MODES # | |
66 ################ | |
67 | |
68 MsFileDb$methods( areDbMsModesSettable = function() { | |
69 return(TRUE) | |
70 }) | |
71 | |
72 MsFileDb$methods( setDbMsModes = function(modes) { | |
73 .modes <<- as.list(modes) | |
74 }) | |
75 | |
76 ########### | |
77 # INIT DB # | |
78 ########### | |
79 | |
80 MsFileDb$methods( .init.db = function() { | |
81 | |
82 if (is.null(.self$.db)) { | |
83 | |
84 # Load database | |
85 .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL) | |
86 | |
87 # Rename columns | |
88 colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') | |
89 } | |
90 }) | |
91 | |
92 ############ | |
93 # GET DATA # | |
94 ############ | |
95 | |
96 MsFileDb$methods( .get = function(db = NULL, col = NULL) { | |
97 | |
98 # Init db | |
99 if (is.null(db)) { | |
100 .self$.init.db() | |
101 db <- .self$.db | |
102 } | |
103 | |
104 # Check fields | |
105 .self$.check.fields(col) | |
106 | |
107 # Get database columns | |
108 # db.cols <- unlist(.self$.fields[col]) | |
109 | |
110 return(db[, col]) | |
111 }) | |
112 | |
113 ########### | |
114 # GET ROW # | |
115 ########### | |
116 | |
117 MsFileDb$methods( .get.row = function(row, cols = NULL) { | |
118 | |
119 # Init db | |
120 .self$.init.db() | |
121 | |
122 # Check fields | |
123 if ( ! is.null(cols)) | |
124 .self$.check.fields(cols) | |
125 | |
126 if ( ! is.null(cols)) { | |
127 #cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '') | |
128 return(.self$.db[row, cols]) | |
129 } | |
130 | |
131 return(.self$.db[row, ]) | |
132 }) | |
133 | |
134 ########### | |
135 # GET COL # | |
136 ########### | |
137 | |
138 MsFileDb$methods( .get.col = function(col) { | |
139 | |
140 # Init db | |
141 .self$.init.db() | |
142 | |
143 # Check fields | |
144 .self$.check.fields(col) | |
145 | |
146 #return(.self$.db[[.self$.fields[[col]]]]) | |
147 return(.self$.db[[col]]) | |
148 }) | |
149 | |
150 #################### | |
151 # GET MOLECULE IDS # | |
152 #################### | |
153 | |
154 MsFileDb$methods( getMoleculeIds = function() { | |
155 | |
156 # Init db | |
157 .self$.init.db() | |
158 | |
159 # Get IDs | |
160 mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID)) | |
161 mol.ids <- mol.ids[ ! duplicated(mol.ids)] | |
162 mol.ids <- sort(mol.ids) | |
163 | |
164 return(mol.ids) | |
165 }) | |
166 | |
167 #################### | |
168 # GET NB MOLECULES # | |
169 #################### | |
170 | |
171 # Returns the number of molecules in the database. | |
172 MsFileDb$methods( getNbMolecules = function() { | |
173 | |
174 # Init db | |
175 .self$.init.db() | |
176 | |
177 # Get IDs | |
178 mol.ids <- .self$.get.col(MSDB.TAG.MOLID) | |
179 mol.ids <- mol.ids[ ! duplicated(mol.ids)] | |
180 | |
181 return(length(mol.ids)) | |
182 }) | |
183 | |
184 ##################### | |
185 # GET MOLECULE NAME # | |
186 ##################### | |
187 | |
188 MsFileDb$methods( .get.name.from.id = function(db, id) { | |
189 | |
190 if(is.na(id)) | |
191 return(NA_character_) | |
192 | |
193 # Get names | |
194 names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES] | |
195 if (length(names) == 0) | |
196 return(NA_character_) | |
197 | |
198 # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one. | |
199 name <- strsplit(names, ';')[[1]][[1]] | |
200 | |
201 return(name) | |
202 }) | |
203 | |
204 # Get molecule names | |
205 # molid An integer vector of molecule IDs. | |
206 # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector. | |
207 MsFileDb$methods( getMoleculeName = function(molid) { | |
208 | |
209 if (is.null(molid)) | |
210 return(NA_character_) | |
211 | |
212 # Init db | |
213 .self$.init.db() | |
214 | |
215 # Get database | |
216 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] | |
217 | |
218 # Remove duplicates | |
219 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] | |
220 | |
221 # Look for ids | |
222 names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '') | |
223 | |
224 return(names) | |
225 }) | |
226 | |
227 ################### | |
228 # INIT NAME TO ID # | |
229 ################### | |
230 | |
231 MsFileDb$methods( .init.name.to.id = function() { | |
232 | |
233 if (is.null(.self$.name.to.id)) { | |
234 | |
235 # Create data frame | |
236 .name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE) | |
237 | |
238 # Init db | |
239 .self$.init.db() | |
240 | |
241 # Get database subset (columns name and id only). | |
242 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] | |
243 | |
244 # Remove duplicate IDs | |
245 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] | |
246 | |
247 # Loop on all | |
248 for(i in seq(db[[MSDB.TAG.MOLID]])) { | |
249 i.id <- db[i, MSDB.TAG.MOLID] | |
250 i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE) | |
251 .name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE)) | |
252 } | |
253 | |
254 # Order by name | |
255 .name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ] | |
256 } | |
257 }) | |
258 | |
259 #################### | |
260 # GET ID FROM NAME # | |
261 #################### | |
262 | |
263 MsFileDb$methods( .get.id.from.name = function(name) { | |
264 | |
265 # Initialize name.to.id search tree | |
266 .self$.init.name.to.id() | |
267 | |
268 # Search for name | |
269 i <- binary.search(toupper(name), .self$.name.to.id[['name']]) | |
270 | |
271 # Get ID | |
272 id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id']) | |
273 | |
274 return(id) | |
275 }) | |
276 | |
277 ################ | |
278 # FIND BY NAME # | |
279 ################ | |
280 | |
281 # Find a molecule by name | |
282 # name A vector of molecule names to search for. | |
283 # Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order. | |
284 MsFileDb$methods( findByName = function(name) { | |
285 | |
286 if (is.null(name)) | |
287 return(NA_character_) | |
288 | |
289 # Look for molecules with this name | |
290 ids <- list() | |
291 for (n in name) | |
292 ids <- c(ids, list(.self$.get.id.from.name(n))) | |
293 | |
294 return(ids) | |
295 }) | |
296 | |
297 ############################### | |
298 # GET CHROMATOGRAPHIC COLUMNS # | |
299 ############################### | |
300 | |
301 MsFileDb$methods( getChromCol = function(molid = NULL) { | |
302 | |
303 # Init db | |
304 .self$.init.db() | |
305 | |
306 # Get database | |
307 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)] | |
308 | |
309 # Filter on molecule IDs | |
310 if ( ! is.null(molid)) | |
311 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
312 | |
313 # Get column names | |
314 cols <- db[[MSDB.TAG.COL]] | |
315 | |
316 # Remove duplicates | |
317 cols <- cols[ ! duplicated(cols)] | |
318 | |
319 # Make data frame | |
320 cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE) | |
321 | |
322 return(cols) | |
323 }) | |
324 | |
325 ################ | |
326 # GET NB PEAKS # | |
327 ################ | |
328 | |
329 # Get the total number of MS peaks stored inside the database. | |
330 # molid The ID of the molecule. | |
331 # type The MS type. | |
332 MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
333 | |
334 # Init db | |
335 .self$.init.db() | |
336 | |
337 # Get database | |
338 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)] | |
339 | |
340 # Filter on mode | |
341 if ( ! is.null(type) && ! is.na(type)) | |
342 db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] | |
343 | |
344 # Filter on molecule IDs | |
345 if ( ! is.null(molid) && ! is.na(molid)) | |
346 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
347 | |
348 # Get mz values | |
349 mz <- db[[MSDB.TAG.MZTHEO]] | |
350 | |
351 # Count number of unique values | |
352 n <- sum(as.integer(! duplicated(mz))) | |
353 | |
354 return(n) | |
355 }) | |
356 | |
357 ########## | |
358 # SEARCH # | |
359 ########## | |
360 | |
361 MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
362 | |
363 # Init db | |
364 .self$.init.db() | |
365 db <- .self$.db | |
366 | |
367 # Filter on mode | |
368 if ( ! is.null(mode) && ! is.na(mode)) | |
369 db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] | |
370 | |
371 # Filter on molecule IDs | |
372 if ( ! is.null(molids)) | |
373 db <- db[db[[MSDB.TAG.MOLID]] %in% molids,] | |
374 | |
375 # Filter on attributions | |
376 if ( ! is.null(attribs) && ! is.na(attribs)) | |
377 db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,] | |
378 | |
379 # Filter on columns | |
380 if ( ! is.null(col) && ! is.na(col)) | |
381 db <- db[db[[MSDB.TAG.COL]] %in% col,] | |
382 | |
383 # Filter on retention time | |
384 if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) | |
385 db <- db[db[[MSDB.TAG.COLRT]] >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ] | |
386 | |
387 # Remove retention times and column information | |
388 if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) { | |
389 db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] | |
390 | |
391 # Remove duplicates | |
392 db <- db[ ! duplicated(db), ] | |
393 } | |
394 | |
395 # Filter on mz | |
396 db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ] | |
397 | |
398 # Rename database fields | |
399 # conv <- c( mz = 'mztheo', rt = 'colrt') # solving mismatch of field names between database and output | |
400 # cols <- colnames(db) | |
401 # for (db.field in names(.self$.fields)) { | |
402 # output.field <- if (db.field %in% names(conv)) conv[[db.field]] else db.field | |
403 # if (.self$.fields[[db.field]] %in% cols && output.field %in% names(.self$.output.fields)) | |
404 # cols[cols %in% .self$.fields[[db.field]]] <- .self$.output.fields[[output.field]] | |
405 # } | |
406 # colnames(db) <- cols | |
407 | |
408 # Remove unwanted columns | |
409 # db <- db[, colnames(db) %in% .self$.output.fields] | |
410 | |
411 return(db) | |
412 }) | |
413 | |
414 ################# | |
415 # GET MZ VALUES # | |
416 ################# | |
417 | |
418 # Returns a numeric vector of all masses stored inside the database. | |
419 MsFileDb$methods( getMzValues = function(mode = NULL) { | |
420 | |
421 # Init db | |
422 .self$.init.db() | |
423 db <- .self$.db | |
424 | |
425 # Filter on mode | |
426 if ( ! is.null(mode) && ! is.na(mode)) { | |
427 mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg | |
428 selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag) | |
429 db <- db[selected.lines, ] | |
430 } | |
431 | |
432 # Get masses | |
433 mz <- .self$.get(db, col = MSDB.TAG.MZTHEO) | |
434 | |
435 # Remove duplicates | |
436 mz <- mz[ ! duplicated(mz)] | |
437 | |
438 return(mz) | |
439 }) | |
440 | |
441 ####################### | |
442 # GET RETENTION TIMES # | |
443 ####################### | |
444 | |
445 # Get the retention times of a molecule. | |
446 # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list. | |
447 MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
448 | |
449 if (is.null(molid) || is.na(molid)) | |
450 return(list()) | |
451 | |
452 # Init db | |
453 .self$.init.db() | |
454 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)] | |
455 | |
456 # Filter on molecule ID | |
457 if ( ! is.null(molid) && ! is.na(molid)) | |
458 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
459 | |
460 # Remove duplicates | |
461 db <- db[! duplicated(db), ] | |
462 | |
463 # Build retention time list | |
464 rt <- list() | |
465 cols <- db[[MSDB.TAG.COL]] | |
466 cols <- cols[ ! duplicated(cols)] | |
467 for (col in cols) { | |
468 colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT] | |
469 rt[col] <- list(colrts) | |
470 } | |
471 | |
472 return(rt) | |
473 }) | |
474 | |
475 } # end of load safe guard |