Mercurial > repos > prog > lcmsmatching
comparison MsFileDb.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
| author | prog |
|---|---|
| date | Fri, 22 Feb 2019 16:04:22 -0500 |
| parents | fb9c0409d85c |
| children |
comparison
equal
deleted
inserted
replaced
| 5:fb9c0409d85c | 6:f86fec07f392 |
|---|---|
| 1 if ( ! exists('MsFileDb')) { # Do not load again if already loaded | |
| 2 | |
| 3 library('methods') | |
| 4 source('MsDb.R') | |
| 5 source('msdb-common.R') | |
| 6 source('search.R', chdir = TRUE) | |
| 7 | |
| 8 ##################### | |
| 9 # CLASS DECLARATION # | |
| 10 ##################### | |
| 11 | |
| 12 MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY")) | |
| 13 | |
| 14 ############### | |
| 15 # CONSTRUCTOR # | |
| 16 ############### | |
| 17 | |
| 18 MsFileDb$methods( initialize = function(file = NA_character_, ...) { | |
| 19 | |
| 20 # Initialize members | |
| 21 .file <<- if ( ! is.null(file)) file else NA_character_ | |
| 22 .db <<- NULL | |
| 23 .fields <<- msdb.get.dft.db.fields() | |
| 24 .modes <<- MSDB.DFT.MODES | |
| 25 .name.to.id <<- NULL | |
| 26 | |
| 27 callSuper(...) | |
| 28 }) | |
| 29 | |
| 30 ################# | |
| 31 # SET DB FIELDS # | |
| 32 ################# | |
| 33 | |
| 34 MsFileDb$methods( areDbFieldsSettable = function() { | |
| 35 return(TRUE) | |
| 36 }) | |
| 37 | |
| 38 MsFileDb$methods( setDbFields = function(fields) { | |
| 39 .fields <<- as.list(fields) | |
| 40 }) | |
| 41 | |
| 42 ################ | |
| 43 # CHECK FIELDS # | |
| 44 ################ | |
| 45 | |
| 46 MsFileDb$methods( .check.fields = function(fields) { | |
| 47 | |
| 48 if (is.null(fields)) | |
| 49 stop("No fields specified for .check.fields()") | |
| 50 | |
| 51 # Check that fields are defined in the fields list | |
| 52 unknown <- fields[ ! fields %in% names(.self$.fields)] | |
| 53 if (length(unknown) > 0) | |
| 54 stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined.")) | |
| 55 | |
| 56 # Check that field values are real columns inside the database | |
| 57 .self$.init.db() | |
| 58 db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '') | |
| 59 unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)] | |
| 60 if (length(unknown.cols) > 0) | |
| 61 stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\".")) | |
| 62 }) | |
| 63 | |
| 64 ################ | |
| 65 # SET MS MODES # | |
| 66 ################ | |
| 67 | |
| 68 MsFileDb$methods( areDbMsModesSettable = function() { | |
| 69 return(TRUE) | |
| 70 }) | |
| 71 | |
| 72 MsFileDb$methods( setDbMsModes = function(modes) { | |
| 73 .modes <<- as.list(modes) | |
| 74 }) | |
| 75 | |
| 76 ########### | |
| 77 # INIT DB # | |
| 78 ########### | |
| 79 | |
| 80 MsFileDb$methods( .init.db = function() { | |
| 81 | |
| 82 if (is.null(.self$.db)) { | |
| 83 | |
| 84 # Load database | |
| 85 .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL, check.names = FALSE, comment.char = '') | |
| 86 | |
| 87 # Check that colnames are unique | |
| 88 dupcol <- duplicated(colnames(.self$.db)) | |
| 89 if (any(dupcol)) | |
| 90 stop(paste("Database header contains duplicated names: ", paste(unique(colnames(.self$.db)[dupcol]), collapse = ', '), ".")) | |
| 91 | |
| 92 # Check that columns names supplied through field map are unique | |
| 93 dupfields <- duplicated(.self$.fields) | |
| 94 if (any(dupfields)) | |
| 95 stop(paste("Some db column names supplied are duplicated: ", paste(unique(.self$.fields[dupfields]), collapse = ', '), ".")) | |
| 96 | |
| 97 # Rename columns | |
| 98 colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') | |
| 99 } | |
| 100 }) | |
| 101 | |
| 102 ############ | |
| 103 # GET DATA # | |
| 104 ############ | |
| 105 | |
| 106 MsFileDb$methods( .get = function(db = NULL, col = NULL) { | |
| 107 | |
| 108 # Init db | |
| 109 if (is.null(db)) { | |
| 110 .self$.init.db() | |
| 111 db <- .self$.db | |
| 112 } | |
| 113 | |
| 114 # Check fields | |
| 115 .self$.check.fields(col) | |
| 116 | |
| 117 # Get database columns | |
| 118 # db.cols <- unlist(.self$.fields[col]) | |
| 119 | |
| 120 return(db[, col]) | |
| 121 }) | |
| 122 | |
| 123 ########### | |
| 124 # GET ROW # | |
| 125 ########### | |
| 126 | |
| 127 MsFileDb$methods( .get.row = function(row, cols = NULL) { | |
| 128 | |
| 129 # Init db | |
| 130 .self$.init.db() | |
| 131 | |
| 132 # Check fields | |
| 133 if ( ! is.null(cols)) | |
| 134 .self$.check.fields(cols) | |
| 135 | |
| 136 if ( ! is.null(cols)) { | |
| 137 #cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '') | |
| 138 return(.self$.db[row, cols]) | |
| 139 } | |
| 140 | |
| 141 return(.self$.db[row, ]) | |
| 142 }) | |
| 143 | |
| 144 ########### | |
| 145 # GET COL # | |
| 146 ########### | |
| 147 | |
| 148 MsFileDb$methods( .get.col = function(col) { | |
| 149 | |
| 150 # Init db | |
| 151 .self$.init.db() | |
| 152 | |
| 153 # Check fields | |
| 154 .self$.check.fields(col) | |
| 155 | |
| 156 #return(.self$.db[[.self$.fields[[col]]]]) | |
| 157 return(.self$.db[[col]]) | |
| 158 }) | |
| 159 | |
| 160 #################### | |
| 161 # GET MOLECULE IDS # | |
| 162 #################### | |
| 163 | |
| 164 MsFileDb$methods( getMoleculeIds = function(max.results = NA_integer_) { | |
| 165 | |
| 166 # Init db | |
| 167 .self$.init.db() | |
| 168 | |
| 169 # Get IDs | |
| 170 mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID)) | |
| 171 mol.ids <- mol.ids[ ! duplicated(mol.ids)] | |
| 172 mol.ids <- sort(mol.ids) | |
| 173 | |
| 174 # Cut results | |
| 175 if ( ! is.na(max.results) && length(mol.ids) > max.results) | |
| 176 mol.ids <- mol.ids[1:max.results] | |
| 177 | |
| 178 return(mol.ids) | |
| 179 }) | |
| 180 | |
| 181 #################### | |
| 182 # GET NB MOLECULES # | |
| 183 #################### | |
| 184 | |
| 185 # Returns the number of molecules in the database. | |
| 186 MsFileDb$methods( getNbMolecules = function() { | |
| 187 | |
| 188 # Init db | |
| 189 .self$.init.db() | |
| 190 | |
| 191 # Get IDs | |
| 192 mol.ids <- .self$.get.col(MSDB.TAG.MOLID) | |
| 193 mol.ids <- mol.ids[ ! duplicated(mol.ids)] | |
| 194 | |
| 195 return(length(mol.ids)) | |
| 196 }) | |
| 197 | |
| 198 ##################### | |
| 199 # GET MOLECULE NAME # | |
| 200 ##################### | |
| 201 | |
| 202 MsFileDb$methods( .get.name.from.id = function(db, id) { | |
| 203 | |
| 204 if(is.na(id)) | |
| 205 return(NA_character_) | |
| 206 | |
| 207 # Get names | |
| 208 names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES] | |
| 209 if (length(names) == 0) | |
| 210 return(NA_character_) | |
| 211 | |
| 212 # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one. | |
| 213 name <- strsplit(names, ';')[[1]][[1]] | |
| 214 | |
| 215 return(name) | |
| 216 }) | |
| 217 | |
| 218 # Get molecule names | |
| 219 # molid An integer vector of molecule IDs. | |
| 220 # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector. | |
| 221 MsFileDb$methods( getMoleculeName = function(molid) { | |
| 222 | |
| 223 if (is.null(molid)) | |
| 224 return(NA_character_) | |
| 225 | |
| 226 # Init db | |
| 227 .self$.init.db() | |
| 228 | |
| 229 # Get database | |
| 230 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] | |
| 231 | |
| 232 # Remove duplicates | |
| 233 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] | |
| 234 | |
| 235 # Look for ids | |
| 236 names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '') | |
| 237 | |
| 238 return(names) | |
| 239 }) | |
| 240 | |
| 241 ################### | |
| 242 # INIT NAME TO ID # | |
| 243 ################### | |
| 244 | |
| 245 MsFileDb$methods( .init.name.to.id = function() { | |
| 246 | |
| 247 if (is.null(.self$.name.to.id)) { | |
| 248 | |
| 249 # Create data frame | |
| 250 .name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE) | |
| 251 | |
| 252 # Init db | |
| 253 .self$.init.db() | |
| 254 | |
| 255 # Get database subset (columns name and id only). | |
| 256 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] | |
| 257 | |
| 258 # Remove duplicate IDs | |
| 259 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] | |
| 260 | |
| 261 # Loop on all | |
| 262 for(i in seq(db[[MSDB.TAG.MOLID]])) { | |
| 263 i.id <- db[i, MSDB.TAG.MOLID] | |
| 264 i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE) | |
| 265 .name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE)) | |
| 266 } | |
| 267 | |
| 268 # Order by name | |
| 269 .name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ] | |
| 270 } | |
| 271 }) | |
| 272 | |
| 273 #################### | |
| 274 # GET ID FROM NAME # | |
| 275 #################### | |
| 276 | |
| 277 MsFileDb$methods( .get.id.from.name = function(name) { | |
| 278 | |
| 279 # Initialize name.to.id search tree | |
| 280 .self$.init.name.to.id() | |
| 281 | |
| 282 # Search for name | |
| 283 i <- binary.search(toupper(name), .self$.name.to.id[['name']]) | |
| 284 | |
| 285 # Get ID | |
| 286 id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id']) | |
| 287 | |
| 288 return(id) | |
| 289 }) | |
| 290 | |
| 291 ################ | |
| 292 # FIND BY NAME # | |
| 293 ################ | |
| 294 | |
| 295 # Find a molecule by name | |
| 296 # name A vector of molecule names to search for. | |
| 297 # Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order. | |
| 298 MsFileDb$methods( findByName = function(name) { | |
| 299 | |
| 300 if (is.null(name)) | |
| 301 return(NA_character_) | |
| 302 | |
| 303 # Look for molecules with this name | |
| 304 ids <- list() | |
| 305 for (n in name) | |
| 306 ids <- c(ids, list(.self$.get.id.from.name(n))) | |
| 307 | |
| 308 return(ids) | |
| 309 }) | |
| 310 | |
| 311 ############################### | |
| 312 # GET CHROMATOGRAPHIC COLUMNS # | |
| 313 ############################### | |
| 314 | |
| 315 MsFileDb$methods( getChromCol = function(molid = NULL) { | |
| 316 | |
| 317 # Init db | |
| 318 .self$.init.db() | |
| 319 | |
| 320 # Get database | |
| 321 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)] | |
| 322 | |
| 323 # Filter on molecule IDs | |
| 324 if ( ! is.null(molid)) | |
| 325 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
| 326 | |
| 327 # Get column names | |
| 328 cols <- db[[MSDB.TAG.COL]] | |
| 329 | |
| 330 # Remove duplicates | |
| 331 cols <- cols[ ! duplicated(cols)] | |
| 332 | |
| 333 # Make data frame | |
| 334 cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE) | |
| 335 | |
| 336 return(cols) | |
| 337 }) | |
| 338 | |
| 339 ################ | |
| 340 # GET NB PEAKS # | |
| 341 ################ | |
| 342 | |
| 343 # Get the total number of MS peaks stored inside the database. | |
| 344 # molid The ID of the molecule. | |
| 345 # type The MS type. | |
| 346 MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
| 347 | |
| 348 # Init db | |
| 349 .self$.init.db() | |
| 350 | |
| 351 # Get database | |
| 352 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)] | |
| 353 | |
| 354 # Filter on mode | |
| 355 if ( ! is.null(type) && ! is.na(type)) | |
| 356 db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] | |
| 357 | |
| 358 # Filter on molecule IDs | |
| 359 if ( ! is.null(molid) && ! is.na(molid)) | |
| 360 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
| 361 | |
| 362 # Get mz values | |
| 363 mz <- db[[MSDB.TAG.MZTHEO]] | |
| 364 | |
| 365 # Count number of unique values | |
| 366 n <- sum(as.integer(! duplicated(mz))) | |
| 367 | |
| 368 return(n) | |
| 369 }) | |
| 370 | |
| 371 ########## | |
| 372 # SEARCH # | |
| 373 ########## | |
| 374 | |
| 375 MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
| 376 | |
| 377 # Init db | |
| 378 .self$.init.db() | |
| 379 db <- .self$.db | |
| 380 | |
| 381 # Filter on mode | |
| 382 if ( ! is.null(mode) && ! is.na(mode)) | |
| 383 db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] | |
| 384 | |
| 385 # Filter on molecule IDs | |
| 386 if ( ! is.null(molids)) | |
| 387 db <- db[db[[MSDB.TAG.MOLID]] %in% molids,] | |
| 388 | |
| 389 # Filter on attributions | |
| 390 if ( ! is.null(attribs) && ! is.na(attribs)) | |
| 391 db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,] | |
| 392 | |
| 393 # Filter on columns | |
| 394 if ( ! is.null(col) && ! is.na(col)) | |
| 395 db <- db[db[[MSDB.TAG.COL]] %in% col,] | |
| 396 | |
| 397 # Filter on retention time | |
| 398 if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) { | |
| 399 scale <- if (.self$getRtUnit() == MSDB.RTUNIT.MIN) 60 else 1 | |
| 400 db <- db[db[[MSDB.TAG.COLRT]] * scale >= rt.low & db[[MSDB.TAG.COLRT]] * scale <= rt.high, ] | |
| 401 } | |
| 402 | |
| 403 # Remove retention times and column information | |
| 404 if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) { | |
| 405 db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] | |
| 406 | |
| 407 # Remove duplicates | |
| 408 db <- db[ ! duplicated(db), ] | |
| 409 } | |
| 410 | |
| 411 # Filter on mz | |
| 412 db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ] | |
| 413 | |
| 414 return(db) | |
| 415 }) | |
| 416 | |
| 417 ################# | |
| 418 # GET MZ VALUES # | |
| 419 ################# | |
| 420 | |
| 421 # Returns a numeric vector of all masses stored inside the database. | |
| 422 MsFileDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { | |
| 423 | |
| 424 # Init db | |
| 425 .self$.init.db() | |
| 426 db <- .self$.db | |
| 427 | |
| 428 # Filter on mode | |
| 429 if ( ! is.null(mode) && ! is.na(mode)) { | |
| 430 mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg | |
| 431 selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag) | |
| 432 db <- db[selected.lines, ] | |
| 433 } | |
| 434 | |
| 435 # Get masses | |
| 436 mz <- .self$.get(db, col = MSDB.TAG.MZTHEO) | |
| 437 | |
| 438 # Remove duplicates | |
| 439 mz <- mz[ ! duplicated(mz)] | |
| 440 | |
| 441 # Apply cut-off | |
| 442 if ( ! is.na(max.results)) | |
| 443 mz <- mz[1:max.results] | |
| 444 | |
| 445 return(mz) | |
| 446 }) | |
| 447 | |
| 448 ####################### | |
| 449 # GET RETENTION TIMES # | |
| 450 ####################### | |
| 451 | |
| 452 # Get the retention times of a molecule. | |
| 453 # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list. | |
| 454 MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
| 455 | |
| 456 if (is.null(molid) || is.na(molid)) | |
| 457 return(list()) | |
| 458 | |
| 459 # Init db | |
| 460 .self$.init.db() | |
| 461 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)] | |
| 462 | |
| 463 # Filter on molecule ID | |
| 464 if ( ! is.null(molid) && ! is.na(molid)) | |
| 465 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
| 466 | |
| 467 # Remove duplicates | |
| 468 db <- db[! duplicated(db), ] | |
| 469 | |
| 470 # Build retention time list | |
| 471 rt <- list() | |
| 472 cols <- db[[MSDB.TAG.COL]] | |
| 473 cols <- cols[ ! duplicated(cols)] | |
| 474 for (col in cols) { | |
| 475 colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT] | |
| 476 rt[col] <- list(colrts) | |
| 477 } | |
| 478 | |
| 479 if (.self$getRtUnit() == MSDB.RTUNIT.MIN) | |
| 480 rt <- 60 * rt | |
| 481 | |
| 482 return(rt) | |
| 483 }) | |
| 484 | |
| 485 } # end of load safe guard |
