Mercurial > repos > prog > lcmsmatching
comparison MsFileDb.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
| author | prog |
|---|---|
| date | Tue, 12 Jul 2016 12:02:37 -0400 |
| parents | |
| children | 20d69a062da3 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e66bb061af06 |
|---|---|
| 1 if ( ! exists('MsFileDb')) { # Do not load again if already loaded | |
| 2 | |
| 3 library('methods') | |
| 4 source('MsDb.R') | |
| 5 source('msdb-common.R') | |
| 6 source('search.R', chdir = TRUE) | |
| 7 | |
| 8 ##################### | |
| 9 # CLASS DECLARATION # | |
| 10 ##################### | |
| 11 | |
| 12 MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY")) | |
| 13 | |
| 14 ############### | |
| 15 # CONSTRUCTOR # | |
| 16 ############### | |
| 17 | |
| 18 MsFileDb$methods( initialize = function(file = NA_character_, ...) { | |
| 19 | |
| 20 # Initialize members | |
| 21 .file <<- if ( ! is.null(file)) file else NA_character_ | |
| 22 .db <<- NULL | |
| 23 .fields <<- msdb.get.dft.db.fields() | |
| 24 .modes <<- MSDB.DFT.MODES | |
| 25 .name.to.id <<- NULL | |
| 26 | |
| 27 callSuper(...) | |
| 28 }) | |
| 29 | |
| 30 ################# | |
| 31 # SET DB FIELDS # | |
| 32 ################# | |
| 33 | |
| 34 MsFileDb$methods( areDbFieldsSettable = function() { | |
| 35 return(TRUE) | |
| 36 }) | |
| 37 | |
| 38 MsFileDb$methods( setDbFields = function(fields) { | |
| 39 .fields <<- as.list(fields) | |
| 40 }) | |
| 41 | |
| 42 ################ | |
| 43 # CHECK FIELDS # | |
| 44 ################ | |
| 45 | |
| 46 MsFileDb$methods( .check.fields = function(fields) { | |
| 47 | |
| 48 if (is.null(fields)) | |
| 49 stop("No fields specified for .check.fields()") | |
| 50 | |
| 51 # Check that fields are defined in the fields list | |
| 52 unknown <- fields[ ! fields %in% names(.self$.fields)] | |
| 53 if (length(unknown) > 0) | |
| 54 stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined.")) | |
| 55 | |
| 56 # Check that field values are real columns inside the database | |
| 57 .self$.init.db() | |
| 58 db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '') | |
| 59 unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)] | |
| 60 if (length(unknown.cols) > 0) | |
| 61 stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\".")) | |
| 62 }) | |
| 63 | |
| 64 ################ | |
| 65 # SET MS MODES # | |
| 66 ################ | |
| 67 | |
| 68 MsFileDb$methods( areDbMsModesSettable = function() { | |
| 69 return(TRUE) | |
| 70 }) | |
| 71 | |
| 72 MsFileDb$methods( setDbMsModes = function(modes) { | |
| 73 .modes <<- as.list(modes) | |
| 74 }) | |
| 75 | |
| 76 ########### | |
| 77 # INIT DB # | |
| 78 ########### | |
| 79 | |
| 80 MsFileDb$methods( .init.db = function() { | |
| 81 | |
| 82 if (is.null(.self$.db)) { | |
| 83 | |
| 84 # Load database | |
| 85 .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL) | |
| 86 | |
| 87 # Rename columns | |
| 88 colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') | |
| 89 } | |
| 90 }) | |
| 91 | |
| 92 ############ | |
| 93 # GET DATA # | |
| 94 ############ | |
| 95 | |
| 96 MsFileDb$methods( .get = function(db = NULL, col = NULL) { | |
| 97 | |
| 98 # Init db | |
| 99 if (is.null(db)) { | |
| 100 .self$.init.db() | |
| 101 db <- .self$.db | |
| 102 } | |
| 103 | |
| 104 # Check fields | |
| 105 .self$.check.fields(col) | |
| 106 | |
| 107 # Get database columns | |
| 108 # db.cols <- unlist(.self$.fields[col]) | |
| 109 | |
| 110 return(db[, col]) | |
| 111 }) | |
| 112 | |
| 113 ########### | |
| 114 # GET ROW # | |
| 115 ########### | |
| 116 | |
| 117 MsFileDb$methods( .get.row = function(row, cols = NULL) { | |
| 118 | |
| 119 # Init db | |
| 120 .self$.init.db() | |
| 121 | |
| 122 # Check fields | |
| 123 if ( ! is.null(cols)) | |
| 124 .self$.check.fields(cols) | |
| 125 | |
| 126 if ( ! is.null(cols)) { | |
| 127 #cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '') | |
| 128 return(.self$.db[row, cols]) | |
| 129 } | |
| 130 | |
| 131 return(.self$.db[row, ]) | |
| 132 }) | |
| 133 | |
| 134 ########### | |
| 135 # GET COL # | |
| 136 ########### | |
| 137 | |
| 138 MsFileDb$methods( .get.col = function(col) { | |
| 139 | |
| 140 # Init db | |
| 141 .self$.init.db() | |
| 142 | |
| 143 # Check fields | |
| 144 .self$.check.fields(col) | |
| 145 | |
| 146 #return(.self$.db[[.self$.fields[[col]]]]) | |
| 147 return(.self$.db[[col]]) | |
| 148 }) | |
| 149 | |
| 150 #################### | |
| 151 # GET MOLECULE IDS # | |
| 152 #################### | |
| 153 | |
| 154 MsFileDb$methods( getMoleculeIds = function() { | |
| 155 | |
| 156 # Init db | |
| 157 .self$.init.db() | |
| 158 | |
| 159 # Get IDs | |
| 160 mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID)) | |
| 161 mol.ids <- mol.ids[ ! duplicated(mol.ids)] | |
| 162 mol.ids <- sort(mol.ids) | |
| 163 | |
| 164 return(mol.ids) | |
| 165 }) | |
| 166 | |
| 167 #################### | |
| 168 # GET NB MOLECULES # | |
| 169 #################### | |
| 170 | |
| 171 # Returns the number of molecules in the database. | |
| 172 MsFileDb$methods( getNbMolecules = function() { | |
| 173 | |
| 174 # Init db | |
| 175 .self$.init.db() | |
| 176 | |
| 177 # Get IDs | |
| 178 mol.ids <- .self$.get.col(MSDB.TAG.MOLID) | |
| 179 mol.ids <- mol.ids[ ! duplicated(mol.ids)] | |
| 180 | |
| 181 return(length(mol.ids)) | |
| 182 }) | |
| 183 | |
| 184 ##################### | |
| 185 # GET MOLECULE NAME # | |
| 186 ##################### | |
| 187 | |
| 188 MsFileDb$methods( .get.name.from.id = function(db, id) { | |
| 189 | |
| 190 if(is.na(id)) | |
| 191 return(NA_character_) | |
| 192 | |
| 193 # Get names | |
| 194 names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES] | |
| 195 if (length(names) == 0) | |
| 196 return(NA_character_) | |
| 197 | |
| 198 # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one. | |
| 199 name <- strsplit(names, ';')[[1]][[1]] | |
| 200 | |
| 201 return(name) | |
| 202 }) | |
| 203 | |
| 204 # Get molecule names | |
| 205 # molid An integer vector of molecule IDs. | |
| 206 # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector. | |
| 207 MsFileDb$methods( getMoleculeName = function(molid) { | |
| 208 | |
| 209 if (is.null(molid)) | |
| 210 return(NA_character_) | |
| 211 | |
| 212 # Init db | |
| 213 .self$.init.db() | |
| 214 | |
| 215 # Get database | |
| 216 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] | |
| 217 | |
| 218 # Remove duplicates | |
| 219 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] | |
| 220 | |
| 221 # Look for ids | |
| 222 names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '') | |
| 223 | |
| 224 return(names) | |
| 225 }) | |
| 226 | |
| 227 ################### | |
| 228 # INIT NAME TO ID # | |
| 229 ################### | |
| 230 | |
| 231 MsFileDb$methods( .init.name.to.id = function() { | |
| 232 | |
| 233 if (is.null(.self$.name.to.id)) { | |
| 234 | |
| 235 # Create data frame | |
| 236 .name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE) | |
| 237 | |
| 238 # Init db | |
| 239 .self$.init.db() | |
| 240 | |
| 241 # Get database subset (columns name and id only). | |
| 242 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] | |
| 243 | |
| 244 # Remove duplicate IDs | |
| 245 db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] | |
| 246 | |
| 247 # Loop on all | |
| 248 for(i in seq(db[[MSDB.TAG.MOLID]])) { | |
| 249 i.id <- db[i, MSDB.TAG.MOLID] | |
| 250 i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE) | |
| 251 .name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE)) | |
| 252 } | |
| 253 | |
| 254 # Order by name | |
| 255 .name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ] | |
| 256 } | |
| 257 }) | |
| 258 | |
| 259 #################### | |
| 260 # GET ID FROM NAME # | |
| 261 #################### | |
| 262 | |
| 263 MsFileDb$methods( .get.id.from.name = function(name) { | |
| 264 | |
| 265 # Initialize name.to.id search tree | |
| 266 .self$.init.name.to.id() | |
| 267 | |
| 268 # Search for name | |
| 269 i <- binary.search(toupper(name), .self$.name.to.id[['name']]) | |
| 270 | |
| 271 # Get ID | |
| 272 id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id']) | |
| 273 | |
| 274 return(id) | |
| 275 }) | |
| 276 | |
| 277 ################ | |
| 278 # FIND BY NAME # | |
| 279 ################ | |
| 280 | |
| 281 # Find a molecule by name | |
| 282 # name A vector of molecule names to search for. | |
| 283 # Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order. | |
| 284 MsFileDb$methods( findByName = function(name) { | |
| 285 | |
| 286 if (is.null(name)) | |
| 287 return(NA_character_) | |
| 288 | |
| 289 # Look for molecules with this name | |
| 290 ids <- list() | |
| 291 for (n in name) | |
| 292 ids <- c(ids, list(.self$.get.id.from.name(n))) | |
| 293 | |
| 294 return(ids) | |
| 295 }) | |
| 296 | |
| 297 ############################### | |
| 298 # GET CHROMATOGRAPHIC COLUMNS # | |
| 299 ############################### | |
| 300 | |
| 301 MsFileDb$methods( getChromCol = function(molid = NULL) { | |
| 302 | |
| 303 # Init db | |
| 304 .self$.init.db() | |
| 305 | |
| 306 # Get database | |
| 307 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)] | |
| 308 | |
| 309 # Filter on molecule IDs | |
| 310 if ( ! is.null(molid)) | |
| 311 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
| 312 | |
| 313 # Get column names | |
| 314 cols <- db[[MSDB.TAG.COL]] | |
| 315 | |
| 316 # Remove duplicates | |
| 317 cols <- cols[ ! duplicated(cols)] | |
| 318 | |
| 319 # Make data frame | |
| 320 cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE) | |
| 321 | |
| 322 return(cols) | |
| 323 }) | |
| 324 | |
| 325 ################ | |
| 326 # GET NB PEAKS # | |
| 327 ################ | |
| 328 | |
| 329 # Get the total number of MS peaks stored inside the database. | |
| 330 # molid The ID of the molecule. | |
| 331 # type The MS type. | |
| 332 MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
| 333 | |
| 334 # Init db | |
| 335 .self$.init.db() | |
| 336 | |
| 337 # Get database | |
| 338 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)] | |
| 339 | |
| 340 # Filter on mode | |
| 341 if ( ! is.null(type) && ! is.na(type)) | |
| 342 db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] | |
| 343 | |
| 344 # Filter on molecule IDs | |
| 345 if ( ! is.null(molid) && ! is.na(molid)) | |
| 346 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
| 347 | |
| 348 # Get mz values | |
| 349 mz <- db[[MSDB.TAG.MZTHEO]] | |
| 350 | |
| 351 # Count number of unique values | |
| 352 n <- sum(as.integer(! duplicated(mz))) | |
| 353 | |
| 354 return(n) | |
| 355 }) | |
| 356 | |
| 357 ########## | |
| 358 # SEARCH # | |
| 359 ########## | |
| 360 | |
| 361 MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
| 362 | |
| 363 # Init db | |
| 364 .self$.init.db() | |
| 365 db <- .self$.db | |
| 366 | |
| 367 # Filter on mode | |
| 368 if ( ! is.null(mode) && ! is.na(mode)) | |
| 369 db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] | |
| 370 | |
| 371 # Filter on molecule IDs | |
| 372 if ( ! is.null(molids)) | |
| 373 db <- db[db[[MSDB.TAG.MOLID]] %in% molids,] | |
| 374 | |
| 375 # Filter on attributions | |
| 376 if ( ! is.null(attribs) && ! is.na(attribs)) | |
| 377 db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,] | |
| 378 | |
| 379 # Filter on columns | |
| 380 if ( ! is.null(col) && ! is.na(col)) | |
| 381 db <- db[db[[MSDB.TAG.COL]] %in% col,] | |
| 382 | |
| 383 # Filter on retention time | |
| 384 if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) | |
| 385 db <- db[db[[MSDB.TAG.COLRT]] >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ] | |
| 386 | |
| 387 # Remove retention times and column information | |
| 388 if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) { | |
| 389 db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] | |
| 390 | |
| 391 # Remove duplicates | |
| 392 db <- db[ ! duplicated(db), ] | |
| 393 } | |
| 394 | |
| 395 # Filter on mz | |
| 396 db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ] | |
| 397 | |
| 398 # Rename database fields | |
| 399 # conv <- c( mz = 'mztheo', rt = 'colrt') # solving mismatch of field names between database and output | |
| 400 # cols <- colnames(db) | |
| 401 # for (db.field in names(.self$.fields)) { | |
| 402 # output.field <- if (db.field %in% names(conv)) conv[[db.field]] else db.field | |
| 403 # if (.self$.fields[[db.field]] %in% cols && output.field %in% names(.self$.output.fields)) | |
| 404 # cols[cols %in% .self$.fields[[db.field]]] <- .self$.output.fields[[output.field]] | |
| 405 # } | |
| 406 # colnames(db) <- cols | |
| 407 | |
| 408 # Remove unwanted columns | |
| 409 # db <- db[, colnames(db) %in% .self$.output.fields] | |
| 410 | |
| 411 return(db) | |
| 412 }) | |
| 413 | |
| 414 ################# | |
| 415 # GET MZ VALUES # | |
| 416 ################# | |
| 417 | |
| 418 # Returns a numeric vector of all masses stored inside the database. | |
| 419 MsFileDb$methods( getMzValues = function(mode = NULL) { | |
| 420 | |
| 421 # Init db | |
| 422 .self$.init.db() | |
| 423 db <- .self$.db | |
| 424 | |
| 425 # Filter on mode | |
| 426 if ( ! is.null(mode) && ! is.na(mode)) { | |
| 427 mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg | |
| 428 selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag) | |
| 429 db <- db[selected.lines, ] | |
| 430 } | |
| 431 | |
| 432 # Get masses | |
| 433 mz <- .self$.get(db, col = MSDB.TAG.MZTHEO) | |
| 434 | |
| 435 # Remove duplicates | |
| 436 mz <- mz[ ! duplicated(mz)] | |
| 437 | |
| 438 return(mz) | |
| 439 }) | |
| 440 | |
| 441 ####################### | |
| 442 # GET RETENTION TIMES # | |
| 443 ####################### | |
| 444 | |
| 445 # Get the retention times of a molecule. | |
| 446 # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list. | |
| 447 MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
| 448 | |
| 449 if (is.null(molid) || is.na(molid)) | |
| 450 return(list()) | |
| 451 | |
| 452 # Init db | |
| 453 .self$.init.db() | |
| 454 db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)] | |
| 455 | |
| 456 # Filter on molecule ID | |
| 457 if ( ! is.null(molid) && ! is.na(molid)) | |
| 458 db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] | |
| 459 | |
| 460 # Remove duplicates | |
| 461 db <- db[! duplicated(db), ] | |
| 462 | |
| 463 # Build retention time list | |
| 464 rt <- list() | |
| 465 cols <- db[[MSDB.TAG.COL]] | |
| 466 cols <- cols[ ! duplicated(cols)] | |
| 467 for (col in cols) { | |
| 468 colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT] | |
| 469 rt[col] <- list(colrts) | |
| 470 } | |
| 471 | |
| 472 return(rt) | |
| 473 }) | |
| 474 | |
| 475 } # end of load safe guard |
