Mercurial > repos > prog > lcmsmatching
diff MsFileDb.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author | prog |
---|---|
date | Tue, 12 Jul 2016 12:02:37 -0400 |
parents | |
children | 20d69a062da3 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MsFileDb.R Tue Jul 12 12:02:37 2016 -0400 @@ -0,0 +1,475 @@ +if ( ! exists('MsFileDb')) { # Do not load again if already loaded + + library('methods') + source('MsDb.R') + source('msdb-common.R') + source('search.R', chdir = TRUE) + + ##################### + # CLASS DECLARATION # + ##################### + + MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY")) + + ############### + # CONSTRUCTOR # + ############### + + MsFileDb$methods( initialize = function(file = NA_character_, ...) { + + # Initialize members + .file <<- if ( ! is.null(file)) file else NA_character_ + .db <<- NULL + .fields <<- msdb.get.dft.db.fields() + .modes <<- MSDB.DFT.MODES + .name.to.id <<- NULL + + callSuper(...) + }) + + ################# + # SET DB FIELDS # + ################# + + MsFileDb$methods( areDbFieldsSettable = function() { + return(TRUE) + }) + + MsFileDb$methods( setDbFields = function(fields) { + .fields <<- as.list(fields) + }) + + ################ + # CHECK FIELDS # + ################ + + MsFileDb$methods( .check.fields = function(fields) { + + if (is.null(fields)) + stop("No fields specified for .check.fields()") + + # Check that fields are defined in the fields list + unknown <- fields[ ! fields %in% names(.self$.fields)] + if (length(unknown) > 0) + stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined.")) + + # Check that field values are real columns inside the database + .self$.init.db() + db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '') + unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)] + if (length(unknown.cols) > 0) + stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\".")) + }) + + ################ + # SET MS MODES # + ################ + + MsFileDb$methods( areDbMsModesSettable = function() { + return(TRUE) + }) + + MsFileDb$methods( setDbMsModes = function(modes) { + .modes <<- as.list(modes) + }) + + ########### + # INIT DB # + ########### + + MsFileDb$methods( .init.db = function() { + + if (is.null(.self$.db)) { + + # Load database + .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL) + + # Rename columns + colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') + } + }) + + ############ + # GET DATA # + ############ + + MsFileDb$methods( .get = function(db = NULL, col = NULL) { + + # Init db + if (is.null(db)) { + .self$.init.db() + db <- .self$.db + } + + # Check fields + .self$.check.fields(col) + + # Get database columns +# db.cols <- unlist(.self$.fields[col]) + + return(db[, col]) + }) + + ########### + # GET ROW # + ########### + + MsFileDb$methods( .get.row = function(row, cols = NULL) { + + # Init db + .self$.init.db() + + # Check fields + if ( ! is.null(cols)) + .self$.check.fields(cols) + + if ( ! is.null(cols)) { + #cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '') + return(.self$.db[row, cols]) + } + + return(.self$.db[row, ]) + }) + + ########### + # GET COL # + ########### + + MsFileDb$methods( .get.col = function(col) { + + # Init db + .self$.init.db() + + # Check fields + .self$.check.fields(col) + + #return(.self$.db[[.self$.fields[[col]]]]) + return(.self$.db[[col]]) + }) + + #################### + # GET MOLECULE IDS # + #################### + + MsFileDb$methods( getMoleculeIds = function() { + + # Init db + .self$.init.db() + + # Get IDs + mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID)) + mol.ids <- mol.ids[ ! duplicated(mol.ids)] + mol.ids <- sort(mol.ids) + + return(mol.ids) + }) + + #################### + # GET NB MOLECULES # + #################### + + # Returns the number of molecules in the database. + MsFileDb$methods( getNbMolecules = function() { + + # Init db + .self$.init.db() + + # Get IDs + mol.ids <- .self$.get.col(MSDB.TAG.MOLID) + mol.ids <- mol.ids[ ! duplicated(mol.ids)] + + return(length(mol.ids)) + }) + + ##################### + # GET MOLECULE NAME # + ##################### + + MsFileDb$methods( .get.name.from.id = function(db, id) { + + if(is.na(id)) + return(NA_character_) + + # Get names + names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES] + if (length(names) == 0) + return(NA_character_) + + # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one. + name <- strsplit(names, ';')[[1]][[1]] + + return(name) + }) + + # Get molecule names + # molid An integer vector of molecule IDs. + # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector. + MsFileDb$methods( getMoleculeName = function(molid) { + + if (is.null(molid)) + return(NA_character_) + + # Init db + .self$.init.db() + + # Get database + db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] + + # Remove duplicates + db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] + + # Look for ids + names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '') + + return(names) + }) + + ################### + # INIT NAME TO ID # + ################### + + MsFileDb$methods( .init.name.to.id = function() { + + if (is.null(.self$.name.to.id)) { + + # Create data frame + .name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE) + + # Init db + .self$.init.db() + + # Get database subset (columns name and id only). + db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] + + # Remove duplicate IDs + db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] + + # Loop on all + for(i in seq(db[[MSDB.TAG.MOLID]])) { + i.id <- db[i, MSDB.TAG.MOLID] + i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE) + .name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE)) + } + + # Order by name + .name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ] + } + }) + + #################### + # GET ID FROM NAME # + #################### + + MsFileDb$methods( .get.id.from.name = function(name) { + + # Initialize name.to.id search tree + .self$.init.name.to.id() + + # Search for name + i <- binary.search(toupper(name), .self$.name.to.id[['name']]) + + # Get ID + id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id']) + + return(id) + }) + + ################ + # FIND BY NAME # + ################ + + # Find a molecule by name + # name A vector of molecule names to search for. + # Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order. + MsFileDb$methods( findByName = function(name) { + + if (is.null(name)) + return(NA_character_) + + # Look for molecules with this name + ids <- list() + for (n in name) + ids <- c(ids, list(.self$.get.id.from.name(n))) + + return(ids) + }) + + ############################### + # GET CHROMATOGRAPHIC COLUMNS # + ############################### + + MsFileDb$methods( getChromCol = function(molid = NULL) { + + # Init db + .self$.init.db() + + # Get database + db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)] + + # Filter on molecule IDs + if ( ! is.null(molid)) + db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] + + # Get column names + cols <- db[[MSDB.TAG.COL]] + + # Remove duplicates + cols <- cols[ ! duplicated(cols)] + + # Make data frame + cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE) + + return(cols) + }) + + ################ + # GET NB PEAKS # + ################ + + # Get the total number of MS peaks stored inside the database. + # molid The ID of the molecule. + # type The MS type. + MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { + + # Init db + .self$.init.db() + + # Get database + db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)] + + # Filter on mode + if ( ! is.null(type) && ! is.na(type)) + db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] + + # Filter on molecule IDs + if ( ! is.null(molid) && ! is.na(molid)) + db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] + + # Get mz values + mz <- db[[MSDB.TAG.MZTHEO]] + + # Count number of unique values + n <- sum(as.integer(! duplicated(mz))) + + return(n) + }) + + ########## + # SEARCH # + ########## + + MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { + + # Init db + .self$.init.db() + db <- .self$.db + + # Filter on mode + if ( ! is.null(mode) && ! is.na(mode)) + db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] + + # Filter on molecule IDs + if ( ! is.null(molids)) + db <- db[db[[MSDB.TAG.MOLID]] %in% molids,] + + # Filter on attributions + if ( ! is.null(attribs) && ! is.na(attribs)) + db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,] + + # Filter on columns + if ( ! is.null(col) && ! is.na(col)) + db <- db[db[[MSDB.TAG.COL]] %in% col,] + + # Filter on retention time + if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) + db <- db[db[[MSDB.TAG.COLRT]] >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ] + + # Remove retention times and column information + if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) { + db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] + + # Remove duplicates + db <- db[ ! duplicated(db), ] + } + + # Filter on mz + db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ] + + # Rename database fields +# conv <- c( mz = 'mztheo', rt = 'colrt') # solving mismatch of field names between database and output +# cols <- colnames(db) +# for (db.field in names(.self$.fields)) { +# output.field <- if (db.field %in% names(conv)) conv[[db.field]] else db.field +# if (.self$.fields[[db.field]] %in% cols && output.field %in% names(.self$.output.fields)) +# cols[cols %in% .self$.fields[[db.field]]] <- .self$.output.fields[[output.field]] +# } +# colnames(db) <- cols + + # Remove unwanted columns +# db <- db[, colnames(db) %in% .self$.output.fields] + + return(db) + }) + + ################# + # GET MZ VALUES # + ################# + + # Returns a numeric vector of all masses stored inside the database. + MsFileDb$methods( getMzValues = function(mode = NULL) { + + # Init db + .self$.init.db() + db <- .self$.db + + # Filter on mode + if ( ! is.null(mode) && ! is.na(mode)) { + mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg + selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag) + db <- db[selected.lines, ] + } + + # Get masses + mz <- .self$.get(db, col = MSDB.TAG.MZTHEO) + + # Remove duplicates + mz <- mz[ ! duplicated(mz)] + + return(mz) + }) + + ####################### + # GET RETENTION TIMES # + ####################### + + # Get the retention times of a molecule. + # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list. + MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) { + + if (is.null(molid) || is.na(molid)) + return(list()) + + # Init db + .self$.init.db() + db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)] + + # Filter on molecule ID + if ( ! is.null(molid) && ! is.na(molid)) + db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] + + # Remove duplicates + db <- db[! duplicated(db), ] + + # Build retention time list + rt <- list() + cols <- db[[MSDB.TAG.COL]] + cols <- cols[ ! duplicated(cols)] + for (col in cols) { + colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT] + rt[col] <- list(colrts) + } + + return(rt) + }) + +} # end of load safe guard