Mercurial > repos > prog > lcmsmatching
diff MsFileDb.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 |
parents | fb9c0409d85c |
children |
line wrap: on
line diff
--- a/MsFileDb.R Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,485 +0,0 @@ -if ( ! exists('MsFileDb')) { # Do not load again if already loaded - - library('methods') - source('MsDb.R') - source('msdb-common.R') - source('search.R', chdir = TRUE) - - ##################### - # CLASS DECLARATION # - ##################### - - MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY")) - - ############### - # CONSTRUCTOR # - ############### - - MsFileDb$methods( initialize = function(file = NA_character_, ...) { - - # Initialize members - .file <<- if ( ! is.null(file)) file else NA_character_ - .db <<- NULL - .fields <<- msdb.get.dft.db.fields() - .modes <<- MSDB.DFT.MODES - .name.to.id <<- NULL - - callSuper(...) - }) - - ################# - # SET DB FIELDS # - ################# - - MsFileDb$methods( areDbFieldsSettable = function() { - return(TRUE) - }) - - MsFileDb$methods( setDbFields = function(fields) { - .fields <<- as.list(fields) - }) - - ################ - # CHECK FIELDS # - ################ - - MsFileDb$methods( .check.fields = function(fields) { - - if (is.null(fields)) - stop("No fields specified for .check.fields()") - - # Check that fields are defined in the fields list - unknown <- fields[ ! fields %in% names(.self$.fields)] - if (length(unknown) > 0) - stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined.")) - - # Check that field values are real columns inside the database - .self$.init.db() - db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '') - unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)] - if (length(unknown.cols) > 0) - stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\".")) - }) - - ################ - # SET MS MODES # - ################ - - MsFileDb$methods( areDbMsModesSettable = function() { - return(TRUE) - }) - - MsFileDb$methods( setDbMsModes = function(modes) { - .modes <<- as.list(modes) - }) - - ########### - # INIT DB # - ########### - - MsFileDb$methods( .init.db = function() { - - if (is.null(.self$.db)) { - - # Load database - .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL, check.names = FALSE, comment.char = '') - - # Check that colnames are unique - dupcol <- duplicated(colnames(.self$.db)) - if (any(dupcol)) - stop(paste("Database header contains duplicated names: ", paste(unique(colnames(.self$.db)[dupcol]), collapse = ', '), ".")) - - # Check that columns names supplied through field map are unique - dupfields <- duplicated(.self$.fields) - if (any(dupfields)) - stop(paste("Some db column names supplied are duplicated: ", paste(unique(.self$.fields[dupfields]), collapse = ', '), ".")) - - # Rename columns - colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') - } - }) - - ############ - # GET DATA # - ############ - - MsFileDb$methods( .get = function(db = NULL, col = NULL) { - - # Init db - if (is.null(db)) { - .self$.init.db() - db <- .self$.db - } - - # Check fields - .self$.check.fields(col) - - # Get database columns -# db.cols <- unlist(.self$.fields[col]) - - return(db[, col]) - }) - - ########### - # GET ROW # - ########### - - MsFileDb$methods( .get.row = function(row, cols = NULL) { - - # Init db - .self$.init.db() - - # Check fields - if ( ! is.null(cols)) - .self$.check.fields(cols) - - if ( ! is.null(cols)) { - #cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '') - return(.self$.db[row, cols]) - } - - return(.self$.db[row, ]) - }) - - ########### - # GET COL # - ########### - - MsFileDb$methods( .get.col = function(col) { - - # Init db - .self$.init.db() - - # Check fields - .self$.check.fields(col) - - #return(.self$.db[[.self$.fields[[col]]]]) - return(.self$.db[[col]]) - }) - - #################### - # GET MOLECULE IDS # - #################### - - MsFileDb$methods( getMoleculeIds = function(max.results = NA_integer_) { - - # Init db - .self$.init.db() - - # Get IDs - mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID)) - mol.ids <- mol.ids[ ! duplicated(mol.ids)] - mol.ids <- sort(mol.ids) - - # Cut results - if ( ! is.na(max.results) && length(mol.ids) > max.results) - mol.ids <- mol.ids[1:max.results] - - return(mol.ids) - }) - - #################### - # GET NB MOLECULES # - #################### - - # Returns the number of molecules in the database. - MsFileDb$methods( getNbMolecules = function() { - - # Init db - .self$.init.db() - - # Get IDs - mol.ids <- .self$.get.col(MSDB.TAG.MOLID) - mol.ids <- mol.ids[ ! duplicated(mol.ids)] - - return(length(mol.ids)) - }) - - ##################### - # GET MOLECULE NAME # - ##################### - - MsFileDb$methods( .get.name.from.id = function(db, id) { - - if(is.na(id)) - return(NA_character_) - - # Get names - names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES] - if (length(names) == 0) - return(NA_character_) - - # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one. - name <- strsplit(names, ';')[[1]][[1]] - - return(name) - }) - - # Get molecule names - # molid An integer vector of molecule IDs. - # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector. - MsFileDb$methods( getMoleculeName = function(molid) { - - if (is.null(molid)) - return(NA_character_) - - # Init db - .self$.init.db() - - # Get database - db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] - - # Remove duplicates - db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] - - # Look for ids - names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '') - - return(names) - }) - - ################### - # INIT NAME TO ID # - ################### - - MsFileDb$methods( .init.name.to.id = function() { - - if (is.null(.self$.name.to.id)) { - - # Create data frame - .name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE) - - # Init db - .self$.init.db() - - # Get database subset (columns name and id only). - db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)] - - # Remove duplicate IDs - db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ] - - # Loop on all - for(i in seq(db[[MSDB.TAG.MOLID]])) { - i.id <- db[i, MSDB.TAG.MOLID] - i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE) - .name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE)) - } - - # Order by name - .name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ] - } - }) - - #################### - # GET ID FROM NAME # - #################### - - MsFileDb$methods( .get.id.from.name = function(name) { - - # Initialize name.to.id search tree - .self$.init.name.to.id() - - # Search for name - i <- binary.search(toupper(name), .self$.name.to.id[['name']]) - - # Get ID - id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id']) - - return(id) - }) - - ################ - # FIND BY NAME # - ################ - - # Find a molecule by name - # name A vector of molecule names to search for. - # Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order. - MsFileDb$methods( findByName = function(name) { - - if (is.null(name)) - return(NA_character_) - - # Look for molecules with this name - ids <- list() - for (n in name) - ids <- c(ids, list(.self$.get.id.from.name(n))) - - return(ids) - }) - - ############################### - # GET CHROMATOGRAPHIC COLUMNS # - ############################### - - MsFileDb$methods( getChromCol = function(molid = NULL) { - - # Init db - .self$.init.db() - - # Get database - db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)] - - # Filter on molecule IDs - if ( ! is.null(molid)) - db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] - - # Get column names - cols <- db[[MSDB.TAG.COL]] - - # Remove duplicates - cols <- cols[ ! duplicated(cols)] - - # Make data frame - cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE) - - return(cols) - }) - - ################ - # GET NB PEAKS # - ################ - - # Get the total number of MS peaks stored inside the database. - # molid The ID of the molecule. - # type The MS type. - MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { - - # Init db - .self$.init.db() - - # Get database - db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)] - - # Filter on mode - if ( ! is.null(type) && ! is.na(type)) - db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] - - # Filter on molecule IDs - if ( ! is.null(molid) && ! is.na(molid)) - db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] - - # Get mz values - mz <- db[[MSDB.TAG.MZTHEO]] - - # Count number of unique values - n <- sum(as.integer(! duplicated(mz))) - - return(n) - }) - - ########## - # SEARCH # - ########## - - MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { - - # Init db - .self$.init.db() - db <- .self$.db - - # Filter on mode - if ( ! is.null(mode) && ! is.na(mode)) - db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ] - - # Filter on molecule IDs - if ( ! is.null(molids)) - db <- db[db[[MSDB.TAG.MOLID]] %in% molids,] - - # Filter on attributions - if ( ! is.null(attribs) && ! is.na(attribs)) - db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,] - - # Filter on columns - if ( ! is.null(col) && ! is.na(col)) - db <- db[db[[MSDB.TAG.COL]] %in% col,] - - # Filter on retention time - if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) { - scale <- if (.self$getRtUnit() == MSDB.RTUNIT.MIN) 60 else 1 - db <- db[db[[MSDB.TAG.COLRT]] * scale >= rt.low & db[[MSDB.TAG.COLRT]] * scale <= rt.high, ] - } - - # Remove retention times and column information - if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) { - db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] - - # Remove duplicates - db <- db[ ! duplicated(db), ] - } - - # Filter on mz - db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ] - - return(db) - }) - - ################# - # GET MZ VALUES # - ################# - - # Returns a numeric vector of all masses stored inside the database. - MsFileDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { - - # Init db - .self$.init.db() - db <- .self$.db - - # Filter on mode - if ( ! is.null(mode) && ! is.na(mode)) { - mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg - selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag) - db <- db[selected.lines, ] - } - - # Get masses - mz <- .self$.get(db, col = MSDB.TAG.MZTHEO) - - # Remove duplicates - mz <- mz[ ! duplicated(mz)] - - # Apply cut-off - if ( ! is.na(max.results)) - mz <- mz[1:max.results] - - return(mz) - }) - - ####################### - # GET RETENTION TIMES # - ####################### - - # Get the retention times of a molecule. - # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list. - MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) { - - if (is.null(molid) || is.na(molid)) - return(list()) - - # Init db - .self$.init.db() - db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)] - - # Filter on molecule ID - if ( ! is.null(molid) && ! is.na(molid)) - db <- db[db[[MSDB.TAG.MOLID]] %in% molid,] - - # Remove duplicates - db <- db[! duplicated(db), ] - - # Build retention time list - rt <- list() - cols <- db[[MSDB.TAG.COL]] - cols <- cols[ ! duplicated(cols)] - for (col in cols) { - colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT] - rt[col] <- list(colrts) - } - - if (.self$getRtUnit() == MSDB.RTUNIT.MIN) - rt <- 60 * rt - - return(rt) - }) - -} # end of load safe guard