Mercurial > repos > prog > lcmsmatching
diff MsXlsDb.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author | prog |
---|---|
date | Tue, 12 Jul 2016 12:02:37 -0400 |
parents | |
children | 20d69a062da3 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MsXlsDb.R Tue Jul 12 12:02:37 2016 -0400 @@ -0,0 +1,842 @@ +if ( ! exists('MsXlsDb')) { # Do not load again if already loaded + + library('methods') + library('stringr') + source('msdb-common.R') + source('MsDb.R') + source('strhlp.R', chdir = TRUE) + source('dfhlp.R', chdir = TRUE) + source('search.R', chdir = TRUE) + source('excelhlp.R', chdir = TRUE) + + ############# + # CONSTANTS # + ############# + + .THIS.FILE.PATH <- getwd() # We suppose that the file has been sourced with the option chdir = TRUE + + .XLS_PEAKS_ROW_OFFSET <- 8 + .XLS_PEAKS_RT_COL_START <- 11 + .XLS_MSPOS_TAB <- 'MS_POS' + .XLS_MSNEG_TAB <- 'MS_NEG' + .XLS_MZ_COL <- 1 + .XLS_INTENSITY_COL <- 2 + .XLS_RELATIVE_COL <- 3 + .XLS_THEORETICAL_MZ_COL <- 5 + .XLS_COMPOSITION_COL <- 8 + .XLS_ATTRIBUTION_COL <- 9 + + ##################### + # CLASS DECLARATION # + ##################### + + MsXlsDb <- setRefClass("MsXlsDb", contains = "MsDb", fields = list(.mz.index = "ANY", .name_index = "ANY", .db_dir = "character", .limit = "numeric", .files = "ANY", .cache_dir = "character", .db = "ANY")) + + ############### + # CONSTRUCTOR # + ############### + + MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) { + + # Initialize members + .db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_ + .limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_ + cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir + .cache_dir <<- if ( cache || ! is.null(cache_dir)) cache_dir else NA_character_ + .files <<- NULL + .db <<- NULL + .mz.index <<- NULL + .name_index <<- NULL + + callSuper(...) + }) + + #################### + # GET MOLECULE IDS # + #################### + + MsXlsDb$methods( getMoleculeIds = function() { + + # Init file list + .self$.init.file.list() + + # Get IDs + mol.ids <- as.integer(which( ! is.na(.self$.files))) + + return(mol.ids) + }) + + #################### + # GET NB MOLECULES # + #################### + + # Returns a list of all molecule names + MsXlsDb$methods( getNbMolecules = function() { + return(length(.self$getMoleculeIds())) + }) + + ##################### + # GET MOLECULE NAME # + ##################### + + MsXlsDb$methods( getMoleculeName = function(molid) { + return(vapply(molid, function(m) .self$.get.mol.name(m), FUN.VALUE = "")) + }) + + ############################### + # GET CHROMATOGRAPHIC COLUMNS # + ############################### + + # Returns a list of all chromatographic columns used + MsXlsDb$methods( getChromCol = function(molid = NULL) { + + cn <- character() + + # If no molecule IDs provided, then look at all molecules + if (is.null(molid)) + molid <- .self$getMoleculeIds() + + # Loop on molecules + for (mid in molid) { + + rt <- .self$getRetentionTimes(mid) + + if ( ! is.null(rt)) + cn <- c(cn, names(rt)) + } + + # Remove duplicates + cn <- cn[ ! duplicated(cn)] + + # Make data frame + cn <- data.frame(id = cn, title = cn, stringsAsFactors = FALSE) + + return(cn) + }) + + ################ + # FIND BY NAME # + ################ + + MsXlsDb$methods( findByName = function(name) { + + # NULL entry + if (is.null(name)) + return(NA_integer_) + + # Initialize output list + ids <- NULL + + for (n in name) { + + id <- NA_integer_ + + if ( ! is.na(n)) { + + # Get index + index <- .self$.get.name.index() + + # Search for name in index + i <- binary.search(toupper(n), index[['name']]) + + id <- if (is.na(i)) NA_integer_ else index[i, 'id'] + } + + ids <- c(ids, id) + } + + return(ids) + }) + + ####################### + # GET RETENTION TIMES # + ####################### + + MsXlsDb$methods( getRetentionTimes = function(molid, col = NA_character_) { + + if (is.null(molid) || is.na(molid)) + return(NULL) + + # Find it in memory + rt <- .self$.mem.get(molid, 'rt') + + if (is.null(rt)) { + + # Call observers + if ( ! is.null(.self$.observers)) + for (obs in .self$.observers) + obs$progress(paste0("Loading retention times of file", .self$.get.file(molid), "."), level = 2) + + rt <- NULL + + # Load from cache file + cache_file <- NA_character_ + if ( ! is.na(.self$.get.cache.dir())) { + cache_file <- file.path(.self$.get.cache.dir(), paste0('rt-', molid, '.bin')) + if (file.exists(cache_file)) + load(file = cache_file) # load rt + } + + if (is.null(rt)) { + + # Get retention times of both positive and negative mode tabs + mspos_rt <- .self$.parse_retention_times(molid, .XLS_MSPOS_TAB) + msneg_rt <- .self$.parse_retention_times(molid, .XLS_MSNEG_TAB) + + # Retention times stored in negative and positive modes + if ( ! is.null(mspos_rt) && ! is.null(msneg_rt)) { + + # Warn observers when both retention time lists are not identical + if ( ! identical(mspos_rt, msneg_rt)) + for (obs in .self$.observers) + obs$warning(paste0("Retention times in negative and positive modes are different in file ", .self$.get.file(molid), ".")) + + # Merge both lists + rt <- mspos_rt + for (c in names(msneg_rt)) + if (c %in% names(rt)) { + v <- c(rt[[c]], msneg_rt[[c]]) + rt[[c]] <- v[ ! duplicated(v)] + } + else + rt[[c]] <- msneg_rt[[c]] + } + else + # Set retention times + rt <- if (is.null(mspos_rt)) msneg_rt else mspos_rt + + if (is.null(rt)) rt <- list() + + # Write in cache + if ( ! is.na(cache_file)) { + + # Call observers + if ( ! is.null(.self$.observers)) + for (obs in .self$.observers) + obs$progress(paste0("Caching retention times of file ", .self$.get.file(molid), ".")) + + save(rt, file = cache_file) + } + } + + # Store in memory + .self$.mem.set(rt, molid, 'rt') + } + + # Select only one column if asked + if ( ! is.na(col)) rt <- rt[[col]] + + return(rt) + }) + + ################# + # GET NB PEAKS # + ################# + + MsXlsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { + + # Initialize parameters + if (is.null(molid) || (length(molid) == 1 && is.na(molid))) + molid <- .self$getMoleculeIds() + if (is.na(type)) + type <- c(MSDB.TAG.POS, MSDB.TAG.NEG) + + return(sum(vapply(molid, function(m) { if (is.na(m)) 0 else sum(vapply(type, function(t) { peaks <- .self$.get.peaks(m, t) ; if (is.null(peaks)) 0 else nrow(peaks) }, FUN.VALUE = 1)) }, FUN.VALUE = 1))) + }) + + ################## + # GET PEAK TABLE # + ################## + + MsXlsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) { + + peaks <- NULL + + # Set default molecule IDs + if (is.null(molid) || (length(molid) == 1 && is.na(molid))) + molid <- .self$getMoleculeIds() + + # Set default modes + if (is.null(mode) || (length(mode) == 1 && is.na(mode))) + mode <- c(MSDB.TAG.POS, MSDB.TAG.NEG) + + # Loop on all molecules + for (mol in molid) { + + # Loop on all modes + for (mod in mode) { + m.peaks <- .self$.get.peaks(mol, mod) + if ( ! is.null(m.peaks) && nrow(m.peaks) > 0) { + m.peaks[[MSDB.TAG.MOLID]] <- mol + m.peaks[[MSDB.TAG.MODE]] <- mod + peaks <- if (is.null(peaks)) m.peaks else rbind(peaks, m.peaks) + peaks <- df.move.col.first(peaks, c(MSDB.TAG.MOLID, MSDB.TAG.MODE)) + } + } + } + + return(peaks) + }) + + ################# + # GET MZ VALUES # + ################# + + # Returns a numeric vector of all masses stored inside the database. + MsXlsDb$methods( getMzValues = function(mode = NULL) { + + mz <- numeric() + + # Get all mz values of all molecules + for(molid in .self$getMoleculeIds()) + for (m in (if (is.null(mode) || is.na(mode)) c(MSDB.TAG.POS, MSDB.TAG.NEG) else mode)) + mz <- c(mz, .self$.get.peaks(molid, m)[[MSDB.TAG.MZTHEO]]) + + # Remove duplicated + mz <- mz[ ! duplicated(mz)] + + return(mz) + }) + + ############# + # GET PEAKS # + ############# + + MsXlsDb$methods( .get.peaks = function(molid, mode) { + + tab <- if (mode == MSDB.TAG.POS) .XLS_MSPOS_TAB else .XLS_MSNEG_TAB + + # Find it in memory + peak_df <- .self$.mem.get(molid, 'peaks', mode) + + if (is.null(peak_df)) { + # Call observers + if ( ! is.null(.self$.observers)) + for (obs in .self$.observers) + obs$progress(paste0("Loading peaks of tab ", tab, " of file ", .self$.get.file(molid), "."), level = 2) + + peak_df <- NULL + + # Load from cache file + cache_file <- NA_character_ + if ( ! is.na(.self$.get.cache.dir())) { + cache_file <- file.path(.self$.get.cache.dir(), paste0('peaks-', molid, '-', tab, '.csv')) + if (file.exists(cache_file)) + peak_df <- read.csv(cache_file, header = TRUE, stringsAsFactors = FALSE) + } + + # Read from XLS file, if not in cache + if (is.null(peak_df)) { + + # Load tab (peaks start at row 8) + if (.self$.tab.exists(.self$.get.file(molid), tab)) { + + peaks <- read.excel(.self$.get.file(molid), tab, start.row = .XLS_PEAKS_ROW_OFFSET, stringsAsFactors = FALSE) + if ( ! is.null(peaks)) + peaks <- peaks[ ! is.na(peaks[.XLS_MZ_COL]), , drop = FALSE] # Remove rows where m/z is not defined. TODO maybe call observer for notify a line with non NA values but without m/z value. + + # Instantiate peaks + if ( ! is.null(peaks) && nrow(peaks) > 0) { + peak_df <- peaks[1:length(peaks[[.XLS_MZ_COL]]), c(.XLS_MZ_COL, .XLS_THEORETICAL_MZ_COL, .XLS_INTENSITY_COL, .XLS_RELATIVE_COL, .XLS_COMPOSITION_COL, .XLS_ATTRIBUTION_COL), drop = FALSE] + colnames(peak_df) <- c(MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.COMP, MSDB.TAG.ATTR) + } + + # Set default data frame (important for cache file writing, because we need a correct header to be written in order for loading) + else { + peak_df <- data.frame(stringsAsFactors = FALSE) + peak_df[MSDB.TAG.MZEXP] <- numeric() + peak_df[MSDB.TAG.MZTHEO] <- numeric() + peak_df[MSDB.TAG.INT] <- numeric() + peak_df[MSDB.TAG.REL] <- numeric() + peak_df[MSDB.TAG.COMP] <- character() + peak_df[MSDB.TAG.ATTR] <- character() + } + + if (is.null(peak_df)) peak_df <- data.frame() + + # Write in cache + if ( ! is.na(cache_file)) { + + # Call observers + if ( ! is.null(.self$.observers)) + for (obs in .self$.observers) + obs$progress(paste0("Caching peaks of tab ", tab, " of file ", .self$.get.file(molid), ".")) + + write.csv(peak_df, cache_file, row.names = FALSE) + } + } + } + + # Store in memory + .self$.mem.set(peak_df, molid, 'peaks', mode) + } + + return(peak_df) + }) + + ############################## + # GET FULL MS PEAK M/Z INDEX # + ############################## + + # Get mz index for full ions, creating it if necessary. + MsXlsDb$methods( .get.mz.index = function(mode) { + + if (is.null(.self$.mz.index[[mode]])) { + + # Initialize data frame + mzi <- data.frame(stringsAsFactors = FALSE) + mzi[MSDB.TAG.MZTHEO] <- numeric() + mzi[MSDB.TAG.MOLID] <- character() + mzi[MSDB.TAG.COMP] <- character() + mzi[MSDB.TAG.ATTR] <- character() + + # Loop on all molecules + for(molid in .self$getMoleculeIds()) { + + # Get all peaks of this molecule + peaks <- .self$.get.peaks(molid, mode) + + # Remove rows whose mz is NA. + peaks <- peaks[ ! is.na(peaks[[MSDB.TAG.MZTHEO]]), ] + + if (nrow(peaks) > 0) { + + # Add id column + peaks[MSDB.TAG.MOLID] <- molid + + # Append peaks + r <- nrow(mzi) + 1 + rows <- r:(r+nrow(peaks)-1) + mzi[rows, ] <- peaks[colnames(mzi)] + } + } + + # Sort by M/Z + sorted_indices <- order(mzi[[MSDB.TAG.MZTHEO]]) + + # Group in a data frame + .self$.mz.index[[mode]] <- mzi[sorted_indices, ] + } + + return(.self$.mz.index[[mode]]) + }) + + ###################### + # SEARCH FOR MZ & RT # + ###################### + + MsXlsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { + + # Search for m/z + results <- .self$.do.search.for.mz(mode, mz.low, mz.high) + + # Filter on attributions + if ( ! is.null(attribs)) { + results <- results[results[[MSDB.TAG.ATTR]] %in% attribs, ] + } + + # Filer on molecule IDs + if ( ! is.null(molids)) { + results <- results[results[[MSDB.TAG.MOLID]] %in% molids, ] + } + + # Use retention time + if ( ! is.null(col) && ! is.null(rt.low) && ! is.null(rt.high)) { + + # Get list of unique IDs + ids <- results[[MSDB.TAG.MOLID]] + ids <- ids[ ! duplicated(ids)] + rt <- .self$.search.for.rt(mols = ids, rt.low = rt.low, rt.high = rt.high, col = col) + results <- results[results[[MSDB.TAG.MOLID]] %in% rt[[MSDB.TAG.MOLID]], ] + results <- merge(results, rt) + } + + return(results) + }) + + ############################## + # SEARCH FOR M/Z IN MS PEAKS # + ############################## + + MsXlsDb$methods( .do.search.for.mz = function(mode, mz.low, mz.high) { + + results <- data.frame(stringsAsFactors = FALSE) + results[MSDB.TAG.MZTHEO] <- numeric() + results[MSDB.TAG.MOLID] <- character() + results[MSDB.TAG.MOLNAMES] <- character() + results[MSDB.TAG.COMP] <- character() + results[MSDB.TAG.ATTR] <- character() + + # Create m/z index + mz_index <- .self$.get.mz.index(mode) + + # Find molecules + low_bound <- binary.search(mz.low, mz_index[[MSDB.TAG.MZTHEO]], lower = FALSE) + high_bound <- binary.search(mz.high, mz_index[[MSDB.TAG.MZTHEO]], lower = TRUE) + + # Get results + if ( ! is.na(high_bound) && ! is.na(low_bound) && low_bound <= high_bound) + results <- mz_index[low_bound:high_bound,] + + # Remove row names + rownames(results) <- NULL + + return(results) + }) + + ################ + # GET MOL NAME # + ################ + + MsXlsDb$methods( .get.mol.name = function(molid) { + + if (is.na(molid)) + return(NA_character_) + + # Find it in memory + name <- .self$.mem.get(molid, 'name') + + if (is.null(name)) { + + # Load molecule + mol <- .self$.load.molecule(molid) + + # Look for name in tabs + for (tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { + hdr <- mol[[tab]][['header']] + if ( ! is.null(hdr)) + name <- hdr[[1]] + if ( ! is.null(name) && ! is.na(name)) break + } + + # Store in memory + if (is.null(name)) name <- NA_character_ + .self$.mem.set(name, molid, 'name') + } + + return(name) + }) + + ################## + # GET NAME INDEX # + ################## + + # Get name index. + MsXlsDb$methods( .get.name.index = function() { + + if (is.null(.self$.name_index)) { + + # Get names + names <- vapply(.self$getMoleculeIds(), function(id) toupper(.self$getMoleculeName(id)), FUN.VALUE = "") + + # Get molecule IDs + id <- .self$getMoleculeIds() + + # Sort by names + sorted_indices <- order(names) + + # Group in a data frame + .self$.name_index <- data.frame(name = rbind(names)[, sorted_indices], + id = rbind(id)[, sorted_indices], + stringsAsFactors = FALSE) + } + + return(.self$.name_index) + }) + + ################## + # INIT FILE LIST # + ################## + + MsXlsDb$methods( .init.file.list = function() { + + if (is.null(.self$.files)) { + + # List all files + files <- Sys.glob(file.path(.self$.db_dir, '*.xls')) + + # Limit the size of the database + if ( ! is.na(.self$.limit)) + files <- head(files, .self$.limit) + + # Get IDs + ids <- vapply(files, function(f) .extract_molecule_id_from_filename(f), FUN.VALUE = 1) + + # Use ids as indices to build the vector of files + .files <<- rep(NA_character_, max(ids)) + .files[ids] <<- files + } + }) + + ################# + # GET CACHE DIR # + ################# + + MsXlsDb$methods( .get.cache.dir = function() { + + if ( ! is.na(.self$.cache_dir) && ! file.exists(.self$.cache_dir)) + dir.create(.self$.cache_dir) + + return(.self$.cache_dir) + }) + + ################# + # LOAD MOLECULE # + ################# + + MsXlsDb$methods( .load.molecule = function(molid) { + + # Init local variables + mol <- NULL + cache_file <- NA_character_ + excel_file <- .self$.get.file(molid) + + # Call observers + if ( ! is.null(.self$.observers)) + for (obs in .self$.observers) + obs$progress(paste0("Loading molecule ", molid, "."), level = 2) + + # Load from cache + if ( ! is.na(.self$.get.cache.dir())) { + cache_file <- file.path(.self$.get.cache.dir(), paste0(molid, '.bin')) + if (file.exists(cache_file)) + load(file = cache_file) # load mol variable + } + + # Load from Excel file & write to cache + if (is.null(mol) && ! is.na(excel_file)) { + + source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. + + # Load from Excel file + for(tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { + + # Test that tab exists + if (.self$.tab.exists(excel_file, tab)) { + header <- read.excel(excel_file, tab, start.row = 1, end.row = .XLS_PEAKS_ROW_OFFSET - 1, header = FALSE, stringsAsFactors = FALSE, trim.values = TRUE, col.index = c(1))[[1]] + peaks <- read.excel(excel_file, tab, start.row = .XLS_PEAKS_ROW_OFFSET) + mol[[tab]] <- list(header = header, peaks = peaks) + } + + # Missing tab + else { + for (obs in .self$.observers) + obs$warning(paste0("No excel tab ", tab, " in file ", excel_file, ".")) + } + } + + # Write in cache + if ( ! is.na(cache_file)) { + + # Call observers + if ( ! is.null(.self$.observers)) + for (obs in .self$.observers) + obs$progress(paste0("Caching file ", excel_file, ".")) + + save(mol, file = cache_file) + } + } + + return(mol) + }) + + ######################## + # DOES EXCEL TAB EXIST # + ######################## + + MsXlsDb$methods( .tab.exists = function(file, tab) { + + source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. + + if ( ! tab.exists(file, tab)) { + + # Warn observers + for (obs in .self$.observers) + obs$warning(paste0("No excel tab ", tab, " in file ", file, ".")) + + return(FALSE) + } + + return(TRUE) + }) + + ######################### + # PARSE RETENTION TIMES # + ######################### + + MsXlsDb$methods( .parse_retention_times = function(id, tab) { + + rt <- NULL + + if (.self$.tab.exists(.self$.get.file(id), tab)) { + peaks <- read.excel(.self$.get.file(id), tab, start.row = .XLS_PEAKS_ROW_OFFSET) + + # Get retention times + if ( ! is.null(peaks) && length(peaks) > 0 && ! is.na(peaks[[1]][[1]])) + for (c in .XLS_PEAKS_RT_COL_START:length(names(peaks))) + if ( ! is.na(peaks[[c]][[1]])) { + + # Check retention times of all different m/z peaks for the same column. + .self$.check_retention_times(id, tab, names(peaks)[[c]], peaks[[c]], sum( ! is.na(peaks[[1]]))) + + # Add retention time + # TODO The column names are transformed through the read.xlsx call. For instance: + # HPLC (C18) 25mn QTOF (Bis) --> HPLC..C18..25mn.QTOF..Bis. + # ZICpHILIC 150*5*2.1 Shimadzu-Exactive-42mn --> ZICpHILIC.150.5.2.1.Shimadzu.Exactive.42mn + # This can be an issue, since we loose the formating. + col_id <- names(peaks)[[c]] + time <- peaks[[c]][[1]] * 60 # Read and convert retention time in seconds. + if (is.null(rt) || ! col_id %in% names(rt)) + rt[[col_id]] <- list(time) + else + rt[[col_id]] <- c(rt[[col_id]], time) + } + } + + return(rt) + }) + + ######################### + # CHECK RETENTION TIMES # + ######################### + + MsXlsDb$methods( .check_retention_times = function(id, tab_name, column_name, rt, n) { + + if (n >= 1 && ! is.null(.self$.observers) && length(.self$.observers) > 0) + + # Check column only if there is at least one value inside + if (sum( ! is.na(rt)) > 0) + + # Loop on all values + for(i in 1:n) { + + # Check that it's defined + if (i > 1 && is.na(rt[[i]])) + for (obs in .self$.observers) + obs$warning(paste0("Retention times undefined for column ", column_name, " at row ", i + .XLS_PEAKS_ROW_OFFSET, " of tab ", tab_name, " in file ", .self$.get.file(id), ".")) + + else if (i > 1) + # Check the value (it must be constant) + if (rt[[i-1]] != rt[[i]]) + for (obs in .self$.observers) + obs$error(paste0("Retention times not constant for column ", column_name, " between row ", i - 1 + .XLS_PEAKS_ROW_OFFSET, " and row ", i + .XLS_PEAKS_ROW_OFFSET, "o tab", tab_name, "in file", .self$.get.file(id))) + } + }) + + #################### + # GET FILE FROM ID # + #################### + + MsXlsDb$methods( .get.file = function(id) { + + # List files + .self$.init.file.list() + + return( if (id > 0 && id <= length(.self$.files)) .self$.files[id] else NA_character_) + }) + + ########### + # MEM GET # + ########### + + # Get database data from memory + MsXlsDb$methods( .mem.get = function(molid, field, second.field = NA_character_) { + + data <- .self$.db[[as.character(molid)]][[field]] + + if ( ! is.na(second.field)) + data <- data[[second.field]] + + return(data) + }) + + ########### + # MEM SET # + ########### + + # Set database data into memory + MsXlsDb$methods( .mem.set = function(data, molid, field, second.field = NA_character_) { + + id <- as.character(molid) + + # Create db + if (is.null(.self$.db)) + .db <<- list() + + # Create first level + if (is.null(.self$.db[[id]])) + .self$.db[[id]] <- list() + + # Create second level + if ( ! is.na(second.field) && is.null(.self$.db[[id]][[field]])) + .self$.db[[id]][[field]] <- list() + + # Store data + if (is.na(second.field)) { + .self$.db[[id]][[field]] <- data + } else { + .self$.db[[id]][[field]][[second.field]] <- data + } + }) + + ################# + # SEARCH FOR RT # + ################# + + # Find molecules matching a certain retention time. + # col A list of chromatographic columns to use. + # rt.low The lower bound of the rt value. + # rt.high The higher bound of the rt value. + # mols A list of molecule IDs to process. If unset, then take all molecules. + # Return a data frame with the following columns: id, col, colrt. + MsXlsDb$methods( .search.for.rt = function(col, rt.low, rt.high, mols = NULL) { + + # Use all molecules if no list is provided + if (is.null(mols)) + mols <- .self$getMoleculeIds() + + results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE) + + # Loop on all molecules + for (molid in mols) { + no.col <- TRUE + for (c in col) { + molrts <- .self$getRetentionTimes(molid, c) + if ( ! is.null(molrts)) { + no.col <- FALSE + for (molrt in molrts) { + if (molrt >= rt.low && molrt <= rt.high) { + r <- nrow(results) + 1 + results[r, ] <- c(id = molid, col = c, colrt = molrt) + } + } + } + } + + if (no.col) { + r <- nrow(results) + 1 + results[r, c(MSDB.TAG.MOLID)] <- c(id = molid) + } + } + + return(results) + }) + + ############################ + # EXTRACT ID FROM FILENAME # + ############################ + + .extract_molecule_id_from_filename <- function(filename) { + + id <- NA_integer_ + + if ( ! is.na(filename)) { + g <- str_match(filename, "N(\\d+)[._-]") + if ( ! is.na(g[1,1])) + id <- as.numeric(g[1,2]) + } + + return(id) + } + +} # end of load safe guard