Mercurial > repos > prog > lcmsmatching
view MsXlsDb.R @ 2:20d69a062da3 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
| author | prog | 
|---|---|
| date | Thu, 02 Mar 2017 08:55:00 -0500 | 
| parents | e66bb061af06 | 
| children | fb9c0409d85c | 
line wrap: on
 line source
if ( ! exists('MsXlsDb')) { # Do not load again if already loaded library('methods') library('stringr') source('msdb-common.R') source('MsDb.R') source('strhlp.R', chdir = TRUE) source('dfhlp.R', chdir = TRUE) source('search.R', chdir = TRUE) source('excelhlp.R', chdir = TRUE) ############# # CONSTANTS # ############# .THIS.FILE.PATH <- getwd() # We suppose that the file has been sourced with the option chdir = TRUE .XLS_PEAKS_ROW_OFFSET <- 8 .XLS_PEAKS_RT_COL_START <- 11 .XLS_MSPOS_TAB <- 'MS_POS' .XLS_MSNEG_TAB <- 'MS_NEG' .XLS_MZ_COL <- 1 .XLS_INTENSITY_COL <- 2 .XLS_RELATIVE_COL <- 3 .XLS_THEORETICAL_MZ_COL <- 5 .XLS_COMPOSITION_COL <- 8 .XLS_ATTRIBUTION_COL <- 9 ##################### # CLASS DECLARATION # ##################### MsXlsDb <- setRefClass("MsXlsDb", contains = "MsDb", fields = list(.mz.index = "ANY", .name_index = "ANY", .db_dir = "character", .limit = "numeric", .files = "ANY", .cache_dir = "character", .db = "ANY")) ############### # CONSTRUCTOR # ############### MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) { # Initialize members # TODO check that db_dir is not null neither na, and tests that it exists and is a directory. .db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_ .limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_ cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir .cache_dir <<- if ( cache || ! is.null(cache_dir)) cache_dir else NA_character_ .files <<- NULL .db <<- NULL .mz.index <<- NULL .name_index <<- NULL callSuper(...) }) #################### # GET MOLECULE IDS # #################### MsXlsDb$methods( getMoleculeIds = function() { # Init file list .self$.init.file.list() # Get IDs mol.ids <- as.integer(which( ! is.na(.self$.files))) return(mol.ids) }) #################### # GET NB MOLECULES # #################### # Returns a list of all molecule names MsXlsDb$methods( getNbMolecules = function() { return(length(.self$getMoleculeIds())) }) ##################### # GET MOLECULE NAME # ##################### MsXlsDb$methods( getMoleculeName = function(molid) { return(vapply(molid, function(m) .self$.get.mol.name(m), FUN.VALUE = "")) }) ############################### # GET CHROMATOGRAPHIC COLUMNS # ############################### # Returns a list of all chromatographic columns used MsXlsDb$methods( getChromCol = function(molid = NULL) { cn <- character() # If no molecule IDs provided, then look at all molecules if (is.null(molid)) molid <- .self$getMoleculeIds() # Loop on molecules for (mid in molid) { rt <- .self$getRetentionTimes(mid) if ( ! is.null(rt)) cn <- c(cn, names(rt)) } # Remove duplicates cn <- cn[ ! duplicated(cn)] # Make data frame cn <- data.frame(id = cn, title = cn, stringsAsFactors = FALSE) return(cn) }) ################ # FIND BY NAME # ################ MsXlsDb$methods( findByName = function(name) { # NULL entry if (is.null(name)) return(NA_integer_) # Initialize output list ids <- NULL for (n in name) { id <- NA_integer_ if ( ! is.na(n)) { # Get index index <- .self$.get.name.index() # Search for name in index i <- binary.search(toupper(n), index[['name']]) id <- if (is.na(i)) NA_integer_ else index[i, 'id'] } ids <- c(ids, id) } return(ids) }) ####################### # GET RETENTION TIMES # ####################### MsXlsDb$methods( getRetentionTimes = function(molid, col = NA_character_) { if (is.null(molid) || is.na(molid)) return(NULL) # Find it in memory rt <- .self$.mem.get(molid, 'rt') if (is.null(rt)) { # Call observers if ( ! is.null(.self$.observers)) for (obs in .self$.observers) obs$progress(paste0("Loading retention times of file", .self$.get.file(molid), "."), level = 2) rt <- NULL # Load from cache file cache_file <- NA_character_ if ( ! is.na(.self$.get.cache.dir())) { cache_file <- file.path(.self$.get.cache.dir(), paste0('rt-', molid, '.bin')) if (file.exists(cache_file)) load(file = cache_file) # load rt } if (is.null(rt)) { # Get retention times of both positive and negative mode tabs mspos_rt <- .self$.parse_retention_times(molid, .XLS_MSPOS_TAB) msneg_rt <- .self$.parse_retention_times(molid, .XLS_MSNEG_TAB) # Retention times stored in negative and positive modes if ( ! is.null(mspos_rt) && ! is.null(msneg_rt)) { # Warn observers when both retention time lists are not identical if ( ! identical(mspos_rt, msneg_rt)) for (obs in .self$.observers) obs$warning(paste0("Retention times in negative and positive modes are different in file ", .self$.get.file(molid), ".")) # Merge both lists rt <- mspos_rt for (c in names(msneg_rt)) if (c %in% names(rt)) { v <- c(rt[[c]], msneg_rt[[c]]) rt[[c]] <- v[ ! duplicated(v)] } else rt[[c]] <- msneg_rt[[c]] } else # Set retention times rt <- if (is.null(mspos_rt)) msneg_rt else mspos_rt if (is.null(rt)) rt <- list() # Write in cache if ( ! is.na(cache_file)) { # Call observers if ( ! is.null(.self$.observers)) for (obs in .self$.observers) obs$progress(paste0("Caching retention times of file ", .self$.get.file(molid), ".")) save(rt, file = cache_file) } } # Store in memory .self$.mem.set(rt, molid, 'rt') } # Select only one column if asked if ( ! is.na(col)) rt <- rt[[col]] return(rt) }) ################# # GET NB PEAKS # ################# MsXlsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { # Initialize parameters if (is.null(molid) || (length(molid) == 1 && is.na(molid))) molid <- .self$getMoleculeIds() if (is.na(type)) type <- c(MSDB.TAG.POS, MSDB.TAG.NEG) return(sum(vapply(molid, function(m) { if (is.na(m)) 0 else sum(vapply(type, function(t) { peaks <- .self$.get.peaks(m, t) ; if (is.null(peaks)) 0 else nrow(peaks) }, FUN.VALUE = 1)) }, FUN.VALUE = 1))) }) ################## # GET PEAK TABLE # ################## MsXlsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) { peaks <- NULL # Set default molecule IDs if (is.null(molid) || (length(molid) == 1 && is.na(molid))) molid <- .self$getMoleculeIds() # Set default modes if (is.null(mode) || (length(mode) == 1 && is.na(mode))) mode <- c(MSDB.TAG.POS, MSDB.TAG.NEG) # Loop on all molecules for (mol in molid) { # Loop on all modes for (mod in mode) { m.peaks <- .self$.get.peaks(mol, mod) if ( ! is.null(m.peaks) && nrow(m.peaks) > 0) { m.peaks[[MSDB.TAG.MOLID]] <- mol m.peaks[[MSDB.TAG.MODE]] <- mod peaks <- if (is.null(peaks)) m.peaks else rbind(peaks, m.peaks) peaks <- df.move.col.first(peaks, c(MSDB.TAG.MOLID, MSDB.TAG.MODE)) } } } return(peaks) }) ################# # GET MZ VALUES # ################# # Returns a numeric vector of all masses stored inside the database. MsXlsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { mz <- numeric() # Get all mz values of all molecules for(molid in .self$getMoleculeIds()) for (m in (if (is.null(mode) || is.na(mode)) c(MSDB.TAG.POS, MSDB.TAG.NEG) else mode)) mz <- c(mz, .self$.get.peaks(molid, m)[[MSDB.TAG.MZTHEO]]) # Remove duplicated mz <- mz[ ! duplicated(mz)] # Apply cut-off if ( ! is.na(max.results)) mz <- mz[1:max.results] return(mz) }) ############# # GET PEAKS # ############# MsXlsDb$methods( .get.peaks = function(molid, mode) { tab <- if (mode == MSDB.TAG.POS) .XLS_MSPOS_TAB else .XLS_MSNEG_TAB # Find it in memory peak_df <- .self$.mem.get(molid, 'peaks', mode) if (is.null(peak_df)) { # Call observers if ( ! is.null(.self$.observers)) for (obs in .self$.observers) obs$progress(paste0("Loading peaks of tab ", tab, " of file ", .self$.get.file(molid), "."), level = 2) peak_df <- NULL # Load from cache file cache_file <- NA_character_ if ( ! is.na(.self$.get.cache.dir())) { cache_file <- file.path(.self$.get.cache.dir(), paste0('peaks-', molid, '-', tab, '.csv')) if (file.exists(cache_file)) peak_df <- read.csv(cache_file, header = TRUE, stringsAsFactors = FALSE) } # Read from XLS file, if not in cache if (is.null(peak_df)) { # Load tab (peaks start at row 8) if (.self$.tab.exists(.self$.get.file(molid), tab)) { peaks <- read.excel(.self$.get.file(molid), tab, start.row = .XLS_PEAKS_ROW_OFFSET, stringsAsFactors = FALSE) if ( ! is.null(peaks)) peaks <- peaks[ ! is.na(peaks[.XLS_MZ_COL]), , drop = FALSE] # Remove rows where m/z is not defined. TODO maybe call observer for notify a line with non NA values but without m/z value. # Instantiate peaks if ( ! is.null(peaks) && nrow(peaks) > 0) { peak_df <- peaks[1:length(peaks[[.XLS_MZ_COL]]), c(.XLS_MZ_COL, .XLS_THEORETICAL_MZ_COL, .XLS_INTENSITY_COL, .XLS_RELATIVE_COL, .XLS_COMPOSITION_COL, .XLS_ATTRIBUTION_COL), drop = FALSE] colnames(peak_df) <- c(MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.COMP, MSDB.TAG.ATTR) } # Set default data frame (important for cache file writing, because we need a correct header to be written in order for loading) else { peak_df <- data.frame(stringsAsFactors = FALSE) peak_df[MSDB.TAG.MZEXP] <- numeric() peak_df[MSDB.TAG.MZTHEO] <- numeric() peak_df[MSDB.TAG.INT] <- numeric() peak_df[MSDB.TAG.REL] <- numeric() peak_df[MSDB.TAG.COMP] <- character() peak_df[MSDB.TAG.ATTR] <- character() } if (is.null(peak_df)) peak_df <- data.frame() # Write in cache if ( ! is.na(cache_file)) { # Call observers if ( ! is.null(.self$.observers)) for (obs in .self$.observers) obs$progress(paste0("Caching peaks of tab ", tab, " of file ", .self$.get.file(molid), ".")) write.csv(peak_df, cache_file, row.names = FALSE) } } } # Store in memory .self$.mem.set(peak_df, molid, 'peaks', mode) } return(peak_df) }) ############################## # GET FULL MS PEAK M/Z INDEX # ############################## # Get mz index for full ions, creating it if necessary. MsXlsDb$methods( .get.mz.index = function(mode) { if (is.null(.self$.mz.index[[mode]])) { # Initialize data frame mzi <- data.frame(stringsAsFactors = FALSE) mzi[MSDB.TAG.MZTHEO] <- numeric() mzi[MSDB.TAG.MOLID] <- character() mzi[MSDB.TAG.COMP] <- character() mzi[MSDB.TAG.ATTR] <- character() # Loop on all molecules for(molid in .self$getMoleculeIds()) { # Get all peaks of this molecule peaks <- .self$.get.peaks(molid, mode) # Remove rows whose mz is NA. peaks <- peaks[ ! is.na(peaks[[MSDB.TAG.MZTHEO]]), ] if (nrow(peaks) > 0) { # Add id column peaks[MSDB.TAG.MOLID] <- molid # Append peaks r <- nrow(mzi) + 1 rows <- r:(r+nrow(peaks)-1) mzi[rows, ] <- peaks[colnames(mzi)] } } # Sort by M/Z sorted_indices <- order(mzi[[MSDB.TAG.MZTHEO]]) # Group in a data frame .self$.mz.index[[mode]] <- mzi[sorted_indices, ] } return(.self$.mz.index[[mode]]) }) ###################### # SEARCH FOR MZ & RT # ###################### MsXlsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { # Search for m/z results <- .self$.do.search.for.mz(mode, mz.low, mz.high) # Filter on attributions if ( ! is.null(attribs)) { results <- results[results[[MSDB.TAG.ATTR]] %in% attribs, ] } # Filer on molecule IDs if ( ! is.null(molids)) { results <- results[results[[MSDB.TAG.MOLID]] %in% molids, ] } # Use retention time if ( ! is.null(col) && ! is.null(rt.low) && ! is.null(rt.high)) { # Get list of unique IDs ids <- results[[MSDB.TAG.MOLID]] ids <- ids[ ! duplicated(ids)] rt <- .self$.search.for.rt(mols = ids, rt.low = rt.low, rt.high = rt.high, col = col) results <- results[results[[MSDB.TAG.MOLID]] %in% rt[[MSDB.TAG.MOLID]], ] results <- merge(results, rt) } return(results) }) ############################## # SEARCH FOR M/Z IN MS PEAKS # ############################## MsXlsDb$methods( .do.search.for.mz = function(mode, mz.low, mz.high) { results <- data.frame(stringsAsFactors = FALSE) results[MSDB.TAG.MZTHEO] <- numeric() results[MSDB.TAG.MOLID] <- character() results[MSDB.TAG.MOLNAMES] <- character() results[MSDB.TAG.COMP] <- character() results[MSDB.TAG.ATTR] <- character() # Create m/z index mz_index <- .self$.get.mz.index(mode) # Find molecules low_bound <- binary.search(mz.low, mz_index[[MSDB.TAG.MZTHEO]], lower = FALSE) high_bound <- binary.search(mz.high, mz_index[[MSDB.TAG.MZTHEO]], lower = TRUE) # Get results if ( ! is.na(high_bound) && ! is.na(low_bound) && low_bound <= high_bound) results <- mz_index[low_bound:high_bound,] # Remove row names rownames(results) <- NULL return(results) }) ################ # GET MOL NAME # ################ MsXlsDb$methods( .get.mol.name = function(molid) { if (is.na(molid)) return(NA_character_) # Find it in memory name <- .self$.mem.get(molid, 'name') if (is.null(name)) { # Load molecule mol <- .self$.load.molecule(molid) # Look for name in tabs for (tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { hdr <- mol[[tab]][['header']] if ( ! is.null(hdr)) name <- hdr[[1]] if ( ! is.null(name) && ! is.na(name)) break } # Store in memory if (is.null(name)) name <- NA_character_ .self$.mem.set(name, molid, 'name') } return(name) }) ################## # GET NAME INDEX # ################## # Get name index. MsXlsDb$methods( .get.name.index = function() { if (is.null(.self$.name_index)) { # Get names names <- vapply(.self$getMoleculeIds(), function(id) toupper(.self$getMoleculeName(id)), FUN.VALUE = "") # Get molecule IDs id <- .self$getMoleculeIds() # Sort by names sorted_indices <- order(names) # Group in a data frame .self$.name_index <- data.frame(name = rbind(names)[, sorted_indices], id = rbind(id)[, sorted_indices], stringsAsFactors = FALSE) } return(.self$.name_index) }) ################## # INIT FILE LIST # ################## MsXlsDb$methods( .init.file.list = function() { if (is.null(.self$.files)) { # List all files files <- Sys.glob(file.path(.self$.db_dir, '*.xls')) # Limit the size of the database if ( ! is.na(.self$.limit)) files <- head(files, .self$.limit) # Get IDs ids <- vapply(files, function(f) .extract_molecule_id_from_filename(f), FUN.VALUE = 1) # Use ids as indices to build the vector of files .files <<- rep(NA_character_, max(ids)) .files[ids] <<- files } }) ################# # GET CACHE DIR # ################# MsXlsDb$methods( .get.cache.dir = function() { if ( ! is.na(.self$.cache_dir) && ! file.exists(.self$.cache_dir)) dir.create(.self$.cache_dir) return(.self$.cache_dir) }) ################# # LOAD MOLECULE # ################# MsXlsDb$methods( .load.molecule = function(molid) { # Init local variables mol <- NULL cache_file <- NA_character_ excel_file <- .self$.get.file(molid) # Call observers if ( ! is.null(.self$.observers)) for (obs in .self$.observers) obs$progress(paste0("Loading molecule ", molid, "."), level = 2) # Load from cache if ( ! is.na(.self$.get.cache.dir())) { cache_file <- file.path(.self$.get.cache.dir(), paste0(molid, '.bin')) if (file.exists(cache_file)) load(file = cache_file) # load mol variable } # Load from Excel file & write to cache if (is.null(mol) && ! is.na(excel_file)) { source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. # Load from Excel file for(tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { # Test that tab exists if (.self$.tab.exists(excel_file, tab)) { header <- read.excel(excel_file, tab, start.row = 1, end.row = .XLS_PEAKS_ROW_OFFSET - 1, header = FALSE, stringsAsFactors = FALSE, trim.values = TRUE, col.index = c(1))[[1]] peaks <- read.excel(excel_file, tab, start.row = .XLS_PEAKS_ROW_OFFSET) mol[[tab]] <- list(header = header, peaks = peaks) } # Missing tab else { for (obs in .self$.observers) obs$warning(paste0("No excel tab ", tab, " in file ", excel_file, ".")) } } # Write in cache if ( ! is.na(cache_file)) { # Call observers if ( ! is.null(.self$.observers)) for (obs in .self$.observers) obs$progress(paste0("Caching file ", excel_file, ".")) save(mol, file = cache_file) } } return(mol) }) ######################## # DOES EXCEL TAB EXIST # ######################## MsXlsDb$methods( .tab.exists = function(file, tab) { source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. if ( ! tab.exists(file, tab)) { # Warn observers for (obs in .self$.observers) obs$warning(paste0("No excel tab ", tab, " in file ", file, ".")) return(FALSE) } return(TRUE) }) ######################### # PARSE RETENTION TIMES # ######################### MsXlsDb$methods( .parse_retention_times = function(id, tab) { rt <- NULL if (.self$.tab.exists(.self$.get.file(id), tab)) { peaks <- read.excel(.self$.get.file(id), tab, start.row = .XLS_PEAKS_ROW_OFFSET) # Get retention times if ( ! is.null(peaks) && length(peaks) > 0 && ! is.na(peaks[[1]][[1]])) for (c in .XLS_PEAKS_RT_COL_START:length(names(peaks))) if ( ! is.na(peaks[[c]][[1]])) { # Check retention times of all different m/z peaks for the same column. .self$.check_retention_times(id, tab, names(peaks)[[c]], peaks[[c]], sum( ! is.na(peaks[[1]]))) # Add retention time # TODO The column names are transformed through the read.xlsx call. For instance: # HPLC (C18) 25mn QTOF (Bis) --> HPLC..C18..25mn.QTOF..Bis. # ZICpHILIC 150*5*2.1 Shimadzu-Exactive-42mn --> ZICpHILIC.150.5.2.1.Shimadzu.Exactive.42mn # This can be an issue, since we loose the formating. col_id <- names(peaks)[[c]] time <- peaks[[c]][[1]] * 60 # Read and convert retention time in seconds. if (is.null(rt) || ! col_id %in% names(rt)) rt[[col_id]] <- list(time) else rt[[col_id]] <- c(rt[[col_id]], time) } } return(rt) }) ######################### # CHECK RETENTION TIMES # ######################### MsXlsDb$methods( .check_retention_times = function(id, tab_name, column_name, rt, n) { if (n >= 1 && ! is.null(.self$.observers) && length(.self$.observers) > 0) # Check column only if there is at least one value inside if (sum( ! is.na(rt)) > 0) # Loop on all values for(i in 1:n) { # Check that it's defined if (i > 1 && is.na(rt[[i]])) for (obs in .self$.observers) obs$warning(paste0("Retention times undefined for column ", column_name, " at row ", i + .XLS_PEAKS_ROW_OFFSET, " of tab ", tab_name, " in file ", .self$.get.file(id), ".")) else if (i > 1) # Check the value (it must be constant) if (rt[[i-1]] != rt[[i]]) for (obs in .self$.observers) obs$error(paste0("Retention times not constant for column ", column_name, " between row ", i - 1 + .XLS_PEAKS_ROW_OFFSET, " and row ", i + .XLS_PEAKS_ROW_OFFSET, "o tab", tab_name, "in file", .self$.get.file(id))) } }) #################### # GET FILE FROM ID # #################### MsXlsDb$methods( .get.file = function(id) { # List files .self$.init.file.list() return( if (id > 0 && id <= length(.self$.files)) .self$.files[id] else NA_character_) }) ########### # MEM GET # ########### # Get database data from memory MsXlsDb$methods( .mem.get = function(molid, field, second.field = NA_character_) { data <- .self$.db[[as.character(molid)]][[field]] if ( ! is.na(second.field)) data <- data[[second.field]] return(data) }) ########### # MEM SET # ########### # Set database data into memory MsXlsDb$methods( .mem.set = function(data, molid, field, second.field = NA_character_) { id <- as.character(molid) # Create db if (is.null(.self$.db)) .db <<- list() # Create first level if (is.null(.self$.db[[id]])) .self$.db[[id]] <- list() # Create second level if ( ! is.na(second.field) && is.null(.self$.db[[id]][[field]])) .self$.db[[id]][[field]] <- list() # Store data if (is.na(second.field)) { .self$.db[[id]][[field]] <- data } else { .self$.db[[id]][[field]][[second.field]] <- data } }) ################# # SEARCH FOR RT # ################# # Find molecules matching a certain retention time. # col A list of chromatographic columns to use. # rt.low The lower bound of the rt value. # rt.high The higher bound of the rt value. # mols A list of molecule IDs to process. If unset, then take all molecules. # Return a data frame with the following columns: id, col, colrt. MsXlsDb$methods( .search.for.rt = function(col, rt.low, rt.high, mols = NULL) { # Use all molecules if no list is provided if (is.null(mols)) mols <- .self$getMoleculeIds() results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE) # Loop on all molecules for (molid in mols) { no.col <- TRUE for (c in col) { molrts <- .self$getRetentionTimes(molid, c) if ( ! is.null(molrts)) { no.col <- FALSE for (molrt in molrts) { if (molrt >= rt.low && molrt <= rt.high) { r <- nrow(results) + 1 results[r, ] <- c(id = molid, col = c, colrt = molrt) } } } } if (no.col) { r <- nrow(results) + 1 results[r, c(MSDB.TAG.MOLID)] <- c(id = molid) } } return(results) }) ############################ # EXTRACT ID FROM FILENAME # ############################ .extract_molecule_id_from_filename <- function(filename) { id <- NA_integer_ if ( ! is.na(filename)) { g <- str_match(filename, "N(\\d+)[._-]") if ( ! is.na(g[1,1])) id <- as.numeric(g[1,2]) } return(id) } } # end of load safe guard
