Mercurial > repos > prog > lcmsmatching

diff MsXlsDb.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author: prog
date: Tue, 12 Jul 2016 12:02:37 -0400
children: 20d69a062da3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MsXlsDb.R	Tue Jul 12 12:02:37 2016 -0400
@@ -0,0 +1,842 @@
+if ( ! exists('MsXlsDb')) { # Do not load again if already loaded
+
+	library('methods')
+	library('stringr')
+	source('msdb-common.R')
+	source('MsDb.R')
+	source('strhlp.R', chdir = TRUE)
+	source('dfhlp.R', chdir = TRUE)
+	source('search.R', chdir = TRUE)
+	source('excelhlp.R', chdir = TRUE)
+	
+	#############
+	# CONSTANTS #
+	#############
+	
+	.THIS.FILE.PATH <- getwd() # We suppose that the file has been sourced with the option chdir = TRUE
+
+	.XLS_PEAKS_ROW_OFFSET <- 8
+	.XLS_PEAKS_RT_COL_START <- 11
+	.XLS_MSPOS_TAB <- 'MS_POS'
+	.XLS_MSNEG_TAB <- 'MS_NEG'
+	.XLS_MZ_COL <- 1
+	.XLS_INTENSITY_COL <- 2
+	.XLS_RELATIVE_COL <- 3
+	.XLS_THEORETICAL_MZ_COL <- 5
+	.XLS_COMPOSITION_COL <- 8
+	.XLS_ATTRIBUTION_COL <- 9
+	
+	#####################
+	# CLASS DECLARATION #
+	#####################
+	
+	MsXlsDb <- setRefClass("MsXlsDb", contains = "MsDb", fields = list(.mz.index = "ANY", .name_index = "ANY", .db_dir = "character", .limit = "numeric", .files = "ANY", .cache_dir = "character", .db = "ANY"))
+	
+	###############
+	# CONSTRUCTOR #
+	###############
+	
+	MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) {
+
+		# Initialize members
+		.db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_
+		.limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_
+		cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir
+		.cache_dir <<- if ( cache || ! is.null(cache_dir)) cache_dir else NA_character_
+		.files <<- NULL
+		.db <<- NULL
+		.mz.index <<- NULL
+		.name_index <<- NULL
+	
+		callSuper(...)
+	})
+	
+	####################
+	# GET MOLECULE IDS #
+	####################
+	
+	MsXlsDb$methods( getMoleculeIds = function() {
+	
+		# Init file list
+		.self$.init.file.list()
+
+		# Get IDs
+		mol.ids <- as.integer(which( ! is.na(.self$.files)))
+
+		return(mol.ids)
+	})
+	
+	####################
+	# GET NB MOLECULES #
+	####################
+	
+	# Returns a list of all molecule names
+	MsXlsDb$methods( getNbMolecules = function() {
+		return(length(.self$getMoleculeIds()))
+	})
+
+	#####################
+	# GET MOLECULE NAME #
+	#####################
+	
+	MsXlsDb$methods( getMoleculeName = function(molid) {
+		return(vapply(molid, function(m) .self$.get.mol.name(m), FUN.VALUE = ""))
+	})
+	
+	###############################
+	# GET CHROMATOGRAPHIC COLUMNS #
+	###############################
+	
+	# Returns a list of all chromatographic columns used
+	MsXlsDb$methods( getChromCol = function(molid = NULL) {
+	
+	    cn <- character()
+
+		# If no molecule IDs provided, then look at all molecules
+	    if	(is.null(molid))
+	    	molid <- .self$getMoleculeIds()
+
+		# Loop on molecules
+		for (mid in molid) {
+
+	    	rt <- .self$getRetentionTimes(mid)
+
+			if ( ! is.null(rt))
+				cn <- c(cn, names(rt))
+		}
+	
+		# Remove duplicates
+		cn <- cn[ ! duplicated(cn)]
+
+		# Make data frame
+		cn <- data.frame(id = cn, title = cn, stringsAsFactors = FALSE)
+
+		return(cn)
+	})
+
+	################
+	# FIND BY NAME #
+	################
+
+	MsXlsDb$methods( findByName = function(name) {
+
+		# NULL entry
+		if (is.null(name))
+			return(NA_integer_)
+	
+		# Initialize output list
+		ids <- NULL
+
+		for (n in name) {
+
+			id <- NA_integer_
+
+			if ( ! is.na(n)) {
+
+				# Get index
+				index <- .self$.get.name.index()
+
+				# Search for name in index
+				i <- binary.search(toupper(n), index[['name']])
+
+				id <- if (is.na(i)) NA_integer_ else index[i, 'id']
+			}
+
+			ids <- c(ids, id)
+		}
+
+		return(ids)
+	})
+	
+	#######################
+	# GET RETENTION TIMES #
+	#######################
+	
+	MsXlsDb$methods( getRetentionTimes = function(molid, col = NA_character_) {
+
+		if (is.null(molid) || is.na(molid))
+			return(NULL)
+
+		# Find it in memory
+		rt <- .self$.mem.get(molid, 'rt')
+
+		if (is.null(rt)) {
+	
+			# Call observers
+			if ( ! is.null(.self$.observers))
+				for (obs in .self$.observers)
+					obs$progress(paste0("Loading retention times of file", .self$.get.file(molid), "."), level = 2)
+		
+			rt <- NULL
+		
+			# Load from cache file
+			cache_file <- NA_character_
+			if ( ! is.na(.self$.get.cache.dir())) {
+				cache_file <- file.path(.self$.get.cache.dir(), paste0('rt-', molid, '.bin'))
+				if (file.exists(cache_file))
+					load(file = cache_file) # load rt
+			}
+		
+			if (is.null(rt)) {
+		
+				# Get retention times of both positive and negative mode tabs
+				mspos_rt <- .self$.parse_retention_times(molid, .XLS_MSPOS_TAB)
+				msneg_rt <- .self$.parse_retention_times(molid, .XLS_MSNEG_TAB)
+		
+				# Retention times stored in negative and positive modes
+				if ( ! is.null(mspos_rt) && ! is.null(msneg_rt)) {
+
+					# Warn observers when both retention time lists are not identical
+					if ( ! identical(mspos_rt, msneg_rt))
+						for (obs in .self$.observers)
+							obs$warning(paste0("Retention times in negative and positive modes are different in file ", .self$.get.file(molid), "."))
+		
+					# Merge both lists
+					rt <- mspos_rt
+					for (c in names(msneg_rt))
+						if (c %in% names(rt)) {
+							v <- c(rt[[c]], msneg_rt[[c]])
+							rt[[c]] <- v[ ! duplicated(v)]
+						}
+						else
+							rt[[c]] <- msneg_rt[[c]]
+				}
+				else
+					# Set retention times
+					rt <- if (is.null(mspos_rt)) msneg_rt else mspos_rt
+
+				if (is.null(rt)) rt <- list()
+		
+				# Write in cache
+				if ( ! is.na(cache_file)) {
+		
+					# Call observers
+					if ( ! is.null(.self$.observers))
+						for (obs in .self$.observers)
+							obs$progress(paste0("Caching retention times of file ", .self$.get.file(molid), "."))
+
+					save(rt, file = cache_file)
+				}
+			}
+
+			# Store in memory
+			.self$.mem.set(rt, molid, 'rt')
+		}
+
+		# Select only one column if asked
+		if ( ! is.na(col)) rt <- rt[[col]]
+
+		return(rt)
+	})
+
+	#################
+	# GET NB PEAKS #
+	#################
+	
+	MsXlsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {
+
+		# Initialize parameters
+		if (is.null(molid) || (length(molid) == 1 && is.na(molid)))
+			molid <- .self$getMoleculeIds()
+		if (is.na(type))
+			type <- c(MSDB.TAG.POS, MSDB.TAG.NEG)
+
+		return(sum(vapply(molid, function(m) { if (is.na(m)) 0 else sum(vapply(type, function(t) { peaks <- .self$.get.peaks(m, t) ; if (is.null(peaks)) 0 else nrow(peaks) }, FUN.VALUE = 1)) }, FUN.VALUE = 1)))
+	})
+
+	##################
+	# GET PEAK TABLE #
+	##################
+
+	MsXlsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) {
+
+		peaks <- NULL
+
+		# Set default molecule IDs
+		if (is.null(molid) || (length(molid) == 1 && is.na(molid)))
+			molid <- .self$getMoleculeIds()
+
+		# Set default modes
+		if (is.null(mode) || (length(mode) == 1 && is.na(mode)))
+			mode <- c(MSDB.TAG.POS, MSDB.TAG.NEG)
+
+		# Loop on all molecules
+		for (mol in molid) {
+
+			# Loop on all modes
+			for (mod in mode) {
+				m.peaks <- .self$.get.peaks(mol, mod)
+				if ( ! is.null(m.peaks) && nrow(m.peaks) > 0) {
+					m.peaks[[MSDB.TAG.MOLID]] <- mol
+					m.peaks[[MSDB.TAG.MODE]] <- mod
+					peaks <- if (is.null(peaks)) m.peaks else rbind(peaks, m.peaks)
+					peaks <- df.move.col.first(peaks, c(MSDB.TAG.MOLID, MSDB.TAG.MODE))
+				}
+			}
+		}
+
+		return(peaks)
+	})
+	
+	#################
+	# GET MZ VALUES #
+	#################
+	
+	# Returns a numeric vector of all masses stored inside the database.
+	MsXlsDb$methods( getMzValues = function(mode = NULL) {
+
+		mz <- numeric()
+
+		# Get all mz values of all molecules
+		for(molid in .self$getMoleculeIds())
+			for (m in (if (is.null(mode) || is.na(mode)) c(MSDB.TAG.POS, MSDB.TAG.NEG) else mode))
+				mz <- c(mz, .self$.get.peaks(molid, m)[[MSDB.TAG.MZTHEO]])
+
+		# Remove duplicated
+		mz <- mz[ ! duplicated(mz)]
+
+		return(mz)
+	})
+	
+	#############
+	# GET PEAKS #
+	#############
+	
+	MsXlsDb$methods( .get.peaks = function(molid, mode) {
+	
+		tab <- if (mode == MSDB.TAG.POS) .XLS_MSPOS_TAB else .XLS_MSNEG_TAB
+
+		# Find it in memory
+		peak_df <- .self$.mem.get(molid, 'peaks', mode)
+
+		if (is.null(peak_df)) {
+			# Call observers
+			if ( ! is.null(.self$.observers))
+				for (obs in .self$.observers)
+					obs$progress(paste0("Loading peaks of tab ", tab, " of file ", .self$.get.file(molid), "."), level = 2)
+		
+			peak_df <- NULL
+		
+			# Load from cache file
+			cache_file <- NA_character_
+			if ( ! is.na(.self$.get.cache.dir())) {
+				cache_file <- file.path(.self$.get.cache.dir(), paste0('peaks-', molid, '-', tab, '.csv'))
+				if (file.exists(cache_file))
+					peak_df <- read.csv(cache_file, header = TRUE, stringsAsFactors = FALSE)
+			}
+		
+			# Read from XLS file, if not in cache
+			if (is.null(peak_df)) {
+		
+				# Load tab (peaks start at row 8)
+				if (.self$.tab.exists(.self$.get.file(molid), tab)) {
+
+					peaks <- read.excel(.self$.get.file(molid), tab, start.row = .XLS_PEAKS_ROW_OFFSET, stringsAsFactors = FALSE)
+					if ( ! is.null(peaks))
+						peaks <- peaks[ ! is.na(peaks[.XLS_MZ_COL]), , drop = FALSE] # Remove rows where m/z is not defined. TODO maybe call observer for notify a line with non NA values but without m/z value.
+		
+					# Instantiate peaks
+					if ( ! is.null(peaks) && nrow(peaks) > 0) {
+						peak_df <- peaks[1:length(peaks[[.XLS_MZ_COL]]), c(.XLS_MZ_COL, .XLS_THEORETICAL_MZ_COL, .XLS_INTENSITY_COL, .XLS_RELATIVE_COL, .XLS_COMPOSITION_COL, .XLS_ATTRIBUTION_COL), drop = FALSE]
+						colnames(peak_df) <- c(MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.COMP, MSDB.TAG.ATTR)
+					}
+		
+					# Set default data frame (important for cache file writing, because we need a correct header to be written in order for loading)
+					else {
+						peak_df <- data.frame(stringsAsFactors = FALSE)
+						peak_df[MSDB.TAG.MZEXP] <- numeric()
+ 				   		peak_df[MSDB.TAG.MZTHEO] <- numeric()
+ 				   		peak_df[MSDB.TAG.INT] <- numeric()
+ 				   		peak_df[MSDB.TAG.REL] <- numeric()
+ 				   		peak_df[MSDB.TAG.COMP] <- character()
+ 				   		peak_df[MSDB.TAG.ATTR] <- character()
+					}
+
+					if (is.null(peak_df)) peak_df <- data.frame()
+		
+					# Write in cache
+					if ( ! is.na(cache_file)) {
+		
+						# Call observers
+						if ( ! is.null(.self$.observers))
+							for (obs in .self$.observers)
+								obs$progress(paste0("Caching peaks of tab ", tab, " of file ", .self$.get.file(molid), "."))
+			
+						write.csv(peak_df, cache_file, row.names = FALSE)
+					}
+				}
+			}
+
+			# Store in memory
+			.self$.mem.set(peak_df, molid, 'peaks', mode)
+		}
+
+		return(peak_df)
+	})
+	
+	##############################
+	# GET FULL MS PEAK M/Z INDEX #
+	##############################
+	
+	# Get mz index for full ions, creating it if necessary.
+	MsXlsDb$methods( .get.mz.index = function(mode) {
+	
+		if (is.null(.self$.mz.index[[mode]])) {
+	
+			# Initialize data frame
+			mzi <- data.frame(stringsAsFactors = FALSE)
+			mzi[MSDB.TAG.MZTHEO]    <- numeric()
+			mzi[MSDB.TAG.MOLID]     <- character()
+			mzi[MSDB.TAG.COMP]      <- character()
+			mzi[MSDB.TAG.ATTR]      <- character()
+
+			# Loop on all molecules
+			for(molid in .self$getMoleculeIds()) {
+
+				# Get all peaks of this molecule
+				peaks <- .self$.get.peaks(molid, mode)
+
+				# Remove rows whose mz is NA.
+				peaks <- peaks[ ! is.na(peaks[[MSDB.TAG.MZTHEO]]), ]
+
+				if (nrow(peaks) > 0) {
+
+					# Add id column
+					peaks[MSDB.TAG.MOLID] <- molid
+
+					# Append peaks
+					r <- nrow(mzi) + 1
+					rows <- r:(r+nrow(peaks)-1)
+					mzi[rows, ] <- peaks[colnames(mzi)]
+				}
+			}
+	
+			# Sort by M/Z
+			sorted_indices <- order(mzi[[MSDB.TAG.MZTHEO]])
+	
+			# Group in a data frame
+			.self$.mz.index[[mode]] <- mzi[sorted_indices, ]
+		}
+
+		return(.self$.mz.index[[mode]])
+	})
+
+	######################
+	# SEARCH FOR MZ & RT #
+	######################
+
+	MsXlsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {
+
+		# Search for m/z
+		results <- .self$.do.search.for.mz(mode, mz.low, mz.high)
+
+		# Filter on attributions
+		if ( ! is.null(attribs)) {
+			results <- results[results[[MSDB.TAG.ATTR]] %in% attribs, ]
+		}
+
+		# Filer on molecule IDs
+		if ( ! is.null(molids)) {
+			results <- results[results[[MSDB.TAG.MOLID]] %in% molids, ]
+		}
+
+		# Use retention time
+		if ( ! is.null(col) && ! is.null(rt.low) && ! is.null(rt.high)) {
+
+			# Get list of unique IDs
+			ids <- results[[MSDB.TAG.MOLID]]
+			ids <- ids[ ! duplicated(ids)]
+			rt <- .self$.search.for.rt(mols = ids, rt.low = rt.low, rt.high = rt.high, col = col)
+			results <- results[results[[MSDB.TAG.MOLID]] %in% rt[[MSDB.TAG.MOLID]], ]
+			results <- merge(results, rt)
+		}
+
+		return(results)
+	})
+
+	##############################
+	# SEARCH FOR M/Z IN MS PEAKS #
+	##############################
+
+	MsXlsDb$methods( .do.search.for.mz = function(mode, mz.low, mz.high) {
+	
+		results <- data.frame(stringsAsFactors = FALSE)
+		results[MSDB.TAG.MZTHEO]    <- numeric()
+		results[MSDB.TAG.MOLID]     <- character()
+		results[MSDB.TAG.MOLNAMES]  <- character()
+		results[MSDB.TAG.COMP]      <- character()
+		results[MSDB.TAG.ATTR]      <- character()
+
+		# Create m/z index
+		mz_index <- .self$.get.mz.index(mode)
+	
+		# Find molecules
+		low_bound <- binary.search(mz.low, mz_index[[MSDB.TAG.MZTHEO]], lower = FALSE)
+		high_bound <- binary.search(mz.high, mz_index[[MSDB.TAG.MZTHEO]], lower = TRUE)
+
+		# Get results
+		if ( ! is.na(high_bound) && ! is.na(low_bound) && low_bound <= high_bound)
+			results <- mz_index[low_bound:high_bound,]
+
+		# Remove row names
+		rownames(results) <- NULL
+
+		return(results)
+	})
+
+	################
+	# GET MOL NAME #
+	################
+	
+	MsXlsDb$methods( .get.mol.name = function(molid) {
+
+		if (is.na(molid))
+			return(NA_character_)
+
+		# Find it in memory
+		name <- .self$.mem.get(molid, 'name')
+
+		if (is.null(name)) {
+
+			# Load molecule
+			mol <- .self$.load.molecule(molid)
+			
+			# Look for name in tabs
+			for (tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) {
+				hdr <- mol[[tab]][['header']]
+				if ( ! is.null(hdr))
+					name <- hdr[[1]]
+				if ( ! is.null(name) && ! is.na(name)) break
+			}
+
+			# Store in memory
+			if (is.null(name)) name <-  NA_character_
+			.self$.mem.set(name, molid, 'name')
+		}
+
+		return(name)
+	})
+	
+	##################
+	# GET NAME INDEX #
+	##################
+	
+	# Get name index.
+	MsXlsDb$methods( .get.name.index = function() {
+	
+		if (is.null(.self$.name_index)) {
+	
+			# Get names
+			names <- vapply(.self$getMoleculeIds(), function(id) toupper(.self$getMoleculeName(id)), FUN.VALUE = "")
+	
+			# Get molecule IDs
+			id <- .self$getMoleculeIds()
+	
+			# Sort by names
+			sorted_indices <- order(names)
+	
+			# Group in a data frame
+			.self$.name_index <- data.frame(name = rbind(names)[, sorted_indices],
+			                                id = rbind(id)[, sorted_indices],
+			                                stringsAsFactors = FALSE)
+		}
+
+		return(.self$.name_index)
+	})
+
+	##################
+	# INIT FILE LIST #
+	##################
+	
+	MsXlsDb$methods( .init.file.list = function() {
+
+		if (is.null(.self$.files)) {
+	
+			# List all files
+			files <- Sys.glob(file.path(.self$.db_dir, '*.xls')) 
+
+			# Limit the size of the database
+			if ( ! is.na(.self$.limit))
+				files <- head(files, .self$.limit)
+	
+			# Get IDs
+			ids <- vapply(files, function(f) .extract_molecule_id_from_filename(f), FUN.VALUE = 1)
+
+			# Use ids as indices to build the vector of files
+			.files <<- rep(NA_character_, max(ids))
+			.files[ids] <<- files
+		}
+	})
+
+	#################
+	# GET CACHE DIR #
+	#################
+	
+	MsXlsDb$methods( .get.cache.dir = function() {
+	
+		if ( ! is.na(.self$.cache_dir) && ! file.exists(.self$.cache_dir))
+			dir.create(.self$.cache_dir)
+	
+		return(.self$.cache_dir)
+	})
+
+	#################
+	# LOAD MOLECULE #
+	#################
+	
+	MsXlsDb$methods( .load.molecule = function(molid) {
+
+		# Init local variables
+		mol <- NULL
+		cache_file <- NA_character_
+		excel_file <- .self$.get.file(molid)
+
+		# Call observers
+		if ( ! is.null(.self$.observers))
+			for (obs in .self$.observers)
+				obs$progress(paste0("Loading molecule ", molid, "."), level = 2)
+
+		# Load from cache
+		if ( ! is.na(.self$.get.cache.dir())) {
+			cache_file <- file.path(.self$.get.cache.dir(), paste0(molid, '.bin'))
+			if (file.exists(cache_file))
+				load(file = cache_file) # load mol variable
+		}
+
+		# Load from Excel file & write to cache
+		if (is.null(mol) && ! is.na(excel_file)) {
+
+			source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different.
+
+			# Load from Excel file
+			for(tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) {
+
+				# Test that tab exists
+				if (.self$.tab.exists(excel_file, tab)) {
+					header <- read.excel(excel_file, tab, start.row = 1, end.row = .XLS_PEAKS_ROW_OFFSET - 1, header = FALSE, stringsAsFactors = FALSE, trim.values = TRUE, col.index = c(1))[[1]]
+					peaks <- read.excel(excel_file, tab, start.row = .XLS_PEAKS_ROW_OFFSET)
+					mol[[tab]] <- list(header = header, peaks = peaks)
+				}
+
+				# Missing tab
+				else {
+					for (obs in .self$.observers)
+						obs$warning(paste0("No excel tab ", tab, " in file ", excel_file, "."))
+				}
+			}
+	
+			# Write in cache
+			if ( ! is.na(cache_file)) {
+	
+				# Call observers
+				if ( ! is.null(.self$.observers))
+					for (obs in .self$.observers)
+						obs$progress(paste0("Caching file ", excel_file, "."))
+
+				save(mol, file = cache_file)
+			}
+		}
+
+		return(mol)
+	})
+	
+	########################
+	# DOES EXCEL TAB EXIST #
+	########################
+	
+	MsXlsDb$methods( .tab.exists = function(file, tab) {
+	
+		source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different.
+	
+		if ( ! tab.exists(file, tab)) {
+
+			# Warn observers
+			for (obs in .self$.observers)
+				obs$warning(paste0("No excel tab ", tab, " in file ", file, "."))
+
+			return(FALSE)
+		}
+	
+		return(TRUE)
+	})
+	
+	#########################
+	# PARSE RETENTION TIMES #
+	#########################
+	
+	MsXlsDb$methods( .parse_retention_times = function(id, tab) {
+	
+	    rt <- NULL
+	
+		if (.self$.tab.exists(.self$.get.file(id), tab)) {
+			peaks <- read.excel(.self$.get.file(id), tab, start.row = .XLS_PEAKS_ROW_OFFSET)
+	
+			# Get retention times
+			if ( ! is.null(peaks) && length(peaks) > 0 && ! is.na(peaks[[1]][[1]]))
+				for (c in .XLS_PEAKS_RT_COL_START:length(names(peaks)))
+					if ( ! is.na(peaks[[c]][[1]])) {
+	
+						# Check retention times of all different m/z peaks for the same column.
+						.self$.check_retention_times(id, tab, names(peaks)[[c]], peaks[[c]], sum( ! is.na(peaks[[1]])))
+	
+						# Add retention time
+						# TODO The column names are transformed through the read.xlsx call. For instance:
+						#           HPLC (C18) 25mn QTOF (Bis)                  --> HPLC..C18..25mn.QTOF..Bis.
+	  					#           ZICpHILIC 150*5*2.1 Shimadzu-Exactive-42mn  --> ZICpHILIC.150.5.2.1.Shimadzu.Exactive.42mn
+						# This can be an issue, since we loose the formating.
+						col_id <- names(peaks)[[c]]
+						time <- peaks[[c]][[1]] * 60 # Read and convert retention time in seconds.
+						if (is.null(rt) || ! col_id %in% names(rt))
+							rt[[col_id]] <- list(time)
+						else
+							rt[[col_id]] <- c(rt[[col_id]], time)
+					}
+		}
+		
+		return(rt)
+	})
+	
+	#########################
+	# CHECK RETENTION TIMES #
+	#########################
+	
+	MsXlsDb$methods( .check_retention_times = function(id, tab_name, column_name, rt, n) {
+	
+		if (n >= 1  && ! is.null(.self$.observers) && length(.self$.observers) > 0)
+	
+			# Check column only if there is at least one value inside
+			if (sum( ! is.na(rt)) > 0)
+	
+				# Loop on all values
+				for(i in 1:n) {
+	
+					# Check that it's defined
+					if (i > 1 && is.na(rt[[i]]))
+						for (obs in .self$.observers)
+							obs$warning(paste0("Retention times undefined for column ", column_name, " at row ", i + .XLS_PEAKS_ROW_OFFSET, " of tab ", tab_name, " in file ", .self$.get.file(id), "."))
+	
+					else if (i > 1)
+						# Check the value (it must be constant)
+						if (rt[[i-1]] != rt[[i]])
+							for (obs in .self$.observers)
+								obs$error(paste0("Retention times not constant for column ", column_name, " between row ", i - 1 + .XLS_PEAKS_ROW_OFFSET, " and row ", i + .XLS_PEAKS_ROW_OFFSET, "o tab", tab_name, "in file", .self$.get.file(id)))
+				}
+	})
+	
+	####################
+	# GET FILE FROM ID #
+	####################
+	
+	MsXlsDb$methods( .get.file = function(id) {
+	
+		# List files
+		.self$.init.file.list()
+	
+		return( if (id > 0 && id <= length(.self$.files)) .self$.files[id] else NA_character_)
+	})
+	
+	###########
+	# MEM GET #
+	###########
+
+	# Get database data from memory
+	MsXlsDb$methods( .mem.get = function(molid, field, second.field = NA_character_) {
+
+		data <- .self$.db[[as.character(molid)]][[field]]
+
+		if ( ! is.na(second.field))
+			data <- data[[second.field]]
+
+		return(data)
+	})
+	
+	###########
+	# MEM SET #
+	###########
+
+	# Set database data into memory
+	MsXlsDb$methods( .mem.set = function(data, molid, field, second.field = NA_character_) {
+
+		id <- as.character(molid)
+
+		# Create db
+		if (is.null(.self$.db))
+		    .db <<- list()
+
+		# Create first level
+		if (is.null(.self$.db[[id]]))
+			.self$.db[[id]] <- list()
+
+		# Create second level
+		if ( ! is.na(second.field) && is.null(.self$.db[[id]][[field]]))
+			.self$.db[[id]][[field]] <- list()
+
+		# Store data
+		if (is.na(second.field)) {
+			.self$.db[[id]][[field]] <- data
+		} else {
+			.self$.db[[id]][[field]][[second.field]] <- data
+		}
+	})
+
+	#################
+	# SEARCH FOR RT #
+	#################
+
+	# Find molecules matching a certain retention time.
+	# col           A list of chromatographic columns to use.
+	# rt.low        The lower bound of the rt value.
+	# rt.high       The higher bound of the rt value.
+	# mols          A list of molecule IDs to process. If unset, then take all molecules.
+	# Return a data frame with the following columns: id, col, colrt.
+	MsXlsDb$methods( .search.for.rt = function(col, rt.low, rt.high, mols = NULL) {
+
+		# Use all molecules if no list is provided
+		if (is.null(mols))
+			mols <- .self$getMoleculeIds()
+
+		results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE)
+
+		# Loop on all molecules
+		for (molid in mols) {
+			no.col <- TRUE
+			for (c in col) {
+				molrts <- .self$getRetentionTimes(molid, c)
+				if ( ! is.null(molrts)) {
+					no.col <- FALSE
+					for (molrt in molrts) {
+						if (molrt >= rt.low && molrt <= rt.high) {
+							r <- nrow(results) + 1
+							results[r, ] <- c(id = molid, col = c, colrt = molrt)
+						}
+					}
+				}
+			}
+
+			if (no.col) {
+				r <- nrow(results) + 1
+				results[r, c(MSDB.TAG.MOLID)] <- c(id = molid)
+			}
+		}
+
+		return(results)
+	})
+
+	############################
+	# EXTRACT ID FROM FILENAME #
+	############################
+	
+	.extract_molecule_id_from_filename <- function(filename) {
+	
+		id <- NA_integer_
+	
+		if ( ! is.na(filename)) {
+			g <- str_match(filename, "N(\\d+)[._-]")
+			if ( ! is.na(g[1,1]))
+				id <- as.numeric(g[1,2])
+		}
+	
+		return(id)
+	}
+	
+} # end of load safe guard
author	prog
date	Tue, 12 Jul 2016 12:02:37 -0400
parents
children	20d69a062da3