diff MsDb.R @ 6:f86fec07f392 draft default tip

planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author prog
date Fri, 22 Feb 2019 16:04:22 -0500
parents fb9c0409d85c
children
line wrap: on
line diff
--- a/MsDb.R	Wed Apr 19 10:00:05 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,500 +0,0 @@
-if ( ! exists('MsDb')) { # Do not load again if already loaded
-	
-	library('methods')
-	source('msdb-common.R')
-	source('MsDbObserver.R')
-	source('MsDbOutputStream.R')
-
-	#####################
-	# CLASS DECLARATION #
-	#####################
-	
-	MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character", .rt.unit = "character"))
-	
-	###############
-	# CONSTRUCTOR #
-	###############
-	
-	MsDb$methods( initialize = function(...) {
-		
-		callSuper(...)
-
-		.observers <<- NULL
-		.output.streams <<- NULL
-		.input.stream <<- NULL
-		.prec <<- MSDB.DFT.PREC
-		.mz.tol.unit <<- MSDB.DFT.MZTOLUNIT
-		.rt.unit <<- MSDB.RTUNIT.SEC
-	})
-
-	####################
-	# SET INPUT STREAM #
-	####################
-	
-	MsDb$methods( setInputStream = function(stream) {
-	
-		# Check types of input stream
-		if ( ! inherits(stream, "MsDbInputStream") && ! is.null(stream))
-			stop("Input stream must inherit from MsDbInputStream class.")
-	
-		# Save current stream
-		cur.stream <- .self$.input.stream
-
-		# Set stream
-		.input.stream <<- stream
-
-		return(cur.stream)
-	})
-
-	######################
-	# ADD OUTPUT STREAMS #
-	######################
-	
-	MsDb$methods( addOutputStreams = function(stream) {
-	
-		# Check types of output streams
-		if ( ( ! is.list(stream) && ! inherits(stream, "MsDbOutputStream")) || (is.list(stream) && any( ! vapply(stream, function(s) inherits(s, "MsDbOutputStream"), FUN.VALUE = TRUE))))
-			stop("Output streams must inherit from MsDbOutputStream class.")
-	
-		# Add streams to current list
-		.output.streams <<- if (is.null(.self$.output.streams)) c(stream) else c(.self$.output.streams, stream)
-	})
-
-	#########################
-	# REMOVE OUTPUT STREAMS #
-	#########################
-	
-	MsDb$methods( removeOutputStreams = function(stream) {
-	
-		# Check types of output streams
-		if ( ( ! is.list(stream) && ! inherits(stream, "MsDbOutputStream")) || (is.list(stream) && any( ! vapply(stream, function(s) inherits(s, "MsDbOutputStream"), FUN.VALUE = TRUE))))
-
-		# Remove streams from current list
-		.output.streams <<- .self$.output.streams[ ! stream %in% .self$.output.streams]
-	})
-
-	########################
-	# RESET OUTPUT STREAMS #
-	########################
-	
-	MsDb$methods( resetOutputStreams = function(stream) {
-		.output.streams <<- NULL
-	})
-
-	#################
-	# ADD OBSERVERS #
-	#################
-	
-	MsDb$methods( addObservers = function(obs) {
-	
-		# Check types of observers
-		if ( ( ! is.list(obs) && ! inherits(obs, "MsDbObserver")) || (is.list(obs) && any( ! vapply(obs, function(o) inherits(o, "MsDbObserver"), FUN.VALUE = TRUE))))
-			stop("Observers must inherit from MsDbObserver class.")
-	
-		# Add observers to current list
-		.observers <<- if (is.null(.self$.observers)) c(obs) else c(.self$.observers, obs)
-	})
-	
-	##################
-	# SET PRECURSORS #
-	##################
-	
-	MsDb$methods( setPrecursors = function(prec) {
-		.prec <<- prec
-	})
-	
-	#################
-	# SET DB FIELDS #
-	#################
-	
-	MsDb$methods( areDbFieldsSettable = function() {
-		return(FALSE)
-	})
-	
-	MsDb$methods( setDbFields = function(fields) {
-		stop("Method setDbFields() not implemented in concrete class.")
-	})
-	
-	################
-	# SET MS MODES #
-	################
-	
-	MsDb$methods( areDbMsModesSettable = function() {
-		return(FALSE)
-	})
-	
-	MsDb$methods( setDbMsModes = function(modes) {
-		stop("Method setDbMsModes() not implemented in concrete class.")
-	})
-	
-	###################
-	# SET MZ TOL UNIT #
-	###################
-
-	MsDb$methods( setMzTolUnit = function(mztolunit) {
-
-		if ( ! mztolunit %in% MSDB.MZTOLUNIT.VALS)
-			stop(paste0("M/Z tolerance unit must be one of: ", paste(MSDB.MZTOLUNIT.VALS, collapse = ', '), "."))
-
-		.mz.tol.unit <<- mztolunit
-	})
-
-	###############
-	# SET RT UNIT #
-	###############
-
-	MsDb$methods( setRtUnit = function(unit) {
-
-		if ( ! unit %in% MSDB.RTUNIT.VALS)
-			stop(paste0("RT unit must be one of: ", paste(MSDB.RTUNIT.VALS, collapse = ', '), "."))
-
-		.rt.unit <<- unit
-	})
-
-	###############
-	# GET RT UNIT #
-	###############
-
-	MsDb$methods( getRtUnit = function(unit) {
-		return(.self$.rt.unit)
-	})
-
-	####################
-	# HANDLE COMPOUNDS #
-	####################
-	
-	# Returns TRUE if this database handles compounds directly (by IDs)
-	MsDb$methods( handleCompounds = function() {
-		return(TRUE)
-	})
-
-	####################
-	# GET MOLECULE IDS #
-	####################
-	
-	# Returns an integer vector of all molecule IDs stored inside the database.
-	MsDb$methods( getMoleculeIds = function(max.results = NA_integer_) {
-		stop("Method getMoleculeIds() not implemented in concrete class.")
-	})
-
-	####################
-	# GET NB MOLECULES #
-	####################
-	
-	# Returns the number of molecules in the database.
-	MsDb$methods( getNbMolecules = function() {
-		stop("Method getNbMolecules() not implemented in concrete class.")
-	})
-	
-	#################
-	# GET MZ VALUES #
-	#################
-	
-	# Returns a numeric vector of all masses stored inside the database.
-	MsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
-		stop("Method getMzValues() not implemented in concrete class.")
-	})
-	
-	#####################
-	# GET MOLECULE NAME #
-	#####################
-	
-	# Get molecule names
-	# molid     An integer vector of molecule IDs.
-	# Returns a character vector containing the names of the molecule IDs, in the same order as the input vector.
-	MsDb$methods( getMoleculeName = function(molid) {
-		stop("Method getMoleculeName() not implemented in concrete class.")
-	})
-	
-	###############################
-	# GET CHROMATOGRAPHIC COLUMNS #
-	###############################
-	
-	# Get chromatographic columns.
-	# Returns a vector of character listing the chromatographic column names. The name must be formatted in lowercase as following: uplc(-c8)?(-20min)?.
-	MsDb$methods( getChromCol = function(molid = NULL) {
-		stop("Method getChromCol() not implemented in concrete class.")
-	})
-
-	################
-	# FIND BY NAME #
-	################
-
-	# Find a molecule by name
-	# name  A vector of molecule names to search for.
-	# Return an integer vector of the same size as the name input vector, containing the found molecule IDs, in the same order.
-	MsDb$methods( findByName = function(name) {
-		stop("Method findByName() not implemented in concrete class.")
-	})
-	
-	#######################
-	# GET RETENTION TIMES #
-	#######################
-	
-	# Get the retention times of a molecule.
-	# Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list.
-	MsDb$methods( getRetentionTimes = function(molid, col = NA_character_) {
-		stop("Method getRetentionTimes() not implemented in concrete class.")
-	})
-	
-	################
-	# GET NB PEAKS #
-	################
-	
-	# Get the total number of MS peaks stored inside the database.
-	# molid     The ID of the molecule.
-	# type      The MS type.
-	MsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {
-		stop("Method getNbPeaks() not implemented in concrete class.")
-	})
-
-	##################
-	# GET PEAK TABLE #
-	##################
-
-	MsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) {
-		stop("Method getPeakTable() not implemented in concrete class.")
-	})
-
-	##########
-	# SEARCH #
-	##########
-
-	# Find molecule MS peaks whose m/z matches the submitted m/z in the tolerance specified.
-	# mode              The mode to use: either MSDB.TAG.POS or MSDB.TAG.NEG.
-	# shift             The m/z shift to use, in ppm.
-	# prec              The m/z precision to use, in ppm.
-	# col               The chromatographic column used.
-	# rt.tol            Simple retention tolerance parameter: rtinf = rt - rt.tol and rtsup = rt + rt.tol
-	# rt.tol.x          Tolerance parameter for the equations : rtinf = rt - rt.tol.x - rt ^ rt.tol.y and rtsup = rt + rt.tol.x + rt ^ rt.tol.y
-	# rt.tol.y          Tolerance parameter. See rt.tol.x parameter.
-	# attribs           Only search for peaks whose attribution is among this set of attributions.
-	# molids            Only search for peaks whose molecule ID is among this vector of integer molecule IDs. Can also be a data frame with a retention time column x.colnames$rt and a molecule ID column MSDB.TAG.MOLID.
-	# molids.rt.tol     Retention time tolerance used when molids parameter is a data frame (rt, id)
-	# precursor.match   Remove peaks whose molecule precursor peak has not also been matched.
-	# precursor.rt.tol
-	# Returns a data frame, listing m/z values provided in input. Several matches can be found for an m/z value, in which case several lines (the same number as the number of matches found) with the same m/z value repeated will be inserted. The m/z values will be listed in the same order as in the input. The columns of the data.frame are: mz, rt (only if present in the input), id, mztheo, col, colrt, composition, attribution.
-	MsDb$methods( searchForMzRtList = function(x = NULL, mode, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = NULL, molids.rt.tol = NULL, attribs = NULL, precursor.match = FALSE, precursor.rt.tol = NULL, same.cols = FALSE, same.rows = FALSE, peak.table = FALSE) {
-
-		# Use provided data frame
-		old.input <- NULL
-		tmp.output <- NULL
-		if ( ! is.null(x)) {
-			tmp.input <- MsDbInputDataFrameStream$new(df = x)
-			tmp.output <- MsDbOutputDataFrameStream$new()
-			old.input <- .self$setInputStream(tmp.input)
-			.self$addOutputStreams(tmp.output)
-		}
-
-		if (precursor.match) {
-			# Get IDs of all molecules whose precursor peak matches one of the mz in the list
-			precursors.df <- .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, attribs = .self$.prec[[mode]], output.to.stream = FALSE)
-			cols.to.keep <- if (is.null(col)) MSDB.TAG.MOLID else c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)
-			precursors.ids <- precursors.df[, cols.to.keep, drop = FALSE]
-			precursors.ids <- precursors.ids[ ! is.na(precursors.ids[[MSDB.TAG.MOLID]]), , drop = FALSE]
-			precursors.ids <- precursors.ids[ ! duplicated(precursors.ids), ]
-
-			# Get all matching peaks whose molecule is inside the previously obtained list of molecules
-			df <- .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = precursors.ids, molids.rt.tol = precursor.rt.tol, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table)
-# TODO 
-#
-#			peaks <- if (peak.table) results[['peaks']] else results
-#
-#			# Merge results with the column/rt found for precursors.
-#			if ( ! is.null(col) && ! is.null(peaks)) {
-#				precursors.ids <- precursors.df[, c(MSDB.TAG.MOLID, MSDB.TAG.col, MSDB.TAG.COLRT)]
-#				precursors.ids <- precursors.ids[ ! is.na(precursors.ids[[MSDB.TAG.MOLID]]), ]
-#
-#				# Get rows where ID is NA
-#				peaks.na <- peaks[is.na(peaks[[MSDB.TAG.MOLID]]), ]
-#
-#				# Get rows where ID is found (i.e.: not NA)
-#				peaks <- peaks[, !(colnames(peaks) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))] # drop col and colrt columns
-#				peaks.not.na <- peaks[! is.na(peaks[[MSDB.TAG.MOLID]]), ]
-#
-#				# Add col and colrt values to found peaks
-#				peaks <- merge(peaks.not.na, precursors.ids, by = MSDB.TAG.MOLID)
-#
-#				# Put back unfound peaks
-#				peaks <- rbind(peaks, peaks.na)
-#
-#				# Sort
-#				peaks <- peaks[order(peaks[[x.colnames$mz]], peaks[[x.colnames$rt]], peaks[[MSDB.TAG.MOLID]], peaks[[MSDB.TAG.COL]]), ]
-#
-#				# Remove rownames
-#				rownames(peaks) <- NULL
-#
-#				# Reorder columns
-#				peaks <- peaks[unlist(.self$.output.fields[names(.PEAK.TABLE.COLS)])]
-#			}
-#
-#			# Remove duplicates
-#			if ( ! is.null(peaks))
-#				peaks <- peaks[ ! duplicated(peaks), ]
-#
-#			if (peak.table)
-#				results[['peaks']] <- peaks
-#			else
-#				results <- peaks
-#
-#			return(results)
-		}
-		else
-			.self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, molids = molids, molids.rt.tol = molids.rt.tol, attribs = attribs, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table)
-
-		if ( ! is.null(x)) {
-			results <- tmp.output$getDataFrame()
-			.self$removeOutputStreams(tmp.output)
-			.self$setInputStream(old.input)
-			return(results)
-		}
-	})
-
-	MsDb$methods( .doSearchForMzRtList = function(mode, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = NULL, molids.rt.tol = NULL, attribs = NULL, same.cols = FALSE, same.rows = FALSE, peak.table = FALSE, output.to.stream = TRUE) {
-
-#		# Choose columns to keep from x
-#		x.cols <- if (same.cols) colnames(x) else intersect(if (is.null(col)) c(x.colnames$mz) else c(x.colnames$mz, x.colnames$rt), colnames(x))
-#
-#		# Create a peak fake data frame for defining columns
-#		peaks.fake <- data.frame(stringsAsFactors = FALSE)
-#		for (field in names(.PEAK.TABLE.COLS))
-#			if ( ! is.null(col) || ! field %in% .RT.MATCHING.COLS)
-#				peaks.fake[.self$.output.fields[[field]]] <- vector(mode = .PEAK.TABLE.COLS[[field]], length = 0)
-#
-#		# Initialize y data frame, so when x contains no rows an empty y data frame is returned with all the columns set with right type.
-#		if (same.rows) {
-#			y <- peaks.fake[, if (is.null(col)) c(MSDB.TAG.MZ) else c(MSDB.TAG.MZ, MSDB.TAG.RT), drop = FALSE]
-#			y[MSDB.TAG.MSMATCHING] <- character()
-#		}
-#		else
-#			y <- peaks.fake
-#		y <- cbind(y, x[NULL, ! x.cols %in% colnames(y), drop = FALSE])
-#		if (peak.table) {
-#			z <- peaks.fake
-#			z <- cbind(z, x[NULL, ! x.cols %in% colnames(z), drop = FALSE])
-#		}
-
-		# Loop on all lines of input
-		peaks <- NULL
-		.self$.input.stream$reset()
-		while (.self$.input.stream$hasNextValues()) {
-
-			.self$.input.stream$nextValues()
-
-				# Search for m/z
-				results <- .self$searchForMzRtTols(mode = mode, mz = .self$.input.stream$getMz(), shift = shift, prec = prec, rt = .self$.input.stream$getRt(), col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, attribs = attribs, molids = molids, molids.rt.tol = molids.rt.tol)
-
-				# Call output streams
-				if (output.to.stream && ! is.null(.self$.output.streams))
-					for (s in .self$.output.streams)
-						s$matchedPeaks(mz = .self$.input.stream$getMz(), rt = if (is.null(col)) NULL else .self$.input.stream$getRt(), peaks = results, unused = .self$.input.stream$getAll(but = if (is.null(col)) c(MSDB.TAG.MZ) else c(MSDB.TAG.MZ, MSDB.TAG.RT)))
-
-				# Append to peak list
-				peaks <- rbind(peaks, results)
-
-#				# Add results to output
-#				r <- nrow(y) + 1
-#				x.lines <- x[i, x.cols, drop = FALSE]
-#				x.lines <- rename.col(x.lines, unlist(x.colnames), unlist(.self$.output.fields[names(x.colnames)]))
-#				if (nrow(results) == 0) {
-#					y[r, colnames(x.lines)] <- x.lines
-#				}
-#				else {
-#					if (same.rows) {
-#						y[r, colnames(x.lines)] <- x.lines
-#						ids <- results[[MSDB.TAG.MOLID]]
-#						ids <- ids[ ! duplicated(ids)] # Remove duplicated values
-#						y[r, MSDB.TAG.msmatching] <- paste(ids, collapse = .self$.molids.sep)
-#					}
-#					if ( ! same.rows || peak.table) {
-#						new.rows <- cbind(x.lines, results, row.names = NULL)
-#						if ( ! same.rows) {
-#							rows <- r:(r+nrow(results)-1)
-#							y[rows, colnames(new.rows)] <- new.rows
-#						}
-#						if (peak.table) {
-#							zr <- nrow(z) + 1
-#							zrows <- zr:(zr+nrow(results)-1)
-#							z[zrows, colnames(new.rows)] <- new.rows
-#						}
-#					}
-#				}
-		}
-
-#		results <- if (peak.table) list(main = y, peaks = z) else y
-
-#		return(results)
-		return(peaks)
-	})
-
-	# rt        Retention time in seconds.
-	# molids    An option vector of molecule IDs, used to restrict the search.
-	MsDb$methods( searchForMzRtTols = function(mode, mz, rt = NULL, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, attribs = NULL, molids = NULL, molids.rt.tol = NULL, colnames = MSDB.DFT.INPUT.FIELDS) {
-
-		# Set M/Z bounds
-		if (.self$.mz.tol.unit == MSDB.MZTOLUNIT.PPM) {
-			mz.low <- mz * (1 + (- shift - prec) * 1e-6)
-			mz.high <- mz * (1 + (- shift + prec) * 1e-6)
-		}
-		else { # PLAIN
-			mz.low <- mz - shift - prec
-			mz.high <- mz - shift + prec
-		}
-
-		# Set retention time bounds
-		rt.low <- NULL
-		rt.high <- NULL
-		if ( ! is.null(rt.tol)) {
-			low <- rt - rt.tol
-			high <- rt + rt.tol
-			rt.low <- if (is.null(rt.low)) low else max(low, rt.low)
-			rt.high <- if (is.null(rt.high)) high else min(high, rt.high)
-		}
-		if ( ! is.null(rt.tol.x)) {
-			low <- rt - rt.tol.x - rt ^ rt.tol.y
-			high <- rt + rt.tol.x + rt ^ rt.tol.y
-			rt.low <- if (is.null(rt.low)) low else max(low, rt.low)
-			rt.high <- if (is.null(rt.high)) high else min(high, rt.high)
-		}
-
-		# List molecule IDs
-		if ( ! is.null(molids.rt.tol) && is.data.frame(molids)) {
-			ids <- molids[(rt >= molids[[MSDB.TAG.COLRT]] - molids.rt.tol) & (rt <= molids[[MSDB.TAG.COLRT]] + molids.rt.tol), MSDB.TAG.MOLID]
-			if (length(ids) == 0)
-				# No molecule ID match for this retention time
-				return(data.frame()) # return empty result set
-		} else {
-			ids <- molids
-		}
-
-		return(.self$searchForMzRtBounds(mode,
-										 mz.low = mz * (1 + (- shift - prec) * 1e-6),
-										 mz.high = mz * (1 + (- shift + prec) * 1e-6),
-										 rt.low = rt.low,
-										 rt.high = rt.high,
-										 col = col,
-										 attribs = attribs,
-										 molids = ids))
-	})
-
-	# rt.low    Lower bound of the retention time in seconds.
-	# rt.high   Higher bound of the retention time in seconds.
-	MsDb$methods( searchForMzRtBounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {
-
-		results <- .self$.do.search.for.mz.rt.bounds(mode = mode, mz.low = mz.low, mz.high = mz.high, rt.low = rt.low, rt.high = rt.high, col = col, attribs = attribs, molids = molids)
-
-		return(results)
-	})
-
-	# TODO Write description of output: data frame with which columns ?
-	MsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {
-		stop("Method .do.search.for.mz.rt.bounds() not implemented in concrete class.")
-	})
-
-	# DEPRECATED
-	MsDb$methods( searchForMz = function(x, mode, tol = 5, col = NULL, rt.tol.x = 5, rt.tol.y = 0.80) {
-		warning("Method searchForMz() is deprecated. Use searchForMzRtList() instead.")
-		.self$searchForMzRtList(x = x, mode = mode, prec = tol, col = col, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y)
-	})
-
-} # end of load safe guard