view MsFileDb.R @ 4:b34c14151f25 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 494194bb501d1d7033613131865f7bd68976041c
author prog
date Tue, 14 Mar 2017 12:40:22 -0400
parents 20d69a062da3
children fb9c0409d85c
line wrap: on
line source

if ( ! exists('MsFileDb')) { # Do not load again if already loaded

	library('methods')
	source('MsDb.R')
	source('msdb-common.R')
	source('search.R', chdir = TRUE)

	#####################
	# CLASS DECLARATION #
	#####################
	
	MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY"))
	
	###############
	# CONSTRUCTOR #
	###############
	
	MsFileDb$methods( initialize = function(file = NA_character_, ...) {

		# Initialize members
		.file <<- if ( ! is.null(file)) file else NA_character_
		.db <<- NULL
		.fields <<- msdb.get.dft.db.fields()
		.modes <<- MSDB.DFT.MODES
		.name.to.id <<- NULL
	
		callSuper(...)
	})
	
	#################
	# SET DB FIELDS #
	#################
	
	MsFileDb$methods( areDbFieldsSettable = function() {
		return(TRUE)
	})
	
	MsFileDb$methods( setDbFields = function(fields) {
		.fields <<- as.list(fields)
	})
	
	################
	# CHECK FIELDS #
	################
	
	MsFileDb$methods( .check.fields = function(fields) {

		if (is.null(fields))
			stop("No fields specified for .check.fields()")

		# Check that fields are defined in the fields list
		unknown <- fields[ ! fields %in% names(.self$.fields)]
		if (length(unknown) > 0)
			stop(paste0("Database field", if (length(unknown) == 1) "" else "s", " \"", paste(unkown, collapse = ", "), "\" ", if (length(unknown) == 1) "is" else "are", " not defined."))

		# Check that field values are real columns inside the database
		.self$.init.db()
		db.col.names <- fields #vapply(fields, function(s) .self$.fields[[s]], FUN.VALUE = '')
		unknown.cols <- db.col.names[ ! db.col.names %in% colnames(.self$.db)]
		if (length(unknown.cols) > 0)
			stop(paste0("Column", if (length(unknown.cols) == 1) "" else "s", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (length(unknown.cols) == 1) "is" else "are", " not defined inside the database \"", .self$.file, "\"."))
	})

	################
	# SET MS MODES #
	################
	
	MsFileDb$methods( areDbMsModesSettable = function() {
		return(TRUE)
	})
	
	MsFileDb$methods( setDbMsModes = function(modes) {
		.modes <<- as.list(modes)
	})
	
	###########
	# INIT DB #
	###########

	MsFileDb$methods( .init.db = function() {

		if (is.null(.self$.db)) {

			# Load database
			.db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL)

			# Check that colnames are unique
			dupcol <- duplicated(colnames(.self$.db))
			if (any(dupcol))
				stop(paste("Database header contains duplicated names: ", paste(unique(colnames(.self$.db)[dupcol]), collapse = ', '), "."))

			# Check that columns names supplied through field map are unique
			dupfields <- duplicated(.self$.fields)
			if (any(dupfields))
				stop(paste("Some db column names supplied are duplicated: ", paste(unique(.self$.fields[dupfields]), collapse = ', '), "."))

			# Rename columns
			colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
		}
	})

	############
	# GET DATA #
	############

	MsFileDb$methods( .get = function(db = NULL, col = NULL) {
	
		# Init db
		if  (is.null(db)) {
			.self$.init.db()
			db <- .self$.db
		}

		# Check fields
		.self$.check.fields(col)

		# Get database columns
#		db.cols <- unlist(.self$.fields[col])

		return(db[, col])
	})

	###########
	# GET ROW #
	###########

	MsFileDb$methods( .get.row = function(row, cols = NULL) {
	
		# Init db
		.self$.init.db()

		# Check fields
		if ( ! is.null(cols))
			.self$.check.fields(cols)

		if ( ! is.null(cols)) {
			#cols <- vapply(cols, function(c) .self$.fields[[c]], FUN.VALUE = '')
			return(.self$.db[row, cols])
		}

		return(.self$.db[row, ])
	})

	###########
	# GET COL #
	###########

	MsFileDb$methods( .get.col = function(col) {
	
		# Init db
		.self$.init.db()

		# Check fields
		.self$.check.fields(col)

		#return(.self$.db[[.self$.fields[[col]]]])
		return(.self$.db[[col]])
	})

	####################
	# GET MOLECULE IDS #
	####################
	
	MsFileDb$methods( getMoleculeIds = function(max.results = NA_integer_) {
	
		# Init db
		.self$.init.db()

		# Get IDs
		mol.ids <- as.character(.self$.get.col(MSDB.TAG.MOLID))
		mol.ids <- mol.ids[ ! duplicated(mol.ids)]
		mol.ids <- sort(mol.ids)

		# Cut results
		if ( ! is.na(max.results) && length(mol.ids) > max.results)
			mol.ids <- mol.ids[1:max.results]

		return(mol.ids)
	})

	####################
	# GET NB MOLECULES #
	####################
	
	# Returns the number of molecules in the database.
	MsFileDb$methods( getNbMolecules = function() {
	
		# Init db
		.self$.init.db()

		# Get IDs
		mol.ids <- .self$.get.col(MSDB.TAG.MOLID)
		mol.ids <- mol.ids[ ! duplicated(mol.ids)]

		return(length(mol.ids))
	})
	
	#####################
	# GET MOLECULE NAME #
	#####################
	
	MsFileDb$methods( .get.name.from.id = function(db, id) {

		if(is.na(id))
			return(NA_character_)

		# Get names
		names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES]
		if (length(names) == 0)
			return(NA_character_)

		# Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one.
		name <- strsplit(names, ';')[[1]][[1]]

		return(name)
	})

	# Get molecule names
	# molid     An integer vector of molecule IDs.
	# Returns a character vector containing the names of the molecule IDs, in the same order as the input vector.
	MsFileDb$methods( getMoleculeName = function(molid) {

		if (is.null(molid))
			return(NA_character_)

		# Init db
		.self$.init.db()

		# Get database
		db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)]

		# Remove duplicates
		db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ]

		# Look for ids
		names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '')

		return(names)
	})

	###################
	# INIT NAME TO ID #
	###################

	MsFileDb$methods( .init.name.to.id = function() {

		if (is.null(.self$.name.to.id)) {

			# Create data frame
			.name.to.id <<- data.frame(name = character(), id = character(), stringsAsFactors = FALSE)

			# Init db
			.self$.init.db()

			# Get database subset (columns name and id only).
			db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)]

			# Remove duplicate IDs
			db <- db[! duplicated(db[[MSDB.TAG.MOLID]]), ]

			# Loop on all 
			for(i in seq(db[[MSDB.TAG.MOLID]])) {
				i.id <- db[i, MSDB.TAG.MOLID]
				i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE)
				.name.to.id <<- rbind(.self$.name.to.id, data.frame(name = toupper(i.names), id = rep(i.id, length(i.names)), stringsAsFactors = FALSE))
			}

			# Order by name
			.name.to.id <<- .self$.name.to.id[order(.self$.name.to.id[['name']]), ]
		}
	})

	####################
	# GET ID FROM NAME #
	####################

	MsFileDb$methods( .get.id.from.name = function(name) {

		# Initialize name.to.id search tree
		.self$.init.name.to.id()

		# Search for name
		i <- binary.search(toupper(name), .self$.name.to.id[['name']])

		# Get ID
		id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id'])

		return(id)
	})

	################
	# FIND BY NAME #
	################

	# Find a molecule by name
	# name  A vector of molecule names to search for.
	# Return a vector of the same size as the name input vector, containing the found molecule IDs, in the same order.
	MsFileDb$methods( findByName = function(name) {

		if (is.null(name))
			return(NA_character_)

		# Look for molecules with this name
		ids <- list()
		for (n in name)
			ids <- c(ids, list(.self$.get.id.from.name(n)))

		return(ids)
	})
	
	###############################
	# GET CHROMATOGRAPHIC COLUMNS #
	###############################
	
	MsFileDb$methods( getChromCol = function(molid = NULL) {

		# Init db
		.self$.init.db()

		# Get database
		db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)]

		# Filter on molecule IDs
		if ( ! is.null(molid))
			db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]

		# Get column names
		cols <- db[[MSDB.TAG.COL]]

		# Remove duplicates
		cols <- cols[ ! duplicated(cols)]

		# Make data frame
		cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE)

		return(cols)
	})
	
	################
	# GET NB PEAKS #
	################
	
	# Get the total number of MS peaks stored inside the database.
	# molid     The ID of the molecule.
	# type      The MS type.
	MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {

		# Init db
		.self$.init.db()

		# Get database
		db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)]

		# Filter on mode
		if ( ! is.null(type) && ! is.na(type))
			db <- db[db[[MSDB.TAG.MODE]] == (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ]

		# Filter on molecule IDs
		if ( ! is.null(molid) && ! is.na(molid))
			db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]

		# Get mz values
		mz <- db[[MSDB.TAG.MZTHEO]]

		# Count number of unique values
		n <- sum(as.integer(! duplicated(mz)))

		return(n)
	})

	##########
	# SEARCH #
	##########

	MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {

		# Init db
		.self$.init.db()
		db <- .self$.db

		# Filter on mode
		if ( ! is.null(mode) && ! is.na(mode))
			db <- db[db[[MSDB.TAG.MODE]] == (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ]

		# Filter on molecule IDs
		if ( ! is.null(molids))
			db <- db[db[[MSDB.TAG.MOLID]] %in% molids,]

		# Filter on attributions
		if ( ! is.null(attribs) && ! is.na(attribs))
			db <- db[db[[MSDB.TAG.ATTR]] %in% attribs,]

		# Filter on columns
		if ( ! is.null(col) && ! is.na(col))
			db <- db[db[[MSDB.TAG.COL]] %in% col,]

		# Filter on retention time
		if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high))
			db <- db[db[[MSDB.TAG.COLRT]] >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ]

		# Remove retention times and column information
		if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) {
			db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))]

			# Remove duplicates
			db <- db[ ! duplicated(db), ]
		}

		# Filter on mz
		db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ]

		# Rename database fields
#		conv <- c( mz = 'mztheo', rt = 'colrt') # solving mismatch of field names between database and output
#		cols <- colnames(db)
#		for (db.field in names(.self$.fields)) {
#			output.field <- if (db.field %in% names(conv)) conv[[db.field]] else db.field
#			if (.self$.fields[[db.field]] %in% cols && output.field %in% names(.self$.output.fields))
#				cols[cols %in% .self$.fields[[db.field]]] <- .self$.output.fields[[output.field]]
#		}
#		colnames(db) <- cols

		# Remove unwanted columns
#		db <- db[, colnames(db) %in% .self$.output.fields]

		return(db)
	})
	
	#################
	# GET MZ VALUES #
	#################
	
	# Returns a numeric vector of all masses stored inside the database.
	MsFileDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {

		# Init db
		.self$.init.db()
		db <- .self$.db

		# Filter on mode
		if ( ! is.null(mode) && ! is.na(mode)) {
			mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg
			selected.lines <- (.self$.get(db, col = MSDB.TAG.MODE) == mode.tag)
			db <- db[selected.lines, ]
		}

		# Get masses
		mz <- .self$.get(db, col = MSDB.TAG.MZTHEO)

		# Remove duplicates
		mz <- mz[ ! duplicated(mz)]

		# Apply cut-off
		if ( ! is.na(max.results))
			mz <- mz[1:max.results]

		return(mz)
	})
	
	#######################
	# GET RETENTION TIMES #
	#######################
	
	# Get the retention times of a molecule.
	# Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list.
	MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) {

		if (is.null(molid) || is.na(molid))
			return(list())

		# Init db
		.self$.init.db()
		db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)]

		# Filter on molecule ID
		if ( ! is.null(molid) && ! is.na(molid))
			db <- db[db[[MSDB.TAG.MOLID]] %in% molid,]

		# Remove duplicates
		db <- db[! duplicated(db), ]

		# Build retention time list
		rt <- list()
		cols <- db[[MSDB.TAG.COL]]
		cols <- cols[ ! duplicated(cols)]
		for (col in cols) {
			colrts <- db[db[[MSDB.TAG.COL]] %in% col, MSDB.TAG.COLRT]
			rt[col] <- list(colrts)
		}

		return(rt)
	})

} # end of load safe guard