diff MassFiledbConn.R @ 2:20d69a062da3 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author prog
date Thu, 02 Mar 2017 08:55:00 -0500
parents 253d531a0193
children fb9c0409d85c
line wrap: on
line diff
--- a/MassFiledbConn.R	Sat Sep 03 17:02:01 2016 -0400
+++ b/MassFiledbConn.R	Thu Mar 02 08:55:00 2017 -0500
@@ -1,258 +1,275 @@
-if ( ! exists('MassFiledbConn')) {
+# LCMS File db.
+# In this type of database, a single file is provided in CSV format. The default separator is the tab character.
+# Each line is an MS peak measurement.
+# The file contains molecule and spectrum information. Each spectrum has an accession id.
+
+# TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue
 
-	source('MassdbConn.R')
-	
-	# LCMS File db.
-	# In this type of database, a single file is provided in CSV format. Default separator is tabulation.
-	# Each line is a MS peak measure, .
-	# The file contains molecule and spectrum information. Each spectrum has an accession id.
+#############
+# CONSTANTS #
+#############
 
-	# TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue
-
-	#############
-	# CONSTANTS #
-	#############
+# Default database fields: each field tag is mapped onto a column that bears the same name.
+.BIODB.DFT.DB.FIELDS <- c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZEXP, BIODB.PEAK.MZTHEO, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS)
+# Built without a loop, so that no loop variable is left behind in the environment in which this file is sourced.
+.BIODB.DFT.DB.FIELDS <- as.list(structure(.BIODB.DFT.DB.FIELDS, names = .BIODB.DFT.DB.FIELDS))
 
-	# Default database fields
-	.BIODB.DFT.DB.FIELDS <- list()
-	for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZ, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS))
-		.BIODB.DFT.DB.FIELDS[[f]] <- f
+#####################
+# CLASS DECLARATION #
+#####################
+# Connector to a mass spectra database stored in a single file; it extends MassdbConn and keeps the loaded table in the .db field.
+MassFiledbConn <- methods::setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .db.orig.colnames = "character", .fields = "list", .ms.modes = "character"))
 
-	#####################
-	# CLASS DECLARATION #
-	#####################
-	
-	MassFiledbConn <- setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .fields = "list", .ms.modes = "character"))
+###############
+# CONSTRUCTOR #
+###############
 
-	###############
-	# CONSTRUCTOR #
-	###############
+MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) {
+	# Build a connector on the database file `file`, to be read with field separator `file.sep` and quoting characters `file.quote`.
+	# A database file is mandatory, and it must exist
+	if ( ! isTRUE( ! is.null(file) && ! is.na(file))) stop("You must specify a file database to load.")
+	if ( ! file.exists(file)) stop(paste0("Cannot locate the file database \"", file ,"\"."))
 
-	MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) {
-
-		# Check file
-		(! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.")
-		file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\"."))
+	# Initialise the fields; the table itself is left unset here and gets loaded by .init.db()
+	.self$.file <- file
+	.self$.file.sep <- file.sep
+	.self$.file.quote <- file.quote
+	.self$.field.multval.sep <- ';'
+	.self$.fields <- .BIODB.DFT.DB.FIELDS
+	.self$.db <- NULL
+	.self$.db.orig.colnames <- NA_character_
+	ms.modes <- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS)
+	.self$.ms.modes <- structure(ms.modes, names = ms.modes)
 
-		# Set fields
-		.db <<- NULL
-		.file <<- file
-		.file.sep <<- file.sep
-		.file.quote <<- file.quote
-		.fields <<- .BIODB.DFT.DB.FIELDS
-		.field.multval.sep <<- ';'
-		.ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS)
-		names(.self$.ms.modes) <- .self$.ms.modes
+	callSuper(...) # the remaining arguments are handed over to the parent class
+})
 
-		callSuper(...)
-	})
+######################
+# Is valid field tag #
+######################
 
-	######################
-	# Is valid field tag #
-	######################
+MassFiledbConn$methods( isValidFieldTag = function(tag) {
+	is.element(tag, names(.self$.fields)) # TRUE for each tag that is a name of the fields list
+})
 
-	MassFiledbConn$methods( isValidFieldTag = function(tag) {
-		return (tag %in% names(.self$.fields))
-	})
+###########
+# INIT DB #
+###########
 
-	#############
-	# Set field #
-	#############
+MassFiledbConn$methods( .init.db = function() {
 
-	MassFiledbConn$methods( setField = function(tag, colname) {
+	# Load the database file on the first call only; later calls find .db already set and do nothing
+	if (is.null(.self$.db)) {
+		# The quoting characters are passed by name, instead of relying on the position of `quote` among the arguments of read.table()
+		.db <<- read.table(.self$.file, sep = .self$.file.sep, quote = .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL)
 
-		( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.")
-		( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.")
-
-		# Load database file
-		.self$.init.db()
+		# Save column names
+		.db.orig.colnames <<- colnames(.self$.db)
+	}
+})
 
-		# Check that this field tag is defined in the fields list
-		.self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid."))
+#############
+# Set field #
+#############
 
-		# Check that columns are defined in database file
-		all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file."))
+MassFiledbConn$methods( setField = function(tag, colname) {
+	# Map the field `tag` onto the column(s) `colname` of the file; several columns are merged into one derived column.
+	( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.")
+	(length(colname) > 0 && ! any(is.na(colname))) || stop("No column name specified.")
 
-		# Set new definition
-		if (length(colname) == 1)
-			.fields[[tag]] <<- colname
-		else {
-			new.col <- paste(colname, collapse = ".")
-			.self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '')
-			.fields[[tag]] <<- new.col
-		}
-	})
+	# Load database file
+	.self$.init.db()
+
+	# Check that this field tag is defined in the fields list
+	.self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid."))
+
+	# Check that columns are defined in database file
+	all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file."))
 
-	######################################
-	# SET FIELD MULTIPLE VALUE SEPARATOR #
-	######################################
-
-	MassFiledbConn$methods( setFieldMultValSep = function(sep) {
-		.field.multval.sep <<- sep
-	})
+	# Set new definition. paste() leaves a single column name untouched, and joins several names into the name of a derived column.
+	new.col <- paste(colname, collapse = ".")
+	if (length(colname) > 1 && ! new.col %in% .self$.db.orig.colnames) {
+		.self$.db[[new.col]] <- vapply(seq_len(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '')
+		# Register the derived column, so that the renaming below yields exactly one name per column of the data frame
+		.db.orig.colnames <<- c(.self$.db.orig.colnames, new.col)
+	}
+	.fields[[tag]] <<- new.col
 
-	################
-	# SET MS MODES #
-	################
+	# Update data frame column names
+	colnames(.self$.db) <- vapply(.self$.db.orig.colnames, function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
+})
 
-	MassFiledbConn$methods( setMsMode = function(mode, value) {
-		.self$.ms.modes[[mode]] <- value
-	})
+######################################
+# SET FIELD MULTIPLE VALUE SEPARATOR #
+######################################
 
-	##########################
-	# GET ENTRY CONTENT TYPE #
-	##########################
+MassFiledbConn$methods( setFieldMultValSep = function(sep) {
+	.self$.field.multval.sep <- sep # replaces the separator stored for multiple-value fields
+})
 
-	MassFiledbConn$methods( getEntryContentType = function(type) {
-		return(BIODB.DATAFRAME)
-	})
+################
+# SET MS MODES #
+################
 
-	###########
-	# INIT DB #
-	###########
+MassFiledbConn$methods( setMsMode = function(mode, value) {
+	.ms.modes[[mode]] <<- value # value looked up in the MS mode column when rows are filtered on `mode`
+})
 
-	MassFiledbConn$methods( .init.db = function() {
-
-		if (is.null(.self$.db)) {
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
 
-			# Load database
-			.db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL)
+MassFiledbConn$methods( getEntryContentType = function(type) {
+	BIODB.DATAFRAME # the same content type is returned whatever the entry type
+})
 
-			# Rename columns
-			colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
-		}
-	})
+################
+# CHECK FIELDS #
+################
 
-	################
-	# CHECK FIELDS #
-	################
+MassFiledbConn$methods( .check.fields = function(fields) {
+	# Stop with an error unless every tag of `fields` is a known field and a column of the file database; nothing to check for an empty or NA value.
+	if (length(fields) == 0 || (length(fields) == 1 && is.na(fields)))
+		return(invisible(NULL))
 
-	MassFiledbConn$methods( .check.fields = function(fields) {
+	# Check if fields are known
+	unknown.fields <- fields[ ! fields %in% names(.self$.fields)]
+	if (length(unknown.fields) > 0)
+		stop(paste0("Field(s) ", paste(unknown.fields, collapse = ", "), " is/are unknown."))
 
-		# Check if fields are known
-		unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)]
-		if (length(unknown.fields) > 0)
-			stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown."))
+	# Init db
+	.self$.init.db()
 
-		# Init db
-		.self$.init.db()
+	# Check if fields are defined in file database
+	undefined.fields <- fields[ ! fields %in% colnames(.self$.db)]
+	if (length(undefined.fields) > 0)
+		stop(paste0("Column(s) ", paste(undefined.fields, collapse = ", "), " is/are undefined in file database."))
+})
 
-		# Check if fields are defined in file database
-		undefined.fields <- colnames(.self$.init.db)[ ! unlist(.self$.fields[fields]) %in% colnames(.self$.init.db)]
-		if (length(undefined.fields) > 0)
-			stop(paste0("Column(s) ", paste(unlist(.self$.fields[fields]), collapse = ", "), " is/are undefined in file database."))
-	})
+##########
+# SELECT #
+##########
+
+# Select data from database
+MassFiledbConn$methods( .select = function(cols = NULL, mode = NULL, compound.ids = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) {
+	# Returns the rows of the database for MS mode `mode` and compounds `compound.ids`, restricted to the field tags `cols` (all columns when NULL or NA).
+	select.all <- is.null(cols) || all(is.na(cols)) # cols may hold several tags: is.na() alone would give a condition of length > 1
 
-	################
-	# EXTRACT COLS #
-	################
-	
-	MassFiledbConn$methods( .extract.cols = function(cols, mode = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) {
-	
-		x <- NULL
+	# Init db
+	.self$.init.db()
 
-		if ( ! is.null(cols) && ! is.na(cols)) {
+	# Get db
+	db <- .self$.db
 
-			# Init db
-			.self$.init.db()
-
-			# TODO check existence of cols/fields
+	# Filter db on mode
+	if ( ! is.null(mode) && ! is.na(mode)) {
 
-			# Get db, eventually filtering it.
-			if (is.null(mode))
-				db <- .self$.db
-			else {
-				# Check mode value
-				mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'."))
-				.self$.check.fields(BIODB.MSMODE)
+		# Check mode value
+		mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'."))
+		.self$.check.fields(BIODB.MSMODE)
 
-				# Filter on mode
-				db <- .self$.db[.self$.db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ]
-			}
+		# Filter on mode; the column is read under its field tag, the name under which .check.fields() has just found it
+		db <- db[db[[BIODB.MSMODE]] %in% .self$.ms.modes[[mode]], , drop = FALSE]
+	}
 
-			# Get subset
-			x <- db[, unlist(.self$.fields[cols]), drop = drop]
+	# Filter db on compound ids
+	if ( ! is.null(compound.ids)) {
+		.self$.check.fields(BIODB.COMPOUND.ID)
+		db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, , drop = FALSE]
+	}
 
-			# Rename columns
-			if (is.data.frame(x))
-				colnames(x) <- cols
+	# Get subset, after checking that the requested fields exist
+	if ( ! select.all)
+		.self$.check.fields(cols)
+	# Columns are selected by field tag, the name under which .check.fields() looks them up in the data frame
+	x <- if (select.all) db else db[, cols, drop = drop]
 
-			# Rearrange
-			if (drop && is.vector(x)) {
-				if (uniq)
-					x <- x[ ! duplicated(x)]
-				if (sort)
-					x <- sort(x)
-			}
+	# Rearrange
+	if (drop && is.vector(x)) {
+		if (uniq)
+			x <- x[ ! duplicated(x)]
+		if (sort)
+			x <- sort(x)
+	}
 
-			# Cut
-			if ( ! is.na(max.rows))
-				x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ]
-		}
+	# Cut; head() does not pad with NA when fewer than max.rows elements are available, and keeps a data frame a data frame
+	if ( ! is.na(max.rows))
+		x <- utils::head(x, max.rows)
+
+	return(x)
+})
+
+#################
+# GET ENTRY IDS #
+#################
 
-		return(x)
-	})
+MassFiledbConn$methods( getEntryIds = function(type) {
+	# Returns the sorted, duplicate-free IDs of the entries of type `type`, or NA when the type is neither a spectrum nor a compound.
+	if ( ! type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND))
+		return(NA_character_)
+	# Spectra are identified by their accession number, compounds by their compound ID
+	id.field <- if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID
 
-	#################
-	# GET ENTRY IDS #
-	#################
-	
-	MassFiledbConn$methods( getEntryIds = function(type) {
+	return(as.character(.self$.select(cols = id.field, drop = TRUE, uniq = TRUE, sort = TRUE)))
+})
 
-		ids <- NA_character_
-
-		if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND))
-			ids <- as.character(.self$.extract.cols(if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE))
+##################
+# GET NB ENTRIES #
+##################
 
-		return(ids)
-	})
+MassFiledbConn$methods( getNbEntries = function(type) {
+	length(.self$getEntryIds(type)) # number of IDs that getEntryIds() returns for this entry type
+})
+
+###############################
+# GET CHROMATOGRAPHIC COLUMNS #
+###############################
 
-	##################
-	# GET NB ENTRIES #
-	##################
-	
-	MassFiledbConn$methods( getNbEntries = function(type) {
-		return(length(.self$getEntryIds(type)))
-	})
+# Inherited from MassdbConn.
+MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) {
+	# Returns a data frame of the distinct chromatographic columns, each one serving both as ID and as title.
+	# Fetch the compound ID and chromatographic column fields
+	selection <- .self$.select(cols = c(BIODB.COMPOUND.ID, BIODB.CHROM.COL))
 
-	###############################
-	# GET CHROMATOGRAPHIC COLUMNS #
-	###############################
-	
-	# Inherited from MassdbConn.
-	MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) {
+	# Keep only the rows of the requested compounds, if any were given
+	if ( ! is.null(compound.ids)) {
+		selection <- selection[selection[[BIODB.COMPOUND.ID]] %in% compound.ids, ]
+	}
+	# Read the chromatographic column of each remaining row
+	chrom.col.names <- selection[[BIODB.CHROM.COL]]
 
-		# Extract needed columns
-		db <- .self$.extract.cols(c(BIODB.COMPOUND.ID, BIODB.CHROM.COL))
+	# Keep the first occurrence of each value
+	chrom.col.names <- unique(chrom.col.names)
 
-		# Filter on molecule IDs
-		if ( ! is.null(compound.ids))
-			db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ]
+	# Two identical columns, named after the ID and title tags
+	chrom.cols <- data.frame(chrom.col.names, chrom.col.names, stringsAsFactors = FALSE)
+	names(chrom.cols) <- c(BIODB.ID, BIODB.TITLE)
 
-		# Get column names
-		cols <- db[[BIODB.CHROM.COL]]
+	chrom.cols
+})
 
-		# Remove duplicates
-		cols <- cols[ ! duplicated(cols)]
+#################
+# GET MZ VALUES #
+#################
 
-		# Make data frame
-		chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE)
-		colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE)
+# Inherited from MassdbConn.
+MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
 
-		return(chrom.cols)
-	})
-	
-	#################
-	# GET MZ VALUES #
-	#################
-	
-	# Inherited from MassdbConn.
-	MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
+	# Get mz values. BIODB.PEAK.MZ is not one of the default database fields, which hold BIODB.PEAK.MZEXP and BIODB.PEAK.MZTHEO instead.
+	mz <- .self$.select(cols = BIODB.PEAK.MZTHEO, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results)
+
+	return(mz)
+})
 
-		# Get mz values
-		mz <- .self$.extract.cols(BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results)
+################
+# GET NB PEAKS #
+################
 
-		return(mz)
-	})
+# Inherited from MassdbConn.
+MassFiledbConn$methods( getNbPeaks = function(mode = NULL, compound.ids = NULL) {
 
-}
+	# Get peaks as a plain vector (drop = TRUE), so that length() counts the peaks and not the columns of a data frame
+	peaks <- .self$.select(cols = BIODB.PEAK.MZTHEO, mode = mode, compound.ids = compound.ids, drop = TRUE)
+
+	return(length(peaks))
+})