diff BiodbFactory.R @ 2:20d69a062da3 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author prog
date Thu, 02 Mar 2017 08:55:00 -0500
parents 253d531a0193
children
line wrap: on
line diff
--- a/BiodbFactory.R	Sat Sep 03 17:02:01 2016 -0400
+++ b/BiodbFactory.R	Thu Mar 02 08:55:00 2017 -0500
@@ -1,233 +1,274 @@
-if ( ! exists('BiodbFactory')) { # Do not load again if already loaded
-	
-	library(methods)
-	source('biodb-common.R')
-	source('ChebiConn.R')
-	source('KeggConn.R')
-	source('PubchemConn.R')
-	source('HmdbConn.R')
-	source('ChemspiderConn.R')
-	source('EnzymeConn.R')
-	source('LipidmapsConn.R')
-	source('MirbaseConn.R')
-	source('NcbigeneConn.R')
-	source('NcbiccdsConn.R')
-	source('UniprotConn.R')
-	source('MassbankConn.R')
-	source('MassFiledbConn.R')
+# vi: fdm=marker
+
+##########################
+# CLASS DECLARATION {{{1 #
+##########################
+
+BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character",
+														  .conn = "list",
+														  .cache.dir = "character",
+														  .cache.mode = "character",
+														  .debug = "logical",
+														  .chunk.size = "integer",
+														  .use.env.var = "logical"))
+
+###############
+# CONSTRUCTOR #
+###############
+
+BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) {
 
-	#####################
-	# CLASS DECLARATION #
-	#####################
-	
-	BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character", .debug = "logical"))
-	
-	###############
-	# CONSTRUCTOR #
-	###############
-	
-	BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, debug = FALSE, ...) {
-	
-		.useragent <<- useragent
-		.conn <<- list()
-		.cache.dir <<- cache.dir
-		.debug <<- debug
+	.useragent <<- useragent
+	.conn <<- list()
+	.cache.dir <<- cache.dir
+	.cache.mode <<- cache.mode
+	.debug <<- debug
+	.chunk.size <<- as.integer(chunk.size)
+	.use.env.var <<- use.env.var
 
-		callSuper(...) # calls super-class initializer with remaining parameters
-	})
+	callSuper(...) # calls super-class initializer with remaining parameters
+})
+
+#######################
+# PRINT DEBUG MESSAGE #
+#######################
 
-	#######################
-	# PRINT DEBUG MESSAGE #
-	#######################
-
-	BiodbFactory$methods( .print.debug.msg = function(msg) {
-		if (.self$.debug)
-			.print.msg(msg = msg, class = class(.self))
-	})
+BiodbFactory$methods( .print.debug.msg = function(msg) {
+	if (.self$.debug)
+		.print.msg(msg = msg, class = class(.self))
+})
 
-	##################
-	# GET USER AGENT #
-	##################
+##################
+# GET USER AGENT #
+##################
 
-	BiodbFactory$methods( getUserAgent = function() {
-		return(.self$.useragent)
-	})
+BiodbFactory$methods( getUserAgent = function() {
+	return(.self$.useragent)
+})
 
-	##################
-	# SET USER AGENT #
-	##################
+##################
+# SET USER AGENT #
+##################
 
 	BiodbFactory$methods( setUserAgent = function(useragent) {
-		.useragent <<- useragent
-	})
+	"Set useragent of BiodbFactory."
+	.useragent <<- useragent
+})
+
+###############
+# CREATE CONN #
+###############
+
+BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) {
+    " Create connection to databases useful for metabolomics."
+	if (class %in% names(.self$.conn))
+		stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.'))
+
+	# Use environment variables
+	if (.self$.use.env.var) {
+		if (is.na(url))
+			url <- .biodb.get.env.var(c(class, 'URL'))
+		if (is.na(token))
+			token <- .biodb.get.env.var(c(class, 'TOKEN'))
+	}
+
+	# Create connection instance
+	conn <- switch(class,
+		            chebi       = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            kegg        = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug),
+		            pubchemsub  = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug),
+		            hmdb        = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            chemspider  = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token),
+		            enzyme      = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            lipidmaps   = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            mirbase     = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            ncbigene    = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            ncbiccds    = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            uniprot     = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		            massbank    = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug),
+					massfiledb  = MassFiledbConn$new(file = url, debug = .self$.debug),
+					peakforest  = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug),
+		      	    NULL)
+
+	# Unknown class
+	if (is.null(conn))
+		stop(paste0("Unknown r-biodb class \"", class,"\"."))
+
+	# Register new class
+	.self$.conn[[class]] <- conn
+
+	return (.self$.conn[[class]])
+})
 
-	############
-	# GET CONN #
-	############
+############
+# GET CONN #
+############
+
+BiodbFactory$methods( getConn = function(class) {
+	"Get connection to a database."
+
+	if ( ! class %in% names(.self$.conn))
+		.self$createConn(class)
+
+	return (.self$.conn[[class]])
+})
+
+################
+# CREATE ENTRY #
+################
+
+BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) {
+	"Create Entry from a database by id."
+
+	is.null(id) && is.null(content) && stop("One of id or content must be set.")
+	! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.")
+
+	# Debug
+	.self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(if (length(id) > 10) id[1:10] else id, collapse = ", ")), "..."))
 
-	BiodbFactory$methods( getConn = function(class, url = NA_character_) {
+	# Get content
+	if ( ! is.null(id))
+		content <- .self$getEntryContent(class, id)
+	conn <- .self$getConn(class)
+	entry <- conn$createEntry(content = content, drop = drop)
+
+	# Set factory
+	.self$.print.debug.msg(paste0("Setting factory reference into entries..."))
+	for (e in c(entry))
+		if ( ! is.null(e))
+			e$setFactory(.self)
+
+	return(entry)
+})
 
-		if ( ! class %in% names(.self$.conn)) {
+########################
+# GET CACHE FILE PATHS #
+########################
+
+BiodbFactory$methods( .get.cache.file.paths = function(class, id) {
+
+	# Get extension
+	ext <- .self$getConn(class)$getEntryContentType()
+
+	# Set filenames
+	filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '')
+
+	# set file paths
+	file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '')
 
-			# Create connection instance
-			conn <- switch(class,
-		                	chebi       = ChebiConn$new(useragent = .self$.useragent),
-		                	kegg        = KeggConn$new(useragent = .self$.useragent),
-		                	pubchem     = PubchemConn$new(useragent = .self$.useragent),
-		                	hmdb        = HmdbConn$new(useragent = .self$.useragent),
-		                	chemspider  = ChemspiderConn$new(useragent = .self$.useragent),
-		                	enzyme      = EnzymeConn$new(useragent = .self$.useragent),
-		                	lipidmaps   = LipidmapsConn$new(useragent = .self$.useragent),
-		                	mirbase     = MirbaseConn$new(useragent = .self$.useragent),
-		                	ncbigene    = NcbigeneConn$new(useragent = .self$.useragent),
-		                	ncbiccds    = NcbiccdsConn$new(useragent = .self$.useragent),
-		                	uniprot     = UniprotConn$new(useragent = .self$.useragent),
-		                	massbank    = MassbankConn$new(useragent = .self$.useragent, debug = .self$.debug),
-							massfiledb  = MassFiledbConn$new(file = url),
-		      	          	NULL)
+	# Create cache dir if needed
+	if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir))
+		dir.create(.self$.cache.dir)
+
+	return(file.paths)
+})
+
+###########################
+# LOAD CONTENT FROM CACHE #
+###########################
+
+BiodbFactory$methods( .load.content.from.cache = function(class, id) {
+
+	content <- NULL
+
+	# Read contents from files
+	file.paths <- .self$.get.cache.file.paths(class, id)
+	content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} )
+
+	return(content)
+})
+
+############################
+# IS CACHE READING ENABLED #
+############################
+
+BiodbFactory$methods( .is.cache.reading.enabled = function() {
+	return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE))
+})
+
+############################
+# IS CACHE WRITING ENABLED #
+############################
+
+BiodbFactory$methods( .is.cache.writing.enabled = function() {
+	return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE))
+})
+
+#########################
+# SAVE CONTENT TO CACHE #
+#########################
+
+BiodbFactory$methods( .save.content.to.cache = function(class, id, content) {
+
+	# Write contents into files
+	file.paths <- .self$.get.cache.file.paths(class, id)
+	mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
 
-			# Unknown class
-			if (is.null(conn))
-				stop(paste0("Unknown r-biodb class \"", class,"\"."))
+BiodbFactory$methods( getEntryContent = function(class, id) {
+
+	# Debug
+	.self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)..."))
+
+	# Initialize content
+	if (.self$.is.cache.reading.enabled()) {
+		content <- .self$.load.content.from.cache(class, id)	
+		missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)]
+	}
+	else {
+		content <- lapply(id, as.null)
+		missing.ids <- id
+	}
+
+	# Remove duplicates
+	n.duplicates <- sum(duplicated(missing.ids))
+	missing.ids <- missing.ids[ ! duplicated(missing.ids)]
 
-			.self$.conn[[class]] <- conn
+	# Debug
+	if (any(is.na(id)))
+		.self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA."))
+	if (.self$.is.cache.reading.enabled()) {
+		.self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache."))
+		if (n.duplicates > 0)
+			.self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates."))
+		.self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched."))
+	}
+
+	# Get contents
+	if (length(missing.ids) > 0) {
+
+		# Use connector to get missing contents
+		conn <- .self$getConn(class)
+
+		# Divide list of missing ids in chunks (in order to save in cache regularly)
+		chunks.of.missing.ids = if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size))
+
+		# Loop on chunks
+		missing.contents <- NULL
+		for (ch.missing.ids in chunks.of.missing.ids) {
+
+			ch.missing.contents <- conn$getEntryContent(ch.missing.ids)
+
+			# Save to cache
+			if ( ! is.null(ch.missing.contents) && .self$.is.cache.writing.enabled())
+				.self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents)
+
+			# Append
+			missing.contents <- c(missing.contents, ch.missing.contents)
+
+			# Debug
+			if (.self$.is.cache.reading.enabled())
+				.self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved..."))
 		}
 
-		return (.self$.conn[[class]])
-	})
-
-	################
-	# CREATE ENTRY #
-	################
-
-	BiodbFactory$methods( createEntry = function(class, type, id = NULL, content = NULL, drop = TRUE) {
-
-		is.null(id) && is.null(content) && stop("One of id or content must be set.")
-		! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.")
-
-		# Debug
-		.self$.print.debug.msg(paste0("Creating entry from ", if (is.null(id)) "content" else paste("id", id), "..."))
-
-		# Get content
-		if ( ! is.null(id))
-			content <- .self$getEntryContent(class, type, id)
-
-		conn <- .self$getConn(class)
-		entry <- conn$createEntry(type = type, content = content, drop = drop)
-
-		# Set factory
-		for (e in c(entry))
-			e$setFactory(.self)
-
-		return(entry)
-	})
-
-	########################
-	# GET CACHE FILE PATHS #
-	########################
-
-	BiodbFactory$methods( .get.cache.file.paths = function(class, type, id) {
-
-		# Get extension
-		ext <- .self$getConn(class)$getEntryContentType(type)
-
-		# Set filenames
-		filenames <- vapply(id, function(x) paste0(class, '-', type, '-', x, '.', ext), FUN.VALUE = '')
-
-		# set file paths
-		file.paths <- vapply(filenames, function(x) file.path(.self$.cache.dir, x), FUN.VALUE = '')
-	
-		# Create cache dir if needed
-		if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir))
-			dir.create(.self$.cache.dir)
-
-		return(file.paths)
-	})
-
-	###########################
-	# LOAD CONTENT FROM CACHE #
-	###########################
-
-	BiodbFactory$methods( .load.content.from.cache = function(class, type, id) {
-
-		content <- NULL
-
-		# Read contents from files
-		file.paths <- .self$.get.cache.file.paths(class, type, id)
-		content <- lapply(file.paths, function(x) { if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL })
-
-		return(content)
-	})
+		# Merge content and missing.contents
+		content[id %in% missing.ids] <- vapply(id[id %in% missing.ids], function(x) missing.contents[missing.ids %in% x], FUN.VALUE = '')
+	}
 
-	#########################
-	# SAVE CONTENT TO CACHE #
-	#########################
-
-	BiodbFactory$methods( .save.content.to.cache = function(class, type, id, content) {
-
-		# Write contents into files
-		file.paths <- .self$.get.cache.file.paths(class, type, id)
-		mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths)
-	})
-
-	#####################
-	# GET ENTRY CONTENT #
-	#####################
-
-	BiodbFactory$methods( getEntryContent = function(class, type, id, chunk.size = NA_integer_) {
-
-		# Debug
-		.self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)..."))
-
-		content <- NULL
-		# Load from cache
-		if ( ! is.na(.self$.cache.dir))
-			content <- .self$.load.content.from.cache(class, type, id)	
-
-		# Get list of missing contents
-		missing.content.indexes <- vapply(content, is.null, FUN.VALUE = TRUE)
-		missing.ids <- if (is.null(content)) id else id[missing.content.indexes]
-
-		# Debug
-		if ( ! is.na(.self$.cache.dir)) {
-			.self$.print.debug.msg(paste0(length(id) - length(missing.ids), " entry content(s) loaded from cache."))
-			.self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched."))
-		}
-
-		# Get contents
-		if (length(missing.ids) > 0) {
-
-			# Use connector to get missing contents
-			conn <- .self$getConn(class)
-
-			# Divide list of missing ids in chunks (in order to save in cache regularly)
-			chunks.of.missing.ids = if (is.na(chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / chunk.size))
-
-			# Loop on chunks
-			missing.contents <- NULL
-			for (ch.missing.ids in chunks.of.missing.ids) {
-
-				ch.missing.contents <- conn$getEntryContent(type, ch.missing.ids)
-
-				# Save to cache
-				if ( ! is.null(ch.missing.contents) && ! is.na(.self$.cache.dir))
-					.self$.save.content.to.cache(class, type, ch.missing.ids, ch.missing.contents)
-
-				# Append
-				missing.contents <- c(missing.contents, ch.missing.contents)
-
-				# Debug
-				if ( ! is.na(.self$.cache.dir))
-					.self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved..."))
-			}
-
-			# Merge content and missing.contents
-			if (is.null(content))
-				content <- missing.contents
-			else
-				content[missing.content.indexes] <- missing.contents
-		}
-
-		return(content)
-	})
-}
+	return(content)
+})