view BiodbFactory.R @ 2:20d69a062da3 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author prog
date Thu, 02 Mar 2017 08:55:00 -0500
parents 253d531a0193
children
line wrap: on
line source

# vi: fdm=marker

##########################
# CLASS DECLARATION {{{1 #
##########################

BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character",
														  .conn = "list",
														  .cache.dir = "character",
														  .cache.mode = "character",
														  .debug = "logical",
														  .chunk.size = "integer",
														  .use.env.var = "logical"))

###############
# CONSTRUCTOR #
###############

BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) {

	.useragent <<- useragent
	.conn <<- list()
	.cache.dir <<- cache.dir
	.cache.mode <<- cache.mode
	.debug <<- debug
	.chunk.size <<- as.integer(chunk.size)
	.use.env.var <<- use.env.var

	callSuper(...) # calls super-class initializer with remaining parameters
})

#######################
# PRINT DEBUG MESSAGE #
#######################

BiodbFactory$methods( .print.debug.msg = function(msg) {
	if (.self$.debug)
		.print.msg(msg = msg, class = class(.self))
})

##################
# GET USER AGENT #
##################

BiodbFactory$methods( getUserAgent = function() {
	return(.self$.useragent)
})

##################
# SET USER AGENT #
##################

	BiodbFactory$methods( setUserAgent = function(useragent) {
	"Set useragent of BiodbFactory."
	.useragent <<- useragent
})

###############
# CREATE CONN #
###############

BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) {
    " Create connection to databases useful for metabolomics."
	if (class %in% names(.self$.conn))
		stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.'))

	# Use environment variables
	if (.self$.use.env.var) {
		if (is.na(url))
			url <- .biodb.get.env.var(c(class, 'URL'))
		if (is.na(token))
			token <- .biodb.get.env.var(c(class, 'TOKEN'))
	}

	# Create connection instance
	conn <- switch(class,
		            chebi       = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            kegg        = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug),
		            pubchemsub  = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug),
		            hmdb        = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            chemspider  = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token),
		            enzyme      = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            lipidmaps   = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            mirbase     = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            ncbigene    = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            ncbiccds    = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            uniprot     = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug),
		            massbank    = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug),
					massfiledb  = MassFiledbConn$new(file = url, debug = .self$.debug),
					peakforest  = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug),
		      	    NULL)

	# Unknown class
	if (is.null(conn))
		stop(paste0("Unknown r-biodb class \"", class,"\"."))

	# Register new class
	.self$.conn[[class]] <- conn

	return (.self$.conn[[class]])
})

############
# GET CONN #
############

BiodbFactory$methods( getConn = function(class) {
	"Get connection to a database."

	if ( ! class %in% names(.self$.conn))
		.self$createConn(class)

	return (.self$.conn[[class]])
})

################
# CREATE ENTRY #
################

BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) {
	"Create Entry from a database by id."

	is.null(id) && is.null(content) && stop("One of id or content must be set.")
	! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.")

	# Debug
	.self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(if (length(id) > 10) id[1:10] else id, collapse = ", ")), "..."))

	# Get content
	if ( ! is.null(id))
		content <- .self$getEntryContent(class, id)
	conn <- .self$getConn(class)
	entry <- conn$createEntry(content = content, drop = drop)

	# Set factory
	.self$.print.debug.msg(paste0("Setting factory reference into entries..."))
	for (e in c(entry))
		if ( ! is.null(e))
			e$setFactory(.self)

	return(entry)
})

########################
# GET CACHE FILE PATHS #
########################

BiodbFactory$methods( .get.cache.file.paths = function(class, id) {

	# Get extension
	ext <- .self$getConn(class)$getEntryContentType()

	# Set filenames
	filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '')

	# set file paths
	file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '')

	# Create cache dir if needed
	if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir))
		dir.create(.self$.cache.dir)

	return(file.paths)
})

###########################
# LOAD CONTENT FROM CACHE #
###########################

BiodbFactory$methods( .load.content.from.cache = function(class, id) {

	content <- NULL

	# Read contents from files
	file.paths <- .self$.get.cache.file.paths(class, id)
	content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} )

	return(content)
})

############################
# IS CACHE READING ENABLED #
############################

BiodbFactory$methods( .is.cache.reading.enabled = function() {
	return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE))
})

############################
# IS CACHE WRITING ENABLED #
############################

BiodbFactory$methods( .is.cache.writing.enabled = function() {
	return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE))
})

#########################
# SAVE CONTENT TO CACHE #
#########################

BiodbFactory$methods( .save.content.to.cache = function(class, id, content) {

	# Write contents into files
	file.paths <- .self$.get.cache.file.paths(class, id)
	mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths)
})

#####################
# GET ENTRY CONTENT #
#####################

BiodbFactory$methods( getEntryContent = function(class, id) {

	# Debug
	.self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)..."))

	# Initialize content
	if (.self$.is.cache.reading.enabled()) {
		content <- .self$.load.content.from.cache(class, id)	
		missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)]
	}
	else {
		content <- lapply(id, as.null)
		missing.ids <- id
	}

	# Remove duplicates
	n.duplicates <- sum(duplicated(missing.ids))
	missing.ids <- missing.ids[ ! duplicated(missing.ids)]

	# Debug
	if (any(is.na(id)))
		.self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA."))
	if (.self$.is.cache.reading.enabled()) {
		.self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache."))
		if (n.duplicates > 0)
			.self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates."))
		.self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched."))
	}

	# Get contents
	if (length(missing.ids) > 0) {

		# Use connector to get missing contents
		conn <- .self$getConn(class)

		# Divide list of missing ids in chunks (in order to save in cache regularly)
		chunks.of.missing.ids = if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size))

		# Loop on chunks
		missing.contents <- NULL
		for (ch.missing.ids in chunks.of.missing.ids) {

			ch.missing.contents <- conn$getEntryContent(ch.missing.ids)

			# Save to cache
			if ( ! is.null(ch.missing.contents) && .self$.is.cache.writing.enabled())
				.self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents)

			# Append
			missing.contents <- c(missing.contents, ch.missing.contents)

			# Debug
			if (.self$.is.cache.reading.enabled())
				.self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved..."))
		}

		# Merge content and missing.contents
		content[id %in% missing.ids] <- vapply(id[id %in% missing.ids], function(x) missing.contents[missing.ids %in% x], FUN.VALUE = '')
	}

	return(content)
})