Mercurial > repos > prog > lcmsmatching
diff BiodbFactory.R @ 2:20d69a062da3 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author | prog |
---|---|
date | Thu, 02 Mar 2017 08:55:00 -0500 |
parents | 253d531a0193 |
children |
line wrap: on
line diff
--- a/BiodbFactory.R Sat Sep 03 17:02:01 2016 -0400 +++ b/BiodbFactory.R Thu Mar 02 08:55:00 2017 -0500 @@ -1,233 +1,274 @@ -if ( ! exists('BiodbFactory')) { # Do not load again if already loaded - - library(methods) - source('biodb-common.R') - source('ChebiConn.R') - source('KeggConn.R') - source('PubchemConn.R') - source('HmdbConn.R') - source('ChemspiderConn.R') - source('EnzymeConn.R') - source('LipidmapsConn.R') - source('MirbaseConn.R') - source('NcbigeneConn.R') - source('NcbiccdsConn.R') - source('UniprotConn.R') - source('MassbankConn.R') - source('MassFiledbConn.R') +# vi: fdm=marker + +########################## +# CLASS DECLARATION {{{1 # +########################## + +BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character", + .conn = "list", + .cache.dir = "character", + .cache.mode = "character", + .debug = "logical", + .chunk.size = "integer", + .use.env.var = "logical")) + +############### +# CONSTRUCTOR # +############### + +BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) { - ##################### - # CLASS DECLARATION # - ##################### - - BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character", .debug = "logical")) - - ############### - # CONSTRUCTOR # - ############### - - BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, debug = FALSE, ...) { - - .useragent <<- useragent - .conn <<- list() - .cache.dir <<- cache.dir - .debug <<- debug + .useragent <<- useragent + .conn <<- list() + .cache.dir <<- cache.dir + .cache.mode <<- cache.mode + .debug <<- debug + .chunk.size <<- as.integer(chunk.size) + .use.env.var <<- use.env.var - callSuper(...) # calls super-class initializer with remaining parameters - }) + callSuper(...) # calls super-class initializer with remaining parameters +}) + +####################### +# PRINT DEBUG MESSAGE # +####################### - ####################### - # PRINT DEBUG MESSAGE # - ####################### - - BiodbFactory$methods( .print.debug.msg = function(msg) { - if (.self$.debug) - .print.msg(msg = msg, class = class(.self)) - }) +BiodbFactory$methods( .print.debug.msg = function(msg) { + if (.self$.debug) + .print.msg(msg = msg, class = class(.self)) +}) - ################## - # GET USER AGENT # - ################## +################## +# GET USER AGENT # +################## - BiodbFactory$methods( getUserAgent = function() { - return(.self$.useragent) - }) +BiodbFactory$methods( getUserAgent = function() { + return(.self$.useragent) +}) - ################## - # SET USER AGENT # - ################## +################## +# SET USER AGENT # +################## BiodbFactory$methods( setUserAgent = function(useragent) { - .useragent <<- useragent - }) + "Set useragent of BiodbFactory." + .useragent <<- useragent +}) + +############### +# CREATE CONN # +############### + +BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) { + " Create connection to databases useful for metabolomics." + if (class %in% names(.self$.conn)) + stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.')) + + # Use environment variables + if (.self$.use.env.var) { + if (is.na(url)) + url <- .biodb.get.env.var(c(class, 'URL')) + if (is.na(token)) + token <- .biodb.get.env.var(c(class, 'TOKEN')) + } + + # Create connection instance + conn <- switch(class, + chebi = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug), + kegg = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug), + pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug), + pubchemsub = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug), + hmdb = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug), + chemspider = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token), + enzyme = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug), + lipidmaps = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug), + mirbase = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug), + ncbigene = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug), + ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug), + uniprot = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug), + massbank = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug), + massfiledb = MassFiledbConn$new(file = url, debug = .self$.debug), + peakforest = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug), + NULL) + + # Unknown class + if (is.null(conn)) + stop(paste0("Unknown r-biodb class \"", class,"\".")) + + # Register new class + .self$.conn[[class]] <- conn + + return (.self$.conn[[class]]) +}) - ############ - # GET CONN # - ############ +############ +# GET CONN # +############ + +BiodbFactory$methods( getConn = function(class) { + "Get connection to a database." + + if ( ! class %in% names(.self$.conn)) + .self$createConn(class) + + return (.self$.conn[[class]]) +}) + +################ +# CREATE ENTRY # +################ + +BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) { + "Create Entry from a database by id." + + is.null(id) && is.null(content) && stop("One of id or content must be set.") + ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") + + # Debug + .self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(if (length(id) > 10) id[1:10] else id, collapse = ", ")), "...")) - BiodbFactory$methods( getConn = function(class, url = NA_character_) { + # Get content + if ( ! is.null(id)) + content <- .self$getEntryContent(class, id) + conn <- .self$getConn(class) + entry <- conn$createEntry(content = content, drop = drop) + + # Set factory + .self$.print.debug.msg(paste0("Setting factory reference into entries...")) + for (e in c(entry)) + if ( ! is.null(e)) + e$setFactory(.self) + + return(entry) +}) - if ( ! class %in% names(.self$.conn)) { +######################## +# GET CACHE FILE PATHS # +######################## + +BiodbFactory$methods( .get.cache.file.paths = function(class, id) { + + # Get extension + ext <- .self$getConn(class)$getEntryContentType() + + # Set filenames + filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '') + + # set file paths + file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '') - # Create connection instance - conn <- switch(class, - chebi = ChebiConn$new(useragent = .self$.useragent), - kegg = KeggConn$new(useragent = .self$.useragent), - pubchem = PubchemConn$new(useragent = .self$.useragent), - hmdb = HmdbConn$new(useragent = .self$.useragent), - chemspider = ChemspiderConn$new(useragent = .self$.useragent), - enzyme = EnzymeConn$new(useragent = .self$.useragent), - lipidmaps = LipidmapsConn$new(useragent = .self$.useragent), - mirbase = MirbaseConn$new(useragent = .self$.useragent), - ncbigene = NcbigeneConn$new(useragent = .self$.useragent), - ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent), - uniprot = UniprotConn$new(useragent = .self$.useragent), - massbank = MassbankConn$new(useragent = .self$.useragent, debug = .self$.debug), - massfiledb = MassFiledbConn$new(file = url), - NULL) + # Create cache dir if needed + if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir)) + dir.create(.self$.cache.dir) + + return(file.paths) +}) + +########################### +# LOAD CONTENT FROM CACHE # +########################### + +BiodbFactory$methods( .load.content.from.cache = function(class, id) { + + content <- NULL + + # Read contents from files + file.paths <- .self$.get.cache.file.paths(class, id) + content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} ) + + return(content) +}) + +############################ +# IS CACHE READING ENABLED # +############################ + +BiodbFactory$methods( .is.cache.reading.enabled = function() { + return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE)) +}) + +############################ +# IS CACHE WRITING ENABLED # +############################ + +BiodbFactory$methods( .is.cache.writing.enabled = function() { + return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE)) +}) + +######################### +# SAVE CONTENT TO CACHE # +######################### + +BiodbFactory$methods( .save.content.to.cache = function(class, id, content) { + + # Write contents into files + file.paths <- .self$.get.cache.file.paths(class, id) + mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths) +}) + +##################### +# GET ENTRY CONTENT # +##################### - # Unknown class - if (is.null(conn)) - stop(paste0("Unknown r-biodb class \"", class,"\".")) +BiodbFactory$methods( getEntryContent = function(class, id) { + + # Debug + .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) + + # Initialize content + if (.self$.is.cache.reading.enabled()) { + content <- .self$.load.content.from.cache(class, id) + missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)] + } + else { + content <- lapply(id, as.null) + missing.ids <- id + } + + # Remove duplicates + n.duplicates <- sum(duplicated(missing.ids)) + missing.ids <- missing.ids[ ! duplicated(missing.ids)] - .self$.conn[[class]] <- conn + # Debug + if (any(is.na(id))) + .self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA.")) + if (.self$.is.cache.reading.enabled()) { + .self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache.")) + if (n.duplicates > 0) + .self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates.")) + .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) + } + + # Get contents + if (length(missing.ids) > 0) { + + # Use connector to get missing contents + conn <- .self$getConn(class) + + # Divide list of missing ids in chunks (in order to save in cache regularly) + chunks.of.missing.ids = if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size)) + + # Loop on chunks + missing.contents <- NULL + for (ch.missing.ids in chunks.of.missing.ids) { + + ch.missing.contents <- conn$getEntryContent(ch.missing.ids) + + # Save to cache + if ( ! is.null(ch.missing.contents) && .self$.is.cache.writing.enabled()) + .self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents) + + # Append + missing.contents <- c(missing.contents, ch.missing.contents) + + # Debug + if (.self$.is.cache.reading.enabled()) + .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) } - return (.self$.conn[[class]]) - }) - - ################ - # CREATE ENTRY # - ################ - - BiodbFactory$methods( createEntry = function(class, type, id = NULL, content = NULL, drop = TRUE) { - - is.null(id) && is.null(content) && stop("One of id or content must be set.") - ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") - - # Debug - .self$.print.debug.msg(paste0("Creating entry from ", if (is.null(id)) "content" else paste("id", id), "...")) - - # Get content - if ( ! is.null(id)) - content <- .self$getEntryContent(class, type, id) - - conn <- .self$getConn(class) - entry <- conn$createEntry(type = type, content = content, drop = drop) - - # Set factory - for (e in c(entry)) - e$setFactory(.self) - - return(entry) - }) - - ######################## - # GET CACHE FILE PATHS # - ######################## - - BiodbFactory$methods( .get.cache.file.paths = function(class, type, id) { - - # Get extension - ext <- .self$getConn(class)$getEntryContentType(type) - - # Set filenames - filenames <- vapply(id, function(x) paste0(class, '-', type, '-', x, '.', ext), FUN.VALUE = '') - - # set file paths - file.paths <- vapply(filenames, function(x) file.path(.self$.cache.dir, x), FUN.VALUE = '') - - # Create cache dir if needed - if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir)) - dir.create(.self$.cache.dir) - - return(file.paths) - }) - - ########################### - # LOAD CONTENT FROM CACHE # - ########################### - - BiodbFactory$methods( .load.content.from.cache = function(class, type, id) { - - content <- NULL - - # Read contents from files - file.paths <- .self$.get.cache.file.paths(class, type, id) - content <- lapply(file.paths, function(x) { if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL }) - - return(content) - }) + # Merge content and missing.contents + content[id %in% missing.ids] <- vapply(id[id %in% missing.ids], function(x) missing.contents[missing.ids %in% x], FUN.VALUE = '') + } - ######################### - # SAVE CONTENT TO CACHE # - ######################### - - BiodbFactory$methods( .save.content.to.cache = function(class, type, id, content) { - - # Write contents into files - file.paths <- .self$.get.cache.file.paths(class, type, id) - mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths) - }) - - ##################### - # GET ENTRY CONTENT # - ##################### - - BiodbFactory$methods( getEntryContent = function(class, type, id, chunk.size = NA_integer_) { - - # Debug - .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) - - content <- NULL - # Load from cache - if ( ! is.na(.self$.cache.dir)) - content <- .self$.load.content.from.cache(class, type, id) - - # Get list of missing contents - missing.content.indexes <- vapply(content, is.null, FUN.VALUE = TRUE) - missing.ids <- if (is.null(content)) id else id[missing.content.indexes] - - # Debug - if ( ! is.na(.self$.cache.dir)) { - .self$.print.debug.msg(paste0(length(id) - length(missing.ids), " entry content(s) loaded from cache.")) - .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) - } - - # Get contents - if (length(missing.ids) > 0) { - - # Use connector to get missing contents - conn <- .self$getConn(class) - - # Divide list of missing ids in chunks (in order to save in cache regularly) - chunks.of.missing.ids = if (is.na(chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / chunk.size)) - - # Loop on chunks - missing.contents <- NULL - for (ch.missing.ids in chunks.of.missing.ids) { - - ch.missing.contents <- conn$getEntryContent(type, ch.missing.ids) - - # Save to cache - if ( ! is.null(ch.missing.contents) && ! is.na(.self$.cache.dir)) - .self$.save.content.to.cache(class, type, ch.missing.ids, ch.missing.contents) - - # Append - missing.contents <- c(missing.contents, ch.missing.contents) - - # Debug - if ( ! is.na(.self$.cache.dir)) - .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) - } - - # Merge content and missing.contents - if (is.null(content)) - content <- missing.contents - else - content[missing.content.indexes] <- missing.contents - } - - return(content) - }) -} + return(content) +})