Mercurial > repos > prog > lcmsmatching
comparison BiodbFactory.R @ 2:20d69a062da3 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
| author | prog |
|---|---|
| date | Thu, 02 Mar 2017 08:55:00 -0500 |
| parents | 253d531a0193 |
| children |
comparison
equal
deleted
inserted
replaced
| 1:253d531a0193 | 2:20d69a062da3 |
|---|---|
| 1 if ( ! exists('BiodbFactory')) { # Do not load again if already loaded | 1 # vi: fdm=marker |
| 2 | 2 |
| 3 library(methods) | 3 ########################## |
| 4 source('biodb-common.R') | 4 # CLASS DECLARATION {{{1 # |
| 5 source('ChebiConn.R') | 5 ########################## |
| 6 source('KeggConn.R') | 6 |
| 7 source('PubchemConn.R') | 7 BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character", |
| 8 source('HmdbConn.R') | 8 .conn = "list", |
| 9 source('ChemspiderConn.R') | 9 .cache.dir = "character", |
| 10 source('EnzymeConn.R') | 10 .cache.mode = "character", |
| 11 source('LipidmapsConn.R') | 11 .debug = "logical", |
| 12 source('MirbaseConn.R') | 12 .chunk.size = "integer", |
| 13 source('NcbigeneConn.R') | 13 .use.env.var = "logical")) |
| 14 source('NcbiccdsConn.R') | 14 |
| 15 source('UniprotConn.R') | 15 ############### |
| 16 source('MassbankConn.R') | 16 # CONSTRUCTOR # |
| 17 source('MassFiledbConn.R') | 17 ############### |
| 18 | 18 |
| 19 ##################### | 19 BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) { |
| 20 # CLASS DECLARATION # | 20 |
| 21 ##################### | 21 .useragent <<- useragent |
| 22 | 22 .conn <<- list() |
| 23 BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character", .debug = "logical")) | 23 .cache.dir <<- cache.dir |
| 24 | 24 .cache.mode <<- cache.mode |
| 25 ############### | 25 .debug <<- debug |
| 26 # CONSTRUCTOR # | 26 .chunk.size <<- as.integer(chunk.size) |
| 27 ############### | 27 .use.env.var <<- use.env.var |
| 28 | 28 |
| 29 BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, debug = FALSE, ...) { | 29 callSuper(...) # calls super-class initializer with remaining parameters |
| 30 | 30 }) |
| 31 .useragent <<- useragent | 31 |
| 32 .conn <<- list() | 32 ####################### |
| 33 .cache.dir <<- cache.dir | 33 # PRINT DEBUG MESSAGE # |
| 34 .debug <<- debug | 34 ####################### |
| 35 | 35 |
| 36 callSuper(...) # calls super-class initializer with remaining parameters | 36 BiodbFactory$methods( .print.debug.msg = function(msg) { |
| 37 }) | 37 if (.self$.debug) |
| 38 | 38 .print.msg(msg = msg, class = class(.self)) |
| 39 ####################### | 39 }) |
| 40 # PRINT DEBUG MESSAGE # | 40 |
| 41 ####################### | 41 ################## |
| 42 | 42 # GET USER AGENT # |
| 43 BiodbFactory$methods( .print.debug.msg = function(msg) { | 43 ################## |
| 44 if (.self$.debug) | 44 |
| 45 .print.msg(msg = msg, class = class(.self)) | 45 BiodbFactory$methods( getUserAgent = function() { |
| 46 }) | 46 return(.self$.useragent) |
| 47 | 47 }) |
| 48 ################## | 48 |
| 49 # GET USER AGENT # | 49 ################## |
| 50 ################## | 50 # SET USER AGENT # |
| 51 | 51 ################## |
| 52 BiodbFactory$methods( getUserAgent = function() { | |
| 53 return(.self$.useragent) | |
| 54 }) | |
| 55 | |
| 56 ################## | |
| 57 # SET USER AGENT # | |
| 58 ################## | |
| 59 | 52 |
| 60 BiodbFactory$methods( setUserAgent = function(useragent) { | 53 BiodbFactory$methods( setUserAgent = function(useragent) { |
| 61 .useragent <<- useragent | 54 "Set useragent of BiodbFactory." |
| 62 }) | 55 .useragent <<- useragent |
| 63 | 56 }) |
| 64 ############ | 57 |
| 65 # GET CONN # | 58 ############### |
| 66 ############ | 59 # CREATE CONN # |
| 67 | 60 ############### |
| 68 BiodbFactory$methods( getConn = function(class, url = NA_character_) { | 61 |
| 69 | 62 BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) { |
| 70 if ( ! class %in% names(.self$.conn)) { | 63 " Create connection to databases useful for metabolomics." |
| 71 | 64 if (class %in% names(.self$.conn)) |
| 72 # Create connection instance | 65 stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.')) |
| 73 conn <- switch(class, | 66 |
| 74 chebi = ChebiConn$new(useragent = .self$.useragent), | 67 # Use environment variables |
| 75 kegg = KeggConn$new(useragent = .self$.useragent), | 68 if (.self$.use.env.var) { |
| 76 pubchem = PubchemConn$new(useragent = .self$.useragent), | 69 if (is.na(url)) |
| 77 hmdb = HmdbConn$new(useragent = .self$.useragent), | 70 url <- .biodb.get.env.var(c(class, 'URL')) |
| 78 chemspider = ChemspiderConn$new(useragent = .self$.useragent), | 71 if (is.na(token)) |
| 79 enzyme = EnzymeConn$new(useragent = .self$.useragent), | 72 token <- .biodb.get.env.var(c(class, 'TOKEN')) |
| 80 lipidmaps = LipidmapsConn$new(useragent = .self$.useragent), | 73 } |
| 81 mirbase = MirbaseConn$new(useragent = .self$.useragent), | 74 |
| 82 ncbigene = NcbigeneConn$new(useragent = .self$.useragent), | 75 # Create connection instance |
| 83 ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent), | 76 conn <- switch(class, |
| 84 uniprot = UniprotConn$new(useragent = .self$.useragent), | 77 chebi = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug), |
| 85 massbank = MassbankConn$new(useragent = .self$.useragent, debug = .self$.debug), | 78 kegg = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug), |
| 86 massfiledb = MassFiledbConn$new(file = url), | 79 pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug), |
| 87 NULL) | 80 pubchemsub = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug), |
| 88 | 81 hmdb = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug), |
| 89 # Unknown class | 82 chemspider = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token), |
| 90 if (is.null(conn)) | 83 enzyme = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug), |
| 91 stop(paste0("Unknown r-biodb class \"", class,"\".")) | 84 lipidmaps = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug), |
| 92 | 85 mirbase = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug), |
| 93 .self$.conn[[class]] <- conn | 86 ncbigene = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug), |
| 87 ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
| 88 uniprot = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
| 89 massbank = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug), | |
| 90 massfiledb = MassFiledbConn$new(file = url, debug = .self$.debug), | |
| 91 peakforest = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
| 92 NULL) | |
| 93 | |
| 94 # Unknown class | |
| 95 if (is.null(conn)) | |
| 96 stop(paste0("Unknown r-biodb class \"", class,"\".")) | |
| 97 | |
| 98 # Register new class | |
| 99 .self$.conn[[class]] <- conn | |
| 100 | |
| 101 return (.self$.conn[[class]]) | |
| 102 }) | |
| 103 | |
| 104 ############ | |
| 105 # GET CONN # | |
| 106 ############ | |
| 107 | |
| 108 BiodbFactory$methods( getConn = function(class) { | |
| 109 "Get connection to a database." | |
| 110 | |
| 111 if ( ! class %in% names(.self$.conn)) | |
| 112 .self$createConn(class) | |
| 113 | |
| 114 return (.self$.conn[[class]]) | |
| 115 }) | |
| 116 | |
| 117 ################ | |
| 118 # CREATE ENTRY # | |
| 119 ################ | |
| 120 | |
| 121 BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) { | |
| 122 "Create Entry from a database by id." | |
| 123 | |
| 124 is.null(id) && is.null(content) && stop("One of id or content must be set.") | |
| 125 ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") | |
| 126 | |
| 127 # Debug | |
| 128 .self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(if (length(id) > 10) id[1:10] else id, collapse = ", ")), "...")) | |
| 129 | |
| 130 # Get content | |
| 131 if ( ! is.null(id)) | |
| 132 content <- .self$getEntryContent(class, id) | |
| 133 conn <- .self$getConn(class) | |
| 134 entry <- conn$createEntry(content = content, drop = drop) | |
| 135 | |
| 136 # Set factory | |
| 137 .self$.print.debug.msg(paste0("Setting factory reference into entries...")) | |
| 138 for (e in c(entry)) | |
| 139 if ( ! is.null(e)) | |
| 140 e$setFactory(.self) | |
| 141 | |
| 142 return(entry) | |
| 143 }) | |
| 144 | |
| 145 ######################## | |
| 146 # GET CACHE FILE PATHS # | |
| 147 ######################## | |
| 148 | |
| 149 BiodbFactory$methods( .get.cache.file.paths = function(class, id) { | |
| 150 | |
| 151 # Get extension | |
| 152 ext <- .self$getConn(class)$getEntryContentType() | |
| 153 | |
| 154 # Set filenames | |
| 155 filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '') | |
| 156 | |
| 157 # set file paths | |
| 158 file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '') | |
| 159 | |
| 160 # Create cache dir if needed | |
| 161 if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir)) | |
| 162 dir.create(.self$.cache.dir) | |
| 163 | |
| 164 return(file.paths) | |
| 165 }) | |
| 166 | |
| 167 ########################### | |
| 168 # LOAD CONTENT FROM CACHE # | |
| 169 ########################### | |
| 170 | |
| 171 BiodbFactory$methods( .load.content.from.cache = function(class, id) { | |
| 172 | |
| 173 content <- NULL | |
| 174 | |
| 175 # Read contents from files | |
| 176 file.paths <- .self$.get.cache.file.paths(class, id) | |
| 177 content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} ) | |
| 178 | |
| 179 return(content) | |
| 180 }) | |
| 181 | |
| 182 ############################ | |
| 183 # IS CACHE READING ENABLED # | |
| 184 ############################ | |
| 185 | |
| 186 BiodbFactory$methods( .is.cache.reading.enabled = function() { | |
| 187 return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE)) | |
| 188 }) | |
| 189 | |
| 190 ############################ | |
| 191 # IS CACHE WRITING ENABLED # | |
| 192 ############################ | |
| 193 | |
| 194 BiodbFactory$methods( .is.cache.writing.enabled = function() { | |
| 195 return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE)) | |
| 196 }) | |
| 197 | |
| 198 ######################### | |
| 199 # SAVE CONTENT TO CACHE # | |
| 200 ######################### | |
| 201 | |
| 202 BiodbFactory$methods( .save.content.to.cache = function(class, id, content) { | |
| 203 | |
| 204 # Write contents into files | |
| 205 file.paths <- .self$.get.cache.file.paths(class, id) | |
| 206 mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths) | |
| 207 }) | |
| 208 | |
| 209 ##################### | |
| 210 # GET ENTRY CONTENT # | |
| 211 ##################### | |
| 212 | |
| 213 BiodbFactory$methods( getEntryContent = function(class, id) { | |
| 214 | |
| 215 # Debug | |
| 216 .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) | |
| 217 | |
| 218 # Initialize content | |
| 219 if (.self$.is.cache.reading.enabled()) { | |
| 220 content <- .self$.load.content.from.cache(class, id) | |
| 221 missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)] | |
| 222 } | |
| 223 else { | |
| 224 content <- lapply(id, as.null) | |
| 225 missing.ids <- id | |
| 226 } | |
| 227 | |
| 228 # Remove duplicates | |
| 229 n.duplicates <- sum(duplicated(missing.ids)) | |
| 230 missing.ids <- missing.ids[ ! duplicated(missing.ids)] | |
| 231 | |
| 232 # Debug | |
| 233 if (any(is.na(id))) | |
| 234 .self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA.")) | |
| 235 if (.self$.is.cache.reading.enabled()) { | |
| 236 .self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache.")) | |
| 237 if (n.duplicates > 0) | |
| 238 .self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates.")) | |
| 239 .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) | |
| 240 } | |
| 241 | |
| 242 # Get contents | |
| 243 if (length(missing.ids) > 0) { | |
| 244 | |
| 245 # Use connector to get missing contents | |
| 246 conn <- .self$getConn(class) | |
| 247 | |
| 248 # Divide list of missing ids in chunks (in order to save in cache regularly) | |
| 249 chunks.of.missing.ids = if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size)) | |
| 250 | |
| 251 # Loop on chunks | |
| 252 missing.contents <- NULL | |
| 253 for (ch.missing.ids in chunks.of.missing.ids) { | |
| 254 | |
| 255 ch.missing.contents <- conn$getEntryContent(ch.missing.ids) | |
| 256 | |
| 257 # Save to cache | |
| 258 if ( ! is.null(ch.missing.contents) && .self$.is.cache.writing.enabled()) | |
| 259 .self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents) | |
| 260 | |
| 261 # Append | |
| 262 missing.contents <- c(missing.contents, ch.missing.contents) | |
| 263 | |
| 264 # Debug | |
| 265 if (.self$.is.cache.reading.enabled()) | |
| 266 .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) | |
| 94 } | 267 } |
| 95 | 268 |
| 96 return (.self$.conn[[class]]) | 269 # Merge content and missing.contents |
| 97 }) | 270 content[id %in% missing.ids] <- vapply(id[id %in% missing.ids], function(x) missing.contents[missing.ids %in% x], FUN.VALUE = '') |
| 98 | 271 } |
| 99 ################ | 272 |
| 100 # CREATE ENTRY # | 273 return(content) |
| 101 ################ | 274 }) |
| 102 | |
| 103 BiodbFactory$methods( createEntry = function(class, type, id = NULL, content = NULL, drop = TRUE) { | |
| 104 | |
| 105 is.null(id) && is.null(content) && stop("One of id or content must be set.") | |
| 106 ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") | |
| 107 | |
| 108 # Debug | |
| 109 .self$.print.debug.msg(paste0("Creating entry from ", if (is.null(id)) "content" else paste("id", id), "...")) | |
| 110 | |
| 111 # Get content | |
| 112 if ( ! is.null(id)) | |
| 113 content <- .self$getEntryContent(class, type, id) | |
| 114 | |
| 115 conn <- .self$getConn(class) | |
| 116 entry <- conn$createEntry(type = type, content = content, drop = drop) | |
| 117 | |
| 118 # Set factory | |
| 119 for (e in c(entry)) | |
| 120 e$setFactory(.self) | |
| 121 | |
| 122 return(entry) | |
| 123 }) | |
| 124 | |
| 125 ######################## | |
| 126 # GET CACHE FILE PATHS # | |
| 127 ######################## | |
| 128 | |
| 129 BiodbFactory$methods( .get.cache.file.paths = function(class, type, id) { | |
| 130 | |
| 131 # Get extension | |
| 132 ext <- .self$getConn(class)$getEntryContentType(type) | |
| 133 | |
| 134 # Set filenames | |
| 135 filenames <- vapply(id, function(x) paste0(class, '-', type, '-', x, '.', ext), FUN.VALUE = '') | |
| 136 | |
| 137 # set file paths | |
| 138 file.paths <- vapply(filenames, function(x) file.path(.self$.cache.dir, x), FUN.VALUE = '') | |
| 139 | |
| 140 # Create cache dir if needed | |
| 141 if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir)) | |
| 142 dir.create(.self$.cache.dir) | |
| 143 | |
| 144 return(file.paths) | |
| 145 }) | |
| 146 | |
| 147 ########################### | |
| 148 # LOAD CONTENT FROM CACHE # | |
| 149 ########################### | |
| 150 | |
| 151 BiodbFactory$methods( .load.content.from.cache = function(class, type, id) { | |
| 152 | |
| 153 content <- NULL | |
| 154 | |
| 155 # Read contents from files | |
| 156 file.paths <- .self$.get.cache.file.paths(class, type, id) | |
| 157 content <- lapply(file.paths, function(x) { if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL }) | |
| 158 | |
| 159 return(content) | |
| 160 }) | |
| 161 | |
| 162 ######################### | |
| 163 # SAVE CONTENT TO CACHE # | |
| 164 ######################### | |
| 165 | |
| 166 BiodbFactory$methods( .save.content.to.cache = function(class, type, id, content) { | |
| 167 | |
| 168 # Write contents into files | |
| 169 file.paths <- .self$.get.cache.file.paths(class, type, id) | |
| 170 mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths) | |
| 171 }) | |
| 172 | |
| 173 ##################### | |
| 174 # GET ENTRY CONTENT # | |
| 175 ##################### | |
| 176 | |
| 177 BiodbFactory$methods( getEntryContent = function(class, type, id, chunk.size = NA_integer_) { | |
| 178 | |
| 179 # Debug | |
| 180 .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) | |
| 181 | |
| 182 content <- NULL | |
| 183 # Load from cache | |
| 184 if ( ! is.na(.self$.cache.dir)) | |
| 185 content <- .self$.load.content.from.cache(class, type, id) | |
| 186 | |
| 187 # Get list of missing contents | |
| 188 missing.content.indexes <- vapply(content, is.null, FUN.VALUE = TRUE) | |
| 189 missing.ids <- if (is.null(content)) id else id[missing.content.indexes] | |
| 190 | |
| 191 # Debug | |
| 192 if ( ! is.na(.self$.cache.dir)) { | |
| 193 .self$.print.debug.msg(paste0(length(id) - length(missing.ids), " entry content(s) loaded from cache.")) | |
| 194 .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) | |
| 195 } | |
| 196 | |
| 197 # Get contents | |
| 198 if (length(missing.ids) > 0) { | |
| 199 | |
| 200 # Use connector to get missing contents | |
| 201 conn <- .self$getConn(class) | |
| 202 | |
| 203 # Divide list of missing ids in chunks (in order to save in cache regularly) | |
| 204 chunks.of.missing.ids = if (is.na(chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / chunk.size)) | |
| 205 | |
| 206 # Loop on chunks | |
| 207 missing.contents <- NULL | |
| 208 for (ch.missing.ids in chunks.of.missing.ids) { | |
| 209 | |
| 210 ch.missing.contents <- conn$getEntryContent(type, ch.missing.ids) | |
| 211 | |
| 212 # Save to cache | |
| 213 if ( ! is.null(ch.missing.contents) && ! is.na(.self$.cache.dir)) | |
| 214 .self$.save.content.to.cache(class, type, ch.missing.ids, ch.missing.contents) | |
| 215 | |
| 216 # Append | |
| 217 missing.contents <- c(missing.contents, ch.missing.contents) | |
| 218 | |
| 219 # Debug | |
| 220 if ( ! is.na(.self$.cache.dir)) | |
| 221 .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) | |
| 222 } | |
| 223 | |
| 224 # Merge content and missing.contents | |
| 225 if (is.null(content)) | |
| 226 content <- missing.contents | |
| 227 else | |
| 228 content[missing.content.indexes] <- missing.contents | |
| 229 } | |
| 230 | |
| 231 return(content) | |
| 232 }) | |
| 233 } |
