comparison BiodbFactory.R @ 2:20d69a062da3 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author prog
date Thu, 02 Mar 2017 08:55:00 -0500
parents 253d531a0193
children
comparison
equal deleted inserted replaced
1:253d531a0193 2:20d69a062da3
1 if ( ! exists('BiodbFactory')) { # Do not load again if already loaded 1 # vi: fdm=marker
2 2
3 library(methods) 3 ##########################
4 source('biodb-common.R') 4 # CLASS DECLARATION {{{1 #
5 source('ChebiConn.R') 5 ##########################
6 source('KeggConn.R') 6
7 source('PubchemConn.R') 7 BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character",
8 source('HmdbConn.R') 8 .conn = "list",
9 source('ChemspiderConn.R') 9 .cache.dir = "character",
10 source('EnzymeConn.R') 10 .cache.mode = "character",
11 source('LipidmapsConn.R') 11 .debug = "logical",
12 source('MirbaseConn.R') 12 .chunk.size = "integer",
13 source('NcbigeneConn.R') 13 .use.env.var = "logical"))
14 source('NcbiccdsConn.R') 14
15 source('UniprotConn.R') 15 ###############
16 source('MassbankConn.R') 16 # CONSTRUCTOR #
17 source('MassFiledbConn.R') 17 ###############
18 18
19 ##################### 19 BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) {
20 # CLASS DECLARATION # 20
21 ##################### 21 .useragent <<- useragent
22 22 .conn <<- list()
23 BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character", .debug = "logical")) 23 .cache.dir <<- cache.dir
24 24 .cache.mode <<- cache.mode
25 ############### 25 .debug <<- debug
26 # CONSTRUCTOR # 26 .chunk.size <<- as.integer(chunk.size)
27 ############### 27 .use.env.var <<- use.env.var
28 28
29 BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, debug = FALSE, ...) { 29 callSuper(...) # calls super-class initializer with remaining parameters
30 30 })
31 .useragent <<- useragent 31
32 .conn <<- list() 32 #######################
33 .cache.dir <<- cache.dir 33 # PRINT DEBUG MESSAGE #
34 .debug <<- debug 34 #######################
35 35
36 callSuper(...) # calls super-class initializer with remaining parameters 36 BiodbFactory$methods( .print.debug.msg = function(msg) {
37 }) 37 if (.self$.debug)
38 38 .print.msg(msg = msg, class = class(.self))
39 ####################### 39 })
40 # PRINT DEBUG MESSAGE # 40
41 ####################### 41 ##################
42 42 # GET USER AGENT #
43 BiodbFactory$methods( .print.debug.msg = function(msg) { 43 ##################
44 if (.self$.debug) 44
45 .print.msg(msg = msg, class = class(.self)) 45 BiodbFactory$methods( getUserAgent = function() {
46 }) 46 return(.self$.useragent)
47 47 })
48 ################## 48
49 # GET USER AGENT # 49 ##################
50 ################## 50 # SET USER AGENT #
51 51 ##################
52 BiodbFactory$methods( getUserAgent = function() {
53 return(.self$.useragent)
54 })
55
56 ##################
57 # SET USER AGENT #
58 ##################
59 52
60 BiodbFactory$methods( setUserAgent = function(useragent) { 53 BiodbFactory$methods( setUserAgent = function(useragent) {
61 .useragent <<- useragent 54 "Set useragent of BiodbFactory."
62 }) 55 .useragent <<- useragent
63 56 })
64 ############ 57
65 # GET CONN # 58 ###############
66 ############ 59 # CREATE CONN #
67 60 ###############
68 BiodbFactory$methods( getConn = function(class, url = NA_character_) { 61
69 62 BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) {
70 if ( ! class %in% names(.self$.conn)) { 63 " Create connection to databases useful for metabolomics."
71 64 if (class %in% names(.self$.conn))
72 # Create connection instance 65 stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.'))
73 conn <- switch(class, 66
74 chebi = ChebiConn$new(useragent = .self$.useragent), 67 # Use environment variables
75 kegg = KeggConn$new(useragent = .self$.useragent), 68 if (.self$.use.env.var) {
76 pubchem = PubchemConn$new(useragent = .self$.useragent), 69 if (is.na(url))
77 hmdb = HmdbConn$new(useragent = .self$.useragent), 70 url <- .biodb.get.env.var(c(class, 'URL'))
78 chemspider = ChemspiderConn$new(useragent = .self$.useragent), 71 if (is.na(token))
79 enzyme = EnzymeConn$new(useragent = .self$.useragent), 72 token <- .biodb.get.env.var(c(class, 'TOKEN'))
80 lipidmaps = LipidmapsConn$new(useragent = .self$.useragent), 73 }
81 mirbase = MirbaseConn$new(useragent = .self$.useragent), 74
82 ncbigene = NcbigeneConn$new(useragent = .self$.useragent), 75 # Create connection instance
83 ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent), 76 conn <- switch(class,
84 uniprot = UniprotConn$new(useragent = .self$.useragent), 77 chebi = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug),
85 massbank = MassbankConn$new(useragent = .self$.useragent, debug = .self$.debug), 78 kegg = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug),
86 massfiledb = MassFiledbConn$new(file = url), 79 pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug),
87 NULL) 80 pubchemsub = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug),
88 81 hmdb = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug),
89 # Unknown class 82 chemspider = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token),
90 if (is.null(conn)) 83 enzyme = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug),
91 stop(paste0("Unknown r-biodb class \"", class,"\".")) 84 lipidmaps = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug),
92 85 mirbase = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug),
93 .self$.conn[[class]] <- conn 86 ncbigene = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug),
87 ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug),
88 uniprot = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug),
89 massbank = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug),
90 massfiledb = MassFiledbConn$new(file = url, debug = .self$.debug),
91 peakforest = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug),
92 NULL)
93
94 # Unknown class
95 if (is.null(conn))
96 stop(paste0("Unknown r-biodb class \"", class,"\"."))
97
98 # Register new class
99 .self$.conn[[class]] <- conn
100
101 return (.self$.conn[[class]])
102 })
103
104 ############
105 # GET CONN #
106 ############
107
108 BiodbFactory$methods( getConn = function(class) {
109 "Get connection to a database."
110
111 if ( ! class %in% names(.self$.conn))
112 .self$createConn(class)
113
114 return (.self$.conn[[class]])
115 })
116
117 ################
118 # CREATE ENTRY #
119 ################
120
121 BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) {
122 "Create Entry from a database by id."
123
124 is.null(id) && is.null(content) && stop("One of id or content must be set.")
125 ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.")
126
127 # Debug
128 .self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(if (length(id) > 10) id[1:10] else id, collapse = ", ")), "..."))
129
130 # Get content
131 if ( ! is.null(id))
132 content <- .self$getEntryContent(class, id)
133 conn <- .self$getConn(class)
134 entry <- conn$createEntry(content = content, drop = drop)
135
136 # Set factory
137 .self$.print.debug.msg(paste0("Setting factory reference into entries..."))
138 for (e in c(entry))
139 if ( ! is.null(e))
140 e$setFactory(.self)
141
142 return(entry)
143 })
144
145 ########################
146 # GET CACHE FILE PATHS #
147 ########################
148
149 BiodbFactory$methods( .get.cache.file.paths = function(class, id) {
150
151 # Get extension
152 ext <- .self$getConn(class)$getEntryContentType()
153
154 # Set filenames
155 filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '')
156
157 # set file paths
158 file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '')
159
160 # Create cache dir if needed
161 if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir))
162 dir.create(.self$.cache.dir)
163
164 return(file.paths)
165 })
166
167 ###########################
168 # LOAD CONTENT FROM CACHE #
169 ###########################
170
171 BiodbFactory$methods( .load.content.from.cache = function(class, id) {
172
173 content <- NULL
174
175 # Read contents from files
176 file.paths <- .self$.get.cache.file.paths(class, id)
177 content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} )
178
179 return(content)
180 })
181
182 ############################
183 # IS CACHE READING ENABLED #
184 ############################
185
186 BiodbFactory$methods( .is.cache.reading.enabled = function() {
187 return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE))
188 })
189
190 ############################
191 # IS CACHE WRITING ENABLED #
192 ############################
193
194 BiodbFactory$methods( .is.cache.writing.enabled = function() {
195 return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE))
196 })
197
198 #########################
199 # SAVE CONTENT TO CACHE #
200 #########################
201
202 BiodbFactory$methods( .save.content.to.cache = function(class, id, content) {
203
204 # Write contents into files
205 file.paths <- .self$.get.cache.file.paths(class, id)
206 mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths)
207 })
208
209 #####################
210 # GET ENTRY CONTENT #
211 #####################
212
213 BiodbFactory$methods( getEntryContent = function(class, id) {
214
215 # Debug
216 .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)..."))
217
218 # Initialize content
219 if (.self$.is.cache.reading.enabled()) {
220 content <- .self$.load.content.from.cache(class, id)
221 missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)]
222 }
223 else {
224 content <- lapply(id, as.null)
225 missing.ids <- id
226 }
227
228 # Remove duplicates
229 n.duplicates <- sum(duplicated(missing.ids))
230 missing.ids <- missing.ids[ ! duplicated(missing.ids)]
231
232 # Debug
233 if (any(is.na(id)))
234 .self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA."))
235 if (.self$.is.cache.reading.enabled()) {
236 .self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache."))
237 if (n.duplicates > 0)
238 .self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates."))
239 .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched."))
240 }
241
242 # Get contents
243 if (length(missing.ids) > 0) {
244
245 # Use connector to get missing contents
246 conn <- .self$getConn(class)
247
248 # Divide list of missing ids in chunks (in order to save in cache regularly)
249 chunks.of.missing.ids = if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size))
250
251 # Loop on chunks
252 missing.contents <- NULL
253 for (ch.missing.ids in chunks.of.missing.ids) {
254
255 ch.missing.contents <- conn$getEntryContent(ch.missing.ids)
256
257 # Save to cache
258 if ( ! is.null(ch.missing.contents) && .self$.is.cache.writing.enabled())
259 .self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents)
260
261 # Append
262 missing.contents <- c(missing.contents, ch.missing.contents)
263
264 # Debug
265 if (.self$.is.cache.reading.enabled())
266 .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved..."))
94 } 267 }
95 268
96 return (.self$.conn[[class]]) 269 # Merge content and missing.contents
97 }) 270 content[id %in% missing.ids] <- vapply(id[id %in% missing.ids], function(x) missing.contents[missing.ids %in% x], FUN.VALUE = '')
98 271 }
99 ################ 272
100 # CREATE ENTRY # 273 return(content)
101 ################ 274 })
102
103 BiodbFactory$methods( createEntry = function(class, type, id = NULL, content = NULL, drop = TRUE) {
104
105 is.null(id) && is.null(content) && stop("One of id or content must be set.")
106 ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.")
107
108 # Debug
109 .self$.print.debug.msg(paste0("Creating entry from ", if (is.null(id)) "content" else paste("id", id), "..."))
110
111 # Get content
112 if ( ! is.null(id))
113 content <- .self$getEntryContent(class, type, id)
114
115 conn <- .self$getConn(class)
116 entry <- conn$createEntry(type = type, content = content, drop = drop)
117
118 # Set factory
119 for (e in c(entry))
120 e$setFactory(.self)
121
122 return(entry)
123 })
124
125 ########################
126 # GET CACHE FILE PATHS #
127 ########################
128
129 BiodbFactory$methods( .get.cache.file.paths = function(class, type, id) {
130
131 # Get extension
132 ext <- .self$getConn(class)$getEntryContentType(type)
133
134 # Set filenames
135 filenames <- vapply(id, function(x) paste0(class, '-', type, '-', x, '.', ext), FUN.VALUE = '')
136
137 # set file paths
138 file.paths <- vapply(filenames, function(x) file.path(.self$.cache.dir, x), FUN.VALUE = '')
139
140 # Create cache dir if needed
141 if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir))
142 dir.create(.self$.cache.dir)
143
144 return(file.paths)
145 })
146
147 ###########################
148 # LOAD CONTENT FROM CACHE #
149 ###########################
150
151 BiodbFactory$methods( .load.content.from.cache = function(class, type, id) {
152
153 content <- NULL
154
155 # Read contents from files
156 file.paths <- .self$.get.cache.file.paths(class, type, id)
157 content <- lapply(file.paths, function(x) { if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL })
158
159 return(content)
160 })
161
162 #########################
163 # SAVE CONTENT TO CACHE #
164 #########################
165
166 BiodbFactory$methods( .save.content.to.cache = function(class, type, id, content) {
167
168 # Write contents into files
169 file.paths <- .self$.get.cache.file.paths(class, type, id)
170 mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths)
171 })
172
173 #####################
174 # GET ENTRY CONTENT #
175 #####################
176
177 BiodbFactory$methods( getEntryContent = function(class, type, id, chunk.size = NA_integer_) {
178
179 # Debug
180 .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)..."))
181
182 content <- NULL
183 # Load from cache
184 if ( ! is.na(.self$.cache.dir))
185 content <- .self$.load.content.from.cache(class, type, id)
186
187 # Get list of missing contents
188 missing.content.indexes <- vapply(content, is.null, FUN.VALUE = TRUE)
189 missing.ids <- if (is.null(content)) id else id[missing.content.indexes]
190
191 # Debug
192 if ( ! is.na(.self$.cache.dir)) {
193 .self$.print.debug.msg(paste0(length(id) - length(missing.ids), " entry content(s) loaded from cache."))
194 .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched."))
195 }
196
197 # Get contents
198 if (length(missing.ids) > 0) {
199
200 # Use connector to get missing contents
201 conn <- .self$getConn(class)
202
203 # Divide list of missing ids in chunks (in order to save in cache regularly)
204 chunks.of.missing.ids = if (is.na(chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / chunk.size))
205
206 # Loop on chunks
207 missing.contents <- NULL
208 for (ch.missing.ids in chunks.of.missing.ids) {
209
210 ch.missing.contents <- conn$getEntryContent(type, ch.missing.ids)
211
212 # Save to cache
213 if ( ! is.null(ch.missing.contents) && ! is.na(.self$.cache.dir))
214 .self$.save.content.to.cache(class, type, ch.missing.ids, ch.missing.contents)
215
216 # Append
217 missing.contents <- c(missing.contents, ch.missing.contents)
218
219 # Debug
220 if ( ! is.na(.self$.cache.dir))
221 .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved..."))
222 }
223
224 # Merge content and missing.contents
225 if (is.null(content))
226 content <- missing.contents
227 else
228 content[missing.content.indexes] <- missing.contents
229 }
230
231 return(content)
232 })
233 }