Mercurial > repos > prog > lcmsmatching
comparison BiodbFactory.R @ 2:20d69a062da3 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author | prog |
---|---|
date | Thu, 02 Mar 2017 08:55:00 -0500 |
parents | 253d531a0193 |
children |
comparison
equal
deleted
inserted
replaced
1:253d531a0193 | 2:20d69a062da3 |
---|---|
1 if ( ! exists('BiodbFactory')) { # Do not load again if already loaded | 1 # vi: fdm=marker |
2 | 2 |
3 library(methods) | 3 ########################## |
4 source('biodb-common.R') | 4 # CLASS DECLARATION {{{1 # |
5 source('ChebiConn.R') | 5 ########################## |
6 source('KeggConn.R') | 6 |
7 source('PubchemConn.R') | 7 BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character", |
8 source('HmdbConn.R') | 8 .conn = "list", |
9 source('ChemspiderConn.R') | 9 .cache.dir = "character", |
10 source('EnzymeConn.R') | 10 .cache.mode = "character", |
11 source('LipidmapsConn.R') | 11 .debug = "logical", |
12 source('MirbaseConn.R') | 12 .chunk.size = "integer", |
13 source('NcbigeneConn.R') | 13 .use.env.var = "logical")) |
14 source('NcbiccdsConn.R') | 14 |
15 source('UniprotConn.R') | 15 ############### |
16 source('MassbankConn.R') | 16 # CONSTRUCTOR # |
17 source('MassFiledbConn.R') | 17 ############### |
18 | 18 |
19 ##################### | 19 BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) { |
20 # CLASS DECLARATION # | 20 |
21 ##################### | 21 .useragent <<- useragent |
22 | 22 .conn <<- list() |
23 BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character", .debug = "logical")) | 23 .cache.dir <<- cache.dir |
24 | 24 .cache.mode <<- cache.mode |
25 ############### | 25 .debug <<- debug |
26 # CONSTRUCTOR # | 26 .chunk.size <<- as.integer(chunk.size) |
27 ############### | 27 .use.env.var <<- use.env.var |
28 | 28 |
29 BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, debug = FALSE, ...) { | 29 callSuper(...) # calls super-class initializer with remaining parameters |
30 | 30 }) |
31 .useragent <<- useragent | 31 |
32 .conn <<- list() | 32 ####################### |
33 .cache.dir <<- cache.dir | 33 # PRINT DEBUG MESSAGE # |
34 .debug <<- debug | 34 ####################### |
35 | 35 |
36 callSuper(...) # calls super-class initializer with remaining parameters | 36 BiodbFactory$methods( .print.debug.msg = function(msg) { |
37 }) | 37 if (.self$.debug) |
38 | 38 .print.msg(msg = msg, class = class(.self)) |
39 ####################### | 39 }) |
40 # PRINT DEBUG MESSAGE # | 40 |
41 ####################### | 41 ################## |
42 | 42 # GET USER AGENT # |
43 BiodbFactory$methods( .print.debug.msg = function(msg) { | 43 ################## |
44 if (.self$.debug) | 44 |
45 .print.msg(msg = msg, class = class(.self)) | 45 BiodbFactory$methods( getUserAgent = function() { |
46 }) | 46 return(.self$.useragent) |
47 | 47 }) |
48 ################## | 48 |
49 # GET USER AGENT # | 49 ################## |
50 ################## | 50 # SET USER AGENT # |
51 | 51 ################## |
52 BiodbFactory$methods( getUserAgent = function() { | |
53 return(.self$.useragent) | |
54 }) | |
55 | |
56 ################## | |
57 # SET USER AGENT # | |
58 ################## | |
59 | 52 |
60 BiodbFactory$methods( setUserAgent = function(useragent) { | 53 BiodbFactory$methods( setUserAgent = function(useragent) { |
61 .useragent <<- useragent | 54 "Set useragent of BiodbFactory." |
62 }) | 55 .useragent <<- useragent |
63 | 56 }) |
64 ############ | 57 |
65 # GET CONN # | 58 ############### |
66 ############ | 59 # CREATE CONN # |
67 | 60 ############### |
68 BiodbFactory$methods( getConn = function(class, url = NA_character_) { | 61 |
69 | 62 BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) { |
70 if ( ! class %in% names(.self$.conn)) { | 63 " Create connection to databases useful for metabolomics." |
71 | 64 if (class %in% names(.self$.conn)) |
72 # Create connection instance | 65 stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.')) |
73 conn <- switch(class, | 66 |
74 chebi = ChebiConn$new(useragent = .self$.useragent), | 67 # Use environment variables |
75 kegg = KeggConn$new(useragent = .self$.useragent), | 68 if (.self$.use.env.var) { |
76 pubchem = PubchemConn$new(useragent = .self$.useragent), | 69 if (is.na(url)) |
77 hmdb = HmdbConn$new(useragent = .self$.useragent), | 70 url <- .biodb.get.env.var(c(class, 'URL')) |
78 chemspider = ChemspiderConn$new(useragent = .self$.useragent), | 71 if (is.na(token)) |
79 enzyme = EnzymeConn$new(useragent = .self$.useragent), | 72 token <- .biodb.get.env.var(c(class, 'TOKEN')) |
80 lipidmaps = LipidmapsConn$new(useragent = .self$.useragent), | 73 } |
81 mirbase = MirbaseConn$new(useragent = .self$.useragent), | 74 |
82 ncbigene = NcbigeneConn$new(useragent = .self$.useragent), | 75 # Create connection instance |
83 ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent), | 76 conn <- switch(class, |
84 uniprot = UniprotConn$new(useragent = .self$.useragent), | 77 chebi = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug), |
85 massbank = MassbankConn$new(useragent = .self$.useragent, debug = .self$.debug), | 78 kegg = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug), |
86 massfiledb = MassFiledbConn$new(file = url), | 79 pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug), |
87 NULL) | 80 pubchemsub = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug), |
88 | 81 hmdb = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug), |
89 # Unknown class | 82 chemspider = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token), |
90 if (is.null(conn)) | 83 enzyme = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug), |
91 stop(paste0("Unknown r-biodb class \"", class,"\".")) | 84 lipidmaps = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug), |
92 | 85 mirbase = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug), |
93 .self$.conn[[class]] <- conn | 86 ncbigene = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug), |
87 ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
88 uniprot = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
89 massbank = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug), | |
90 massfiledb = MassFiledbConn$new(file = url, debug = .self$.debug), | |
91 peakforest = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
92 NULL) | |
93 | |
94 # Unknown class | |
95 if (is.null(conn)) | |
96 stop(paste0("Unknown r-biodb class \"", class,"\".")) | |
97 | |
98 # Register new class | |
99 .self$.conn[[class]] <- conn | |
100 | |
101 return (.self$.conn[[class]]) | |
102 }) | |
103 | |
104 ############ | |
105 # GET CONN # | |
106 ############ | |
107 | |
108 BiodbFactory$methods( getConn = function(class) { | |
109 "Get connection to a database." | |
110 | |
111 if ( ! class %in% names(.self$.conn)) | |
112 .self$createConn(class) | |
113 | |
114 return (.self$.conn[[class]]) | |
115 }) | |
116 | |
117 ################ | |
118 # CREATE ENTRY # | |
119 ################ | |
120 | |
121 BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) { | |
122 "Create Entry from a database by id." | |
123 | |
124 is.null(id) && is.null(content) && stop("One of id or content must be set.") | |
125 ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") | |
126 | |
127 # Debug | |
128 .self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(if (length(id) > 10) id[1:10] else id, collapse = ", ")), "...")) | |
129 | |
130 # Get content | |
131 if ( ! is.null(id)) | |
132 content <- .self$getEntryContent(class, id) | |
133 conn <- .self$getConn(class) | |
134 entry <- conn$createEntry(content = content, drop = drop) | |
135 | |
136 # Set factory | |
137 .self$.print.debug.msg(paste0("Setting factory reference into entries...")) | |
138 for (e in c(entry)) | |
139 if ( ! is.null(e)) | |
140 e$setFactory(.self) | |
141 | |
142 return(entry) | |
143 }) | |
144 | |
145 ######################## | |
146 # GET CACHE FILE PATHS # | |
147 ######################## | |
148 | |
149 BiodbFactory$methods( .get.cache.file.paths = function(class, id) { | |
150 | |
151 # Get extension | |
152 ext <- .self$getConn(class)$getEntryContentType() | |
153 | |
154 # Set filenames | |
155 filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '') | |
156 | |
157 # set file paths | |
158 file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '') | |
159 | |
160 # Create cache dir if needed | |
161 if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir)) | |
162 dir.create(.self$.cache.dir) | |
163 | |
164 return(file.paths) | |
165 }) | |
166 | |
167 ########################### | |
168 # LOAD CONTENT FROM CACHE # | |
169 ########################### | |
170 | |
171 BiodbFactory$methods( .load.content.from.cache = function(class, id) { | |
172 | |
173 content <- NULL | |
174 | |
175 # Read contents from files | |
176 file.paths <- .self$.get.cache.file.paths(class, id) | |
177 content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} ) | |
178 | |
179 return(content) | |
180 }) | |
181 | |
182 ############################ | |
183 # IS CACHE READING ENABLED # | |
184 ############################ | |
185 | |
186 BiodbFactory$methods( .is.cache.reading.enabled = function() { | |
187 return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE)) | |
188 }) | |
189 | |
190 ############################ | |
191 # IS CACHE WRITING ENABLED # | |
192 ############################ | |
193 | |
194 BiodbFactory$methods( .is.cache.writing.enabled = function() { | |
195 return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE)) | |
196 }) | |
197 | |
198 ######################### | |
199 # SAVE CONTENT TO CACHE # | |
200 ######################### | |
201 | |
202 BiodbFactory$methods( .save.content.to.cache = function(class, id, content) { | |
203 | |
204 # Write contents into files | |
205 file.paths <- .self$.get.cache.file.paths(class, id) | |
206 mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths) | |
207 }) | |
208 | |
209 ##################### | |
210 # GET ENTRY CONTENT # | |
211 ##################### | |
212 | |
213 BiodbFactory$methods( getEntryContent = function(class, id) { | |
214 | |
215 # Debug | |
216 .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) | |
217 | |
218 # Initialize content | |
219 if (.self$.is.cache.reading.enabled()) { | |
220 content <- .self$.load.content.from.cache(class, id) | |
221 missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)] | |
222 } | |
223 else { | |
224 content <- lapply(id, as.null) | |
225 missing.ids <- id | |
226 } | |
227 | |
228 # Remove duplicates | |
229 n.duplicates <- sum(duplicated(missing.ids)) | |
230 missing.ids <- missing.ids[ ! duplicated(missing.ids)] | |
231 | |
232 # Debug | |
233 if (any(is.na(id))) | |
234 .self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA.")) | |
235 if (.self$.is.cache.reading.enabled()) { | |
236 .self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache.")) | |
237 if (n.duplicates > 0) | |
238 .self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates.")) | |
239 .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) | |
240 } | |
241 | |
242 # Get contents | |
243 if (length(missing.ids) > 0) { | |
244 | |
245 # Use connector to get missing contents | |
246 conn <- .self$getConn(class) | |
247 | |
248 # Divide list of missing ids in chunks (in order to save in cache regularly) | |
249 chunks.of.missing.ids = if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size)) | |
250 | |
251 # Loop on chunks | |
252 missing.contents <- NULL | |
253 for (ch.missing.ids in chunks.of.missing.ids) { | |
254 | |
255 ch.missing.contents <- conn$getEntryContent(ch.missing.ids) | |
256 | |
257 # Save to cache | |
258 if ( ! is.null(ch.missing.contents) && .self$.is.cache.writing.enabled()) | |
259 .self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents) | |
260 | |
261 # Append | |
262 missing.contents <- c(missing.contents, ch.missing.contents) | |
263 | |
264 # Debug | |
265 if (.self$.is.cache.reading.enabled()) | |
266 .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) | |
94 } | 267 } |
95 | 268 |
96 return (.self$.conn[[class]]) | 269 # Merge content and missing.contents |
97 }) | 270 content[id %in% missing.ids] <- vapply(id[id %in% missing.ids], function(x) missing.contents[missing.ids %in% x], FUN.VALUE = '') |
98 | 271 } |
99 ################ | 272 |
100 # CREATE ENTRY # | 273 return(content) |
101 ################ | 274 }) |
102 | |
103 BiodbFactory$methods( createEntry = function(class, type, id = NULL, content = NULL, drop = TRUE) { | |
104 | |
105 is.null(id) && is.null(content) && stop("One of id or content must be set.") | |
106 ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") | |
107 | |
108 # Debug | |
109 .self$.print.debug.msg(paste0("Creating entry from ", if (is.null(id)) "content" else paste("id", id), "...")) | |
110 | |
111 # Get content | |
112 if ( ! is.null(id)) | |
113 content <- .self$getEntryContent(class, type, id) | |
114 | |
115 conn <- .self$getConn(class) | |
116 entry <- conn$createEntry(type = type, content = content, drop = drop) | |
117 | |
118 # Set factory | |
119 for (e in c(entry)) | |
120 e$setFactory(.self) | |
121 | |
122 return(entry) | |
123 }) | |
124 | |
125 ######################## | |
126 # GET CACHE FILE PATHS # | |
127 ######################## | |
128 | |
129 BiodbFactory$methods( .get.cache.file.paths = function(class, type, id) { | |
130 | |
131 # Get extension | |
132 ext <- .self$getConn(class)$getEntryContentType(type) | |
133 | |
134 # Set filenames | |
135 filenames <- vapply(id, function(x) paste0(class, '-', type, '-', x, '.', ext), FUN.VALUE = '') | |
136 | |
137 # set file paths | |
138 file.paths <- vapply(filenames, function(x) file.path(.self$.cache.dir, x), FUN.VALUE = '') | |
139 | |
140 # Create cache dir if needed | |
141 if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir)) | |
142 dir.create(.self$.cache.dir) | |
143 | |
144 return(file.paths) | |
145 }) | |
146 | |
147 ########################### | |
148 # LOAD CONTENT FROM CACHE # | |
149 ########################### | |
150 | |
151 BiodbFactory$methods( .load.content.from.cache = function(class, type, id) { | |
152 | |
153 content <- NULL | |
154 | |
155 # Read contents from files | |
156 file.paths <- .self$.get.cache.file.paths(class, type, id) | |
157 content <- lapply(file.paths, function(x) { if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL }) | |
158 | |
159 return(content) | |
160 }) | |
161 | |
162 ######################### | |
163 # SAVE CONTENT TO CACHE # | |
164 ######################### | |
165 | |
166 BiodbFactory$methods( .save.content.to.cache = function(class, type, id, content) { | |
167 | |
168 # Write contents into files | |
169 file.paths <- .self$.get.cache.file.paths(class, type, id) | |
170 mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths) | |
171 }) | |
172 | |
173 ##################### | |
174 # GET ENTRY CONTENT # | |
175 ##################### | |
176 | |
177 BiodbFactory$methods( getEntryContent = function(class, type, id, chunk.size = NA_integer_) { | |
178 | |
179 # Debug | |
180 .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) | |
181 | |
182 content <- NULL | |
183 # Load from cache | |
184 if ( ! is.na(.self$.cache.dir)) | |
185 content <- .self$.load.content.from.cache(class, type, id) | |
186 | |
187 # Get list of missing contents | |
188 missing.content.indexes <- vapply(content, is.null, FUN.VALUE = TRUE) | |
189 missing.ids <- if (is.null(content)) id else id[missing.content.indexes] | |
190 | |
191 # Debug | |
192 if ( ! is.na(.self$.cache.dir)) { | |
193 .self$.print.debug.msg(paste0(length(id) - length(missing.ids), " entry content(s) loaded from cache.")) | |
194 .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) | |
195 } | |
196 | |
197 # Get contents | |
198 if (length(missing.ids) > 0) { | |
199 | |
200 # Use connector to get missing contents | |
201 conn <- .self$getConn(class) | |
202 | |
203 # Divide list of missing ids in chunks (in order to save in cache regularly) | |
204 chunks.of.missing.ids = if (is.na(chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / chunk.size)) | |
205 | |
206 # Loop on chunks | |
207 missing.contents <- NULL | |
208 for (ch.missing.ids in chunks.of.missing.ids) { | |
209 | |
210 ch.missing.contents <- conn$getEntryContent(type, ch.missing.ids) | |
211 | |
212 # Save to cache | |
213 if ( ! is.null(ch.missing.contents) && ! is.na(.self$.cache.dir)) | |
214 .self$.save.content.to.cache(class, type, ch.missing.ids, ch.missing.contents) | |
215 | |
216 # Append | |
217 missing.contents <- c(missing.contents, ch.missing.contents) | |
218 | |
219 # Debug | |
220 if ( ! is.na(.self$.cache.dir)) | |
221 .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) | |
222 } | |
223 | |
224 # Merge content and missing.contents | |
225 if (is.null(content)) | |
226 content <- missing.contents | |
227 else | |
228 content[missing.content.indexes] <- missing.contents | |
229 } | |
230 | |
231 return(content) | |
232 }) | |
233 } |