Mercurial > repos > prog > lcmsmatching
comparison BiodbFactory.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 |
parents | fb9c0409d85c |
children |
comparison
equal
deleted
inserted
replaced
5:fb9c0409d85c | 6:f86fec07f392 |
---|---|
1 # vi: fdm=marker | |
2 | |
3 ########################## | |
4 # CLASS DECLARATION {{{1 # | |
5 ########################## | |
6 | |
7 BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character", | |
8 .conn = "list", | |
9 .cache.dir = "character", | |
10 .cache.mode = "character", | |
11 .debug = "logical", | |
12 .chunk.size = "integer", | |
13 .use.env.var = "logical")) | |
14 | |
15 ############### | |
16 # CONSTRUCTOR # | |
17 ############### | |
18 | |
19 BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) { | |
20 | |
21 .useragent <<- useragent | |
22 .conn <<- list() | |
23 .cache.dir <<- cache.dir | |
24 .cache.mode <<- cache.mode | |
25 .debug <<- debug | |
26 .chunk.size <<- as.integer(chunk.size) | |
27 .use.env.var <<- use.env.var | |
28 | |
29 callSuper(...) # calls super-class initializer with remaining parameters | |
30 }) | |
31 | |
32 ####################### | |
33 # PRINT DEBUG MESSAGE # | |
34 ####################### | |
35 | |
36 BiodbFactory$methods( .print.debug.msg = function(msg) { | |
37 if (.self$.debug) | |
38 .print.msg(msg = msg, class = class(.self)) | |
39 }) | |
40 | |
41 ################## | |
42 # GET USER AGENT # | |
43 ################## | |
44 | |
45 BiodbFactory$methods( getUserAgent = function() { | |
46 return(.self$.useragent) | |
47 }) | |
48 | |
49 ################## | |
50 # SET USER AGENT # | |
51 ################## | |
52 | |
53 BiodbFactory$methods( setUserAgent = function(useragent) { | |
54 "Set useragent of BiodbFactory." | |
55 .useragent <<- useragent | |
56 }) | |
57 | |
58 ############### | |
59 # CREATE CONN # | |
60 ############### | |
61 | |
62 BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) { | |
63 " Create connection to databases useful for metabolomics." | |
64 if (class %in% names(.self$.conn)) | |
65 stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.')) | |
66 | |
67 # Use environment variables | |
68 if (.self$.use.env.var) { | |
69 if (is.na(url)) | |
70 url <- .biodb.get.env.var(c(class, 'URL')) | |
71 if (is.na(token)) | |
72 token <- .biodb.get.env.var(c(class, 'TOKEN')) | |
73 } | |
74 | |
75 # Create connection instance | |
76 conn <- switch(class, | |
77 chebi = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
78 kegg = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
79 pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug), | |
80 pubchemsub = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug), | |
81 hmdb = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
82 chemspider = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token), | |
83 enzyme = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
84 lipidmaps = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
85 mirbase = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
86 ncbigene = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
87 ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
88 uniprot = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
89 massbank = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug), | |
90 massfiledb = MassFiledbConn$new(file = url, debug = .self$.debug), | |
91 peakforest = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug), | |
92 NULL) | |
93 | |
94 # Unknown class | |
95 if (is.null(conn)) | |
96 stop(paste0("Unknown r-biodb class \"", class,"\".")) | |
97 | |
98 # Register new class | |
99 .self$.conn[[class]] <- conn | |
100 | |
101 return (.self$.conn[[class]]) | |
102 }) | |
103 | |
104 ############ | |
105 # GET CONN # | |
106 ############ | |
107 | |
108 BiodbFactory$methods( getConn = function(class) { | |
109 "Get connection to a database." | |
110 | |
111 if ( ! class %in% names(.self$.conn)) | |
112 .self$createConn(class) | |
113 | |
114 return (.self$.conn[[class]]) | |
115 }) | |
116 | |
117 ################ | |
118 # CREATE ENTRY # | |
119 ################ | |
120 | |
121 BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) { | |
122 "Create Entry from a database by id." | |
123 | |
124 is.null(id) && is.null(content) && stop("One of id or content must be set.") | |
125 ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") | |
126 | |
127 # Debug | |
128 .self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(if (length(id) > 10) id[1:10] else id, collapse = ", ")), "...")) | |
129 | |
130 # Get content | |
131 if ( ! is.null(id)) | |
132 content <- .self$getEntryContent(class, id) | |
133 conn <- .self$getConn(class) | |
134 entry <- conn$createEntry(content = content, drop = drop) | |
135 | |
136 # Set factory | |
137 .self$.print.debug.msg(paste0("Setting factory reference into entries...")) | |
138 for (e in c(entry)) | |
139 if ( ! is.null(e)) | |
140 e$setFactory(.self) | |
141 | |
142 return(entry) | |
143 }) | |
144 | |
145 ######################## | |
146 # GET CACHE FILE PATHS # | |
147 ######################## | |
148 | |
149 BiodbFactory$methods( .get.cache.file.paths = function(class, id) { | |
150 | |
151 # Get extension | |
152 ext <- .self$getConn(class)$getEntryContentType() | |
153 | |
154 # Set filenames | |
155 filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '') | |
156 | |
157 # set file paths | |
158 file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '') | |
159 | |
160 # Create cache dir if needed | |
161 if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir)) | |
162 dir.create(.self$.cache.dir) | |
163 | |
164 return(file.paths) | |
165 }) | |
166 | |
167 ########################### | |
168 # LOAD CONTENT FROM CACHE # | |
169 ########################### | |
170 | |
171 BiodbFactory$methods( .load.content.from.cache = function(class, id) { | |
172 | |
173 content <- NULL | |
174 | |
175 # Read contents from files | |
176 file.paths <- .self$.get.cache.file.paths(class, id) | |
177 content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} ) | |
178 | |
179 return(content) | |
180 }) | |
181 | |
182 ############################ | |
183 # IS CACHE READING ENABLED # | |
184 ############################ | |
185 | |
186 BiodbFactory$methods( .is.cache.reading.enabled = function() { | |
187 return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE)) | |
188 }) | |
189 | |
190 ############################ | |
191 # IS CACHE WRITING ENABLED # | |
192 ############################ | |
193 | |
194 BiodbFactory$methods( .is.cache.writing.enabled = function() { | |
195 return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE)) | |
196 }) | |
197 | |
198 ######################### | |
199 # SAVE CONTENT TO CACHE # | |
200 ######################### | |
201 | |
202 BiodbFactory$methods( .save.content.to.cache = function(class, id, content) { | |
203 | |
204 # Write contents into files | |
205 file.paths <- .self$.get.cache.file.paths(class, id) | |
206 mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths) | |
207 }) | |
208 | |
209 ##################### | |
210 # GET ENTRY CONTENT # | |
211 ##################### | |
212 | |
213 BiodbFactory$methods( getEntryContent = function(class, id) { | |
214 | |
215 # Debug | |
216 .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) | |
217 | |
218 # Initialize content | |
219 if (.self$.is.cache.reading.enabled()) { | |
220 content <- .self$.load.content.from.cache(class, id) | |
221 missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)] | |
222 } | |
223 else { | |
224 content <- lapply(id, as.null) | |
225 missing.ids <- id | |
226 } | |
227 | |
228 # Remove duplicates | |
229 n.duplicates <- sum(duplicated(missing.ids)) | |
230 missing.ids <- missing.ids[ ! duplicated(missing.ids)] | |
231 | |
232 # Debug | |
233 if (any(is.na(id))) | |
234 .self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA.")) | |
235 if (.self$.is.cache.reading.enabled()) { | |
236 .self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache.")) | |
237 if (n.duplicates > 0) | |
238 .self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates.")) | |
239 .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) | |
240 } | |
241 | |
242 # Get contents | |
243 if (length(missing.ids) > 0) { | |
244 | |
245 # Use connector to get missing contents | |
246 conn <- .self$getConn(class) | |
247 | |
248 # Divide list of missing ids in chunks (in order to save in cache regularly) | |
249 chunks.of.missing.ids = if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size)) | |
250 | |
251 # Loop on chunks | |
252 missing.contents <- NULL | |
253 for (ch.missing.ids in chunks.of.missing.ids) { | |
254 | |
255 ch.missing.contents <- conn$getEntryContent(ch.missing.ids) | |
256 | |
257 # Save to cache | |
258 if ( ! is.null(ch.missing.contents) && .self$.is.cache.writing.enabled()) | |
259 .self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents) | |
260 | |
261 # Append | |
262 missing.contents <- c(missing.contents, ch.missing.contents) | |
263 | |
264 # Debug | |
265 if (.self$.is.cache.reading.enabled()) | |
266 .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) | |
267 } | |
268 | |
269 # Merge content and missing.contents | |
270 content[id %in% missing.ids] <- vapply(id[id %in% missing.ids], function(x) missing.contents[missing.ids %in% x], FUN.VALUE = '') | |
271 } | |
272 | |
273 return(content) | |
274 }) |