comparison biodb-common.R @ 6:f86fec07f392 draft default tip

planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author prog
date Fri, 22 Feb 2019 16:04:22 -0500
parents fb9c0409d85c
children
comparison
equal deleted inserted replaced
5:fb9c0409d85c 6:f86fec07f392
1 if ( ! exists('BIODB.XML')) {
2
###############
# CACHE MODES #
###############

# Identifiers of the cache modes.
BIODB.CACHE.READ.ONLY  <- "read-only"
BIODB.CACHE.READ.WRITE <- "read-write"
BIODB.CACHE.WRITE.ONLY <- "write-only"
10
#######################
# ENTRY CONTENT TYPES #
#######################

# Identifiers of the content types in which an entry can be requested.
BIODB.HTML      <- "html"
BIODB.TXT       <- "txt"
BIODB.XML       <- "xml"
BIODB.CSV       <- "csv"
BIODB.DATAFRAME <- "dataframe"
BIODB.JSON      <- "json"
21
#############
# DATABASES #
#############

# One identifier per database.
BIODB.CHEBI       <- "chebi"
BIODB.KEGG        <- "kegg"
BIODB.PUBCHEMCOMP <- "pubchemcomp" # Compound database
BIODB.PUBCHEMSUB  <- "pubchemsub"  # Substance database
BIODB.HMDB        <- "hmdb"
BIODB.CHEMSPIDER  <- "chemspider"
BIODB.ENZYME      <- "enzyme"
BIODB.LIPIDMAPS   <- "lipidmaps"
BIODB.MIRBASE     <- "mirbase"
BIODB.NCBIGENE    <- "ncbigene"
BIODB.NCBICCDS    <- "ncbiccds"
BIODB.UNIPROT     <- "uniprot"
BIODB.MASSBANK    <- "massbank"
BIODB.MASSFILEDB  <- "massfiledb"
BIODB.PEAKFOREST  <- "peakforest"

# Vector of all the database identifiers defined above, in declaration order.
BIODB.DATABASES <- c(
  BIODB.CHEBI,
  BIODB.KEGG,
  BIODB.PUBCHEMCOMP,
  BIODB.PUBCHEMSUB,
  BIODB.HMDB,
  BIODB.CHEMSPIDER,
  BIODB.ENZYME,
  BIODB.LIPIDMAPS,
  BIODB.MIRBASE,
  BIODB.NCBIGENE,
  BIODB.NCBICCDS,
  BIODB.UNIPROT,
  BIODB.MASSBANK,
  BIODB.MASSFILEDB,
  BIODB.PEAKFOREST
)
43
##########
# FIELDS #
##########

# Names of the entry fields.

# Accession, descriptions and names
BIODB.ACCESSION               <- "accession"
BIODB.DESCRIPTION             <- "description"
BIODB.PROTEIN.DESCRIPTION     <- "protdesc"
BIODB.NAME                    <- "name"
BIODB.COMP.IUPAC.NAME.ALLOWED <- "comp.iupac.name.allowed"
BIODB.COMP.IUPAC.NAME.TRAD    <- "comp.iupac.name.trad"
BIODB.COMP.IUPAC.NAME.SYST    <- "comp.iupac.name.syst"
BIODB.COMP.IUPAC.NAME.PREF    <- "comp.iupac.name.pref"
BIODB.COMP.IUPAC.NAME.CAS     <- "comp.iupac.name.cas"
BIODB.FULLNAMES               <- "fullnames"
BIODB.SYNONYMS                <- "synonyms"
BIODB.SYMBOL                  <- "symbol"
BIODB.GENE.SYMBOLS            <- "genesymbols"

# Database identifiers
BIODB.CHEBI.ID       <- "chebiid"
BIODB.LIPIDMAPS.ID   <- "lipidmapsid"
BIODB.KEGG.ID        <- "keggid"
BIODB.HMDB.ID        <- "hmdbid"
BIODB.ENZYME.ID      <- "enzymeid"
BIODB.NCBI.CCDS.ID   <- "ncbiccdsid"
BIODB.NCBI.GENE.ID   <- "ncbigeneid"
BIODB.PUBCHEMCOMP.ID <- "pubchemcompid"
BIODB.PUBCHEMSUB.ID  <- "pubchemsubid"
BIODB.CHEMSPIDER.ID  <- "chemspiderid"
BIODB.UNIPROT.ID     <- "uniprotid"
BIODB.CAS.ID         <- "casid"
BIODB.PEAKFOREST.ID  <- "peakforestid"

# Structures
BIODB.SMILES   <- "smiles"
BIODB.INCHI    <- "inchi"
BIODB.INCHIKEY <- "inchikey"

# Mass spectrometry
BIODB.MSDEV       <- "msdev"
BIODB.MSDEVTYPE   <- "msdevtype"
BIODB.MSTYPE      <- "mstype"
BIODB.MSMODE      <- "msmode"
BIODB.MSPRECMZ    <- "msprecmz" # numeric
BIODB.MSPRECANNOT <- "msprecannot"

# Formula, masses, sequence
BIODB.FORMULA           <- "formula"
BIODB.SUPER.CLASS       <- "superclass"
BIODB.MASS              <- "mass"
BIODB.AVERAGE.MASS      <- "averagemass"
BIODB.MONOISOTOPIC.MASS <- "monoisotopicmass"
BIODB.SEQUENCE          <- "sequence"
BIODB.LOCATION          <- "location"
BIODB.LENGTH            <- "length"

# Peaks, compounds and chromatography
BIODB.NB.PEAKS      <- "nbpeaks"
BIODB.PEAKS         <- "peaks"
BIODB.COMPOUNDS     <- "compounds"
BIODB.NB.COMPOUNDS  <- "nbcompounds"
BIODB.COMPOUND.ID   <- "compoundid"
BIODB.COMPOUND.MASS <- "compoundmass"
BIODB.COMPOUND.COMP <- "compoundcomp"
BIODB.CHROM.COL     <- "chromcol"   # Chromatographic column
BIODB.CHROM.COL.RT  <- "chromcolrt" # Retention time measured on chromatographic column
BIODB.ID            <- "id"
BIODB.TITLE         <- "title"

# Peak columns. Note that BIODB.PEAK.FORMULA and BIODB.PEAK.MASS hold the
# same strings as BIODB.FORMULA and BIODB.MASS.
BIODB.PEAK.MZ                 <- "mz"
BIODB.PEAK.RT                 <- "rt"
BIODB.PEAK.MZEXP              <- "mzexp"
BIODB.PEAK.MZTHEO             <- "mztheo"
BIODB.PEAK.FORMULA            <- "formula"
BIODB.PEAK.FORMULA.COUNT      <- "formula.count"
BIODB.PEAK.COMP               <- "peakcomp" # Peak composition
BIODB.PEAK.ATTR               <- "peakattr" # Peak attribution
BIODB.PEAK.MASS               <- "mass"
# BIODB.PEAK.ATTR <- 'attr'
BIODB.PEAK.ERROR.PPM          <- "error.ppm"
BIODB.PEAK.INTENSITY          <- "intensity"
BIODB.PEAK.RELATIVE.INTENSITY <- "relative.intensity"
115
# Mode values
BIODB.MSMODE.NEG <- "neg"
BIODB.MSMODE.POS <- "pos"

# Tolerance values
BIODB.TOL             <- "mztol"
BIODB.MZTOLUNIT.PPM   <- "ppm"
BIODB.MZTOLUNIT.PLAIN <- "plain" # same as mz: mass-to-charge ratio
# The two tolerance units defined above, ppm first.
BIODB.MZTOLUNIT.VALS  <- c(
  BIODB.MZTOLUNIT.PPM,
  BIODB.MZTOLUNIT.PLAIN
)
125
########################
# MS-MS MEASURE VALUES #
########################

# Identifiers of the MS-MS distance measures.
BIODB.MSMS.DIST.COS     <- 'cosine'
BIODB.MSMS.DIST.WCOSINE <- 'wcosine'
BIODB.MSMS.DIST.PKERNEL <- 'pkernel'

# The three measures defined above, gathered in one vector.
BIODB.MSMS.DIST <- c(
  BIODB.MSMS.DIST.COS,
  BIODB.MSMS.DIST.WCOSINE,
  BIODB.MSMS.DIST.PKERNEL
)
134
135
#################
# CARDINALITIES #
#################

# Cardinality markers used in the field attributes table.
BIODB.CARD.ONE  <- "1"
BIODB.CARD.MANY <- "*"
142
#######################
# INTENSITY NOTATIONS #
#######################

# Names of the two intensity columns (absolute, then relative).
BIODB.GROUP.INTENSITY <- c(
  BIODB.PEAK.INTENSITY,
  BIODB.PEAK.RELATIVE.INTENSITY
)
148
##########################
# ENTRY FIELD ATTRIBUTES #
##########################

# Table of field attributes, one row per field. The values are laid out row by
# row below and reshaped into a four-column character data frame whose columns
# are named 'name', 'class', 'cardinality' and 'type'.
#
# FIELD NAME                    CLASS         CARDINALITY      TYPE
BIODB.FIELDS <- data.frame(
  matrix(
    c(
      BIODB.ACCESSION,               'character',  BIODB.CARD.ONE,  'none',
      BIODB.DESCRIPTION,             'character',  BIODB.CARD.ONE,  'none',
      BIODB.NAME,                    'character',  BIODB.CARD.ONE,  'name',
      BIODB.COMP.IUPAC.NAME.ALLOWED, 'character',  BIODB.CARD.ONE,  'name',
      BIODB.COMP.IUPAC.NAME.TRAD,    'character',  BIODB.CARD.ONE,  'name',
      BIODB.COMP.IUPAC.NAME.SYST,    'character',  BIODB.CARD.ONE,  'name',
      BIODB.COMP.IUPAC.NAME.PREF,    'character',  BIODB.CARD.ONE,  'name',
      BIODB.COMP.IUPAC.NAME.CAS,     'character',  BIODB.CARD.ONE,  'name',
      BIODB.FULLNAMES,               'character',  BIODB.CARD.MANY, 'name',
      BIODB.SYNONYMS,                'character',  BIODB.CARD.MANY, 'name',
      BIODB.PROTEIN.DESCRIPTION,     'character',  BIODB.CARD.ONE,  'none',
      BIODB.SYMBOL,                  'character',  BIODB.CARD.ONE,  'none',
      BIODB.GENE.SYMBOLS,            'character',  BIODB.CARD.MANY, 'none',
      BIODB.NB.COMPOUNDS,            'integer',    BIODB.CARD.ONE,  'none',
      BIODB.COMPOUNDS,               'object',     BIODB.CARD.MANY, 'none',
      BIODB.CHEBI.ID,                'character',  BIODB.CARD.ONE,  'none',
      BIODB.LIPIDMAPS.ID,            'character',  BIODB.CARD.ONE,  'none',
      BIODB.KEGG.ID,                 'character',  BIODB.CARD.ONE,  'none',
      BIODB.HMDB.ID,                 'character',  BIODB.CARD.ONE,  'none',
      BIODB.ENZYME.ID,               'character',  BIODB.CARD.ONE,  'none',
      BIODB.PUBCHEMCOMP.ID,          'character',  BIODB.CARD.ONE,  'none',
      BIODB.PUBCHEMSUB.ID,           'character',  BIODB.CARD.ONE,  'none',
      BIODB.PEAKFOREST.ID,           'character',  BIODB.CARD.ONE,  'none',
      BIODB.UNIPROT.ID,              'character',  BIODB.CARD.ONE,  'none',
      BIODB.NCBI.CCDS.ID,            'character',  BIODB.CARD.ONE,  'none',
      BIODB.NCBI.GENE.ID,            'character',  BIODB.CARD.ONE,  'none',
      BIODB.INCHI,                   'character',  BIODB.CARD.ONE,  'none',
      BIODB.INCHIKEY,                'character',  BIODB.CARD.ONE,  'none',
      BIODB.MSDEV,                   'character',  BIODB.CARD.ONE,  'none',
      BIODB.MSDEVTYPE,               'character',  BIODB.CARD.ONE,  'none',
      BIODB.MSTYPE,                  'character',  BIODB.CARD.ONE,  'none',
      BIODB.MSMODE,                  'character',  BIODB.CARD.ONE,  'none',
      BIODB.MSPRECMZ,                'double',     BIODB.CARD.ONE,  'none',
      BIODB.PEAK.MZTHEO,             'double',     BIODB.CARD.ONE,  'none',
      BIODB.MSPRECANNOT,             'character',  BIODB.CARD.ONE,  'none',
      BIODB.FORMULA,                 'character',  BIODB.CARD.ONE,  'none',
      BIODB.SUPER.CLASS,             'character',  BIODB.CARD.ONE,  'none',
      BIODB.MASS,                    'double',     BIODB.CARD.ONE,  'none',
      BIODB.AVERAGE.MASS,            'double',     BIODB.CARD.ONE,  'none',
      BIODB.MONOISOTOPIC.MASS,       'double',     BIODB.CARD.ONE,  'none',
      BIODB.SEQUENCE,                'character',  BIODB.CARD.ONE,  'none',
      BIODB.LENGTH,                  'integer',    BIODB.CARD.ONE,  'none',
      BIODB.LOCATION,                'character',  BIODB.CARD.ONE,  'none',
      BIODB.NB.PEAKS,                'integer',    BIODB.CARD.ONE,  'none',
      BIODB.PEAKS,                   'data.frame', BIODB.CARD.ONE,  'none',
      BIODB.SMILES,                  'character',  BIODB.CARD.ONE,  'none',
      BIODB.CHEMSPIDER.ID,           'character',  BIODB.CARD.ONE,  'none',
      BIODB.CAS.ID,                  'character',  BIODB.CARD.ONE,  'none'
    ),
    byrow = TRUE,
    ncol = 4,
    # Column names are attached to the matrix so the data frame inherits them.
    dimnames = list(NULL, c('name', 'class', 'cardinality', 'type'))
  ),
  stringsAsFactors = FALSE
)
204
#########################
# GET DATABASE ID FIELD #
#########################

# Get the name of the field holding the identifier of a database.
#
# database  A database identifier.
#
# Returns NA_character_ when `database` is not listed in BIODB.DATABASES,
# otherwise the database identifier suffixed with 'id'. Stops with an error
# when that field name is not declared in the 'name' column of BIODB.FIELDS.
biodb.get.database.id.field <- function(database) {

  # Unknown database: no ID field to return
  if ( ! database %in% BIODB.DATABASES)
    return(NA_character_)

  # The ID field name is derived from the database identifier
  candidate <- paste0(database, 'id')
  declared <- candidate %in% BIODB.FIELDS[['name']]
  if ( ! declared)
    stop(paste0('No ID field defined for database ', database, '.'))

  return(candidate)
}
221
#####################
# COMPUTABLE FIELDS #
#####################

# Named list: each element is named after a field and holds a vector of
# database identifiers.
BIODB.FIELD.COMPUTING <- list(
  c(BIODB.CHEBI),
  c(BIODB.CHEBI),
  c(BIODB.NCBICCDS)
)
names(BIODB.FIELD.COMPUTING) <- c(BIODB.INCHI, BIODB.INCHIKEY, BIODB.SEQUENCE)
230
####################
# PEAKS DATA FRAME #
####################

# Example
# Empty (zero-row) data frame showing the columns of a peak table and their
# classes. The short argument names below are placeholders: the real column
# names are set right after from the BIODB.PEAK.* constants.
# Every column must be passed with `=`: using `<-` inside the call would
# create a stray `formula.count` variable in the enclosing environment.
BIODB.PEAK.DF.EXAMPLE <- data.frame(
  mz = double(),
  int = double(),
  rel.int = integer(),
  formula = character(),
  formula.count = integer(),
  mass = double(),
  error = double(),
  stringsAsFactors = FALSE
)
colnames(BIODB.PEAK.DF.EXAMPLE) <- c(
  BIODB.PEAK.MZ,
  BIODB.PEAK.INTENSITY,
  BIODB.PEAK.RELATIVE.INTENSITY,
  BIODB.PEAK.FORMULA,
  BIODB.PEAK.FORMULA.COUNT,
  BIODB.PEAK.MASS,
  BIODB.PEAK.ERROR.PPM
)
238
#################
# GET ENTRY URL #
#################

# TODO Let the choice to use either jp or eu
BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/"
BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/"

# Build the URL(s) for getting one or several entries of a database.
#
# class         Database identifier (the switch below handles the values
#               listed in BIODB.DATABASES, except 'massfiledb').
# accession     Character vector of accession ids. More than one id is only
#               accepted for MassBank, ChemSpider, PubChem (compound and
#               substance) and PeakForest.
# content.type  Content type of the entry to get (html, txt, xml, csv or
#               json, depending on the database).
# base.url      Base URL of the web service. Only used for MassBank, where
#               NA means BIODB.MASSBANK.EU.WS.URL.
# token         Access token. Only used for the ChemSpider XML and the
#               PeakForest JSON URLs; left out of the URL when NA.
#
# Returns a character vector of URLs: a single URL when the ids are combined
# into one request, one URL per accession otherwise. Returns NULL when the
# class is unknown or the content type is not handled for this class. Stops
# with an error when several accessions are given for a class that does not
# accept them.
.do.get.entry.url <- function(class, accession, content.type = BIODB.HTML, base.url = NA_character_, token = NA_character_) {

  # Only certain databases can handle multiple accession ids
  multi.id.classes <- c(BIODB.MASSBANK, BIODB.CHEMSPIDER, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.PEAKFOREST)
  if ( ! class %in% multi.id.classes && length(accession) > 1)
    stop(paste0("Cannot build a URL for getting multiple entries for class ", class, "."))

  # Get URL
  url <- switch(class,
    chebi = if (content.type == BIODB.HTML) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL,
    chemspider = {
      token.param <- if (is.na(token)) '' else paste('&token', token, sep = '=')
      switch(content.type,
        html = paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html'),
        xml = paste0('http://www.chemspider.com/MassSpecAPI.asmx/GetExtendedCompoundInfoArray?', paste(paste0('CSIDs=', accession), collapse = '&'), token.param),
        NULL)
    },
    enzyme = if (content.type == BIODB.TXT) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL,
    hmdb = switch(content.type,
      xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'),
      html = paste0('http://www.hmdb.ca/metabolites/', accession),
      NULL),
    kegg = switch(content.type,
      txt = paste0('http://rest.kegg.jp/get/', accession),
      html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession),
      NULL),
    lipidmaps = if (content.type == BIODB.CSV) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL,
    massbank = if (content.type == BIODB.TXT) paste0((if (is.na(base.url)) BIODB.MASSBANK.EU.WS.URL else base.url), 'getRecordInfo?ids=', paste(accession, collapse = ',')) else NULL,
    mirbase = if (content.type == BIODB.HTML) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL,
    pubchemcomp = switch(content.type,
      xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/', paste(accession, collapse = ','), '/XML'),
      html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession),
      NULL),
    pubchemsub = switch(content.type,
      xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/', paste(accession, collapse = ','), '/XML'),
      html = paste0('http://pubchem.ncbi.nlm.nih.gov/substance/', accession),
      NULL),
    ncbigene = if (content.type == BIODB.XML) paste0('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL,
    ncbiccds = if (content.type == BIODB.HTML) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession) else NULL,
    uniprot = if (content.type == BIODB.XML) paste0('http://www.uniprot.org/uniprot/', accession, '.xml') else NULL,
    peakforest = switch(content.type,
      html = paste0('https://peakforest.org/home?PFs=', accession),
      json = {
        # As for ChemSpider, do not write a literal 'NA' token in the URL
        token.param <- if (is.na(token)) '' else paste0('?token=', token)
        # `collapse` (not `sep`) is what joins several ids into one request
        paste0('https://peakforest-alpha.inra.fr/rest/spectra/lcms/ids/', paste(accession, collapse = ','), token.param)
      },
      NULL)
  )

  return(url)
}
295
# Build the URL(s) for getting entries, optionally limiting the URL length.
#
# class, accession, content.type, base.url, token
#               Forwarded unchanged to .do.get.entry.url().
# max.length    Maximum number of characters allowed for a URL. 0 (the
#               default) means no limit.
#
# Returns NULL when `accession` is empty or when no URL can be built.
# Otherwise, when max.length is 0, returns the URL(s) for all accessions.
# When max.length is greater than 0, returns a list with two elements:
#   url  the URL(s) built from the first `n` accessions,
#   n    the number of accessions used. This is the largest number of leading
#        accessions whose URL(s) fit in max.length, or 1 when even a single
#        accession does not fit.
get.entry.url <- function(class, accession, content.type = BIODB.HTML, max.length = 0, base.url = NA_character_, token = NA_character_) {

  if (length(accession) == 0)
    return(NULL)

  # Build the URL(s) for a subset of the accessions
  build.url <- function(ids) {
    .do.get.entry.url(class, ids, content.type = content.type, base.url = base.url, token = token)
  }

  # TRUE when every URL respects the length limit. `all` is needed because one
  # URL per accession may be returned, and `if` requires a single logical.
  fits <- function(url) {
    all(nchar(url) <= max.length)
  }

  full.url <- build.url(accession)

  # No limit, or no URL for this class and content type: nothing to shorten.
  # Testing for NULL here avoids evaluating a zero-length condition below.
  if (max.length == 0 || is.null(full.url))
    return(full.url)

  if (fits(full.url))
    return(list(url = full.url, n = length(accession)))

  # Find max size URL by bisection: accession[1:b] is known not to fit
  a <- 1
  b <- length(accession)
  while (a < b) {
    m <- as.integer((a + b) / 2)
    # When m equals a the interval cannot shrink further from below, so the
    # URL does not need to be built.
    if (m != a && fits(build.url(accession[seq_len(m)])))
      a <- m
    else
      b <- m
  }

  return(list(url = build.url(accession[seq_len(a)]), n = a))
}
320
#################
# PRINT MESSAGE #
#################

# Message levels: BIODB.DEBUG is an index into BIODB.LEVEL.NAMES.
BIODB.DEBUG <- 1
BIODB.LEVEL.NAMES <- c('DEBUG')

# Write a message on the standard error stream.
#
# msg    The text of the message.
# level  Index of the level name inside BIODB.LEVEL.NAMES.
# class  Optional name written after the level name; left out when NA.
#
# The line written has the form "LEVEL: msg" or "LEVEL, class: msg".
.print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) {
  level.name <- BIODB.LEVEL.NAMES[[level]]
  origin <- if (is.na(class)) '' else paste0(", ", class)
  line <- paste0(level.name, origin, ": ", msg, "\n")
  cat(line, file = stderr())
}
331
#####################
# BIODB GET ENV VAR #
#####################

# Get the value of a BIODB environment variable.
#
# v  Character vector of name parts. The parts are upper-cased and joined with
#    '_' after a 'BIODB' prefix, e.g. 'cache' gives 'BIODB_CACHE'.
#
# Returns the value of the environment variable, or NA_character_ when it is
# not set.
.biodb.get.env.var <- function(v) {

  # Make env var name
  name.parts <- c('BIODB', toupper(v))
  var.name <- paste(name.parts, collapse = '_')

  # Look up this single variable; `unset` supplies the value used when the
  # variable does not exist
  value <- Sys.getenv(var.name, unset = NA_character_)

  return(value)
}
350 }