Mercurial > repos > prog > lcmsmatching
comparison biodb-common.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 |
parents | fb9c0409d85c |
children |
comparison
equal
deleted
inserted
replaced
5:fb9c0409d85c | 6:f86fec07f392 |
---|---|
1 if ( ! exists('BIODB.XML')) { | |
2 | |
# --- Cache modes -------------------------------------------------------------
# String identifiers for the three cache modes.

BIODB.CACHE.READ.ONLY  <- "read-only"
BIODB.CACHE.READ.WRITE <- "read-write"
BIODB.CACHE.WRITE.ONLY <- "write-only"
10 | |
# --- Entry content types -----------------------------------------------------
# String identifiers for the content types an entry can be requested in.
# They are used as `content.type` values when building entry URLs.

BIODB.HTML      <- "html"
BIODB.TXT       <- "txt"
BIODB.XML       <- "xml"
BIODB.CSV       <- "csv"
BIODB.DATAFRAME <- "dataframe"
BIODB.JSON      <- "json"
21 | |
# --- Databases ---------------------------------------------------------------
# One string identifier per supported database.

BIODB.CHEBI       <- "chebi"
BIODB.KEGG        <- "kegg"
BIODB.PUBCHEMCOMP <- "pubchemcomp" # Compound database
BIODB.PUBCHEMSUB  <- "pubchemsub"  # Substance database
BIODB.HMDB        <- "hmdb"
BIODB.CHEMSPIDER  <- "chemspider"
BIODB.ENZYME      <- "enzyme"
BIODB.LIPIDMAPS   <- "lipidmaps"
BIODB.MIRBASE     <- "mirbase"
BIODB.NCBIGENE    <- "ncbigene"
BIODB.NCBICCDS    <- "ncbiccds"
BIODB.UNIPROT     <- "uniprot"
BIODB.MASSBANK    <- "massbank"
BIODB.MASSFILEDB  <- "massfiledb"
BIODB.PEAKFOREST  <- "peakforest"

# Vector of all database identifiers declared above, in declaration order.
BIODB.DATABASES <- c(
  BIODB.CHEBI,
  BIODB.KEGG,
  BIODB.PUBCHEMCOMP,
  BIODB.PUBCHEMSUB,
  BIODB.HMDB,
  BIODB.CHEMSPIDER,
  BIODB.ENZYME,
  BIODB.LIPIDMAPS,
  BIODB.MIRBASE,
  BIODB.NCBIGENE,
  BIODB.NCBICCDS,
  BIODB.UNIPROT,
  BIODB.MASSBANK,
  BIODB.MASSFILEDB,
  BIODB.PEAKFOREST
)
43 | |
# --- Fields ------------------------------------------------------------------
# String identifiers of entry fields and of peak-table columns.
# Note that some constants deliberately or accidentally share a value:
# BIODB.PEAK.FORMULA == BIODB.FORMULA and BIODB.PEAK.MASS == BIODB.MASS.

# General description
BIODB.ACCESSION           <- "accession"
BIODB.DESCRIPTION         <- "description"
BIODB.PROTEIN.DESCRIPTION <- "protdesc"
BIODB.ID                  <- "id"
BIODB.TITLE               <- "title"

# Names and symbols
BIODB.NAME                    <- "name"
BIODB.COMP.IUPAC.NAME.ALLOWED <- "comp.iupac.name.allowed"
BIODB.COMP.IUPAC.NAME.TRAD    <- "comp.iupac.name.trad"
BIODB.COMP.IUPAC.NAME.SYST    <- "comp.iupac.name.syst"
BIODB.COMP.IUPAC.NAME.PREF    <- "comp.iupac.name.pref"
BIODB.COMP.IUPAC.NAME.CAS     <- "comp.iupac.name.cas"
BIODB.FULLNAMES               <- "fullnames"
BIODB.SYNONYMS                <- "synonyms"
BIODB.SYMBOL                  <- "symbol"
BIODB.GENE.SYMBOLS            <- "genesymbols"

# Identifiers in external databases
BIODB.CHEBI.ID       <- "chebiid"
BIODB.LIPIDMAPS.ID   <- "lipidmapsid"
BIODB.KEGG.ID        <- "keggid"
BIODB.HMDB.ID        <- "hmdbid"
BIODB.ENZYME.ID      <- "enzymeid"
BIODB.NCBI.CCDS.ID   <- "ncbiccdsid"
BIODB.NCBI.GENE.ID   <- "ncbigeneid"
BIODB.PUBCHEMCOMP.ID <- "pubchemcompid"
BIODB.PUBCHEMSUB.ID  <- "pubchemsubid"
BIODB.CHEMSPIDER.ID  <- "chemspiderid"
BIODB.UNIPROT.ID     <- "uniprotid"
BIODB.CAS.ID         <- "casid"
BIODB.PEAKFOREST.ID  <- "peakforestid"

# Chemical structure and classification
BIODB.SMILES      <- "smiles"
BIODB.INCHI       <- "inchi"
BIODB.INCHIKEY    <- "inchikey"
BIODB.FORMULA     <- "formula"
BIODB.SUPER.CLASS <- "superclass"

# Masses
BIODB.MASS              <- "mass"
BIODB.AVERAGE.MASS      <- "averagemass"
BIODB.MONOISOTOPIC.MASS <- "monoisotopicmass"

# Mass spectrometry
BIODB.MSDEV       <- "msdev"
BIODB.MSDEVTYPE   <- "msdevtype"
BIODB.MSTYPE      <- "mstype"
BIODB.MSMODE      <- "msmode"
BIODB.MSPRECMZ    <- "msprecmz" # numeric
BIODB.MSPRECANNOT <- "msprecannot"

# Sequences
BIODB.SEQUENCE <- "sequence"
BIODB.LOCATION <- "location"
BIODB.LENGTH   <- "length"

# Peaks and compounds attached to an entry
BIODB.NB.PEAKS      <- "nbpeaks"
BIODB.PEAKS         <- "peaks"
BIODB.COMPOUNDS     <- "compounds"
BIODB.NB.COMPOUNDS  <- "nbcompounds"
BIODB.COMPOUND.ID   <- "compoundid"
BIODB.COMPOUND.MASS <- "compoundmass"
BIODB.COMPOUND.COMP <- "compoundcomp"

# Chromatography
BIODB.CHROM.COL    <- "chromcol"   # Chromatographic column
BIODB.CHROM.COL.RT <- "chromcolrt" # Retention time measured on chromatographic column

# Peak columns
BIODB.PEAK.MZ                 <- "mz"
BIODB.PEAK.RT                 <- "rt"
BIODB.PEAK.MZEXP              <- "mzexp"
BIODB.PEAK.MZTHEO             <- "mztheo"
BIODB.PEAK.FORMULA            <- "formula"
BIODB.PEAK.FORMULA.COUNT      <- "formula.count"
BIODB.PEAK.COMP               <- "peakcomp" # Peak composition
BIODB.PEAK.ATTR               <- "peakattr" # Peak attribution
BIODB.PEAK.MASS               <- "mass"
# BIODB.PEAK.ATTR <- 'attr'
BIODB.PEAK.ERROR.PPM          <- "error.ppm"
BIODB.PEAK.INTENSITY          <- "intensity"
BIODB.PEAK.RELATIVE.INTENSITY <- "relative.intensity"
115 | |
# MS mode values
BIODB.MSMODE.NEG <- "neg"
BIODB.MSMODE.POS <- "pos"

# M/Z tolerance: field name and the accepted unit values
BIODB.TOL             <- "mztol"
BIODB.MZTOLUNIT.PPM   <- "ppm"
BIODB.MZTOLUNIT.PLAIN <- "plain" # same as mz: mass-to-charge ratio
BIODB.MZTOLUNIT.VALS  <- c(
  BIODB.MZTOLUNIT.PPM,
  BIODB.MZTOLUNIT.PLAIN
)
125 | |
# --- MS-MS measure values ----------------------------------------------------
# Names of the MS-MS distance measures, and the vector listing all of them.

BIODB.MSMS.DIST.COS     <- "cosine"
BIODB.MSMS.DIST.WCOSINE <- "wcosine"
BIODB.MSMS.DIST.PKERNEL <- "pkernel"
BIODB.MSMS.DIST <- c(
  BIODB.MSMS.DIST.COS,
  BIODB.MSMS.DIST.WCOSINE,
  BIODB.MSMS.DIST.PKERNEL
)
134 | |
135 | |
# --- Cardinalities -----------------------------------------------------------
# Markers used in the 'cardinality' column of BIODB.FIELDS.

BIODB.CARD.ONE  <- "1"
BIODB.CARD.MANY <- "*"
142 | |
# --- Intensity notations -----------------------------------------------------
# The two peak column names that hold an intensity value.

BIODB.GROUP.INTENSITY <- c(
  BIODB.PEAK.INTENSITY,
  BIODB.PEAK.RELATIVE.INTENSITY
)
148 | |
# --- Entry field attributes --------------------------------------------------
# BIODB.FIELDS is a character data frame with one row per entry field and the
# columns 'name', 'class', 'cardinality' and 'type'. It is built row by row
# from a flat character vector; the column names are attached directly to the
# intermediate matrix.
BIODB.FIELDS <- data.frame(
  matrix(
    c(
      # FIELD NAME                   CLASS         CARDINALITY      TYPE
      BIODB.ACCESSION,               "character",  BIODB.CARD.ONE,  "none",
      BIODB.DESCRIPTION,             "character",  BIODB.CARD.ONE,  "none",
      BIODB.NAME,                    "character",  BIODB.CARD.ONE,  "name",
      BIODB.COMP.IUPAC.NAME.ALLOWED, "character",  BIODB.CARD.ONE,  "name",
      BIODB.COMP.IUPAC.NAME.TRAD,    "character",  BIODB.CARD.ONE,  "name",
      BIODB.COMP.IUPAC.NAME.SYST,    "character",  BIODB.CARD.ONE,  "name",
      BIODB.COMP.IUPAC.NAME.PREF,    "character",  BIODB.CARD.ONE,  "name",
      BIODB.COMP.IUPAC.NAME.CAS,     "character",  BIODB.CARD.ONE,  "name",
      BIODB.FULLNAMES,               "character",  BIODB.CARD.MANY, "name",
      BIODB.SYNONYMS,                "character",  BIODB.CARD.MANY, "name",
      BIODB.PROTEIN.DESCRIPTION,     "character",  BIODB.CARD.ONE,  "none",
      BIODB.SYMBOL,                  "character",  BIODB.CARD.ONE,  "none",
      BIODB.GENE.SYMBOLS,            "character",  BIODB.CARD.MANY, "none",
      BIODB.NB.COMPOUNDS,            "integer",    BIODB.CARD.ONE,  "none",
      BIODB.COMPOUNDS,               "object",     BIODB.CARD.MANY, "none",
      BIODB.CHEBI.ID,                "character",  BIODB.CARD.ONE,  "none",
      BIODB.LIPIDMAPS.ID,            "character",  BIODB.CARD.ONE,  "none",
      BIODB.KEGG.ID,                 "character",  BIODB.CARD.ONE,  "none",
      BIODB.HMDB.ID,                 "character",  BIODB.CARD.ONE,  "none",
      BIODB.ENZYME.ID,               "character",  BIODB.CARD.ONE,  "none",
      BIODB.PUBCHEMCOMP.ID,          "character",  BIODB.CARD.ONE,  "none",
      BIODB.PUBCHEMSUB.ID,           "character",  BIODB.CARD.ONE,  "none",
      BIODB.PEAKFOREST.ID,           "character",  BIODB.CARD.ONE,  "none",
      BIODB.UNIPROT.ID,              "character",  BIODB.CARD.ONE,  "none",
      BIODB.NCBI.CCDS.ID,            "character",  BIODB.CARD.ONE,  "none",
      BIODB.NCBI.GENE.ID,            "character",  BIODB.CARD.ONE,  "none",
      BIODB.INCHI,                   "character",  BIODB.CARD.ONE,  "none",
      BIODB.INCHIKEY,                "character",  BIODB.CARD.ONE,  "none",
      BIODB.MSDEV,                   "character",  BIODB.CARD.ONE,  "none",
      BIODB.MSDEVTYPE,               "character",  BIODB.CARD.ONE,  "none",
      BIODB.MSTYPE,                  "character",  BIODB.CARD.ONE,  "none",
      BIODB.MSMODE,                  "character",  BIODB.CARD.ONE,  "none",
      BIODB.MSPRECMZ,                "double",     BIODB.CARD.ONE,  "none",
      BIODB.PEAK.MZTHEO,             "double",     BIODB.CARD.ONE,  "none",
      BIODB.MSPRECANNOT,             "character",  BIODB.CARD.ONE,  "none",
      BIODB.FORMULA,                 "character",  BIODB.CARD.ONE,  "none",
      BIODB.SUPER.CLASS,             "character",  BIODB.CARD.ONE,  "none",
      BIODB.MASS,                    "double",     BIODB.CARD.ONE,  "none",
      BIODB.AVERAGE.MASS,            "double",     BIODB.CARD.ONE,  "none",
      BIODB.MONOISOTOPIC.MASS,       "double",     BIODB.CARD.ONE,  "none",
      BIODB.SEQUENCE,                "character",  BIODB.CARD.ONE,  "none",
      BIODB.LENGTH,                  "integer",    BIODB.CARD.ONE,  "none",
      BIODB.LOCATION,                "character",  BIODB.CARD.ONE,  "none",
      BIODB.NB.PEAKS,                "integer",    BIODB.CARD.ONE,  "none",
      BIODB.PEAKS,                   "data.frame", BIODB.CARD.ONE,  "none",
      BIODB.SMILES,                  "character",  BIODB.CARD.ONE,  "none",
      BIODB.CHEMSPIDER.ID,           "character",  BIODB.CARD.ONE,  "none",
      BIODB.CAS.ID,                  "character",  BIODB.CARD.ONE,  "none"
    ),
    ncol = 4,
    byrow = TRUE,
    dimnames = list(NULL, c("name", "class", "cardinality", "type"))
  ),
  stringsAsFactors = FALSE
)
204 | |
# --- Get database ID field ---------------------------------------------------

# Returns the name of the entry field that stores the accession ID of a
# database.
#
# database: a database identifier (one of the values in BIODB.DATABASES).
#
# Returns the database identifier suffixed with 'id', or NA_character_ when
# `database` is not listed in BIODB.DATABASES. Stops with an error when the
# database is listed but the derived field name is not declared in the 'name'
# column of BIODB.FIELDS.
biodb.get.database.id.field <- function(database) {

  # Databases that are not registered have no ID field
  if ( ! database %in% BIODB.DATABASES)
    return(NA_character_)

  # The field name is derived from the database identifier
  candidate <- paste0(database, 'id')
  declared.fields <- BIODB.FIELDS[['name']]
  if ( ! candidate %in% declared.fields)
    stop(paste0('No ID field defined for database ', database, '.'))

  return(candidate)
}
221 | |
# --- Computable fields -------------------------------------------------------
# Named list keyed by field name; each element is a vector of database
# identifiers associated with that field.
# NOTE(review): presumably the databases from which the field can be computed;
# confirm against the code that reads this list.
BIODB.FIELD.COMPUTING <- setNames(
  list(
    c(BIODB.CHEBI),
    c(BIODB.CHEBI),
    c(BIODB.NCBICCDS)
  ),
  c(BIODB.INCHI, BIODB.INCHIKEY, BIODB.SEQUENCE)
)
230 | |
# --- Peaks data frame --------------------------------------------------------

# Example: an empty (zero-row) peak table showing the expected columns and
# their types. The columns are declared with short placeholder names and then
# renamed with the BIODB.PEAK.* column name constants.
# All columns are passed as named arguments (`=`). Using `<-` for one of them
# would assign a stray `formula.count` variable in the enclosing environment
# instead of naming the column.
BIODB.PEAK.DF.EXAMPLE <- data.frame(
  mz            = double(),
  int           = double(),
  rel.int       = integer(),
  formula       = character(),
  formula.count = integer(),
  mass          = double(),
  error         = double(),
  stringsAsFactors = FALSE
)
colnames(BIODB.PEAK.DF.EXAMPLE) <- c(
  BIODB.PEAK.MZ,
  BIODB.PEAK.INTENSITY,
  BIODB.PEAK.RELATIVE.INTENSITY,
  BIODB.PEAK.FORMULA,
  BIODB.PEAK.FORMULA.COUNT,
  BIODB.PEAK.MASS,
  BIODB.PEAK.ERROR.PPM
)
238 | |
# --- Get entry URL -----------------------------------------------------------

# Base URLs of the MassBank web services (Japanese and European servers).
# TODO Let the choice to use either jp or eu
BIODB.MASSBANK.JP.WS.URL <- 'http://www.massbank.jp/api/services/MassBankAPI/'
BIODB.MASSBANK.EU.WS.URL <- 'http://massbank.eu/api/services/MassBankAPI/'
246 | |
# Builds the URL used to retrieve one or several entries from a database.
#
# class:        database identifier (e.g. BIODB.CHEBI, BIODB.MASSBANK).
# accession:    entry accession ID(s). Several IDs are only accepted for
#               MassBank, ChemSpider, PubChem (compound and substance) and
#               PeakForest.
# content.type: requested content type (BIODB.HTML, BIODB.XML, BIODB.TXT, ...).
# base.url:     alternative MassBank web-service base URL; when NA the
#               European MassBank server is used.
# token:        access token, used by the ChemSpider XML and PeakForest JSON
#               URLs; omitted from the URL when NA.
#
# Returns the URL as a character value, or NULL when the database or the
# combination database/content type is not handled. Stops with an error when
# several accession IDs are given for a database that only accepts one.
.do.get.entry.url <- function(class, accession, content.type = BIODB.HTML, base.url = NA_character_, token = NA_character_) {

  # Only certain databases can handle multiple accession ids
  if ( ! class %in% c(BIODB.MASSBANK, BIODB.CHEMSPIDER, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.PEAKFOREST) && length(accession) > 1)
    stop(paste0("Cannot build a URL for getting multiple entries for class ", class, "."))

  # Get URL
  url <- switch(class,
    chebi = if (content.type == BIODB.HTML) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL,
    chemspider = {
      token.param <- if (is.na(token)) '' else paste('&token', token, sep = '=')
      switch(content.type,
        html = paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html'),
        xml = paste0('http://www.chemspider.com/MassSpecAPI.asmx/GetExtendedCompoundInfoArray?', paste(paste0('CSIDs=', accession), collapse = '&'), token.param),
        NULL)
    },
    enzyme = if (content.type == BIODB.TXT) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL,
    hmdb = switch(content.type,
      xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'),
      html = paste0('http://www.hmdb.ca/metabolites/', accession),
      NULL),
    kegg = switch(content.type,
      txt = paste0('http://rest.kegg.jp/get/', accession),
      html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession),
      NULL),
    lipidmaps = if (content.type == BIODB.CSV) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL,
    massbank = if (content.type == BIODB.TXT) paste0((if (is.na(base.url)) BIODB.MASSBANK.EU.WS.URL else base.url), 'getRecordInfo?ids=', paste(accession, collapse = ',')) else NULL,
    mirbase = if (content.type == BIODB.HTML) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL,
    pubchemcomp = switch(content.type,
      xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/', paste(accession, collapse = ','), '/XML'),
      html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession),
      NULL),
    pubchemsub = switch(content.type,
      xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/', paste(accession, collapse = ','), '/XML'),
      html = paste0('http://pubchem.ncbi.nlm.nih.gov/substance/', accession),
      NULL),
    ncbigene = if (content.type == BIODB.XML) paste0('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL,
    ncbiccds = if (content.type == BIODB.HTML) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession) else NULL,
    uniprot = if (content.type == BIODB.XML) paste0('http://www.uniprot.org/uniprot/', accession, '.xml') else NULL,
    peakforest = {
      # Leave the token parameter out when no token is supplied, rather than
      # sending the literal string "NA".
      token.param <- if (is.na(token)) '' else paste0('?token=', token)
      switch(content.type,
        html = paste0('https://peakforest.org/home?PFs=', accession),
        # `collapse` (not `sep`) joins several IDs into one comma-separated
        # list, so that a single URL is produced.
        json = paste0('https://peakforest-alpha.inra.fr/rest/spectra/lcms/ids/', paste(accession, collapse = ','), token.param),
        NULL)
    }
  )

  return(url)
}
295 | |
# Builds the URL for retrieving entries, optionally limited in length.
#
# class, accession, content.type, base.url, token: forwarded unchanged to
#             .do.get.entry.url().
# max.length: maximum number of characters allowed for the URL; 0 means no
#             limit.
#
# Returns:
#  - NULL when `accession` is empty, or when no URL can be built for this
#    database/content type;
#  - the URL itself when `max.length` is 0;
#  - otherwise a list with elements `url` and `n`, where `url` requests the
#    first `n` accession IDs. `n` is found by bisection on the number of
#    leading IDs; it is never lower than 1, even if the URL for a single ID
#    is already longer than `max.length`.
get.entry.url <- function(class, accession, content.type = BIODB.HTML, max.length = 0, base.url = NA_character_, token = NA_character_) {

  if (length(accession) == 0)
    return(NULL)

  full.url <- .do.get.entry.url(class, accession, content.type = content.type, base.url = base.url, token = token)

  # No length limit: return the bare URL
  if (max.length == 0)
    return(full.url)

  # No URL available: nchar(NULL) is zero-length and cannot be used as a
  # condition, so return NULL explicitly.
  if (is.null(full.url))
    return(NULL)

  # The URL for all accession IDs fits
  if (nchar(full.url) <= max.length)
    return(list(url = full.url, n = length(accession)))

  # Find max size URL.
  # Invariant: the URL for the first `b` IDs is known to be too long, and the
  # URL for the first `a` IDs either fits or a is 1.
  a <- 1
  b <- length(accession)
  while (a < b) {
    m <- as.integer((a + b) / 2)
    url <- .do.get.entry.url(class, accession[seq_len(m)], content.type = content.type, base.url = base.url, token = token)
    if (nchar(url) <= max.length && m != a)
      a <- m
    else
      b <- m
  }
  url <- .do.get.entry.url(class, accession[seq_len(a)], content.type = content.type, base.url = base.url, token = token)

  return(list(url = url, n = a))
}
320 | |
# --- Print message -----------------------------------------------------------

# Message levels: BIODB.DEBUG is the index of the debug level, and
# BIODB.LEVEL.NAMES holds the printed label of each level, indexed by level.
BIODB.DEBUG <- 1
BIODB.LEVEL.NAMES <- c('DEBUG')

# Writes a message on the standard error stream.
#
# msg:   the message text.
# level: index of the level label inside BIODB.LEVEL.NAMES.
# class: optional name printed after the level label; left out when NA.
#
# The printed line has the form "<LEVEL>[, <class>]: <msg>".
.print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) {
  level.label <- BIODB.LEVEL.NAMES[[level]]
  class.label <- if (is.na(class)) '' else paste0(", ", class)
  cat(paste0(level.label, class.label, ": ", msg, "\n"), file = stderr())
}
331 | |
# --- Biodb get env var -------------------------------------------------------

# Reads a BIODB_* environment variable.
#
# v: the variable name without the 'BIODB_' prefix, in any case. If several
#    strings are given, they are joined with '_'.
#
# Returns the value of the environment variable BIODB_<V> (with <V> the
# upper-cased `v`), or NA_character_ when that variable is not set.
.biodb.get.env.var <- function(v) {

  # Make env var name
  var.name <- paste(c('BIODB', toupper(v)), collapse = '_')

  # Ask for this single variable, getting NA when it does not exist
  Sys.getenv(var.name, unset = NA_character_)
}
350 } |