comparison biodb-common.R @ 2:20d69a062da3 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author prog
date Thu, 02 Mar 2017 08:55:00 -0500
parents 253d531a0193
children
comparison
equal deleted inserted replaced
1:253d531a0193 2:20d69a062da3
1 if ( ! exists('BIODB.COMPOUND')) { # Do not load again if already loaded 1 if ( ! exists('BIODB.XML')) {
2 2
3 ############### 3 ###############
4 # ENTRY TYPES # 4 # CACHE MODES #
5 ############### 5 ###############
6 6
7 BIODB.COMPOUND <- 'compound' 7 BIODB.CACHE.READ.ONLY <- 'read-only'
8 BIODB.SPECTRUM <- 'spectrum' 8 BIODB.CACHE.READ.WRITE <- 'read-write'
9 9 BIODB.CACHE.WRITE.ONLY <- 'write-only'
10
10 ####################### 11 #######################
11 # ENTRY CONTENT TYPES # 12 # ENTRY CONTENT TYPES #
12 ####################### 13 #######################
13 14
14 BIODB.HTML <- 'html' 15 BIODB.HTML <- 'html'
15 BIODB.TXT <- 'txt' 16 BIODB.TXT <- 'txt'
16 BIODB.XML <- 'xml' 17 BIODB.XML <- 'xml'
17 BIODB.CSV <- 'csv' 18 BIODB.CSV <- 'csv'
18 BIODB.DATAFRAME <- 'dataframe' 19 BIODB.DATAFRAME <- 'dataframe'
19 BIODB.ANY <- 'any' # Value used when we do not care about the type. 20 BIODB.JSON <- 'json'
20 21
21 ############# 22 #############
22 # DATABASES # 23 # DATABASES #
23 ############# 24 #############
24 25
25 BIODB.CHEBI <- 'chebi' 26 BIODB.CHEBI <- 'chebi'
26 BIODB.KEGG <- 'kegg' 27 BIODB.KEGG <- 'kegg'
27 BIODB.PUBCHEM <- 'pubchem' 28 BIODB.PUBCHEMCOMP <- 'pubchemcomp' # Compound database
29 BIODB.PUBCHEMSUB <- 'pubchemsub' # Substance database
28 BIODB.HMDB <- 'hmdb' 30 BIODB.HMDB <- 'hmdb'
29 BIODB.CHEMSPIDER <- 'chemspider' 31 BIODB.CHEMSPIDER <- 'chemspider'
30 BIODB.ENZYME <- 'enzyme' 32 BIODB.ENZYME <- 'enzyme'
31 BIODB.LIPIDMAPS <- 'lipidmaps' 33 BIODB.LIPIDMAPS <- 'lipidmaps'
32 BIODB.MIRBASE <- 'mirbase' 34 BIODB.MIRBASE <- 'mirbase'
33 BIODB.NCBIGENE <- 'ncbigene' 35 BIODB.NCBIGENE <- 'ncbigene'
34 BIODB.NCBICCDS <- 'ncbiccds' 36 BIODB.NCBICCDS <- 'ncbiccds'
35 BIODB.UNIPROT <- 'uniprot' 37 BIODB.UNIPROT <- 'uniprot'
36 BIODB.MASSBANK <- 'massbank' 38 BIODB.MASSBANK <- 'massbank'
37 BIODB.MASSFILEDB <- 'massfiledb' 39 BIODB.MASSFILEDB <- 'massfiledb'
40 BIODB.PEAKFOREST <- 'peakforest'
41
42 BIODB.DATABASES <- c(BIODB.CHEBI, BIODB.KEGG, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.HMDB, BIODB.CHEMSPIDER, BIODB.ENZYME, BIODB.LIPIDMAPS, BIODB.MIRBASE, BIODB.NCBIGENE, BIODB.NCBICCDS, BIODB.UNIPROT, BIODB.MASSBANK, BIODB.MASSFILEDB, BIODB.PEAKFOREST)
38 43
39 ########## 44 ##########
40 # FIELDS # 45 # FIELDS #
41 ########## 46 ##########
42 47
43 BIODB.ACCESSION <- 'accession' 48 BIODB.ACCESSION <- 'accession'
44 BIODB.DESCRIPTION <- 'description' 49 BIODB.DESCRIPTION <- 'description'
45 BIODB.PROTEIN.DESCRIPTION <- 'protdesc' 50 BIODB.PROTEIN.DESCRIPTION <- 'protdesc'
46 BIODB.NAME <- 'name' 51 BIODB.NAME <- 'name'
52 BIODB.COMP.IUPAC.NAME.ALLOWED <- 'comp.iupac.name.allowed'
53 BIODB.COMP.IUPAC.NAME.TRAD <- 'comp.iupac.name.trad'
54 BIODB.COMP.IUPAC.NAME.SYST <- 'comp.iupac.name.syst'
55 BIODB.COMP.IUPAC.NAME.PREF <- 'comp.iupac.name.pref'
56 BIODB.COMP.IUPAC.NAME.CAS <- 'comp.iupac.name.cas'
47 BIODB.FULLNAMES <- 'fullnames' 57 BIODB.FULLNAMES <- 'fullnames'
48 BIODB.SYNONYMS <- 'synonyms' 58 BIODB.SYNONYMS <- 'synonyms'
49 BIODB.SYMBOL <- 'symbol' 59 BIODB.SYMBOL <- 'symbol'
50 BIODB.GENE.SYMBOLS <- 'genesymbols' 60 BIODB.GENE.SYMBOLS <- 'genesymbols'
51 BIODB.CHEBI.ID <- 'chebiid' 61 BIODB.CHEBI.ID <- 'chebiid'
53 BIODB.KEGG.ID <- 'keggid' 63 BIODB.KEGG.ID <- 'keggid'
54 BIODB.HMDB.ID <- 'hmdbid' 64 BIODB.HMDB.ID <- 'hmdbid'
55 BIODB.ENZYME.ID <- 'enzymeid' 65 BIODB.ENZYME.ID <- 'enzymeid'
56 BIODB.NCBI.CCDS.ID <- 'ncbiccdsid' 66 BIODB.NCBI.CCDS.ID <- 'ncbiccdsid'
57 BIODB.NCBI.GENE.ID <- 'ncbigeneid' 67 BIODB.NCBI.GENE.ID <- 'ncbigeneid'
58 BIODB.PUBCHEM.ID <- 'pubchemid' 68 BIODB.PUBCHEMCOMP.ID <- 'pubchemcompid'
69 BIODB.PUBCHEMSUB.ID <- 'pubchemsubid'
70 BIODB.CHEMSPIDER.ID <- 'chemspiderid'
59 BIODB.UNIPROT.ID <- 'uniprotid' 71 BIODB.UNIPROT.ID <- 'uniprotid'
72 BIODB.CAS.ID <- 'casid'
73 BIODB.PEAKFOREST.ID <- 'peakforestid'
74 BIODB.SMILES <- 'smiles'
60 BIODB.INCHI <- 'inchi' 75 BIODB.INCHI <- 'inchi'
61 BIODB.INCHIKEY <- 'inchikey' 76 BIODB.INCHIKEY <- 'inchikey'
62 BIODB.MSDEV <- 'msdev' 77 BIODB.MSDEV <- 'msdev'
63 BIODB.MSDEVTYPE <- 'msdevtype' 78 BIODB.MSDEVTYPE <- 'msdevtype'
64 BIODB.MSTYPE <- 'mstype' 79 BIODB.MSTYPE <- 'mstype'
73 BIODB.SEQUENCE <- 'sequence' 88 BIODB.SEQUENCE <- 'sequence'
74 BIODB.LOCATION <- 'location' 89 BIODB.LOCATION <- 'location'
75 BIODB.LENGTH <- 'length' 90 BIODB.LENGTH <- 'length'
76 BIODB.NB.PEAKS <- 'nbpeaks' 91 BIODB.NB.PEAKS <- 'nbpeaks'
77 BIODB.PEAKS <- 'peaks' 92 BIODB.PEAKS <- 'peaks'
93 BIODB.COMPOUNDS <- 'compounds'
94 BIODB.NB.COMPOUNDS <- 'nbcompounds'
78 BIODB.COMPOUND.ID <- 'compoundid' 95 BIODB.COMPOUND.ID <- 'compoundid'
79 BIODB.PEAK.MZ <- 'peakmz' 96 BIODB.COMPOUND.MASS <- 'compoundmass'
80 BIODB.PEAK.COMP <- 'peakcomp' # Peak composition 97 BIODB.COMPOUND.COMP <- 'compoundcomp'
81 BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution
82 BIODB.CHROM.COL <- 'chromcol' # Chromatographic column 98 BIODB.CHROM.COL <- 'chromcol' # Chromatographic column
83 BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column 99 BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column
84 BIODB.ID <- 'id' 100 BIODB.ID <- 'id'
85 BIODB.TITLE <- 'title' 101 BIODB.TITLE <- 'title'
102 BIODB.PEAK.MZ <- 'mz'
103 BIODB.PEAK.RT <- 'rt'
104 BIODB.PEAK.MZEXP <- 'mzexp'
105 BIODB.PEAK.MZTHEO <- 'mztheo'
106 BIODB.PEAK.FORMULA <- 'formula'
107 BIODB.PEAK.FORMULA.COUNT <- 'formula.count'
108 BIODB.PEAK.COMP <- 'peakcomp' # Peak composition
109 BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution
110 BIODB.PEAK.MASS <- 'mass'
111 # BIODB.PEAK.ATTR <- 'attr'
112 BIODB.PEAK.ERROR.PPM <- 'error.ppm'
113 BIODB.PEAK.INTENSITY <- 'intensity'
114 BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity'
86 115
87 # Mode values 116 # Mode values
88 BIODB.MSMODE.NEG <- 'neg' 117 BIODB.MSMODE.NEG <- 'neg'
89 BIODB.MSMODE.POS <- 'pos' 118 BIODB.MSMODE.POS <- 'pos'
90 119
120 # Tolerance values
121 BIODB.TOL <- 'mztol'
122 BIODB.MZTOLUNIT.PPM <- 'ppm'
123 BIODB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio
124 BIODB.MZTOLUNIT.VALS <- c(BIODB.MZTOLUNIT.PPM, BIODB.MZTOLUNIT.PLAIN)
125
126 ########################
127 # MS-MS MEASURE VALUES #
128 ########################
129
130 BIODB.MSMS.DIST.COS <- "cosine"
131 BIODB.MSMS.DIST.WCOSINE <- "wcosine"
132 BIODB.MSMS.DIST.PKERNEL <- "pkernel"
133 BIODB.MSMS.DIST <- c(BIODB.MSMS.DIST.COS, BIODB.MSMS.DIST.WCOSINE, BIODB.MSMS.DIST.PKERNEL)
134
135
91 ################# 136 #################
92 # CARDINALITIES # 137 # CARDINALITIES #
93 ################# 138 #################
94 139
95 BIODB.CARD.ONE <- '1' 140 BIODB.CARD.ONE <- '1'
96 BIODB.CARD.MANY <- '*' 141 BIODB.CARD.MANY <- '*'
142
143 #####################
144 #INTENSITY NOTATIONS#
145 #####################
146
147 BIODB.GROUP.INTENSITY<-c(BIODB.PEAK.INTENSITY,BIODB.PEAK.RELATIVE.INTENSITY)
97 148
98 ########################## 149 ##########################
99 # ENTRY FIELD ATTRIBUTES # 150 # ENTRY FIELD ATTRIBUTES #
100 ########################## 151 ##########################
101 152 # FIELD NAME CLASS CARDINALITY TYPE
102 BIODB.FIELDS <- data.frame(matrix(c( 153 BIODB.FIELDS <- data.frame(matrix(c(
103 # FIELD NAME CLASS CARDINALITY 154 BIODB.ACCESSION, 'character', BIODB.CARD.ONE, 'none',
104 BIODB.COMPOUND, 'BiodEntry', BIODB.CARD.ONE, 155 BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE, 'none',
105 BIODB.ACCESSION, 'character', BIODB.CARD.ONE, 156 BIODB.NAME, 'character', BIODB.CARD.ONE, 'name',
106 BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE, 157 BIODB.COMP.IUPAC.NAME.ALLOWED, 'character', BIODB.CARD.ONE, 'name',
107 BIODB.NAME, 'character', BIODB.CARD.ONE, 158 BIODB.COMP.IUPAC.NAME.TRAD, 'character', BIODB.CARD.ONE, 'name',
108 BIODB.FULLNAMES, 'character', BIODB.CARD.MANY, 159 BIODB.COMP.IUPAC.NAME.SYST, 'character', BIODB.CARD.ONE, 'name',
109 BIODB.SYNONYMS, 'character', BIODB.CARD.MANY, 160 BIODB.COMP.IUPAC.NAME.PREF, 'character', BIODB.CARD.ONE, 'name',
110 BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE, 161 BIODB.COMP.IUPAC.NAME.CAS, 'character', BIODB.CARD.ONE, 'name',
111 BIODB.SYMBOL, 'character', BIODB.CARD.ONE, 162 BIODB.FULLNAMES, 'character', BIODB.CARD.MANY, 'name',
112 BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY, 163 BIODB.SYNONYMS, 'character', BIODB.CARD.MANY, 'name',
113 BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE, 164 BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE, 'none',
114 BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE, 165 BIODB.SYMBOL, 'character', BIODB.CARD.ONE, 'none',
115 BIODB.KEGG.ID, 'character', BIODB.CARD.ONE, 166 BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY, 'none',
116 BIODB.HMDB.ID, 'character', BIODB.CARD.ONE, 167 BIODB.NB.COMPOUNDS, 'integer', BIODB.CARD.ONE, 'none',
117 BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE, 168 BIODB.COMPOUNDS, 'object', BIODB.CARD.MANY, 'none',
118 BIODB.PUBCHEM.ID, 'character', BIODB.CARD.ONE, 169 BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE, 'none',
119 BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE, 170 BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE, 'none',
120 BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE, 171 BIODB.KEGG.ID, 'character', BIODB.CARD.ONE, 'none',
121 BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE, 172 BIODB.HMDB.ID, 'character', BIODB.CARD.ONE, 'none',
122 BIODB.INCHI, 'character', BIODB.CARD.ONE, 173 BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE, 'none',
123 BIODB.INCHIKEY, 'character', BIODB.CARD.ONE, 174 BIODB.PUBCHEMCOMP.ID, 'character', BIODB.CARD.ONE, 'none',
124 BIODB.MSDEV, 'character', BIODB.CARD.ONE, 175 BIODB.PUBCHEMSUB.ID, 'character', BIODB.CARD.ONE, 'none',
125 BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE, 176 BIODB.PEAKFOREST.ID, 'character', BIODB.CARD.ONE, 'none',
126 BIODB.MSTYPE, 'character', BIODB.CARD.ONE, 177 BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE, 'none',
127 BIODB.MSMODE, 'character', BIODB.CARD.ONE, 178 BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE, 'none',
128 BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE, 179 BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE, 'none',
129 BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE, 180 BIODB.INCHI, 'character', BIODB.CARD.ONE, 'none',
130 BIODB.FORMULA, 'character', BIODB.CARD.ONE, 181 BIODB.INCHIKEY, 'character', BIODB.CARD.ONE, 'none',
131 BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE, 182 BIODB.MSDEV, 'character', BIODB.CARD.ONE, 'none',
132 BIODB.MASS, 'double', BIODB.CARD.ONE, 183 BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE, 'none',
133 BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE, 184 BIODB.MSTYPE, 'character', BIODB.CARD.ONE, 'none',
134 BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE, 185 BIODB.MSMODE, 'character', BIODB.CARD.ONE, 'none',
135 BIODB.SEQUENCE, 'character', BIODB.CARD.ONE, 186 BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE, 'none',
136 BIODB.LENGTH, 'integer', BIODB.CARD.ONE, 187 BIODB.PEAK.MZTHEO, 'double', BIODB.CARD.ONE, 'none',
137 BIODB.LOCATION, 'character', BIODB.CARD.ONE, 188 BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE, 'none',
138 BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE, 189 BIODB.FORMULA, 'character', BIODB.CARD.ONE, 'none',
139 BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE 190 BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE, 'none',
140 ), byrow = TRUE, ncol = 3), stringsAsFactors = FALSE) 191 BIODB.MASS, 'double', BIODB.CARD.ONE, 'none',
141 colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality') 192 BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE, 'none',
193 BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE, 'none',
194 BIODB.SEQUENCE, 'character', BIODB.CARD.ONE, 'none',
195 BIODB.LENGTH, 'integer', BIODB.CARD.ONE, 'none',
196 BIODB.LOCATION, 'character', BIODB.CARD.ONE, 'none',
197 BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE, 'none',
198 BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE, 'none',
199 BIODB.SMILES, 'character', BIODB.CARD.ONE, 'none',
200 BIODB.CHEMSPIDER.ID, 'character', BIODB.CARD.ONE, 'none',
201 BIODB.CAS.ID, 'character', BIODB.CARD.ONE, 'none'
202 ), byrow = TRUE, ncol = 4), stringsAsFactors = FALSE)
203 colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality', 'type')
204
205 #########################
206 # GET DATABASE ID FIELD #
207 #########################
208
209 biodb.get.database.id.field <- function(database) {
210
211 id.field <- NA_character_
212
213 if (database %in% BIODB.DATABASES) {
214 id.field <- paste0(database, 'id')
215 if ( ! id.field %in% BIODB.FIELDS[['name']])
216 stop(paste0('No ID field defined for database ', database, '.'))
217 }
218
219 return(id.field)
220 }
142 221
143 ##################### 222 #####################
144 # COMPUTABLE FIELDS # 223 # COMPUTABLE FIELDS #
145 ##################### 224 #####################
146 225
151 230
152 #################### 231 ####################
153 # PEAKS DATA FRAME # 232 # PEAKS DATA FRAME #
154 #################### 233 ####################
155 234
156 # Columns
157 BIODB.PEAK.MZ <- 'mz'
158 BIODB.PEAK.FORMULA <- 'formula'
159 BIODB.PEAK.FORMULA.COUNT <- 'formula.count'
160 BIODB.PEAK.MASS <- 'mass'
161 BIODB.PEAK.ERROR.PPM <- 'error.ppm'
162 BIODB.PEAK.INTENSITY <- 'intensity'
163 BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity'
164
165 # Example 235 # Example
166 BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) 236 BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE)
167 colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM) 237 colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)
168 238
169 ################# 239 #################
170 # GET ENTRY URL # 240 # GET ENTRY URL #
171 ################# 241 #################
172 242
173 # TODO Allow choosing between the jp and eu URLs 243 # TODO Allow choosing between the jp and eu URLs
174 BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/getRecordInfo" 244 BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/"
175 BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/getRecordInfo" 245 BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/"
176 246
177 .do.get.entry.url <- function(class, accession, content.type = BIODB.ANY) { 247 .do.get.entry.url <- function(class, accession, content.type = BIODB.HTML, base.url = NA_character_, token = NA_character_) {
178 248
179 # TODO Only Massbank can handle multiple accession ids 249 # Only certain databases can handle multiple accession ids
180 if (class != 'massbank' && length(accession) > 1) 250 if ( ! class %in% c(BIODB.MASSBANK, BIODB.CHEMSPIDER, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.PEAKFOREST) && length(accession) > 1)
181 stop(paste0("Cannot build a URL for getting multiple entries for class ", class, ".")) 251 stop(paste0("Cannot build a URL for getting multiple entries for class ", class, "."))
182 252
253 # Get URL
183 url <- switch(class, 254 url <- switch(class,
184 chebi = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, 255 chebi = if (content.type == BIODB.HTML) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL,
185 chemspider = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html') else NULL, 256 chemspider = {
186 enzyme = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, 257 token.param <- if (is.na(token)) '' else paste('&token', token, sep = '=')
258 switch(content.type,
259 html = paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html'),
260 xml = paste0('http://www.chemspider.com/MassSpecAPI.asmx/GetExtendedCompoundInfoArray?', paste(paste0('CSIDs=', accession), collapse = '&'), token.param),
261 NULL)
262 },
263 enzyme = if (content.type == BIODB.TXT) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL,
187 hmdb = switch(content.type, 264 hmdb = switch(content.type,
188 xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'), 265 xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'),
189 html = paste0('http://www.hmdb.ca/metabolites/', accession), 266 html = paste0('http://www.hmdb.ca/metabolites/', accession),
190 any = paste0('http://www.hmdb.ca/metabolites/', accession),
191 NULL), 267 NULL),
192 kegg = switch(content.type, 268 kegg = switch(content.type,
193 txt = paste0('http://rest.kegg.jp/get/', accession), 269 txt = paste0('http://rest.kegg.jp/get/', accession),
194 html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), 270 html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession),
195 any = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession),
196 NULL), 271 NULL),
197 lipidmaps = if (content.type %in% c(BIODB.ANY, BIODB.CSV)) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, 272 lipidmaps = if (content.type == BIODB.CSV) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL,
198 massbank = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0(BIODB.MASSBANK.EU.WS.URL, '?ids=', paste(accession, collapse = ',')) else NULL, 273 massbank = if (content.type == BIODB.TXT) paste0((if (is.na(base.url)) BIODB.MASSBANK.EU.WS.URL else base.url), 'getRecordInfo?ids=', paste(accession, collapse = ',')) else NULL,
199 mirbase = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, 274 mirbase = if (content.type == BIODB.HTML) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL,
200 pubchem = { 275 pubchemcomp = switch(content.type,
201 accession <- gsub(' ', '', accession, perl = TRUE) 276 xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/', paste(accession, collapse = ','), '/XML'),
202 accession <- gsub('^CID', '', accession, perl = TRUE)
203 switch(content.type,
204 xml = paste0('http://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/', accession, '/XML/?response_type=save&response_basename=CID_', accession),
205 html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession), 277 html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession),
206 NULL) 278 NULL),
207 }, 279 pubchemsub = switch(content.type,
208 ncbigene = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, 280 xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/', paste(accession, collapse = ','), '/XML'),
209 ncbiccds = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), 281 html = paste0('http://pubchem.ncbi.nlm.nih.gov/substance/', accession),
210 uniprot = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), 282 NULL),
283 ncbigene = if (content.type == BIODB.XML) paste0('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL,
284 ncbiccds = if (content.type == BIODB.HTML) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession),
285 uniprot = if (content.type == BIODB.XML) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'),
286 peakforest = switch(content.type,
287 html= paste0('https://peakforest.org/home?PFs=',accession),
288 json= paste0('https://peakforest-alpha.inra.fr/rest/spectra/lcms/ids/',paste(accession,sep=','),'?token=',token),
289
211 NULL 290 NULL
212 ) 291 )
213 292 )
214 return(url) 293 return(url)
215 } 294 }
216 295
217 get.entry.url <- function(class, accession, content.type = BIODB.ANY, max.length = 0) { 296 get.entry.url <- function(class, accession, content.type = BIODB.HTML, max.length = 0, base.url = NA_character_, token = NA_character_) {
218 297
219 if (length(accession) == 0) 298 if (length(accession) == 0)
220 return(NULL) 299 return(NULL)
221 300
222 full.url <- .do.get.entry.url(class, accession, content.type = content.type) 301 full.url <- .do.get.entry.url(class, accession, content.type = content.type, base.url = base.url, token = token)
223 if (max.length == 0 || nchar(full.url) <= max.length) 302 if (max.length == 0 || nchar(full.url) <= max.length)
224 return(if (max.length == 0) full.url else list(url = full.url, n = length(accession))) 303 return(if (max.length == 0) full.url else list(url = full.url, n = length(accession)))
225 304
226 # Find max size URL 305 # Find max size URL
227 a <- 1 306 a <- 1
228 b <- length(accession) 307 b <- length(accession)
229 while (a < b) { 308 while (a < b) {
230 m <- as.integer((a + b) / 2) 309 m <- as.integer((a + b) / 2)
231 url <- .do.get.entry.url(class, accession[1:m], content.type = content.type) 310 url <- .do.get.entry.url(class, accession[1:m], content.type = content.type, base.url = base.url, token = token)
232 if (nchar(url) <= max.length && m != a) 311 if (nchar(url) <= max.length && m != a)
233 a <- m 312 a <- m
234 else 313 else
235 b <- m 314 b <- m
236 } 315 }
237 url <- .do.get.entry.url(class, accession[1:a], content.type = content.type) 316 url <- .do.get.entry.url(class, accession[1:a], content.type = content.type, base.url = base.url, token = token)
238 317
239 return(list( url = url, n = a)) 318 return(list( url = url, n = a))
240 } 319 }
241 320
242 ################# 321 #################
248 327
249 .print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) { 328 .print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) {
250 cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr()) 329 cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr())
251 } 330 }
252 331
332 #####################
333 # BIODB GET ENV VAR #
334 #####################
335
336 .biodb.get.env.var <- function(v) {
337
338 # Get all env vars
339 env <- Sys.getenv()
340
341 # Make env var name
342 env.var <- paste(c('BIODB', toupper(v)), collapse = '_')
343
344 # Look if this env var exists
345 if (env.var %in% names(env))
346 return(env[[env.var]])
347
348 return(NA_character_)
349 }
253 } 350 }