Mercurial > repos > prog > lcmsmatching
comparison biodb-common.R @ 2:20d69a062da3 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author | prog |
---|---|
date | Thu, 02 Mar 2017 08:55:00 -0500 |
parents | 253d531a0193 |
children |
comparison
equal
deleted
inserted
replaced
1:253d531a0193 | 2:20d69a062da3 |
---|---|
1 if ( ! exists('BIODB.COMPOUND')) { # Do not load again if already loaded | 1 if ( ! exists('BIODB.XML')) { |
2 | 2 |
3 ############### | 3 ############### |
4 # ENTRY TYPES # | 4 # CACHE MODES # |
5 ############### | 5 ############### |
6 | 6 |
7 BIODB.COMPOUND <- 'compound' | 7 BIODB.CACHE.READ.ONLY <- 'read-only' |
8 BIODB.SPECTRUM <- 'spectrum' | 8 BIODB.CACHE.READ.WRITE <- 'read-write' |
9 | 9 BIODB.CACHE.WRITE.ONLY <- 'write-only' |
10 | |
10 ####################### | 11 ####################### |
11 # ENTRY CONTENT TYPES # | 12 # ENTRY CONTENT TYPES # |
12 ####################### | 13 ####################### |
13 | 14 |
14 BIODB.HTML <- 'html' | 15 BIODB.HTML <- 'html' |
15 BIODB.TXT <- 'txt' | 16 BIODB.TXT <- 'txt' |
16 BIODB.XML <- 'xml' | 17 BIODB.XML <- 'xml' |
17 BIODB.CSV <- 'csv' | 18 BIODB.CSV <- 'csv' |
18 BIODB.DATAFRAME <- 'dataframe' | 19 BIODB.DATAFRAME <- 'dataframe' |
19 BIODB.ANY <- 'any' # Value used when we do not care about the type. | 20 BIODB.JSON <- 'json' |
20 | 21 |
21 ############# | 22 ############# |
22 # DATABASES # | 23 # DATABASES # |
23 ############# | 24 ############# |
24 | 25 |
25 BIODB.CHEBI <- 'chebi' | 26 BIODB.CHEBI <- 'chebi' |
26 BIODB.KEGG <- 'kegg' | 27 BIODB.KEGG <- 'kegg' |
27 BIODB.PUBCHEM <- 'pubchem' | 28 BIODB.PUBCHEMCOMP <- 'pubchemcomp' # Compound database |
29 BIODB.PUBCHEMSUB <- 'pubchemsub' # Substance database | |
28 BIODB.HMDB <- 'hmdb' | 30 BIODB.HMDB <- 'hmdb' |
29 BIODB.CHEMSPIDER <- 'chemspider' | 31 BIODB.CHEMSPIDER <- 'chemspider' |
30 BIODB.ENZYME <- 'enzyme' | 32 BIODB.ENZYME <- 'enzyme' |
31 BIODB.LIPIDMAPS <- 'lipidmaps' | 33 BIODB.LIPIDMAPS <- 'lipidmaps' |
32 BIODB.MIRBASE <- 'mirbase' | 34 BIODB.MIRBASE <- 'mirbase' |
33 BIODB.NCBIGENE <- 'ncbigene' | 35 BIODB.NCBIGENE <- 'ncbigene' |
34 BIODB.NCBICCDS <- 'ncbiccds' | 36 BIODB.NCBICCDS <- 'ncbiccds' |
35 BIODB.UNIPROT <- 'uniprot' | 37 BIODB.UNIPROT <- 'uniprot' |
36 BIODB.MASSBANK <- 'massbank' | 38 BIODB.MASSBANK <- 'massbank' |
37 BIODB.MASSFILEDB <- 'massfiledb' | 39 BIODB.MASSFILEDB <- 'massfiledb' |
40 BIODB.PEAKFOREST <- 'peakforest' | |
41 | |
42 BIODB.DATABASES <- c(BIODB.CHEBI, BIODB.KEGG, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.HMDB, BIODB.CHEMSPIDER, BIODB.ENZYME, BIODB.LIPIDMAPS, BIODB.MIRBASE, BIODB.NCBIGENE, BIODB.NCBICCDS, BIODB.UNIPROT, BIODB.MASSBANK, BIODB.MASSFILEDB, BIODB.PEAKFOREST) | |
38 | 43 |
39 ########## | 44 ########## |
40 # FIELDS # | 45 # FIELDS # |
41 ########## | 46 ########## |
42 | 47 |
43 BIODB.ACCESSION <- 'accession' | 48 BIODB.ACCESSION <- 'accession' |
44 BIODB.DESCRIPTION <- 'description' | 49 BIODB.DESCRIPTION <- 'description' |
45 BIODB.PROTEIN.DESCRIPTION <- 'protdesc' | 50 BIODB.PROTEIN.DESCRIPTION <- 'protdesc' |
46 BIODB.NAME <- 'name' | 51 BIODB.NAME <- 'name' |
52 BIODB.COMP.IUPAC.NAME.ALLOWED <- 'comp.iupac.name.allowed' | |
53 BIODB.COMP.IUPAC.NAME.TRAD <- 'comp.iupac.name.trad' | |
54 BIODB.COMP.IUPAC.NAME.SYST <- 'comp.iupac.name.syst' | |
55 BIODB.COMP.IUPAC.NAME.PREF <- 'comp.iupac.name.pref' | |
56 BIODB.COMP.IUPAC.NAME.CAS <- 'comp.iupac.name.cas' | |
47 BIODB.FULLNAMES <- 'fullnames' | 57 BIODB.FULLNAMES <- 'fullnames' |
48 BIODB.SYNONYMS <- 'synonyms' | 58 BIODB.SYNONYMS <- 'synonyms' |
49 BIODB.SYMBOL <- 'symbol' | 59 BIODB.SYMBOL <- 'symbol' |
50 BIODB.GENE.SYMBOLS <- 'genesymbols' | 60 BIODB.GENE.SYMBOLS <- 'genesymbols' |
51 BIODB.CHEBI.ID <- 'chebiid' | 61 BIODB.CHEBI.ID <- 'chebiid' |
53 BIODB.KEGG.ID <- 'keggid' | 63 BIODB.KEGG.ID <- 'keggid' |
54 BIODB.HMDB.ID <- 'hmdbid' | 64 BIODB.HMDB.ID <- 'hmdbid' |
55 BIODB.ENZYME.ID <- 'enzymeid' | 65 BIODB.ENZYME.ID <- 'enzymeid' |
56 BIODB.NCBI.CCDS.ID <- 'ncbiccdsid' | 66 BIODB.NCBI.CCDS.ID <- 'ncbiccdsid' |
57 BIODB.NCBI.GENE.ID <- 'ncbigeneid' | 67 BIODB.NCBI.GENE.ID <- 'ncbigeneid' |
58 BIODB.PUBCHEM.ID <- 'pubchemid' | 68 BIODB.PUBCHEMCOMP.ID <- 'pubchemcompid' |
69 BIODB.PUBCHEMSUB.ID <- 'pubchemsubid' | |
70 BIODB.CHEMSPIDER.ID <- 'chemspiderid' | |
59 BIODB.UNIPROT.ID <- 'uniprotid' | 71 BIODB.UNIPROT.ID <- 'uniprotid' |
72 BIODB.CAS.ID <- 'casid' | |
73 BIODB.PEAKFOREST.ID <- 'peakforestid' | |
74 BIODB.SMILES <- 'smiles' | |
60 BIODB.INCHI <- 'inchi' | 75 BIODB.INCHI <- 'inchi' |
61 BIODB.INCHIKEY <- 'inchikey' | 76 BIODB.INCHIKEY <- 'inchikey' |
62 BIODB.MSDEV <- 'msdev' | 77 BIODB.MSDEV <- 'msdev' |
63 BIODB.MSDEVTYPE <- 'msdevtype' | 78 BIODB.MSDEVTYPE <- 'msdevtype' |
64 BIODB.MSTYPE <- 'mstype' | 79 BIODB.MSTYPE <- 'mstype' |
73 BIODB.SEQUENCE <- 'sequence' | 88 BIODB.SEQUENCE <- 'sequence' |
74 BIODB.LOCATION <- 'location' | 89 BIODB.LOCATION <- 'location' |
75 BIODB.LENGTH <- 'length' | 90 BIODB.LENGTH <- 'length' |
76 BIODB.NB.PEAKS <- 'nbpeaks' | 91 BIODB.NB.PEAKS <- 'nbpeaks' |
77 BIODB.PEAKS <- 'peaks' | 92 BIODB.PEAKS <- 'peaks' |
93 BIODB.COMPOUNDS <- 'compounds' | |
94 BIODB.NB.COMPOUNDS <- 'nbcompounds' | |
78 BIODB.COMPOUND.ID <- 'compoundid' | 95 BIODB.COMPOUND.ID <- 'compoundid' |
79 BIODB.PEAK.MZ <- 'peakmz' | 96 BIODB.COMPOUND.MASS <- 'compoundmass' |
80 BIODB.PEAK.COMP <- 'peakcomp' # Peak composition | 97 BIODB.COMPOUND.COMP <- 'compoundcomp' |
81 BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution | |
82 BIODB.CHROM.COL <- 'chromcol' # Chromatographic column | 98 BIODB.CHROM.COL <- 'chromcol' # Chromatographic column |
83 BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column | 99 BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column |
84 BIODB.ID <- 'id' | 100 BIODB.ID <- 'id' |
85 BIODB.TITLE <- 'title' | 101 BIODB.TITLE <- 'title' |
102 BIODB.PEAK.MZ <- 'mz' | |
103 BIODB.PEAK.RT <- 'rt' | |
104 BIODB.PEAK.MZEXP <- 'mzexp' | |
105 BIODB.PEAK.MZTHEO <- 'mztheo' | |
106 BIODB.PEAK.FORMULA <- 'formula' | |
107 BIODB.PEAK.FORMULA.COUNT <- 'formula.count' | |
108 BIODB.PEAK.COMP <- 'peakcomp' # Peak composition | |
109 BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution | |
110 BIODB.PEAK.MASS <- 'mass' | |
111 # BIODB.PEAK.ATTR <- 'attr' | |
112 BIODB.PEAK.ERROR.PPM <- 'error.ppm' | |
113 BIODB.PEAK.INTENSITY <- 'intensity' | |
114 BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity' | |
86 | 115 |
87 # Mode values | 116 # Mode values |
88 BIODB.MSMODE.NEG <- 'neg' | 117 BIODB.MSMODE.NEG <- 'neg' |
89 BIODB.MSMODE.POS <- 'pos' | 118 BIODB.MSMODE.POS <- 'pos' |
90 | 119 |
120 # Tolerance values | |
121 BIODB.TOL <- 'mztol' | |
122 BIODB.MZTOLUNIT.PPM <- 'ppm' | |
123 BIODB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio | |
124 BIODB.MZTOLUNIT.VALS <- c(BIODB.MZTOLUNIT.PPM, BIODB.MZTOLUNIT.PLAIN) | |
125 | |
126 ######################## | |
127 # MS-MS MEASURE VALUES # | |
128 ######################## | |
129 | |
130 BIODB.MSMS.DIST.COS <- "cosine" | |
131 BIODB.MSMS.DIST.WCOSINE <- "wcosine" | |
132 BIODB.MSMS.DIST.PKERNEL <- "pkernel" | |
133 BIODB.MSMS.DIST <- c(BIODB.MSMS.DIST.COS, BIODB.MSMS.DIST.WCOSINE, BIODB.MSMS.DIST.PKERNEL) | |
134 | |
135 | |
91 ################# | 136 ################# |
92 # CARDINALITIES # | 137 # CARDINALITIES # |
93 ################# | 138 ################# |
94 | 139 |
95 BIODB.CARD.ONE <- '1' | 140 BIODB.CARD.ONE <- '1' |
96 BIODB.CARD.MANY <- '*' | 141 BIODB.CARD.MANY <- '*' |
142 | |
143 ##################### | |
144 #INTENSITY NOTATIONS# | |
145 ##################### | |
146 | |
147 BIODB.GROUP.INTENSITY<-c(BIODB.PEAK.INTENSITY,BIODB.PEAK.RELATIVE.INTENSITY) | |
97 | 148 |
98 ########################## | 149 ########################## |
99 # ENTRY FIELD ATTRIBUTES # | 150 # ENTRY FIELD ATTRIBUTES # |
100 ########################## | 151 ########################## |
101 | 152 # FIELD NAME CLASS CARDINALITY TYPE |
102 BIODB.FIELDS <- data.frame(matrix(c( | 153 BIODB.FIELDS <- data.frame(matrix(c( |
103 # FIELD NAME CLASS CARDINALITY | 154 BIODB.ACCESSION, 'character', BIODB.CARD.ONE, 'none', |
104 BIODB.COMPOUND, 'BiodEntry', BIODB.CARD.ONE, | 155 BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE, 'none', |
105 BIODB.ACCESSION, 'character', BIODB.CARD.ONE, | 156 BIODB.NAME, 'character', BIODB.CARD.ONE, 'name', |
106 BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE, | 157 BIODB.COMP.IUPAC.NAME.ALLOWED, 'character', BIODB.CARD.ONE, 'name', |
107 BIODB.NAME, 'character', BIODB.CARD.ONE, | 158 BIODB.COMP.IUPAC.NAME.TRAD, 'character', BIODB.CARD.ONE, 'name', |
108 BIODB.FULLNAMES, 'character', BIODB.CARD.MANY, | 159 BIODB.COMP.IUPAC.NAME.SYST, 'character', BIODB.CARD.ONE, 'name', |
109 BIODB.SYNONYMS, 'character', BIODB.CARD.MANY, | 160 BIODB.COMP.IUPAC.NAME.PREF, 'character', BIODB.CARD.ONE, 'name', |
110 BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE, | 161 BIODB.COMP.IUPAC.NAME.CAS, 'character', BIODB.CARD.ONE, 'name', |
111 BIODB.SYMBOL, 'character', BIODB.CARD.ONE, | 162 BIODB.FULLNAMES, 'character', BIODB.CARD.MANY, 'name', |
112 BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY, | 163 BIODB.SYNONYMS, 'character', BIODB.CARD.MANY, 'name', |
113 BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE, | 164 BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE, 'none', |
114 BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE, | 165 BIODB.SYMBOL, 'character', BIODB.CARD.ONE, 'none', |
115 BIODB.KEGG.ID, 'character', BIODB.CARD.ONE, | 166 BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY, 'none', |
116 BIODB.HMDB.ID, 'character', BIODB.CARD.ONE, | 167 BIODB.NB.COMPOUNDS, 'integer', BIODB.CARD.ONE, 'none', |
117 BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE, | 168 BIODB.COMPOUNDS, 'object', BIODB.CARD.MANY, 'none', |
118 BIODB.PUBCHEM.ID, 'character', BIODB.CARD.ONE, | 169 BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE, 'none', |
119 BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE, | 170 BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE, 'none', |
120 BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE, | 171 BIODB.KEGG.ID, 'character', BIODB.CARD.ONE, 'none', |
121 BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE, | 172 BIODB.HMDB.ID, 'character', BIODB.CARD.ONE, 'none', |
122 BIODB.INCHI, 'character', BIODB.CARD.ONE, | 173 BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE, 'none', |
123 BIODB.INCHIKEY, 'character', BIODB.CARD.ONE, | 174 BIODB.PUBCHEMCOMP.ID, 'character', BIODB.CARD.ONE, 'none', |
124 BIODB.MSDEV, 'character', BIODB.CARD.ONE, | 175 BIODB.PUBCHEMSUB.ID, 'character', BIODB.CARD.ONE, 'none', |
125 BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE, | 176 BIODB.PEAKFOREST.ID, 'character', BIODB.CARD.ONE, 'none', |
126 BIODB.MSTYPE, 'character', BIODB.CARD.ONE, | 177 BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE, 'none', |
127 BIODB.MSMODE, 'character', BIODB.CARD.ONE, | 178 BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE, 'none', |
128 BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE, | 179 BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE, 'none', |
129 BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE, | 180 BIODB.INCHI, 'character', BIODB.CARD.ONE, 'none', |
130 BIODB.FORMULA, 'character', BIODB.CARD.ONE, | 181 BIODB.INCHIKEY, 'character', BIODB.CARD.ONE, 'none', |
131 BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE, | 182 BIODB.MSDEV, 'character', BIODB.CARD.ONE, 'none', |
132 BIODB.MASS, 'double', BIODB.CARD.ONE, | 183 BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE, 'none', |
133 BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE, | 184 BIODB.MSTYPE, 'character', BIODB.CARD.ONE, 'none', |
134 BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE, | 185 BIODB.MSMODE, 'character', BIODB.CARD.ONE, 'none', |
135 BIODB.SEQUENCE, 'character', BIODB.CARD.ONE, | 186 BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE, 'none', |
136 BIODB.LENGTH, 'integer', BIODB.CARD.ONE, | 187 BIODB.PEAK.MZTHEO, 'double', BIODB.CARD.ONE, 'none', |
137 BIODB.LOCATION, 'character', BIODB.CARD.ONE, | 188 BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE, 'none', |
138 BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE, | 189 BIODB.FORMULA, 'character', BIODB.CARD.ONE, 'none', |
139 BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE | 190 BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE, 'none', |
140 ), byrow = TRUE, ncol = 3), stringsAsFactors = FALSE) | 191 BIODB.MASS, 'double', BIODB.CARD.ONE, 'none', |
141 colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality') | 192 BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE, 'none', |
193 BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE, 'none', | |
194 BIODB.SEQUENCE, 'character', BIODB.CARD.ONE, 'none', | |
195 BIODB.LENGTH, 'integer', BIODB.CARD.ONE, 'none', | |
196 BIODB.LOCATION, 'character', BIODB.CARD.ONE, 'none', | |
197 BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE, 'none', | |
198 BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE, 'none', | |
199 BIODB.SMILES, 'character', BIODB.CARD.ONE, 'none', | |
200 BIODB.CHEMSPIDER.ID, 'character', BIODB.CARD.ONE, 'none', | |
201 BIODB.CAS.ID, 'character', BIODB.CARD.ONE, 'none' | |
202 ), byrow = TRUE, ncol = 4), stringsAsFactors = FALSE) | |
203 colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality', 'type') | |
204 | |
205 ######################### | |
206 # GET DATABASE ID FIELD # | |
207 ######################### | |
208 | |
209 biodb.get.database.id.field <- function(database) { | |
210 | |
211 id.field <- NA_character_ | |
212 | |
213 if (database %in% BIODB.DATABASES) { | |
214 id.field <- paste0(database, 'id') | |
215 if ( ! id.field %in% BIODB.FIELDS[['name']]) | |
216 stop(paste0('No ID field defined for database ', database, '.')) | |
217 } | |
218 | |
219 return(id.field) | |
220 } | |
142 | 221 |
143 ##################### | 222 ##################### |
144 # COMPUTABLE FIELDS # | 223 # COMPUTABLE FIELDS # |
145 ##################### | 224 ##################### |
146 | 225 |
151 | 230 |
152 #################### | 231 #################### |
153 # PEAKS DATA FRAME # | 232 # PEAKS DATA FRAME # |
154 #################### | 233 #################### |
155 | 234 |
156 # Columns | |
157 BIODB.PEAK.MZ <- 'mz' | |
158 BIODB.PEAK.FORMULA <- 'formula' | |
159 BIODB.PEAK.FORMULA.COUNT <- 'formula.count' | |
160 BIODB.PEAK.MASS <- 'mass' | |
161 BIODB.PEAK.ERROR.PPM <- 'error.ppm' | |
162 BIODB.PEAK.INTENSITY <- 'intensity' | |
163 BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity' | |
164 | |
165 # Example | 235 # Example |
166 BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) | 236 BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) |
167 colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM) | 237 colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM) |
168 | 238 |
169 ################# | 239 ################# |
170 # GET ENTRY URL # | 240 # GET ENTRY URL # |
171 ################# | 241 ################# |
172 | 242 |
173 # TODO Allow choosing between the jp and eu URLs | 243 # TODO Allow choosing between the jp and eu URLs |
174 BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/getRecordInfo" | 244 BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/" |
175 BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/getRecordInfo" | 245 BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/" |
176 | 246 |
177 .do.get.entry.url <- function(class, accession, content.type = BIODB.ANY) { | 247 .do.get.entry.url <- function(class, accession, content.type = BIODB.HTML, base.url = NA_character_, token = NA_character_) { |
178 | 248 |
179 # TODO Only Massbank can handle multiple accession ids | 249 # Only certain databases can handle multiple accession ids |
180 if (class != 'massbank' && length(accession) > 1) | 250 if ( ! class %in% c(BIODB.MASSBANK, BIODB.CHEMSPIDER, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.PEAKFOREST) && length(accession) > 1) |
181 stop(paste0("Cannot build a URL for getting multiple entries for class ", class, ".")) | 251 stop(paste0("Cannot build a URL for getting multiple entries for class ", class, ".")) |
182 | 252 |
253 # Get URL | |
183 url <- switch(class, | 254 url <- switch(class, |
184 chebi = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, | 255 chebi = if (content.type == BIODB.HTML) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, |
185 chemspider = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html') else NULL, | 256 chemspider = { |
186 enzyme = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, | 257 token.param <- if (is.na(token)) '' else paste('&token', token, sep = '=') |
258 switch(content.type, | |
259 html = paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html'), | |
260 xml = paste0('http://www.chemspider.com/MassSpecAPI.asmx/GetExtendedCompoundInfoArray?', paste(paste0('CSIDs=', accession), collapse = '&'), token.param), | |
261 NULL) | |
262 }, | |
263 enzyme = if (content.type == BIODB.TXT) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, | |
187 hmdb = switch(content.type, | 264 hmdb = switch(content.type, |
188 xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'), | 265 xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'), |
189 html = paste0('http://www.hmdb.ca/metabolites/', accession), | 266 html = paste0('http://www.hmdb.ca/metabolites/', accession), |
190 any = paste0('http://www.hmdb.ca/metabolites/', accession), | |
191 NULL), | 267 NULL), |
192 kegg = switch(content.type, | 268 kegg = switch(content.type, |
193 txt = paste0('http://rest.kegg.jp/get/', accession), | 269 txt = paste0('http://rest.kegg.jp/get/', accession), |
194 html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), | 270 html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), |
195 any = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), | |
196 NULL), | 271 NULL), |
197 lipidmaps = if (content.type %in% c(BIODB.ANY, BIODB.CSV)) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, | 272 lipidmaps = if (content.type == BIODB.CSV) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, |
198 massbank = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0(BIODB.MASSBANK.EU.WS.URL, '?ids=', paste(accession, collapse = ',')) else NULL, | 273 massbank = if (content.type == BIODB.TXT) paste0((if (is.na(base.url)) BIODB.MASSBANK.EU.WS.URL else base.url), 'getRecordInfo?ids=', paste(accession, collapse = ',')) else NULL, |
199 mirbase = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, | 274 mirbase = if (content.type == BIODB.HTML) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, |
200 pubchem = { | 275 pubchemcomp = switch(content.type, |
201 accession <- gsub(' ', '', accession, perl = TRUE) | 276 xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/', paste(accession, collapse = ','), '/XML'), |
202 accession <- gsub('^CID', '', accession, perl = TRUE) | |
203 switch(content.type, | |
204 xml = paste0('http://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/', accession, '/XML/?response_type=save&response_basename=CID_', accession), | |
205 html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession), | 277 html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession), |
206 NULL) | 278 NULL), |
207 }, | 279 pubchemsub = switch(content.type, |
208 ncbigene = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, | 280 xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/', paste(accession, collapse = ','), '/XML'), |
209 ncbiccds = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), | 281 html = paste0('http://pubchem.ncbi.nlm.nih.gov/substance/', accession), |
210 uniprot = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), | 282 NULL), |
283 ncbigene = if (content.type == BIODB.XML) paste0('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, | |
284 ncbiccds = if (content.type == BIODB.HTML) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), | |
285 uniprot = if (content.type == BIODB.XML) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), | |
286 peakforest = switch(content.type, | |
287 html= paste0('https://peakforest.org/home?PFs=',accession), | |
288 json= paste0('https://peakforest-alpha.inra.fr/rest/spectra/lcms/ids/',paste(accession,sep=','),'?token=',token), | |
289 | |
211 NULL | 290 NULL |
212 ) | 291 ) |
213 | 292 ) |
214 return(url) | 293 return(url) |
215 } | 294 } |
216 | 295 |
217 get.entry.url <- function(class, accession, content.type = BIODB.ANY, max.length = 0) { | 296 get.entry.url <- function(class, accession, content.type = BIODB.HTML, max.length = 0, base.url = NA_character_, token = NA_character_) { |
218 | 297 |
219 if (length(accession) == 0) | 298 if (length(accession) == 0) |
220 return(NULL) | 299 return(NULL) |
221 | 300 |
222 full.url <- .do.get.entry.url(class, accession, content.type = content.type) | 301 full.url <- .do.get.entry.url(class, accession, content.type = content.type, base.url = base.url, token = token) |
223 if (max.length == 0 || nchar(full.url) <= max.length) | 302 if (max.length == 0 || nchar(full.url) <= max.length) |
224 return(if (max.length == 0) full.url else list(url = full.url, n = length(accession))) | 303 return(if (max.length == 0) full.url else list(url = full.url, n = length(accession))) |
225 | 304 |
226 # Find max size URL | 305 # Find max size URL |
227 a <- 1 | 306 a <- 1 |
228 b <- length(accession) | 307 b <- length(accession) |
229 while (a < b) { | 308 while (a < b) { |
230 m <- as.integer((a + b) / 2) | 309 m <- as.integer((a + b) / 2) |
231 url <- .do.get.entry.url(class, accession[1:m], content.type = content.type) | 310 url <- .do.get.entry.url(class, accession[1:m], content.type = content.type, base.url = base.url, token = token) |
232 if (nchar(url) <= max.length && m != a) | 311 if (nchar(url) <= max.length && m != a) |
233 a <- m | 312 a <- m |
234 else | 313 else |
235 b <- m | 314 b <- m |
236 } | 315 } |
237 url <- .do.get.entry.url(class, accession[1:a], content.type = content.type) | 316 url <- .do.get.entry.url(class, accession[1:a], content.type = content.type, base.url = base.url, token = token) |
238 | 317 |
239 return(list( url = url, n = a)) | 318 return(list( url = url, n = a)) |
240 } | 319 } |
241 | 320 |
242 ################# | 321 ################# |
248 | 327 |
249 .print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) { | 328 .print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) { |
250 cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr()) | 329 cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr()) |
251 } | 330 } |
252 | 331 |
332 ##################### | |
333 # BIODB GET ENV VAR # | |
334 ##################### | |
335 | |
336 .biodb.get.env.var <- function(v) { | |
337 | |
338 # Get all env vars | |
339 env <- Sys.getenv() | |
340 | |
341 # Make env var name | |
342 env.var <- paste(c('BIODB', toupper(v)), collapse = '_') | |
343 | |
344 # Look if this env var exists | |
345 if (env.var %in% names(env)) | |
346 return(env[[env.var]]) | |
347 | |
348 return(NA_character_) | |
349 } | |
253 } | 350 } |