comparison MassFiledbConn.R @ 2:20d69a062da3 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author prog
date Thu, 02 Mar 2017 08:55:00 -0500
parents 253d531a0193
children fb9c0409d85c
comparison
equal deleted inserted replaced
1:253d531a0193 2:20d69a062da3
1 if ( ! exists('MassFiledbConn')) { 1 # LCMS File db.
2 2 # In this type of database, a single file is provided in CSV format. Default separator is tabulation.
3 source('MassdbConn.R') 3 # Each line is a MS peak measure, .
4 4 # The file contains molecule and spectrum information. Each spectrum has an accession id.
5 # LCMS File db. 5
6 # In this type of database, a single file is provided in CSV format. Default separator is tabulation. 6 # TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue
7 # Each line is a MS peak measure, . 7
8 # The file contains molecule and spectrum information. Each spectrum has an accession id. 8 #############
9 9 # CONSTANTS #
10 # TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue 10 #############
11 11
12 ############# 12 # Default database fields
13 # CONSTANTS # 13 .BIODB.DFT.DB.FIELDS <- list()
14 ############# 14 for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZEXP, BIODB.PEAK.MZTHEO, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS))
15 15 .BIODB.DFT.DB.FIELDS[[f]] <- f
16 # Default database fields 16
17 .BIODB.DFT.DB.FIELDS <- list() 17 #####################
18 for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZ, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS)) 18 # CLASS DECLARATION #
19 .BIODB.DFT.DB.FIELDS[[f]] <- f 19 #####################
20 20
21 ##################### 21 MassFiledbConn <- methods::setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .db.orig.colnames = "character", .fields = "list", .ms.modes = "character"))
22 # CLASS DECLARATION # 22
23 ##################### 23 ###############
24 24 # CONSTRUCTOR #
25 MassFiledbConn <- setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .fields = "list", .ms.modes = "character")) 25 ###############
26 26
27 ############### 27 MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) {
28 # CONSTRUCTOR # 28
29 ############### 29 # Check file
30 30 (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.")
31 MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) { 31 file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\"."))
32 32
33 # Check file 33 # Set fields
34 (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.") 34 .db <<- NULL
35 file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\".")) 35 .db.orig.colnames <<- NA_character_
36 36 .file <<- file
37 # Set fields 37 .file.sep <<- file.sep
38 .db <<- NULL 38 .file.quote <<- file.quote
39 .file <<- file 39 .fields <<- .BIODB.DFT.DB.FIELDS
40 .file.sep <<- file.sep 40 .field.multval.sep <<- ';'
41 .file.quote <<- file.quote 41 .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS)
42 .fields <<- .BIODB.DFT.DB.FIELDS 42 names(.self$.ms.modes) <- .self$.ms.modes
43 .field.multval.sep <<- ';' 43
44 .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS) 44 callSuper(...)
45 names(.self$.ms.modes) <- .self$.ms.modes 45 })
46 46
47 callSuper(...) 47 ######################
48 }) 48 # Is valid field tag #
49 49 ######################
50 ###################### 50
51 # Is valid field tag # 51 MassFiledbConn$methods( isValidFieldTag = function(tag) {
52 ###################### 52 return (tag %in% names(.self$.fields))
53 53 })
54 MassFiledbConn$methods( isValidFieldTag = function(tag) { 54
55 return (tag %in% names(.self$.fields)) 55 ###########
56 }) 56 # INIT DB #
57 57 ###########
58 ############# 58
59 # Set field # 59 MassFiledbConn$methods( .init.db = function() {
60 ############# 60
61 61 if (is.null(.self$.db)) {
62 MassFiledbConn$methods( setField = function(tag, colname) { 62
63 63 # Load database
64 ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.") 64 .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL)
65 ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.") 65
66 66 # Save column names
67 # Load database file 67 .db.orig.colnames <<- colnames(.self$.db)
68 .self$.init.db() 68 }
69 69 })
70 # Check that this field tag is defined in the fields list 70
71 .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid.")) 71 #############
72 72 # Set field #
73 # Check that columns are defined in database file 73 #############
74 all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file.")) 74
75 75 MassFiledbConn$methods( setField = function(tag, colname) {
76 # Set new definition 76
77 if (length(colname) == 1) 77 ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.")
78 .fields[[tag]] <<- colname 78 ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.")
79 else { 79
80 new.col <- paste(colname, collapse = ".") 80 # Load database file
81 .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '') 81 .self$.init.db()
82 .fields[[tag]] <<- new.col 82
83 } 83 # Check that this field tag is defined in the fields list
84 }) 84 .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid."))
85 85
86 ###################################### 86 # Check that columns are defined in database file
87 # SET FIELD MULTIPLE VALUE SEPARATOR # 87 all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file."))
88 ###################################### 88
89 89 # Set new definition
90 MassFiledbConn$methods( setFieldMultValSep = function(sep) { 90 if (length(colname) == 1)
91 .field.multval.sep <<- sep 91 .fields[[tag]] <<- colname
92 }) 92 else {
93 93 new.col <- paste(colname, collapse = ".")
94 ################ 94 .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '')
95 # SET MS MODES # 95 .fields[[tag]] <<- new.col
96 ################ 96 }
97 97
98 MassFiledbConn$methods( setMsMode = function(mode, value) { 98 # Update data frame column names
99 .self$.ms.modes[[mode]] <- value 99 colnames(.self$.db) <- vapply(.self$.db.orig.colnames, function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
100 }) 100 })
101 101
102 ########################## 102 ######################################
103 # GET ENTRY CONTENT TYPE # 103 # SET FIELD MULTIPLE VALUE SEPARATOR #
104 ########################## 104 ######################################
105 105
106 MassFiledbConn$methods( getEntryContentType = function(type) { 106 MassFiledbConn$methods( setFieldMultValSep = function(sep) {
107 return(BIODB.DATAFRAME) 107 .field.multval.sep <<- sep
108 }) 108 })
109 109
110 ########### 110 ################
111 # INIT DB # 111 # SET MS MODES #
112 ########### 112 ################
113 113
114 MassFiledbConn$methods( .init.db = function() { 114 MassFiledbConn$methods( setMsMode = function(mode, value) {
115 115 .self$.ms.modes[[mode]] <- value
116 if (is.null(.self$.db)) { 116 })
117 117
118 # Load database 118 ##########################
119 .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL) 119 # GET ENTRY CONTENT TYPE #
120 120 ##########################
121 # Rename columns 121
122 colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') 122 MassFiledbConn$methods( getEntryContentType = function(type) {
123 } 123 return(BIODB.DATAFRAME)
124 }) 124 })
125 125
126 ################ 126 ################
127 # CHECK FIELDS # 127 # CHECK FIELDS #
128 ################ 128 ################
129 129
130 MassFiledbConn$methods( .check.fields = function(fields) { 130 MassFiledbConn$methods( .check.fields = function(fields) {
131 131
132 # Check if fields are known 132 if (length(fields) ==0 || (length(fields) == 1 && is.na(fields)))
133 unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)] 133 return
134 if (length(unknown.fields) > 0) 134
135 stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown.")) 135 # Check if fields are known
136 136 unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)]
137 # Init db 137 if (length(unknown.fields) > 0)
138 .self$.init.db() 138 stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown."))
139 139
140 # Check if fields are defined in file database 140 # Init db
141 undefined.fields <- colnames(.self$.init.db)[ ! unlist(.self$.fields[fields]) %in% colnames(.self$.init.db)] 141 .self$.init.db()
142 if (length(undefined.fields) > 0) 142
143 stop(paste0("Column(s) ", paste(unlist(.self$.fields[fields]), collapse = ", "), " is/are undefined in file database.")) 143 # Check if fields are defined in file database
144 }) 144 undefined.fields <- colnames(.self$.db)[ ! fields %in% colnames(.self$.db)]
145 145 if (length(undefined.fields) > 0)
146 ################ 146 stop(paste0("Column(s) ", paste(fields), collapse = ", "), " is/are undefined in file database.")
147 # EXTRACT COLS # 147 })
148 ################ 148
149 149 ##########
150 MassFiledbConn$methods( .extract.cols = function(cols, mode = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) { 150 # SELECT #
151 151 ##########
152 x <- NULL 152
153 153 # Select data from database
154 if ( ! is.null(cols) && ! is.na(cols)) { 154 MassFiledbConn$methods( .select = function(cols = NULL, mode = NULL, compound.ids = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) {
155 155
156 # Init db 156 x <- NULL
157 .self$.init.db() 157
158 158 # Init db
159 # TODO check existence of cols/fields 159 .self$.init.db()
160 160
161 # Get db, eventually filtering it. 161 # Get db
162 if (is.null(mode)) 162 db <- .self$.db
163 db <- .self$.db 163
164 else { 164 # Filter db on mode
165 # Check mode value 165 if ( ! is.null(mode) && ! is.na(mode)) {
166 mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'.")) 166
167 .self$.check.fields(BIODB.MSMODE) 167 # Check mode value
168 168 mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'."))
169 # Filter on mode 169 .self$.check.fields(BIODB.MSMODE)
170 db <- .self$.db[.self$.db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ] 170
171 } 171 # Filter on mode
172 172 db <- db[db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ]
173 # Get subset 173 }
174 x <- db[, unlist(.self$.fields[cols]), drop = drop] 174
175 175 # Filter db on compound ids
176 # Rename columns 176 # TODO
177 if (is.data.frame(x)) 177
178 colnames(x) <- cols 178 if ( ! is.null(cols) && ! is.na(cols))
179 179 .self$.check.fields(cols)
180 # Rearrange 180
181 if (drop && is.vector(x)) { 181 # Get subset
182 if (uniq) 182 if (is.null(cols) || is.na(cols))
183 x <- x[ ! duplicated(x)] 183 x <- db
184 if (sort) 184 else
185 x <- sort(x) 185 x <- db[, unlist(.self$.fields[cols]), drop = drop]
186 } 186
187 187 # Rearrange
188 # Cut 188 if (drop && is.vector(x)) {
189 if ( ! is.na(max.rows)) 189 if (uniq)
190 x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ] 190 x <- x[ ! duplicated(x)]
191 } 191 if (sort)
192 192 x <- sort(x)
193 return(x) 193 }
194 }) 194
195 195 # Cut
196 ################# 196 if ( ! is.na(max.rows))
197 # GET ENTRY IDS # 197 x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ]
198 ################# 198
199 199 return(x)
200 MassFiledbConn$methods( getEntryIds = function(type) { 200 })
201 201
202 ids <- NA_character_ 202 #################
203 203 # GET ENTRY IDS #
204 if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND)) 204 #################
205 ids <- as.character(.self$.extract.cols(if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE)) 205
206 206 MassFiledbConn$methods( getEntryIds = function(type) {
207 return(ids) 207
208 }) 208 ids <- NA_character_
209 209
210 ################## 210 if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND))
211 # GET NB ENTRIES # 211 ids <- as.character(.self$.select(cols = if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE))
212 ################## 212
213 213 return(ids)
214 MassFiledbConn$methods( getNbEntries = function(type) { 214 })
215 return(length(.self$getEntryIds(type))) 215
216 }) 216 ##################
217 217 # GET NB ENTRIES #
218 ############################### 218 ##################
219 # GET CHROMATOGRAPHIC COLUMNS # 219
220 ############################### 220 MassFiledbConn$methods( getNbEntries = function(type) {
221 221 return(length(.self$getEntryIds(type)))
222 # Inherited from MassdbConn. 222 })
223 MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) { 223
224 224 ###############################
225 # Extract needed columns 225 # GET CHROMATOGRAPHIC COLUMNS #
226 db <- .self$.extract.cols(c(BIODB.COMPOUND.ID, BIODB.CHROM.COL)) 226 ###############################
227 227
228 # Filter on molecule IDs 228 # Inherited from MassdbConn.
229 if ( ! is.null(compound.ids)) 229 MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) {
230 db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ] 230
231 231 # Extract needed columns
232 # Get column names 232 db <- .self$.select(cols = c(BIODB.COMPOUND.ID, BIODB.CHROM.COL))
233 cols <- db[[BIODB.CHROM.COL]] 233
234 234 # Filter on molecule IDs
235 # Remove duplicates 235 if ( ! is.null(compound.ids))
236 cols <- cols[ ! duplicated(cols)] 236 db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ]
237 237
238 # Make data frame 238 # Get column names
239 chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE) 239 cols <- db[[BIODB.CHROM.COL]]
240 colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE) 240
241 241 # Remove duplicates
242 return(chrom.cols) 242 cols <- cols[ ! duplicated(cols)]
243 }) 243
244 244 # Make data frame
245 ################# 245 chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE)
246 # GET MZ VALUES # 246 colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE)
247 ################# 247
248 248 return(chrom.cols)
249 # Inherited from MassdbConn. 249 })
250 MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { 250
251 251 #################
252 # Get mz values 252 # GET MZ VALUES #
253 mz <- .self$.extract.cols(BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results) 253 #################
254 254
255 return(mz) 255 # Inherited from MassdbConn.
256 }) 256 MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
257 257
258 } 258 # Get mz values
259 mz <- .self$.select(cols = BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results)
260
261 return(mz)
262 })
263
264 ################
265 # GET NB PEAKS #
266 ################
267
268 # Inherited from MassdbConn.
269 MassFiledbConn$methods( getNbPeaks = function(mode = NULL, compound.ids = NULL) {
270
271 # Get peaks
272 peaks <- .self$.select(cols = BIODB.PEAK.MZTHEO, mode = mode, compound.ids = compound.ids)
273
274 return(length(peaks))
275 })