Mercurial > repos > prog > lcmsmatching
comparison MsXlsDb.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
| author | prog |
|---|---|
| date | Tue, 12 Jul 2016 12:02:37 -0400 |
| parents | |
| children | 20d69a062da3 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e66bb061af06 |
|---|---|
| 1 if ( ! exists('MsXlsDb')) { # Do not load again if already loaded | |
| 2 | |
| 3 library('methods') | |
| 4 library('stringr') | |
| 5 source('msdb-common.R') | |
| 6 source('MsDb.R') | |
| 7 source('strhlp.R', chdir = TRUE) | |
| 8 source('dfhlp.R', chdir = TRUE) | |
| 9 source('search.R', chdir = TRUE) | |
| 10 source('excelhlp.R', chdir = TRUE) | |
| 11 | |
| 12 ############# | |
| 13 # CONSTANTS # | |
| 14 ############# | |
| 15 | |
| 16 .THIS.FILE.PATH <- getwd() # We suppose that the file has been sourced with the option chdir = TRUE | |
| 17 | |
| 18 .XLS_PEAKS_ROW_OFFSET <- 8 | |
| 19 .XLS_PEAKS_RT_COL_START <- 11 | |
| 20 .XLS_MSPOS_TAB <- 'MS_POS' | |
| 21 .XLS_MSNEG_TAB <- 'MS_NEG' | |
| 22 .XLS_MZ_COL <- 1 | |
| 23 .XLS_INTENSITY_COL <- 2 | |
| 24 .XLS_RELATIVE_COL <- 3 | |
| 25 .XLS_THEORETICAL_MZ_COL <- 5 | |
| 26 .XLS_COMPOSITION_COL <- 8 | |
| 27 .XLS_ATTRIBUTION_COL <- 9 | |
| 28 | |
| 29 ##################### | |
| 30 # CLASS DECLARATION # | |
| 31 ##################### | |
| 32 | |
| 33 MsXlsDb <- setRefClass("MsXlsDb", contains = "MsDb", fields = list(.mz.index = "ANY", .name_index = "ANY", .db_dir = "character", .limit = "numeric", .files = "ANY", .cache_dir = "character", .db = "ANY")) | |
| 34 | |
| 35 ############### | |
| 36 # CONSTRUCTOR # | |
| 37 ############### | |
| 38 | |
| 39 MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) { | |
| 40 | |
| 41 # Initialize members | |
| 42 .db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_ | |
| 43 .limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_ | |
| 44 cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir | |
| 45 .cache_dir <<- if ( cache || ! is.null(cache_dir)) cache_dir else NA_character_ | |
| 46 .files <<- NULL | |
| 47 .db <<- NULL | |
| 48 .mz.index <<- NULL | |
| 49 .name_index <<- NULL | |
| 50 | |
| 51 callSuper(...) | |
| 52 }) | |
| 53 | |
| 54 #################### | |
| 55 # GET MOLECULE IDS # | |
| 56 #################### | |
| 57 | |
| 58 MsXlsDb$methods( getMoleculeIds = function() { | |
| 59 | |
| 60 # Init file list | |
| 61 .self$.init.file.list() | |
| 62 | |
| 63 # Get IDs | |
| 64 mol.ids <- as.integer(which( ! is.na(.self$.files))) | |
| 65 | |
| 66 return(mol.ids) | |
| 67 }) | |
| 68 | |
| 69 #################### | |
| 70 # GET NB MOLECULES # | |
| 71 #################### | |
| 72 | |
| 73 # Returns a list of all molecule names | |
| 74 MsXlsDb$methods( getNbMolecules = function() { | |
| 75 return(length(.self$getMoleculeIds())) | |
| 76 }) | |
| 77 | |
| 78 ##################### | |
| 79 # GET MOLECULE NAME # | |
| 80 ##################### | |
| 81 | |
| 82 MsXlsDb$methods( getMoleculeName = function(molid) { | |
| 83 return(vapply(molid, function(m) .self$.get.mol.name(m), FUN.VALUE = "")) | |
| 84 }) | |
| 85 | |
| 86 ############################### | |
| 87 # GET CHROMATOGRAPHIC COLUMNS # | |
| 88 ############################### | |
| 89 | |
| 90 # Returns a list of all chromatographic columns used | |
| 91 MsXlsDb$methods( getChromCol = function(molid = NULL) { | |
| 92 | |
| 93 cn <- character() | |
| 94 | |
| 95 # If no molecule IDs provided, then look at all molecules | |
| 96 if (is.null(molid)) | |
| 97 molid <- .self$getMoleculeIds() | |
| 98 | |
| 99 # Loop on molecules | |
| 100 for (mid in molid) { | |
| 101 | |
| 102 rt <- .self$getRetentionTimes(mid) | |
| 103 | |
| 104 if ( ! is.null(rt)) | |
| 105 cn <- c(cn, names(rt)) | |
| 106 } | |
| 107 | |
| 108 # Remove duplicates | |
| 109 cn <- cn[ ! duplicated(cn)] | |
| 110 | |
| 111 # Make data frame | |
| 112 cn <- data.frame(id = cn, title = cn, stringsAsFactors = FALSE) | |
| 113 | |
| 114 return(cn) | |
| 115 }) | |
| 116 | |
| 117 ################ | |
| 118 # FIND BY NAME # | |
| 119 ################ | |
| 120 | |
| 121 MsXlsDb$methods( findByName = function(name) { | |
| 122 | |
| 123 # NULL entry | |
| 124 if (is.null(name)) | |
| 125 return(NA_integer_) | |
| 126 | |
| 127 # Initialize output list | |
| 128 ids <- NULL | |
| 129 | |
| 130 for (n in name) { | |
| 131 | |
| 132 id <- NA_integer_ | |
| 133 | |
| 134 if ( ! is.na(n)) { | |
| 135 | |
| 136 # Get index | |
| 137 index <- .self$.get.name.index() | |
| 138 | |
| 139 # Search for name in index | |
| 140 i <- binary.search(toupper(n), index[['name']]) | |
| 141 | |
| 142 id <- if (is.na(i)) NA_integer_ else index[i, 'id'] | |
| 143 } | |
| 144 | |
| 145 ids <- c(ids, id) | |
| 146 } | |
| 147 | |
| 148 return(ids) | |
| 149 }) | |
| 150 | |
| 151 ####################### | |
| 152 # GET RETENTION TIMES # | |
| 153 ####################### | |
| 154 | |
| 155 MsXlsDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
| 156 | |
| 157 if (is.null(molid) || is.na(molid)) | |
| 158 return(NULL) | |
| 159 | |
| 160 # Find it in memory | |
| 161 rt <- .self$.mem.get(molid, 'rt') | |
| 162 | |
| 163 if (is.null(rt)) { | |
| 164 | |
| 165 # Call observers | |
| 166 if ( ! is.null(.self$.observers)) | |
| 167 for (obs in .self$.observers) | |
| 168 obs$progress(paste0("Loading retention times of file", .self$.get.file(molid), "."), level = 2) | |
| 169 | |
| 170 rt <- NULL | |
| 171 | |
| 172 # Load from cache file | |
| 173 cache_file <- NA_character_ | |
| 174 if ( ! is.na(.self$.get.cache.dir())) { | |
| 175 cache_file <- file.path(.self$.get.cache.dir(), paste0('rt-', molid, '.bin')) | |
| 176 if (file.exists(cache_file)) | |
| 177 load(file = cache_file) # load rt | |
| 178 } | |
| 179 | |
| 180 if (is.null(rt)) { | |
| 181 | |
| 182 # Get retention times of both positive and negative mode tabs | |
| 183 mspos_rt <- .self$.parse_retention_times(molid, .XLS_MSPOS_TAB) | |
| 184 msneg_rt <- .self$.parse_retention_times(molid, .XLS_MSNEG_TAB) | |
| 185 | |
| 186 # Retention times stored in negative and positive modes | |
| 187 if ( ! is.null(mspos_rt) && ! is.null(msneg_rt)) { | |
| 188 | |
| 189 # Warn observers when both retention time lists are not identical | |
| 190 if ( ! identical(mspos_rt, msneg_rt)) | |
| 191 for (obs in .self$.observers) | |
| 192 obs$warning(paste0("Retention times in negative and positive modes are different in file ", .self$.get.file(molid), ".")) | |
| 193 | |
| 194 # Merge both lists | |
| 195 rt <- mspos_rt | |
| 196 for (c in names(msneg_rt)) | |
| 197 if (c %in% names(rt)) { | |
| 198 v <- c(rt[[c]], msneg_rt[[c]]) | |
| 199 rt[[c]] <- v[ ! duplicated(v)] | |
| 200 } | |
| 201 else | |
| 202 rt[[c]] <- msneg_rt[[c]] | |
| 203 } | |
| 204 else | |
| 205 # Set retention times | |
| 206 rt <- if (is.null(mspos_rt)) msneg_rt else mspos_rt | |
| 207 | |
| 208 if (is.null(rt)) rt <- list() | |
| 209 | |
| 210 # Write in cache | |
| 211 if ( ! is.na(cache_file)) { | |
| 212 | |
| 213 # Call observers | |
| 214 if ( ! is.null(.self$.observers)) | |
| 215 for (obs in .self$.observers) | |
| 216 obs$progress(paste0("Caching retention times of file ", .self$.get.file(molid), ".")) | |
| 217 | |
| 218 save(rt, file = cache_file) | |
| 219 } | |
| 220 } | |
| 221 | |
| 222 # Store in memory | |
| 223 .self$.mem.set(rt, molid, 'rt') | |
| 224 } | |
| 225 | |
| 226 # Select only one column if asked | |
| 227 if ( ! is.na(col)) rt <- rt[[col]] | |
| 228 | |
| 229 return(rt) | |
| 230 }) | |
| 231 | |
| 232 ################# | |
| 233 # GET NB PEAKS # | |
| 234 ################# | |
| 235 | |
| 236 MsXlsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
| 237 | |
| 238 # Initialize parameters | |
| 239 if (is.null(molid) || (length(molid) == 1 && is.na(molid))) | |
| 240 molid <- .self$getMoleculeIds() | |
| 241 if (is.na(type)) | |
| 242 type <- c(MSDB.TAG.POS, MSDB.TAG.NEG) | |
| 243 | |
| 244 return(sum(vapply(molid, function(m) { if (is.na(m)) 0 else sum(vapply(type, function(t) { peaks <- .self$.get.peaks(m, t) ; if (is.null(peaks)) 0 else nrow(peaks) }, FUN.VALUE = 1)) }, FUN.VALUE = 1))) | |
| 245 }) | |
| 246 | |
| 247 ################## | |
| 248 # GET PEAK TABLE # | |
| 249 ################## | |
| 250 | |
| 251 MsXlsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) { | |
| 252 | |
| 253 peaks <- NULL | |
| 254 | |
| 255 # Set default molecule IDs | |
| 256 if (is.null(molid) || (length(molid) == 1 && is.na(molid))) | |
| 257 molid <- .self$getMoleculeIds() | |
| 258 | |
| 259 # Set default modes | |
| 260 if (is.null(mode) || (length(mode) == 1 && is.na(mode))) | |
| 261 mode <- c(MSDB.TAG.POS, MSDB.TAG.NEG) | |
| 262 | |
| 263 # Loop on all molecules | |
| 264 for (mol in molid) { | |
| 265 | |
| 266 # Loop on all modes | |
| 267 for (mod in mode) { | |
| 268 m.peaks <- .self$.get.peaks(mol, mod) | |
| 269 if ( ! is.null(m.peaks) && nrow(m.peaks) > 0) { | |
| 270 m.peaks[[MSDB.TAG.MOLID]] <- mol | |
| 271 m.peaks[[MSDB.TAG.MODE]] <- mod | |
| 272 peaks <- if (is.null(peaks)) m.peaks else rbind(peaks, m.peaks) | |
| 273 peaks <- df.move.col.first(peaks, c(MSDB.TAG.MOLID, MSDB.TAG.MODE)) | |
| 274 } | |
| 275 } | |
| 276 } | |
| 277 | |
| 278 return(peaks) | |
| 279 }) | |
| 280 | |
| 281 ################# | |
| 282 # GET MZ VALUES # | |
| 283 ################# | |
| 284 | |
| 285 # Returns a numeric vector of all masses stored inside the database. | |
| 286 MsXlsDb$methods( getMzValues = function(mode = NULL) { | |
| 287 | |
| 288 mz <- numeric() | |
| 289 | |
| 290 # Get all mz values of all molecules | |
| 291 for(molid in .self$getMoleculeIds()) | |
| 292 for (m in (if (is.null(mode) || is.na(mode)) c(MSDB.TAG.POS, MSDB.TAG.NEG) else mode)) | |
| 293 mz <- c(mz, .self$.get.peaks(molid, m)[[MSDB.TAG.MZTHEO]]) | |
| 294 | |
| 295 # Remove duplicated | |
| 296 mz <- mz[ ! duplicated(mz)] | |
| 297 | |
| 298 return(mz) | |
| 299 }) | |
| 300 | |
| 301 ############# | |
| 302 # GET PEAKS # | |
| 303 ############# | |
| 304 | |
| 305 MsXlsDb$methods( .get.peaks = function(molid, mode) { | |
| 306 | |
| 307 tab <- if (mode == MSDB.TAG.POS) .XLS_MSPOS_TAB else .XLS_MSNEG_TAB | |
| 308 | |
| 309 # Find it in memory | |
| 310 peak_df <- .self$.mem.get(molid, 'peaks', mode) | |
| 311 | |
| 312 if (is.null(peak_df)) { | |
| 313 # Call observers | |
| 314 if ( ! is.null(.self$.observers)) | |
| 315 for (obs in .self$.observers) | |
| 316 obs$progress(paste0("Loading peaks of tab ", tab, " of file ", .self$.get.file(molid), "."), level = 2) | |
| 317 | |
| 318 peak_df <- NULL | |
| 319 | |
| 320 # Load from cache file | |
| 321 cache_file <- NA_character_ | |
| 322 if ( ! is.na(.self$.get.cache.dir())) { | |
| 323 cache_file <- file.path(.self$.get.cache.dir(), paste0('peaks-', molid, '-', tab, '.csv')) | |
| 324 if (file.exists(cache_file)) | |
| 325 peak_df <- read.csv(cache_file, header = TRUE, stringsAsFactors = FALSE) | |
| 326 } | |
| 327 | |
| 328 # Read from XLS file, if not in cache | |
| 329 if (is.null(peak_df)) { | |
| 330 | |
| 331 # Load tab (peaks start at row 8) | |
| 332 if (.self$.tab.exists(.self$.get.file(molid), tab)) { | |
| 333 | |
| 334 peaks <- read.excel(.self$.get.file(molid), tab, start.row = .XLS_PEAKS_ROW_OFFSET, stringsAsFactors = FALSE) | |
| 335 if ( ! is.null(peaks)) | |
| 336 peaks <- peaks[ ! is.na(peaks[.XLS_MZ_COL]), , drop = FALSE] # Remove rows where m/z is not defined. TODO maybe call observer for notify a line with non NA values but without m/z value. | |
| 337 | |
| 338 # Instantiate peaks | |
| 339 if ( ! is.null(peaks) && nrow(peaks) > 0) { | |
| 340 peak_df <- peaks[1:length(peaks[[.XLS_MZ_COL]]), c(.XLS_MZ_COL, .XLS_THEORETICAL_MZ_COL, .XLS_INTENSITY_COL, .XLS_RELATIVE_COL, .XLS_COMPOSITION_COL, .XLS_ATTRIBUTION_COL), drop = FALSE] | |
| 341 colnames(peak_df) <- c(MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.COMP, MSDB.TAG.ATTR) | |
| 342 } | |
| 343 | |
| 344 # Set default data frame (important for cache file writing, because we need a correct header to be written in order for loading) | |
| 345 else { | |
| 346 peak_df <- data.frame(stringsAsFactors = FALSE) | |
| 347 peak_df[MSDB.TAG.MZEXP] <- numeric() | |
| 348 peak_df[MSDB.TAG.MZTHEO] <- numeric() | |
| 349 peak_df[MSDB.TAG.INT] <- numeric() | |
| 350 peak_df[MSDB.TAG.REL] <- numeric() | |
| 351 peak_df[MSDB.TAG.COMP] <- character() | |
| 352 peak_df[MSDB.TAG.ATTR] <- character() | |
| 353 } | |
| 354 | |
| 355 if (is.null(peak_df)) peak_df <- data.frame() | |
| 356 | |
| 357 # Write in cache | |
| 358 if ( ! is.na(cache_file)) { | |
| 359 | |
| 360 # Call observers | |
| 361 if ( ! is.null(.self$.observers)) | |
| 362 for (obs in .self$.observers) | |
| 363 obs$progress(paste0("Caching peaks of tab ", tab, " of file ", .self$.get.file(molid), ".")) | |
| 364 | |
| 365 write.csv(peak_df, cache_file, row.names = FALSE) | |
| 366 } | |
| 367 } | |
| 368 } | |
| 369 | |
| 370 # Store in memory | |
| 371 .self$.mem.set(peak_df, molid, 'peaks', mode) | |
| 372 } | |
| 373 | |
| 374 return(peak_df) | |
| 375 }) | |
| 376 | |
| 377 ############################## | |
| 378 # GET FULL MS PEAK M/Z INDEX # | |
| 379 ############################## | |
| 380 | |
| 381 # Get mz index for full ions, creating it if necessary. | |
| 382 MsXlsDb$methods( .get.mz.index = function(mode) { | |
| 383 | |
| 384 if (is.null(.self$.mz.index[[mode]])) { | |
| 385 | |
| 386 # Initialize data frame | |
| 387 mzi <- data.frame(stringsAsFactors = FALSE) | |
| 388 mzi[MSDB.TAG.MZTHEO] <- numeric() | |
| 389 mzi[MSDB.TAG.MOLID] <- character() | |
| 390 mzi[MSDB.TAG.COMP] <- character() | |
| 391 mzi[MSDB.TAG.ATTR] <- character() | |
| 392 | |
| 393 # Loop on all molecules | |
| 394 for(molid in .self$getMoleculeIds()) { | |
| 395 | |
| 396 # Get all peaks of this molecule | |
| 397 peaks <- .self$.get.peaks(molid, mode) | |
| 398 | |
| 399 # Remove rows whose mz is NA. | |
| 400 peaks <- peaks[ ! is.na(peaks[[MSDB.TAG.MZTHEO]]), ] | |
| 401 | |
| 402 if (nrow(peaks) > 0) { | |
| 403 | |
| 404 # Add id column | |
| 405 peaks[MSDB.TAG.MOLID] <- molid | |
| 406 | |
| 407 # Append peaks | |
| 408 r <- nrow(mzi) + 1 | |
| 409 rows <- r:(r+nrow(peaks)-1) | |
| 410 mzi[rows, ] <- peaks[colnames(mzi)] | |
| 411 } | |
| 412 } | |
| 413 | |
| 414 # Sort by M/Z | |
| 415 sorted_indices <- order(mzi[[MSDB.TAG.MZTHEO]]) | |
| 416 | |
| 417 # Group in a data frame | |
| 418 .self$.mz.index[[mode]] <- mzi[sorted_indices, ] | |
| 419 } | |
| 420 | |
| 421 return(.self$.mz.index[[mode]]) | |
| 422 }) | |
| 423 | |
| 424 ###################### | |
| 425 # SEARCH FOR MZ & RT # | |
| 426 ###################### | |
| 427 | |
| 428 MsXlsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
| 429 | |
| 430 # Search for m/z | |
| 431 results <- .self$.do.search.for.mz(mode, mz.low, mz.high) | |
| 432 | |
| 433 # Filter on attributions | |
| 434 if ( ! is.null(attribs)) { | |
| 435 results <- results[results[[MSDB.TAG.ATTR]] %in% attribs, ] | |
| 436 } | |
| 437 | |
| 438 # Filer on molecule IDs | |
| 439 if ( ! is.null(molids)) { | |
| 440 results <- results[results[[MSDB.TAG.MOLID]] %in% molids, ] | |
| 441 } | |
| 442 | |
| 443 # Use retention time | |
| 444 if ( ! is.null(col) && ! is.null(rt.low) && ! is.null(rt.high)) { | |
| 445 | |
| 446 # Get list of unique IDs | |
| 447 ids <- results[[MSDB.TAG.MOLID]] | |
| 448 ids <- ids[ ! duplicated(ids)] | |
| 449 rt <- .self$.search.for.rt(mols = ids, rt.low = rt.low, rt.high = rt.high, col = col) | |
| 450 results <- results[results[[MSDB.TAG.MOLID]] %in% rt[[MSDB.TAG.MOLID]], ] | |
| 451 results <- merge(results, rt) | |
| 452 } | |
| 453 | |
| 454 return(results) | |
| 455 }) | |
| 456 | |
| 457 ############################## | |
| 458 # SEARCH FOR M/Z IN MS PEAKS # | |
| 459 ############################## | |
| 460 | |
| 461 MsXlsDb$methods( .do.search.for.mz = function(mode, mz.low, mz.high) { | |
| 462 | |
| 463 results <- data.frame(stringsAsFactors = FALSE) | |
| 464 results[MSDB.TAG.MZTHEO] <- numeric() | |
| 465 results[MSDB.TAG.MOLID] <- character() | |
| 466 results[MSDB.TAG.MOLNAMES] <- character() | |
| 467 results[MSDB.TAG.COMP] <- character() | |
| 468 results[MSDB.TAG.ATTR] <- character() | |
| 469 | |
| 470 # Create m/z index | |
| 471 mz_index <- .self$.get.mz.index(mode) | |
| 472 | |
| 473 # Find molecules | |
| 474 low_bound <- binary.search(mz.low, mz_index[[MSDB.TAG.MZTHEO]], lower = FALSE) | |
| 475 high_bound <- binary.search(mz.high, mz_index[[MSDB.TAG.MZTHEO]], lower = TRUE) | |
| 476 | |
| 477 # Get results | |
| 478 if ( ! is.na(high_bound) && ! is.na(low_bound) && low_bound <= high_bound) | |
| 479 results <- mz_index[low_bound:high_bound,] | |
| 480 | |
| 481 # Remove row names | |
| 482 rownames(results) <- NULL | |
| 483 | |
| 484 return(results) | |
| 485 }) | |
| 486 | |
| 487 ################ | |
| 488 # GET MOL NAME # | |
| 489 ################ | |
| 490 | |
| 491 MsXlsDb$methods( .get.mol.name = function(molid) { | |
| 492 | |
| 493 if (is.na(molid)) | |
| 494 return(NA_character_) | |
| 495 | |
| 496 # Find it in memory | |
| 497 name <- .self$.mem.get(molid, 'name') | |
| 498 | |
| 499 if (is.null(name)) { | |
| 500 | |
| 501 # Load molecule | |
| 502 mol <- .self$.load.molecule(molid) | |
| 503 | |
| 504 # Look for name in tabs | |
| 505 for (tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { | |
| 506 hdr <- mol[[tab]][['header']] | |
| 507 if ( ! is.null(hdr)) | |
| 508 name <- hdr[[1]] | |
| 509 if ( ! is.null(name) && ! is.na(name)) break | |
| 510 } | |
| 511 | |
| 512 # Store in memory | |
| 513 if (is.null(name)) name <- NA_character_ | |
| 514 .self$.mem.set(name, molid, 'name') | |
| 515 } | |
| 516 | |
| 517 return(name) | |
| 518 }) | |
| 519 | |
| 520 ################## | |
| 521 # GET NAME INDEX # | |
| 522 ################## | |
| 523 | |
| 524 # Get name index. | |
| 525 MsXlsDb$methods( .get.name.index = function() { | |
| 526 | |
| 527 if (is.null(.self$.name_index)) { | |
| 528 | |
| 529 # Get names | |
| 530 names <- vapply(.self$getMoleculeIds(), function(id) toupper(.self$getMoleculeName(id)), FUN.VALUE = "") | |
| 531 | |
| 532 # Get molecule IDs | |
| 533 id <- .self$getMoleculeIds() | |
| 534 | |
| 535 # Sort by names | |
| 536 sorted_indices <- order(names) | |
| 537 | |
| 538 # Group in a data frame | |
| 539 .self$.name_index <- data.frame(name = rbind(names)[, sorted_indices], | |
| 540 id = rbind(id)[, sorted_indices], | |
| 541 stringsAsFactors = FALSE) | |
| 542 } | |
| 543 | |
| 544 return(.self$.name_index) | |
| 545 }) | |
| 546 | |
| 547 ################## | |
| 548 # INIT FILE LIST # | |
| 549 ################## | |
| 550 | |
| 551 MsXlsDb$methods( .init.file.list = function() { | |
| 552 | |
| 553 if (is.null(.self$.files)) { | |
| 554 | |
| 555 # List all files | |
| 556 files <- Sys.glob(file.path(.self$.db_dir, '*.xls')) | |
| 557 | |
| 558 # Limit the size of the database | |
| 559 if ( ! is.na(.self$.limit)) | |
| 560 files <- head(files, .self$.limit) | |
| 561 | |
| 562 # Get IDs | |
| 563 ids <- vapply(files, function(f) .extract_molecule_id_from_filename(f), FUN.VALUE = 1) | |
| 564 | |
| 565 # Use ids as indices to build the vector of files | |
| 566 .files <<- rep(NA_character_, max(ids)) | |
| 567 .files[ids] <<- files | |
| 568 } | |
| 569 }) | |
| 570 | |
| 571 ################# | |
| 572 # GET CACHE DIR # | |
| 573 ################# | |
| 574 | |
| 575 MsXlsDb$methods( .get.cache.dir = function() { | |
| 576 | |
| 577 if ( ! is.na(.self$.cache_dir) && ! file.exists(.self$.cache_dir)) | |
| 578 dir.create(.self$.cache_dir) | |
| 579 | |
| 580 return(.self$.cache_dir) | |
| 581 }) | |
| 582 | |
| 583 ################# | |
| 584 # LOAD MOLECULE # | |
| 585 ################# | |
| 586 | |
| 587 MsXlsDb$methods( .load.molecule = function(molid) { | |
| 588 | |
| 589 # Init local variables | |
| 590 mol <- NULL | |
| 591 cache_file <- NA_character_ | |
| 592 excel_file <- .self$.get.file(molid) | |
| 593 | |
| 594 # Call observers | |
| 595 if ( ! is.null(.self$.observers)) | |
| 596 for (obs in .self$.observers) | |
| 597 obs$progress(paste0("Loading molecule ", molid, "."), level = 2) | |
| 598 | |
| 599 # Load from cache | |
| 600 if ( ! is.na(.self$.get.cache.dir())) { | |
| 601 cache_file <- file.path(.self$.get.cache.dir(), paste0(molid, '.bin')) | |
| 602 if (file.exists(cache_file)) | |
| 603 load(file = cache_file) # load mol variable | |
| 604 } | |
| 605 | |
| 606 # Load from Excel file & write to cache | |
| 607 if (is.null(mol) && ! is.na(excel_file)) { | |
| 608 | |
| 609 source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. | |
| 610 | |
| 611 # Load from Excel file | |
| 612 for(tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { | |
| 613 | |
| 614 # Test that tab exists | |
| 615 if (.self$.tab.exists(excel_file, tab)) { | |
| 616 header <- read.excel(excel_file, tab, start.row = 1, end.row = .XLS_PEAKS_ROW_OFFSET - 1, header = FALSE, stringsAsFactors = FALSE, trim.values = TRUE, col.index = c(1))[[1]] | |
| 617 peaks <- read.excel(excel_file, tab, start.row = .XLS_PEAKS_ROW_OFFSET) | |
| 618 mol[[tab]] <- list(header = header, peaks = peaks) | |
| 619 } | |
| 620 | |
| 621 # Missing tab | |
| 622 else { | |
| 623 for (obs in .self$.observers) | |
| 624 obs$warning(paste0("No excel tab ", tab, " in file ", excel_file, ".")) | |
| 625 } | |
| 626 } | |
| 627 | |
| 628 # Write in cache | |
| 629 if ( ! is.na(cache_file)) { | |
| 630 | |
| 631 # Call observers | |
| 632 if ( ! is.null(.self$.observers)) | |
| 633 for (obs in .self$.observers) | |
| 634 obs$progress(paste0("Caching file ", excel_file, ".")) | |
| 635 | |
| 636 save(mol, file = cache_file) | |
| 637 } | |
| 638 } | |
| 639 | |
| 640 return(mol) | |
| 641 }) | |
| 642 | |
| 643 ######################## | |
| 644 # DOES EXCEL TAB EXIST # | |
| 645 ######################## | |
| 646 | |
| 647 MsXlsDb$methods( .tab.exists = function(file, tab) { | |
| 648 | |
| 649 source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. | |
| 650 | |
| 651 if ( ! tab.exists(file, tab)) { | |
| 652 | |
| 653 # Warn observers | |
| 654 for (obs in .self$.observers) | |
| 655 obs$warning(paste0("No excel tab ", tab, " in file ", file, ".")) | |
| 656 | |
| 657 return(FALSE) | |
| 658 } | |
| 659 | |
| 660 return(TRUE) | |
| 661 }) | |
| 662 | |
| 663 ######################### | |
| 664 # PARSE RETENTION TIMES # | |
| 665 ######################### | |
| 666 | |
| 667 MsXlsDb$methods( .parse_retention_times = function(id, tab) { | |
| 668 | |
| 669 rt <- NULL | |
| 670 | |
| 671 if (.self$.tab.exists(.self$.get.file(id), tab)) { | |
| 672 peaks <- read.excel(.self$.get.file(id), tab, start.row = .XLS_PEAKS_ROW_OFFSET) | |
| 673 | |
| 674 # Get retention times | |
| 675 if ( ! is.null(peaks) && length(peaks) > 0 && ! is.na(peaks[[1]][[1]])) | |
| 676 for (c in .XLS_PEAKS_RT_COL_START:length(names(peaks))) | |
| 677 if ( ! is.na(peaks[[c]][[1]])) { | |
| 678 | |
| 679 # Check retention times of all different m/z peaks for the same column. | |
| 680 .self$.check_retention_times(id, tab, names(peaks)[[c]], peaks[[c]], sum( ! is.na(peaks[[1]]))) | |
| 681 | |
| 682 # Add retention time | |
| 683 # TODO The column names are transformed through the read.xlsx call. For instance: | |
| 684 # HPLC (C18) 25mn QTOF (Bis) --> HPLC..C18..25mn.QTOF..Bis. | |
| 685 # ZICpHILIC 150*5*2.1 Shimadzu-Exactive-42mn --> ZICpHILIC.150.5.2.1.Shimadzu.Exactive.42mn | |
| 686 # This can be an issue, since we loose the formating. | |
| 687 col_id <- names(peaks)[[c]] | |
| 688 time <- peaks[[c]][[1]] * 60 # Read and convert retention time in seconds. | |
| 689 if (is.null(rt) || ! col_id %in% names(rt)) | |
| 690 rt[[col_id]] <- list(time) | |
| 691 else | |
| 692 rt[[col_id]] <- c(rt[[col_id]], time) | |
| 693 } | |
| 694 } | |
| 695 | |
| 696 return(rt) | |
| 697 }) | |
| 698 | |
| 699 ######################### | |
| 700 # CHECK RETENTION TIMES # | |
| 701 ######################### | |
| 702 | |
| 703 MsXlsDb$methods( .check_retention_times = function(id, tab_name, column_name, rt, n) { | |
| 704 | |
| 705 if (n >= 1 && ! is.null(.self$.observers) && length(.self$.observers) > 0) | |
| 706 | |
| 707 # Check column only if there is at least one value inside | |
| 708 if (sum( ! is.na(rt)) > 0) | |
| 709 | |
| 710 # Loop on all values | |
| 711 for(i in 1:n) { | |
| 712 | |
| 713 # Check that it's defined | |
| 714 if (i > 1 && is.na(rt[[i]])) | |
| 715 for (obs in .self$.observers) | |
| 716 obs$warning(paste0("Retention times undefined for column ", column_name, " at row ", i + .XLS_PEAKS_ROW_OFFSET, " of tab ", tab_name, " in file ", .self$.get.file(id), ".")) | |
| 717 | |
| 718 else if (i > 1) | |
| 719 # Check the value (it must be constant) | |
| 720 if (rt[[i-1]] != rt[[i]]) | |
| 721 for (obs in .self$.observers) | |
| 722 obs$error(paste0("Retention times not constant for column ", column_name, " between row ", i - 1 + .XLS_PEAKS_ROW_OFFSET, " and row ", i + .XLS_PEAKS_ROW_OFFSET, "o tab", tab_name, "in file", .self$.get.file(id))) | |
| 723 } | |
| 724 }) | |
| 725 | |
| 726 #################### | |
| 727 # GET FILE FROM ID # | |
| 728 #################### | |
| 729 | |
| 730 MsXlsDb$methods( .get.file = function(id) { | |
| 731 | |
| 732 # List files | |
| 733 .self$.init.file.list() | |
| 734 | |
| 735 return( if (id > 0 && id <= length(.self$.files)) .self$.files[id] else NA_character_) | |
| 736 }) | |
| 737 | |
| 738 ########### | |
| 739 # MEM GET # | |
| 740 ########### | |
| 741 | |
| 742 # Get database data from memory | |
| 743 MsXlsDb$methods( .mem.get = function(molid, field, second.field = NA_character_) { | |
| 744 | |
| 745 data <- .self$.db[[as.character(molid)]][[field]] | |
| 746 | |
| 747 if ( ! is.na(second.field)) | |
| 748 data <- data[[second.field]] | |
| 749 | |
| 750 return(data) | |
| 751 }) | |
| 752 | |
| 753 ########### | |
| 754 # MEM SET # | |
| 755 ########### | |
| 756 | |
| 757 # Set database data into memory | |
| 758 MsXlsDb$methods( .mem.set = function(data, molid, field, second.field = NA_character_) { | |
| 759 | |
| 760 id <- as.character(molid) | |
| 761 | |
| 762 # Create db | |
| 763 if (is.null(.self$.db)) | |
| 764 .db <<- list() | |
| 765 | |
| 766 # Create first level | |
| 767 if (is.null(.self$.db[[id]])) | |
| 768 .self$.db[[id]] <- list() | |
| 769 | |
| 770 # Create second level | |
| 771 if ( ! is.na(second.field) && is.null(.self$.db[[id]][[field]])) | |
| 772 .self$.db[[id]][[field]] <- list() | |
| 773 | |
| 774 # Store data | |
| 775 if (is.na(second.field)) { | |
| 776 .self$.db[[id]][[field]] <- data | |
| 777 } else { | |
| 778 .self$.db[[id]][[field]][[second.field]] <- data | |
| 779 } | |
| 780 }) | |
| 781 | |
| 782 ################# | |
| 783 # SEARCH FOR RT # | |
| 784 ################# | |
| 785 | |
| 786 # Find molecules matching a certain retention time. | |
| 787 # col A list of chromatographic columns to use. | |
| 788 # rt.low The lower bound of the rt value. | |
| 789 # rt.high The higher bound of the rt value. | |
| 790 # mols A list of molecule IDs to process. If unset, then take all molecules. | |
| 791 # Return a data frame with the following columns: id, col, colrt. | |
| 792 MsXlsDb$methods( .search.for.rt = function(col, rt.low, rt.high, mols = NULL) { | |
| 793 | |
| 794 # Use all molecules if no list is provided | |
| 795 if (is.null(mols)) | |
| 796 mols <- .self$getMoleculeIds() | |
| 797 | |
| 798 results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE) | |
| 799 | |
| 800 # Loop on all molecules | |
| 801 for (molid in mols) { | |
| 802 no.col <- TRUE | |
| 803 for (c in col) { | |
| 804 molrts <- .self$getRetentionTimes(molid, c) | |
| 805 if ( ! is.null(molrts)) { | |
| 806 no.col <- FALSE | |
| 807 for (molrt in molrts) { | |
| 808 if (molrt >= rt.low && molrt <= rt.high) { | |
| 809 r <- nrow(results) + 1 | |
| 810 results[r, ] <- c(id = molid, col = c, colrt = molrt) | |
| 811 } | |
| 812 } | |
| 813 } | |
| 814 } | |
| 815 | |
| 816 if (no.col) { | |
| 817 r <- nrow(results) + 1 | |
| 818 results[r, c(MSDB.TAG.MOLID)] <- c(id = molid) | |
| 819 } | |
| 820 } | |
| 821 | |
| 822 return(results) | |
| 823 }) | |
| 824 | |
| 825 ############################ | |
| 826 # EXTRACT ID FROM FILENAME # | |
| 827 ############################ | |
| 828 | |
| 829 .extract_molecule_id_from_filename <- function(filename) { | |
| 830 | |
| 831 id <- NA_integer_ | |
| 832 | |
| 833 if ( ! is.na(filename)) { | |
| 834 g <- str_match(filename, "N(\\d+)[._-]") | |
| 835 if ( ! is.na(g[1,1])) | |
| 836 id <- as.numeric(g[1,2]) | |
| 837 } | |
| 838 | |
| 839 return(id) | |
| 840 } | |
| 841 | |
| 842 } # end of load safe guard |
