Mercurial > repos > prog > lcmsmatching
comparison MsXlsDb.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
| author | prog |
|---|---|
| date | Fri, 22 Feb 2019 16:04:22 -0500 |
| parents | fb9c0409d85c |
| children |
comparison
equal
deleted
inserted
replaced
| 5:fb9c0409d85c | 6:f86fec07f392 |
|---|---|
| 1 if ( ! exists('MsXlsDb')) { # Do not load again if already loaded | |
| 2 | |
| 3 library('methods') | |
| 4 library('stringr') | |
| 5 source('msdb-common.R') | |
| 6 source('MsDb.R') | |
| 7 source('strhlp.R', chdir = TRUE) | |
| 8 source('dfhlp.R', chdir = TRUE) | |
| 9 source('search.R', chdir = TRUE) | |
| 10 source('excelhlp.R', chdir = TRUE) | |
| 11 | |
| 12 ############# | |
| 13 # CONSTANTS # | |
| 14 ############# | |
| 15 | |
| 16 .THIS.FILE.PATH <- getwd() # We suppose that the file has been sourced with the option chdir = TRUE | |
| 17 | |
| 18 .XLS_PEAKS_ROW_OFFSET <- 8 | |
| 19 .XLS_PEAKS_RT_COL_START <- 11 | |
| 20 .XLS_MSPOS_TAB <- 'MS_POS' | |
| 21 .XLS_MSNEG_TAB <- 'MS_NEG' | |
| 22 .XLS_MZ_COL <- 1 | |
| 23 .XLS_INTENSITY_COL <- 2 | |
| 24 .XLS_RELATIVE_COL <- 3 | |
| 25 .XLS_THEORETICAL_MZ_COL <- 5 | |
| 26 .XLS_COMPOSITION_COL <- 8 | |
| 27 .XLS_ATTRIBUTION_COL <- 9 | |
| 28 | |
| 29 ##################### | |
| 30 # CLASS DECLARATION # | |
| 31 ##################### | |
| 32 | |
| 33 MsXlsDb <- setRefClass("MsXlsDb", contains = "MsDb", fields = list(.mz.index = "ANY", .name_index = "ANY", .db_dir = "character", .limit = "numeric", .files = "ANY", .cache_dir = "character", .db = "ANY")) | |
| 34 | |
| 35 ############### | |
| 36 # CONSTRUCTOR # | |
| 37 ############### | |
| 38 | |
| 39 MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) { | |
| 40 | |
| 41 # Initialize members | |
| 42 # TODO check that db_dir is not null neither na, and tests that it exists and is a directory. | |
| 43 .db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_ | |
| 44 .limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_ | |
| 45 cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir | |
| 46 .cache_dir <<- if ( cache || ! is.null(cache_dir)) cache_dir else NA_character_ | |
| 47 .files <<- NULL | |
| 48 .db <<- NULL | |
| 49 .mz.index <<- NULL | |
| 50 .name_index <<- NULL | |
| 51 | |
| 52 callSuper(...) | |
| 53 }) | |
| 54 | |
| 55 #################### | |
| 56 # GET MOLECULE IDS # | |
| 57 #################### | |
| 58 | |
| 59 MsXlsDb$methods( getMoleculeIds = function(max.results = NA_integer_) { | |
| 60 | |
| 61 # Init file list | |
| 62 .self$.init.file.list() | |
| 63 | |
| 64 # Get IDs | |
| 65 mol.ids <- as.integer(which( ! is.na(.self$.files))) | |
| 66 | |
| 67 # Cut | |
| 68 if ( ! is.na(max.results) && length(mol.ids) > max.results) | |
| 69 mol.ids <- mol.ids[max.results, ] | |
| 70 | |
| 71 return(mol.ids) | |
| 72 }) | |
| 73 | |
| 74 #################### | |
| 75 # GET NB MOLECULES # | |
| 76 #################### | |
| 77 | |
| 78 # Returns a list of all molecule names | |
| 79 MsXlsDb$methods( getNbMolecules = function() { | |
| 80 return(length(.self$getMoleculeIds())) | |
| 81 }) | |
| 82 | |
| 83 ##################### | |
| 84 # GET MOLECULE NAME # | |
| 85 ##################### | |
| 86 | |
| 87 MsXlsDb$methods( getMoleculeName = function(molid) { | |
| 88 return(vapply(molid, function(m) .self$.get.mol.name(m), FUN.VALUE = "")) | |
| 89 }) | |
| 90 | |
| 91 ############################### | |
| 92 # GET CHROMATOGRAPHIC COLUMNS # | |
| 93 ############################### | |
| 94 | |
| 95 # Returns a list of all chromatographic columns used | |
| 96 MsXlsDb$methods( getChromCol = function(molid = NULL) { | |
| 97 | |
| 98 cn <- character() | |
| 99 | |
| 100 # If no molecule IDs provided, then look at all molecules | |
| 101 if (is.null(molid)) | |
| 102 molid <- .self$getMoleculeIds() | |
| 103 | |
| 104 # Loop on molecules | |
| 105 for (mid in molid) { | |
| 106 | |
| 107 rt <- .self$getRetentionTimes(mid) | |
| 108 | |
| 109 if ( ! is.null(rt)) | |
| 110 cn <- c(cn, names(rt)) | |
| 111 } | |
| 112 | |
| 113 # Remove duplicates | |
| 114 cn <- cn[ ! duplicated(cn)] | |
| 115 | |
| 116 # Make data frame | |
| 117 cn <- data.frame(id = cn, title = cn, stringsAsFactors = FALSE) | |
| 118 | |
| 119 return(cn) | |
| 120 }) | |
| 121 | |
| 122 ################ | |
| 123 # FIND BY NAME # | |
| 124 ################ | |
| 125 | |
| 126 MsXlsDb$methods( findByName = function(name) { | |
| 127 | |
| 128 # NULL entry | |
| 129 if (is.null(name)) | |
| 130 return(NA_integer_) | |
| 131 | |
| 132 # Initialize output list | |
| 133 ids <- NULL | |
| 134 | |
| 135 for (n in name) { | |
| 136 | |
| 137 id <- NA_integer_ | |
| 138 | |
| 139 if ( ! is.na(n)) { | |
| 140 | |
| 141 # Get index | |
| 142 index <- .self$.get.name.index() | |
| 143 | |
| 144 # Search for name in index | |
| 145 i <- binary.search(toupper(n), index[['name']]) | |
| 146 | |
| 147 id <- if (is.na(i)) NA_integer_ else index[i, 'id'] | |
| 148 } | |
| 149 | |
| 150 ids <- c(ids, id) | |
| 151 } | |
| 152 | |
| 153 return(ids) | |
| 154 }) | |
| 155 | |
| 156 ####################### | |
| 157 # GET RETENTION TIMES # | |
| 158 ####################### | |
| 159 | |
| 160 MsXlsDb$methods( getRetentionTimes = function(molid, col = NA_character_) { | |
| 161 | |
| 162 if (is.null(molid) || is.na(molid)) | |
| 163 return(NULL) | |
| 164 | |
| 165 # Find it in memory | |
| 166 rt <- .self$.mem.get(molid, 'rt') | |
| 167 | |
| 168 if (is.null(rt)) { | |
| 169 | |
| 170 # Call observers | |
| 171 if ( ! is.null(.self$.observers)) | |
| 172 for (obs in .self$.observers) | |
| 173 obs$progress(paste0("Loading retention times of file", .self$.get.file(molid), "."), level = 2) | |
| 174 | |
| 175 rt <- NULL | |
| 176 | |
| 177 # Load from cache file | |
| 178 cache_file <- NA_character_ | |
| 179 if ( ! is.na(.self$.get.cache.dir())) { | |
| 180 cache_file <- file.path(.self$.get.cache.dir(), paste0('rt-', molid, '.bin')) | |
| 181 if (file.exists(cache_file)) | |
| 182 load(file = cache_file) # load rt | |
| 183 } | |
| 184 | |
| 185 if (is.null(rt)) { | |
| 186 | |
| 187 # Get retention times of both positive and negative mode tabs | |
| 188 mspos_rt <- .self$.parse_retention_times(molid, .XLS_MSPOS_TAB) | |
| 189 msneg_rt <- .self$.parse_retention_times(molid, .XLS_MSNEG_TAB) | |
| 190 | |
| 191 # Retention times stored in negative and positive modes | |
| 192 if ( ! is.null(mspos_rt) && ! is.null(msneg_rt)) { | |
| 193 | |
| 194 # Warn observers when both retention time lists are not identical | |
| 195 if ( ! identical(mspos_rt, msneg_rt)) | |
| 196 for (obs in .self$.observers) | |
| 197 obs$warning(paste0("Retention times in negative and positive modes are different in file ", .self$.get.file(molid), ".")) | |
| 198 | |
| 199 # Merge both lists | |
| 200 rt <- mspos_rt | |
| 201 for (c in names(msneg_rt)) | |
| 202 if (c %in% names(rt)) { | |
| 203 v <- c(rt[[c]], msneg_rt[[c]]) | |
| 204 rt[[c]] <- v[ ! duplicated(v)] | |
| 205 } | |
| 206 else | |
| 207 rt[[c]] <- msneg_rt[[c]] | |
| 208 } | |
| 209 else | |
| 210 # Set retention times | |
| 211 rt <- if (is.null(mspos_rt)) msneg_rt else mspos_rt | |
| 212 | |
| 213 if (is.null(rt)) rt <- list() | |
| 214 | |
| 215 # Write in cache | |
| 216 if ( ! is.na(cache_file)) { | |
| 217 | |
| 218 # Call observers | |
| 219 if ( ! is.null(.self$.observers)) | |
| 220 for (obs in .self$.observers) | |
| 221 obs$progress(paste0("Caching retention times of file ", .self$.get.file(molid), ".")) | |
| 222 | |
| 223 save(rt, file = cache_file) | |
| 224 } | |
| 225 } | |
| 226 | |
| 227 # Store in memory | |
| 228 .self$.mem.set(rt, molid, 'rt') | |
| 229 } | |
| 230 | |
| 231 # Select only one column if asked | |
| 232 if ( ! is.na(col)) rt <- rt[[col]] | |
| 233 | |
| 234 return(rt) | |
| 235 }) | |
| 236 | |
| 237 ################# | |
| 238 # GET NB PEAKS # | |
| 239 ################# | |
| 240 | |
| 241 MsXlsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { | |
| 242 | |
| 243 # Initialize parameters | |
| 244 if (is.null(molid) || (length(molid) == 1 && is.na(molid))) | |
| 245 molid <- .self$getMoleculeIds() | |
| 246 if (is.na(type)) | |
| 247 type <- c(MSDB.TAG.POS, MSDB.TAG.NEG) | |
| 248 | |
| 249 return(sum(vapply(molid, function(m) { if (is.na(m)) 0 else sum(vapply(type, function(t) { peaks <- .self$.get.peaks(m, t) ; if (is.null(peaks)) 0 else nrow(peaks) }, FUN.VALUE = 1)) }, FUN.VALUE = 1))) | |
| 250 }) | |
| 251 | |
| 252 ################## | |
| 253 # GET PEAK TABLE # | |
| 254 ################## | |
| 255 | |
| 256 MsXlsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) { | |
| 257 | |
| 258 peaks <- NULL | |
| 259 | |
| 260 # Set default molecule IDs | |
| 261 if (is.null(molid) || (length(molid) == 1 && is.na(molid))) | |
| 262 molid <- .self$getMoleculeIds() | |
| 263 | |
| 264 # Set default modes | |
| 265 if (is.null(mode) || (length(mode) == 1 && is.na(mode))) | |
| 266 mode <- c(MSDB.TAG.POS, MSDB.TAG.NEG) | |
| 267 | |
| 268 # Loop on all molecules | |
| 269 for (mol in molid) { | |
| 270 | |
| 271 # Loop on all modes | |
| 272 for (mod in mode) { | |
| 273 m.peaks <- .self$.get.peaks(mol, mod) | |
| 274 if ( ! is.null(m.peaks) && nrow(m.peaks) > 0) { | |
| 275 m.peaks[[MSDB.TAG.MOLID]] <- mol | |
| 276 m.peaks[[MSDB.TAG.MODE]] <- mod | |
| 277 peaks <- if (is.null(peaks)) m.peaks else rbind(peaks, m.peaks) | |
| 278 peaks <- df.move.col.first(peaks, c(MSDB.TAG.MOLID, MSDB.TAG.MODE)) | |
| 279 } | |
| 280 } | |
| 281 } | |
| 282 | |
| 283 return(peaks) | |
| 284 }) | |
| 285 | |
| 286 ################# | |
| 287 # GET MZ VALUES # | |
| 288 ################# | |
| 289 | |
| 290 # Returns a numeric vector of all masses stored inside the database. | |
| 291 MsXlsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { | |
| 292 | |
| 293 mz <- numeric() | |
| 294 | |
| 295 # Get all mz values of all molecules | |
| 296 for(molid in .self$getMoleculeIds()) | |
| 297 for (m in (if (is.null(mode) || is.na(mode)) c(MSDB.TAG.POS, MSDB.TAG.NEG) else mode)) | |
| 298 mz <- c(mz, .self$.get.peaks(molid, m)[[MSDB.TAG.MZTHEO]]) | |
| 299 | |
| 300 # Remove duplicated | |
| 301 mz <- mz[ ! duplicated(mz)] | |
| 302 | |
| 303 # Apply cut-off | |
| 304 if ( ! is.na(max.results)) | |
| 305 mz <- mz[1:max.results] | |
| 306 | |
| 307 return(mz) | |
| 308 }) | |
| 309 | |
| 310 ############# | |
| 311 # GET PEAKS # | |
| 312 ############# | |
| 313 | |
| 314 MsXlsDb$methods( .get.peaks = function(molid, mode) { | |
| 315 | |
| 316 tab <- if (mode == MSDB.TAG.POS) .XLS_MSPOS_TAB else .XLS_MSNEG_TAB | |
| 317 | |
| 318 # Find it in memory | |
| 319 peak_df <- .self$.mem.get(molid, 'peaks', mode) | |
| 320 | |
| 321 if (is.null(peak_df)) { | |
| 322 # Call observers | |
| 323 if ( ! is.null(.self$.observers)) | |
| 324 for (obs in .self$.observers) | |
| 325 obs$progress(paste0("Loading peaks of tab ", tab, " of file ", .self$.get.file(molid), "."), level = 2) | |
| 326 | |
| 327 peak_df <- NULL | |
| 328 | |
| 329 # Load from cache file | |
| 330 cache_file <- NA_character_ | |
| 331 if ( ! is.na(.self$.get.cache.dir())) { | |
| 332 cache_file <- file.path(.self$.get.cache.dir(), paste0('peaks-', molid, '-', tab, '.csv')) | |
| 333 if (file.exists(cache_file)) | |
| 334 peak_df <- read.csv(cache_file, header = TRUE, stringsAsFactors = FALSE) | |
| 335 } | |
| 336 | |
| 337 # Read from XLS file, if not in cache | |
| 338 if (is.null(peak_df)) { | |
| 339 | |
| 340 # Load tab (peaks start at row 8) | |
| 341 if (.self$.tab.exists(.self$.get.file(molid), tab)) { | |
| 342 | |
| 343 peaks <- read.excel(.self$.get.file(molid), tab, start.row = .XLS_PEAKS_ROW_OFFSET, stringsAsFactors = FALSE) | |
| 344 if ( ! is.null(peaks)) | |
| 345 peaks <- peaks[ ! is.na(peaks[.XLS_MZ_COL]), , drop = FALSE] # Remove rows where m/z is not defined. TODO maybe call observer for notify a line with non NA values but without m/z value. | |
| 346 | |
| 347 # Instantiate peaks | |
| 348 if ( ! is.null(peaks) && nrow(peaks) > 0) { | |
| 349 peak_df <- peaks[1:length(peaks[[.XLS_MZ_COL]]), c(.XLS_MZ_COL, .XLS_THEORETICAL_MZ_COL, .XLS_INTENSITY_COL, .XLS_RELATIVE_COL, .XLS_COMPOSITION_COL, .XLS_ATTRIBUTION_COL), drop = FALSE] | |
| 350 colnames(peak_df) <- c(MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.COMP, MSDB.TAG.ATTR) | |
| 351 } | |
| 352 | |
| 353 # Set default data frame (important for cache file writing, because we need a correct header to be written in order for loading) | |
| 354 else { | |
| 355 peak_df <- data.frame(stringsAsFactors = FALSE) | |
| 356 peak_df[MSDB.TAG.MZEXP] <- numeric() | |
| 357 peak_df[MSDB.TAG.MZTHEO] <- numeric() | |
| 358 peak_df[MSDB.TAG.INT] <- numeric() | |
| 359 peak_df[MSDB.TAG.REL] <- numeric() | |
| 360 peak_df[MSDB.TAG.COMP] <- character() | |
| 361 peak_df[MSDB.TAG.ATTR] <- character() | |
| 362 } | |
| 363 | |
| 364 if (is.null(peak_df)) peak_df <- data.frame() | |
| 365 | |
| 366 # Write in cache | |
| 367 if ( ! is.na(cache_file)) { | |
| 368 | |
| 369 # Call observers | |
| 370 if ( ! is.null(.self$.observers)) | |
| 371 for (obs in .self$.observers) | |
| 372 obs$progress(paste0("Caching peaks of tab ", tab, " of file ", .self$.get.file(molid), ".")) | |
| 373 | |
| 374 write.csv(peak_df, cache_file, row.names = FALSE) | |
| 375 } | |
| 376 } | |
| 377 } | |
| 378 | |
| 379 # Store in memory | |
| 380 .self$.mem.set(peak_df, molid, 'peaks', mode) | |
| 381 } | |
| 382 | |
| 383 return(peak_df) | |
| 384 }) | |
| 385 | |
| 386 ############################## | |
| 387 # GET FULL MS PEAK M/Z INDEX # | |
| 388 ############################## | |
| 389 | |
| 390 # Get mz index for full ions, creating it if necessary. | |
| 391 MsXlsDb$methods( .get.mz.index = function(mode) { | |
| 392 | |
| 393 if (is.null(.self$.mz.index[[mode]])) { | |
| 394 | |
| 395 # Initialize data frame | |
| 396 mzi <- data.frame(stringsAsFactors = FALSE) | |
| 397 mzi[MSDB.TAG.MZTHEO] <- numeric() | |
| 398 mzi[MSDB.TAG.MOLID] <- character() | |
| 399 mzi[MSDB.TAG.COMP] <- character() | |
| 400 mzi[MSDB.TAG.ATTR] <- character() | |
| 401 | |
| 402 # Loop on all molecules | |
| 403 for(molid in .self$getMoleculeIds()) { | |
| 404 | |
| 405 # Get all peaks of this molecule | |
| 406 peaks <- .self$.get.peaks(molid, mode) | |
| 407 | |
| 408 # Remove rows whose mz is NA. | |
| 409 peaks <- peaks[ ! is.na(peaks[[MSDB.TAG.MZTHEO]]), ] | |
| 410 | |
| 411 if (nrow(peaks) > 0) { | |
| 412 | |
| 413 # Add id column | |
| 414 peaks[MSDB.TAG.MOLID] <- molid | |
| 415 | |
| 416 # Append peaks | |
| 417 r <- nrow(mzi) + 1 | |
| 418 rows <- r:(r+nrow(peaks)-1) | |
| 419 mzi[rows, ] <- peaks[colnames(mzi)] | |
| 420 } | |
| 421 } | |
| 422 | |
| 423 # Sort by M/Z | |
| 424 sorted_indices <- order(mzi[[MSDB.TAG.MZTHEO]]) | |
| 425 | |
| 426 # Group in a data frame | |
| 427 .self$.mz.index[[mode]] <- mzi[sorted_indices, ] | |
| 428 } | |
| 429 | |
| 430 return(.self$.mz.index[[mode]]) | |
| 431 }) | |
| 432 | |
| 433 ###################### | |
| 434 # SEARCH FOR MZ & RT # | |
| 435 ###################### | |
| 436 | |
| 437 MsXlsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { | |
| 438 | |
| 439 # Search for m/z | |
| 440 results <- .self$.do.search.for.mz(mode, mz.low, mz.high) | |
| 441 | |
| 442 # Filter on attributions | |
| 443 if ( ! is.null(attribs)) { | |
| 444 results <- results[results[[MSDB.TAG.ATTR]] %in% attribs, ] | |
| 445 } | |
| 446 | |
| 447 # Filer on molecule IDs | |
| 448 if ( ! is.null(molids)) { | |
| 449 results <- results[results[[MSDB.TAG.MOLID]] %in% molids, ] | |
| 450 } | |
| 451 | |
| 452 # Use retention time | |
| 453 if ( ! is.null(col) && ! is.null(rt.low) && ! is.null(rt.high)) { | |
| 454 | |
| 455 # Get list of unique IDs | |
| 456 ids <- results[[MSDB.TAG.MOLID]] | |
| 457 ids <- ids[ ! duplicated(ids)] | |
| 458 rt <- .self$.search.for.rt(mols = ids, rt.low = rt.low, rt.high = rt.high, col = col) | |
| 459 results <- results[results[[MSDB.TAG.MOLID]] %in% rt[[MSDB.TAG.MOLID]], ] | |
| 460 results <- merge(results, rt) | |
| 461 } | |
| 462 | |
| 463 return(results) | |
| 464 }) | |
| 465 | |
| 466 ############################## | |
| 467 # SEARCH FOR M/Z IN MS PEAKS # | |
| 468 ############################## | |
| 469 | |
| 470 MsXlsDb$methods( .do.search.for.mz = function(mode, mz.low, mz.high) { | |
| 471 | |
| 472 results <- data.frame(stringsAsFactors = FALSE) | |
| 473 results[MSDB.TAG.MZTHEO] <- numeric() | |
| 474 results[MSDB.TAG.MOLID] <- character() | |
| 475 results[MSDB.TAG.MOLNAMES] <- character() | |
| 476 results[MSDB.TAG.COMP] <- character() | |
| 477 results[MSDB.TAG.ATTR] <- character() | |
| 478 | |
| 479 # Create m/z index | |
| 480 mz_index <- .self$.get.mz.index(mode) | |
| 481 | |
| 482 # Find molecules | |
| 483 low_bound <- binary.search(mz.low, mz_index[[MSDB.TAG.MZTHEO]], lower = FALSE) | |
| 484 high_bound <- binary.search(mz.high, mz_index[[MSDB.TAG.MZTHEO]], lower = TRUE) | |
| 485 | |
| 486 # Get results | |
| 487 if ( ! is.na(high_bound) && ! is.na(low_bound) && low_bound <= high_bound) | |
| 488 results <- mz_index[low_bound:high_bound,] | |
| 489 | |
| 490 # Remove row names | |
| 491 rownames(results) <- NULL | |
| 492 | |
| 493 return(results) | |
| 494 }) | |
| 495 | |
| 496 ################ | |
| 497 # GET MOL NAME # | |
| 498 ################ | |
| 499 | |
| 500 MsXlsDb$methods( .get.mol.name = function(molid) { | |
| 501 | |
| 502 if (is.na(molid)) | |
| 503 return(NA_character_) | |
| 504 | |
| 505 # Find it in memory | |
| 506 name <- .self$.mem.get(molid, 'name') | |
| 507 | |
| 508 if (is.null(name)) { | |
| 509 | |
| 510 # Load molecule | |
| 511 mol <- .self$.load.molecule(molid) | |
| 512 | |
| 513 # Look for name in tabs | |
| 514 for (tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { | |
| 515 hdr <- mol[[tab]][['header']] | |
| 516 if ( ! is.null(hdr)) | |
| 517 name <- hdr[[1]] | |
| 518 if ( ! is.null(name) && ! is.na(name)) break | |
| 519 } | |
| 520 | |
| 521 # Store in memory | |
| 522 if (is.null(name)) name <- NA_character_ | |
| 523 .self$.mem.set(name, molid, 'name') | |
| 524 } | |
| 525 | |
| 526 return(name) | |
| 527 }) | |
| 528 | |
| 529 ################## | |
| 530 # GET NAME INDEX # | |
| 531 ################## | |
| 532 | |
| 533 # Get name index. | |
| 534 MsXlsDb$methods( .get.name.index = function() { | |
| 535 | |
| 536 if (is.null(.self$.name_index)) { | |
| 537 | |
| 538 # Get names | |
| 539 names <- vapply(.self$getMoleculeIds(), function(id) toupper(.self$getMoleculeName(id)), FUN.VALUE = "") | |
| 540 | |
| 541 # Get molecule IDs | |
| 542 id <- .self$getMoleculeIds() | |
| 543 | |
| 544 # Sort by names | |
| 545 sorted_indices <- order(names) | |
| 546 | |
| 547 # Group in a data frame | |
| 548 .self$.name_index <- data.frame(name = rbind(names)[, sorted_indices], | |
| 549 id = rbind(id)[, sorted_indices], | |
| 550 stringsAsFactors = FALSE) | |
| 551 } | |
| 552 | |
| 553 return(.self$.name_index) | |
| 554 }) | |
| 555 | |
| 556 ################## | |
| 557 # INIT FILE LIST # | |
| 558 ################## | |
| 559 | |
| 560 MsXlsDb$methods( .init.file.list = function() { | |
| 561 | |
| 562 if (is.null(.self$.files)) { | |
| 563 | |
| 564 # List all files | |
| 565 files <- Sys.glob(file.path(.self$.db_dir, '*.xls')) | |
| 566 | |
| 567 # Limit the size of the database | |
| 568 if ( ! is.na(.self$.limit)) | |
| 569 files <- head(files, .self$.limit) | |
| 570 | |
| 571 # Get IDs | |
| 572 ids <- vapply(files, function(f) .extract_molecule_id_from_filename(f), FUN.VALUE = 1) | |
| 573 | |
| 574 # Use ids as indices to build the vector of files | |
| 575 .files <<- rep(NA_character_, max(ids)) | |
| 576 .files[ids] <<- files | |
| 577 } | |
| 578 }) | |
| 579 | |
| 580 ################# | |
| 581 # GET CACHE DIR # | |
| 582 ################# | |
| 583 | |
| 584 MsXlsDb$methods( .get.cache.dir = function() { | |
| 585 | |
| 586 if ( ! is.na(.self$.cache_dir) && ! file.exists(.self$.cache_dir)) | |
| 587 dir.create(.self$.cache_dir) | |
| 588 | |
| 589 return(.self$.cache_dir) | |
| 590 }) | |
| 591 | |
| 592 ################# | |
| 593 # LOAD MOLECULE # | |
| 594 ################# | |
| 595 | |
| 596 MsXlsDb$methods( .load.molecule = function(molid) { | |
| 597 | |
| 598 # Init local variables | |
| 599 mol <- NULL | |
| 600 cache_file <- NA_character_ | |
| 601 excel_file <- .self$.get.file(molid) | |
| 602 | |
| 603 # Call observers | |
| 604 if ( ! is.null(.self$.observers)) | |
| 605 for (obs in .self$.observers) | |
| 606 obs$progress(paste0("Loading molecule ", molid, "."), level = 2) | |
| 607 | |
| 608 # Load from cache | |
| 609 if ( ! is.na(.self$.get.cache.dir())) { | |
| 610 cache_file <- file.path(.self$.get.cache.dir(), paste0(molid, '.bin')) | |
| 611 if (file.exists(cache_file)) | |
| 612 load(file = cache_file) # load mol variable | |
| 613 } | |
| 614 | |
| 615 # Load from Excel file & write to cache | |
| 616 if (is.null(mol) && ! is.na(excel_file)) { | |
| 617 | |
| 618 source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. | |
| 619 | |
| 620 # Load from Excel file | |
| 621 for(tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { | |
| 622 | |
| 623 # Test that tab exists | |
| 624 if (.self$.tab.exists(excel_file, tab)) { | |
| 625 header <- read.excel(excel_file, tab, start.row = 1, end.row = .XLS_PEAKS_ROW_OFFSET - 1, header = FALSE, stringsAsFactors = FALSE, trim.values = TRUE, col.index = c(1))[[1]] | |
| 626 peaks <- read.excel(excel_file, tab, start.row = .XLS_PEAKS_ROW_OFFSET) | |
| 627 mol[[tab]] <- list(header = header, peaks = peaks) | |
| 628 } | |
| 629 | |
| 630 # Missing tab | |
| 631 else { | |
| 632 for (obs in .self$.observers) | |
| 633 obs$warning(paste0("No excel tab ", tab, " in file ", excel_file, ".")) | |
| 634 } | |
| 635 } | |
| 636 | |
| 637 # Write in cache | |
| 638 if ( ! is.na(cache_file)) { | |
| 639 | |
| 640 # Call observers | |
| 641 if ( ! is.null(.self$.observers)) | |
| 642 for (obs in .self$.observers) | |
| 643 obs$progress(paste0("Caching file ", excel_file, ".")) | |
| 644 | |
| 645 save(mol, file = cache_file) | |
| 646 } | |
| 647 } | |
| 648 | |
| 649 return(mol) | |
| 650 }) | |
| 651 | |
| 652 ######################## | |
| 653 # DOES EXCEL TAB EXIST # | |
| 654 ######################## | |
| 655 | |
| 656 MsXlsDb$methods( .tab.exists = function(file, tab) { | |
| 657 | |
| 658 source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. | |
| 659 | |
| 660 if ( ! tab.exists(file, tab)) { | |
| 661 | |
| 662 # Warn observers | |
| 663 for (obs in .self$.observers) | |
| 664 obs$warning(paste0("No excel tab ", tab, " in file ", file, ".")) | |
| 665 | |
| 666 return(FALSE) | |
| 667 } | |
| 668 | |
| 669 return(TRUE) | |
| 670 }) | |
| 671 | |
| 672 ######################### | |
| 673 # PARSE RETENTION TIMES # | |
| 674 ######################### | |
| 675 | |
| 676 MsXlsDb$methods( .parse_retention_times = function(id, tab) { | |
| 677 | |
| 678 rt <- NULL | |
| 679 | |
| 680 if (.self$.tab.exists(.self$.get.file(id), tab)) { | |
| 681 peaks <- read.excel(.self$.get.file(id), tab, start.row = .XLS_PEAKS_ROW_OFFSET) | |
| 682 | |
| 683 # Get retention times | |
| 684 if ( ! is.null(peaks) && length(peaks) > 0 && ! is.na(peaks[[1]][[1]])) | |
| 685 for (c in .XLS_PEAKS_RT_COL_START:length(names(peaks))) | |
| 686 if ( ! is.na(peaks[[c]][[1]])) { | |
| 687 | |
| 688 # Check retention times of all different m/z peaks for the same column. | |
| 689 .self$.check_retention_times(id, tab, names(peaks)[[c]], peaks[[c]], sum( ! is.na(peaks[[1]]))) | |
| 690 | |
| 691 # Add retention time | |
| 692 # TODO The column names are transformed through the read.xlsx call. For instance: | |
| 693 # HPLC (C18) 25mn QTOF (Bis) --> HPLC..C18..25mn.QTOF..Bis. | |
| 694 # ZICpHILIC 150*5*2.1 Shimadzu-Exactive-42mn --> ZICpHILIC.150.5.2.1.Shimadzu.Exactive.42mn | |
| 695 # This can be an issue, since we loose the formating. | |
| 696 col_id <- names(peaks)[[c]] | |
| 697 time <- peaks[[c]][[1]] * 60 # Read and convert retention time in seconds. | |
| 698 if (is.null(rt) || ! col_id %in% names(rt)) | |
| 699 rt[[col_id]] <- list(time) | |
| 700 else | |
| 701 rt[[col_id]] <- c(rt[[col_id]], time) | |
| 702 } | |
| 703 } | |
| 704 | |
| 705 return(rt) | |
| 706 }) | |
| 707 | |
| 708 ######################### | |
| 709 # CHECK RETENTION TIMES # | |
| 710 ######################### | |
| 711 | |
| 712 MsXlsDb$methods( .check_retention_times = function(id, tab_name, column_name, rt, n) { | |
| 713 | |
| 714 if (n >= 1 && ! is.null(.self$.observers) && length(.self$.observers) > 0) | |
| 715 | |
| 716 # Check column only if there is at least one value inside | |
| 717 if (sum( ! is.na(rt)) > 0) | |
| 718 | |
| 719 # Loop on all values | |
| 720 for(i in 1:n) { | |
| 721 | |
| 722 # Check that it's defined | |
| 723 if (i > 1 && is.na(rt[[i]])) | |
| 724 for (obs in .self$.observers) | |
| 725 obs$warning(paste0("Retention times undefined for column ", column_name, " at row ", i + .XLS_PEAKS_ROW_OFFSET, " of tab ", tab_name, " in file ", .self$.get.file(id), ".")) | |
| 726 | |
| 727 else if (i > 1) | |
| 728 # Check the value (it must be constant) | |
| 729 if (rt[[i-1]] != rt[[i]]) | |
| 730 for (obs in .self$.observers) | |
| 731 obs$error(paste0("Retention times not constant for column ", column_name, " between row ", i - 1 + .XLS_PEAKS_ROW_OFFSET, " and row ", i + .XLS_PEAKS_ROW_OFFSET, "o tab", tab_name, "in file", .self$.get.file(id))) | |
| 732 } | |
| 733 }) | |
| 734 | |
| 735 #################### | |
| 736 # GET FILE FROM ID # | |
| 737 #################### | |
| 738 | |
| 739 MsXlsDb$methods( .get.file = function(id) { | |
| 740 | |
| 741 # List files | |
| 742 .self$.init.file.list() | |
| 743 | |
| 744 return( if (id > 0 && id <= length(.self$.files)) .self$.files[id] else NA_character_) | |
| 745 }) | |
| 746 | |
| 747 ########### | |
| 748 # MEM GET # | |
| 749 ########### | |
| 750 | |
| 751 # Get database data from memory | |
| 752 MsXlsDb$methods( .mem.get = function(molid, field, second.field = NA_character_) { | |
| 753 | |
| 754 data <- .self$.db[[as.character(molid)]][[field]] | |
| 755 | |
| 756 if ( ! is.na(second.field)) | |
| 757 data <- data[[second.field]] | |
| 758 | |
| 759 return(data) | |
| 760 }) | |
| 761 | |
| 762 ########### | |
| 763 # MEM SET # | |
| 764 ########### | |
| 765 | |
| 766 # Set database data into memory | |
| 767 MsXlsDb$methods( .mem.set = function(data, molid, field, second.field = NA_character_) { | |
| 768 | |
| 769 id <- as.character(molid) | |
| 770 | |
| 771 # Create db | |
| 772 if (is.null(.self$.db)) | |
| 773 .db <<- list() | |
| 774 | |
| 775 # Create first level | |
| 776 if (is.null(.self$.db[[id]])) | |
| 777 .self$.db[[id]] <- list() | |
| 778 | |
| 779 # Create second level | |
| 780 if ( ! is.na(second.field) && is.null(.self$.db[[id]][[field]])) | |
| 781 .self$.db[[id]][[field]] <- list() | |
| 782 | |
| 783 # Store data | |
| 784 if (is.na(second.field)) { | |
| 785 .self$.db[[id]][[field]] <- data | |
| 786 } else { | |
| 787 .self$.db[[id]][[field]][[second.field]] <- data | |
| 788 } | |
| 789 }) | |
| 790 | |
| 791 ################# | |
| 792 # SEARCH FOR RT # | |
| 793 ################# | |
| 794 | |
| 795 # Find molecules matching a certain retention time. | |
| 796 # col A list of chromatographic columns to use. | |
| 797 # rt.low The lower bound of the rt value. | |
| 798 # rt.high The higher bound of the rt value. | |
| 799 # mols A list of molecule IDs to process. If unset, then take all molecules. | |
| 800 # Return a data frame with the following columns: id, col, colrt. | |
| 801 MsXlsDb$methods( .search.for.rt = function(col, rt.low, rt.high, mols = NULL) { | |
| 802 | |
| 803 # Use all molecules if no list is provided | |
| 804 if (is.null(mols)) | |
| 805 mols <- .self$getMoleculeIds() | |
| 806 | |
| 807 results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE) | |
| 808 colnames(results) <- c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT) | |
| 809 | |
| 810 # Loop on all molecules | |
| 811 for (molid in mols) { | |
| 812 no.col <- TRUE | |
| 813 for (c in col) { | |
| 814 molrts <- .self$getRetentionTimes(molid, c) | |
| 815 if ( ! is.null(molrts)) { | |
| 816 no.col <- FALSE | |
| 817 for (molrt in molrts) { | |
| 818 if (molrt >= rt.low && molrt <= rt.high) { | |
| 819 r <- nrow(results) + 1 | |
| 820 results[r, ] <- c(id = molid, col = c, colrt = molrt) | |
| 821 } | |
| 822 } | |
| 823 } | |
| 824 } | |
| 825 | |
| 826 if (no.col) { | |
| 827 r <- nrow(results) + 1 | |
| 828 results[r, c(MSDB.TAG.MOLID)] <- c(id = molid) | |
| 829 } | |
| 830 } | |
| 831 | |
| 832 return(results) | |
| 833 }) | |
| 834 | |
| 835 ############################ | |
| 836 # EXTRACT ID FROM FILENAME # | |
| 837 ############################ | |
| 838 | |
| 839 .extract_molecule_id_from_filename <- function(filename) { | |
| 840 | |
| 841 id <- NA_integer_ | |
| 842 | |
| 843 if ( ! is.na(filename)) { | |
| 844 g <- str_match(filename, "N(\\d+)[._-]") | |
| 845 if ( ! is.na(g[1,1])) | |
| 846 id <- as.numeric(g[1,2]) | |
| 847 } | |
| 848 | |
| 849 return(id) | |
| 850 } | |
| 851 | |
| 852 } # end of load safe guard |
