Mercurial > repos > prog > lcmsmatching
diff MassFiledbConn.R @ 1:253d531a0193 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 36c9d8099c20a1ae848f1337c16564335dd8fb2b
author | prog |
---|---|
date | Sat, 03 Sep 2016 17:02:01 -0400 |
parents | |
children | 20d69a062da3 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MassFiledbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -0,0 +1,258 @@ +if ( ! exists('MassFiledbConn')) { + + source('MassdbConn.R') + + # LCMS File db. + # In this type of database, a single file is provided in CSV format. Default separator is tabulation. + # Each line is a MS peak measure, . + # The file contains molecule and spectrum information. Each spectrum has an accession id. + + # TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue + + ############# + # CONSTANTS # + ############# + + # Default database fields + .BIODB.DFT.DB.FIELDS <- list() + for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZ, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS)) + .BIODB.DFT.DB.FIELDS[[f]] <- f + + ##################### + # CLASS DECLARATION # + ##################### + + MassFiledbConn <- setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .fields = "list", .ms.modes = "character")) + + ############### + # CONSTRUCTOR # + ############### + + MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) { + + # Check file + (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.") + file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\".")) + + # Set fields + .db <<- NULL + .file <<- file + .file.sep <<- file.sep + .file.quote <<- file.quote + .fields <<- .BIODB.DFT.DB.FIELDS + .field.multval.sep <<- ';' + .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS) + names(.self$.ms.modes) <- .self$.ms.modes + + callSuper(...) + }) + + ###################### + # Is valid field tag # + ###################### + + MassFiledbConn$methods( isValidFieldTag = function(tag) { + return (tag %in% names(.self$.fields)) + }) + + ############# + # Set field # + ############# + + MassFiledbConn$methods( setField = function(tag, colname) { + + ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.") + ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.") + + # Load database file + .self$.init.db() + + # Check that this field tag is defined in the fields list + .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid.")) + + # Check that columns are defined in database file + all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file.")) + + # Set new definition + if (length(colname) == 1) + .fields[[tag]] <<- colname + else { + new.col <- paste(colname, collapse = ".") + .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '') + .fields[[tag]] <<- new.col + } + }) + + ###################################### + # SET FIELD MULTIPLE VALUE SEPARATOR # + ###################################### + + MassFiledbConn$methods( setFieldMultValSep = function(sep) { + .field.multval.sep <<- sep + }) + + ################ + # SET MS MODES # + ################ + + MassFiledbConn$methods( setMsMode = function(mode, value) { + .self$.ms.modes[[mode]] <- value + }) + + ########################## + # GET ENTRY CONTENT TYPE # + ########################## + + MassFiledbConn$methods( getEntryContentType = function(type) { + return(BIODB.DATAFRAME) + }) + + ########### + # INIT DB # + ########### + + MassFiledbConn$methods( .init.db = function() { + + if (is.null(.self$.db)) { + + # Load database + .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL) + + # Rename columns + colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') + } + }) + + ################ + # CHECK FIELDS # + ################ + + MassFiledbConn$methods( .check.fields = function(fields) { + + # Check if fields are known + unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)] + if (length(unknown.fields) > 0) + stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown.")) + + # Init db + .self$.init.db() + + # Check if fields are defined in file database + undefined.fields <- colnames(.self$.init.db)[ ! unlist(.self$.fields[fields]) %in% colnames(.self$.init.db)] + if (length(undefined.fields) > 0) + stop(paste0("Column(s) ", paste(unlist(.self$.fields[fields]), collapse = ", "), " is/are undefined in file database.")) + }) + + ################ + # EXTRACT COLS # + ################ + + MassFiledbConn$methods( .extract.cols = function(cols, mode = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) { + + x <- NULL + + if ( ! is.null(cols) && ! is.na(cols)) { + + # Init db + .self$.init.db() + + # TODO check existence of cols/fields + + # Get db, eventually filtering it. + if (is.null(mode)) + db <- .self$.db + else { + # Check mode value + mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'.")) + .self$.check.fields(BIODB.MSMODE) + + # Filter on mode + db <- .self$.db[.self$.db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ] + } + + # Get subset + x <- db[, unlist(.self$.fields[cols]), drop = drop] + + # Rename columns + if (is.data.frame(x)) + colnames(x) <- cols + + # Rearrange + if (drop && is.vector(x)) { + if (uniq) + x <- x[ ! duplicated(x)] + if (sort) + x <- sort(x) + } + + # Cut + if ( ! is.na(max.rows)) + x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ] + } + + return(x) + }) + + ################# + # GET ENTRY IDS # + ################# + + MassFiledbConn$methods( getEntryIds = function(type) { + + ids <- NA_character_ + + if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND)) + ids <- as.character(.self$.extract.cols(if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE)) + + return(ids) + }) + + ################## + # GET NB ENTRIES # + ################## + + MassFiledbConn$methods( getNbEntries = function(type) { + return(length(.self$getEntryIds(type))) + }) + + ############################### + # GET CHROMATOGRAPHIC COLUMNS # + ############################### + + # Inherited from MassdbConn. + MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) { + + # Extract needed columns + db <- .self$.extract.cols(c(BIODB.COMPOUND.ID, BIODB.CHROM.COL)) + + # Filter on molecule IDs + if ( ! is.null(compound.ids)) + db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ] + + # Get column names + cols <- db[[BIODB.CHROM.COL]] + + # Remove duplicates + cols <- cols[ ! duplicated(cols)] + + # Make data frame + chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE) + colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE) + + return(chrom.cols) + }) + + ################# + # GET MZ VALUES # + ################# + + # Inherited from MassdbConn. + MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { + + # Get mz values + mz <- .self$.extract.cols(BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results) + + return(mz) + }) + +}