Mercurial > repos > prog > lcmsmatching
diff MassFiledbConn.R @ 2:20d69a062da3 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author | prog |
---|---|
date | Thu, 02 Mar 2017 08:55:00 -0500 |
parents | 253d531a0193 |
children | fb9c0409d85c |
line wrap: on
line diff
--- a/MassFiledbConn.R Sat Sep 03 17:02:01 2016 -0400 +++ b/MassFiledbConn.R Thu Mar 02 08:55:00 2017 -0500 @@ -1,258 +1,275 @@ -if ( ! exists('MassFiledbConn')) { +# LCMS File db. +# In this type of database, a single file is provided in CSV format. Default separator is tabulation. +# Each line is a MS peak measure, . +# The file contains molecule and spectrum information. Each spectrum has an accession id. + +# TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue - source('MassdbConn.R') - - # LCMS File db. - # In this type of database, a single file is provided in CSV format. Default separator is tabulation. - # Each line is a MS peak measure, . - # The file contains molecule and spectrum information. Each spectrum has an accession id. +############# +# CONSTANTS # +############# - # TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue - - ############# - # CONSTANTS # - ############# +# Default database fields +.BIODB.DFT.DB.FIELDS <- list() +for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZEXP, BIODB.PEAK.MZTHEO, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS)) + .BIODB.DFT.DB.FIELDS[[f]] <- f - # Default database fields - .BIODB.DFT.DB.FIELDS <- list() - for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZ, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS)) - .BIODB.DFT.DB.FIELDS[[f]] <- f +##################### +# CLASS DECLARATION # +##################### + +MassFiledbConn <- methods::setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .db.orig.colnames = "character", .fields = "list", .ms.modes = "character")) - ##################### - # CLASS DECLARATION # - ##################### - - MassFiledbConn <- setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .fields = "list", .ms.modes = "character")) +############### +# CONSTRUCTOR # +############### - ############### - # CONSTRUCTOR # - ############### +MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) { + + # Check file + (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.") + file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\".")) - MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) { - - # Check file - (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.") - file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\".")) + # Set fields + .db <<- NULL + .db.orig.colnames <<- NA_character_ + .file <<- file + .file.sep <<- file.sep + .file.quote <<- file.quote + .fields <<- .BIODB.DFT.DB.FIELDS + .field.multval.sep <<- ';' + .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS) + names(.self$.ms.modes) <- .self$.ms.modes - # Set fields - .db <<- NULL - .file <<- file - .file.sep <<- file.sep - .file.quote <<- file.quote - .fields <<- .BIODB.DFT.DB.FIELDS - .field.multval.sep <<- ';' - .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS) - names(.self$.ms.modes) <- .self$.ms.modes + callSuper(...) +}) - callSuper(...) - }) +###################### +# Is valid field tag # +###################### - ###################### - # Is valid field tag # - ###################### +MassFiledbConn$methods( isValidFieldTag = function(tag) { + return (tag %in% names(.self$.fields)) +}) - MassFiledbConn$methods( isValidFieldTag = function(tag) { - return (tag %in% names(.self$.fields)) - }) +########### +# INIT DB # +########### - ############# - # Set field # - ############# +MassFiledbConn$methods( .init.db = function() { - MassFiledbConn$methods( setField = function(tag, colname) { + if (is.null(.self$.db)) { + + # Load database + .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL) - ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.") - ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.") - - # Load database file - .self$.init.db() + # Save column names + .db.orig.colnames <<- colnames(.self$.db) + } +}) - # Check that this field tag is defined in the fields list - .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid.")) +############# +# Set field # +############# - # Check that columns are defined in database file - all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file.")) +MassFiledbConn$methods( setField = function(tag, colname) { + + ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.") + ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.") - # Set new definition - if (length(colname) == 1) - .fields[[tag]] <<- colname - else { - new.col <- paste(colname, collapse = ".") - .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '') - .fields[[tag]] <<- new.col - } - }) + # Load database file + .self$.init.db() + + # Check that this field tag is defined in the fields list + .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid.")) + + # Check that columns are defined in database file + all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file.")) - ###################################### - # SET FIELD MULTIPLE VALUE SEPARATOR # - ###################################### - - MassFiledbConn$methods( setFieldMultValSep = function(sep) { - .field.multval.sep <<- sep - }) + # Set new definition + if (length(colname) == 1) + .fields[[tag]] <<- colname + else { + new.col <- paste(colname, collapse = ".") + .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '') + .fields[[tag]] <<- new.col + } - ################ - # SET MS MODES # - ################ + # Update data frame column names + colnames(.self$.db) <- vapply(.self$.db.orig.colnames, function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') +}) - MassFiledbConn$methods( setMsMode = function(mode, value) { - .self$.ms.modes[[mode]] <- value - }) +###################################### +# SET FIELD MULTIPLE VALUE SEPARATOR # +###################################### - ########################## - # GET ENTRY CONTENT TYPE # - ########################## +MassFiledbConn$methods( setFieldMultValSep = function(sep) { + .field.multval.sep <<- sep +}) - MassFiledbConn$methods( getEntryContentType = function(type) { - return(BIODB.DATAFRAME) - }) +################ +# SET MS MODES # +################ - ########### - # INIT DB # - ########### +MassFiledbConn$methods( setMsMode = function(mode, value) { + .self$.ms.modes[[mode]] <- value +}) - MassFiledbConn$methods( .init.db = function() { - - if (is.null(.self$.db)) { +########################## +# GET ENTRY CONTENT TYPE # +########################## - # Load database - .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL) +MassFiledbConn$methods( getEntryContentType = function(type) { + return(BIODB.DATAFRAME) +}) - # Rename columns - colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') - } - }) +################ +# CHECK FIELDS # +################ - ################ - # CHECK FIELDS # - ################ +MassFiledbConn$methods( .check.fields = function(fields) { + + if (length(fields) ==0 || (length(fields) == 1 && is.na(fields))) + return - MassFiledbConn$methods( .check.fields = function(fields) { + # Check if fields are known + unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)] + if (length(unknown.fields) > 0) + stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown.")) - # Check if fields are known - unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)] - if (length(unknown.fields) > 0) - stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown.")) + # Init db + .self$.init.db() - # Init db - .self$.init.db() + # Check if fields are defined in file database + undefined.fields <- colnames(.self$.db)[ ! fields %in% colnames(.self$.db)] + if (length(undefined.fields) > 0) + stop(paste0("Column(s) ", paste(fields), collapse = ", "), " is/are undefined in file database.") +}) - # Check if fields are defined in file database - undefined.fields <- colnames(.self$.init.db)[ ! unlist(.self$.fields[fields]) %in% colnames(.self$.init.db)] - if (length(undefined.fields) > 0) - stop(paste0("Column(s) ", paste(unlist(.self$.fields[fields]), collapse = ", "), " is/are undefined in file database.")) - }) +########## +# SELECT # +########## + +# Select data from database +MassFiledbConn$methods( .select = function(cols = NULL, mode = NULL, compound.ids = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) { + + x <- NULL - ################ - # EXTRACT COLS # - ################ - - MassFiledbConn$methods( .extract.cols = function(cols, mode = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) { - - x <- NULL + # Init db + .self$.init.db() - if ( ! is.null(cols) && ! is.na(cols)) { + # Get db + db <- .self$.db - # Init db - .self$.init.db() - - # TODO check existence of cols/fields + # Filter db on mode + if ( ! is.null(mode) && ! is.na(mode)) { - # Get db, eventually filtering it. - if (is.null(mode)) - db <- .self$.db - else { - # Check mode value - mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'.")) - .self$.check.fields(BIODB.MSMODE) + # Check mode value + mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'.")) + .self$.check.fields(BIODB.MSMODE) - # Filter on mode - db <- .self$.db[.self$.db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ] - } + # Filter on mode + db <- db[db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ] + } - # Get subset - x <- db[, unlist(.self$.fields[cols]), drop = drop] + # Filter db on compound ids + # TODO + + if ( ! is.null(cols) && ! is.na(cols)) + .self$.check.fields(cols) - # Rename columns - if (is.data.frame(x)) - colnames(x) <- cols + # Get subset + if (is.null(cols) || is.na(cols)) + x <- db + else + x <- db[, unlist(.self$.fields[cols]), drop = drop] - # Rearrange - if (drop && is.vector(x)) { - if (uniq) - x <- x[ ! duplicated(x)] - if (sort) - x <- sort(x) - } + # Rearrange + if (drop && is.vector(x)) { + if (uniq) + x <- x[ ! duplicated(x)] + if (sort) + x <- sort(x) + } - # Cut - if ( ! is.na(max.rows)) - x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ] - } + # Cut + if ( ! is.na(max.rows)) + x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ] + + return(x) +}) + +################# +# GET ENTRY IDS # +################# - return(x) - }) +MassFiledbConn$methods( getEntryIds = function(type) { + + ids <- NA_character_ + + if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND)) + ids <- as.character(.self$.select(cols = if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE)) - ################# - # GET ENTRY IDS # - ################# - - MassFiledbConn$methods( getEntryIds = function(type) { + return(ids) +}) - ids <- NA_character_ - - if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND)) - ids <- as.character(.self$.extract.cols(if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE)) +################## +# GET NB ENTRIES # +################## - return(ids) - }) +MassFiledbConn$methods( getNbEntries = function(type) { + return(length(.self$getEntryIds(type))) +}) + +############################### +# GET CHROMATOGRAPHIC COLUMNS # +############################### - ################## - # GET NB ENTRIES # - ################## - - MassFiledbConn$methods( getNbEntries = function(type) { - return(length(.self$getEntryIds(type))) - }) +# Inherited from MassdbConn. +MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) { + + # Extract needed columns + db <- .self$.select(cols = c(BIODB.COMPOUND.ID, BIODB.CHROM.COL)) - ############################### - # GET CHROMATOGRAPHIC COLUMNS # - ############################### - - # Inherited from MassdbConn. - MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) { + # Filter on molecule IDs + if ( ! is.null(compound.ids)) + db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ] + + # Get column names + cols <- db[[BIODB.CHROM.COL]] - # Extract needed columns - db <- .self$.extract.cols(c(BIODB.COMPOUND.ID, BIODB.CHROM.COL)) + # Remove duplicates + cols <- cols[ ! duplicated(cols)] - # Filter on molecule IDs - if ( ! is.null(compound.ids)) - db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ] + # Make data frame + chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE) + colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE) - # Get column names - cols <- db[[BIODB.CHROM.COL]] + return(chrom.cols) +}) - # Remove duplicates - cols <- cols[ ! duplicated(cols)] +################# +# GET MZ VALUES # +################# - # Make data frame - chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE) - colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE) +# Inherited from MassdbConn. +MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { - return(chrom.cols) - }) - - ################# - # GET MZ VALUES # - ################# - - # Inherited from MassdbConn. - MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { + # Get mz values + mz <- .self$.select(cols = BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results) + + return(mz) +}) - # Get mz values - mz <- .self$.extract.cols(BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results) +################ +# GET NB PEAKS # +################ - return(mz) - }) +# Inherited from MassdbConn. +MassFiledbConn$methods( getNbPeaks = function(mode = NULL, compound.ids = NULL) { -} + # Get peaks + peaks <- .self$.select(cols = BIODB.PEAK.MZTHEO, mode = mode, compound.ids = compound.ids) + + return(length(peaks)) +})