# HG changeset patch
# User prog
# Date 1488462900 18000
# Node ID 20d69a062da3b25cb2c8aa5a44b89477a4ec84c4
# Parent 253d531a0193b67ee318762fdac9a25ee05038d8
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
diff -r 253d531a0193 -r 20d69a062da3 BiodbConn.R
--- a/BiodbConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/BiodbConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,93 +1,80 @@
-if ( ! exists('BiodbConn')) {
-
- source('biodb-common.R')
+#####################
+# CLASS DECLARATION #
+#####################
- #####################
- # CLASS DECLARATION #
- #####################
-
- BiodbConn <- setRefClass("BiodbConn", fields = list( .debug = "logical" ))
+BiodbConn <- methods::setRefClass("BiodbConn", contains = "BiodbObject", fields = list( .debug = "logical" ))
- ###############
- # CONSTRUCTOR #
- ###############
+###############
+# CONSTRUCTOR #
+###############
- BiodbConn$methods( initialize = function(debug = FALSE, ...) {
- .debug <<- debug
- })
+BiodbConn$methods( initialize = function(debug = FALSE, ...) {
+ .debug <<- debug
+})
- #######################
- # PRINT DEBUG MESSAGE #
- #######################
+#######################
+# PRINT DEBUG MESSAGE #
+#######################
- BiodbConn$methods( .print.debug.msg = function(msg) {
- if (.self$.debug)
- .print.msg(msg = msg, class = class(.self))
- })
+BiodbConn$methods( .print.debug.msg = function(msg) {
+ if (.self$.debug)
+ .print.msg(msg = msg, class = class(.self))
+})
- ######################
- # HANDLES ENTRY TYPE #
- ######################
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
- BiodbConn$methods( handlesEntryType = function(type) {
- return( ! is.null(.self$getEntryContentType(type)))
- })
-
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+BiodbConn$methods( getEntryContentType = function() {
+ .self$.abstract.method()
+})
- BiodbConn$methods( getEntryContentType = function(type) {
- stop("Method getEntryContentType() is not implemented in concrete class.")
- })
+#############
+# GET ENTRY #
+#############
- #############
- # GET ENTRY #
- #############
+BiodbConn$methods( getEntry = function(id, drop = TRUE) {
+ content <- .self$getEntryContent(id)
+ return(.self$createEntry(content, drop = drop))
+})
- BiodbConn$methods( getEntry = function(type, id, drop = TRUE) {
- content <- .self$getEntryContent(type, id)
- return(.self$createEntry(type, content, drop = drop))
- })
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+# Download entry content from the public database.
+# type The entry type.
+# id The ID of the entry to get.
+# RETURN An entry content downloaded from database.
+BiodbConn$methods( getEntryContent = function(id) {
+ .self$.abstract.method()
+})
+
+#############################
+# CREATE ENTRY FROM CONTENT #
+#############################
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- # Download entry content from the public database.
- # type The entry type.
- # id The ID of the entry to get.
- # RETURN An entry content downloaded from database.
- BiodbConn$methods( getEntryContent = function(type, id) {
- stop("Method getCompound() is not implemented in concrete class.")
- })
-
- #############################
- # CREATE ENTRY FROM CONTENT #
- #############################
-
- # Creates a Compound instance from file content.
- # content A file content, downloaded from the public database.
- # RETURN A compound instance.
- BiodbConn$methods( createEntry = function(type, content, drop = TRUE) {
- stop("Method createEntry() is not implemented in concrete class.")
- })
+# Creates a Compound instance from file content.
+# content A file content, downloaded from the public database.
+# RETURN A compound instance.
+BiodbConn$methods( createEntry = function(content, drop = TRUE) {
+ .self$.abstract.method()
+})
+
+#################
+# GET ENTRY IDS #
+#################
- #################
- # GET ENTRY IDS #
- #################
-
- # Get a list of IDs of all entries contained in this database.
- BiodbConn$methods( getEntryIds = function(type) {
- stop("Method getEntryIds() is not implemented in concrete class.")
- })
+# Get a list of IDs of all entries contained in this database.
+BiodbConn$methods( getEntryIds = function(max.results = NA_integer_) {
+ .self$.abstract.method()
+})
- ##################
- # GET NB ENTRIES #
- ##################
-
- # Get the number of entries contained in this database.
- BiodbConn$methods( getNbEntries = function(type) {
- stop("Method getNbEntries() is not implemented in concrete class.")
- })
-}
+##################
+# GET NB ENTRIES #
+##################
+
+# Get the number of entries contained in this database.
+BiodbConn$methods( getNbEntries = function() {
+ .self$.abstract.method()
+})
diff -r 253d531a0193 -r 20d69a062da3 BiodbEntry.R
--- a/BiodbEntry.R Sat Sep 03 17:02:01 2016 -0400
+++ b/BiodbEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,159 +1,182 @@
-if ( ! exists('BiodbEntry')) { # Do not load again if already loaded
+#############
+# CONSTANTS #
+#############
+
+BIODB.BASIC.CLASSES <- c('character', 'integer', 'double', 'logical')
- source('biodb-common.R')
+########################
+# ENTRY ABSTRACT CLASS #
+########################
+
+BiodbEntry <- methods::setRefClass("BiodbEntry", fields = list(.fields ='list', .factory = "ANY"))
+
+###############
+# CONSTRUCTOR #
+###############
- ########################
- # ENTRY ABSTRACT CLASS #
- ########################
-
- BiodbEntry <- setRefClass("BiodbEntry", fields = list(.fields ='list', .factory = "ANY"))
-
- ###############
- # CONSTRUCTOR #
- ###############
-
- BiodbEntry$methods( initialize = function(...) {
-
- .fields <<- list()
- .factory <<- NULL
-
- callSuper(...) # calls super-class initializer with remaining parameters
- })
-
- ###################
- # SET FIELD VALUE #
- ###################
-
- BiodbEntry$methods( setFieldValue = function(field, value) {
- .self$setField(field, value)
- })
+BiodbEntry$methods( initialize = function(...) {
+
+ .fields <<- list()
+ .factory <<- NULL
+
+ callSuper(...) # calls super-class initializer with remaining parameters
+})
+
+###################
+# SET FIELD VALUE #
+###################
+
+BiodbEntry$methods( setFieldValue = function(field, value) {
+
+ class = .self$getFieldClass(field)
- BiodbEntry$methods( setField = function(field, value) {
-
- class = .self$getFieldClass(field)
+ # Secific case to handle objects.
+ if ( class ==" object" & !(isS4(value) & methods::is(value, "refClass")))
+ stop(paste0('Cannot set a non RC instance to field "', field, '" in BiodEntry.'))
+
+ # Check cardinality
+ if (class != 'data.frame' && .self$getFieldCardinality(field) == BIODB.CARD.ONE && length(value) > 1)
+ stop(paste0('Cannot set more that one value to single value field "', field, '" in BiodEntry.'))
- # Check cardinality
- if (class != 'data.frame' && .self$getFieldCardinality(field) == BIODB.CARD.ONE && length(value) > 1)
- stop(paste0('Cannot set more that one value to single value field "', field, '" in BiodEntry.'))
+ # Check value class
+ value <- switch(class,
+ 'character' = as.character(value),
+ 'double' = as.double(value),
+ 'integer' = as.integer(value),
+ 'logical' = as.logical(value),
+ value)
+ # TODO check value class
- # Check value class
- value <- switch(class,
- 'character' = as.character(value),
- 'double' = as.double(value),
- 'integer' = as.integer(value),
- 'logical' = as.logical(value),
- value)
- # TODO check value class
+ .self$.fields[[field]] <- value
+})
- .self$.fields[[field]] <- value
- })
+###################
+# GET FIELD NAMES #
+###################
- ###################
- # GET FIELD NAMES #
- ###################
+BiodbEntry$methods( getFieldNames = function(field) {
+ return(names(.self$.fields))
+})
- BiodbEntry$methods( getFieldNames = function(field) {
- return(names(.self$.fields))
- })
+#############
+# HAS FIELD #
+#############
- ###################
- # GET FIELD CLASS #
- ###################
+BiodbEntry$methods( hasField = function(field) {
+ return(field %in% names(.self$.fields))
+})
- BiodbEntry$methods( getFieldClass = function(field) {
+###################
+# GET FIELD CLASS #
+###################
- if ( ! field %in% BIODB.FIELDS[['name']])
- stop(paste0('Unknown field "', field, '" in BiodEntry.'))
+BiodbEntry$methods( getFieldClass = function(field) {
- field.class <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'class']
+ if ( ! field %in% BIODB.FIELDS[['name']])
+ stop(paste0('Unknown field "', field, '" in BiodEntry.'))
- return(field.class)
- })
+ field.class <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'class']
+
+ return(field.class)
+})
- #########################
- # GET FIELD CARDINALITY #
- #########################
-
- BiodbEntry$methods( getFieldCardinality = function(field) {
+#########################
+# FIELD HAS BASIC CLASS #
+#########################
+
+BiodbEntry$methods( fieldHasBasicClass = function(field) {
+ return(.self$getFieldClass(field) %in% BIODB.BASIC.CLASSES)
+})
- if ( ! field %in% BIODB.FIELDS[['name']])
- stop(paste0('Unknown field "', field, '" in BiodEntry.'))
+#########################
+# GET FIELD CARDINALITY #
+#########################
- field.card <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'cardinality']
+BiodbEntry$methods( getFieldCardinality = function(field) {
+
+ if ( ! field %in% BIODB.FIELDS[['name']])
+ stop(paste0('Unknown field "', field, '" in BiodEntry.'))
+
+ field.card <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'cardinality']
- return(field.card)
- })
-
- ###################
- # GET FIELD VALUE #
- ###################
-
- BiodbEntry$methods( getFieldValue = function(field) {
- return(.self$getField(field))
- })
+ return(field.card)
+})
+
+###################
+# GET FIELD VALUE #
+###################
- BiodbEntry$methods( getField = function(field) {
+BiodbEntry$methods( getFieldValue = function(field, compute = TRUE) {
- if ( ! field %in% BIODB.FIELDS[['name']])
- stop(paste0('Unknown field "', field, '" in BiodEntry.'))
+ if ( ! field %in% BIODB.FIELDS[['name']])
+ stop(paste0('Unknown field "', field, '" in BiodEntry.'))
- if (field %in% names(.self$.fields))
- return(.self$.fields[[field]])
- else if (.self$.compute.field(field))
- return(.self$.fields[[field]])
+ if (field %in% names(.self$.fields))
+ return(.self$.fields[[field]])
+ else if (compute && .self$.compute.field(field))
+ return(.self$.fields[[field]])
- # Return NULL or NA
- class = .self$getFieldClass(field)
- return(if (class %in% c('character', 'integer', 'double', 'logical')) as.vector(NA, mode = class) else NULL)
- })
-
- #################
- # COMPUTE FIELD #
- ##################
-
- BiodbEntry$methods( .compute.field = function(field) {
+ # Return NULL or NA
+ class = .self$getFieldClass(field)
+ return(if (class %in% BIODB.BASIC.CLASSES) as.vector(NA, mode = class) else NULL)
+})
+
+#################
+# COMPUTE FIELD #
+##################
- if ( ! is.null(.self$.factory) && field %in% names(BIODB.FIELD.COMPUTING)) {
- for (db in BIODB.FIELD.COMPUTING[[field]]) {
- db.id <- .self$getField(paste0(db, 'id'))
- if ( ! is.na(db.id)) {
- db.compound <- .self$.factory$createEntry(db, type = BIODB.COMPOUND, id = db.id)
- if ( ! is.null(db.compound)) {
- .self$setField(field, db.compound$getField(field))
- return(TRUE)
- }
+BiodbEntry$methods( .compute.field = function(field) {
+
+ if ( ! is.null(.self$.factory) && field %in% names(BIODB.FIELD.COMPUTING)) {
+ for (db in BIODB.FIELD.COMPUTING[[field]]) {
+ db.id <- .self$getField(paste0(db, 'id'))
+ if ( ! is.na(db.id)) {
+ db.entry <- .self$.factory$createEntry(db, id = db.id)
+ if ( ! is.null(db.entry)) {
+ .self$setField(field, db.entry$getField(field))
+ return(TRUE)
}
}
}
+ }
- return(FALSE)
- })
-
- ############################
- # GET FIELDS AS DATA FRAME #
- ############################
-
- BiodbEntry$methods( getFieldsAsDataFrame = function(field) {
+ return(FALSE)
+})
- df <- data.frame()
+############################
+# GET FIELDS AS DATA FRAME #
+############################
+###TODO add a limiting option to get some fields.
+BiodbEntry$methods( getFieldsAsDataFrame = function() {
+ df <- data.frame()
+ # Loop on all fields
+ for (f in names(.self$.fields))
- # Loop on all fields
- for (f in names(.self$.fields))
+ # If field class is a basic type
+ if (.self$getFieldClass(f) %in% c('character', 'logical', 'integer', 'double') &
+ length(.self$getFieldValue(f)) == 1)
+ df[1, f] <- .self$getFieldValue(f)
- # If field class is a basic type
- if (.self$getFieldClass(f) %in% c('character', 'logical', 'integer', 'double'))
- df[1, f] <- .self$getFieldValue(f)
+ return(df)
+})
- return(df)
- })
+###########
+# FACTORY #
+###########
+
+BiodbEntry$methods( setFactory = function(factory) {
+ is.null(factory) || inherits(factory, "BiodbFactory") || stop("The factory instance must inherit from BiodbFactory class.")
+ .factory <<- factory
+})
- ###########
- # FACTORY #
- ###########
-
- BiodbEntry$methods( setFactory = function(factory) {
+##############
+# DEPRECATED #
+##############
- is.null(factory) || inherits(factory, "BiodbFactory") || stop("The factory instance must inherit from BiodbFactory class.")
- .factory <<- factory
- })
-}
+BiodbEntry$methods( getField = function(field) {
+ return(.self$getFieldValue(field))
+})
+
+BiodbEntry$methods( setField = function(field, value) {
+ .self$setFieldValue(field, value)
+})
diff -r 253d531a0193 -r 20d69a062da3 BiodbFactory.R
--- a/BiodbFactory.R Sat Sep 03 17:02:01 2016 -0400
+++ b/BiodbFactory.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,233 +1,274 @@
-if ( ! exists('BiodbFactory')) { # Do not load again if already loaded
-
- library(methods)
- source('biodb-common.R')
- source('ChebiConn.R')
- source('KeggConn.R')
- source('PubchemConn.R')
- source('HmdbConn.R')
- source('ChemspiderConn.R')
- source('EnzymeConn.R')
- source('LipidmapsConn.R')
- source('MirbaseConn.R')
- source('NcbigeneConn.R')
- source('NcbiccdsConn.R')
- source('UniprotConn.R')
- source('MassbankConn.R')
- source('MassFiledbConn.R')
+# vi: fdm=marker
+
+##########################
+# CLASS DECLARATION {{{1 #
+##########################
+
+BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character",
+ .conn = "list",
+ .cache.dir = "character",
+ .cache.mode = "character",
+ .debug = "logical",
+ .chunk.size = "integer",
+ .use.env.var = "logical"))
+
+###############
+# CONSTRUCTOR #
+###############
+
+BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) {
- #####################
- # CLASS DECLARATION #
- #####################
-
- BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character", .debug = "logical"))
-
- ###############
- # CONSTRUCTOR #
- ###############
-
- BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, debug = FALSE, ...) {
-
- .useragent <<- useragent
- .conn <<- list()
- .cache.dir <<- cache.dir
- .debug <<- debug
+ .useragent <<- useragent
+ .conn <<- list()
+ .cache.dir <<- cache.dir
+ .cache.mode <<- cache.mode
+ .debug <<- debug
+ .chunk.size <<- as.integer(chunk.size)
+ .use.env.var <<- use.env.var
- callSuper(...) # calls super-class initializer with remaining parameters
- })
+ callSuper(...) # calls super-class initializer with remaining parameters
+})
+
+#######################
+# PRINT DEBUG MESSAGE #
+#######################
- #######################
- # PRINT DEBUG MESSAGE #
- #######################
-
- BiodbFactory$methods( .print.debug.msg = function(msg) {
- if (.self$.debug)
- .print.msg(msg = msg, class = class(.self))
- })
+BiodbFactory$methods( .print.debug.msg = function(msg) {
+ if (.self$.debug)
+ .print.msg(msg = msg, class = class(.self))
+})
- ##################
- # GET USER AGENT #
- ##################
+##################
+# GET USER AGENT #
+##################
- BiodbFactory$methods( getUserAgent = function() {
- return(.self$.useragent)
- })
+BiodbFactory$methods( getUserAgent = function() {
+ return(.self$.useragent)
+})
- ##################
- # SET USER AGENT #
- ##################
+##################
+# SET USER AGENT #
+##################
BiodbFactory$methods( setUserAgent = function(useragent) {
- .useragent <<- useragent
- })
+ "Set useragent of BiodbFactory."
+ .useragent <<- useragent
+})
+
+###############
+# CREATE CONN #
+###############
+
+BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) {
+ " Create connection to databases useful for metabolomics."
+ if (class %in% names(.self$.conn))
+ stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.'))
+
+ # Use environment variables
+ if (.self$.use.env.var) {
+ if (is.na(url))
+ url <- .biodb.get.env.var(c(class, 'URL'))
+ if (is.na(token))
+ token <- .biodb.get.env.var(c(class, 'TOKEN'))
+ }
+
+ # Create connection instance
+ conn <- switch(class,
+ chebi = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ kegg = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug),
+ pubchemsub = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug),
+ hmdb = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ chemspider = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token),
+ enzyme = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ lipidmaps = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ mirbase = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ ncbigene = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ uniprot = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ massbank = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug),
+ massfiledb = MassFiledbConn$new(file = url, debug = .self$.debug),
+ peakforest = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug),
+ NULL)
+
+ # Unknown class
+ if (is.null(conn))
+ stop(paste0("Unknown r-biodb class \"", class,"\"."))
+
+ # Register new class
+ .self$.conn[[class]] <- conn
+
+ return (.self$.conn[[class]])
+})
- ############
- # GET CONN #
- ############
+############
+# GET CONN #
+############
+
+BiodbFactory$methods( getConn = function(class) {
+ "Get connection to a database."
+
+ if ( ! class %in% names(.self$.conn))
+ .self$createConn(class)
+
+ return (.self$.conn[[class]])
+})
+
+################
+# CREATE ENTRY #
+################
+
+BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) {
+ "Create Entry from a database by id."
+
+ is.null(id) && is.null(content) && stop("One of id or content must be set.")
+ ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.")
+
+ # Debug
+ .self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(if (length(id) > 10) id[1:10] else id, collapse = ", ")), "..."))
- BiodbFactory$methods( getConn = function(class, url = NA_character_) {
+ # Get content
+ if ( ! is.null(id))
+ content <- .self$getEntryContent(class, id)
+ conn <- .self$getConn(class)
+ entry <- conn$createEntry(content = content, drop = drop)
+
+ # Set factory
+ .self$.print.debug.msg(paste0("Setting factory reference into entries..."))
+ for (e in c(entry))
+ if ( ! is.null(e))
+ e$setFactory(.self)
+
+ return(entry)
+})
- if ( ! class %in% names(.self$.conn)) {
+########################
+# GET CACHE FILE PATHS #
+########################
+
+BiodbFactory$methods( .get.cache.file.paths = function(class, id) {
+
+ # Get extension
+ ext <- .self$getConn(class)$getEntryContentType()
+
+ # Set filenames
+ filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '')
+
+ # set file paths
+ file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '')
- # Create connection instance
- conn <- switch(class,
- chebi = ChebiConn$new(useragent = .self$.useragent),
- kegg = KeggConn$new(useragent = .self$.useragent),
- pubchem = PubchemConn$new(useragent = .self$.useragent),
- hmdb = HmdbConn$new(useragent = .self$.useragent),
- chemspider = ChemspiderConn$new(useragent = .self$.useragent),
- enzyme = EnzymeConn$new(useragent = .self$.useragent),
- lipidmaps = LipidmapsConn$new(useragent = .self$.useragent),
- mirbase = MirbaseConn$new(useragent = .self$.useragent),
- ncbigene = NcbigeneConn$new(useragent = .self$.useragent),
- ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent),
- uniprot = UniprotConn$new(useragent = .self$.useragent),
- massbank = MassbankConn$new(useragent = .self$.useragent, debug = .self$.debug),
- massfiledb = MassFiledbConn$new(file = url),
- NULL)
+ # Create cache dir if needed
+ if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir))
+ dir.create(.self$.cache.dir)
+
+ return(file.paths)
+})
+
+###########################
+# LOAD CONTENT FROM CACHE #
+###########################
+
+BiodbFactory$methods( .load.content.from.cache = function(class, id) {
+
+ content <- NULL
+
+ # Read contents from files
+ file.paths <- .self$.get.cache.file.paths(class, id)
+ content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} )
+
+ return(content)
+})
+
+############################
+# IS CACHE READING ENABLED #
+############################
+
+BiodbFactory$methods( .is.cache.reading.enabled = function() {
+ return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE))
+})
+
+############################
+# IS CACHE WRITING ENABLED #
+############################
+
+BiodbFactory$methods( .is.cache.writing.enabled = function() {
+ return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE))
+})
+
+#########################
+# SAVE CONTENT TO CACHE #
+#########################
+
+BiodbFactory$methods( .save.content.to.cache = function(class, id, content) {
+
+ # Write contents into files
+ file.paths <- .self$.get.cache.file.paths(class, id)
+ mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
- # Unknown class
- if (is.null(conn))
- stop(paste0("Unknown r-biodb class \"", class,"\"."))
+BiodbFactory$methods( getEntryContent = function(class, id) {
+
+ # Debug
+ .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)..."))
+
+ # Initialize content
+ if (.self$.is.cache.reading.enabled()) {
+ content <- .self$.load.content.from.cache(class, id)
+ missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)]
+ }
+ else {
+ content <- lapply(id, as.null)
+ missing.ids <- id
+ }
+
+ # Remove duplicates
+ n.duplicates <- sum(duplicated(missing.ids))
+ missing.ids <- missing.ids[ ! duplicated(missing.ids)]
- .self$.conn[[class]] <- conn
+ # Debug
+ if (any(is.na(id)))
+ .self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA."))
+ if (.self$.is.cache.reading.enabled()) {
+ .self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache."))
+ if (n.duplicates > 0)
+ .self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates."))
+ .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched."))
+ }
+
+ # Get contents
+ if (length(missing.ids) > 0) {
+
+ # Use connector to get missing contents
+ conn <- .self$getConn(class)
+
+ # Divide list of missing ids in chunks (in order to save in cache regularly)
+ chunks.of.missing.ids = if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size))
+
+ # Loop on chunks
+ missing.contents <- NULL
+ for (ch.missing.ids in chunks.of.missing.ids) {
+
+ ch.missing.contents <- conn$getEntryContent(ch.missing.ids)
+
+ # Save to cache
+ if ( ! is.null(ch.missing.contents) && .self$.is.cache.writing.enabled())
+ .self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents)
+
+ # Append
+ missing.contents <- c(missing.contents, ch.missing.contents)
+
+ # Debug
+ if (.self$.is.cache.reading.enabled())
+ .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved..."))
}
- return (.self$.conn[[class]])
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- BiodbFactory$methods( createEntry = function(class, type, id = NULL, content = NULL, drop = TRUE) {
-
- is.null(id) && is.null(content) && stop("One of id or content must be set.")
- ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.")
-
- # Debug
- .self$.print.debug.msg(paste0("Creating entry from ", if (is.null(id)) "content" else paste("id", id), "..."))
-
- # Get content
- if ( ! is.null(id))
- content <- .self$getEntryContent(class, type, id)
-
- conn <- .self$getConn(class)
- entry <- conn$createEntry(type = type, content = content, drop = drop)
-
- # Set factory
- for (e in c(entry))
- e$setFactory(.self)
-
- return(entry)
- })
-
- ########################
- # GET CACHE FILE PATHS #
- ########################
-
- BiodbFactory$methods( .get.cache.file.paths = function(class, type, id) {
-
- # Get extension
- ext <- .self$getConn(class)$getEntryContentType(type)
-
- # Set filenames
- filenames <- vapply(id, function(x) paste0(class, '-', type, '-', x, '.', ext), FUN.VALUE = '')
-
- # set file paths
- file.paths <- vapply(filenames, function(x) file.path(.self$.cache.dir, x), FUN.VALUE = '')
-
- # Create cache dir if needed
- if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir))
- dir.create(.self$.cache.dir)
-
- return(file.paths)
- })
-
- ###########################
- # LOAD CONTENT FROM CACHE #
- ###########################
-
- BiodbFactory$methods( .load.content.from.cache = function(class, type, id) {
-
- content <- NULL
-
- # Read contents from files
- file.paths <- .self$.get.cache.file.paths(class, type, id)
- content <- lapply(file.paths, function(x) { if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL })
-
- return(content)
- })
+ # Merge content and missing.contents
+ content[id %in% missing.ids] <- vapply(id[id %in% missing.ids], function(x) missing.contents[missing.ids %in% x], FUN.VALUE = '')
+ }
- #########################
- # SAVE CONTENT TO CACHE #
- #########################
-
- BiodbFactory$methods( .save.content.to.cache = function(class, type, id, content) {
-
- # Write contents into files
- file.paths <- .self$.get.cache.file.paths(class, type, id)
- mapply(function(c, f) { if ( ! is.null(c)) writeLines(c, f) }, content, file.paths)
- })
-
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- BiodbFactory$methods( getEntryContent = function(class, type, id, chunk.size = NA_integer_) {
-
- # Debug
- .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)..."))
-
- content <- NULL
- # Load from cache
- if ( ! is.na(.self$.cache.dir))
- content <- .self$.load.content.from.cache(class, type, id)
-
- # Get list of missing contents
- missing.content.indexes <- vapply(content, is.null, FUN.VALUE = TRUE)
- missing.ids <- if (is.null(content)) id else id[missing.content.indexes]
-
- # Debug
- if ( ! is.na(.self$.cache.dir)) {
- .self$.print.debug.msg(paste0(length(id) - length(missing.ids), " entry content(s) loaded from cache."))
- .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched."))
- }
-
- # Get contents
- if (length(missing.ids) > 0) {
-
- # Use connector to get missing contents
- conn <- .self$getConn(class)
-
- # Divide list of missing ids in chunks (in order to save in cache regularly)
- chunks.of.missing.ids = if (is.na(chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / chunk.size))
-
- # Loop on chunks
- missing.contents <- NULL
- for (ch.missing.ids in chunks.of.missing.ids) {
-
- ch.missing.contents <- conn$getEntryContent(type, ch.missing.ids)
-
- # Save to cache
- if ( ! is.null(ch.missing.contents) && ! is.na(.self$.cache.dir))
- .self$.save.content.to.cache(class, type, ch.missing.ids, ch.missing.contents)
-
- # Append
- missing.contents <- c(missing.contents, ch.missing.contents)
-
- # Debug
- if ( ! is.na(.self$.cache.dir))
- .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved..."))
- }
-
- # Merge content and missing.contents
- if (is.null(content))
- content <- missing.contents
- else
- content[missing.content.indexes] <- missing.contents
- }
-
- return(content)
- })
-}
+ return(content)
+})
diff -r 253d531a0193 -r 20d69a062da3 BiodbLogger.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/BiodbLogger.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,47 @@
+# vi: fdm=marker
+
+##########################
+# CLASS DECLARATION {{{1 #
+##########################
+
+BiodbLogger <- methods::setRefClass("BiodbLogger", contains = 'BiodbObserver', fields = list(.verbose.level = 'integer', .debug.level = 'integer', .file = 'ANY', .fail.on.error = 'logical', .signal.warnings = 'logical'))
+
+####################
+# CONSTRUCTOR {{{1 #
+####################
+
+BiodbLogger$methods( initialize = function(verbose.level = 1, debug.level = 1, file = NULL, ...) {
+
+ .verbose.level <<- if ( ! is.null(verbose.level) && ! is.na(verbose.level)) verbose.level else 1
+ .debug.level <<- if ( ! is.null(debug.level) && ! is.na(debug.level)) debug.level else 1
+ .file <<- if ( ! is.null(file) && ! is.na(file)) file else stderr()
+ .fail.on.error <<- TRUE
+ .signal.warnings <<- TRUE
+
+ callSuper(...)
+})
+
+################
+# MESSAGE {{{1 #
+################
+
+BiodbLogger$methods( message = function(type = MSG.INFO, msg, level = 1) {
+ type %in% biodb::MSG.TYPES || .self$message(biodb::MSG.ERROR, paste0("Unknown message type ", type, "."))
+
+ display = TRUE
+ if (type == biodb::MSG.INFO && .self$.verbose.level < level)
+ display = FALSE
+ if (type == biodb::MSG.DEBUG && .self$.debug.level < level)
+ display = FALSE
+
+ if (display)
+ cat(type, ': ', msg, "\n", sep = '', file = .self$.file)
+
+ # Raise error
+ if (type == biodb::MSG.ERROR && .self$.fail.on.error)
+ stop(msg)
+
+ # Raise warning
+ if (type == biodb::MSG.WARNING && .self$.signal.warnings)
+ warning(msg)
+})
diff -r 253d531a0193 -r 20d69a062da3 BiodbObject.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/BiodbObject.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,32 @@
+##########################
+# CLASS DECLARATION {{{1 #
+##########################
+
+BiodbObject <- methods::setRefClass("BiodbObject", fields = list( .observers = "ANY" ))
+
+########################
+# ABSTRACT METHOD {{{1 #
+########################
+
+BiodbObject$methods( .abstract.method = function() {
+
+ class <- class(.self)
+ method <- sys.call(length(sys.calls()) - 1)
+ method <- sub('^[^$]*\\$([^(]*)\\(.*$', '\\1()', method)
+
+ stop(paste("Method", method, "is not implemented in", class, "class."))
+})
+
+######################
+# ADD OBSERVERS {{{1 #
+######################
+
+BiodbObject$methods( addObservers = function(obs) {
+
+ # Check types of observers
+ if ( ( ! is.list(obs) && ! inherits(obs, "BiodbObserver")) || (is.list(obs) && any( ! vapply(obs, function(o) inherits(o, "BiodbObserver"), FUN.VALUE = TRUE))))
+ stop("Observers must inherit from BiodbObserver class.")
+
+ # Add observers to current list
+ .observers <<- if (is.null(.self$.observers)) c(obs) else c(.self$.observers, obs)
+})
diff -r 253d531a0193 -r 20d69a062da3 BiodbObserver.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/BiodbObserver.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,25 @@
+# vi: fdm=marker
+
+##########################
+# CLASS DECLARATION {{{1 #
+##########################
+
+BiodbObserver <- methods::setRefClass("BiodbObserver", fields = list())
+
+###################
+# CONSTANTS {{{ 1 #
+###################
+
+MSG.INFO <- 'INFO'
+MSG.DEBUG <- 'DEBUG'
+MSG.WARNING <- 'WARNING'
+MSG.ERROR <- 'ERROR'
+
+.MSG.TYPES <- c(MSG.ERROR, MSG.WARNING, MSG.DEBUG, MSG.INFO)
+
+################
+# MESSAGE {{{1 #
+################
+
+BiodbObserver$methods( message = function(type = MSG.INFO, msg, level = 1) {
+})
diff -r 253d531a0193 -r 20d69a062da3 ChebiCompound.R
--- a/ChebiCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-if ( ! exists('ChebiCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- ChebiCompound <- setRefClass("ChebiCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createChebiCompoundFromHtml <- function(contents, drop = TRUE) {
-
- library(XML)
-
- compounds <- list()
-
- # Define xpath expressions
- xpath.expr <- character()
-# xpath.expr[[BIODB.ACCESSION]] <- "//b[starts-with(., 'CHEBI:')]"
- xpath.expr[[BIODB.INCHI]] <- "//td[starts-with(., 'InChI=')]"
- xpath.expr[[BIODB.INCHIKEY]] <- "//td[text()='InChIKey']/../td[2]"
-
- for (html in contents) {
-
- # Create instance
- compound <- ChebiCompound$new()
-
- # Parse HTML
- xml <- htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE)
-
- # Test generic xpath expressions
- for (field in names(xpath.expr)) {
- v <- xpathSApply(xml, xpath.expr[[field]], xmlValue)
- if (length(v) > 0)
- compound$setField(field, v)
- }
-
- # Get accession
- accession <- xpathSApply(xml, "//b[starts-with(., 'CHEBI:')]", xmlValue)
- if (length(accession) > 0) {
- accession <- sub('^CHEBI:([0-9]+)$', '\\1', accession, perl = TRUE)
- compound$setField(BIODB.ACCESSION, accession)
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 ChebiConn.R
--- a/ChebiConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/ChebiConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,48 +1,59 @@
-if ( ! exists('ChebiConn')) { # Do not load again if already loaded
+#####################
+# CLASS DECLARATION #
+#####################
+
+ChebiConn <- methods::setRefClass("ChebiConn", contains = "RemotedbConn")
- source('RemotedbConn.R')
- source('ChebiCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- ChebiConn <- setRefClass("ChebiConn", contains = "RemotedbConn")
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
+
+ChebiConn$methods( getEntryContentType = function() {
+ return(BIODB.HTML)
+})
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+ChebiConn$methods( getEntryContent = function(id) {
- ChebiConn$methods( getEntryContentType = function(type) {
- return(BIODB.HTML)
- })
+ # Initialize return values
+ content <- rep(NA_character_, length(id))
- #####################
- # GET ENTRY CONTENT #
- #####################
+ # Request
+ content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.CHEBI, x)), FUN.VALUE = '')
+
+ return(content)
+})
- ChebiConn$methods( getEntryContent = function(type, id) {
-
- if (type == BIODB.COMPOUND) {
+################
+# CREATE ENTRY #
+################
- # Initialize return values
- content <- rep(NA_character_, length(id))
+ChebiConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createChebiEntryFromHtml(content, drop = drop))
+})
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.CHEBI, x)), FUN.VALUE = '')
-
- return(content)
- }
+##################
+# GET NB ENTRIES #
+##################
- return(NULL)
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- ChebiConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createChebiCompoundFromHtml(content, drop = drop) else NULL)
- })
+ChebiConn$methods( getNbEntries = function() {
+ return(NA_integer_)
+})
+
+#################
+# GET ENTRY IDS #
+#################
-} # end of load safe guard
+ChebiConn$methods( getEntryIds = function(max.results = NA_integer_) {
+ request <- "1*CHEBI ID100"
+ print('********************************************************************************')
+ print('********************************************************************************')
+ results <- .self$.scheduler$sendSoapRequest('http://www.ebi.ac.uk:80/webservices/chebi/2.0/webservice', request)
+ print(results)
+ print('********************************************************************************')
+ print('********************************************************************************')
+ return(NULL)
+})
diff -r 253d531a0193 -r 20d69a062da3 ChebiEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ChebiEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,57 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+ChebiEntry <- methods::setRefClass("ChebiEntry", contains = "BiodbEntry")
+
+###########
+# FACTORY #
+###########
+
+createChebiEntryFromHtml <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define xpath expressions
+ xpath.expr <- character()
+# xpath.expr[[BIODB.ACCESSION]] <- "//b[starts-with(., 'CHEBI:')]"
+ xpath.expr[[BIODB.INCHI]] <- "//td[starts-with(., 'InChI=')]"
+ xpath.expr[[BIODB.INCHIKEY]] <- "//td[text()='InChIKey']/../td[2]"
+
+ for (content in contents) {
+
+ # Create instance
+ entry <- ChebiEntry$new()
+
+ if ( ! is.null(content) && ! is.na(content)) {
+
+ # Parse HTML
+ xml <- XML::htmlTreeParse(content, asText = TRUE, useInternalNodes = TRUE)
+
+ # Test generic xpath expressions
+ for (field in names(xpath.expr)) {
+ v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
+ if (length(v) > 0)
+ entry$setField(field, v)
+ }
+
+ # Get accession
+ accession <- XML::xpathSApply(xml, "//b[starts-with(., 'CHEBI:')]", XML::xmlValue)
+ if (length(accession) > 0) {
+ accession <- sub('^CHEBI:([0-9]+)$', '\\1', accession, perl = TRUE)
+ entry$setField(BIODB.ACCESSION, accession)
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 ChemSpiderConn.R
--- a/ChemSpiderConn.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,60 +0,0 @@
-if ( ! exists('ChemspiderConn')) { # Do not load again if already loaded
-
- source('RemotedbConn.R')
- source('ChemspiderCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- ChemspiderConn <- setRefClass("ChemspiderConn", contains = "RemotedbConn")
-
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
-
- ChemspiderConn$methods( getEntryContentType = function(type) {
- return(BIODB.HTML)
- })
-
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- ChemspiderConn$methods( getEntryContent = function(type, id) {
-
- if (type == BIODB.COMPOUND) {
-
- # Initialize return values
- content <- rep(NA_character_, length(id))
-
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.CHEMSPIDER, x)), FUN.VALUE = '')
-
- return(content)
- }
-
- return(NULL)
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- ChemspiderConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createChemspiderCompoundFromHtml(content, drop = drop) else NULL)
- })
-
- ############################
- # GET CHEMSPIDER IMAGE URL #
- ############################
-
- get.chemspider.image.url <- function(id) {
-
- url <- paste0('http://www.chemspider.com/ImagesHandler.ashx?w=300&h=300&id=', id)
-
- return(url)
- }
-
-} # end of load safe guard
-
diff -r 253d531a0193 -r 20d69a062da3 ChemspiderCompound.R
--- a/ChemspiderCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-if ( ! exists('ChemspiderCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- ChemspiderCompound <- setRefClass("ChemspiderCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createChemspiderCompoundFromHtml <- function(contents, drop = TRUE) {
-
- library(XML)
-
- compounds <- list()
-
- # Define xpath expressions
- xpath.expr <- character()
-
- for (html in contents) {
-
- # Create instance
- compound <- ChemspiderCompound$new()
-
- # Parse HTML
- xml <- htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE)
-
- # Test generic xpath expressions
- for (field in names(xpath.expr)) {
- v <- xpathSApply(xml, xpath.expr[[field]], xmlValue)
- if (length(v) > 0)
- compound$setField(field, v)
- }
-
- # Get accession
- accession <- xpathSApply(xml, "//li[starts-with(., 'ChemSpider ID')]", xmlValue)
- if (length(accession) > 0) {
- accession <- sub('^ChemSpider ID([0-9]+)$', '\\1', accession, perl = TRUE)
- compound$setField(BIODB.ACCESSION, accession)
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
-
diff -r 253d531a0193 -r 20d69a062da3 ChemspiderConn.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ChemspiderConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,92 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+ChemspiderConn <- methods::setRefClass("ChemspiderConn", contains = "RemotedbConn")
+
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
+
+ChemspiderConn$methods( getEntryContentType = function() {
+ return(BIODB.XML)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+ChemspiderConn$methods( getEntryContent = function(ids) {
+
+ # Debug
+ .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)..."))
+
+ URL.MAX.LENGTH <- 2083
+
+ # Initialize return values
+ content <- rep(NA_character_, length(ids))
+
+ # Loop on all
+ n <- 0
+ inc <- NA_integer_
+ while (n < length(ids)) {
+
+ # Get list of accession ids to retrieve
+ accessions <- ids[(n + 1):(if (is.na(inc)) length(ids) else (min(n + inc, length(ids))))]
+
+ # Create URL request
+ x <- get.entry.url(class = BIODB.CHEMSPIDER, accession = accessions, content.type = BIODB.XML, max.length = URL.MAX.LENGTH, base.url = .self$.url, token = .self$.token)
+
+ # Debug
+ .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)..."))
+
+ # Send request
+ xmlstr <- .self$.get.url(x$url)
+
+ # Error : "Cannot convert WRONG to System.Int32.\r\nParameter name: type ---> Input string was not in a correct format.\r\n"
+ if (grepl('^Cannot convert .* to System\\.Int32\\.', xmlstr)) {
+ # One of the ids is incorrect
+ if (is.na(inc)) {
+ inc <- 1
+ next
+ }
+ else
+ xmlstr <- NA_character_
+ }
+
+ # Increase number of entries retrieved
+ n <- n + x$n
+
+ # Parse XML and get included XML
+ if ( ! is.na(xmlstr)) {
+ xml <- xmlInternalTreeParse(xmlstr, asText = TRUE)
+ ns <- c(csns = "http://www.chemspider.com/")
+ returned.ids <- xpathSApply(xml, "//csns:ExtendedCompoundInfo/csns:CSID", xmlValue, namespaces = ns)
+ content[match(returned.ids, ids)] <- vapply(getNodeSet(xml, "//csns:ExtendedCompoundInfo", namespaces = ns), saveXML, FUN.VALUE = '')
+ }
+
+ # Debug
+ .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved..."))
+ }
+
+ return(content)
+})
+
+################
+# CREATE ENTRY #
+################
+
+ChemspiderConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createChemspiderEntryFromXml(content, drop = drop))
+})
+
+############################
+# GET CHEMSPIDER IMAGE URL #
+############################
+
+get.chemspider.image.url <- function(id) {
+
+ url <- paste0('http://www.chemspider.com/ImagesHandler.ashx?w=300&h=300&id=', id)
+
+ return(url)
+}
diff -r 253d531a0193 -r 20d69a062da3 ChemspiderEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ChemspiderEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,103 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+ChemspiderEntry <- methods::setRefClass("ChemspiderEntry", contains = "BiodbEntry")
+
+############################
+# CREATE COMPOUND FROM XML #
+############################
+
+createChemspiderEntryFromXml <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define xpath expressions
+ xpath.expr <- character()
+ xpath.expr[[BIODB.ACCESSION]] <- "//CSID"
+ xpath.expr[[BIODB.FORMULA]] <- "//MF"
+ xpath.expr[[BIODB.NAME]] <- "//CommonName"
+ xpath.expr[[BIODB.AVERAGE.MASS]] <- "//AverageMass"
+ xpath.expr[[BIODB.INCHI]] <- "//InChI"
+ xpath.expr[[BIODB.INCHIKEY]] <- "//InChIKey"
+ xpath.expr[[BIODB.SMILES]] <- "//SMILES"
+
+ for (content in contents) {
+
+ # Create instance
+ entry <- ChemspiderEntry$new()
+
+ if ( ! is.null(content) && ! is.na(content) && content != 'NA') {
+
+ # Parse XML
+ xml <- XML::xmlInternalTreeParse(content, asText = TRUE)
+
+ # Test generic xpath expressions
+ for (field in names(xpath.expr)) {
+ v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
+ if (length(v) > 0)
+ entry$setField(field, v)
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
+
+#############################
+# CREATE COMPOUND FROM HTML #
+#############################
+
+createChemspiderEntryFromHtml <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define xpath expressions
+ xpath.expr <- character()
+
+ for (content in contents) {
+
+ # Create instance
+ entry <- ChemspiderEntry$new()
+
+ if ( ! is.null(content) && ! is.na(content)) {
+
+ # Parse HTML
+ xml <- XML::htmlTreeParse(content, asText = TRUE, useInternalNodes = TRUE)
+
+ # Test generic xpath expressions
+ for (field in names(xpath.expr)) {
+ v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
+ if (length(v) > 0)
+ entry$setField(field, v)
+ }
+
+ # Get accession
+ accession <- XML::xpathSApply(xml, "//li[starts-with(., 'ChemSpider ID')]", XML::xmlValue)
+ if (length(accession) > 0) {
+ accession <- sub('^ChemSpider ID([0-9]+)$', '\\1', accession, perl = TRUE)
+ entry$setField(BIODB.ACCESSION, accession)
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 EnzymeCompound.R
--- a/EnzymeCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-if ( ! exists('EnzymeCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- EnzymeCompound <- setRefClass("EnzymeCompound", contains = 'BiodbEntry')
-
- ###########
- # FACTORY #
- ###########
-
- createEnzymeCompoundFromTxt <- function(contents, drop = TRUE) {
-
- library(stringr)
-
- compounds <- list()
-
- # Define fields regex
- regex <- character()
- regex[[BIODB.ACCESSION]] <- "^ID\\s+([0-9.]+)$"
- regex[[BIODB.DESCRIPTION]] <- "^DE\\s+(.+)$"
-
- for (text in contents) {
-
- # Create instance
- compound <- EnzymeCompound$new()
-
- lines <- strsplit(text, "\n")
- for (s in lines[[1]]) {
-
- # Test generic regex
- parsed <- FALSE
- for (field in names(regex)) {
- g <- str_match(s, regex[[field]])
- if ( ! is.na(g[1,1])) {
- compound$setField(field, g[1,2])
- parsed <- TRUE
- break
- }
- }
- if (parsed)
- next
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
-
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 EnzymeConn.R
--- a/EnzymeConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/EnzymeConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,47 +1,36 @@
-if ( ! exists('EnzymeConn')) { # Do not load again if already loaded
+#####################
+# CLASS DECLARATION #
+#####################
- source('RemotedbConn.R')
- source('EnzymeCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
+EnzymeConn <- methods::setRefClass("EnzymeConn", contains = "RemotedbConn")
- EnzymeConn <- setRefClass("EnzymeConn", contains = "RemotedbConn")
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+EnzymeConn$methods( getEntryContentType = function() {
+ return(BIODB.TXT)
+})
- EnzymeConn$methods( getEntryContentType = function(type) {
- return(BIODB.TXT)
- })
+#####################
+# GET ENTRY CONTENT #
+#####################
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- EnzymeConn$methods( getEntryContent = function(type, id) {
+EnzymeConn$methods( getEntryContent = function(id) {
- if (type == BIODB.COMPOUND) {
+ # Initialize return values
+ content <- rep(NA_character_, length(id))
- # Initialize return values
- content <- rep(NA_character_, length(id))
-
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.ENZYME, accession = x, content.type = BIODB.TXT)), FUN.VALUE = '')
+ # Request
+ content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.ENZYME, accession = x, content.type = BIODB.TXT)), FUN.VALUE = '')
- return(content)
- }
-
- return(NULL)
- })
+ return(content)
+})
- ################
- # CREATE ENTRY #
- ################
-
- EnzymeConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createEnzymeCompoundFromTxt(content, drop = drop) else NULL)
- })
-}
+################
+# CREATE ENTRY #
+################
+
+EnzymeConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createEnzymeEntryFromTxt(content, drop = drop))
+})
diff -r 253d531a0193 -r 20d69a062da3 EnzymeEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/EnzymeEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,53 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+EnzymeEntry <- methods::setRefClass("EnzymeEntry", contains = 'BiodbEntry')
+
+###########
+# FACTORY #
+###########
+
+createEnzymeEntryFromTxt <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define fields regex
+ regex <- character()
+ regex[[BIODB.ACCESSION]] <- "^ID\\s+([0-9.]+)$"
+ regex[[BIODB.DESCRIPTION]] <- "^DE\\s+(.+)$"
+
+ for (text in contents) {
+
+ # Create instance
+ entry <- EnzymeEntry$new()
+
+ lines <- strsplit(text, "\n")
+ for (s in lines[[1]]) {
+
+ # Test generic regex
+ parsed <- FALSE
+ for (field in names(regex)) {
+ g <- stringr::str_match(s, regex[[field]])
+ if ( ! is.na(g[1,1])) {
+ entry$setField(field, g[1,2])
+ parsed <- TRUE
+ break
+ }
+ }
+ if (parsed)
+ next
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 HmdbCompound.R
--- a/HmdbCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-if ( ! exists('HmdbCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- HmdbCompound <- setRefClass("HmdbCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createHmdbCompoundFromXml <- function(contents, drop = FALSE) {
-
- library(XML)
-
- compounds <- list()
-
- # Define xpath expressions
- xpath.expr <- character()
- xpath.expr[[BIODB.ACCESSION]] <- "/metabolite/accession"
- xpath.expr[[BIODB.KEGG.ID]] <- "//kegg_id"
- xpath.expr[[BIODB.NAME]] <- "/metabolite/name"
- xpath.expr[[BIODB.FORMULA]] <- "/metabolite/chemical_formula"
- xpath.expr[[BIODB.SUPER.CLASS]] <- "//super_class"
- xpath.expr[[BIODB.AVERAGE.MASS]] <- "//average_molecular_weight"
- xpath.expr[[BIODB.MONOISOTOPIC.MASS]] <- "//monisotopic_moleculate_weight"
-
- for (content in contents) {
-
- # Create instance
- compound <- HmdbCompound$new()
-
- # Parse XML
- xml <- xmlInternalTreeParse(content, asText = TRUE)
-
- # An error occured
- if (length(getNodeSet(xml, "//error")) == 0) {
-
- # Test generic xpath expressions
- for (field in names(xpath.expr)) {
- v <- xpathSApply(xml, xpath.expr[[field]], xmlValue)
- if (length(v) > 0)
- compound$setField(field, v)
- }
-
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 HmdbConn.R
--- a/HmdbConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/HmdbConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,48 +1,36 @@
-if ( ! exists('HmdbConn')) { # Do not load again if already loaded
+#####################
+# CLASS DECLARATION #
+#####################
+
+HmdbConn <- methods::setRefClass("HmdbConn", contains = "RemotedbConn")
- source('RemotedbConn.R')
- source('HmdbCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- HmdbConn <- setRefClass("HmdbConn", contains = "RemotedbConn")
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+HmdbConn$methods( getEntryContentType = function() {
+ return(BIODB.XML)
+})
- HmdbConn$methods( getEntryContentType = function(type) {
- return(BIODB.XML)
- })
+#####################
+# GET ENTRY CONTENT #
+#####################
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- HmdbConn$methods( getEntryContent = function(type, id) {
+HmdbConn$methods( getEntryContent = function(id) {
- if (type == BIODB.COMPOUND) {
+ # Initialize return values
+ content <- rep(NA_character_, length(id))
- # Initialize return values
- content <- rep(NA_character_, length(id))
-
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.HMDB, x, content.type = BIODB.XML)), FUN.VALUE = '')
+ # Request
+ content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.HMDB, x, content.type = BIODB.XML)), FUN.VALUE = '')
- return(content)
- }
+ return(content)
+})
- return(NULL)
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- HmdbConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createHmdbCompoundFromXml(content, drop = drop) else NULL)
- })
-
-} # end of load safe guard
+################
+# CREATE ENTRY #
+################
+
+HmdbConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createHmdbEntryFromXml(content, drop = drop))
+})
diff -r 253d531a0193 -r 20d69a062da3 HmdbEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/HmdbEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,59 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+HmdbEntry <- methods::setRefClass("HmdbEntry", contains = "BiodbEntry")
+
+###########
+# FACTORY #
+###########
+
+createHmdbEntryFromXml <- function(contents, drop = FALSE) {
+
+ entries <- list()
+
+ # Define xpath expressions
+ xpath.expr <- character()
+ xpath.expr[[BIODB.ACCESSION]] <- "/metabolite/accession"
+ xpath.expr[[BIODB.KEGG.ID]] <- "//kegg_id"
+ xpath.expr[[BIODB.NAME]] <- "/metabolite/name"
+ xpath.expr[[BIODB.FORMULA]] <- "/metabolite/chemical_formula"
+ xpath.expr[[BIODB.SUPER.CLASS]] <- "//super_class"
+ xpath.expr[[BIODB.AVERAGE.MASS]] <- "//average_molecular_weight"
+ xpath.expr[[BIODB.MONOISOTOPIC.MASS]] <- "//monisotopic_moleculate_weight"
+
+ for (content in contents) {
+
+ # Create instance
+ entry <- HmdbEntry$new()
+
+ if ( ! is.null(content) && ! is.na(content)) {
+
+ # Parse XML
+ xml <- XML::xmlInternalTreeParse(content, asText = TRUE)
+
+ # An error occured
+ if (length(XML::getNodeSet(xml, "//error")) == 0) {
+
+ # Test generic xpath expressions
+ for (field in names(xpath.expr)) {
+ v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
+ if (length(v) > 0)
+ entry$setField(field, v)
+ }
+
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 KeggCompound.R
--- a/KeggCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,88 +0,0 @@
-if ( ! exists('KeggCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- KeggCompound <- setRefClass("KeggCompound", contains = 'BiodbEntry')
-
- ###########
- # FACTORY #
- ###########
-
- createKeggCompoundFromTxt <- function(contents, drop = TRUE) {
-
- library(stringr)
-
- compounds <- list()
-
- # Define fields regex
- regex <- character()
- regex[[BIODB.NAME]] <- "^NAME\\s+([^,;]+)"
- regex[[BIODB.CHEBI.ID]] <- "^\\s+ChEBI:\\s+(\\S+)"
- regex[[BIODB.LIPIDMAPS.ID]] <- "^\\s+LIPIDMAPS:\\s+(\\S+)"
-
- for (text in contents) {
-
- # Create instance
- compound <- KeggCompound$new()
-
- lines <- strsplit(text, "\n")
- for (s in lines[[1]]) {
-
- # Test generic regex
- parsed <- FALSE
- for (field in names(regex)) {
- g <- str_match(s, regex[[field]])
- if ( ! is.na(g[1,1])) {
- compound$setField(field, g[1,2])
- parsed <- TRUE
- break
- }
- }
- if (parsed)
- next
-
- # ACCESSION
- {
- # ENZYME ID
- g <- str_match(s, "^ENTRY\\s+EC\\s+(\\S+)")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.ACCESSION, paste('ec', g[1,2], sep = ':'))
-
- # ENTRY ID
- else {
- g <- str_match(s, "^ENTRY\\s+(\\S+)\\s+Compound")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.ACCESSION, paste('cpd', g[1,2], sep = ':'))
-
- # OTHER ID
- else {
- g <- str_match(s, "^ENTRY\\s+(\\S+)")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.ACCESSION, g[1,2])
- }
- }
-
- # ORGANISM
- g <- str_match(s, "^ORGANISM\\s+(\\S+)")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.ACCESSION, paste(g[1,2], compound$getField(BIODB.ACCESSION), sep = ':'))
- }
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 KeggConn.R
--- a/KeggConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/KeggConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,48 +1,36 @@
-if ( ! exists('KeggConn')) { # Do not load again if already loaded
+#####################
+# CLASS DECLARATION #
+#####################
- source('RemotedbConn.R')
- source('KeggCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- KeggConn <- setRefClass("KeggConn", contains = "RemotedbConn")
+KeggConn <- methods::setRefClass("KeggConn", contains = "RemotedbConn")
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
- KeggConn$methods( getEntryContentType = function(type) {
- return(BIODB.TXT)
- })
+KeggConn$methods( getEntryContentType = function() {
+ return(BIODB.TXT)
+})
- #####################
- # GET ENTRY CONTENT #
- #####################
+#####################
+# GET ENTRY CONTENT #
+#####################
- KeggConn$methods( getEntryContent = function(type, id) {
-
- if (type == BIODB.COMPOUND) {
+KeggConn$methods( getEntryContent = function(id) {
- # Initialize return values
- content <- rep(NA_character_, length(id))
+ # Initialize return values
+ content <- rep(NA_character_, length(id))
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.KEGG, x, content.type = BIODB.TXT)), FUN.VALUE = '')
-
- return(content)
- }
+ # Request
+ content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.KEGG, x, content.type = BIODB.TXT)), FUN.VALUE = '')
- return(NULL)
- })
+ return(content)
+})
- ################
- # CREATE ENTRY #
- ################
+################
+# CREATE ENTRY #
+################
- KeggConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createKeggCompoundFromTxt(content, drop = drop) else NULL)
- })
-
-} # end of load safe guard
+KeggConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createKeggEntryFromTxt(content, drop = drop))
+})
diff -r 253d531a0193 -r 20d69a062da3 KeggEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/KeggEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,81 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+KeggEntry <- methods::setRefClass("KeggEntry", contains = 'BiodbEntry')
+
+###########
+# FACTORY #
+###########
+
+createKeggEntryFromTxt <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define fields regex
+ regex <- character()
+ regex[[BIODB.NAME]] <- "^NAME\\s+([^,;]+)"
+ regex[[BIODB.CHEBI.ID]] <- "^\\s+ChEBI:\\s+(\\S+)"
+ regex[[BIODB.LIPIDMAPS.ID]] <- "^\\s+LIPIDMAPS:\\s+(\\S+)"
+
+ for (text in contents) {
+
+ # Create instance
+ entry <- KeggEntry$new()
+
+ lines <- strsplit(text, "\n")
+ for (s in lines[[1]]) {
+
+ # Test generic regex
+ parsed <- FALSE
+ for (field in names(regex)) {
+ g <- stringr::str_match(s, regex[[field]])
+ if ( ! is.na(g[1,1])) {
+ entry$setField(field, g[1,2])
+ parsed <- TRUE
+ break
+ }
+ }
+ if (parsed)
+ next
+
+ # ACCESSION
+ {
+ # ENZYME ID
+ g <- stringr::str_match(s, "^ENTRY\\s+EC\\s+(\\S+)")
+ if ( ! is.na(g[1,1])){
+ entry$setField(BIODB.ACCESSION, paste('ec', g[1,2], sep = ':'))
+
+ # ENTRY ID
+ }else {
+ g <- stringr::str_match(s, "^ENTRY\\s+(\\S+)\\s+Compound")
+ if ( ! is.na(g[1,1])){
+ entry$setField(BIODB.ACCESSION, paste('cpd', g[1,2], sep = ':'))
+
+ # OTHER ID
+ }else {
+ g <- stringr::str_match(s, "^ENTRY\\s+(\\S+)")
+ if ( ! is.na(g[1,1]))
+ entry$setField(BIODB.ACCESSION, g[1,2])
+ }
+ }
+
+ # ORGANISM
+ g <- stringr::str_match(s, "^ORGANISM\\s+(\\S+)")
+ if ( ! is.na(g[1,1]))
+ entry$setField(BIODB.ACCESSION, paste(g[1,2], entry$getField(BIODB.ACCESSION), sep = ':'))
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 LipidmapsCompound.R
--- a/LipidmapsCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,70 +0,0 @@
-if ( ! exists('LipidmapsCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
- source('strhlp.R', chdir = TRUE)
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- LipidmapsCompound <- setRefClass("LipidmapsCompound", contains = 'BiodbEntry')
-
- ###########
- # FACTORY #
- ###########
-
- createLipidmapsCompoundFromCsv <- function(contents, drop = TRUE) {
-
- compounds <- list()
-
- # Mapping column names
- col2field <- list()
- col2field[[BIODB.NAME]] <- 'COMMON_NAME'
- col2field[[BIODB.ACCESSION]] <- 'LM_ID'
- col2field[[BIODB.KEGG.ID]] <- 'KEGG_ID'
- col2field[[BIODB.HMDB.ID]] <- 'HMDBID'
- col2field[[BIODB.MASS]] <- 'MASS'
- col2field[[BIODB.FORMULA]] <- 'FORMULA'
-
- for (text in contents) {
-
- # Create instance
- compound <- LipidmapsCompound$new()
-
- # Split text in lines
- lines <- split.str(text, sep = "\n", unlist = TRUE)
-
- # An error occured
- if ( ! grepl("No record found", lines[[2]])) {
-
- # Keys on first line
- keys <- split.str(lines[[1]], unlist = TRUE)
-
- # Values on second line
- values <- split.str(lines[[2]], unlist = TRUE)
- names(values) <- keys[seq(values)]
-
- # Get field values
- for (field in names(col2field))
- if (values[[col2field[[field]]]] != '-')
- compound$setField(field, values[[col2field[[field]]]])
-
- # Set names
- if (values[['SYNONYMS']] != '-') {
- # TODO
- }
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 LipidmapsConn.R
--- a/LipidmapsConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/LipidmapsConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,57 +1,46 @@
-if ( ! exists('LipdmapsConn')) { # Do not load again if already loaded
-
- source('RemotedbConn.R')
- source('LipidmapsCompound.R')
+#####################
+# CLASS DECLARATION #
+#####################
- #####################
- # CLASS DECLARATION #
- #####################
+LipidmapsConn <- methods::setRefClass("LipidmapsConn", contains = "RemotedbConn")
- LipidmapsConn <- setRefClass("LipidmapsConn", contains = "RemotedbConn")
+###############
+# CONSTRUCTOR #
+###############
- ###############
- # CONSTRUCTOR #
- ###############
+LipidmapsConn$methods( initialize = function(...) {
+ # From http://www.lipidmaps.org/data/structure/programmaticaccess.html:
+ # If you write a script to automate calls to LMSD, please be kind and do not hit our server more often than once per 20 seconds. We may have to kill scripts that hit our server more frequently.
+ callSuper(scheduler = UrlRequestScheduler$new(t = 20), ...)
+})
- LipidmapsConn$methods( initialize = function(...) {
- # From http://www.lipidmaps.org/data/structure/programmaticaccess.html:
- # If you write a script to automate calls to LMSD, please be kind and do not hit our server more often than once per 20 seconds. We may have to kill scripts that hit our server more frequently.
- callSuper(scheduler = UrlRequestScheduler$new(t = 20), ...)
- })
-
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
-
- LipidmapsConn$methods( getEntryContentType = function(type) {
- return(BIODB.CSV)
- })
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- LipidmapsConn$methods( getEntryContent = function(type, id) {
+LipidmapsConn$methods( getEntryContentType = function() {
+ return(BIODB.CSV)
+})
- if (type == BIODB.COMPOUND) {
+#####################
+# GET ENTRY CONTENT #
+#####################
- # Initialize return values
- content <- rep(NA_character_, length(id))
+LipidmapsConn$methods( getEntryContent = function(id) {
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.LIPIDMAPS, x, content.type = BIODB.CSV)), FUN.VALUE = '')
+ # Initialize return values
+ content <- rep(NA_character_, length(id))
- return(content)
- }
+ # Request
+ content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.LIPIDMAPS, x, content.type = BIODB.CSV)), FUN.VALUE = '')
- return(NULL)
- })
+ return(content)
+})
- ################
- # CREATE ENTRY #
- ################
+################
+# CREATE ENTRY #
+################
- LipidmapsConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createLipidmapsCompoundFromCsv(content, drop = drop) else NULL)
- })
-}
+LipidmapsConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createLipidmapsEntryFromCsv(content, drop = drop))
+})
diff -r 253d531a0193 -r 20d69a062da3 LipidmapsEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/LipidmapsEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,64 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+LipidmapsEntry <- methods::setRefClass("LipidmapsEntry", contains = 'BiodbEntry')
+
+###########
+# FACTORY #
+###########
+
+createLipidmapsEntryFromCsv <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Mapping column names
+ col2field <- list()
+ col2field[[BIODB.NAME]] <- 'COMMON_NAME'
+ col2field[[BIODB.ACCESSION]] <- 'LM_ID'
+ col2field[[BIODB.KEGG.ID]] <- 'KEGG_ID'
+ col2field[[BIODB.HMDB.ID]] <- 'HMDBID'
+ col2field[[BIODB.MASS]] <- 'MASS'
+ col2field[[BIODB.FORMULA]] <- 'FORMULA'
+
+ for (text in contents) {
+
+ # Create instance
+ entry <- LipidmapsEntry$new()
+
+ # Split text in lines
+ lines <- split.str(text, sep = "\n", unlist = TRUE)
+
+ # An error occured
+ if ( ! grepl("No record found", lines[[2]])) {
+
+ # Keys on first line
+ keys <- split.str(lines[[1]], unlist = TRUE)
+
+ # Values on second line
+ values <- split.str(lines[[2]], unlist = TRUE)
+ names(values) <- keys[seq(values)]
+
+ # Get field values
+ for (field in names(col2field))
+ if (values[[col2field[[field]]]] != '-')
+ entry$setField(field, values[[col2field[[field]]]])
+
+ # Set names
+ if (values[['SYNONYMS']] != '-') {
+ # TODO
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 MassFiledbConn.R
--- a/MassFiledbConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MassFiledbConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,258 +1,275 @@
-if ( ! exists('MassFiledbConn')) {
+# LCMS File db.
+# In this type of database, a single file is provided in CSV format. Default separator is tabulation.
+# Each line is a MS peak measure, .
+# The file contains molecule and spectrum information. Each spectrum has an accession id.
+
+# TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue
- source('MassdbConn.R')
-
- # LCMS File db.
- # In this type of database, a single file is provided in CSV format. Default separator is tabulation.
- # Each line is a MS peak measure, .
- # The file contains molecule and spectrum information. Each spectrum has an accession id.
+#############
+# CONSTANTS #
+#############
- # TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue
-
- #############
- # CONSTANTS #
- #############
+# Default database fields
+.BIODB.DFT.DB.FIELDS <- list()
+for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZEXP, BIODB.PEAK.MZTHEO, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS))
+ .BIODB.DFT.DB.FIELDS[[f]] <- f
- # Default database fields
- .BIODB.DFT.DB.FIELDS <- list()
- for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZ, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS))
- .BIODB.DFT.DB.FIELDS[[f]] <- f
+#####################
+# CLASS DECLARATION #
+#####################
+
+MassFiledbConn <- methods::setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .db.orig.colnames = "character", .fields = "list", .ms.modes = "character"))
- #####################
- # CLASS DECLARATION #
- #####################
-
- MassFiledbConn <- setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .fields = "list", .ms.modes = "character"))
+###############
+# CONSTRUCTOR #
+###############
- ###############
- # CONSTRUCTOR #
- ###############
+MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) {
+
+ # Check file
+ (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.")
+ file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\"."))
- MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) {
-
- # Check file
- (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.")
- file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\"."))
+ # Set fields
+ .db <<- NULL
+ .db.orig.colnames <<- NA_character_
+ .file <<- file
+ .file.sep <<- file.sep
+ .file.quote <<- file.quote
+ .fields <<- .BIODB.DFT.DB.FIELDS
+ .field.multval.sep <<- ';'
+ .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS)
+ names(.self$.ms.modes) <- .self$.ms.modes
- # Set fields
- .db <<- NULL
- .file <<- file
- .file.sep <<- file.sep
- .file.quote <<- file.quote
- .fields <<- .BIODB.DFT.DB.FIELDS
- .field.multval.sep <<- ';'
- .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS)
- names(.self$.ms.modes) <- .self$.ms.modes
+ callSuper(...)
+})
- callSuper(...)
- })
+######################
+# Is valid field tag #
+######################
- ######################
- # Is valid field tag #
- ######################
+MassFiledbConn$methods( isValidFieldTag = function(tag) {
+ return (tag %in% names(.self$.fields))
+})
- MassFiledbConn$methods( isValidFieldTag = function(tag) {
- return (tag %in% names(.self$.fields))
- })
+###########
+# INIT DB #
+###########
- #############
- # Set field #
- #############
+MassFiledbConn$methods( .init.db = function() {
- MassFiledbConn$methods( setField = function(tag, colname) {
+ if (is.null(.self$.db)) {
+
+ # Load database
+ .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL)
- ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.")
- ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.")
-
- # Load database file
- .self$.init.db()
+ # Save column names
+ .db.orig.colnames <<- colnames(.self$.db)
+ }
+})
- # Check that this field tag is defined in the fields list
- .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid."))
+#############
+# Set field #
+#############
- # Check that columns are defined in database file
- all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file."))
+MassFiledbConn$methods( setField = function(tag, colname) {
+
+ ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.")
+ ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.")
- # Set new definition
- if (length(colname) == 1)
- .fields[[tag]] <<- colname
- else {
- new.col <- paste(colname, collapse = ".")
- .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '')
- .fields[[tag]] <<- new.col
- }
- })
+ # Load database file
+ .self$.init.db()
+
+ # Check that this field tag is defined in the fields list
+ .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid."))
+
+ # Check that columns are defined in database file
+ all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file."))
- ######################################
- # SET FIELD MULTIPLE VALUE SEPARATOR #
- ######################################
-
- MassFiledbConn$methods( setFieldMultValSep = function(sep) {
- .field.multval.sep <<- sep
- })
+ # Set new definition
+ if (length(colname) == 1)
+ .fields[[tag]] <<- colname
+ else {
+ new.col <- paste(colname, collapse = ".")
+ .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '')
+ .fields[[tag]] <<- new.col
+ }
- ################
- # SET MS MODES #
- ################
+ # Update data frame column names
+ colnames(.self$.db) <- vapply(.self$.db.orig.colnames, function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
+})
- MassFiledbConn$methods( setMsMode = function(mode, value) {
- .self$.ms.modes[[mode]] <- value
- })
+######################################
+# SET FIELD MULTIPLE VALUE SEPARATOR #
+######################################
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+MassFiledbConn$methods( setFieldMultValSep = function(sep) {
+ .field.multval.sep <<- sep
+})
- MassFiledbConn$methods( getEntryContentType = function(type) {
- return(BIODB.DATAFRAME)
- })
+################
+# SET MS MODES #
+################
- ###########
- # INIT DB #
- ###########
+MassFiledbConn$methods( setMsMode = function(mode, value) {
+ .self$.ms.modes[[mode]] <- value
+})
- MassFiledbConn$methods( .init.db = function() {
-
- if (is.null(.self$.db)) {
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
- # Load database
- .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL)
+MassFiledbConn$methods( getEntryContentType = function(type) {
+ return(BIODB.DATAFRAME)
+})
- # Rename columns
- colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
- }
- })
+################
+# CHECK FIELDS #
+################
- ################
- # CHECK FIELDS #
- ################
+MassFiledbConn$methods( .check.fields = function(fields) {
+
+ if (length(fields) ==0 || (length(fields) == 1 && is.na(fields)))
+ return
- MassFiledbConn$methods( .check.fields = function(fields) {
+ # Check if fields are known
+ unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)]
+ if (length(unknown.fields) > 0)
+ stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown."))
- # Check if fields are known
- unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)]
- if (length(unknown.fields) > 0)
- stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown."))
+ # Init db
+ .self$.init.db()
- # Init db
- .self$.init.db()
+ # Check if fields are defined in file database
+ undefined.fields <- colnames(.self$.db)[ ! fields %in% colnames(.self$.db)]
+ if (length(undefined.fields) > 0)
+ stop(paste0("Column(s) ", paste(fields), collapse = ", "), " is/are undefined in file database.")
+})
- # Check if fields are defined in file database
- undefined.fields <- colnames(.self$.init.db)[ ! unlist(.self$.fields[fields]) %in% colnames(.self$.init.db)]
- if (length(undefined.fields) > 0)
- stop(paste0("Column(s) ", paste(unlist(.self$.fields[fields]), collapse = ", "), " is/are undefined in file database."))
- })
+##########
+# SELECT #
+##########
+
+# Select data from database
+MassFiledbConn$methods( .select = function(cols = NULL, mode = NULL, compound.ids = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) {
+
+ x <- NULL
- ################
- # EXTRACT COLS #
- ################
-
- MassFiledbConn$methods( .extract.cols = function(cols, mode = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) {
-
- x <- NULL
+ # Init db
+ .self$.init.db()
- if ( ! is.null(cols) && ! is.na(cols)) {
+ # Get db
+ db <- .self$.db
- # Init db
- .self$.init.db()
-
- # TODO check existence of cols/fields
+ # Filter db on mode
+ if ( ! is.null(mode) && ! is.na(mode)) {
- # Get db, eventually filtering it.
- if (is.null(mode))
- db <- .self$.db
- else {
- # Check mode value
- mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'."))
- .self$.check.fields(BIODB.MSMODE)
+ # Check mode value
+ mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'."))
+ .self$.check.fields(BIODB.MSMODE)
- # Filter on mode
- db <- .self$.db[.self$.db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ]
- }
+ # Filter on mode
+ db <- db[db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ]
+ }
- # Get subset
- x <- db[, unlist(.self$.fields[cols]), drop = drop]
+ # Filter db on compound ids
+ # TODO
+
+ if ( ! is.null(cols) && ! is.na(cols))
+ .self$.check.fields(cols)
- # Rename columns
- if (is.data.frame(x))
- colnames(x) <- cols
+ # Get subset
+ if (is.null(cols) || is.na(cols))
+ x <- db
+ else
+ x <- db[, unlist(.self$.fields[cols]), drop = drop]
- # Rearrange
- if (drop && is.vector(x)) {
- if (uniq)
- x <- x[ ! duplicated(x)]
- if (sort)
- x <- sort(x)
- }
+ # Rearrange
+ if (drop && is.vector(x)) {
+ if (uniq)
+ x <- x[ ! duplicated(x)]
+ if (sort)
+ x <- sort(x)
+ }
- # Cut
- if ( ! is.na(max.rows))
- x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ]
- }
+ # Cut
+ if ( ! is.na(max.rows))
+ x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ]
+
+ return(x)
+})
+
+#################
+# GET ENTRY IDS #
+#################
- return(x)
- })
+MassFiledbConn$methods( getEntryIds = function(type) {
+
+ ids <- NA_character_
+
+ if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND))
+ ids <- as.character(.self$.select(cols = if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE))
- #################
- # GET ENTRY IDS #
- #################
-
- MassFiledbConn$methods( getEntryIds = function(type) {
+ return(ids)
+})
- ids <- NA_character_
-
- if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND))
- ids <- as.character(.self$.extract.cols(if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE))
+##################
+# GET NB ENTRIES #
+##################
- return(ids)
- })
+MassFiledbConn$methods( getNbEntries = function(type) {
+ return(length(.self$getEntryIds(type)))
+})
+
+###############################
+# GET CHROMATOGRAPHIC COLUMNS #
+###############################
- ##################
- # GET NB ENTRIES #
- ##################
-
- MassFiledbConn$methods( getNbEntries = function(type) {
- return(length(.self$getEntryIds(type)))
- })
+# Inherited from MassdbConn.
+MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) {
+
+ # Extract needed columns
+ db <- .self$.select(cols = c(BIODB.COMPOUND.ID, BIODB.CHROM.COL))
- ###############################
- # GET CHROMATOGRAPHIC COLUMNS #
- ###############################
-
- # Inherited from MassdbConn.
- MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) {
+ # Filter on molecule IDs
+ if ( ! is.null(compound.ids))
+ db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ]
+
+ # Get column names
+ cols <- db[[BIODB.CHROM.COL]]
- # Extract needed columns
- db <- .self$.extract.cols(c(BIODB.COMPOUND.ID, BIODB.CHROM.COL))
+ # Remove duplicates
+ cols <- cols[ ! duplicated(cols)]
- # Filter on molecule IDs
- if ( ! is.null(compound.ids))
- db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ]
+ # Make data frame
+ chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE)
+ colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE)
- # Get column names
- cols <- db[[BIODB.CHROM.COL]]
+ return(chrom.cols)
+})
- # Remove duplicates
- cols <- cols[ ! duplicated(cols)]
+#################
+# GET MZ VALUES #
+#################
- # Make data frame
- chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE)
- colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE)
+# Inherited from MassdbConn.
+MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
- return(chrom.cols)
- })
-
- #################
- # GET MZ VALUES #
- #################
-
- # Inherited from MassdbConn.
- MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
+ # Get mz values
+ mz <- .self$.select(cols = BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results)
+
+ return(mz)
+})
- # Get mz values
- mz <- .self$.extract.cols(BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results)
+################
+# GET NB PEAKS #
+################
- return(mz)
- })
+# Inherited from MassdbConn.
+MassFiledbConn$methods( getNbPeaks = function(mode = NULL, compound.ids = NULL) {
-}
+ # Get peaks
+ peaks <- .self$.select(cols = BIODB.PEAK.MZTHEO, mode = mode, compound.ids = compound.ids)
+
+ return(length(peaks))
+})
diff -r 253d531a0193 -r 20d69a062da3 MassbankCompound.R
--- a/MassbankCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,66 +0,0 @@
-if ( ! exists('MassbankCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- MassbankCompound <- setRefClass("MassbankCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createMassbankCompoundFromTxt <- function(contents) {
-
- library(stringr)
-
- compounds <- list()
-
- for (text in contents) {
-
- # Create instance
- compound <- MassbankCompound$new()
-
- # Read text
- lines <- strsplit(text, "\n")
- for (s in lines[[1]]) {
-
- # NAME
- if (is.na(compound$getField(BIODB.NAME))) {
- g <- str_match(s, "^CH\\$NAME:\\s+(.+)$")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.NAME, g[1,2])
- }
-
- # CHEBI ID
- g <- str_match(s, "^CH\\$LINK: CHEBI\\s+(.+)$")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.CHEBI.ID, g[1,2])
-
- # KEGG ID
- g <- str_match(s, "^CH\\$LINK: KEGG\\s+(.+)$")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.KEGG.ID, g[1,2])
-
- # PUBCHEM ID
- g <- str_match(s, "^CH\\$LINK: PUBCHEM\\s+(.+)$")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.PUBCHEM.ID, g[1,2])
-
- # INCHI
- g <- str_match(s, "^CH\\$IUPAC:\\s+(.+)$")
- if ( ! is.na(g[1,1]))
- compound$setField(BIODB.INCHI, g[1,2])
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.NAME))) NULL else x)
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 MassbankConn.R
--- a/MassbankConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MassbankConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,92 +1,122 @@
-if ( ! exists('MassbankConn')) { # Do not load again if already loaded
+#####################
+# CLASS DECLARATION #
+#####################
- source('RemotedbConn.R')
- source('MassdbConn.R')
- source('MassbankSpectrum.R')
+MassbankConn <- methods::setRefClass("MassbankConn", contains = c("RemotedbConn", "MassdbConn"), fields = list( .url = "character" ))
+
+###############
+# CONSTRUCTOR #
+###############
- #####################
- # CLASS DECLARATION #
- #####################
-
- MassbankConn <- setRefClass("MassbankConn", contains = c("RemotedbConn", "MassdbConn"))
+MassbankConn$methods( initialize = function(url = NA_character_, ...) {
+
+ # Set URL
+ .url <<- if (is.null(url) || is.na(url)) BIODB.MASSBANK.EU.WS.URL else url
+
+ callSuper(...)
+})
+
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+MassbankConn$methods( getEntryContentType = function() {
+ return(BIODB.TXT)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+MassbankConn$methods( getEntryContent = function(ids) {
+
+ # Debug
+ .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)..."))
- MassbankConn$methods( getEntryContentType = function(type) {
- return(if (type == BIODB.SPECTRUM) BIODB.TXT else NULL)
- })
+ URL.MAX.LENGTH <- 2083
+
+ # Initialize return values
+ content <- rep(NA_character_, length(ids))
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- MassbankConn$methods( getEntryContent = function(type, ids) {
+ # Loop on all
+ n <- 0
+ while (n < length(ids)) {
+
+ # Get list of accession ids to retrieve
+ accessions <- ids[(n + 1):length(ids)]
+
+ # Create URL request
+ x <- get.entry.url(class = BIODB.MASSBANK, accession = accessions, content.type = BIODB.TXT, max.length = URL.MAX.LENGTH, base.url = .self$.url)
# Debug
- .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)..."))
-
- if (type == BIODB.SPECTRUM) {
-
- URL.MAX.LENGTH <- 2083
-
- # Initialize return values
- content <- rep(NA_character_, length(ids))
+ .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)..."))
- # Loop on all
- n <- 0
- while (n < length(ids)) {
+ # Send request
+ xmlstr <- .self$.get.url(x$url)
- # Get list of accession ids to retrieve
- accessions <- ids[(n + 1):length(ids)]
-
- # Create URL request
- x <- get.entry.url(class = BIODB.MASSBANK, accession = accessions, content.type = BIODB.TXT, max.length = URL.MAX.LENGTH)
+ # Increase number of entries retrieved
+ n <- n + x$n
- # Debug
- .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)..."))
-
- # Send request
- xmlstr <- .self$.scheduler$getUrl(x$url)
-
- # Increase number of entries retrieved
- n <- n + x$n
-
- # Parse XML and get text
- if ( ! is.na(xmlstr)) {
- library(XML)
- xml <- xmlInternalTreeParse(xmlstr, asText = TRUE)
- ns <- c(ax21 = "http://api.massbank/xsd")
- returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns)
- content[match(returned.ids, ids)] <- xpathSApply(xml, "//ax21:info", xmlValue, namespaces = ns)
- }
-
- # Debug
- .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved..."))
- }
-
- return(content)
+ # Parse XML and get text
+ if ( ! is.na(xmlstr)) {
+ xml <- xmlInternalTreeParse(xmlstr, asText = TRUE)
+ ns <- c(ax21 = "http://api.massbank/xsd")
+ returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns)
+ if (length(returned.ids) > 0)
+ content[match(returned.ids, ids)] <- xpathSApply(xml, "//ax21:info", xmlValue, namespaces = ns)
}
- return(NULL)
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- # Creates a Spectrum instance from file content.
- # content A file content, downloaded from the public database.
- # RETURN A spectrum instance.
- MassbankConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.SPECTRUM) createMassbankSpectrumFromTxt(content, drop = drop) else NULL)
- })
+ # Debug
+ .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved..."))
+ }
+
+ return(content)
+})
+
+################
+# CREATE ENTRY #
+################
+
+# Creates a Spectrum instance from file content.
+# content A file content, downloaded from the public database.
+# RETURN A spectrum instance.
+MassbankConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createMassbankEntryFromTxt(content, drop = drop))
+})
+
+#################
+# GET MZ VALUES #
+#################
+
+MassbankConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
+})
- #################
- # GET MZ VALUES #
- #################
-
- MassbankConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
- })
-}
+#################
+# GET ENTRY IDS #
+#################
+
+MassbankConn$methods( getEntryIds = function(max.results = NA_integer_) {
+
+ # Set URL
+ url <- paste0(.self$.url, 'searchPeak?mzs=1000&relativeIntensity=100&tolerance=1000&instrumentTypes=all&ionMode=Both')
+ url <- paste0(url, '&maxNumResults=', (if (is.na(max.results)) 0 else max.results))
+
+ # Send request
+ xmlstr <- .self$.get.url(url)
+
+ # Parse XML and get text
+ if ( ! is.na(xmlstr)) {
+ xml <- xmlInternalTreeParse(xmlstr, asText = TRUE)
+ ns <- c(ax21 = "http://api.massbank/xsd")
+ returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns)
+ return(returned.ids)
+ }
+})
+
+##################
+# GET NB ENTRIES #
+##################
+
+MassbankConn$methods( getNbEntries = function() {
+ return(length(.self$getEntryIds()))
+})
diff -r 253d531a0193 -r 20d69a062da3 MassbankEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MassbankEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,129 @@
+###########################
+# MASSBANK SPECTRUM CLASS #
+###########################
+
+MassbankEntry <- methods::setRefClass("MassbankEntry", contains = "BiodbEntry")
+
+###########
+# FACTORY #
+###########
+
+createMassbankEntryFromTxt <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define fields regex
+ regex <- character()
+ regex[[BIODB.ACCESSION]] <- "^ACCESSION: (.+)$"
+ regex[[BIODB.MSDEV]] <- "^AC\\$INSTRUMENT: (.+)$"
+ regex[[BIODB.MSDEVTYPE]] <- "^AC\\$INSTRUMENT_TYPE: (.+)$"
+ regex[[BIODB.MSTYPE]] <- "^AC\\$MASS_SPECTROMETRY: MS_TYPE (.+)$"
+ regex[[BIODB.MSPRECMZ]] <- "^MS\\$FOCUSED_ION: PRECURSOR_M/Z (.+)$"
+ regex[[BIODB.NB.PEAKS]] <- "^PK\\$NUM_PEAK: ([0-9]+)$"
+ regex[[BIODB.MSPRECANNOT]] <- "^MS\\$FOCUSED_ION: PRECURSOR_TYPE (.+)$"
+ regex[[BIODB.CHEBI.ID]] <- "^CH\\$LINK: CHEBI\\s+(.+)$"
+ regex[[BIODB.KEGG.ID]] <- "^CH\\$LINK: KEGG\\s+(.+)$"
+ regex[[BIODB.INCHI]] <- "^CH\\$IUPAC:\\s+(.+)$"
+ regex[[BIODB.INCHIKEY]] <- "^CH\\$LINK: INCHIKEY\\s+(.+)$"
+ regex[[BIODB.CHEMSPIDER.ID]] <- "^CH\\$LINK: CHEMSPIDER\\s+(.+)$"
+ regex[[BIODB.CAS.ID]] <- "^CH\\$LINK: CAS\\s+(.+)$"
+ regex[[BIODB.FORMULA]] <- "^CH\\$FORMULA:\\s+(.+)$"
+ regex[[BIODB.SMILES]] <- "^CH\\$SMILES:\\s+(.+)$"
+ regex[[BIODB.MASS]] <- "^CH\\$EXACT_MASS:\\s+(.+)$"
+ regex[[BIODB.PUBCHEMCOMP.ID]] <- "^CH\\$LINK: PUBCHEM\\s+.*CID:([0-9]+)"
+ regex[[BIODB.PUBCHEMSUB.ID]] <- "^CH\\$LINK: PUBCHEM\\s+.*SID:([0-9]+)"
+
+ for (text in contents) {
+
+ # Create instance
+ entry <- MassbankEntry$new()
+
+ if ( ! is.null(text) && ! is.na(text)) {
+
+ # Read text
+ lines <- strsplit(text, "\n")
+ for (s in lines[[1]]) {
+
+ # Test generic regex
+ parsed <- FALSE
+ for (field in names(regex)) {
+ g <- stringr::str_match(s, regex[[field]])
+ if ( ! is.na(g[1,1])) {
+ entry$setField(field, g[1,2])
+ parsed <- TRUE
+ break
+ }
+ }
+ if (parsed)
+ next
+
+ # Name
+ if (is.na(entry$getField(BIODB.NAME))) {
+ g <- stringr::str_match(s, "^CH\\$NAME:\\s+(.+)$")
+ if ( ! is.na(g[1,1]))
+ entry$setField(BIODB.NAME, g[1,2])
+ }
+
+ # PubChem
+ g <- stringr::str_match(s, "^CH\\$LINK: PUBCHEM\\s+([0-9]+)$")
+ if ( ! is.na(g[1,1]))
+ entry$setField(BIODB.PUBCHEMSUB.ID, g[1,2])
+
+ # MS MODE
+ g <- stringr::str_match(s, "^AC\\$MASS_SPECTROMETRY: ION_MODE (.+)$")
+ if ( ! is.na(g[1,1])) {
+ entry$setField(BIODB.MSMODE, if (g[1,2] == 'POSITIVE') BIODB.MSMODE.POS else BIODB.MSMODE.NEG)
+ next
+ }
+
+ # PEAKS
+ if (.parse.peak.line(entry, s))
+ next
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
+
+###################
+# PARSE PEAK LINE #
+###################
+
+.parse.peak.line <- function(entry, line) {
+
+ peaks <- BIODB.PEAK.DF.EXAMPLE
+
+ # Annotation
+ g <- stringr::str_match(line, "^\\s+([0-9][0-9.]*) ([A-Z0-9+-]+) ([0-9]+) ([0-9][0-9.]*) ([0-9][0-9.]*)$")
+ if ( ! is.na(g[1,1]))
+ peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)] <- list(as.double(g[1,2]), g[1,3], as.integer(g[1,4]), as.double(g[1,5]), as.double(g[1,6]))
+
+ # Peak
+ g <- stringr::str_match(line, "^\\s+([0-9][0-9.]*) ([0-9][0-9.]*) ([0-9]+)$")
+ if ( ! is.na(g[1,1]))
+ peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY)] <- list(as.double(g[1,2]), as.double(g[1,3]), as.integer(g[1,4]))
+
+ if (nrow(peaks) > 0) {
+
+ # Get curent peaks and merge with new peaks
+ current.peaks <- entry$getField(BIODB.PEAKS)
+ if ( ! is.null(current.peaks))
+ peaks <- rbind(current.peaks, peaks)
+
+ entry$setField(BIODB.PEAKS, peaks)
+
+ return(TRUE)
+ }
+
+ return(FALSE)
+}
diff -r 253d531a0193 -r 20d69a062da3 MassbankSpectrum.R
--- a/MassbankSpectrum.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,117 +0,0 @@
-if ( ! exists('MassbankSpectrum')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
- source('MassbankCompound.R')
-
- ###########################
- # MASSBANK SPECTRUM CLASS #
- ###########################
-
- MassbankSpectrum <- setRefClass("MassbankSpectrum", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createMassbankSpectrumFromTxt <- function(contents, drop = TRUE) {
-
- library(stringr)
-
- spectra <- list()
-
- # Define fields regex
- regex <- character()
- regex[[BIODB.ACCESSION]] <- "^ACCESSION: (.+)$"
- regex[[BIODB.MSDEV]] <- "^AC\\$INSTRUMENT: (.+)$"
- regex[[BIODB.MSDEVTYPE]] <- "^AC\\$INSTRUMENT_TYPE: (.+)$"
- regex[[BIODB.MSTYPE]] <- "^AC\\$MASS_SPECTROMETRY: MS_TYPE (.+)$"
- regex[[BIODB.MSPRECMZ]] <- "^MS\\$FOCUSED_ION: PRECURSOR_M/Z (.+)$"
- regex[[BIODB.NB.PEAKS]] <- "^PK\\$NUM_PEAK: ([0-9]+)$"
- regex[[BIODB.MSPRECANNOT]] <- "^MS\\$FOCUSED_ION: PRECURSOR_TYPE (.+)$"
-
- for (text in contents) {
-
- # Create instance
- spectrum <- MassbankSpectrum$new()
-
- # Read text
- lines <- strsplit(text, "\n")
- for (s in lines[[1]]) {
-
- # Test generic regex
- parsed <- FALSE
- for (field in names(regex)) {
- g <- str_match(s, regex[[field]])
- if ( ! is.na(g[1,1])) {
- spectrum$setField(field, g[1,2])
- parsed <- TRUE
- break
- }
- }
- if (parsed)
- next
-
- # MS MODE
- g <- str_match(s, "^AC\\$MASS_SPECTROMETRY: ION_MODE (.+)$")
- if ( ! is.na(g[1,1])) {
- spectrum$setField(BIODB.MSMODE, if (g[1,2] == 'POSITIVE') BIODB.MSMODE.POS else BIODB.MSMODE.NEG)
- next
- }
-
- # PEAKS
- if (.parse.peak.line(spectrum, s))
- next
- }
-
- spectra <- c(spectra, spectrum)
- }
-
- # Replace elements with no accession id by NULL
- spectra <- lapply(spectra, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # Set associated compounds
- compounds <- createMassbankCompoundFromTxt(contents)
- for (i in seq(spectra))
- if ( ! is.null(spectra[[i]]))
- spectra[[i]]$setField(BIODB.COMPOUND, compounds[[i]])
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- spectra <- spectra[[1]]
-
- return(spectra)
- }
-
- ###################
- # PARSE PEAK LINE #
- ###################
-
- .parse.peak.line <- function(spectrum, line) {
-
- peaks <- BIODB.PEAK.DF.EXAMPLE
-
- # Annotation
- g <- str_match(line, "^\\s+([0-9][0-9.]*) ([A-Z0-9+-]+) ([0-9]+) ([0-9][0-9.]*) ([0-9][0-9.]*)$")
- if ( ! is.na(g[1,1]))
- peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)] <- list(as.double(g[1,2]), g[1,3], as.integer(g[1,4]), as.double(g[1,5]), as.double(g[1,6]))
-
- # Peak
- g <- str_match(line, "^\\s+([0-9][0-9.]*) ([0-9][0-9.]*) ([0-9]+)$")
- if ( ! is.na(g[1,1]))
- peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY)] <- list(as.double(g[1,2]), as.double(g[1,3]), as.integer(g[1,4]))
-
- if (nrow(peaks) > 0) {
-
- # Get curent peaks and merge with new peaks
- current.peaks <- spectrum$getField(BIODB.PEAKS)
- if ( ! is.null(current.peaks))
- peaks <- rbind(current.peaks, peaks)
-
- spectrum$setField(BIODB.PEAKS, peaks)
-
- return(TRUE)
- }
-
- return(FALSE)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 MassdbConn.R
--- a/MassdbConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MassdbConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,31 +1,130 @@
-if ( ! exists('MassdbConn')) {
+#####################
+# CLASS DECLARATION #
+#####################
+
+MassdbConn <- methods::setRefClass("MassdbConn", contains = "BiodbConn")
+
+###############################
+# GET CHROMATOGRAPHIC COLUMNS #
+###############################
+
+# Get a list of chromatographic columns contained in this database.
+# compound.ids A list of compound IDs used to filter results.
+# The returned value is a data.frame with two columns : one for the ID (BIODB.ID) and another one for the title (BIODB.TITLE).
+MassdbConn$methods( getChromCol = function(compound.ids = NULL) {
+ stop("Method getChromCol() is not implemented in concrete class.")
+})
- source('BiodbConn.R')
+#################
+# GET MZ VALUES #
+#################
+
+# Returns a numeric vector of all masses stored inside the database.
+MassdbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
+ stop("Method getMzValues() not implemented in concrete class.")
+})
+
+################
+# GET NB PEAKS #
+################
+
+# Returns the number of peaks contained in the database
+MassdbConn$methods( getNbPeaks = function(mode = NULL, compound.ids = NULL) {
+ stop("Method getNbPeaks() not implemented in concrete class.")
+})
+
+#########################
+# FIND COMPOUND BY NAME #
+#########################
- #####################
- # CLASS DECLARATION #
- #####################
-
- MassdbConn <- setRefClass("MassdbConn", contains = "BiodbConn")
+# Find a molecule by name
+# name A vector of molecule names to search for.
+# Return an integer vector of the same size as the name input vector, containing the found molecule IDs, in the same order.
+MassdbConn$methods( findCompoundByName = function(name) {
+ stop("Method findCompoundByName() not implemented in concrete class.")
+})
+
+####################################
+# FIND SPECTRA IN GIVEN MASS RANGE #
+####################################
+# Find spectra in the given mass range.
+# rtype the type of return, objects, dfspecs data.frame of spectra, dfpeaks data.frame of peaks.
+MassdbConn$methods( searchMzRange = function(mzmin, mzmax, rtype = c("objects","dfspecs","dfpeaks")){
+ stop("Method searchMzRange() not implemented in concrete class.")
+})
+
+####################################
+# FIND SPECTRA IN GIVEN MASS RANGE #
+####################################
+MassdbConn$methods( searchMzTol = function(mz, tol, tolunit=BIODB.MZTOLUNIT.PLAIN, rtype = c("objects","dfspecs","dfpeaks")){
+ stop("Method searchMzTol() not implemented in concrete class.")
+})
- ###############################
- # GET CHROMATOGRAPHIC COLUMNS #
- ###############################
+######################################################
+# FIND A MOLECULES WITH PRECURSOR WITHIN A TOLERANCE #
+######################################################
+ MassdbConn$methods( searchSpecPrecTol = function(mz, tol, tolunit=BIODB.MZTOLUNIT.PLAIN, mode = NULL){
+ stop("Method searchSpecPrecTol not implemented in concrete class.")
+ })
+
+#################################
+#perform a database MS-MS search#
+#################################
+
+### spec : the spec to match against the database.
+### precursor : the mass/charge of the precursor to be looked for.
+### mtol : the size of the windows arounf the precursor to be looked for.
+### ppm : the matching ppm tolerance.
+### fun :
+### dmz : the mass tolerance is taken as the minium between this quantity and the ppm.
+### npmin : the minimum number of peak to detect a match (2 recommended)
+
+MassdbConn$methods( msmsSearch = function(spec, precursor, mztol, tolunit,
+ ppm, fun = BIODB.MSMS.DIST.WCOSINE,
+ params = list(), npmin=2, dmz = 0.001,
+ mode = BIODB.MSMODE.POS, return.ids.only = TRUE){
+
- # Get a list of chromatographic columns contained in this database.
- # compound.ids A list of compound IDs used to filter results.
- # The returned value is a data.frame with two columns : one for the ID (BIODB.ID) and another one for the title (BIODB.TITLE).
- MassdbConn$methods( getChromCol = function(compound.ids = NULL) {
- stop("Method getChromCol() is not implemented in concrete class.")
+ # TODO replace by msms precursor search when available.
+ lspec <- .self$searchSpecPrecTol( precursor, mztol, BIODB.MZTOLUNIT.PLAIN, mode = mode)
+ rspec <- lapply(lspec,function(x){
+ peaks <- x$getFieldValue(BIODB.PEAKS)
+
+ ####Getting the correct fields
+ vcomp <- c(BIODB.PEAK.MZ, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.INTENSITY)
+
+ foundfields <- vcomp %in% colnames(peaks)
+ if(sum(foundfields ) < 2){
+ stop(paste0("fields can't be coerced to mz and intensity : ",colnames(peaks)))
+ }
+
+ peaks <- peaks[ , vcomp[which( foundfields ) ] ]
+
+ peaks
})
+
+ # TODO Import compareSpectra into biodb and put it inside massdb-helper.R or hide it as a private method.
+ res <- compareSpectra(spec, rspec, npmin = npmin, fun = fun, params = params)
+
+ if(is.null(res)) return(NULL) # To decide at MassdbConn level: return empty list (or empty data frame) or NULL.
+ ###Adiing the matched peaks and the smimlarity values to spectra.
+
+ lret <-vector(length(lspec),mode = "list")
+ vsimilarity <- numeric( length( lspec ) )
+ vmatched <- vector( mode = "list", length( lspec ) )
+
+ if( return.ids.only ){
+ lret <- sapply( lspec, function( x ) {
+ x$getFieldValue( BIODB.ACCESSION )
+ })
+ }else{
+ ###TODO implement three types of return.
+ lret <- lspec
+ }
+
+ ###Reordering the list.
+ lret <- lret[ res$ord ]
+
- #################
- # GET MZ VALUES #
- #################
-
- # Returns a numeric vector of all masses stored inside the database.
- MassdbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
- stop("Method getMzValues() not implemented in concrete class.")
- })
-
-}
+ return( list(measure = res$similarity[ res$ord ], matchedpeaks = res$matched [ res$ord ], id = lret))
+})
diff -r 253d531a0193 -r 20d69a062da3 MirbaseCompound.R
--- a/MirbaseCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-if ( ! exists('MirbaseCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- MirbaseCompound <- setRefClass("MirbaseCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createMirbaseCompoundFromHtml <- function(contents, drop = TRUE) {
-
- library(XML)
-
- compounds <- list()
-
- # Define fields regex
- xpath.expr <- character()
- xpath.expr[[BIODB.ACCESSION]] <- "//td[text()='Accession number']/../td[2]"
- xpath.expr[[BIODB.NAME]] <- "//td[text()='ID']/../td[2]"
- xpath.expr[[BIODB.SEQUENCE]] <- "//td[text()='Sequence']/..//pre"
-
- for (html in contents) {
-
- # Create instance
- compound <- ChebiCompound$new()
-
- # Parse HTML
- xml <- htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE)
-
- # Test generic xpath expressions
- for (field in names(xpath.expr)) {
- v <- xpathSApply(xml, xpath.expr[[field]], xmlValue)
- if (length(v) > 0)
- compound$setField(field, v)
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 MirbaseConn.R
--- a/MirbaseConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MirbaseConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,66 +1,54 @@
-if ( ! exists('MirbaseConn')) { # Do not load again if already loaded
-
- source('RemotedbConn.R')
- source('MirbaseCompound.R')
+#####################
+# CLASS DECLARATION #
+#####################
- #####################
- # CLASS DECLARATION #
- #####################
+MirbaseConn <- methods::setRefClass("MirbaseConn", contains = "RemotedbConn")
- MirbaseConn <- setRefClass("MirbaseConn", contains = "RemotedbConn")
-
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
- MirbaseConn$methods( getEntryContentType = function(type) {
- return(BIODB.HTML)
- })
+MirbaseConn$methods( getEntryContentType = function() {
+ return(BIODB.HTML)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- MirbaseConn$methods( getEntryContent = function(type, id) {
+MirbaseConn$methods( getEntryContent = function(ids) {
- if (type == BIODB.COMPOUND) {
+ # Initialize return values
+ content <- rep(NA_character_, length(ids))
- # Initialize return values
- content <- rep(NA_character_, length(id))
-
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.MIRBASE, x, content.type = BIODB.HTML)), FUN.VALUE = '')
+ # Request
+ content <- vapply(ids, function(x) .self$.get.url(get.entry.url(BIODB.MIRBASE, x, content.type = BIODB.HTML)), FUN.VALUE = '')
- return(content)
- }
-
- return(NULL)
- })
+ return(content)
+})
- ################
- # CREATE ENTRY #
- ################
-
- MirbaseConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createMirbaseCompoundFromHtml(content, drop = drop) else NULL)
- })
+################
+# CREATE ENTRY #
+################
+
+MirbaseConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createMirbaseEntryFromHtml(content, drop = drop))
+})
- ###################
- # FIND ACCESSIONS #
- ###################
+###################
+# FIND ACCESSIONS #
+###################
- MirbaseConn$methods(
- findAccessions = function(name) {
+MirbaseConn$methods( findAccessions = function(name) {
- # Get HTML
- htmlstr <- .self$.scheduler$getUrl('http://www.mirbase.org/cgi-bin/query.pl', params = c(terms = name, submit = 'Search'))
+ # Get HTML
+ htmlstr <- .self$.get.url('http://www.mirbase.org/cgi-bin/query.pl', params = c(terms = name, submit = 'Search'))
- # Parse HTML
- xml <- htmlTreeParse(htmlstr, asText = TRUE, useInternalNodes = TRUE)
+ # Parse HTML
+ xml <- htmlTreeParse(htmlstr, asText = TRUE, useInternalNodes = TRUE)
- # Get accession number
- acc <- unlist(xpathSApply(xml, "//a[starts-with(.,'MIMAT')]", xmlValue))
+ # Get accession number
+ acc <- unlist(xpathSApply(xml, "//a[starts-with(.,'MIMAT')]", xmlValue))
- return(acc)
- })
-}
+ return(acc)
+})
diff -r 253d531a0193 -r 20d69a062da3 MirbaseEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MirbaseEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,47 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+MirbaseEntry <- methods::setRefClass("MirbaseEntry", contains = "BiodbEntry")
+
+###########
+# FACTORY #
+###########
+
+createMirbaseEntryFromHtml <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define fields regex
+ xpath.expr <- character()
+ xpath.expr[[BIODB.ACCESSION]] <- "//td[text()='Accession number']/../td[2]"
+ xpath.expr[[BIODB.NAME]] <- "//td[text()='ID']/../td[2]"
+ xpath.expr[[BIODB.SEQUENCE]] <- "//td[text()='Sequence']/..//pre"
+
+ for (html in contents) {
+
+ # Create instance
+ entry <- MirbaseEntry$new()
+
+ # Parse HTML
+ xml <- XML::htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE)
+
+ # Test generic xpath expressions
+ for (field in names(xpath.expr)) {
+ v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
+ if (length(v) > 0)
+ entry$setField(field, v)
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 Ms4TabSqlDb.R
--- a/Ms4TabSqlDb.R Sat Sep 03 17:02:01 2016 -0400
+++ b/Ms4TabSqlDb.R Thu Mar 02 08:55:00 2017 -0500
@@ -266,7 +266,7 @@
#################
# Returns a numeric vector of all masses stored inside the database.
- Ms4TabSqlDb$methods( getMzValues = function(mode = NULL) {
+ Ms4TabSqlDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
# Build request
select <- paste0("select distinct pk.mass as ", MSDB.TAG.MZTHEO)
@@ -274,6 +274,9 @@
where <- ""
if ( ! is.null(mode))
where <- paste0(" where ", if (mode == MSDB.TAG.POS) '' else 'not ', 'pk.ion_pos')
+ limit <- ""
+ if ( ! is.na(NA_integer_))
+ limit <- paste(" limit", max.results)
# Assemble request
request <- paste0(select, from, where, ';')
diff -r 253d531a0193 -r 20d69a062da3 MsBioDb.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MsBioDb.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,100 @@
+if ( ! exists('MsBioDb')) { # Do not load again if already loaded
+
+ library(methods)
+ source('MsDb.R')
+ source(file.path('BiodbObject.R'), chdir = TRUE)
+ source(file.path('BiodbFactory.R'), chdir = TRUE)
+
+ #####################
+ # CLASS DECLARATION #
+ #####################
+
+ MsBioDb <- setRefClass("MsBioDb", contains = "MsDb", fields = list(.massdb = "ANY"))
+
+ ###############
+ # CONSTRUCTOR #
+ ###############
+
+ MsBioDb$methods( initialize = function(massdb = NULL, ...) {
+
+ # Check bio database
+ ! is.null(massdb) || stop("You must set a bio database.")
+ inherits(massdb, "MassdbConn") || stop("The bio database must inherit from MassdbConn class.")
+ .massdb <<- massdb
+
+ callSuper(...)
+ })
+
+ ####################
+ # HANDLE COMPOUNDS #
+ ####################
+
+ MsBioDb$methods( handleCompounds = function() {
+ return(.self$.massdb$handlesEntryType(BIODB.COMPOUND))
+ })
+
+ ####################
+ # GET MOLECULE IDS #
+ ####################
+
+ MsBioDb$methods( getMoleculeIds = function(max.results = NA_integer_) {
+ return(.self$.massdb$getEntryIds(type = BIODB.COMPOUND, max.results = max.results))
+ })
+
+ ####################
+ # GET NB MOLECULES #
+ ####################
+
+ MsBioDb$methods( getNbMolecules = function() {
+ return(.self$.massdb$getNbEntries(type = BIODB.COMPOUND))
+ })
+
+ #################
+ # GET MZ VALUES #
+ #################
+
+ MsBioDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
+ return(.self$.massdb$getMzValues(mode = mode, max.results = max.results))
+ })
+
+ #####################
+ # GET MOLECULE NAME #
+ #####################
+
+ MsBioDb$methods( getMoleculeName = function(molid) {
+ return(.self$.massdb$getMoleculeName(molid))
+ })
+
+ ###############################
+ # GET CHROMATOGRAPHIC COLUMNS #
+ ###############################
+
+ MsBioDb$methods( getChromCol = function(molid = NULL) {
+ return(.self$.massdb$getChromCol(molid))
+ })
+
+ ################
+ # FIND BY NAME #
+ ################
+
+ MsBioDb$methods( findByName = function(name) {
+ return(.self$.massdb$findCompoundByName(name))
+ })
+
+ #######################
+ # GET RETENTION TIMES #
+ #######################
+
+ MsBioDb$methods( getRetentionTimes = function(molid, col = NA_character_) {
+ return(.self$.massdb$getRetentionTimes(molid, chrom.cols = col))
+ })
+
+ ################
+ # GET NB PEAKS #
+ ################
+
+ MsBioDb$methods( getNbPeaks = function(molid = NA_integer_, mode = NA_character_) {
+ return(.self$.massdb$getNbPeaks(compound.ids = molid, mode = mode))
+ })
+
+}
diff -r 253d531a0193 -r 20d69a062da3 MsDb.R
--- a/MsDb.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MsDb.R Thu Mar 02 08:55:00 2017 -0500
@@ -135,11 +135,20 @@
})
####################
+ # HANDLE COMPOUNDS #
+ ####################
+
+ # Returns TRUE if this database handles compounds directly (by IDs)
+ MsDb$methods( handleCompounds = function() {
+ return(TRUE)
+ })
+
+ ####################
# GET MOLECULE IDS #
####################
# Returns an integer vector of all molecule IDs stored inside the database.
- MsDb$methods( getMoleculeIds = function() {
+ MsDb$methods( getMoleculeIds = function(max.results = NA_integer_) {
stop("Method getMoleculeIds() not implemented in concrete class.")
})
@@ -157,7 +166,7 @@
#################
# Returns a numeric vector of all masses stored inside the database.
- MsDb$methods( getMzValues = function(mode = NULL) {
+ MsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
stop("Method getMzValues() not implemented in concrete class.")
})
@@ -218,7 +227,7 @@
# GET PEAK TABLE #
##################
- MsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_){
+ MsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) {
stop("Method getPeakTable() not implemented in concrete class.")
})
@@ -235,7 +244,7 @@
# rt.tol.x Tolerance parameter for the equations : rtinf = rt - rt.tol.x - rt ^ rt.tol.y and rtsup = rt + rt.tol.x + rt ^ rt.tol.y
# rt.tol.y Tolerance parameter. See rt.tol.x parameter.
# attribs Only search for peaks whose attribution is among this set of attributions.
- # molids Only search for peaks whose molecule ID is among this vector of integer molecule IDs. Can also be a data frame with a retention time column x.colnames$rt and a molecule ID column MSDB.TAG.molid.
+ # molids Only search for peaks whose molecule ID is among this vector of integer molecule IDs. Can also be a data frame with a retention time column x.colnames$rt and a molecule ID column MSDB.TAG.MOLID.
# molids.rt.tol Retention time tolerance used when molids parameter is a data frame (rt, id)
# precursor.match Remove peaks whose molecule precursor peak has not also been matched.
# precursor.rt.tol
@@ -261,7 +270,7 @@
precursors.ids <- precursors.ids[ ! duplicated(precursors.ids), ]
# Get all matching peaks whose molecule is inside the previously obtained list of molecules
- .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = precursors.ids, molids.rt.tol = precursor.rt.tol, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table)
+ df <- .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = precursors.ids, molids.rt.tol = precursor.rt.tol, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table)
# TODO
#
# peaks <- if (peak.table) results[['peaks']] else results
@@ -344,6 +353,7 @@
# Loop on all lines of input
peaks <- NULL
+ .self$.input.stream$reset()
while (.self$.input.stream$hasNextValues()) {
.self$.input.stream$nextValues()
@@ -369,7 +379,7 @@
# else {
# if (same.rows) {
# y[r, colnames(x.lines)] <- x.lines
-# ids <- results[[MSDB.TAG.molid]]
+# ids <- results[[MSDB.TAG.MOLID]]
# ids <- ids[ ! duplicated(ids)] # Remove duplicated values
# y[r, MSDB.TAG.msmatching] <- paste(ids, collapse = .self$.molids.sep)
# }
@@ -426,7 +436,7 @@
# List molecule IDs
if ( ! is.null(molids.rt.tol) && is.data.frame(molids)) {
- ids <- molids[(rt >= molids[[MSDB.TAG.colrt]] - molids.rt.tol) & (rt <= molids[[MSDB.TAG.colrt]] + molids.rt.tol), MSDB.TAG.molid]
+ ids <- molids[(rt >= molids[[MSDB.TAG.COLRT]] - molids.rt.tol) & (rt <= molids[[MSDB.TAG.COLRT]] + molids.rt.tol), MSDB.TAG.MOLID]
if (length(ids) == 0)
# No molecule ID match for this retention time
return(data.frame()) # return empty result set
diff -r 253d531a0193 -r 20d69a062da3 MsDbInputDataFrameStream.R
--- a/MsDbInputDataFrameStream.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MsDbInputDataFrameStream.R Thu Mar 02 08:55:00 2017 -0500
@@ -82,4 +82,12 @@
return(.self$.i < nrow(.self$.df))
})
+ #########
+ # RESET #
+ #########
+
+ MsDbInputDataFrameStream$methods( reset = function() {
+ .i <<- 0L
+ })
+
} # end of load safe guard
diff -r 253d531a0193 -r 20d69a062da3 MsDbOutputDataFrameStream.R
--- a/MsDbOutputDataFrameStream.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MsDbOutputDataFrameStream.R Thu Mar 02 08:55:00 2017 -0500
@@ -25,7 +25,7 @@
# GET DATA FRAME #
##################
- MsDbOutputDataFrameStream$methods( getDataFrame = function(...) {
+ MsDbOutputDataFrameStream$methods( getDataFrame = function() {
# Put at least a column name if empty
if (nrow(.self$.df) == 0)
@@ -34,6 +34,15 @@
return(.self$.df)
})
+ # Move columns to beginning {{{1
+
+ MsDbOutputDataFrameStream$methods( moveColumnsToBeginning = function(cols) {
+ all.cols <- colnames(.self$.df)
+ other.cols <- all.cols[ ! all.cols %in% cols]
+ cols <- cols[cols %in% all.cols]
+ .df <<- .self$.df[c(cols, other.cols)]
+ })
+
#################
# MATCHED PEAKS #
#################
@@ -44,8 +53,12 @@
# Set input values
x <- data.frame(mz = mz)
- if ( ! is.null(rt))
- x <- cbind(x, data.frame(rt = rt))
+ colnames(x) <- MSDB.TAG.MZ
+ if ( ! is.null(rt)) {
+ x.rt <- data.frame(rt = rt)
+ colnames(x.rt) <- MSDB.TAG.RT
+ x <- cbind(x, x.rt)
+ }
# Merge input values with matched peaks
if ( ! is.null(peaks)) {
@@ -74,8 +87,12 @@
# Concatenate results in one line
if (.self$.one.line) {
# For each column, concatenate all values in one string.
- for (c in seq(peaks))
- peaks[1, c] <- paste0(peaks[[c]], collapse = .self$.match.sep, FUN.VALUE = '')
+ for (c in seq(peaks)) {
+ v <- peaks[[c]]
+ v <- v[ ! is.na(v)] # remove NA values
+ v <- v[ ! duplicated(v)] # remove duplicates
+ peaks[1, c] <- paste0(v, collapse = .self$.match.sep, FUN.VALUE = '')
+ }
peaks <- peaks[1, ] # Keep only first line
}
}
diff -r 253d531a0193 -r 20d69a062da3 MsFileDb.R
--- a/MsFileDb.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MsFileDb.R Thu Mar 02 08:55:00 2017 -0500
@@ -84,6 +84,16 @@
# Load database
.db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL)
+ # Check that colnames are unique
+ dupcol <- duplicated(colnames(.self$.db))
+ if (any(dupcol))
+ stop(paste("Database header contains duplicated names: ", paste(unique(colnames(.self$.db)[dupcol]), collapse = ', '), "."))
+
+ # Check that columns names supplied through field map are unique
+ dupfields <- duplicated(.self$.fields)
+ if (any(dupfields))
+ stop(paste("Some db column names supplied are duplicated: ", paste(unique(.self$.fields[dupfields]), collapse = ', '), "."))
+
# Rename columns
colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
}
@@ -151,7 +161,7 @@
# GET MOLECULE IDS #
####################
- MsFileDb$methods( getMoleculeIds = function() {
+ MsFileDb$methods( getMoleculeIds = function(max.results = NA_integer_) {
# Init db
.self$.init.db()
@@ -161,6 +171,10 @@
mol.ids <- mol.ids[ ! duplicated(mol.ids)]
mol.ids <- sort(mol.ids)
+ # Cut results
+ if ( ! is.na(max.results) && length(mol.ids) > max.results)
+ mol.ids <- mol.ids[1:max.results]
+
return(mol.ids)
})
@@ -416,7 +430,7 @@
#################
# Returns a numeric vector of all masses stored inside the database.
- MsFileDb$methods( getMzValues = function(mode = NULL) {
+ MsFileDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
# Init db
.self$.init.db()
@@ -435,6 +449,10 @@
# Remove duplicates
mz <- mz[ ! duplicated(mz)]
+ # Apply cut-off
+ if ( ! is.na(max.results))
+ mz <- mz[1:max.results]
+
return(mz)
})
diff -r 253d531a0193 -r 20d69a062da3 MsPeakForestDb.R
--- a/MsPeakForestDb.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MsPeakForestDb.R Thu Mar 02 08:55:00 2017 -0500
@@ -229,7 +229,7 @@
# GET MZ VALUES #
#################
- MsPeakForestDb$methods( getMzValues = function(mode = NULL) {
+ MsPeakForestDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
# Query params
params <- NULL
@@ -239,6 +239,10 @@
# Get MZ valuels
mz <- .self$.get.url(url = 'spectra/lcms/peaks/list-mz', params = params)
+ # Apply cut-off
+ if ( ! is.na(max.results))
+ mz <- mz[1:max.results]
+
return(mz)
})
diff -r 253d531a0193 -r 20d69a062da3 MsXlsDb.R
--- a/MsXlsDb.R Sat Sep 03 17:02:01 2016 -0400
+++ b/MsXlsDb.R Thu Mar 02 08:55:00 2017 -0500
@@ -39,6 +39,7 @@
MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) {
# Initialize members
+ # TODO check that db_dir is not null neither na, and tests that it exists and is a directory.
.db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_
.limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_
cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir
@@ -283,7 +284,7 @@
#################
# Returns a numeric vector of all masses stored inside the database.
- MsXlsDb$methods( getMzValues = function(mode = NULL) {
+ MsXlsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
mz <- numeric()
@@ -295,6 +296,10 @@
# Remove duplicated
mz <- mz[ ! duplicated(mz)]
+ # Apply cut-off
+ if ( ! is.na(max.results))
+ mz <- mz[1:max.results]
+
return(mz)
})
diff -r 253d531a0193 -r 20d69a062da3 NcbiCcdsCompound.R
--- a/NcbiCcdsCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,46 +0,0 @@
-if ( ! exists('NcbiccdsCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- NcbiccdsCompound <- setRefClass("NcbiccdsCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createNcbiccdsCompoundFromHtml <- function(contents, drop = TRUE) {
-
- library(XML)
-
- compounds <- list()
-
- for (html in contents) {
-
- # Create instance
- compound <- NcbiccdsCompound$new()
-
- # Parse HTML
- xml <- htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE)
-
- if (length(getNodeSet(xml, "//*[starts-with(.,'No results found for CCDS ID ')]")) == 0) {
- compound$setField(BIODB.ACCESSION, xpathSApply(xml, "//input[@id='DATA']", xmlGetAttr, "value"))
- compound$setField(BIODB.SEQUENCE, xpathSApply(xml, "//b[starts-with(.,'Nucleotide Sequence')]/../tt", xmlValue))
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 NcbiCcdsConn.R
--- a/NcbiCcdsConn.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
-if ( ! exists('NcbiccdsConn')) { # Do not load again if already loaded
-
- source('RemotedbConn.R')
- source('NcbiccdsCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- NcbiccdsConn <- setRefClass("NcbiccdsConn", contains = "RemotedbConn")
-
- ###############
- # CONSTRUCTOR #
- ###############
-
- NcbiccdsConn$methods( initialize = function(...) {
- # From NCBI E-Utility manual: "In order not to overload the E-utility servers, NCBI recommends that users post no more than three URL requests per second and limit large jobs to either weekends or between 9:00 PM and 5:00 AM Eastern time during weekdays".
- callSuper(scheduler = UrlRequestScheduler$new(n = 3), ...)
- })
-
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
-
- NcbiccdsConn$methods( getEntryContentType = function(type) {
- return(BIODB.HTML)
- })
-
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- NcbiccdsConn$methods( getEntryContent = function(type, id) {
-
- if (type == BIODB.COMPOUND) {
-
- # Initialize return values
- content <- rep(NA_character_, length(id))
-
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.NCBICCDS, x)), FUN.VALUE = '')
-
- return(content)
- }
-
- return(NULL)
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- NcbiccdsConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createNcbiccdsCompoundFromHtml(content, drop = drop) else NULL)
- })
-}
diff -r 253d531a0193 -r 20d69a062da3 NcbiGeneCompound.R
--- a/NcbiGeneCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,118 +0,0 @@
-if ( ! exists('NcbigeneCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
- source(file.path('strhlp.R'), chdir = TRUE)
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- NcbigeneCompound <- setRefClass("NcbigeneCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createNcbigeneCompoundFromXml <- function(contents, drop = TRUE) {
-
- library(XML)
-
- compounds <- list()
-
- # Define xpath expressions
- xpath.expr <- character()
- xpath.expr[[BIODB.ACCESSION]] <- "//Gene-track_geneid"
- xpath.expr[[BIODB.KEGG.ID]] <- "/Dbtag_db[text()='KEGG']/..//Object-id_str"
- xpath.expr[[BIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str"
- xpath.expr[[BIODB.LOCATION]] <- "//Gene-ref_maploc"
- xpath.expr[[BIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc"
- xpath.expr[[BIODB.SYMBOL]] <- "//Gene-ref_locus"
- xpath.expr[[BIODB.SYNONYMS]] <- "//Gene-ref_syn_E"
-
- for (content in contents) {
-
- # Create instance
- compound <- NcbigeneCompound$new()
-
- # Parse HTML
- xml <- xmlInternalTreeParse(content, asText = TRUE)
-
- # An error occured
- if (length(getNodeSet(xml, "//Error")) == 0 && length(getNodeSet(xml, "//ERROR")) == 0) {
-
- # Test generic xpath expressions
- for (field in names(xpath.expr)) {
- v <- xpathSApply(xml, xpath.expr[[field]], xmlValue)
- if (length(v) > 0) {
-
- # Eliminate duplicates
- v <- v[ ! duplicated(v)]
-
- # Set field
- compound$setField(field, v)
- }
- }
-
- # CCDS ID
- ccdsid <- .find.ccds.id(xml)
- if ( ! is.na(ccdsid))
- compound$setField(BIODB.NCBI.CCDS.ID, ccdsid)
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
-
- # Get data
-
- }
-
- ################
- # FIND CCDS ID #
- ################
-
- .find.ccds.id <- function(xml) {
-
- # 1) Get all CCDS tags.
- ccds_elements <- getNodeSet(xml, "//Dbtag_db[text()='CCDS']/..//Object-id_str")
-
- # 2) If all CCDS are the same, go to point 4.
- ccds <- NA_character_
- for (e in ccds_elements) {
- current_ccds <- xmlValue(e)
- if (is.na(ccds))
- ccds <- current_ccds
- else {
- if (current_ccds != ccds) {
- ccds <- NA_character_
- break
- }
- }
- }
-
- # 3) There are several CCDS values, we need to find the best one (i.e.: the most current one).
- if (is.na(ccds)) {
- # For each CCDS, look for the parent Gene-commentary tag. Then look for the text content of the Gene-commentary_label which is situed under. Ignore CCDS that have no Gene-commentary_label associated. Choose the CCDS that has the smallest Gene-commentary_label in alphabetical order.
- version <- NA_character_
- for (e in ccds_elements) {
- versions <- xpathSApply(e, "ancestor::Gene-commentary/Gene-commentary_label", xmlValue)
- if (length(versions) < 1) next
- current_version <- versions[[length(versions)]]
- if (is.na(version) || current_version < version) {
- version <- current_version
- ccds <- xmlValue(e)
- }
- }
- }
-
- return(ccds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 NcbiGeneConn.R
--- a/NcbiGeneConn.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
-if ( ! exists('NcbigeneConn')) { # Do not load again if already loaded
-
- source('RemotedbConn.R')
- source('NcbigeneCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- NcbigeneConn <- setRefClass("NcbigeneConn", contains = "RemotedbConn")
-
- ###############
- # CONSTRUCTOR #
- ###############
-
- NcbigeneConn$methods( initialize = function(...) {
- # From NCBI E-Utility manual: "In order not to overload the E-utility servers, NCBI recommends that users post no more than three URL requests per second and limit large jobs to either weekends or between 9:00 PM and 5:00 AM Eastern time during weekdays".
- callSuper(scheduler = UrlRequestScheduler$new(n = 3), ...)
- })
-
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
-
- NcbigeneConn$methods( getEntryContentType = function(type) {
- return(BIODB.XML)
- })
-
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- NcbigeneConn$methods( getEntryContent = function(type, id) {
-
- if (type == BIODB.COMPOUND) {
-
- # Initialize return values
- content <- rep(NA_character_, length(id))
-
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.NCBIGENE, x)), FUN.VALUE = '')
-
- return(content)
- }
-
- return(NULL)
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- NcbigeneConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createNcbigeneCompoundFromXml(content, drop = drop) else NULL)
- })
-}
diff -r 253d531a0193 -r 20d69a062da3 NcbiccdsConn.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/NcbiccdsConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,45 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+NcbiccdsConn <- methods::setRefClass("NcbiccdsConn", contains = "RemotedbConn")
+
+###############
+# CONSTRUCTOR #
+###############
+
+NcbiccdsConn$methods( initialize = function(...) {
+ # From NCBI E-Utility manual: "In order not to overload the E-utility servers, NCBI recommends that users post no more than three URL requests per second and limit large jobs to either weekends or between 9:00 PM and 5:00 AM Eastern time during weekdays".
+ callSuper(scheduler = UrlRequestScheduler$new(n = 3), ...)
+})
+
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
+
+NcbiccdsConn$methods( getEntryContentType = function() {
+ return(BIODB.HTML)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+NcbiccdsConn$methods( getEntryContent = function(id) {
+
+ # Initialize return values
+ content <- rep(NA_character_, length(id))
+
+ # Request
+ content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.NCBICCDS, x, content.type = BIODB.HTML)), FUN.VALUE = '')
+
+ return(content)
+})
+
+################
+# CREATE ENTRY #
+################
+
+NcbiccdsConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createNcbiccdsEntryFromHtml(content, drop = drop))
+})
diff -r 253d531a0193 -r 20d69a062da3 NcbiccdsEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/NcbiccdsEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,39 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+NcbiccdsEntry <- methods::setRefClass("NcbiccdsEntry", contains = "BiodbEntry")
+
+###########
+# FACTORY #
+###########
+
+createNcbiccdsEntryFromHtml <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ for (html in contents) {
+
+ # Create instance
+ entry <- NcbiccdsEntry$new()
+
+ # Parse HTML
+ xml <- XML::htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE)
+
+ if (length(XML::getNodeSet(xml, "//*[starts-with(.,'No results found for CCDS ID ')]")) == 0) {
+ entry$setField(BIODB.ACCESSION, XML::xpathSApply(xml, "//input[@id='DATA']", XML::xmlGetAttr, "value"))
+ entry$setField(BIODB.SEQUENCE, XML::xpathSApply(xml, "//b[starts-with(.,'Nucleotide Sequence')]/../tt", XML::xmlValue))
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 NcbigeneConn.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/NcbigeneConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,45 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+NcbigeneConn <- methods::setRefClass("NcbigeneConn", contains = "RemotedbConn")
+
+###############
+# CONSTRUCTOR #
+###############
+
+NcbigeneConn$methods( initialize = function(...) {
+ # From NCBI E-Utility manual: "In order not to overload the E-utility servers, NCBI recommends that users post no more than three URL requests per second and limit large jobs to either weekends or between 9:00 PM and 5:00 AM Eastern time during weekdays".
+ callSuper(scheduler = UrlRequestScheduler$new(n = 3), ...)
+})
+
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
+
+NcbigeneConn$methods( getEntryContentType = function() {
+ return(BIODB.XML)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+NcbigeneConn$methods( getEntryContent = function(id) {
+
+ # Initialize return values
+ content <- rep(NA_character_, length(id))
+
+ # Request
+ content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.NCBIGENE, x, content.type = BIODB.XML)), FUN.VALUE = '')
+
+ return(content)
+})
+
+################
+# CREATE ENTRY #
+################
+
+NcbigeneConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createNcbigeneEntryFromXml(content, drop = drop))
+})
diff -r 253d531a0193 -r 20d69a062da3 NcbigeneEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/NcbigeneEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,110 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+NcbigeneEntry <- methods::setRefClass("NcbigeneEntry", contains = "BiodbEntry")
+
+###########
+# FACTORY #
+###########
+
+createNcbigeneEntryFromXml <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define xpath expressions
+ xpath.expr <- character()
+ xpath.expr[[BIODB.ACCESSION]] <- "//Gene-track_geneid"
+ xpath.expr[[BIODB.KEGG.ID]] <- "/Dbtag_db[text()='KEGG']/..//Object-id_str"
+ xpath.expr[[BIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str"
+ xpath.expr[[BIODB.LOCATION]] <- "//Gene-ref_maploc"
+ xpath.expr[[BIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc"
+ xpath.expr[[BIODB.SYMBOL]] <- "//Gene-ref_locus"
+ xpath.expr[[BIODB.SYNONYMS]] <- "//Gene-ref_syn_E"
+
+ for (content in contents) {
+
+ # Create instance
+ entry <- NcbigeneEntry$new()
+
+ # Parse HTML
+ xml <- XML::xmlInternalTreeParse(content, asText = TRUE)
+
+ # An error occured
+ if (length(XML::getNodeSet(xml, "//Error")) == 0 && length(XML::getNodeSet(xml, "//ERROR")) == 0) {
+
+ # Test generic xpath expressions
+ for (field in names(xpath.expr)) {
+ v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
+ if (length(v) > 0) {
+
+ # Eliminate duplicates
+ v <- v[ ! duplicated(v)]
+
+ # Set field
+ entry$setField(field, v)
+ }
+ }
+
+ # CCDS ID
+ ccdsid <- .find.ccds.id(xml)
+ if ( ! is.na(ccdsid))
+ entry$setField(BIODB.NCBI.CCDS.ID, ccdsid)
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+
+ # Get data
+
+}
+
+################
+# FIND CCDS ID #
+################
+
+.find.ccds.id <- function(xml) {
+
+ # 1) Get all CCDS tags.
+ ccds_elements <- XML::getNodeSet(xml, "//Dbtag_db[text()='CCDS']/..//Object-id_str")
+
+ # 2) If all CCDS are the same, go to point 4.
+ ccds <- NA_character_
+ for (e in ccds_elements) {
+ current_ccds <- XML::xmlValue(e)
+ if (is.na(ccds))
+ ccds <- current_ccds
+ else {
+ if (current_ccds != ccds) {
+ ccds <- NA_character_
+ break
+ }
+ }
+ }
+
+ # 3) There are several CCDS values, we need to find the best one (i.e.: the most current one).
+ if (is.na(ccds)) {
+ # For each CCDS, look for the parent Gene-commentary tag. Then look for the text content of the Gene-commentary_label which is situed under. Ignore CCDS that have no Gene-commentary_label associated. Choose the CCDS that has the smallest Gene-commentary_label in alphabetical order.
+ version <- NA_character_
+ for (e in ccds_elements) {
+ versions <- XML::xpathSApply(e, "ancestor::Gene-commentary/Gene-commentary_label", XML::xmlValue)
+ if (length(versions) < 1) next
+ current_version <- versions[[length(versions)]]
+ if (is.na(version) || current_version < version) {
+ version <- current_version
+ ccds <- XML::xmlValue(e)
+ }
+ }
+ }
+
+ return(ccds)
+}
diff -r 253d531a0193 -r 20d69a062da3 PeakforestConn.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PeakforestConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,176 @@
+#####################
+# CLASS DECLARATION #
+#####################
+#'A class to connect to peakforest
+#'@export
+#'@field .url An urel to the database
+PeakforestConn <- methods::setRefClass("PeakforestConn", contains = c("RemotedbConn","MassdbConn"), fields = list( .url = "character" )) # TODO Inherits also from MassdbConn
+
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
+
+PeakforestConn$methods( getEntryContentType = function(type) {
+ return(BIODB.JSON)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+PeakforestConn$methods( getEntryContent = function(id) {
+
+
+ # Initialize return values
+ content <- rep(NA_character_, length(id))
+ # Request
+
+ url <- get.entry.url(BIODB.PEAKFOREST, id[i], BIODB.JSON,token = .self$.token)
+ jsonstr <- .self$.get.url(url)
+ if(startsWith("", jsonstr) ){
+ next
+ }
+
+ return(content)
+})
+
+
+##########################################
+# SEARCH FOR SPECTRA IN GIVEN MASS RANGE #
+##########################################
+
+PeakforestConn$methods( searchMzRange = function(mzmin, mzmax, rtype = c("object","spec","peak")){
+
+ rtype <- match.arg(rtype)
+ if(mzmin>mzmax){
+ stop("mzmin shloud be inferior to mzmax in searchMzRange.")
+ }
+
+ url <- paste0("https://rest.peakforest.org/spectra/lcms/peaks/get-range/",mzmin,"/",mzmax)
+
+ contents <- .self$.get.url(url)
+
+ jsontree <- fromJSON(contents)
+
+ ###No match form the output.
+ if( length(jsontree)==0 ) return(NULL)
+
+ # Getting a list of all the id.
+ lid <- sapply(jsontree,function(x){
+ x$source$id
+ })
+
+ # Returning the content for all the spectra
+ contents <- .self$getEntryContent(lid)
+
+ entries <- .self$createEntry(contents)
+
+ # Checking the return type
+ if( rtype=="object" ){
+ return( entries )
+ }
+
+ ### XXXX See if we don't want to reduce the output and factorize this shit.
+ toreturn <- NULL
+ if( rtype=="spec" ){
+ toreturn <- sapply(entries,function(x){
+ x$getFieldsAsDataFrame()
+ })
+ }
+ if( rtype=="peak" ){
+ toreturn <- lapply(entries,function(x){
+ temp <- as.data.frame( x$getFieldValue( BIODB.PEAKS ))
+ temp$accession = x$getFieldValue( BIODB.ACCESSION)
+ return(temp)
+
+ })
+ }
+ ###Trying to convert in data.frame
+ if(!is.data.frame(toreturn)){
+ temp <- colnames(toreturn[[1]])
+ toreturn <- do.call("rbind.fill",toreturn)
+ colnames(toreturn) <- temp
+ }
+
+ return(toreturn)
+})
+
+
+#################################################
+# SEARCH FOR SPECTRA IN A TOLERANCE AROUND A MZ #
+#################################################
+
+PeakforestConn$methods( searchMzTol = function(mz, tol, tolunit=BIODB.MZTOLUNIT.VALS,
+ rtype = c("object","spec","peak")){
+
+ rtype <- match.arg(rtype)
+ tolunit <- match.arg(tolunit)
+
+ if( tolunit == BIODB.MZTOLUNIT.PPM){
+ tol <- tol * mz * 10^-6
+ }
+
+ mzmin <- mz - tol
+ mzmax <- mz + tol
+
+ return(.self$searchMzRange(mzmin, mzmax, rtype = rtype))
+
+})
+
+##################################################
+# SEARCH FOR MSMS SPECTRA PRECUSOR AROUND A MASS #
+##################################################
+
+
+PeakforestConn$methods(
+ searchSpecPrecTol = function(mz,
+ tol,
+ tolunit = "plain",
+ mode = NULL) {
+ #TODO handle the units
+ #tolunit <- match.arg(tolunit)
+
+ strmode <- ''
+
+ if (!is.null(mode)) {
+ if (mode %in% c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS)) {
+ strmode <- paste0('?polarity=', mode)
+ }
+
+ }
+
+ if (tolunit == BIODB.MZTOLUNIT.PPM) {
+ tol <- tol * mz * 10 ^ -6
+ }
+
+ ##Request which return peak and not spectra.
+ url <-
+ paste0(
+ "https://rest.peakforest.org/spectra/lcms/search-naive/",
+ mz,
+ "/",
+ tol,
+ strmode
+ )
+ contents <- .self$.get.url(url)
+ entries <- .self$createReducedEntry(contents, drop = TRUE)
+ return(entries)
+ }
+)
+
+
+################
+# CREATE ENTRY #
+################
+
+# Creates a Spectrum instance from file content.
+# content A file content, downloaded from the public database.
+# RETURN A spectrum instance.
+PeakforestConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createPeakforestSpectraFromJSON(content, drop = drop))
+})
+
+PeakforestConn$methods( createReducedEntry = function(content , drop = TRUE){
+ entries <- createReducedSpectraFromJSON(content, drop = drop)
+ return(entries)
+})
diff -r 253d531a0193 -r 20d69a062da3 PeakforestEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PeakforestEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,250 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+# TODO Create class PeakforestCompoundEntry
+PeakForestSpectrumEntry <- methods::setRefClass("PeakForestSpectrumEntry", contains = "BiodbEntry")
+
+PeakForestCompoundEntry <- methods::setRefClass("PeakForestCompoundEntry", contains = "BiodbEntry")
+
+
+###########
+# FACTORY #
+###########
+
+
+###Arg is jcontent ot indicate that the content is already a json.
+createPeakforestCompoundFromJSON <- function(contents, drop = FALSE) {
+
+ if(is.character(contents))
+ contents <- jsonlite::fromJSON(contents, simplifyDataFrame=FALSE)
+
+ jsonfields <- list()
+ jsonfields[[BIODB.ACCESSION]] <- "id"
+ jsonfields[[BIODB.PUBCHEMCOMP.ID]] <- "PubChemCID"
+ jsonfields[[BIODB.CHEBI.ID]] <- "ChEBI"
+ jsonfields[[BIODB.HMDB.ID]] <- "HMDB"
+ jsonfields[[BIODB.KEGG.ID]] <- "KEGG"
+ jsonfields[[BIODB.FORMULA]] <- "formula"
+ jsonfields[[BIODB.SMILES]] <- "canSmiles"
+ jsonfields[[BIODB.AVERAGE.MASS]] <- "averageMass"
+ jsonfields[[BIODB.MONOISOTOPIC.MASS]] <- "monoisotopicMass"
+ jsonfields[[BIODB.INCHI]] <- "inChI"
+ jsonfields[[BIODB.INCHIKEY]] <- "inchiIKey"
+ jsonfields[[BIODB.NAME]] <- "mainName"
+
+ entries <- vector(length(contents),mode="list")
+
+ for (i in seq_along(contents)){
+
+ jsontree <- contents[[i]]
+ entry <- PeakForestCompoundEntry$new()
+
+
+ for(field in names(jsonfields)){
+
+ tosearch <- jsonfields[[field]]
+ value <- jsontree$tosearch
+ entry$setField(field,value)
+ }
+
+ entries[[i]] <- entry
+ }
+
+
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ entries
+}
+
+createPeakforestSpectraFromJSON <- function(contents, drop = FALSE, checkSub = TRUE) {
+
+ entries <- vector(length(contents),mode="list")
+ jsonfields <- character()
+ jsonfields[[BIODB.ACCESSION]] <- "id" # TODO Use BIODB.ACCESSION instead
+ jsonfields[[BIODB.MSMODE]] <- "polarity"
+
+
+ ###Checking that it's a list.
+ if(length(contents) == 1){
+ if(startsWith(contents[[1]], "") ){
+ return(NULL)
+ }else{
+ contents <- jsonlite::fromJSON(contents[[1]],simplifyDataFrame=FALSE)
+
+ }
+ }
+
+ for (i in seq_along(contents)){
+
+ content <- contents[[i]]
+ jsontree <- NULL
+ if(typeof(content) == "character"){
+ if(startsWith(content, "")|content=="null"){
+ entries[[i]] <- NULL
+ next
+ }
+ jsontree <- jsonlite::fromJSON(content,simplifyDataFrame=FALSE)
+ }else{
+ jsontree <- content
+ }
+ cnames <- c(BIODB.PEAK.MZ, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.MZTHEO, BIODB.PEAK.ERROR.PPM)
+
+ entry <- PeakForestSpectrumEntry$new()
+ #####Setting thz mass analyzer
+ entry$setField(BIODB.MSDEV,jsontree$analyzerMassSpectrometerDevice$instrumentName)
+ entry$setField(BIODB.MSDEVTYPE,jsontree$analyzerMassSpectrometerDevice$ionAnalyzerType)
+
+
+
+ for(field in names(jsonfields)){
+
+ tosearch <- jsonfields[[field]]
+ value <- jsontree$tosearch
+ entry$setField(field,value)
+ }
+
+ ######################
+ # TREATING THE PEAKS #
+ ######################
+
+ entry$setField(BIODB.NB.PEAKS,length(jsontree$peaks))
+ peaks <- data.frame( matrix( 0,ncol = length(cnames), nrow = 0))
+ colnames(peaks) <- cnames
+ ###Parsing peaks.
+ if(length(jsontree$peaks) != 0){
+ peaks <- sapply(jsontree$peaks,function(x){
+ return(list(as.double(x$mz),
+ as.integer(x$ri),
+ as.character(x$composition),
+ as.double(x$theoricalMass),
+ as.double(x$deltaPPM)
+ ))
+ })
+ ###Removing all whitespaces from the formule.
+ peaks[3,]<-vapply(peaks[3,],function(x){
+ gsub(" ","",trimws(x))
+ },FUN.VALUE = NA_character_)
+
+ peaks<-t(peaks)
+ colnames(peaks)<-cnames
+ }
+
+ entry$setField(BIODB.PEAKS,peaks)
+
+ ##################################
+ # TREATING THE LIST OF COMPOUNDS #
+ ##################################
+
+ entry$setField(BIODB.NB.COMPOUNDS,length(jsontree$listOfCompounds))
+ compounds <- list()
+
+ ###Parsing compounds.
+ if( length( jsontree$listOfCompounds) != 0){
+ compounds <- lapply( jsontree$listOfCompounds, function(x){
+ createPeakforestCompoundFromJSON(x)
+ })
+ }
+
+ entry$setField(BIODB.COMPOUNDS, compounds)
+
+
+ entries[[i]] <- entry
+ }
+
+
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ entries
+}
+
+
+####TDO CLEAN THIS
+
+createReducedSpectraFromJSON <- function(contents,
+ drop = FALSE,
+ checkSub = TRUE) {
+ entries <- vector(length(contents), mode = "list")
+ jsonfields <- character()
+ # jsonfields[[BIODB.ACCESSION]] <-
+ # "id" # TODO Use BIODB.ACCESSION instead
+
+
+ ###Checking that it's a list.
+ if (length(contents) == 1) {
+ if (startsWith(contents[[1]], "")) {
+ return(NULL)
+ } else{
+ contents <- jsonlite::fromJSON(contents[[1]], simplifyDataFrame=FALSE)
+
+ }
+ }
+
+ for (i in seq_along(contents)) {
+ content <- contents[[i]]
+ jsontree <- NULL
+ if (typeof(content) == "character") {
+ if (startsWith(content, "") | content == "null") {
+ entries[[i]] <- NULL
+ next
+ }
+ jsontree <- jsonlite::fromJSON(content, simplifyDataFrame=FALSE)
+ } else{
+ jsontree <- content
+ }
+
+
+ cnames <-
+ c(
+ BIODB.PEAK.MZ,
+ BIODB.PEAK.RELATIVE.INTENSITY,
+ BIODB.PEAK.FORMULA,
+ BIODB.PEAK.MZTHEO,
+ BIODB.PEAK.ERROR.PPM
+ )
+
+ entry <- PeakForestSpectrumEntry$new()
+ entry$setField(BIODB.ACCESSION, jsontree$id)
+
+ ######################
+ # TREATING THE PEAKS #
+ ######################
+
+ entry$setField(BIODB.NB.PEAKS, length(jsontree$peaks))
+ peaks <- data.frame(matrix(0, ncol = length(cnames), nrow = 0))
+ colnames(peaks) <- cnames
+ ###Parsing peaks.
+ if (length(jsontree$peaks) != 0) {
+ peaks <- sapply(jsontree$peaks, function(x) {
+ return(
+ list(
+ as.double(x$mz),
+ as.integer(x$ri),
+ as.character(x$composition),
+ as.double(x$theoricalMass),
+ as.double(x$deltaPPM)
+ )
+ )
+ })
+ ###Removing all whitespaces from the formule.
+ peaks[3, ] <- vapply(peaks[3, ], function(x) {
+ gsub(" ", "", trimws(x))
+ }, FUN.VALUE = NA_character_)
+
+ peaks <- as.data.frame(t(peaks))
+ colnames(peaks) <- cnames
+ }
+
+ entry$setField(BIODB.PEAKS, peaks)
+
+ entries[[i]] <- entry
+ }
+
+
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ entries
+}
diff -r 253d531a0193 -r 20d69a062da3 PubchemCompound.R
--- a/PubchemCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,71 +0,0 @@
-if ( ! exists('PubchemCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- PubchemCompound <- setRefClass("PubchemCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createPubchemCompoundFromXml <- function(contents, drop = TRUE) {
-
- library(XML)
-
- compounds <- list()
-
- # Set XML namespace
- ns <- c(pubchem = "http://pubchem.ncbi.nlm.nih.gov/pug_view")
-
- # Define xpath expressions
- xpath.expr <- character()
- xpath.expr[[BIODB.ACCESSION]] <- "//pubchem:RecordType[text()='CID']/../pubchem:RecordNumber"
- xpath.expr[[BIODB.INCHI]] <- "//pubchem:Name[text()='InChI']/../pubchem:StringValue"
- xpath.expr[[BIODB.INCHIKEY]] <- "//pubchem:Name[text()='InChI Key']/../pubchem:StringValue"
-
- for (content in contents) {
-
- # Create instance
- compound <- PubchemCompound$new()
-
- # Parse XML
- xml <- xmlInternalTreeParse(content, asText = TRUE)
-
- # Unknown compound
- fault <- xpathSApply(xml, "/pubchem:Fault", xmlValue, namespaces = ns)
- if (length(fault) == 0) {
-
- # Test generic xpath expressions
- for (field in names(xpath.expr)) {
- v <- xpathSApply(xml, xpath.expr[[field]], xmlValue, namespaces = ns)
- if (length(v) > 0)
- compound$setField(field, v)
- }
-
- # Get name
- name <- NA_character_
- tryCatch( { name <- xpathSApply(xml, "//pubchem:Name[text()='IUPAC Name']/../pubchem:StringValue", xmlValue, namespaces = ns) }, warning = function(w) {})
- if (is.na(name))
- tryCatch( { name <- xpathSApply(xml, "//pubchem:Name[text()='Record Title']/../pubchem:StringValue", xmlValue, namespaces = ns) }, warning = function(w) {})
- if ( ! is.na(name))
- compound$setField(BIODB.NAME, name)
-
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 PubchemConn.R
--- a/PubchemConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/PubchemConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,59 +1,96 @@
-if ( ! exists('get.pubchem.compound.url')) { # Do not load again if already loaded
+#####################
+# CLASS DECLARATION #
+#####################
+
+PubchemConn <- methods::setRefClass("PubchemConn", contains = "RemotedbConn", fields = list( .db = "character" ))
+
+###############
+# CONSTRUCTOR #
+###############
+
+PubchemConn$methods( initialize = function(db = BIODB.PUBCHEMCOMP, ...) {
+ .db <<- db
+ callSuper(...)
+})
- source('RemotedbConn.R')
- source('PubchemCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- PubchemConn <- setRefClass("PubchemConn", contains = "RemotedbConn")
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
+
+PubchemConn$methods( getEntryContentType = function() {
+ return(BIODB.XML)
+})
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+PubchemConn$methods( getEntryContent = function(ids) {
+
+ # Debug
+ .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)..."))
+
+ URL.MAX.LENGTH <- 2083
- PubchemConn$methods( getEntryContentType = function(type) {
- return(BIODB.XML)
- })
+ # Initialize return values
+ content <- rep(NA_character_, length(ids))
+
+ # Loop on all
+ n <- 0
+ while (n < length(ids)) {
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- PubchemConn$methods( getEntryContent = function(type, id) {
+ # Get list of accession ids to retrieve
+ accessions <- ids[(n + 1):length(ids)]
+
+ # Create URL request
+ x <- get.entry.url(class = .self$.db, accession = accessions, content.type = BIODB.XML, max.length = URL.MAX.LENGTH)
+
+ # Debug
+ .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)..."))
- if (type == BIODB.COMPOUND) {
+ # Send request
+ xmlstr <- .self$.get.url(x$url)
- # Initialize return values
- content <- rep(NA_character_, length(id))
+ # Increase number of entries retrieved
+ n <- n + x$n
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.PUBCHEM, x, content.type = BIODB.XML)), FUN.VALUE = '')
+ # TODO When one of the id is wrong, no content is returned. Only a single error is returned, with the first faulty ID:
+#
+# PUGREST.NotFound
+# Record not found
+# No record data for CID 1246452553
+#
- return(content)
+ # Parse XML and get included XML
+ if ( ! is.na(xmlstr)) {
+ xml <- xmlInternalTreeParse(xmlstr, asText = TRUE)
+ ns <- c(pcns = "http://www.ncbi.nlm.nih.gov")
+ returned.ids <- xpathSApply(xml, paste0("//pcns:", if (.self$.db == BIODB.PUBCHEMCOMP) 'PC-CompoundType_id_cid' else 'PC-ID_id'), xmlValue, namespaces = ns)
+ content[match(returned.ids, ids)] <- vapply(getNodeSet(xml, paste0("//pcns:", if (.self$.db == BIODB.PUBCHEMCOMP) "PC-Compound" else 'PC-Substance'), namespaces = ns), saveXML, FUN.VALUE = '')
}
- return(NULL)
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- PubchemConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createPubchemCompoundFromXml(content, drop = drop) else NULL)
- })
+ # Debug
+ .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved..."))
+ }
+
+ return(content)
+})
+
+################
+# CREATE ENTRY #
+################
- #########################
- # GET PUBCHEM IMAGE URL #
- #########################
-
- get.pubchem.image.url <- function(id) {
-
- url <- paste0('http://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid=', id, '&t=l')
+PubchemConn$methods( createEntry = function(content, drop = TRUE) {
+ return(if (.self$.db == BIODB.PUBCHEMCOMP) createPubchemEntryFromXml(content, drop = drop) else createPubchemSubstanceFromXml(content, drop = drop))
+})
- return(url)
- }
-
-} # end of load safe guard
+#########################
+# GET PUBCHEM IMAGE URL #
+#########################
+
+get.pubchem.image.url <- function(id, db = BIODB.PUBCHEMCOMP) {
+
+ url <- paste0('http://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?', (if (db == BIODB.PUBCHEMCOMP) 'cid' else 'sid'), '=', id, '&t=l')
+
+ return(url)
+}
diff -r 253d531a0193 -r 20d69a062da3 PubchemEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PubchemEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,108 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+PubchemEntry <- methods::setRefClass("PubchemEntry", contains = "BiodbEntry")
+PubchemSubstance <- methods::setRefClass("PubchemSubstance", contains = "BiodbEntry")
+
+#####################
+# SUBSTANCE FACTORY #
+#####################
+
+createPubchemSubstanceFromXml <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define xpath expressions
+ xpath.expr <- character()
+ xpath.expr[[BIODB.ACCESSION]] <- "//PC-ID_id"
+ #xpath.expr[[BIODB.PUBCHEMCOMP.ID]] <- "//PC-CompoundType_id_cid" --> Apparently that can be more than one CID for a substance.
+
+ for (content in contents) {
+
+ # Create instance
+ entry <- PubchemEntry$new()
+
+ if ( ! is.null(content) && ! is.na(content)) {
+
+ # Parse XML
+ xml <- XML::xmlInternalTreeParse(content, asText = TRUE)
+
+ # Unknown entry
+ fault <- XML::xpathSApply(xml, "/Fault", XML::xmlValue)
+ if (length(fault) == 0) {
+
+ # Test generic xpath expressions
+ for (field in names(xpath.expr)) {
+ v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
+ if (length(v) > 0)
+ entry$setField(field, v)
+ }
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
+
+####################
+# COMPOUND FACTORY #
+####################
+
+createPubchemEntryFromXml <- function(contents, drop = TRUE) {
+
+ entries <- list()
+
+ # Define xpath expressions
+ xpath.expr <- character()
+ xpath.expr[[BIODB.ACCESSION]] <- "//PC-CompoundType_id_cid"
+ xpath.expr[[BIODB.INCHI]] <- "//PC-Urn_label[text()='InChI']/../../..//PC-InfoData_value_sval"
+ xpath.expr[[BIODB.INCHIKEY]] <- "//PC-Urn_label[text()='InChIKey']/../../..//PC-InfoData_value_sval"
+ xpath.expr[[BIODB.FORMULA]] <- "//PC-Urn_label[text()='Molecular Formula']/../../..//PC-InfoData_value_sval"
+ xpath.expr[[BIODB.MASS]] <- "//PC-Urn_label[text()='Mass']/../../..//PC-InfoData_value_fval"
+ xpath.expr[[BIODB.COMP.IUPAC.NAME.SYST]] <- "//PC-Urn_label[text()='IUPAC Name']/../PC-Urn_name[text()='Systematic']/../../..//PC-InfoData_value_sval"
+
+ for (content in contents) {
+
+ # Create instance
+ entry <- PubchemEntry$new()
+
+ if ( ! is.null(content) && ! is.na(content)) {
+
+ # Parse XML
+ xml <- XML::xmlInternalTreeParse(content, asText = TRUE)
+
+ # Unknown entry
+ fault <- XML::xpathSApply(xml, "/Fault", XML::xmlValue)
+ if (length(fault) == 0) {
+
+ # Test generic xpath expressions
+ for (field in names(xpath.expr)) {
+ v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
+ if (length(v) > 0)
+ entry$setField(field, v)
+ }
+ }
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 RemotedbConn.R
--- a/RemotedbConn.R Sat Sep 03 17:02:01 2016 -0400
+++ b/RemotedbConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,23 +1,23 @@
if ( ! exists('RemotedbConn')) {
- source('BiodbConn.R')
- source(file.path('UrlRequestScheduler.R'), chdir = TRUE)
-
#####################
# CLASS DECLARATION #
#####################
- RemotedbConn <- setRefClass("RemotedbConn", contains = "BiodbConn", fields = list(.scheduler = "UrlRequestScheduler"))
+ RemotedbConn <- methods::setRefClass("RemotedbConn", contains = "BiodbConn", fields = list(.scheduler = "UrlRequestScheduler", .token = "character"))
###############
# CONSTRUCTOR #
###############
- RemotedbConn$methods( initialize = function(useragent = NA_character_, scheduler = NULL, ...) {
+ RemotedbConn$methods( initialize = function(useragent = NA_character_, scheduler = NULL, token = NA_character_, ...) {
# Check useragent
( ! is.null(useragent) && ! is.na(useragent)) || stop("You must specify a valid useragent string (e.g.: \"myapp ; my.email@address\").")
+ # Set token
+ .token <<- token
+
# Set scheduler
if (is.null(scheduler))
scheduler <- UrlRequestScheduler$new(n = 3)
@@ -28,4 +28,21 @@
callSuper(...) # calls super-class initializer with remaining parameters
})
+ ###########
+ # GET URL #
+ ###########
+
+ RemotedbConn$methods( .get.url = function(url) {
+ .self$.print.debug.msg(paste0("Sending URL request '", url, "'..."))
+ return(.self$.scheduler$getUrl(url))
+ })
+
+ ###########
+ # GET URL #
+ ###########
+
+ RemotedbConn$methods( .set.useragent = function(useragent) {
+ .scheduler$setUserAgent(useragent) # set agent
+ })
+
}
diff -r 253d531a0193 -r 20d69a062da3 UniProtCompound.R
--- a/UniProtCompound.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,81 +0,0 @@
-if ( ! exists('UniprotCompound')) { # Do not load again if already loaded
-
- source('BiodbEntry.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- UniprotCompound <- setRefClass("UniprotCompound", contains = "BiodbEntry")
-
- ###########
- # FACTORY #
- ###########
-
- createUniprotCompoundFromXml <- function(contents, drop = FALSE) {
-
- library(XML)
-
- # Set XML namespace
- ns <- c(uniprot = "http://uniprot.org/uniprot")
-
- compounds <- list()
-
- # Define xpath expressions
- xpath.values <- character()
- xpath.values[[BIODB.NAME]] <- "/uniprot:uniprot/uniprot:compound/uniprot:name"
- xpath.values[[BIODB.GENE.SYMBOLS]] <- "//uniprot:gene/uniprot:name"
- xpath.values[[BIODB.FULLNAMES]] <- "//uniprot:protein//uniprot:fullName"
- xpath.values[[BIODB.SEQUENCE]] <- "//uniprot:entry/uniprot:sequence"
- xpath.values[[BIODB.ACCESSION]] <- "//uniprot:accession[1]"
- xpath.attr <- list()
- xpath.attr[[BIODB.KEGG.ID]] <- list(path = "//uniprot:dbReference[@type='KEGG']", attr = 'id')
- xpath.attr[[BIODB.NCBI.GENE.ID]] <- list(path = "//uniprot:dbReference[@type='GeneID']", attr = 'id')
- xpath.attr[[BIODB.ENZYME.ID]] <- list(path = "//uniprot:dbReference[@type='EC']", attr = 'id')
- xpath.attr[[BIODB.MASS]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'mass')
- xpath.attr[[BIODB.LENGTH]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'length')
-
- for (content in contents) {
-
- # Create instance
- compound <- HmdbCompound$new()
-
- # If the entity doesn't exist (i.e.: no .xml page), then it returns an HTML page
- if ( ! grepl("^ 0)
- compound$setField(field, v)
- }
-
- # Test attribute xpath
- for (field in names(xpath.attr)) {
- v <- xpathSApply(xml, xpath.attr[[field]]$path, xmlGetAttr, xpath.attr[[field]]$attr, namespaces = ns)
- if (length(v) > 0)
- compound$setField(field, v)
- }
-
- # Remove new lines from sequence string
- seq <- compound$getField(BIODB.SEQUENCE)
- if ( ! is.na(seq))
- compound$setField(BIODB.SEQUENCE, gsub("\\n", "", seq))
- }
-
- compounds <- c(compounds, compound)
- }
-
- # Replace elements with no accession id by NULL
- compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
-
- # If the input was a single element, then output a single object
- if (drop && length(contents) == 1)
- compounds <- compounds[[1]]
-
- return(compounds)
- }
-}
diff -r 253d531a0193 -r 20d69a062da3 UniProtConn.R
--- a/UniProtConn.R Sat Sep 03 17:02:01 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,47 +0,0 @@
-if ( ! exists('UniprotConn')) { # Do not load again if already loaded
-
- source('RemotedbConn.R')
- source('UniprotCompound.R')
-
- #####################
- # CLASS DECLARATION #
- #####################
-
- UniprotConn <- setRefClass("UniprotConn", contains = "RemotedbConn")
-
- ##########################
- # GET ENTRY CONTENT TYPE #
- ##########################
-
- UniprotConn$methods( getEntryContentType = function(type) {
- return(BIODB.XML)
- })
-
- #####################
- # GET ENTRY CONTENT #
- #####################
-
- UniprotConn$methods( getEntryContent = function(type, id) {
-
- if (type == BIODB.COMPOUND) {
-
- # Initialize return values
- content <- rep(NA_character_, length(id))
-
- # Request
- content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.UNIPROT, x, content.type = BIODB.XML)), FUN.VALUE = '')
-
- return(content)
- }
-
- return(NULL)
- })
-
- ################
- # CREATE ENTRY #
- ################
-
- UniprotConn$methods( createEntry = function(type, content, drop = TRUE) {
- return(if (type == BIODB.COMPOUND) createUniprotCompoundFromXml(content, drop = drop) else NULL)
- })
-}
diff -r 253d531a0193 -r 20d69a062da3 UniprotConn.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/UniprotConn.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,36 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+UniprotConn <- methods::setRefClass("UniprotConn", contains = "RemotedbConn")
+
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
+
+UniprotConn$methods( getEntryContentType = function() {
+ return(BIODB.XML)
+})
+
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+UniprotConn$methods( getEntryContent = function(ids) {
+
+ # Initialize return values
+ content <- rep(NA_character_, length(ids))
+
+ # Request
+ content <- vapply(ids, function(x) .self$.get.url(get.entry.url(BIODB.UNIPROT, x, content.type = BIODB.XML)), FUN.VALUE = '')
+
+ return(content)
+})
+
+################
+# CREATE ENTRY #
+################
+
+UniprotConn$methods( createEntry = function(content, drop = TRUE) {
+ return(createUniprotEntryFromXml(content, drop = drop))
+})
diff -r 253d531a0193 -r 20d69a062da3 UniprotEntry.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/UniprotEntry.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,74 @@
+#####################
+# CLASS DECLARATION #
+#####################
+
+UniprotEntry <- methods::setRefClass("UniprotEntry", contains = "BiodbEntry")
+
+###########
+# FACTORY #
+###########
+
+createUniprotEntryFromXml <- function(contents, drop = FALSE) {
+
+ # Set XML namespace
+ ns <- c(uniprot = "http://uniprot.org/uniprot")
+
+ entries <- list()
+
+ # Define xpath expressions
+ xpath.values <- character()
+ xpath.values[[BIODB.NAME]] <- "/uniprot:uniprot/uniprot:entry/uniprot:name"
+ xpath.values[[BIODB.GENE.SYMBOLS]] <- "//uniprot:gene/uniprot:name"
+ xpath.values[[BIODB.FULLNAMES]] <- "//uniprot:protein//uniprot:fullName"
+ xpath.values[[BIODB.SEQUENCE]] <- "//uniprot:entry/uniprot:sequence"
+ xpath.values[[BIODB.ACCESSION]] <- "//uniprot:accession[1]"
+ xpath.attr <- list()
+ xpath.attr[[BIODB.KEGG.ID]] <- list(path = "//uniprot:dbReference[@type='KEGG']", attr = 'id')
+ xpath.attr[[BIODB.NCBI.GENE.ID]] <- list(path = "//uniprot:dbReference[@type='GeneID']", attr = 'id')
+ xpath.attr[[BIODB.ENZYME.ID]] <- list(path = "//uniprot:dbReference[@type='EC']", attr = 'id')
+ xpath.attr[[BIODB.MASS]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'mass')
+ xpath.attr[[BIODB.LENGTH]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'length')
+
+ for (content in contents) {
+
+ # Create instance
+ entry <- UniprotEntry$new()
+
+ # If the entity doesn't exist (i.e.: no .xml page), then it returns an HTML page
+ if ( ! grepl("^ 0)
+ entry$setField(field, v)
+ }
+
+ # Test attribute xpath
+ for (field in names(xpath.attr)) {
+ v <- XML::xpathSApply(xml, xpath.attr[[field]]$path, XML::xmlGetAttr, xpath.attr[[field]]$attr, namespaces = ns)
+ if (length(v) > 0)
+ entry$setField(field, v)
+ }
+
+ # Remove new lines from sequence string
+ seq <- entry$getField(BIODB.SEQUENCE)
+ if ( ! is.na(seq))
+ entry$setField(BIODB.SEQUENCE, gsub("\\n", "", seq))
+ }
+
+ entries <- c(entries, entry)
+ }
+
+ # Replace elements with no accession id by NULL
+ entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)
+
+ # If the input was a single element, then output a single object
+ if (drop && length(contents) == 1)
+ entries <- entries[[1]]
+
+ return(entries)
+}
diff -r 253d531a0193 -r 20d69a062da3 UrlRequestScheduler.R
--- a/UrlRequestScheduler.R Sat Sep 03 17:02:01 2016 -0400
+++ b/UrlRequestScheduler.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,126 +1,135 @@
-if ( ! exists('UrlRequestScheduler')) { # Do not load again if already loaded
+#############
+# CONSTANTS #
+#############
+
+BIODB.GET <- 'GET'
+BIODB.POST <- 'POST'
+
+#####################
+# CLASS DECLARATION #
+#####################
+
+UrlRequestScheduler <- methods::setRefClass("UrlRequestScheduler", fields = list(.n = "numeric", .t = "numeric", .time.of.last.request = "ANY", .useragent = "character", .ssl.verifypeer = "logical", .nb.max.tries = "integer", .verbose = "integer"))
+
+# n: number of connections
+# t: time (in seconds)
+
+# The scheduler restrict the number of connections at n per t seconds.
+
+###############
+# CONSTRUCTOR #
+###############
+
+UrlRequestScheduler$methods( initialize = function(n = 1, t = 1, useragent = NA_character_, ssl.verifypeer = TRUE, ...) {
+ .n <<- n
+ .t <<- t
+ .time.of.last.request <<- -1
+ .useragent <<- useragent
+ .nb.max.tries <<- 10L
+ .ssl.verifypeer <<- ssl.verifypeer
+ .verbose <<- 0L
+ callSuper(...) # calls super-class initializer with remaining parameters
+})
- #############
- # CONSTANTS #
- #############
+##################
+# SET USER AGENT #
+##################
+
+UrlRequestScheduler$methods( setUserAgent = function(useragent) {
+ .useragent <<- useragent
+})
+
+###############
+# SET VERBOSE #
+###############
+
+UrlRequestScheduler$methods( setVerbose = function(verbose) {
+ .verbose <<- verbose
+})
- RLIB.GET <- 'GET'
- RLIB.POST <- 'POST'
+##################
+# WAIT AS NEEDED #
+##################
+
+# Wait the specified between two requests.
+UrlRequestScheduler$methods( .wait.as.needed = function() {
+
+ # Compute minimum waiting time between two URL requests
+ waiting_time <- .self$.t / .self$.n
+
+ # Wait, if needed, before previous URL request and this new URL request.
+ if (.self$.time.of.last.request > 0) {
+ spent_time <- Sys.time() - .self$.time.of.last.request
+ if (spent_time < waiting_time)
+ Sys.sleep(waiting_time - spent_time)
+ }
- #####################
- # CLASS DECLARATION #
- #####################
-
- UrlRequestScheduler <- setRefClass("UrlRequestScheduler", fields = list(.n = "numeric", .t = "numeric", .time.of.last.request = "ANY", .useragent = "character", .ssl.verifypeer = "logical", .nb.max.tries = "integer", .verbose = "integer"))
-
- # n: number of connections
- # t: time (in seconds)
-
- # The scheduler restrict the number of connections at n per t seconds.
-
- ###############
- # CONSTRUCTOR #
- ###############
-
- UrlRequestScheduler$methods( initialize = function(n = 1, t = 1, useragent = NA_character_, ssl.verifypeer = TRUE, ...) {
- .n <<- n
- .t <<- t
- .time.of.last.request <<- -1
- .useragent <<- useragent
- .nb.max.tries <<- 10L
- .ssl.verifypeer <<- ssl.verifypeer
- .verbose <<- 0L
- callSuper(...) # calls super-class initializer with remaining parameters
- })
-
- ##################
- # SET USER AGENT #
- ##################
-
- UrlRequestScheduler$methods( setUserAgent = function(useragent) {
- .useragent <<- useragent
- })
-
- ###############
- # SET VERBOSE #
- ###############
-
- UrlRequestScheduler$methods( setVerbose = function(verbose) {
- .verbose <<- verbose
- })
-
- ##################
- # WAIT AS NEEDED #
- ##################
-
- # Wait the specified between two requests.
- UrlRequestScheduler$methods( .wait.as.needed = function() {
-
- # Compute minimum waiting time between two URL requests
- waiting_time <- .self$.t / .self$.n
-
- # Wait, if needed, before previous URL request and this new URL request.
- if (.self$.time.of.last.request > 0) {
- spent_time <- Sys.time() - .self$.time.of.last.request
- if (spent_time < waiting_time)
- Sys.sleep(waiting_time - spent_time)
- }
-
- # Store current time
- .time.of.last.request <<- Sys.time()
- })
-
- ####################
- # GET CURL OPTIONS #
- ####################
-
- UrlRequestScheduler$methods( .get_curl_opts = function(url) {
- opts <- curlOptions(useragent = .self$.useragent, timeout.ms = 60000, verbose = FALSE)
- return(opts)
- })
-
- ###########
- # GET URL #
- ###########
-
- UrlRequestScheduler$methods( .doGetUrl = function(url, params = NULL, method = RLIB.GET) {
-
- content <- NA_character_
-
- # Use form to send URL request
- if ( ! is.null(params) && ! is.na(params))
- switch(method,
- GET = { content <- getForm(url, .opts = .self$.get_curl_opts(), .params = params) },
- POST = { content <- postForm(url, .opts = .self$.get_curl_opts(), .params = params) },
- stop(paste('Unknown method "', method, '".'))
- )
-
- # Get URL normally
- else
- content <- getURL(url, .opts = .self$.get_curl_opts(), ssl.verifypeer = .self$.ssl.verifypeer)
-
- return(content)
- })
-
- UrlRequestScheduler$methods( getUrl = function(url, params = NULL, method = RLIB.GET) {
-
- # Load library here and not inside .doGetUrl() since it is called from inside a try/catch clause, hence if library is missing the error will be ignored.
- library(bitops)
- library(RCurl)
-
- content <- NA_character_
-
- # Wait required time between two requests
- .self$.wait.as.needed()
-
- # Run query
- for (i in seq(.self$.nb.max.tries)) {
- tryCatch({ content <- .self$.doGetUrl(url, params = params, method = method) },
- error = function(e) { if (.self$.verbose > 0) print("Retry connection to server...") } )
- if ( ! is.na(content))
- break
- }
-
- return(content)
- })
-}
+ # Store current time
+ .time.of.last.request <<- Sys.time()
+})
+
+#########################
+# GET CURL OPTIONS {{{1 #
+#########################
+
+UrlRequestScheduler$methods( .get.curl.opts = function(opts = list()) {
+ opts <- RCurl::curlOptions(useragent = .self$.useragent, timeout.ms = 60000, verbose = FALSE, .opts = opts)
+ return(opts)
+})
+
+###################
+# DO GET URL {{{1 #
+###################
+
+UrlRequestScheduler$methods( .doGetUrl = function(url, params = list(), method = BIODB.GET, opts = .self$.get.curl.opts()) {
+
+ content <- NA_character_
+
+ # Use form to send URL request
+ if ( method == BIODB.POST || ( ! is.null(params) && ! is.na(params) && length(params) > 0)) {
+ switch(method,
+ GET = { content <- RCurl::getForm(url, .opts = opts, .params = params) },
+ POST = { content <- RCurl::postForm(url, .opts = opts, .params = params) },
+ stop(paste('Unknown method "', method, '".'))
+ )
+ }
+
+ # Get URL normally
+ else {
+ content <- RCurl::getURL(url, .opts = opts, ssl.verifypeer = .self$.ssl.verifypeer)
+ }
+ return(content)
+})
+
+##########################
+# SEND SOAP REQUEST {{{1 #
+##########################
+
+UrlRequestScheduler$methods( sendSoapRequest = function(url, request) {
+ header <- c(Accept="text/xml", Accept="multipart/*", 'Content-Type'="text/xml; charset=utf-8")
+ opts <- .self$.get.curl.opts(list(httpheader = header, postfields = request))
+ results <- .self$getUrl(url, method = BIODB.POST, opts = opts)
+ return(results)
+})
+
+################
+# GET URL {{{1 #
+################
+
+UrlRequestScheduler$methods( getUrl = function(url, params = list(), method = BIODB.GET, opts = .self$.get.curl.opts()) {
+
+ content <- NA_character_
+
+ # Wait required time between two requests
+ .self$.wait.as.needed()
+
+ # Run query
+ for (i in seq(.self$.nb.max.tries)) {
+ tryCatch({ content <- .self$.doGetUrl(url, params = params, method = method, opts = opts) },
+ error = function(e) { if (.self$.verbose > 0) print("Retry connection to server...") } )
+ if ( ! is.na(content))
+ break
+ }
+
+ return(content)
+})
diff -r 253d531a0193 -r 20d69a062da3 biodb-common.R
--- a/biodb-common.R Sat Sep 03 17:02:01 2016 -0400
+++ b/biodb-common.R Thu Mar 02 08:55:00 2017 -0500
@@ -1,12 +1,13 @@
-if ( ! exists('BIODB.COMPOUND')) { # Do not load again if already loaded
+if ( ! exists('BIODB.XML')) {
###############
- # ENTRY TYPES #
+ # CACHE MODES #
###############
- BIODB.COMPOUND <- 'compound'
- BIODB.SPECTRUM <- 'spectrum'
-
+ BIODB.CACHE.READ.ONLY <- 'read-only'
+ BIODB.CACHE.READ.WRITE <- 'read-write'
+ BIODB.CACHE.WRITE.ONLY <- 'write-only'
+
#######################
# ENTRY CONTENT TYPES #
#######################
@@ -16,7 +17,7 @@
BIODB.XML <- 'xml'
BIODB.CSV <- 'csv'
BIODB.DATAFRAME <- 'dataframe'
- BIODB.ANY <- 'any' # Value used when we do not care about the type.
+ BIODB.JSON <- 'json'
#############
# DATABASES #
@@ -24,7 +25,8 @@
BIODB.CHEBI <- 'chebi'
BIODB.KEGG <- 'kegg'
- BIODB.PUBCHEM <- 'pubchem'
+ BIODB.PUBCHEMCOMP <- 'pubchemcomp' # Compound database
+ BIODB.PUBCHEMSUB <- 'pubchemsub' # Substance database
BIODB.HMDB <- 'hmdb'
BIODB.CHEMSPIDER <- 'chemspider'
BIODB.ENZYME <- 'enzyme'
@@ -35,6 +37,9 @@
BIODB.UNIPROT <- 'uniprot'
BIODB.MASSBANK <- 'massbank'
BIODB.MASSFILEDB <- 'massfiledb'
+ BIODB.PEAKFOREST <- 'peakforest'
+
+ BIODB.DATABASES <- c(BIODB.CHEBI, BIODB.KEGG, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.HMDB, BIODB.CHEMSPIDER, BIODB.ENZYME, BIODB.LIPIDMAPS, BIODB.MIRBASE, BIODB.NCBIGENE, BIODB.NCBICCDS, BIODB.UNIPROT, BIODB.MASSBANK, BIODB.MASSFILEDB, BIODB.PEAKFOREST)
##########
# FIELDS #
@@ -44,6 +49,11 @@
BIODB.DESCRIPTION <- 'description'
BIODB.PROTEIN.DESCRIPTION <- 'protdesc'
BIODB.NAME <- 'name'
+ BIODB.COMP.IUPAC.NAME.ALLOWED <- 'comp.iupac.name.allowed'
+ BIODB.COMP.IUPAC.NAME.TRAD <- 'comp.iupac.name.trad'
+ BIODB.COMP.IUPAC.NAME.SYST <- 'comp.iupac.name.syst'
+ BIODB.COMP.IUPAC.NAME.PREF <- 'comp.iupac.name.pref'
+ BIODB.COMP.IUPAC.NAME.CAS <- 'comp.iupac.name.cas'
BIODB.FULLNAMES <- 'fullnames'
BIODB.SYNONYMS <- 'synonyms'
BIODB.SYMBOL <- 'symbol'
@@ -55,8 +65,13 @@
BIODB.ENZYME.ID <- 'enzymeid'
BIODB.NCBI.CCDS.ID <- 'ncbiccdsid'
BIODB.NCBI.GENE.ID <- 'ncbigeneid'
- BIODB.PUBCHEM.ID <- 'pubchemid'
+ BIODB.PUBCHEMCOMP.ID <- 'pubchemcompid'
+ BIODB.PUBCHEMSUB.ID <- 'pubchemsubid'
+ BIODB.CHEMSPIDER.ID <- 'chemspiderid'
BIODB.UNIPROT.ID <- 'uniprotid'
+ BIODB.CAS.ID <- 'casid'
+ BIODB.PEAKFOREST.ID <- 'peakforestid'
+ BIODB.SMILES <- 'smiles'
BIODB.INCHI <- 'inchi'
BIODB.INCHIKEY <- 'inchikey'
BIODB.MSDEV <- 'msdev'
@@ -75,70 +90,134 @@
BIODB.LENGTH <- 'length'
BIODB.NB.PEAKS <- 'nbpeaks'
BIODB.PEAKS <- 'peaks'
+ BIODB.COMPOUNDS <- 'compounds'
+ BIODB.NB.COMPOUNDS <- 'nbcompounds'
BIODB.COMPOUND.ID <- 'compoundid'
- BIODB.PEAK.MZ <- 'peakmz'
- BIODB.PEAK.COMP <- 'peakcomp' # Peak composition
- BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution
+ BIODB.COMPOUND.MASS <- 'compoundmass'
+ BIODB.COMPOUND.COMP <- 'compoundcomp'
BIODB.CHROM.COL <- 'chromcol' # Chromatographic column
BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column
BIODB.ID <- 'id'
BIODB.TITLE <- 'title'
+ BIODB.PEAK.MZ <- 'mz'
+ BIODB.PEAK.RT <- 'rt'
+ BIODB.PEAK.MZEXP <- 'mzexp'
+ BIODB.PEAK.MZTHEO <- 'mztheo'
+ BIODB.PEAK.FORMULA <- 'formula'
+ BIODB.PEAK.FORMULA.COUNT <- 'formula.count'
+ BIODB.PEAK.COMP <- 'peakcomp' # Peak composition
+ BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution
+ BIODB.PEAK.MASS <- 'mass'
+# BIODB.PEAK.ATTR <- 'attr'
+ BIODB.PEAK.ERROR.PPM <- 'error.ppm'
+ BIODB.PEAK.INTENSITY <- 'intensity'
+ BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity'
# Mode values
BIODB.MSMODE.NEG <- 'neg'
BIODB.MSMODE.POS <- 'pos'
+ # Tolerance values
+ BIODB.TOL <- 'mztol'
+ BIODB.MZTOLUNIT.PPM <- 'ppm'
+ BIODB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio
+ BIODB.MZTOLUNIT.VALS <- c(BIODB.MZTOLUNIT.PPM, BIODB.MZTOLUNIT.PLAIN)
+
+ ########################
+ # MS-MS MEASURE VALUES #
+ ########################
+
+ BIODB.MSMS.DIST.COS <- "cosine"
+ BIODB.MSMS.DIST.WCOSINE <- "wcosine"
+ BIODB.MSMS.DIST.PKERNEL <- "pkernel"
+ BIODB.MSMS.DIST <- c(BIODB.MSMS.DIST.COS, BIODB.MSMS.DIST.WCOSINE, BIODB.MSMS.DIST.PKERNEL)
+
+
#################
# CARDINALITIES #
#################
BIODB.CARD.ONE <- '1'
BIODB.CARD.MANY <- '*'
+
+ #####################
+ #INTENSITy NOTATIONS#
+ #####################
+
+ BIODB.GROUP.INTENSITY<-c(BIODB.PEAK.INTENSITY,BIODB.PEAK.RELATIVE.INTENSITY)
##########################
# ENTRY FIELD ATTRIBUTES #
##########################
-
+ # FIELD NAME CLASS CARDINALITY TYPE
BIODB.FIELDS <- data.frame(matrix(c(
- # FIELD NAME CLASS CARDINALITY
- BIODB.COMPOUND, 'BiodEntry', BIODB.CARD.ONE,
- BIODB.ACCESSION, 'character', BIODB.CARD.ONE,
- BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE,
- BIODB.NAME, 'character', BIODB.CARD.ONE,
- BIODB.FULLNAMES, 'character', BIODB.CARD.MANY,
- BIODB.SYNONYMS, 'character', BIODB.CARD.MANY,
- BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE,
- BIODB.SYMBOL, 'character', BIODB.CARD.ONE,
- BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY,
- BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE,
- BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE,
- BIODB.KEGG.ID, 'character', BIODB.CARD.ONE,
- BIODB.HMDB.ID, 'character', BIODB.CARD.ONE,
- BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE,
- BIODB.PUBCHEM.ID, 'character', BIODB.CARD.ONE,
- BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE,
- BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE,
- BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE,
- BIODB.INCHI, 'character', BIODB.CARD.ONE,
- BIODB.INCHIKEY, 'character', BIODB.CARD.ONE,
- BIODB.MSDEV, 'character', BIODB.CARD.ONE,
- BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE,
- BIODB.MSTYPE, 'character', BIODB.CARD.ONE,
- BIODB.MSMODE, 'character', BIODB.CARD.ONE,
- BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE,
- BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE,
- BIODB.FORMULA, 'character', BIODB.CARD.ONE,
- BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE,
- BIODB.MASS, 'double', BIODB.CARD.ONE,
- BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE,
- BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE,
- BIODB.SEQUENCE, 'character', BIODB.CARD.ONE,
- BIODB.LENGTH, 'integer', BIODB.CARD.ONE,
- BIODB.LOCATION, 'character', BIODB.CARD.ONE,
- BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE,
- BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE
- ), byrow = TRUE, ncol = 3), stringsAsFactors = FALSE)
- colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality')
+ BIODB.ACCESSION, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.NAME, 'character', BIODB.CARD.ONE, 'name',
+ BIODB.COMP.IUPAC.NAME.ALLOWED, 'character', BIODB.CARD.ONE, 'name',
+ BIODB.COMP.IUPAC.NAME.TRAD, 'character', BIODB.CARD.ONE, 'name',
+ BIODB.COMP.IUPAC.NAME.SYST, 'character', BIODB.CARD.ONE, 'name',
+ BIODB.COMP.IUPAC.NAME.PREF, 'character', BIODB.CARD.ONE, 'name',
+ BIODB.COMP.IUPAC.NAME.CAS, 'character', BIODB.CARD.ONE, 'name',
+ BIODB.FULLNAMES, 'character', BIODB.CARD.MANY, 'name',
+ BIODB.SYNONYMS, 'character', BIODB.CARD.MANY, 'name',
+ BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.SYMBOL, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY, 'none',
+ BIODB.NB.COMPOUNDS, 'integer', BIODB.CARD.ONE, 'none',
+ BIODB.COMPOUNDS, 'object', BIODB.CARD.MANY, 'none',
+ BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.KEGG.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.HMDB.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.PUBCHEMCOMP.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.PUBCHEMSUB.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.PEAKFOREST.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.INCHI, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.INCHIKEY, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.MSDEV, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.MSTYPE, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.MSMODE, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE, 'none',
+ BIODB.PEAK.MZTHEO, 'double', BIODB.CARD.ONE, 'none',
+ BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.FORMULA, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.MASS, 'double', BIODB.CARD.ONE, 'none',
+ BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE, 'none',
+ BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE, 'none',
+ BIODB.SEQUENCE, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.LENGTH, 'integer', BIODB.CARD.ONE, 'none',
+ BIODB.LOCATION, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE, 'none',
+ BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE, 'none',
+ BIODB.SMILES, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.CHEMSPIDER.ID, 'character', BIODB.CARD.ONE, 'none',
+ BIODB.CAS.ID, 'character', BIODB.CARD.ONE, 'none'
+ ), byrow = TRUE, ncol = 4), stringsAsFactors = FALSE)
+ colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality', 'type')
+
+ #########################
+ # GET DATABASE ID FIELD #
+ #########################
+
+ biodb.get.database.id.field <- function(database) {
+
+ id.field <- NA_character_
+
+ if (database %in% BIODB.DATABASES) {
+ id.field <- paste0(database, 'id')
+ if ( ! id.field %in% BIODB.FIELDS[['name']])
+ stop(paste0('No ID field defined for database ', database, '.'))
+ }
+
+ return(id.field)
+ }
#####################
# COMPUTABLE FIELDS #
@@ -153,15 +232,6 @@
# PEAKS DATA FRAME #
####################
- # Columns
- BIODB.PEAK.MZ <- 'mz'
- BIODB.PEAK.FORMULA <- 'formula'
- BIODB.PEAK.FORMULA.COUNT <- 'formula.count'
- BIODB.PEAK.MASS <- 'mass'
- BIODB.PEAK.ERROR.PPM <- 'error.ppm'
- BIODB.PEAK.INTENSITY <- 'intensity'
- BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity'
-
# Example
BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE)
colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)
@@ -171,55 +241,64 @@
#################
# TODO Let the choice to use either jp or eu
- BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/getRecordInfo"
- BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/getRecordInfo"
+ BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/"
+ BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/"
- .do.get.entry.url <- function(class, accession, content.type = BIODB.ANY) {
+ .do.get.entry.url <- function(class, accession, content.type = BIODB.HTML, base.url = NA_character_, token = NA_character_) {
- # TODO Only Massbank can handle multiple accession ids
- if (class != 'massbank' && length(accession) > 1)
+ # Only certain databases can handle multiple accession ids
+ if ( ! class %in% c(BIODB.MASSBANK, BIODB.CHEMSPIDER, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.PEAKFOREST) && length(accession) > 1)
stop(paste0("Cannot build a URL for getting multiple entries for class ", class, "."))
+ # Get URL
url <- switch(class,
- chebi = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL,
- chemspider = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html') else NULL,
- enzyme = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL,
+ chebi = if (content.type == BIODB.HTML) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL,
+ chemspider = {
+ token.param <- if (is.na(token)) '' else paste('&token', token, sep = '=')
+ switch(content.type,
+ html = paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html'),
+ xml = paste0('http://www.chemspider.com/MassSpecAPI.asmx/GetExtendedCompoundInfoArray?', paste(paste0('CSIDs=', accession), collapse = '&'), token.param),
+ NULL)
+ },
+ enzyme = if (content.type == BIODB.TXT) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL,
hmdb = switch(content.type,
xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'),
html = paste0('http://www.hmdb.ca/metabolites/', accession),
- any = paste0('http://www.hmdb.ca/metabolites/', accession),
NULL),
kegg = switch(content.type,
txt = paste0('http://rest.kegg.jp/get/', accession),
html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession),
- any = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession),
+ NULL),
+ lipidmaps = if (content.type == BIODB.CSV) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL,
+ massbank = if (content.type == BIODB.TXT) paste0((if (is.na(base.url)) BIODB.MASSBANK.EU.WS.URL else base.url), 'getRecordInfo?ids=', paste(accession, collapse = ',')) else NULL,
+ mirbase = if (content.type == BIODB.HTML) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL,
+ pubchemcomp = switch(content.type,
+ xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/', paste(accession, collapse = ','), '/XML'),
+ html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession),
NULL),
- lipidmaps = if (content.type %in% c(BIODB.ANY, BIODB.CSV)) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL,
- massbank = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0(BIODB.MASSBANK.EU.WS.URL, '?ids=', paste(accession, collapse = ',')) else NULL,
- mirbase = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL,
- pubchem = {
- accession <- gsub(' ', '', accession, perl = TRUE)
- accession <- gsub('^CID', '', accession, perl = TRUE)
- switch(content.type,
- xml = paste0('http://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/', accession, '/XML/?response_type=save&response_basename=CID_', accession),
- html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession),
- NULL)
- },
- ncbigene = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL,
- ncbiccds = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession),
- uniprot = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'),
+ pubchemsub = switch(content.type,
+ xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/', paste(accession, collapse = ','), '/XML'),
+ html = paste0('http://pubchem.ncbi.nlm.nih.gov/substance/', accession),
+ NULL),
+ ncbigene = if (content.type == BIODB.XML) paste0('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL,
+ ncbiccds = if (content.type == BIODB.HTML) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession),
+ uniprot = if (content.type == BIODB.XML) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'),
+ peakforest = switch(content.type,
+ html= paste0('https://peakforest.org/home?PFs=',accession),
+ json= paste0('https://peakforest-alpha.inra.fr/rest/spectra/lcms/ids/',paste(accession,sep=','),'?token=',token),
+
NULL
)
-
+ )
return(url)
}
- get.entry.url <- function(class, accession, content.type = BIODB.ANY, max.length = 0) {
+ get.entry.url <- function(class, accession, content.type = BIODB.HTML, max.length = 0, base.url = NA_character_, token = NA_character_) {
if (length(accession) == 0)
return(NULL)
- full.url <- .do.get.entry.url(class, accession, content.type = content.type)
+ full.url <- .do.get.entry.url(class, accession, content.type = content.type, base.url = base.url, token = token)
if (max.length == 0 || nchar(full.url) <= max.length)
return(if (max.length == 0) full.url else list(url = full.url, n = length(accession)))
@@ -228,13 +307,13 @@
b <- length(accession)
while (a < b) {
m <- as.integer((a + b) / 2)
- url <- .do.get.entry.url(class, accession[1:m], content.type = content.type)
+ url <- .do.get.entry.url(class, accession[1:m], content.type = content.type, base.url = base.url, token = token)
if (nchar(url) <= max.length && m != a)
a <- m
else
b <- m
}
- url <- .do.get.entry.url(class, accession[1:a], content.type = content.type)
+ url <- .do.get.entry.url(class, accession[1:a], content.type = content.type, base.url = base.url, token = token)
return(list( url = url, n = a))
}
@@ -250,4 +329,22 @@
cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr())
}
+ #####################
+ # BIODB GET ENV VAR #
+ #####################
+
+ .biodb.get.env.var <- function(v) {
+
+ # Get all env vars
+ env <- Sys.getenv()
+
+ # Make env var name
+ env.var <- paste(c('BIODB', toupper(v)), collapse = '_')
+
+ # Look if this env var exists
+ if (env.var %in% names(env))
+ return(env[[env.var]])
+
+ return(NA_character_)
+ }
}
diff -r 253d531a0193 -r 20d69a062da3 biodb-package.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/biodb-package.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,4 @@
+#' @useDynLib biodb
+#' @importFrom methods new
+#' @importFrom methods getGeneric
+NULL
diff -r 253d531a0193 -r 20d69a062da3 htmlhlp.R
--- a/htmlhlp.R Sat Sep 03 17:02:01 2016 -0400
+++ b/htmlhlp.R Thu Mar 02 08:55:00 2017 -0500
@@ -41,12 +41,14 @@
# WRITE TAG #
#############
- HtmlWriter$methods( writeTag = function(tag, text = NA_character_, indent = NA_integer_, newline = TRUE) {
+ HtmlWriter$methods( writeTag = function(tag, attr = NA_character_, text = NA_character_, indent = NA_integer_, newline = TRUE) {
- if (is.na(text))
- .self$write(paste0("<", tag, "/>"), indent = indent, newline = newline, escape = FALSE)
+ if (is.na(text)) {
+ attributes <- if (is.na(attr)) '' else paste0(' ', paste(vapply(names(attr), function(a) paste0(a, '="', attr[[a]], '"'), FUN.VALUE=''), collapse = ' '))
+ .self$write(paste0("<", tag, attributes, "/>"), indent = indent, newline = newline, escape = FALSE)
+ }
else {
- .self$writeBegTag(tag, indent = indent, newline = FALSE)
+ .self$writeBegTag(tag, attr = attr, indent = indent, newline = FALSE)
.self$write(text, escape = TRUE , indent = 0, newline = FALSE)
.self$writeEndTag(tag, indent = 0, newline = newline)
}
@@ -56,10 +58,11 @@
# WRITE BEGIN TAG #
###################
- HtmlWriter$methods( writeBegTag = function(tag, indent = NA_integer_, newline = TRUE) {
+ HtmlWriter$methods( writeBegTag = function(tag, attr = NA_character_, indent = NA_integer_, newline = TRUE) {
# Write opening tag
- .self$write(paste0("<", tag, ">"), indent = indent, newline = newline, escape = FALSE)
+ attributes <- if (is.na(attr)) '' else paste0(' ', paste(vapply(names(attr), function(a) paste0(a, '="', attr[[a]], '"'), FUN.VALUE=''), collapse = ' '))
+ .self$write(paste0("<", tag, attributes, ">"), indent = indent, newline = newline, escape = FALSE)
# Increment auto-indent
if ( ! is.na(.self$.auto.indent))
@@ -92,7 +95,7 @@
if ( ! is.null(colnames(x))) {
.self$writeBegTag('tr', indent = indent + 1, newline = newline)
for (field in colnames(x))
- .self$writeTag('th', field, indent = indent + 2, newline = newline)
+ .self$writeTag('th', text = field, indent = indent + 2, newline = newline)
.self$writeEndTag('tr', indent = indent + 1, newline = newline)
}
@@ -101,7 +104,7 @@
for (i in 1:nrow(x)) {
.self$writeBegTag('tr', indent = indent + 1, newline = newline)
for (j in 1:ncol(x))
- .self$writeTag('td', x[i, j], indent = indent + 2, newline = newline)
+ .self$writeTag('td', text = x[i, j], indent = indent + 2, newline = newline)
.self$writeEndTag('tr', indent = indent + 1, newline = newline)
}
.self$writeEndTag('table', indent = indent, newline = newline)
diff -r 253d531a0193 -r 20d69a062da3 lcmsmatching.xml
--- a/lcmsmatching.xml Sat Sep 03 17:02:01 2016 -0400
+++ b/lcmsmatching.xml Thu Mar 02 08:55:00 2017 -0500
@@ -1,4 +1,4 @@
-
+
Annotation of MS peaks using matching on a spectra database.
@@ -91,7 +91,7 @@
-
+
@@ -178,15 +178,15 @@
-
-
+
+
-
+
diff -r 253d531a0193 -r 20d69a062da3 list-chrom-cols.py
--- a/list-chrom-cols.py Sat Sep 03 17:02:01 2016 -0400
+++ b/list-chrom-cols.py Thu Mar 02 08:55:00 2017 -0500
@@ -24,24 +24,25 @@
elif dbtype == 'inhouse':
# Get field for chromatographic column name
- col_field = 'col'
+ col_field = 'chromcol'
if dbfields is not None:
fields = dict(u.split("=") for u in dbfields.split(","))
- if 'col' in fields:
- col_field = fields['col']
+ if 'chromcol' in fields:
+ col_field = fields['chromcol']
# Get all column names from file
- with open(dburl.get_file_name(), 'rb') as dbfile:
+ with open(dburl if isinstance(dburl, str) else dburl.get_file_name(), 'rb') as dbfile:
reader = csv.reader(dbfile, delimiter = "\t", quotechar='"')
header = reader.next()
- i = header.index(col_field)
- allcols = []
- for row in reader:
- col = row[i]
- if col not in allcols:
- allcols.append(col)
- for i, c in enumerate(allcols):
- cols.append( (c, c, i == 0) )
+ if col_field in header:
+ i = header.index(col_field)
+ allcols = []
+ for row in reader:
+ col = row[i]
+ if col not in allcols:
+ allcols.append(col)
+ for i, c in enumerate(allcols):
+ cols.append( (c, c, i == 0) )
return cols
diff -r 253d531a0193 -r 20d69a062da3 massdb-helper.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/massdb-helper.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,92 @@
+simplifySpectrum <- function(spec) {
+ if(length(spec) == 0){
+ return(NA_real_)
+ }
+ #print(spec)
+ if (nrow(spec) == 0)
+ return(NA_real_)
+ if (ncol(spec) != 2) {
+ spec[, BIODB.PEAK.MZ]
+ mint <- BIODB.GROUP.INTENSITY %in% colnames(spec)
+ pint <- which(mint[1])
+ if (length(pint) == 0)
+ stop(
+ "No intensity column founds, if there is more than 2 column, columns should be named",
+ paste0(BIODB.GROUP.INTENSITY, collapse = ", ")
+ )
+ spec <- spec[, c(BIODB.PEAK.MZ, BIODB.GROUP.INTENSITY[pint[1]])]
+ ###Normalizing the intenities.
+ }
+ spec[, 2] <- as.numeric(spec[, 2]) * 100 / max(as.numeric(spec[, 2]))
+ colnames(spec) <- c(BIODB.PEAK.MZ, BIODB.PEAK.RELATIVE.INTENSITY)
+ spec
+}
+
+
+
+calcDistance <-
+ function(spec1 ,
+ spec2,
+ npmin = 2,
+ fun = c("wcosine"),
+ params = list()) {
+ #fun <- match.arg(fun)
+
+ #SPec are always notmlized in pourcentage toa voir issues;
+ spec1 <- simplifySpectrum(spec1)
+ spec2 <- simplifySpectrum(spec2)
+ if(is.na(spec1)||is.na(spec2)) return(list(matched=numeric(0),similarity=0))
+ params$mz1 <- as.numeric(spec1[, BIODB.PEAK.MZ])
+ params$mz2 <- as.numeric(spec2[, BIODB.PEAK.MZ])
+ params$int1 <- as.numeric(spec1[, BIODB.PEAK.RELATIVE.INTENSITY])
+ params$int2 <- as.numeric(spec2[, BIODB.PEAK.RELATIVE.INTENSITY])
+ res <- do.call(fun, args = params)
+ if (sum(res$matched != -1) < npmin)
+ return(list(matched = res$matched, similarity = 0))
+ list(matched = res$matched,
+ similarity = res$measure)
+ }
+
+
+
+###The returned sim list is not ordered
+compareSpectra <-
+ function(spec,
+ libspec,
+ npmin = 2,
+ fun = BIODB.MSMS.DIST.WCOSINE,
+ params = list(),
+ decreasing = TRUE) {
+ #fun <- match.arg(fun)
+ if (length(libspec) == 0) {
+ return(NULL)
+ }
+ if (nrow(spec) == 0) {
+ return(NULL)
+ }
+
+ ####spec is directly normalized.
+ vall <-
+ sapply(
+ libspec,
+ calcDistance,
+ spec1 = spec,
+ params = params,
+ fun = fun,
+ simplify = FALSE
+ )
+ ####the list is ordered with the chosen metric.
+ sim <-
+ vapply(vall,
+ '[[',
+ i = "similarity",
+ FUN.VALUE = ifelse(decreasing, 0, 1))
+ osim <- order(sim, decreasing = decreasing)
+ matched <- sapply(vall, '[[', i = "matched", simplify = FALSE)
+
+ return(list(
+ ord = osim,
+ matched = matched,
+ similarity = sim
+ ))
+ }
diff -r 253d531a0193 -r 20d69a062da3 msdb-common.R
--- a/msdb-common.R Sat Sep 03 17:02:01 2016 -0400
+++ b/msdb-common.R Thu Mar 02 08:55:00 2017 -0500
@@ -2,38 +2,40 @@
library('stringr')
source('strhlp.R', chdir = TRUE)
+ source('biodb-common.R', chdir = TRUE)
#############
# CONSTANTS #
#############
# Field tags
- MSDB.TAG.MZ <- 'mz'
- MSDB.TAG.MZEXP <- 'mzexp'
- MSDB.TAG.MZTHEO <- 'mztheo'
- MSDB.TAG.RT <- 'rt'
- MSDB.TAG.MODE <- 'mode'
- MSDB.TAG.MOLID <- 'molid'
- MSDB.TAG.COL <- 'col'
- MSDB.TAG.COLRT <- 'colrt'
- MSDB.TAG.ATTR <- 'attr'
- MSDB.TAG.INT <- 'int' # Absolute intensity
- MSDB.TAG.REL <- 'rel' # Relative intensity
- MSDB.TAG.COMP <- 'comp'
- MSDB.TAG.MOLNAMES <- 'molnames'
- MSDB.TAG.MOLCOMP <- 'molcomp'
- MSDB.TAG.MOLATTR <- 'molattr'
- MSDB.TAG.MOLMASS <- 'molmass'
- MSDB.TAG.INCHI <- 'inchi'
- MSDB.TAG.INCHIKEY <- 'inchikey'
- MSDB.TAG.PUBCHEM <- 'pubchem'
- MSDB.TAG.CHEBI <- 'chebi'
- MSDB.TAG.HMDB <- 'hmdb'
- MSDB.TAG.KEGG <- 'kegg'
+ MSDB.TAG.MZ <- BIODB.PEAK.MZ
+ MSDB.TAG.MZEXP <- BIODB.PEAK.MZEXP
+ MSDB.TAG.MZTHEO <- BIODB.PEAK.MZTHEO
+ MSDB.TAG.RT <- BIODB.PEAK.RT
+ MSDB.TAG.MODE <- BIODB.MSMODE
+ MSDB.TAG.MOLID <- BIODB.COMPOUND.ID
+ MSDB.TAG.COL <- BIODB.CHROM.COL
+ MSDB.TAG.COLRT <- BIODB.CHROM.COL.RT
+ MSDB.TAG.ATTR <- BIODB.PEAK.ATTR
+ MSDB.TAG.INT <- BIODB.PEAK.INTENSITY
+ MSDB.TAG.REL <- BIODB.PEAK.RELATIVE.INTENSITY
+ MSDB.TAG.COMP <- BIODB.PEAK.COMP
+ MSDB.TAG.MOLNAMES <- BIODB.FULLNAMES
+ MSDB.TAG.MOLCOMP <- BIODB.COMPOUND.MASS
+# MSDB.TAG.MOLATTR <- 'molattr'
+ MSDB.TAG.MOLMASS <- BIODB.COMPOUND.COMP
+ MSDB.TAG.INCHI <- BIODB.INCHI
+ MSDB.TAG.INCHIKEY <- BIODB.INCHIKEY
+ # TODO Use BIODB tags.
+ MSDB.TAG.PUBCHEM <- BIODB.PUBCHEMCOMP.ID
+ MSDB.TAG.CHEBI <- BIODB.CHEBI.ID
+ MSDB.TAG.HMDB <- BIODB.HMDB.ID
+ MSDB.TAG.KEGG <- BIODB.KEGG.ID
# Mode tags
- MSDB.TAG.POS <- 'ms.pos'
- MSDB.TAG.NEG <- 'ms.neg'
+ MSDB.TAG.POS <- BIODB.MSMODE.NEG
+ MSDB.TAG.NEG <- BIODB.MSMODE.POS
# Fields containing multiple values
MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES)
diff -r 253d531a0193 -r 20d69a062da3 search-mz
--- a/search-mz Sat Sep 03 17:02:01 2016 -0400
+++ b/search-mz Thu Mar 02 08:55:00 2017 -0500
@@ -49,10 +49,11 @@
MSDB.DFT[['molids-sep']] <- MSDB.DFT.MATCH.SEP
MSDB.DFT[['db-fields']] <- concat.kv.list(msdb.get.dft.db.fields())
MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES)
-MSDB.DFT[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields())
-MSDB.DFT[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields())
MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',')
MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',')
+DEFAULT.ARG.VALUES <- MSDB.DFT
+DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields())
+DEFAULT.ARG.VALUES[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields())
##############
# PRINT HELP #
@@ -108,16 +109,26 @@
opt$rtcol <- strsplit(opt$rtcol, ',')[[1]]
# Parse input column names
- if ( ! is.null(opt[['input-col-names']])) {
+ if (is.null(opt[['input-col-names']])) {
+ opt[['input-col-names']] <- msdb.get.dft.input.fields()
+ }
+ else {
custcols <- split.kv.list(opt[['input-col-names']])
- dftcols <- split.kv.list(MSDB.DFT[['input-col-names']])
+ dftcols <- msdb.get.dft.input.fields()
opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)])
}
# Parse output column names
- if ( ! is.null(opt[['output-col-names']])) {
+ if (is.null(opt[['output-col-names']])) {
+ # By default keep input col names for output
+ opt[['output-col-names']] <- msdb.get.dft.output.fields()
+ input.cols <- names(opt[['input-col-names']])
+ output.cols <- names(opt[['output-col-names']])
+ opt[['output-col-names']] <- c(opt[['input-col-names']][input.cols %in% output.cols], opt[['output-col-names']][ ! output.cols %in% input.cols])
+ }
+ else {
custcols <- split.kv.list(opt[['output-col-names']])
- dftcols <- split.kv.list(MSDB.DFT[['output-col-names']])
+ dftcols <- msdb.get.dft.output.fields()
opt[['output-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)])
}
@@ -136,7 +147,7 @@
print.dft.arg.val <- function(opt) {
- print.flags <- MSDB.DFT
+ print.flags <- DEFAULT.ARG.VALUES
names(print.flags) <- vapply(names(print.flags), function(x) paste0('print-', x), FUN.VALUE = '')
for (f in names(print.flags))
if ( ! is.null(opt[[f]])) {
@@ -149,7 +160,7 @@
spec <- character()
- for (f in names(MSDB.DFT))
+ for (f in names(DEFAULT.ARG.VALUES))
spec <- c(spec, paste0('print-', f), NA_character_, 0, 'logical', paste0('Print default value of --', f))
return(spec)
@@ -184,8 +195,8 @@
'precursor-rt-tol', NA_character_, 1, 'numeric', paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'),
'pos-prec', NA_character_, 1, 'character', paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'),
'neg-prec', NA_character_, 1, 'character', paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".'),
- 'input-col-names', NA_character_, 1, 'character', paste0('Set the input column names. Default is "', MSDB.DFT[['input-col-names']], '".'),
- 'output-col-names', NA_character_, 1, 'character', paste0('Set the output column names. Default is "', MSDB.DFT[['output-col-names']], '".'),
+ 'input-col-names', NA_character_, 1, 'character', paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".'),
+ 'output-col-names', NA_character_, 1, 'character', paste0('Set the output column names. Default is "', DEFAULT.ARG.VALUES[['output-col-names']], '".'),
'molids-sep', NA_character_, 1, 'character', paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'),
'first-val', NA_character_, 0, 'logical', 'Keep only the first value in multi-value fields. Unset by default.',
'excel2011comp', NA_character_, 0, 'logical', 'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.',
@@ -386,17 +397,29 @@
output.html <- function(db, main, peaks, file, opt, output.fields) {
# Replace public database IDs by URLs
- if ( ! is.null(peaks))
+ if ( ! is.null(peaks) || ! is.null(main)) {
+ # Conversion from extdb id field to extdb name
+ extdb2classdb = list()
+ extdb2classdb[MSDB.TAG.KEGG] = BIODB.KEGG
+ extdb2classdb[MSDB.TAG.HMDB] = BIODB.HMDB
+ extdb2classdb[MSDB.TAG.CHEBI] = BIODB.CHEBI
+ extdb2classdb[MSDB.TAG.PUBCHEM] = BIODB.PUBCHEMCOMP
+
+ # Loop on all dbs
for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) {
field <- output.fields[[extdb]]
- if (field %in% colnames(peaks))
- peaks[[field]] <- vapply(peaks[[field]], function(id) paste0('', id, ''), FUN.VALUE = '')
+ if ( ! is.null(peaks) && field %in% colnames(peaks))
+ peaks[[field]] <- vapply(peaks[[field]], function(id) if (is.na(id)) '' else paste0('', id, ''), FUN.VALUE = '')
+ if ( ! is.null(main) && field %in% colnames(main))
+ main[[field]] <- vapply(main[[field]], function(ids) if (is.na(ids) || nchar(ids) == 0) '' else paste(vapply(strsplit(ids, opt[['molids-sep']])[[1]], function(id) paste0('', id, ''), FUN.VALUE = ''), collapse = opt[['molids-sep']]), FUN.VALUE = '')
}
+ }
# Write HTML
html <- HtmlWriter(file = file)
html$writeBegTag('html')
html$writeBegTag('header')
+ html$writeTag('meta', attr = c(charset = "UTF-8"))
html$writeTag('title', text = "LC/MS matching results")
html$writeBegTag('style')
html$write('table, th, td { border-collapse: collapse; }')
@@ -414,20 +437,20 @@
# Write parameters
html$writeTag('h2', text = "Parameters")
html$writeBegTag('ul')
- html$writeTag('li', paste0("Mode = ", opt$mode, "."))
- html$writeTag('li', paste0("M/Z precision = ", opt$mzprec, "."))
- html$writeTag('li', paste0("M/Z shift = ", opt$mzshift, "."))
- html$writeTag('li', paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), "."))
+ html$writeTag('li', text = paste0("Mode = ", opt$mode, "."))
+ html$writeTag('li', text = paste0("M/Z precision = ", opt$mzprec, "."))
+ html$writeTag('li', text = paste0("M/Z shift = ", opt$mzshift, "."))
+ html$writeTag('li', text = paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), "."))
if ( ! is.null(opt[['precursor-match']])) {
- html$writeTag('li', paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), "."))
- html$writeTag('li', paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), "."))
+ html$writeTag('li', text = paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), "."))
+ html$writeTag('li', text = paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), "."))
}
if ( ! is.null(opt$rtcol)) {
- html$writeTag('li', paste0("Columns = ", paste(opt$rtcol, collapse = ", "), "."))
- html$writeTag('li', paste0("RTX = ", opt$rttolx, "."))
- html$writeTag('li', paste0("RTY = ", opt$rttoly, "."))
+ html$writeTag('li', text = paste0("Columns = ", paste(opt$rtcol, collapse = ", "), "."))
+ html$writeTag('li', text = paste0("RTX = ", opt$rttolx, "."))
+ html$writeTag('li', text = paste0("RTY = ", opt$rttoly, "."))
if ( ! is.null(opt[['precursor-match']]))
- html$writeTag('li', paste0("RTZ = ", opt[['precursor-rt-tol']], "."))
+ html$writeTag('li', text = paste0("RTZ = ", opt[['precursor-rt-tol']], "."))
}
html$writeEndTag('ul')
@@ -480,7 +503,7 @@
if (file.info(opt[['input-file']])$size > 0) {
# Load file into data frame
- input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t")
+ input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE)
# Convert each column that is identified by a number into a name
for (field in names(opt[['input-col-names']])) {
@@ -533,6 +556,8 @@
db$searchForMzRtList(mode = mode, shift = opt$mzshift, prec = opt$mzprec, rt.tol = opt$rttol, rt.tol.x = opt$rttolx, rt.tol.y = opt$rttoly, col = opt$rtcol, precursor.match = ! is.null(opt[['precursor-match']]), precursor.rt.tol = opt[['precursor-rt-tol']])
# Write output
+main.output$moveColumnsToBeginning(colnames(input))
+peaks.output$moveColumnsToBeginning(colnames(input))
# TODO Create a class MsDbOutputCsvFileStream
df.write.tsv(main.output$getDataFrame(), file = opt[['output-file']], row.names = FALSE)
if ( ! is.null(opt[['peak-output-file']]))
diff -r 253d531a0193 -r 20d69a062da3 spec-dist.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/spec-dist.R Thu Mar 02 08:55:00 2017 -0500
@@ -0,0 +1,196 @@
+#dyn.load('src/closeMatchPpm.dll')
+# commented out for refactoring as package
+#dyn.load('src/closeMatchPpm.so')
+
+matchPpm <- function(x, y, ppm = 3, mzmin = 0) {
+ if (any(is.na(y)))
+ stop("NA's are not allowed in y !\n")
+ ok <- !(is.na(x))
+ ans <- order(x)
+ keep <- seq_along(ok)[ok]
+ xidx <- ans[ans %in% keep]
+ xs <- x[xidx]
+ yidx <- order(y)
+ ys <- y[yidx]
+ if (!is.double(xs))
+ xs <- as.double(xs)
+ if (!is.double(ys))
+ ys <- as.double(ys)
+ if (!is.integer(xidx))
+ xidx <- as.integer(xidx)
+ if (!is.integer(yidx))
+ yidx <- as.integer(yidx)
+
+ fm <-
+ .Call(
+ "closeMatchPpm",
+ xs,
+ ys,
+ xidx,
+ yidx,
+ as.integer(length(x)),
+ as.double(ppm),
+ as.double(mzmin)
+ )
+ fm
+}
+
+
+simpList <- function(v) {
+ vapply(v, function(x) {
+ if (is.null(x)) {
+ -1
+ } else{
+ x
+ }
+ }, FUN.VALUE = -1)
+}
+
+
+##Stein and scott values : mzexp 3 intexp 0.6
+##Massbank values : mzexp 2 intexp 0.5
+
+
+cosine <-
+ function(mz1,
+ mz2,
+ int1,
+ int2,
+ mzexp = 2,
+ intexp = 0.5,
+ ppm,
+ dmz = 0.005) {
+ matchList <- matchPpm(mz1, mz2, ppm, dmz)
+ ###Weigthed intensity
+ pfound <- which(!sapply(matchList, is.null, simplify = TRUE))
+
+ ###If no peak is found.
+ if (length(pfound) == 0)
+ return(list(measure = 0, matched = rep(-1, length(mz1))))
+ w1 <- int1 ^ intexp * mz1 ^ mzexp
+ w2 <- int2 ^ intexp * mz2 ^ mzexp
+ cat(w1[pfound], w2[unlist(matchList[pfound])],'\n')
+ cos_value <-
+ sum((w1[pfound] * w2[unlist(matchList[pfound])]) ^ 2) / (sum(w1[pfound] ^
+ 2) * sum(w2[unlist(matchList[pfound])] ^ 2))
+
+ ####Adding the penality if needed.
+ list(measure = cos_value, matched = simpList(matchList))
+ }
+
+
+###penalized cosine
+
+wcosine <-
+ function(mz1,
+ mz2,
+ int1,
+ int2,
+ mzexp = 2,
+ intexp = 0.5,
+ ppm,
+ dmz = 0.005,
+ penality = c("rweigth")) {
+ penality <- match.arg(penality)
+ matchList <- matchPpm(mz1, mz2, ppm, dmz)
+ ###Weigthed intensity
+ pfound <- which(!sapply(matchList, is.null, simplify = TRUE))
+ ###If no peak is found.
+ if (length(pfound) == 0)
+ return(list(measure = 0, matched = rep(-1, length(mz1))))
+ w1 <- int1 ^ intexp * mz1 ^ mzexp
+ w2 <- int2 ^ intexp * mz2 ^ mzexp
+
+ cos_value <-
+ sum((w1[pfound] * w2[unlist(matchList[pfound])]) ^ 2) / (sum(w1[pfound] ^
+ 2) * sum(w2[unlist(matchList[pfound])] ^ 2))
+
+ if(is.nan(cos_value)) cos_value <- 0
+ ####Adding the penality if needed.
+ div = 1
+ if (penality == "rweigth") {
+ p <-
+ (sum(w1[pfound]) / sum(w1) + sum(w2[unlist(matchList[pfound])]) / sum(w2)) /
+ 2
+ div = 2
+ } else{
+ p <- 0
+ }
+
+ measure <- (cos_value + p) / div
+ if(is.nan(measure)) measure <- (cos_value) / div
+ list(measure = measure,
+ matched = simpList(matchList))
+ }
+
+##A gaussian of the two spectra seen as a mixture of gaussian, derived form Heinonen et al 2012
+pkernel <-
+ function(mz1,
+ mz2,
+ int1,
+ int2,
+ mzexp = 2,
+ intexp = 0.5,
+ ppm,
+ dmz = 0.005,
+ sigint = 0.5,
+ penality = c("rweigth")) {
+ ###We first match the peak
+ matchList <- matchPpm(mz1, mz2, ppm, dmz)
+ # ###Weigthed intensity
+ pfound <- which(!sapply(matchList, is.null, simplify = TRUE))
+ #
+ ###If no peak is found.
+ if (length(pfound) == 0)
+ return(list(measure = 0, matched = rep(-1, length(mz1))))
+ w1 <- int1 ^ intexp * mz1 ^ mzexp
+ w2 <- int2 ^ intexp * mz2 ^ mzexp
+ w1 <- w1 * 1 / sum(w1)
+ w2 <- w2 * 1 / sum(w2)
+ l1 <- length(w1)
+ l2 <- length(w2)
+ ###The mz dev
+ vsig1 = mz1 * ppm * 3 * 10 ^ -6
+ vsig1 = sapply(vsig1, function(x, y) {
+ return(max(x, y))
+ }, y = dmz)
+
+ vsig2 = mz2 * ppm * 3 * 10 ^ -6
+ vsig2 = sapply(vsig2, function(x, y) {
+ return(max(x, y))
+ }, y = dmz)
+ accu = 0
+ ###TO DO rcopder en C
+ for (i in 1:l1) {
+ for (j in 1:l2) {
+ divisor = max(stats::dnorm(
+ mz1[i],
+ mean = mz1[i],
+ sd = sqrt(vsig1[i] ^ 2 + vsig1[i] ^ 2)
+ ),
+ stats::dnorm(
+ mz2[j],
+ mean = mz2[j],
+ sd = sqrt(vsig2[j] ^ 2 + vsig2[j] ^ 2)
+ ))
+ if (divisor == 0)
+ next
+ scalet = stats::dnorm(mz1[i],
+ mean = mz2[j],
+ sd = sqrt(vsig1[i] ^ 2 + vsig2[j] ^ 2))
+ accu = accu + scalet / divisor
+ }
+ }
+ div = 1
+ if (penality == "rweigth") {
+ p <-
+ (sum(w1[pfound]) / sum(w1) + sum(w2[unlist(matchList[pfound])]) / sum(w2)) /
+ 2
+ div = 2
+ } else{
+ p <- 0
+ }
+ accu = accu / (l2 * l1)
+ list(measure = (accu + p) / div,
+ matched = simpList(matchList))
+ }
diff -r 253d531a0193 -r 20d69a062da3 test-data/filedb-small-mz-match-html-output.html
--- a/test-data/filedb-small-mz-match-html-output.html Sat Sep 03 17:02:01 2016 -0400
+++ b/test-data/filedb-small-mz-match-html-output.html Thu Mar 02 08:55:00 2017 -0500
@@ -1,5 +1,6 @@