# HG changeset patch # User prog # Date 1472936521 14400 # Node ID 253d531a0193b67ee318762fdac9a25ee05038d8 # Parent e66bb061af0622f9969ab20f52724b4e13d1bb42 planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 36c9d8099c20a1ae848f1337c16564335dd8fb2b diff -r e66bb061af06 -r 253d531a0193 BiodbConn.R --- a/BiodbConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/BiodbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,33 +1,30 @@ -if ( ! exists('BiodbConn')) { # Do not load again if already loaded +if ( ! exists('BiodbConn')) { - source(file.path('UrlRequestScheduler.R'), chdir = TRUE) source('biodb-common.R') ##################### # CLASS DECLARATION # ##################### - BiodbConn <- setRefClass("BiodbConn", fields = list(.scheduler = "UrlRequestScheduler")) + BiodbConn <- setRefClass("BiodbConn", fields = list( .debug = "logical" )) ############### # CONSTRUCTOR # ############### - BiodbConn$methods( initialize = function(useragent = NA_character_, scheduler = NULL, ...) { - - # Check useragent - ! is.null(useragent) && ! is.na(useragent) || stop("You must specify a valid useragent.") + BiodbConn$methods( initialize = function(debug = FALSE, ...) { + .debug <<- debug + }) - # Set scheduler - if (is.null(scheduler)) - scheduler <- UrlRequestScheduler$new(n = 3) - inherits(scheduler, "UrlRequestScheduler") || stop("The scheduler instance must inherit from UrlRequestScheduler class.") - scheduler$setUserAgent(useragent) # set agent - .scheduler <<- scheduler - - callSuper(...) # calls super-class initializer with remaining parameters + ####################### + # PRINT DEBUG MESSAGE # + ####################### + + BiodbConn$methods( .print.debug.msg = function(msg) { + if (.self$.debug) + .print.msg(msg = msg, class = class(.self)) }) - + ###################### # HANDLES ENTRY TYPE # ###################### @@ -59,7 +56,7 @@ # Download entry content from the public database. # type The entry type. - # id The ID of the enttry to get. + # id The ID of the entry to get. # RETURN An entry content downloaded from database. BiodbConn$methods( getEntryContent = function(type, id) { stop("Method getCompound() is not implemented in concrete class.") @@ -75,4 +72,22 @@ BiodbConn$methods( createEntry = function(type, content, drop = TRUE) { stop("Method createEntry() is not implemented in concrete class.") }) + + ################# + # GET ENTRY IDS # + ################# + + # Get a list of IDs of all entries contained in this database. + BiodbConn$methods( getEntryIds = function(type) { + stop("Method getEntryIds() is not implemented in concrete class.") + }) + + ################## + # GET NB ENTRIES # + ################## + + # Get the number of entries contained in this database. + BiodbConn$methods( getNbEntries = function(type) { + stop("Method getNbEntries() is not implemented in concrete class.") + }) } diff -r e66bb061af06 -r 253d531a0193 BiodbEntry.R --- a/BiodbEntry.R Tue Jul 12 12:02:37 2016 -0400 +++ b/BiodbEntry.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,16 +20,20 @@ callSuper(...) # calls super-class initializer with remaining parameters }) - ############# - # SET FIELD # - ############# + ################### + # SET FIELD VALUE # + ################### + BiodbEntry$methods( setFieldValue = function(field, value) { + .self$setField(field, value) + }) + BiodbEntry$methods( setField = function(field, value) { class = .self$getFieldClass(field) # Check cardinality - if (class != 'data.frame' && .self$getFieldCardinality(field) == RBIODB.CARD.ONE && length(value) > 1) + if (class != 'data.frame' && .self$getFieldCardinality(field) == BIODB.CARD.ONE && length(value) > 1) stop(paste0('Cannot set more that one value to single value field "', field, '" in BiodEntry.')) # Check value class @@ -45,15 +49,23 @@ }) ################### + # GET FIELD NAMES # + ################### + + BiodbEntry$methods( getFieldNames = function(field) { + return(names(.self$.fields)) + }) + + ################### # GET FIELD CLASS # ################### - + BiodbEntry$methods( getFieldClass = function(field) { - if ( ! field %in% RBIODB.FIELDS[['name']]) + if ( ! field %in% BIODB.FIELDS[['name']]) stop(paste0('Unknown field "', field, '" in BiodEntry.')) - field.class <- RBIODB.FIELDS[which(field == RBIODB.FIELDS[['name']]), 'class'] + field.class <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'class'] return(field.class) }) @@ -64,21 +76,25 @@ BiodbEntry$methods( getFieldCardinality = function(field) { - if ( ! field %in% RBIODB.FIELDS[['name']]) + if ( ! field %in% BIODB.FIELDS[['name']]) stop(paste0('Unknown field "', field, '" in BiodEntry.')) - field.card <- RBIODB.FIELDS[which(field == RBIODB.FIELDS[['name']]), 'cardinality'] + field.card <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'cardinality'] return(field.card) }) - ############# - # GET FIELD # - ############# + ################### + # GET FIELD VALUE # + ################### + BiodbEntry$methods( getFieldValue = function(field) { + return(.self$getField(field)) + }) + BiodbEntry$methods( getField = function(field) { - if ( ! field %in% RBIODB.FIELDS[['name']]) + if ( ! field %in% BIODB.FIELDS[['name']]) stop(paste0('Unknown field "', field, '" in BiodEntry.')) if (field %in% names(.self$.fields)) @@ -97,11 +113,11 @@ BiodbEntry$methods( .compute.field = function(field) { - if ( ! is.null(.self$.factory) && field %in% names(RBIODB.FIELD.COMPUTING)) { - for (db in RBIODB.FIELD.COMPUTING[[field]]) { + if ( ! is.null(.self$.factory) && field %in% names(BIODB.FIELD.COMPUTING)) { + for (db in BIODB.FIELD.COMPUTING[[field]]) { db.id <- .self$getField(paste0(db, 'id')) if ( ! is.na(db.id)) { - db.compound <- .self$.factory$createEntry(db, type = RBIODB.COMPOUND, id = db.id) + db.compound <- .self$.factory$createEntry(db, type = BIODB.COMPOUND, id = db.id) if ( ! is.null(db.compound)) { .self$setField(field, db.compound$getField(field)) return(TRUE) @@ -112,6 +128,24 @@ return(FALSE) }) + + ############################ + # GET FIELDS AS DATA FRAME # + ############################ + + BiodbEntry$methods( getFieldsAsDataFrame = function(field) { + + df <- data.frame() + + # Loop on all fields + for (f in names(.self$.fields)) + + # If field class is a basic type + if (.self$getFieldClass(f) %in% c('character', 'logical', 'integer', 'double')) + df[1, f] <- .self$getFieldValue(f) + + return(df) + }) ########### # FACTORY # diff -r e66bb061af06 -r 253d531a0193 BiodbFactory.R --- a/BiodbFactory.R Tue Jul 12 12:02:37 2016 -0400 +++ b/BiodbFactory.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,6 +1,7 @@ if ( ! exists('BiodbFactory')) { # Do not load again if already loaded library(methods) + source('biodb-common.R') source('ChebiConn.R') source('KeggConn.R') source('PubchemConn.R') @@ -13,27 +14,37 @@ source('NcbiccdsConn.R') source('UniprotConn.R') source('MassbankConn.R') + source('MassFiledbConn.R') ##################### # CLASS DECLARATION # ##################### - BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character")) + BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character", .debug = "logical")) ############### # CONSTRUCTOR # ############### - BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, ...) { + BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, debug = FALSE, ...) { - ( ! is.null(useragent) && ! is.na(useragent)) || stop("You must provide a user agent string (e.g.: \"myapp ; my.email@address\").") .useragent <<- useragent .conn <<- list() .cache.dir <<- cache.dir + .debug <<- debug callSuper(...) # calls super-class initializer with remaining parameters }) + ####################### + # PRINT DEBUG MESSAGE # + ####################### + + BiodbFactory$methods( .print.debug.msg = function(msg) { + if (.self$.debug) + .print.msg(msg = msg, class = class(.self)) + }) + ################## # GET USER AGENT # ################## @@ -42,11 +53,19 @@ return(.self$.useragent) }) + ################## + # SET USER AGENT # + ################## + + BiodbFactory$methods( setUserAgent = function(useragent) { + .useragent <<- useragent + }) + ############ # GET CONN # ############ - BiodbFactory$methods( getConn = function(class) { + BiodbFactory$methods( getConn = function(class, url = NA_character_) { if ( ! class %in% names(.self$.conn)) { @@ -63,7 +82,8 @@ ncbigene = NcbigeneConn$new(useragent = .self$.useragent), ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent), uniprot = UniprotConn$new(useragent = .self$.useragent), - massbank = MassbankConn$new(useragent = .self$.useragent), + massbank = MassbankConn$new(useragent = .self$.useragent, debug = .self$.debug), + massfiledb = MassFiledbConn$new(file = url), NULL) # Unknown class @@ -85,6 +105,9 @@ is.null(id) && is.null(content) && stop("One of id or content must be set.") ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") + # Debug + .self$.print.debug.msg(paste0("Creating entry from ", if (is.null(id)) "content" else paste("id", id), "...")) + # Get content if ( ! is.null(id)) content <- .self$getEntryContent(class, type, id) @@ -151,25 +174,52 @@ # GET ENTRY CONTENT # ##################### - BiodbFactory$methods( getEntryContent = function(class, type, id) { + BiodbFactory$methods( getEntryContent = function(class, type, id, chunk.size = NA_integer_) { + + # Debug + .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) content <- NULL # Load from cache if ( ! is.na(.self$.cache.dir)) content <- .self$.load.content.from.cache(class, type, id) - # Get contents + # Get list of missing contents missing.content.indexes <- vapply(content, is.null, FUN.VALUE = TRUE) missing.ids <- if (is.null(content)) id else id[missing.content.indexes] + + # Debug + if ( ! is.na(.self$.cache.dir)) { + .self$.print.debug.msg(paste0(length(id) - length(missing.ids), " entry content(s) loaded from cache.")) + .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) + } + + # Get contents if (length(missing.ids) > 0) { # Use connector to get missing contents conn <- .self$getConn(class) - missing.contents <- conn$getEntryContent(type, missing.ids) + + # Divide list of missing ids in chunks (in order to save in cache regularly) + chunks.of.missing.ids = if (is.na(chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / chunk.size)) + + # Loop on chunks + missing.contents <- NULL + for (ch.missing.ids in chunks.of.missing.ids) { + + ch.missing.contents <- conn$getEntryContent(type, ch.missing.ids) - # Save to cache - if ( ! is.null(missing.contents) && ! is.na(.self$.cache.dir)) - .self$.save.content.to.cache(class, type, missing.ids, missing.contents) + # Save to cache + if ( ! is.null(ch.missing.contents) && ! is.na(.self$.cache.dir)) + .self$.save.content.to.cache(class, type, ch.missing.ids, ch.missing.contents) + + # Append + missing.contents <- c(missing.contents, ch.missing.contents) + + # Debug + if ( ! is.na(.self$.cache.dir)) + .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) + } # Merge content and missing.contents if (is.null(content)) diff -r e66bb061af06 -r 253d531a0193 ChebiCompound.R --- a/ChebiCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/ChebiCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,9 +20,9 @@ # Define xpath expressions xpath.expr <- character() -# xpath.expr[[RBIODB.ACCESSION]] <- "//b[starts-with(., 'CHEBI:')]" - xpath.expr[[RBIODB.INCHI]] <- "//td[starts-with(., 'InChI=')]" - xpath.expr[[RBIODB.INCHIKEY]] <- "//td[text()='InChIKey']/../td[2]" +# xpath.expr[[BIODB.ACCESSION]] <- "//b[starts-with(., 'CHEBI:')]" + xpath.expr[[BIODB.INCHI]] <- "//td[starts-with(., 'InChI=')]" + xpath.expr[[BIODB.INCHIKEY]] <- "//td[text()='InChIKey']/../td[2]" for (html in contents) { @@ -43,14 +43,14 @@ accession <- xpathSApply(xml, "//b[starts-with(., 'CHEBI:')]", xmlValue) if (length(accession) > 0) { accession <- sub('^CHEBI:([0-9]+)$', '\\1', accession, perl = TRUE) - compound$setField(RBIODB.ACCESSION, accession) + compound$setField(BIODB.ACCESSION, accession) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 ChebiConn.R --- a/ChebiConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/ChebiConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,35 +1,35 @@ if ( ! exists('ChebiConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('ChebiCompound.R') ##################### # CLASS DECLARATION # ##################### - ChebiConn <- setRefClass("ChebiConn", contains = "BiodbConn") + ChebiConn <- setRefClass("ChebiConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## ChebiConn$methods( getEntryContentType = function(type) { - return(RBIODB.HTML) + return(BIODB.HTML) }) ##################### # GET ENTRY CONTENT # ##################### - + ChebiConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.CHEBI, x)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.CHEBI, x)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ ChebiConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createChebiCompoundFromHtml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createChebiCompoundFromHtml(content, drop = drop) else NULL) }) } # end of load safe guard diff -r e66bb061af06 -r 253d531a0193 ChemSpiderConn.R --- a/ChemSpiderConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/ChemSpiderConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('ChemspiderConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('ChemspiderCompound.R') ##################### # CLASS DECLARATION # ##################### - ChemspiderConn <- setRefClass("ChemspiderConn", contains = "BiodbConn") + ChemspiderConn <- setRefClass("ChemspiderConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## ChemspiderConn$methods( getEntryContentType = function(type) { - return(RBIODB.HTML) + return(BIODB.HTML) }) ##################### @@ -23,13 +23,13 @@ ChemspiderConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.CHEMSPIDER, x)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.CHEMSPIDER, x)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ ChemspiderConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createChemspiderCompoundFromHtml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createChemspiderCompoundFromHtml(content, drop = drop) else NULL) }) ############################ diff -r e66bb061af06 -r 253d531a0193 ChemspiderCompound.R --- a/ChemspiderCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/ChemspiderCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -40,14 +40,14 @@ accession <- xpathSApply(xml, "//li[starts-with(., 'ChemSpider ID')]", xmlValue) if (length(accession) > 0) { accession <- sub('^ChemSpider ID([0-9]+)$', '\\1', accession, perl = TRUE) - compound$setField(RBIODB.ACCESSION, accession) + compound$setField(BIODB.ACCESSION, accession) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 EnzymeCompound.R --- a/EnzymeCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/EnzymeCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,8 +20,8 @@ # Define fields regex regex <- character() - regex[[RBIODB.ACCESSION]] <- "^ID\\s+([0-9.]+)$" - regex[[RBIODB.DESCRIPTION]] <- "^DE\\s+(.+)$" + regex[[BIODB.ACCESSION]] <- "^ID\\s+([0-9.]+)$" + regex[[BIODB.DESCRIPTION]] <- "^DE\\s+(.+)$" for (text in contents) { @@ -49,7 +49,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 EnzymeConn.R --- a/EnzymeConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/EnzymeConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('EnzymeConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('EnzymeCompound.R') ##################### # CLASS DECLARATION # ##################### - EnzymeConn <- setRefClass("EnzymeConn", contains = "BiodbConn") + EnzymeConn <- setRefClass("EnzymeConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## EnzymeConn$methods( getEntryContentType = function(type) { - return(RBIODB.TXT) + return(BIODB.TXT) }) ##################### @@ -23,13 +23,13 @@ EnzymeConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.ENZYME, accession = x, content.type = RBIODB.TXT)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.ENZYME, accession = x, content.type = BIODB.TXT)), FUN.VALUE = '') return(content) } @@ -42,6 +42,6 @@ ################ EnzymeConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createEnzymeCompoundFromTxt(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createEnzymeCompoundFromTxt(content, drop = drop) else NULL) }) } diff -r e66bb061af06 -r 253d531a0193 HmdbCompound.R --- a/HmdbCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/HmdbCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,13 +20,13 @@ # Define xpath expressions xpath.expr <- character() - xpath.expr[[RBIODB.ACCESSION]] <- "/metabolite/accession" - xpath.expr[[RBIODB.KEGG.ID]] <- "//kegg_id" - xpath.expr[[RBIODB.NAME]] <- "/metabolite/name" - xpath.expr[[RBIODB.FORMULA]] <- "/metabolite/chemical_formula" - xpath.expr[[RBIODB.SUPER.CLASS]] <- "//super_class" - xpath.expr[[RBIODB.AVERAGE.MASS]] <- "//average_molecular_weight" - xpath.expr[[RBIODB.MONOISOTOPIC.MASS]] <- "//monisotopic_moleculate_weight" + xpath.expr[[BIODB.ACCESSION]] <- "/metabolite/accession" + xpath.expr[[BIODB.KEGG.ID]] <- "//kegg_id" + xpath.expr[[BIODB.NAME]] <- "/metabolite/name" + xpath.expr[[BIODB.FORMULA]] <- "/metabolite/chemical_formula" + xpath.expr[[BIODB.SUPER.CLASS]] <- "//super_class" + xpath.expr[[BIODB.AVERAGE.MASS]] <- "//average_molecular_weight" + xpath.expr[[BIODB.MONOISOTOPIC.MASS]] <- "//monisotopic_moleculate_weight" for (content in contents) { @@ -52,7 +52,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 HmdbConn.R --- a/HmdbConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/HmdbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('HmdbConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('HmdbCompound.R') ##################### # CLASS DECLARATION # ##################### - HmdbConn <- setRefClass("HmdbConn", contains = "BiodbConn") + HmdbConn <- setRefClass("HmdbConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## HmdbConn$methods( getEntryContentType = function(type) { - return(RBIODB.XML) + return(BIODB.XML) }) ##################### @@ -23,13 +23,13 @@ HmdbConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.HMDB, x, content.type = RBIODB.XML)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.HMDB, x, content.type = BIODB.XML)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ HmdbConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createHmdbCompoundFromXml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createHmdbCompoundFromXml(content, drop = drop) else NULL) }) } # end of load safe guard diff -r e66bb061af06 -r 253d531a0193 KeggCompound.R --- a/KeggCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/KeggCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,9 +20,9 @@ # Define fields regex regex <- character() - regex[[RBIODB.NAME]] <- "^NAME\\s+([^,;]+)" - regex[[RBIODB.CHEBI.ID]] <- "^\\s+ChEBI:\\s+(\\S+)" - regex[[RBIODB.LIPIDMAPS.ID]] <- "^\\s+LIPIDMAPS:\\s+(\\S+)" + regex[[BIODB.NAME]] <- "^NAME\\s+([^,;]+)" + regex[[BIODB.CHEBI.ID]] <- "^\\s+ChEBI:\\s+(\\S+)" + regex[[BIODB.LIPIDMAPS.ID]] <- "^\\s+LIPIDMAPS:\\s+(\\S+)" for (text in contents) { @@ -50,26 +50,26 @@ # ENZYME ID g <- str_match(s, "^ENTRY\\s+EC\\s+(\\S+)") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.ACCESSION, paste('ec', g[1,2], sep = ':')) + compound$setField(BIODB.ACCESSION, paste('ec', g[1,2], sep = ':')) # ENTRY ID else { g <- str_match(s, "^ENTRY\\s+(\\S+)\\s+Compound") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.ACCESSION, paste('cpd', g[1,2], sep = ':')) + compound$setField(BIODB.ACCESSION, paste('cpd', g[1,2], sep = ':')) # OTHER ID else { g <- str_match(s, "^ENTRY\\s+(\\S+)") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.ACCESSION, g[1,2]) + compound$setField(BIODB.ACCESSION, g[1,2]) } } # ORGANISM g <- str_match(s, "^ORGANISM\\s+(\\S+)") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.ACCESSION, paste(g[1,2], compound$getField(RBIODB.ACCESSION), sep = ':')) + compound$setField(BIODB.ACCESSION, paste(g[1,2], compound$getField(BIODB.ACCESSION), sep = ':')) } } @@ -77,7 +77,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 KeggConn.R --- a/KeggConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/KeggConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('KeggConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('KeggCompound.R') ##################### # CLASS DECLARATION # ##################### - KeggConn <- setRefClass("KeggConn", contains = "BiodbConn") + KeggConn <- setRefClass("KeggConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## KeggConn$methods( getEntryContentType = function(type) { - return(RBIODB.TXT) + return(BIODB.TXT) }) ##################### @@ -23,13 +23,13 @@ KeggConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.KEGG, x, content.type = RBIODB.TXT)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.KEGG, x, content.type = BIODB.TXT)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ KeggConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createKeggCompoundFromTxt(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createKeggCompoundFromTxt(content, drop = drop) else NULL) }) } # end of load safe guard diff -r e66bb061af06 -r 253d531a0193 LipidmapsCompound.R --- a/LipidmapsCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/LipidmapsCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -19,12 +19,12 @@ # Mapping column names col2field <- list() - col2field[[RBIODB.NAME]] <- 'COMMON_NAME' - col2field[[RBIODB.ACCESSION]] <- 'LM_ID' - col2field[[RBIODB.KEGG.ID]] <- 'KEGG_ID' - col2field[[RBIODB.HMDB.ID]] <- 'HMDBID' - col2field[[RBIODB.MASS]] <- 'MASS' - col2field[[RBIODB.FORMULA]] <- 'FORMULA' + col2field[[BIODB.NAME]] <- 'COMMON_NAME' + col2field[[BIODB.ACCESSION]] <- 'LM_ID' + col2field[[BIODB.KEGG.ID]] <- 'KEGG_ID' + col2field[[BIODB.HMDB.ID]] <- 'HMDBID' + col2field[[BIODB.MASS]] <- 'MASS' + col2field[[BIODB.FORMULA]] <- 'FORMULA' for (text in contents) { @@ -59,7 +59,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 LipidmapsConn.R --- a/LipidmapsConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/LipidmapsConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,13 +1,13 @@ if ( ! exists('LipdmapsConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('LipidmapsCompound.R') ##################### # CLASS DECLARATION # ##################### - LipidmapsConn <- setRefClass("LipidmapsConn", contains = "BiodbConn") + LipidmapsConn <- setRefClass("LipidmapsConn", contains = "RemotedbConn") ############### # CONSTRUCTOR # @@ -24,7 +24,7 @@ ########################## LipidmapsConn$methods( getEntryContentType = function(type) { - return(RBIODB.CSV) + return(BIODB.CSV) }) ##################### @@ -33,13 +33,13 @@ LipidmapsConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.LIPIDMAPS, x, content.type = RBIODB.CSV)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.LIPIDMAPS, x, content.type = BIODB.CSV)), FUN.VALUE = '') return(content) } @@ -52,6 +52,6 @@ ################ LipidmapsConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createLipidmapsCompoundFromCsv(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createLipidmapsCompoundFromCsv(content, drop = drop) else NULL) }) } diff -r e66bb061af06 -r 253d531a0193 MassFiledbConn.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MassFiledbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -0,0 +1,258 @@ +if ( ! exists('MassFiledbConn')) { + + source('MassdbConn.R') + + # LCMS File db. + # In this type of database, a single file is provided in CSV format. Default separator is tabulation. + # Each line is a MS peak measure, . + # The file contains molecule and spectrum information. Each spectrum has an accession id. + + # TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue + + ############# + # CONSTANTS # + ############# + + # Default database fields + .BIODB.DFT.DB.FIELDS <- list() + for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZ, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS)) + .BIODB.DFT.DB.FIELDS[[f]] <- f + + ##################### + # CLASS DECLARATION # + ##################### + + MassFiledbConn <- setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .fields = "list", .ms.modes = "character")) + + ############### + # CONSTRUCTOR # + ############### + + MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) { + + # Check file + (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.") + file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\".")) + + # Set fields + .db <<- NULL + .file <<- file + .file.sep <<- file.sep + .file.quote <<- file.quote + .fields <<- .BIODB.DFT.DB.FIELDS + .field.multval.sep <<- ';' + .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS) + names(.self$.ms.modes) <- .self$.ms.modes + + callSuper(...) + }) + + ###################### + # Is valid field tag # + ###################### + + MassFiledbConn$methods( isValidFieldTag = function(tag) { + return (tag %in% names(.self$.fields)) + }) + + ############# + # Set field # + ############# + + MassFiledbConn$methods( setField = function(tag, colname) { + + ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.") + ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.") + + # Load database file + .self$.init.db() + + # Check that this field tag is defined in the fields list + .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid.")) + + # Check that columns are defined in database file + all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file.")) + + # Set new definition + if (length(colname) == 1) + .fields[[tag]] <<- colname + else { + new.col <- paste(colname, collapse = ".") + .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '') + .fields[[tag]] <<- new.col + } + }) + + ###################################### + # SET FIELD MULTIPLE VALUE SEPARATOR # + ###################################### + + MassFiledbConn$methods( setFieldMultValSep = function(sep) { + .field.multval.sep <<- sep + }) + + ################ + # SET MS MODES # + ################ + + MassFiledbConn$methods( setMsMode = function(mode, value) { + .self$.ms.modes[[mode]] <- value + }) + + ########################## + # GET ENTRY CONTENT TYPE # + ########################## + + MassFiledbConn$methods( getEntryContentType = function(type) { + return(BIODB.DATAFRAME) + }) + + ########### + # INIT DB # + ########### + + MassFiledbConn$methods( .init.db = function() { + + if (is.null(.self$.db)) { + + # Load database + .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL) + + # Rename columns + colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') + } + }) + + ################ + # CHECK FIELDS # + ################ + + MassFiledbConn$methods( .check.fields = function(fields) { + + # Check if fields are known + unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)] + if (length(unknown.fields) > 0) + stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown.")) + + # Init db + .self$.init.db() + + # Check if fields are defined in file database + undefined.fields <- colnames(.self$.init.db)[ ! unlist(.self$.fields[fields]) %in% colnames(.self$.init.db)] + if (length(undefined.fields) > 0) + stop(paste0("Column(s) ", paste(unlist(.self$.fields[fields]), collapse = ", "), " is/are undefined in file database.")) + }) + + ################ + # EXTRACT COLS # + ################ + + MassFiledbConn$methods( .extract.cols = function(cols, mode = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) { + + x <- NULL + + if ( ! is.null(cols) && ! is.na(cols)) { + + # Init db + .self$.init.db() + + # TODO check existence of cols/fields + + # Get db, eventually filtering it. + if (is.null(mode)) + db <- .self$.db + else { + # Check mode value + mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'.")) + .self$.check.fields(BIODB.MSMODE) + + # Filter on mode + db <- .self$.db[.self$.db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ] + } + + # Get subset + x <- db[, unlist(.self$.fields[cols]), drop = drop] + + # Rename columns + if (is.data.frame(x)) + colnames(x) <- cols + + # Rearrange + if (drop && is.vector(x)) { + if (uniq) + x <- x[ ! duplicated(x)] + if (sort) + x <- sort(x) + } + + # Cut + if ( ! is.na(max.rows)) + x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ] + } + + return(x) + }) + + ################# + # GET ENTRY IDS # + ################# + + MassFiledbConn$methods( getEntryIds = function(type) { + + ids <- NA_character_ + + if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND)) + ids <- as.character(.self$.extract.cols(if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE)) + + return(ids) + }) + + ################## + # GET NB ENTRIES # + ################## + + MassFiledbConn$methods( getNbEntries = function(type) { + return(length(.self$getEntryIds(type))) + }) + + ############################### + # GET CHROMATOGRAPHIC COLUMNS # + ############################### + + # Inherited from MassdbConn. + MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) { + + # Extract needed columns + db <- .self$.extract.cols(c(BIODB.COMPOUND.ID, BIODB.CHROM.COL)) + + # Filter on molecule IDs + if ( ! is.null(compound.ids)) + db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ] + + # Get column names + cols <- db[[BIODB.CHROM.COL]] + + # Remove duplicates + cols <- cols[ ! duplicated(cols)] + + # Make data frame + chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE) + colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE) + + return(chrom.cols) + }) + + ################# + # GET MZ VALUES # + ################# + + # Inherited from MassdbConn. + MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { + + # Get mz values + mz <- .self$.extract.cols(BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results) + + return(mz) + }) + +} diff -r e66bb061af06 -r 253d531a0193 MassbankCompound.R --- a/MassbankCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MassbankCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -28,38 +28,38 @@ for (s in lines[[1]]) { # NAME - if (is.na(compound$getField(RBIODB.NAME))) { + if (is.na(compound$getField(BIODB.NAME))) { g <- str_match(s, "^CH\\$NAME:\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.NAME, g[1,2]) + compound$setField(BIODB.NAME, g[1,2]) } # CHEBI ID g <- str_match(s, "^CH\\$LINK: CHEBI\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.CHEBI.ID, g[1,2]) + compound$setField(BIODB.CHEBI.ID, g[1,2]) # KEGG ID g <- str_match(s, "^CH\\$LINK: KEGG\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.KEGG.ID, g[1,2]) + compound$setField(BIODB.KEGG.ID, g[1,2]) # PUBCHEM ID g <- str_match(s, "^CH\\$LINK: PUBCHEM\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.PUBCHEM.ID, g[1,2]) + compound$setField(BIODB.PUBCHEM.ID, g[1,2]) # INCHI g <- str_match(s, "^CH\\$IUPAC:\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.INCHI, g[1,2]) + compound$setField(BIODB.INCHI, g[1,2]) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.NAME))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.NAME))) NULL else x) return(compounds) } diff -r e66bb061af06 -r 253d531a0193 MassbankConn.R --- a/MassbankConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MassbankConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,43 +1,69 @@ if ( ! exists('MassbankConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') + source('MassdbConn.R') source('MassbankSpectrum.R') ##################### # CLASS DECLARATION # ##################### - MassbankConn <- setRefClass("MassbankConn", contains = "BiodbConn") + MassbankConn <- setRefClass("MassbankConn", contains = c("RemotedbConn", "MassdbConn")) ########################## # GET ENTRY CONTENT TYPE # ########################## MassbankConn$methods( getEntryContentType = function(type) { - return(if (type == RBIODB.SPECTRUM) RBIODB.TXT else NULL) + return(if (type == BIODB.SPECTRUM) BIODB.TXT else NULL) }) ##################### # GET ENTRY CONTENT # ##################### - MassbankConn$methods( getEntryContent = function(type, id) { + MassbankConn$methods( getEntryContent = function(type, ids) { - if (type == RBIODB.SPECTRUM) { + # Debug + .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)...")) + + if (type == BIODB.SPECTRUM) { + + URL.MAX.LENGTH <- 2083 # Initialize return values - content <- rep(NA_character_, length(id)) + content <- rep(NA_character_, length(ids)) + + # Loop on all + n <- 0 + while (n < length(ids)) { - # Request - xmlstr <- .self$.scheduler$getUrl(get.entry.url(RBIODB.MASSBANK, id, RBIODB.TXT)) + # Get list of accession ids to retrieve + accessions <- ids[(n + 1):length(ids)] + + # Create URL request + x <- get.entry.url(class = BIODB.MASSBANK, accession = accessions, content.type = BIODB.TXT, max.length = URL.MAX.LENGTH) + + # Debug + .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)...")) - # Parse XML and get text - if ( ! is.na(xmlstr)) { - library(XML) - xml <- xmlInternalTreeParse(xmlstr, asText = TRUE) - ns <- c(ax21 = "http://api.massbank/xsd") - returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns) - content[match(returned.ids, id)] <- xpathSApply(xml, "//ax21:info", xmlValue, namespaces = ns) + # Send request + xmlstr <- .self$.scheduler$getUrl(x$url) + + # Increase number of entries retrieved + n <- n + x$n + + # Parse XML and get text + if ( ! is.na(xmlstr)) { + library(XML) + xml <- xmlInternalTreeParse(xmlstr, asText = TRUE) + ns <- c(ax21 = "http://api.massbank/xsd") + returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns) + content[match(returned.ids, ids)] <- xpathSApply(xml, "//ax21:info", xmlValue, namespaces = ns) + } + + # Debug + .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved...")) } return(content) @@ -54,6 +80,13 @@ # content A file content, downloaded from the public database. # RETURN A spectrum instance. MassbankConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.SPECTRUM) createMassbankSpectrumFromTxt(content, drop = drop) else NULL) + return(if (type == BIODB.SPECTRUM) createMassbankSpectrumFromTxt(content, drop = drop) else NULL) + }) + + ################# + # GET MZ VALUES # + ################# + + MassbankConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { }) } diff -r e66bb061af06 -r 253d531a0193 MassbankSpectrum.R --- a/MassbankSpectrum.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MassbankSpectrum.R Sat Sep 03 17:02:01 2016 -0400 @@ -21,13 +21,13 @@ # Define fields regex regex <- character() - regex[[RBIODB.ACCESSION]] <- "^ACCESSION: (.+)$" - regex[[RBIODB.MSDEV]] <- "^AC\\$INSTRUMENT: (.+)$" - regex[[RBIODB.MSDEVTYPE]] <- "^AC\\$INSTRUMENT_TYPE: (.+)$" - regex[[RBIODB.MSTYPE]] <- "^AC\\$MASS_SPECTROMETRY: MS_TYPE (.+)$" - regex[[RBIODB.MSPRECMZ]] <- "^MS\\$FOCUSED_ION: PRECURSOR_M/Z (.+)$" - regex[[RBIODB.NB.PEAKS]] <- "^PK\\$NUM_PEAK: ([0-9]+)$" - regex[[RBIODB.MSPRECANNOT]] <- "^MS\\$FOCUSED_ION: PRECURSOR_TYPE (.+)$" + regex[[BIODB.ACCESSION]] <- "^ACCESSION: (.+)$" + regex[[BIODB.MSDEV]] <- "^AC\\$INSTRUMENT: (.+)$" + regex[[BIODB.MSDEVTYPE]] <- "^AC\\$INSTRUMENT_TYPE: (.+)$" + regex[[BIODB.MSTYPE]] <- "^AC\\$MASS_SPECTROMETRY: MS_TYPE (.+)$" + regex[[BIODB.MSPRECMZ]] <- "^MS\\$FOCUSED_ION: PRECURSOR_M/Z (.+)$" + regex[[BIODB.NB.PEAKS]] <- "^PK\\$NUM_PEAK: ([0-9]+)$" + regex[[BIODB.MSPRECANNOT]] <- "^MS\\$FOCUSED_ION: PRECURSOR_TYPE (.+)$" for (text in contents) { @@ -54,7 +54,7 @@ # MS MODE g <- str_match(s, "^AC\\$MASS_SPECTROMETRY: ION_MODE (.+)$") if ( ! is.na(g[1,1])) { - spectrum$setField(RBIODB.MSMODE, if (g[1,2] == 'POSITIVE') RBIODB.MSMODE.POS else RBIODB.MSMODE.NEG) + spectrum$setField(BIODB.MSMODE, if (g[1,2] == 'POSITIVE') BIODB.MSMODE.POS else BIODB.MSMODE.NEG) next } @@ -67,13 +67,13 @@ } # Replace elements with no accession id by NULL - spectra <- lapply(spectra, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + spectra <- lapply(spectra, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # Set associated compounds compounds <- createMassbankCompoundFromTxt(contents) for (i in seq(spectra)) if ( ! is.null(spectra[[i]])) - spectra[[i]]$setField(RBIODB.COMPOUND, compounds[[i]]) + spectra[[i]]$setField(BIODB.COMPOUND, compounds[[i]]) # If the input was a single element, then output a single object if (drop && length(contents) == 1) @@ -88,26 +88,26 @@ .parse.peak.line <- function(spectrum, line) { - peaks <- RBIODB.PEAK.DF.EXAMPLE + peaks <- BIODB.PEAK.DF.EXAMPLE # Annotation g <- str_match(line, "^\\s+([0-9][0-9.]*) ([A-Z0-9+-]+) ([0-9]+) ([0-9][0-9.]*) ([0-9][0-9.]*)$") if ( ! is.na(g[1,1])) - peaks[1, c(RBIODB.PEAK.MZ, RBIODB.PEAK.FORMULA, RBIODB.PEAK.FORMULA.COUNT, RBIODB.PEAK.MASS, RBIODB.PEAK.ERROR.PPM)] <- list(as.double(g[1,2]), g[1,3], as.integer(g[1,4]), as.double(g[1,5]), as.double(g[1,6])) + peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)] <- list(as.double(g[1,2]), g[1,3], as.integer(g[1,4]), as.double(g[1,5]), as.double(g[1,6])) # Peak g <- str_match(line, "^\\s+([0-9][0-9.]*) ([0-9][0-9.]*) ([0-9]+)$") if ( ! is.na(g[1,1])) - peaks[1, c(RBIODB.PEAK.MZ, RBIODB.PEAK.INTENSITY, RBIODB.PEAK.RELATIVE.INTENSITY)] <- list(as.double(g[1,2]), as.double(g[1,3]), as.integer(g[1,4])) + peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY)] <- list(as.double(g[1,2]), as.double(g[1,3]), as.integer(g[1,4])) if (nrow(peaks) > 0) { # Get curent peaks and merge with new peaks - current.peaks <- spectrum$getField(RBIODB.PEAKS) + current.peaks <- spectrum$getField(BIODB.PEAKS) if ( ! is.null(current.peaks)) peaks <- rbind(current.peaks, peaks) - spectrum$setField(RBIODB.PEAKS, peaks) + spectrum$setField(BIODB.PEAKS, peaks) return(TRUE) } diff -r e66bb061af06 -r 253d531a0193 MassdbConn.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MassdbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -0,0 +1,31 @@ +if ( ! exists('MassdbConn')) { + + source('BiodbConn.R') + + ##################### + # CLASS DECLARATION # + ##################### + + MassdbConn <- setRefClass("MassdbConn", contains = "BiodbConn") + + ############################### + # GET CHROMATOGRAPHIC COLUMNS # + ############################### + + # Get a list of chromatographic columns contained in this database. + # compound.ids A list of compound IDs used to filter results. + # The returned value is a data.frame with two columns : one for the ID (BIODB.ID) and another one for the title (BIODB.TITLE). + MassdbConn$methods( getChromCol = function(compound.ids = NULL) { + stop("Method getChromCol() is not implemented in concrete class.") + }) + + ################# + # GET MZ VALUES # + ################# + + # Returns a numeric vector of all masses stored inside the database. + MassdbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { + stop("Method getMzValues() not implemented in concrete class.") + }) + +} diff -r e66bb061af06 -r 253d531a0193 MirbaseCompound.R --- a/MirbaseCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MirbaseCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,9 +20,9 @@ # Define fields regex xpath.expr <- character() - xpath.expr[[RBIODB.ACCESSION]] <- "//td[text()='Accession number']/../td[2]" - xpath.expr[[RBIODB.NAME]] <- "//td[text()='ID']/../td[2]" - xpath.expr[[RBIODB.SEQUENCE]] <- "//td[text()='Sequence']/..//pre" + xpath.expr[[BIODB.ACCESSION]] <- "//td[text()='Accession number']/../td[2]" + xpath.expr[[BIODB.NAME]] <- "//td[text()='ID']/../td[2]" + xpath.expr[[BIODB.SEQUENCE]] <- "//td[text()='Sequence']/..//pre" for (html in contents) { @@ -43,7 +43,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 MirbaseConn.R --- a/MirbaseConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MirbaseConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('MirbaseConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('MirbaseCompound.R') ##################### # CLASS DECLARATION # ##################### - MirbaseConn <- setRefClass("MirbaseConn", contains = "BiodbConn") + MirbaseConn <- setRefClass("MirbaseConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## MirbaseConn$methods( getEntryContentType = function(type) { - return(RBIODB.HTML) + return(BIODB.HTML) }) ##################### @@ -23,13 +23,13 @@ MirbaseConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.MIRBASE, x, content.type = RBIODB.HTML)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.MIRBASE, x, content.type = BIODB.HTML)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ MirbaseConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createMirbaseCompoundFromHtml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createMirbaseCompoundFromHtml(content, drop = drop) else NULL) }) ################### diff -r e66bb061af06 -r 253d531a0193 MsDbOutputDataFrameStream.R --- a/MsDbOutputDataFrameStream.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MsDbOutputDataFrameStream.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,7 +1,6 @@ if ( ! exists('MsDbOutputDataFrameStream')) { # Do not load again if already loaded library(methods) - library(plyr) source('MsDbOutputStream.R') source('dfhlp.R', chdir = TRUE) @@ -41,6 +40,8 @@ MsDbOutputDataFrameStream$methods( matchedPeaks = function(mz, rt = NULL, unused = NULL, peaks = NULL) { + library(plyr) + # Set input values x <- data.frame(mz = mz) if ( ! is.null(rt)) diff -r e66bb061af06 -r 253d531a0193 MsPeakForestDb.R --- a/MsPeakForestDb.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MsPeakForestDb.R Sat Sep 03 17:02:01 2016 -0400 @@ -8,21 +8,24 @@ # CLASS DECLARATION # ##################### - MsPeakForestDb <- setRefClass("MsPeakForestDb", contains = "MsDb", fields = list(.url = "character", .url.scheduler = "ANY")) + MsPeakForestDb <- setRefClass("MsPeakForestDb", contains = "MsDb", fields = list(.url = "character", .url.scheduler = "ANY", .token = "character")) ############### # CONSTRUCTOR # ############### - MsPeakForestDb$methods( initialize = function(url = NA_character_, useragent = NA_character_, ...) { + MsPeakForestDb$methods( initialize = function(url = NA_character_, useragent = NA_character_, token = NA_character_, ...) { # Check URL if (is.null(url) || is.na(url)) stop("No URL defined for new MsPeakForestDb instance.") + if (substring(url, nchar(url) - 1, 1) == '/') + url <- substring(url, nchar(url) - 1) .url <<- url .url.scheduler <<- UrlRequestScheduler$new(n = 3, useragent = useragent) .self$.url.scheduler$setVerbose(1L) + .token <<- token callSuper(...) }) @@ -35,6 +38,17 @@ res <- NULL + # Add url prefix + if (substring(url, 1, 1) == '/') + url <- substring(url, 2) + url <- paste(.self$.url, url, sep = '/') + + # Add token + if ( ! is.na(.self$.token)) + params <- c(params, token = .self$.token) + param.str <- if (is.null(params)) '' else paste('?', vapply(names(params), function(p) paste(p, params[[p]], sep = '='), FUN.VALUE = ''), collapse = '&', sep = '') + + # Get URL content <- .self$.url.scheduler$getUrl(url = url, params = params) if (ret.type == 'json') { @@ -67,7 +81,7 @@ MsPeakForestDb$methods( getMoleculeIds = function() { - ids <- as.character(.self$.get.url(url = paste0(.self$.url, 'compounds/all/ids'))) + ids <- as.character(.self$.get.url(url = 'compounds/all/ids')) return(ids) }) @@ -78,7 +92,7 @@ MsPeakForestDb$methods( getNbMolecules = function() { - n <- .self$.get.url(url = paste0(.self$.url, 'compounds/all/count'), ret.type = 'integer') + n <- .self$.get.url(url = 'compounds/all/count', ret.type = 'integer') return(n) }) @@ -90,13 +104,12 @@ MsPeakForestDb$methods( getChromCol = function(molid = NULL) { # Set URL - url <- paste0(.self$.url, 'metadata/lc/list-code-columns') params <- NULL if ( ! is.null(molid)) params <- list(molids = paste(molid, collapse = ',')) # Call webservice - wscols <- .self$.get.url(url = url, params = params) + wscols <- .self$.get.url(url = 'metadata/lc/list-code-columns', params = params) # Build data frame cols <- data.frame(id = character(), title = character()) @@ -118,13 +131,12 @@ rt <- list() # Set URL - url <- paste0(.self$.url, 'spectra/lcms/search') params <- NULL if ( ! is.null(molid)) params <- list(molids = paste(molid, collapse = ',')) # Call webservice - spectra <- .self$.get.url(url = url, params = params) + spectra <- .self$.get.url(url = 'spectra/lcms/search', params = params) if (class(spectra) == 'list' && length(spectra) > 0) { for (s in spectra) if (is.na(col) || s$liquidChromatography$columnCode %in% col) { @@ -160,11 +172,10 @@ if (length(non.na.molid) > 0) { # Set URL - url <- paste0(.self$.url, 'compounds/all/names') params <- c(molids = paste(non.na.molid, collapse = ',')) # Call webservice - names[ ! is.na(molid)] <- .self$.get.url(url = url, params = params) + names[ ! is.na(molid)] <- .self$.get.url(url = 'compounds/all/names', params = params) } return(names) @@ -187,8 +198,7 @@ ids <- c(ids, NA_character_) else { - url <- paste0(.self$.url, 'search/compounds/name/', curlEscape(n)) - compounds <- .self$.get.url(url = url)$compoundNames + compounds <- .self$.get.url(url = paste0('search/compounds/name/', curlEscape(n)))$compoundNames ids <- c(ids, list(vapply(compounds, function(c) as.character(c$compound$id), FUN.VALUE = ''))) } } @@ -203,7 +213,6 @@ MsPeakForestDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { # Build URL - url <- paste0(.self$.url, 'spectra/lcms/count-peaks') params <- NULL if ( ! is.na(type)) params <- c(params, mode = if (type == MSDB.TAG.POS) 'pos' else 'neg') @@ -211,7 +220,7 @@ params <- c(params, molids = paste(molid, collapse = ',')) # Run request - n <- .self$.get.url(url = url, params = params, ret.type = 'integer') + n <- .self$.get.url(url = 'spectra/lcms/count-peaks', params = params, ret.type = 'integer') return(sum(n)) }) @@ -222,16 +231,13 @@ MsPeakForestDb$methods( getMzValues = function(mode = NULL) { - # Build URL - url <- paste0(.self$.url, 'spectra/lcms/peaks/list-mz') - # Query params params <- NULL if ( ! is.null(mode)) params <- c(params, mode = if (mode == MSDB.TAG.POS) 'positive' else 'negative') # Get MZ valuels - mz <- .self$.get.url(url = url, params = params) + mz <- .self$.get.url(url = 'spectra/lcms/peaks/list-mz', params = params) return(mz) }) @@ -243,7 +249,7 @@ MsPeakForestDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { # Build URL for mz search - url <- paste0(.self$.url, 'spectra/lcms/peaks/get-range/', mz.low, '/', mz.high) + url <- paste0('spectra/lcms/peaks/get-range/', mz.low, '/', mz.high) # Get spectra spectra <- .self$.get.url(url = url) @@ -265,7 +271,7 @@ if (nrow(results) > 0) { # Build URL for rt search - url <- paste0(.self$.url, 'spectra/lcms/range-rt-min/', rt.low, '/', rt.high) + url <- paste0('spectra/lcms/range-rt-min/', rt.low, '/', rt.high) params <- NULL if ( ! is.null(col)) params <- c(columns = paste(col, collapse = ',')) diff -r e66bb061af06 -r 253d531a0193 NcbiCcdsCompound.R --- a/NcbiCcdsCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/NcbiCcdsCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -27,15 +27,15 @@ xml <- htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE) if (length(getNodeSet(xml, "//*[starts-with(.,'No results found for CCDS ID ')]")) == 0) { - compound$setField(RBIODB.ACCESSION, xpathSApply(xml, "//input[@id='DATA']", xmlGetAttr, "value")) - compound$setField(RBIODB.SEQUENCE, xpathSApply(xml, "//b[starts-with(.,'Nucleotide Sequence')]/../tt", xmlValue)) + compound$setField(BIODB.ACCESSION, xpathSApply(xml, "//input[@id='DATA']", xmlGetAttr, "value")) + compound$setField(BIODB.SEQUENCE, xpathSApply(xml, "//b[starts-with(.,'Nucleotide Sequence')]/../tt", xmlValue)) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 NcbiCcdsConn.R --- a/NcbiCcdsConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/NcbiCcdsConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,13 +1,13 @@ if ( ! exists('NcbiccdsConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('NcbiccdsCompound.R') ##################### # CLASS DECLARATION # ##################### - NcbiccdsConn <- setRefClass("NcbiccdsConn", contains = "BiodbConn") + NcbiccdsConn <- setRefClass("NcbiccdsConn", contains = "RemotedbConn") ############### # CONSTRUCTOR # @@ -23,7 +23,7 @@ ########################## NcbiccdsConn$methods( getEntryContentType = function(type) { - return(RBIODB.HTML) + return(BIODB.HTML) }) ##################### @@ -32,13 +32,13 @@ NcbiccdsConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.NCBICCDS, x)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.NCBICCDS, x)), FUN.VALUE = '') return(content) } @@ -51,6 +51,6 @@ ################ NcbiccdsConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createNcbiccdsCompoundFromHtml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createNcbiccdsCompoundFromHtml(content, drop = drop) else NULL) }) } diff -r e66bb061af06 -r 253d531a0193 NcbiGeneCompound.R --- a/NcbiGeneCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/NcbiGeneCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -21,13 +21,13 @@ # Define xpath expressions xpath.expr <- character() - xpath.expr[[RBIODB.ACCESSION]] <- "//Gene-track_geneid" - xpath.expr[[RBIODB.KEGG.ID]] <- "/Dbtag_db[text()='KEGG']/..//Object-id_str" - xpath.expr[[RBIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str" - xpath.expr[[RBIODB.LOCATION]] <- "//Gene-ref_maploc" - xpath.expr[[RBIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc" - xpath.expr[[RBIODB.SYMBOL]] <- "//Gene-ref_locus" - xpath.expr[[RBIODB.SYNONYMS]] <- "//Gene-ref_syn_E" + xpath.expr[[BIODB.ACCESSION]] <- "//Gene-track_geneid" + xpath.expr[[BIODB.KEGG.ID]] <- "/Dbtag_db[text()='KEGG']/..//Object-id_str" + xpath.expr[[BIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str" + xpath.expr[[BIODB.LOCATION]] <- "//Gene-ref_maploc" + xpath.expr[[BIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc" + xpath.expr[[BIODB.SYMBOL]] <- "//Gene-ref_locus" + xpath.expr[[BIODB.SYNONYMS]] <- "//Gene-ref_syn_E" for (content in contents) { @@ -56,14 +56,14 @@ # CCDS ID ccdsid <- .find.ccds.id(xml) if ( ! is.na(ccdsid)) - compound$setField(RBIODB.NCBI.CCDS.ID, ccdsid) + compound$setField(BIODB.NCBI.CCDS.ID, ccdsid) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 NcbiGeneConn.R --- a/NcbiGeneConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/NcbiGeneConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,13 +1,13 @@ if ( ! exists('NcbigeneConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('NcbigeneCompound.R') ##################### # CLASS DECLARATION # ##################### - NcbigeneConn <- setRefClass("NcbigeneConn", contains = "BiodbConn") + NcbigeneConn <- setRefClass("NcbigeneConn", contains = "RemotedbConn") ############### # CONSTRUCTOR # @@ -23,7 +23,7 @@ ########################## NcbigeneConn$methods( getEntryContentType = function(type) { - return(RBIODB.XML) + return(BIODB.XML) }) ##################### @@ -32,13 +32,13 @@ NcbigeneConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.NCBIGENE, x)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.NCBIGENE, x)), FUN.VALUE = '') return(content) } @@ -51,6 +51,6 @@ ################ NcbigeneConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createNcbigeneCompoundFromXml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createNcbigeneCompoundFromXml(content, drop = drop) else NULL) }) } diff -r e66bb061af06 -r 253d531a0193 PubchemCompound.R --- a/PubchemCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/PubchemCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -23,9 +23,9 @@ # Define xpath expressions xpath.expr <- character() - xpath.expr[[RBIODB.ACCESSION]] <- "//pubchem:RecordType[text()='CID']/../pubchem:RecordNumber" - xpath.expr[[RBIODB.INCHI]] <- "//pubchem:Name[text()='InChI']/../pubchem:StringValue" - xpath.expr[[RBIODB.INCHIKEY]] <- "//pubchem:Name[text()='InChI Key']/../pubchem:StringValue" + xpath.expr[[BIODB.ACCESSION]] <- "//pubchem:RecordType[text()='CID']/../pubchem:RecordNumber" + xpath.expr[[BIODB.INCHI]] <- "//pubchem:Name[text()='InChI']/../pubchem:StringValue" + xpath.expr[[BIODB.INCHIKEY]] <- "//pubchem:Name[text()='InChI Key']/../pubchem:StringValue" for (content in contents) { @@ -52,7 +52,7 @@ if (is.na(name)) tryCatch( { name <- xpathSApply(xml, "//pubchem:Name[text()='Record Title']/../pubchem:StringValue", xmlValue, namespaces = ns) }, warning = function(w) {}) if ( ! is.na(name)) - compound$setField(RBIODB.NAME, name) + compound$setField(BIODB.NAME, name) } @@ -60,7 +60,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 PubchemConn.R --- a/PubchemConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/PubchemConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('get.pubchem.compound.url')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('PubchemCompound.R') ##################### # CLASS DECLARATION # ##################### - PubchemConn <- setRefClass("PubchemConn", contains = "BiodbConn") + PubchemConn <- setRefClass("PubchemConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## PubchemConn$methods( getEntryContentType = function(type) { - return(RBIODB.XML) + return(BIODB.XML) }) ##################### @@ -23,13 +23,13 @@ PubchemConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.PUBCHEM, x, content.type = RBIODB.XML)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.PUBCHEM, x, content.type = BIODB.XML)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ PubchemConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createPubchemCompoundFromXml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createPubchemCompoundFromXml(content, drop = drop) else NULL) }) ######################### diff -r e66bb061af06 -r 253d531a0193 RemotedbConn.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RemotedbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -0,0 +1,31 @@ +if ( ! exists('RemotedbConn')) { + + source('BiodbConn.R') + source(file.path('UrlRequestScheduler.R'), chdir = TRUE) + + ##################### + # CLASS DECLARATION # + ##################### + + RemotedbConn <- setRefClass("RemotedbConn", contains = "BiodbConn", fields = list(.scheduler = "UrlRequestScheduler")) + + ############### + # CONSTRUCTOR # + ############### + + RemotedbConn$methods( initialize = function(useragent = NA_character_, scheduler = NULL, ...) { + + # Check useragent + ( ! is.null(useragent) && ! is.na(useragent)) || stop("You must specify a valid useragent string (e.g.: \"myapp ; my.email@address\").") + + # Set scheduler + if (is.null(scheduler)) + scheduler <- UrlRequestScheduler$new(n = 3) + inherits(scheduler, "UrlRequestScheduler") || stop("The scheduler instance must inherit from UrlRequestScheduler class.") + scheduler$setUserAgent(useragent) # set agent + .scheduler <<- scheduler + + callSuper(...) # calls super-class initializer with remaining parameters + }) + +} diff -r e66bb061af06 -r 253d531a0193 UniProtCompound.R --- a/UniProtCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/UniProtCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -23,17 +23,17 @@ # Define xpath expressions xpath.values <- character() - xpath.values[[RBIODB.NAME]] <- "/uniprot:uniprot/uniprot:compound/uniprot:name" - xpath.values[[RBIODB.GENE.SYMBOLS]] <- "//uniprot:gene/uniprot:name" - xpath.values[[RBIODB.FULLNAMES]] <- "//uniprot:protein//uniprot:fullName" - xpath.values[[RBIODB.SEQUENCE]] <- "//uniprot:entry/uniprot:sequence" - xpath.values[[RBIODB.ACCESSION]] <- "//uniprot:accession[1]" + xpath.values[[BIODB.NAME]] <- "/uniprot:uniprot/uniprot:compound/uniprot:name" + xpath.values[[BIODB.GENE.SYMBOLS]] <- "//uniprot:gene/uniprot:name" + xpath.values[[BIODB.FULLNAMES]] <- "//uniprot:protein//uniprot:fullName" + xpath.values[[BIODB.SEQUENCE]] <- "//uniprot:entry/uniprot:sequence" + xpath.values[[BIODB.ACCESSION]] <- "//uniprot:accession[1]" xpath.attr <- list() - xpath.attr[[RBIODB.KEGG.ID]] <- list(path = "//uniprot:dbReference[@type='KEGG']", attr = 'id') - xpath.attr[[RBIODB.NCBI.GENE.ID]] <- list(path = "//uniprot:dbReference[@type='GeneID']", attr = 'id') - xpath.attr[[RBIODB.ENZYME.ID]] <- list(path = "//uniprot:dbReference[@type='EC']", attr = 'id') - xpath.attr[[RBIODB.MASS]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'mass') - xpath.attr[[RBIODB.LENGTH]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'length') + xpath.attr[[BIODB.KEGG.ID]] <- list(path = "//uniprot:dbReference[@type='KEGG']", attr = 'id') + xpath.attr[[BIODB.NCBI.GENE.ID]] <- list(path = "//uniprot:dbReference[@type='GeneID']", attr = 'id') + xpath.attr[[BIODB.ENZYME.ID]] <- list(path = "//uniprot:dbReference[@type='EC']", attr = 'id') + xpath.attr[[BIODB.MASS]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'mass') + xpath.attr[[BIODB.LENGTH]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'length') for (content in contents) { @@ -61,16 +61,16 @@ } # Remove new lines from sequence string - seq <- compound$getField(RBIODB.SEQUENCE) + seq <- compound$getField(BIODB.SEQUENCE) if ( ! is.na(seq)) - compound$setField(RBIODB.SEQUENCE, gsub("\\n", "", seq)) + compound$setField(BIODB.SEQUENCE, gsub("\\n", "", seq)) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1) diff -r e66bb061af06 -r 253d531a0193 UniProtConn.R --- a/UniProtConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/UniProtConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('UniprotConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('UniprotCompound.R') ##################### # CLASS DECLARATION # ##################### - UniprotConn <- setRefClass("UniprotConn", contains = "BiodbConn") + UniprotConn <- setRefClass("UniprotConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## UniprotConn$methods( getEntryContentType = function(type) { - return(RBIODB.XML) + return(BIODB.XML) }) ##################### @@ -23,13 +23,13 @@ UniprotConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.UNIPROT, x, content.type = RBIODB.XML)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.UNIPROT, x, content.type = BIODB.XML)), FUN.VALUE = '') return(content) } @@ -42,6 +42,6 @@ ################ UniprotConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createUniprotCompoundFromXml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createUniprotCompoundFromXml(content, drop = drop) else NULL) }) } diff -r e66bb061af06 -r 253d531a0193 biodb-common.R --- a/biodb-common.R Tue Jul 12 12:02:37 2016 -0400 +++ b/biodb-common.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,154 +1,189 @@ -if ( ! exists('RBIODB.COMPOUND')) { # Do not load again if already loaded +if ( ! exists('BIODB.COMPOUND')) { # Do not load again if already loaded + + ############### + # ENTRY TYPES # + ############### + + BIODB.COMPOUND <- 'compound' + BIODB.SPECTRUM <- 'spectrum' + + ####################### + # ENTRY CONTENT TYPES # + ####################### + + BIODB.HTML <- 'html' + BIODB.TXT <- 'txt' + BIODB.XML <- 'xml' + BIODB.CSV <- 'csv' + BIODB.DATAFRAME <- 'dataframe' + BIODB.ANY <- 'any' # Value used when we do not care about the type. ############# - # CONSTANTS # + # DATABASES # ############# - - # Entry types - RBIODB.COMPOUND <- 'compound' - RBIODB.SPECTRUM <- 'spectrum' - - # Entry content types - RBIODB.HTML <- 'html' - RBIODB.TXT <- 'txt' - RBIODB.XML <- 'xml' - RBIODB.CSV <- 'csv' - RBIODB.ANY <- 'any' - # Class names - RBIODB.CHEBI <- 'chebi' - RBIODB.KEGG <- 'kegg' - RBIODB.PUBCHEM <- 'pubchem' - RBIODB.HMDB <- 'hmdb' - RBIODB.CHEMSPIDER <- 'chemspider' - RBIODB.ENZYME <- 'enzyme' - RBIODB.LIPIDMAPS <- 'lipidmaps' - RBIODB.MIRBASE <- 'mirbase' - RBIODB.NCBIGENE <- 'ncbigene' - RBIODB.NCBICCDS <- 'ncbiccds' - RBIODB.UNIPROT <- 'uniprot' - RBIODB.MASSBANK <- 'massbank' + BIODB.CHEBI <- 'chebi' + BIODB.KEGG <- 'kegg' + BIODB.PUBCHEM <- 'pubchem' + BIODB.HMDB <- 'hmdb' + BIODB.CHEMSPIDER <- 'chemspider' + BIODB.ENZYME <- 'enzyme' + BIODB.LIPIDMAPS <- 'lipidmaps' + BIODB.MIRBASE <- 'mirbase' + BIODB.NCBIGENE <- 'ncbigene' + BIODB.NCBICCDS <- 'ncbiccds' + BIODB.UNIPROT <- 'uniprot' + BIODB.MASSBANK <- 'massbank' + BIODB.MASSFILEDB <- 'massfiledb' + + ########## + # FIELDS # + ########## - # Fields - RBIODB.COMPOUND <- 'compound' - RBIODB.ACCESSION <- 'accession' - RBIODB.DESCRIPTION <- 'description' - RBIODB.PROTEIN.DESCRIPTION <- 'protdesc' - RBIODB.NAME <- 'name' - RBIODB.FULLNAMES <- 'fullnames' - RBIODB.SYNONYMS <- 'synonyms' - RBIODB.SYMBOL <- 'symbol' - RBIODB.GENE.SYMBOLS <- 'genesymbols' - RBIODB.CHEBI.ID <- 'chebiid' - RBIODB.LIPIDMAPS.ID <- 'lipidmapsid' - RBIODB.KEGG.ID <- 'keggid' - RBIODB.HMDB.ID <- 'hmdbid' - RBIODB.ENZYME.ID <- 'enzymeid' - RBIODB.NCBI.CCDS.ID <- 'ncbiccdsid' - RBIODB.NCBI.GENE.ID <- 'ncbigeneid' - RBIODB.PUBCHEM.ID <- 'pubchemid' - RBIODB.UNIPROT.ID <- 'uniprotid' - RBIODB.INCHI <- 'inchi' - RBIODB.INCHIKEY <- 'inchikey' - RBIODB.MSDEV <- 'msdev' - RBIODB.MSDEVTYPE <- 'msdevtype' - RBIODB.MSTYPE <- 'mstype' - RBIODB.MSMODE <- 'msmode' - RBIODB.MSPRECMZ <- 'msprecmz' # numeric - RBIODB.MSPRECANNOT <- 'msprecannot' - RBIODB.FORMULA <- 'formula' - RBIODB.SUPER.CLASS <- 'superclass' - RBIODB.MASS <- 'mass' - RBIODB.AVERAGE.MASS <- 'averagemass' - RBIODB.MONOISOTOPIC.MASS <- 'monoisotopicmass' - RBIODB.SEQUENCE <- 'sequence' - RBIODB.LOCATION <- 'location' - RBIODB.LENGTH <- 'length' - RBIODB.NB.PEAKS <- 'nbpeaks' - RBIODB.NB.PEAKS <- 'nbpeaks' - RBIODB.PEAKS <- 'peaks' + BIODB.ACCESSION <- 'accession' + BIODB.DESCRIPTION <- 'description' + BIODB.PROTEIN.DESCRIPTION <- 'protdesc' + BIODB.NAME <- 'name' + BIODB.FULLNAMES <- 'fullnames' + BIODB.SYNONYMS <- 'synonyms' + BIODB.SYMBOL <- 'symbol' + BIODB.GENE.SYMBOLS <- 'genesymbols' + BIODB.CHEBI.ID <- 'chebiid' + BIODB.LIPIDMAPS.ID <- 'lipidmapsid' + BIODB.KEGG.ID <- 'keggid' + BIODB.HMDB.ID <- 'hmdbid' + BIODB.ENZYME.ID <- 'enzymeid' + BIODB.NCBI.CCDS.ID <- 'ncbiccdsid' + BIODB.NCBI.GENE.ID <- 'ncbigeneid' + BIODB.PUBCHEM.ID <- 'pubchemid' + BIODB.UNIPROT.ID <- 'uniprotid' + BIODB.INCHI <- 'inchi' + BIODB.INCHIKEY <- 'inchikey' + BIODB.MSDEV <- 'msdev' + BIODB.MSDEVTYPE <- 'msdevtype' + BIODB.MSTYPE <- 'mstype' + BIODB.MSMODE <- 'msmode' + BIODB.MSPRECMZ <- 'msprecmz' # numeric + BIODB.MSPRECANNOT <- 'msprecannot' + BIODB.FORMULA <- 'formula' + BIODB.SUPER.CLASS <- 'superclass' + BIODB.MASS <- 'mass' + BIODB.AVERAGE.MASS <- 'averagemass' + BIODB.MONOISOTOPIC.MASS <- 'monoisotopicmass' + BIODB.SEQUENCE <- 'sequence' + BIODB.LOCATION <- 'location' + BIODB.LENGTH <- 'length' + BIODB.NB.PEAKS <- 'nbpeaks' + BIODB.PEAKS <- 'peaks' + BIODB.COMPOUND.ID <- 'compoundid' + BIODB.PEAK.MZ <- 'peakmz' + BIODB.PEAK.COMP <- 'peakcomp' # Peak composition + BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution + BIODB.CHROM.COL <- 'chromcol' # Chromatographic column + BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column + BIODB.ID <- 'id' + BIODB.TITLE <- 'title' # Mode values - RBIODB.MSMODE.NEG <- 'neg' - RBIODB.MSMODE.POS <- 'pos' + BIODB.MSMODE.NEG <- 'neg' + BIODB.MSMODE.POS <- 'pos' + + ################# + # CARDINALITIES # + ################# - # Cardinalities - RBIODB.CARD.ONE <- '1' - RBIODB.CARD.MANY <- '*' + BIODB.CARD.ONE <- '1' + BIODB.CARD.MANY <- '*' - # Field attributes - RBIODB.FIELDS <- data.frame(matrix(c( + ########################## + # ENTRY FIELD ATTRIBUTES # + ########################## + + BIODB.FIELDS <- data.frame(matrix(c( # FIELD NAME CLASS CARDINALITY - RBIODB.COMPOUND, 'BiodEntry', RBIODB.CARD.ONE, - RBIODB.ACCESSION, 'character', RBIODB.CARD.ONE, - RBIODB.DESCRIPTION, 'character', RBIODB.CARD.ONE, - RBIODB.NAME, 'character', RBIODB.CARD.ONE, - RBIODB.FULLNAMES, 'character', RBIODB.CARD.MANY, - RBIODB.SYNONYMS, 'character', RBIODB.CARD.MANY, - RBIODB.PROTEIN.DESCRIPTION, 'character', RBIODB.CARD.ONE, - RBIODB.SYMBOL, 'character', RBIODB.CARD.ONE, - RBIODB.GENE.SYMBOLS, 'character', RBIODB.CARD.MANY, - RBIODB.CHEBI.ID, 'character', RBIODB.CARD.ONE, - RBIODB.LIPIDMAPS.ID, 'character', RBIODB.CARD.ONE, - RBIODB.KEGG.ID, 'character', RBIODB.CARD.ONE, - RBIODB.HMDB.ID, 'character', RBIODB.CARD.ONE, - RBIODB.ENZYME.ID, 'character', RBIODB.CARD.ONE, - RBIODB.PUBCHEM.ID, 'character', RBIODB.CARD.ONE, - RBIODB.UNIPROT.ID, 'character', RBIODB.CARD.ONE, - RBIODB.NCBI.CCDS.ID, 'character', RBIODB.CARD.ONE, - RBIODB.NCBI.GENE.ID, 'character', RBIODB.CARD.ONE, - RBIODB.INCHI, 'character', RBIODB.CARD.ONE, - RBIODB.INCHIKEY, 'character', RBIODB.CARD.ONE, - RBIODB.MSDEV, 'character', RBIODB.CARD.ONE, - RBIODB.MSDEVTYPE, 'character', RBIODB.CARD.ONE, - RBIODB.MSTYPE, 'character', RBIODB.CARD.ONE, - RBIODB.MSMODE, 'character', RBIODB.CARD.ONE, - RBIODB.MSPRECMZ, 'double', RBIODB.CARD.ONE, - RBIODB.MSPRECANNOT, 'character', RBIODB.CARD.ONE, - RBIODB.FORMULA, 'character', RBIODB.CARD.ONE, - RBIODB.SUPER.CLASS, 'character', RBIODB.CARD.ONE, - RBIODB.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.AVERAGE.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.MONOISOTOPIC.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.SEQUENCE, 'character', RBIODB.CARD.ONE, - RBIODB.LENGTH, 'integer', RBIODB.CARD.ONE, - RBIODB.LOCATION, 'character', RBIODB.CARD.ONE, - RBIODB.NB.PEAKS, 'integer', RBIODB.CARD.ONE, - RBIODB.PEAKS, 'data.frame', RBIODB.CARD.ONE + BIODB.COMPOUND, 'BiodEntry', BIODB.CARD.ONE, + BIODB.ACCESSION, 'character', BIODB.CARD.ONE, + BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE, + BIODB.NAME, 'character', BIODB.CARD.ONE, + BIODB.FULLNAMES, 'character', BIODB.CARD.MANY, + BIODB.SYNONYMS, 'character', BIODB.CARD.MANY, + BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE, + BIODB.SYMBOL, 'character', BIODB.CARD.ONE, + BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY, + BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE, + BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE, + BIODB.KEGG.ID, 'character', BIODB.CARD.ONE, + BIODB.HMDB.ID, 'character', BIODB.CARD.ONE, + BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE, + BIODB.PUBCHEM.ID, 'character', BIODB.CARD.ONE, + BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE, + BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE, + BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE, + BIODB.INCHI, 'character', BIODB.CARD.ONE, + BIODB.INCHIKEY, 'character', BIODB.CARD.ONE, + BIODB.MSDEV, 'character', BIODB.CARD.ONE, + BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE, + BIODB.MSTYPE, 'character', BIODB.CARD.ONE, + BIODB.MSMODE, 'character', BIODB.CARD.ONE, + BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE, + BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE, + BIODB.FORMULA, 'character', BIODB.CARD.ONE, + BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE, + BIODB.MASS, 'double', BIODB.CARD.ONE, + BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE, + BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE, + BIODB.SEQUENCE, 'character', BIODB.CARD.ONE, + BIODB.LENGTH, 'integer', BIODB.CARD.ONE, + BIODB.LOCATION, 'character', BIODB.CARD.ONE, + BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE, + BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE ), byrow = TRUE, ncol = 3), stringsAsFactors = FALSE) - colnames(RBIODB.FIELDS) <- c('name', 'class', 'cardinality') + colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality') - # How to compute a missing field ? - RBIODB.FIELD.COMPUTING <- list() - RBIODB.FIELD.COMPUTING[[RBIODB.INCHI]] <- c(RBIODB.CHEBI) - RBIODB.FIELD.COMPUTING[[RBIODB.INCHIKEY]] <- c(RBIODB.CHEBI) - RBIODB.FIELD.COMPUTING[[RBIODB.SEQUENCE]] <- c(RBIODB.NCBICCDS) + ##################### + # COMPUTABLE FIELDS # + ##################### + + BIODB.FIELD.COMPUTING <- list() + BIODB.FIELD.COMPUTING[[BIODB.INCHI]] <- c(BIODB.CHEBI) + BIODB.FIELD.COMPUTING[[BIODB.INCHIKEY]] <- c(BIODB.CHEBI) + BIODB.FIELD.COMPUTING[[BIODB.SEQUENCE]] <- c(BIODB.NCBICCDS) - # Peaks data frame columns - RBIODB.PEAK.MZ <- 'mz' - RBIODB.PEAK.FORMULA <- 'formula' - RBIODB.PEAK.FORMULA.COUNT <- 'formula.count' - RBIODB.PEAK.MASS <- 'mass' - RBIODB.PEAK.ERROR.PPM <- 'error.ppm' - RBIODB.PEAK.INTENSITY <- 'intensity' - RBIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity' - RBIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) - colnames(RBIODB.PEAK.DF.EXAMPLE) <- c(RBIODB.PEAK.MZ, RBIODB.PEAK.INTENSITY, RBIODB.PEAK.RELATIVE.INTENSITY, RBIODB.PEAK.FORMULA, RBIODB.PEAK.FORMULA.COUNT, RBIODB.PEAK.MASS, RBIODB.PEAK.ERROR.PPM) + #################### + # PEAKS DATA FRAME # + #################### + + # Columns + BIODB.PEAK.MZ <- 'mz' + BIODB.PEAK.FORMULA <- 'formula' + BIODB.PEAK.FORMULA.COUNT <- 'formula.count' + BIODB.PEAK.MASS <- 'mass' + BIODB.PEAK.ERROR.PPM <- 'error.ppm' + BIODB.PEAK.INTENSITY <- 'intensity' + BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity' + + # Example + BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) + colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM) ################# # GET ENTRY URL # ################# # TODO Let the choice to use either jp or eu - RBIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/getRecordInfo" - RBIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/getRecordInfo" + BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/getRecordInfo" + BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/getRecordInfo" - get.entry.url <- function(class, accession, content.type = RBIODB.ANY) { + .do.get.entry.url <- function(class, accession, content.type = BIODB.ANY) { + + # TODO Only Massbank can handle multiple accession ids + if (class != 'massbank' && length(accession) > 1) + stop(paste0("Cannot build a URL for getting multiple entries for class ", class, ".")) url <- switch(class, - chebi = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, - chemspider = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html') else NULL, - enzyme = if (content.type %in% c(RBIODB.ANY, RBIODB.TXT)) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, + chebi = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, + chemspider = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html') else NULL, + enzyme = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, hmdb = switch(content.type, xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'), html = paste0('http://www.hmdb.ca/metabolites/', accession), @@ -159,9 +194,9 @@ html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), any = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), NULL), - lipidmaps = if (content.type %in% c(RBIODB.ANY, RBIODB.CSV)) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, - massbank = if (content.type %in% c(RBIODB.ANY, RBIODB.TXT)) paste0(RBIODB.MASSBANK.EU.WS.URL, '?ids=', paste(accession, collapse = ',')) else NULL, - mirbase = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, + lipidmaps = if (content.type %in% c(BIODB.ANY, BIODB.CSV)) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, + massbank = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0(BIODB.MASSBANK.EU.WS.URL, '?ids=', paste(accession, collapse = ',')) else NULL, + mirbase = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, pubchem = { accession <- gsub(' ', '', accession, perl = TRUE) accession <- gsub('^CID', '', accession, perl = TRUE) @@ -170,12 +205,49 @@ html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession), NULL) }, - ncbigene = if (content.type %in% c(RBIODB.ANY, RBIODB.XML)) paste0('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, - ncbiccds = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), - uniprot = if (content.type %in% c(RBIODB.ANY, RBIODB.XML)) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), + ncbigene = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, + ncbiccds = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), + uniprot = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), NULL ) return(url) } + + get.entry.url <- function(class, accession, content.type = BIODB.ANY, max.length = 0) { + + if (length(accession) == 0) + return(NULL) + + full.url <- .do.get.entry.url(class, accession, content.type = content.type) + if (max.length == 0 || nchar(full.url) <= max.length) + return(if (max.length == 0) full.url else list(url = full.url, n = length(accession))) + + # Find max size URL + a <- 1 + b <- length(accession) + while (a < b) { + m <- as.integer((a + b) / 2) + url <- .do.get.entry.url(class, accession[1:m], content.type = content.type) + if (nchar(url) <= max.length && m != a) + a <- m + else + b <- m + } + url <- .do.get.entry.url(class, accession[1:a], content.type = content.type) + + return(list( url = url, n = a)) + } + + ################# + # PRINT MESSAGE # + ################# + + BIODB.DEBUG <- 1 + BIODB.LEVEL.NAMES <- c('DEBUG') + + .print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) { + cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr()) + } + } diff -r e66bb061af06 -r 253d531a0193 lcmsmatching.xml --- a/lcmsmatching.xml Tue Jul 12 12:02:37 2016 -0400 +++ b/lcmsmatching.xml Sat Sep 03 17:02:01 2016 -0400 @@ -1,6 +1,6 @@ - + - Matching of mz/rt values onto local reference compound database. + Annotation of MS peaks using matching on a spectra database. R @@ -8,17 +8,33 @@ r-stringr r-plyr r-xml + r-bitops + r-rcurl + r-rjsonio + + - + - - + + + + + + + + + + + + + - - + + + + + - - + + + + + + + + - + - + + + - --> - - - - - - - - - - + + + - - - - - + + - - - - - - - - - - - - - - - - - - - - + + + + @@ -221,18 +223,35 @@ - + - + + + + - - + + +