Mercurial > repos > prog > lcmsmatching
changeset 1:253d531a0193 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 36c9d8099c20a1ae848f1337c16564335dd8fb2b
author | prog |
---|---|
date | Sat, 03 Sep 2016 17:02:01 -0400 |
parents | e66bb061af06 |
children | 20d69a062da3 |
files | BiodbConn.R BiodbEntry.R BiodbFactory.R ChebiCompound.R ChebiConn.R ChemSpiderConn.R ChemspiderCompound.R EnzymeCompound.R EnzymeConn.R HmdbCompound.R HmdbConn.R KeggCompound.R KeggConn.R LipidmapsCompound.R LipidmapsConn.R MassFiledbConn.R MassbankCompound.R MassbankConn.R MassbankSpectrum.R MassdbConn.R MirbaseCompound.R MirbaseConn.R MsDbOutputDataFrameStream.R MsPeakForestDb.R NcbiCcdsCompound.R NcbiCcdsConn.R NcbiGeneCompound.R NcbiGeneConn.R PubchemCompound.R PubchemConn.R RemotedbConn.R UniProtCompound.R UniProtConn.R biodb-common.R lcmsmatching.xml list-chrom-cols.py search-mz |
diffstat | 37 files changed, 1080 insertions(+), 456 deletions(-) [+] |
line wrap: on
line diff
--- a/BiodbConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/BiodbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,33 +1,30 @@ -if ( ! exists('BiodbConn')) { # Do not load again if already loaded +if ( ! exists('BiodbConn')) { - source(file.path('UrlRequestScheduler.R'), chdir = TRUE) source('biodb-common.R') ##################### # CLASS DECLARATION # ##################### - BiodbConn <- setRefClass("BiodbConn", fields = list(.scheduler = "UrlRequestScheduler")) + BiodbConn <- setRefClass("BiodbConn", fields = list( .debug = "logical" )) ############### # CONSTRUCTOR # ############### - BiodbConn$methods( initialize = function(useragent = NA_character_, scheduler = NULL, ...) { - - # Check useragent - ! is.null(useragent) && ! is.na(useragent) || stop("You must specify a valid useragent.") + BiodbConn$methods( initialize = function(debug = FALSE, ...) { + .debug <<- debug + }) - # Set scheduler - if (is.null(scheduler)) - scheduler <- UrlRequestScheduler$new(n = 3) - inherits(scheduler, "UrlRequestScheduler") || stop("The scheduler instance must inherit from UrlRequestScheduler class.") - scheduler$setUserAgent(useragent) # set agent - .scheduler <<- scheduler - - callSuper(...) # calls super-class initializer with remaining parameters + ####################### + # PRINT DEBUG MESSAGE # + ####################### + + BiodbConn$methods( .print.debug.msg = function(msg) { + if (.self$.debug) + .print.msg(msg = msg, class = class(.self)) }) - + ###################### # HANDLES ENTRY TYPE # ###################### @@ -59,7 +56,7 @@ # Download entry content from the public database. # type The entry type. - # id The ID of the enttry to get. + # id The ID of the entry to get. # RETURN An entry content downloaded from database. BiodbConn$methods( getEntryContent = function(type, id) { stop("Method getCompound() is not implemented in concrete class.") @@ -75,4 +72,22 @@ BiodbConn$methods( createEntry = function(type, content, drop = TRUE) { stop("Method createEntry() is not implemented in concrete class.") }) + + ################# + # GET ENTRY IDS # + ################# + + # Get a list of IDs of all entries contained in this database. + BiodbConn$methods( getEntryIds = function(type) { + stop("Method getEntryIds() is not implemented in concrete class.") + }) + + ################## + # GET NB ENTRIES # + ################## + + # Get the number of entries contained in this database. + BiodbConn$methods( getNbEntries = function(type) { + stop("Method getNbEntries() is not implemented in concrete class.") + }) }
--- a/BiodbEntry.R Tue Jul 12 12:02:37 2016 -0400 +++ b/BiodbEntry.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,16 +20,20 @@ callSuper(...) # calls super-class initializer with remaining parameters }) - ############# - # SET FIELD # - ############# + ################### + # SET FIELD VALUE # + ################### + BiodbEntry$methods( setFieldValue = function(field, value) { + .self$setField(field, value) + }) + BiodbEntry$methods( setField = function(field, value) { class = .self$getFieldClass(field) # Check cardinality - if (class != 'data.frame' && .self$getFieldCardinality(field) == RBIODB.CARD.ONE && length(value) > 1) + if (class != 'data.frame' && .self$getFieldCardinality(field) == BIODB.CARD.ONE && length(value) > 1) stop(paste0('Cannot set more that one value to single value field "', field, '" in BiodEntry.')) # Check value class @@ -45,15 +49,23 @@ }) ################### + # GET FIELD NAMES # + ################### + + BiodbEntry$methods( getFieldNames = function(field) { + return(names(.self$.fields)) + }) + + ################### # GET FIELD CLASS # ################### - + BiodbEntry$methods( getFieldClass = function(field) { - if ( ! field %in% RBIODB.FIELDS[['name']]) + if ( ! field %in% BIODB.FIELDS[['name']]) stop(paste0('Unknown field "', field, '" in BiodEntry.')) - field.class <- RBIODB.FIELDS[which(field == RBIODB.FIELDS[['name']]), 'class'] + field.class <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'class'] return(field.class) }) @@ -64,21 +76,25 @@ BiodbEntry$methods( getFieldCardinality = function(field) { - if ( ! field %in% RBIODB.FIELDS[['name']]) + if ( ! field %in% BIODB.FIELDS[['name']]) stop(paste0('Unknown field "', field, '" in BiodEntry.')) - field.card <- RBIODB.FIELDS[which(field == RBIODB.FIELDS[['name']]), 'cardinality'] + field.card <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'cardinality'] return(field.card) }) - ############# - # GET FIELD # - ############# + ################### + # GET FIELD VALUE # + ################### + BiodbEntry$methods( getFieldValue = function(field) { + return(.self$getField(field)) + }) + BiodbEntry$methods( getField = function(field) { - if ( ! field %in% RBIODB.FIELDS[['name']]) + if ( ! field %in% BIODB.FIELDS[['name']]) stop(paste0('Unknown field "', field, '" in BiodEntry.')) if (field %in% names(.self$.fields)) @@ -97,11 +113,11 @@ BiodbEntry$methods( .compute.field = function(field) { - if ( ! is.null(.self$.factory) && field %in% names(RBIODB.FIELD.COMPUTING)) { - for (db in RBIODB.FIELD.COMPUTING[[field]]) { + if ( ! is.null(.self$.factory) && field %in% names(BIODB.FIELD.COMPUTING)) { + for (db in BIODB.FIELD.COMPUTING[[field]]) { db.id <- .self$getField(paste0(db, 'id')) if ( ! is.na(db.id)) { - db.compound <- .self$.factory$createEntry(db, type = RBIODB.COMPOUND, id = db.id) + db.compound <- .self$.factory$createEntry(db, type = BIODB.COMPOUND, id = db.id) if ( ! is.null(db.compound)) { .self$setField(field, db.compound$getField(field)) return(TRUE) @@ -112,6 +128,24 @@ return(FALSE) }) + + ############################ + # GET FIELDS AS DATA FRAME # + ############################ + + BiodbEntry$methods( getFieldsAsDataFrame = function(field) { + + df <- data.frame() + + # Loop on all fields + for (f in names(.self$.fields)) + + # If field class is a basic type + if (.self$getFieldClass(f) %in% c('character', 'logical', 'integer', 'double')) + df[1, f] <- .self$getFieldValue(f) + + return(df) + }) ########### # FACTORY #
--- a/BiodbFactory.R Tue Jul 12 12:02:37 2016 -0400 +++ b/BiodbFactory.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,6 +1,7 @@ if ( ! exists('BiodbFactory')) { # Do not load again if already loaded library(methods) + source('biodb-common.R') source('ChebiConn.R') source('KeggConn.R') source('PubchemConn.R') @@ -13,27 +14,37 @@ source('NcbiccdsConn.R') source('UniprotConn.R') source('MassbankConn.R') + source('MassFiledbConn.R') ##################### # CLASS DECLARATION # ##################### - BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character")) + BiodbFactory <- setRefClass("BiodbFactory", fields = list(.useragent = "character", .conn = "list", .cache.dir = "character", .debug = "logical")) ############### # CONSTRUCTOR # ############### - BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, ...) { + BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, debug = FALSE, ...) { - ( ! is.null(useragent) && ! is.na(useragent)) || stop("You must provide a user agent string (e.g.: \"myapp ; my.email@address\").") .useragent <<- useragent .conn <<- list() .cache.dir <<- cache.dir + .debug <<- debug callSuper(...) # calls super-class initializer with remaining parameters }) + ####################### + # PRINT DEBUG MESSAGE # + ####################### + + BiodbFactory$methods( .print.debug.msg = function(msg) { + if (.self$.debug) + .print.msg(msg = msg, class = class(.self)) + }) + ################## # GET USER AGENT # ################## @@ -42,11 +53,19 @@ return(.self$.useragent) }) + ################## + # SET USER AGENT # + ################## + + BiodbFactory$methods( setUserAgent = function(useragent) { + .useragent <<- useragent + }) + ############ # GET CONN # ############ - BiodbFactory$methods( getConn = function(class) { + BiodbFactory$methods( getConn = function(class, url = NA_character_) { if ( ! class %in% names(.self$.conn)) { @@ -63,7 +82,8 @@ ncbigene = NcbigeneConn$new(useragent = .self$.useragent), ncbiccds = NcbiccdsConn$new(useragent = .self$.useragent), uniprot = UniprotConn$new(useragent = .self$.useragent), - massbank = MassbankConn$new(useragent = .self$.useragent), + massbank = MassbankConn$new(useragent = .self$.useragent, debug = .self$.debug), + massfiledb = MassFiledbConn$new(file = url), NULL) # Unknown class @@ -85,6 +105,9 @@ is.null(id) && is.null(content) && stop("One of id or content must be set.") ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.") + # Debug + .self$.print.debug.msg(paste0("Creating entry from ", if (is.null(id)) "content" else paste("id", id), "...")) + # Get content if ( ! is.null(id)) content <- .self$getEntryContent(class, type, id) @@ -151,25 +174,52 @@ # GET ENTRY CONTENT # ##################### - BiodbFactory$methods( getEntryContent = function(class, type, id) { + BiodbFactory$methods( getEntryContent = function(class, type, id, chunk.size = NA_integer_) { + + # Debug + .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)...")) content <- NULL # Load from cache if ( ! is.na(.self$.cache.dir)) content <- .self$.load.content.from.cache(class, type, id) - # Get contents + # Get list of missing contents missing.content.indexes <- vapply(content, is.null, FUN.VALUE = TRUE) missing.ids <- if (is.null(content)) id else id[missing.content.indexes] + + # Debug + if ( ! is.na(.self$.cache.dir)) { + .self$.print.debug.msg(paste0(length(id) - length(missing.ids), " entry content(s) loaded from cache.")) + .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched.")) + } + + # Get contents if (length(missing.ids) > 0) { # Use connector to get missing contents conn <- .self$getConn(class) - missing.contents <- conn$getEntryContent(type, missing.ids) + + # Divide list of missing ids in chunks (in order to save in cache regularly) + chunks.of.missing.ids = if (is.na(chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / chunk.size)) + + # Loop on chunks + missing.contents <- NULL + for (ch.missing.ids in chunks.of.missing.ids) { + + ch.missing.contents <- conn$getEntryContent(type, ch.missing.ids) - # Save to cache - if ( ! is.null(missing.contents) && ! is.na(.self$.cache.dir)) - .self$.save.content.to.cache(class, type, missing.ids, missing.contents) + # Save to cache + if ( ! is.null(ch.missing.contents) && ! is.na(.self$.cache.dir)) + .self$.save.content.to.cache(class, type, ch.missing.ids, ch.missing.contents) + + # Append + missing.contents <- c(missing.contents, ch.missing.contents) + + # Debug + if ( ! is.na(.self$.cache.dir)) + .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved...")) + } # Merge content and missing.contents if (is.null(content))
--- a/ChebiCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/ChebiCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,9 +20,9 @@ # Define xpath expressions xpath.expr <- character() -# xpath.expr[[RBIODB.ACCESSION]] <- "//b[starts-with(., 'CHEBI:')]" - xpath.expr[[RBIODB.INCHI]] <- "//td[starts-with(., 'InChI=')]" - xpath.expr[[RBIODB.INCHIKEY]] <- "//td[text()='InChIKey']/../td[2]" +# xpath.expr[[BIODB.ACCESSION]] <- "//b[starts-with(., 'CHEBI:')]" + xpath.expr[[BIODB.INCHI]] <- "//td[starts-with(., 'InChI=')]" + xpath.expr[[BIODB.INCHIKEY]] <- "//td[text()='InChIKey']/../td[2]" for (html in contents) { @@ -43,14 +43,14 @@ accession <- xpathSApply(xml, "//b[starts-with(., 'CHEBI:')]", xmlValue) if (length(accession) > 0) { accession <- sub('^CHEBI:([0-9]+)$', '\\1', accession, perl = TRUE) - compound$setField(RBIODB.ACCESSION, accession) + compound$setField(BIODB.ACCESSION, accession) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/ChebiConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/ChebiConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,35 +1,35 @@ if ( ! exists('ChebiConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('ChebiCompound.R') ##################### # CLASS DECLARATION # ##################### - ChebiConn <- setRefClass("ChebiConn", contains = "BiodbConn") + ChebiConn <- setRefClass("ChebiConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## ChebiConn$methods( getEntryContentType = function(type) { - return(RBIODB.HTML) + return(BIODB.HTML) }) ##################### # GET ENTRY CONTENT # ##################### - + ChebiConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.CHEBI, x)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.CHEBI, x)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ ChebiConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createChebiCompoundFromHtml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createChebiCompoundFromHtml(content, drop = drop) else NULL) }) } # end of load safe guard
--- a/ChemSpiderConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/ChemSpiderConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('ChemspiderConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('ChemspiderCompound.R') ##################### # CLASS DECLARATION # ##################### - ChemspiderConn <- setRefClass("ChemspiderConn", contains = "BiodbConn") + ChemspiderConn <- setRefClass("ChemspiderConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## ChemspiderConn$methods( getEntryContentType = function(type) { - return(RBIODB.HTML) + return(BIODB.HTML) }) ##################### @@ -23,13 +23,13 @@ ChemspiderConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.CHEMSPIDER, x)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.CHEMSPIDER, x)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ ChemspiderConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createChemspiderCompoundFromHtml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createChemspiderCompoundFromHtml(content, drop = drop) else NULL) }) ############################
--- a/ChemspiderCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/ChemspiderCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -40,14 +40,14 @@ accession <- xpathSApply(xml, "//li[starts-with(., 'ChemSpider ID')]", xmlValue) if (length(accession) > 0) { accession <- sub('^ChemSpider ID([0-9]+)$', '\\1', accession, perl = TRUE) - compound$setField(RBIODB.ACCESSION, accession) + compound$setField(BIODB.ACCESSION, accession) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/EnzymeCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/EnzymeCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,8 +20,8 @@ # Define fields regex regex <- character() - regex[[RBIODB.ACCESSION]] <- "^ID\\s+([0-9.]+)$" - regex[[RBIODB.DESCRIPTION]] <- "^DE\\s+(.+)$" + regex[[BIODB.ACCESSION]] <- "^ID\\s+([0-9.]+)$" + regex[[BIODB.DESCRIPTION]] <- "^DE\\s+(.+)$" for (text in contents) { @@ -49,7 +49,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/EnzymeConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/EnzymeConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('EnzymeConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('EnzymeCompound.R') ##################### # CLASS DECLARATION # ##################### - EnzymeConn <- setRefClass("EnzymeConn", contains = "BiodbConn") + EnzymeConn <- setRefClass("EnzymeConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## EnzymeConn$methods( getEntryContentType = function(type) { - return(RBIODB.TXT) + return(BIODB.TXT) }) ##################### @@ -23,13 +23,13 @@ EnzymeConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.ENZYME, accession = x, content.type = RBIODB.TXT)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.ENZYME, accession = x, content.type = BIODB.TXT)), FUN.VALUE = '') return(content) } @@ -42,6 +42,6 @@ ################ EnzymeConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createEnzymeCompoundFromTxt(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createEnzymeCompoundFromTxt(content, drop = drop) else NULL) }) }
--- a/HmdbCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/HmdbCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,13 +20,13 @@ # Define xpath expressions xpath.expr <- character() - xpath.expr[[RBIODB.ACCESSION]] <- "/metabolite/accession" - xpath.expr[[RBIODB.KEGG.ID]] <- "//kegg_id" - xpath.expr[[RBIODB.NAME]] <- "/metabolite/name" - xpath.expr[[RBIODB.FORMULA]] <- "/metabolite/chemical_formula" - xpath.expr[[RBIODB.SUPER.CLASS]] <- "//super_class" - xpath.expr[[RBIODB.AVERAGE.MASS]] <- "//average_molecular_weight" - xpath.expr[[RBIODB.MONOISOTOPIC.MASS]] <- "//monisotopic_moleculate_weight" + xpath.expr[[BIODB.ACCESSION]] <- "/metabolite/accession" + xpath.expr[[BIODB.KEGG.ID]] <- "//kegg_id" + xpath.expr[[BIODB.NAME]] <- "/metabolite/name" + xpath.expr[[BIODB.FORMULA]] <- "/metabolite/chemical_formula" + xpath.expr[[BIODB.SUPER.CLASS]] <- "//super_class" + xpath.expr[[BIODB.AVERAGE.MASS]] <- "//average_molecular_weight" + xpath.expr[[BIODB.MONOISOTOPIC.MASS]] <- "//monisotopic_moleculate_weight" for (content in contents) { @@ -52,7 +52,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/HmdbConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/HmdbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('HmdbConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('HmdbCompound.R') ##################### # CLASS DECLARATION # ##################### - HmdbConn <- setRefClass("HmdbConn", contains = "BiodbConn") + HmdbConn <- setRefClass("HmdbConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## HmdbConn$methods( getEntryContentType = function(type) { - return(RBIODB.XML) + return(BIODB.XML) }) ##################### @@ -23,13 +23,13 @@ HmdbConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.HMDB, x, content.type = RBIODB.XML)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.HMDB, x, content.type = BIODB.XML)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ HmdbConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createHmdbCompoundFromXml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createHmdbCompoundFromXml(content, drop = drop) else NULL) }) } # end of load safe guard
--- a/KeggCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/KeggCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,9 +20,9 @@ # Define fields regex regex <- character() - regex[[RBIODB.NAME]] <- "^NAME\\s+([^,;]+)" - regex[[RBIODB.CHEBI.ID]] <- "^\\s+ChEBI:\\s+(\\S+)" - regex[[RBIODB.LIPIDMAPS.ID]] <- "^\\s+LIPIDMAPS:\\s+(\\S+)" + regex[[BIODB.NAME]] <- "^NAME\\s+([^,;]+)" + regex[[BIODB.CHEBI.ID]] <- "^\\s+ChEBI:\\s+(\\S+)" + regex[[BIODB.LIPIDMAPS.ID]] <- "^\\s+LIPIDMAPS:\\s+(\\S+)" for (text in contents) { @@ -50,26 +50,26 @@ # ENZYME ID g <- str_match(s, "^ENTRY\\s+EC\\s+(\\S+)") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.ACCESSION, paste('ec', g[1,2], sep = ':')) + compound$setField(BIODB.ACCESSION, paste('ec', g[1,2], sep = ':')) # ENTRY ID else { g <- str_match(s, "^ENTRY\\s+(\\S+)\\s+Compound") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.ACCESSION, paste('cpd', g[1,2], sep = ':')) + compound$setField(BIODB.ACCESSION, paste('cpd', g[1,2], sep = ':')) # OTHER ID else { g <- str_match(s, "^ENTRY\\s+(\\S+)") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.ACCESSION, g[1,2]) + compound$setField(BIODB.ACCESSION, g[1,2]) } } # ORGANISM g <- str_match(s, "^ORGANISM\\s+(\\S+)") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.ACCESSION, paste(g[1,2], compound$getField(RBIODB.ACCESSION), sep = ':')) + compound$setField(BIODB.ACCESSION, paste(g[1,2], compound$getField(BIODB.ACCESSION), sep = ':')) } } @@ -77,7 +77,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/KeggConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/KeggConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('KeggConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('KeggCompound.R') ##################### # CLASS DECLARATION # ##################### - KeggConn <- setRefClass("KeggConn", contains = "BiodbConn") + KeggConn <- setRefClass("KeggConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## KeggConn$methods( getEntryContentType = function(type) { - return(RBIODB.TXT) + return(BIODB.TXT) }) ##################### @@ -23,13 +23,13 @@ KeggConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.KEGG, x, content.type = RBIODB.TXT)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.KEGG, x, content.type = BIODB.TXT)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ KeggConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createKeggCompoundFromTxt(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createKeggCompoundFromTxt(content, drop = drop) else NULL) }) } # end of load safe guard
--- a/LipidmapsCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/LipidmapsCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -19,12 +19,12 @@ # Mapping column names col2field <- list() - col2field[[RBIODB.NAME]] <- 'COMMON_NAME' - col2field[[RBIODB.ACCESSION]] <- 'LM_ID' - col2field[[RBIODB.KEGG.ID]] <- 'KEGG_ID' - col2field[[RBIODB.HMDB.ID]] <- 'HMDBID' - col2field[[RBIODB.MASS]] <- 'MASS' - col2field[[RBIODB.FORMULA]] <- 'FORMULA' + col2field[[BIODB.NAME]] <- 'COMMON_NAME' + col2field[[BIODB.ACCESSION]] <- 'LM_ID' + col2field[[BIODB.KEGG.ID]] <- 'KEGG_ID' + col2field[[BIODB.HMDB.ID]] <- 'HMDBID' + col2field[[BIODB.MASS]] <- 'MASS' + col2field[[BIODB.FORMULA]] <- 'FORMULA' for (text in contents) { @@ -59,7 +59,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/LipidmapsConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/LipidmapsConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,13 +1,13 @@ if ( ! exists('LipdmapsConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('LipidmapsCompound.R') ##################### # CLASS DECLARATION # ##################### - LipidmapsConn <- setRefClass("LipidmapsConn", contains = "BiodbConn") + LipidmapsConn <- setRefClass("LipidmapsConn", contains = "RemotedbConn") ############### # CONSTRUCTOR # @@ -24,7 +24,7 @@ ########################## LipidmapsConn$methods( getEntryContentType = function(type) { - return(RBIODB.CSV) + return(BIODB.CSV) }) ##################### @@ -33,13 +33,13 @@ LipidmapsConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.LIPIDMAPS, x, content.type = RBIODB.CSV)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.LIPIDMAPS, x, content.type = BIODB.CSV)), FUN.VALUE = '') return(content) } @@ -52,6 +52,6 @@ ################ LipidmapsConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createLipidmapsCompoundFromCsv(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createLipidmapsCompoundFromCsv(content, drop = drop) else NULL) }) }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MassFiledbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -0,0 +1,258 @@ +if ( ! exists('MassFiledbConn')) { + + source('MassdbConn.R') + + # LCMS File db. + # In this type of database, a single file is provided in CSV format. Default separator is tabulation. + # Each line is a MS peak measure, . + # The file contains molecule and spectrum information. Each spectrum has an accession id. + + # TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue + + ############# + # CONSTANTS # + ############# + + # Default database fields + .BIODB.DFT.DB.FIELDS <- list() + for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZ, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS)) + .BIODB.DFT.DB.FIELDS[[f]] <- f + + ##################### + # CLASS DECLARATION # + ##################### + + MassFiledbConn <- setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .fields = "list", .ms.modes = "character")) + + ############### + # CONSTRUCTOR # + ############### + + MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) { + + # Check file + (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.") + file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\".")) + + # Set fields + .db <<- NULL + .file <<- file + .file.sep <<- file.sep + .file.quote <<- file.quote + .fields <<- .BIODB.DFT.DB.FIELDS + .field.multval.sep <<- ';' + .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS) + names(.self$.ms.modes) <- .self$.ms.modes + + callSuper(...) + }) + + ###################### + # Is valid field tag # + ###################### + + MassFiledbConn$methods( isValidFieldTag = function(tag) { + return (tag %in% names(.self$.fields)) + }) + + ############# + # Set field # + ############# + + MassFiledbConn$methods( setField = function(tag, colname) { + + ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.") + ( ! is.null(colname) && ! is.na(colname)) || stop("No column name specified.") + + # Load database file + .self$.init.db() + + # Check that this field tag is defined in the fields list + .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid.")) + + # Check that columns are defined in database file + all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file.")) + + # Set new definition + if (length(colname) == 1) + .fields[[tag]] <<- colname + else { + new.col <- paste(colname, collapse = ".") + .self$.db[[new.col]] <- vapply(seq(nrow(.self$.db)), function(i) { paste(.self$.db[i, colname], collapse = '.') }, FUN.VALUE = '') + .fields[[tag]] <<- new.col + } + }) + + ###################################### + # SET FIELD MULTIPLE VALUE SEPARATOR # + ###################################### + + MassFiledbConn$methods( setFieldMultValSep = function(sep) { + .field.multval.sep <<- sep + }) + + ################ + # SET MS MODES # + ################ + + MassFiledbConn$methods( setMsMode = function(mode, value) { + .self$.ms.modes[[mode]] <- value + }) + + ########################## + # GET ENTRY CONTENT TYPE # + ########################## + + MassFiledbConn$methods( getEntryContentType = function(type) { + return(BIODB.DATAFRAME) + }) + + ########### + # INIT DB # + ########### + + MassFiledbConn$methods( .init.db = function() { + + if (is.null(.self$.db)) { + + # Load database + .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL) + + # Rename columns + colnames(.self$.db) <- vapply(colnames(.self$.db), function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '') + } + }) + + ################ + # CHECK FIELDS # + ################ + + MassFiledbConn$methods( .check.fields = function(fields) { + + # Check if fields are known + unknown.fields <- names(.self$.fields)[ ! fields %in% names(.self$.fields)] + if (length(unknown.fields) > 0) + stop(paste0("Field(s) ", paste(fields, collapse = ", "), " is/are unknown.")) + + # Init db + .self$.init.db() + + # Check if fields are defined in file database + undefined.fields <- colnames(.self$.init.db)[ ! unlist(.self$.fields[fields]) %in% colnames(.self$.init.db)] + if (length(undefined.fields) > 0) + stop(paste0("Column(s) ", paste(unlist(.self$.fields[fields]), collapse = ", "), " is/are undefined in file database.")) + }) + + ################ + # EXTRACT COLS # + ################ + + MassFiledbConn$methods( .extract.cols = function(cols, mode = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) { + + x <- NULL + + if ( ! is.null(cols) && ! is.na(cols)) { + + # Init db + .self$.init.db() + + # TODO check existence of cols/fields + + # Get db, eventually filtering it. + if (is.null(mode)) + db <- .self$.db + else { + # Check mode value + mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'.")) + .self$.check.fields(BIODB.MSMODE) + + # Filter on mode + db <- .self$.db[.self$.db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], ] + } + + # Get subset + x <- db[, unlist(.self$.fields[cols]), drop = drop] + + # Rename columns + if (is.data.frame(x)) + colnames(x) <- cols + + # Rearrange + if (drop && is.vector(x)) { + if (uniq) + x <- x[ ! duplicated(x)] + if (sort) + x <- sort(x) + } + + # Cut + if ( ! is.na(max.rows)) + x <- if (is.vector(x)) x[1:max.rows] else x[1:max.rows, ] + } + + return(x) + }) + + ################# + # GET ENTRY IDS # + ################# + + MassFiledbConn$methods( getEntryIds = function(type) { + + ids <- NA_character_ + + if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND)) + ids <- as.character(.self$.extract.cols(if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE)) + + return(ids) + }) + + ################## + # GET NB ENTRIES # + ################## + + MassFiledbConn$methods( getNbEntries = function(type) { + return(length(.self$getEntryIds(type))) + }) + + ############################### + # GET CHROMATOGRAPHIC COLUMNS # + ############################### + + # Inherited from MassdbConn. + MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) { + + # Extract needed columns + db <- .self$.extract.cols(c(BIODB.COMPOUND.ID, BIODB.CHROM.COL)) + + # Filter on molecule IDs + if ( ! is.null(compound.ids)) + db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ] + + # Get column names + cols <- db[[BIODB.CHROM.COL]] + + # Remove duplicates + cols <- cols[ ! duplicated(cols)] + + # Make data frame + chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE) + colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE) + + return(chrom.cols) + }) + + ################# + # GET MZ VALUES # + ################# + + # Inherited from MassdbConn. + MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { + + # Get mz values + mz <- .self$.extract.cols(BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results) + + return(mz) + }) + +}
--- a/MassbankCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MassbankCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -28,38 +28,38 @@ for (s in lines[[1]]) { # NAME - if (is.na(compound$getField(RBIODB.NAME))) { + if (is.na(compound$getField(BIODB.NAME))) { g <- str_match(s, "^CH\\$NAME:\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.NAME, g[1,2]) + compound$setField(BIODB.NAME, g[1,2]) } # CHEBI ID g <- str_match(s, "^CH\\$LINK: CHEBI\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.CHEBI.ID, g[1,2]) + compound$setField(BIODB.CHEBI.ID, g[1,2]) # KEGG ID g <- str_match(s, "^CH\\$LINK: KEGG\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.KEGG.ID, g[1,2]) + compound$setField(BIODB.KEGG.ID, g[1,2]) # PUBCHEM ID g <- str_match(s, "^CH\\$LINK: PUBCHEM\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.PUBCHEM.ID, g[1,2]) + compound$setField(BIODB.PUBCHEM.ID, g[1,2]) # INCHI g <- str_match(s, "^CH\\$IUPAC:\\s+(.+)$") if ( ! is.na(g[1,1])) - compound$setField(RBIODB.INCHI, g[1,2]) + compound$setField(BIODB.INCHI, g[1,2]) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.NAME))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.NAME))) NULL else x) return(compounds) }
--- a/MassbankConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MassbankConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,43 +1,69 @@ if ( ! exists('MassbankConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') + source('MassdbConn.R') source('MassbankSpectrum.R') ##################### # CLASS DECLARATION # ##################### - MassbankConn <- setRefClass("MassbankConn", contains = "BiodbConn") + MassbankConn <- setRefClass("MassbankConn", contains = c("RemotedbConn", "MassdbConn")) ########################## # GET ENTRY CONTENT TYPE # ########################## MassbankConn$methods( getEntryContentType = function(type) { - return(if (type == RBIODB.SPECTRUM) RBIODB.TXT else NULL) + return(if (type == BIODB.SPECTRUM) BIODB.TXT else NULL) }) ##################### # GET ENTRY CONTENT # ##################### - MassbankConn$methods( getEntryContent = function(type, id) { + MassbankConn$methods( getEntryContent = function(type, ids) { - if (type == RBIODB.SPECTRUM) { + # Debug + .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)...")) + + if (type == BIODB.SPECTRUM) { + + URL.MAX.LENGTH <- 2083 # Initialize return values - content <- rep(NA_character_, length(id)) + content <- rep(NA_character_, length(ids)) + + # Loop on all + n <- 0 + while (n < length(ids)) { - # Request - xmlstr <- .self$.scheduler$getUrl(get.entry.url(RBIODB.MASSBANK, id, RBIODB.TXT)) + # Get list of accession ids to retrieve + accessions <- ids[(n + 1):length(ids)] + + # Create URL request + x <- get.entry.url(class = BIODB.MASSBANK, accession = accessions, content.type = BIODB.TXT, max.length = URL.MAX.LENGTH) + + # Debug + .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)...")) - # Parse XML and get text - if ( ! is.na(xmlstr)) { - library(XML) - xml <- xmlInternalTreeParse(xmlstr, asText = TRUE) - ns <- c(ax21 = "http://api.massbank/xsd") - returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns) - content[match(returned.ids, id)] <- xpathSApply(xml, "//ax21:info", xmlValue, namespaces = ns) + # Send request + xmlstr <- .self$.scheduler$getUrl(x$url) + + # Increase number of entries retrieved + n <- n + x$n + + # Parse XML and get text + if ( ! is.na(xmlstr)) { + library(XML) + xml <- xmlInternalTreeParse(xmlstr, asText = TRUE) + ns <- c(ax21 = "http://api.massbank/xsd") + returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns) + content[match(returned.ids, ids)] <- xpathSApply(xml, "//ax21:info", xmlValue, namespaces = ns) + } + + # Debug + .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved...")) } return(content) @@ -54,6 +80,13 @@ # content A file content, downloaded from the public database. # RETURN A spectrum instance. MassbankConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.SPECTRUM) createMassbankSpectrumFromTxt(content, drop = drop) else NULL) + return(if (type == BIODB.SPECTRUM) createMassbankSpectrumFromTxt(content, drop = drop) else NULL) + }) + + ################# + # GET MZ VALUES # + ################# + + MassbankConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { }) }
--- a/MassbankSpectrum.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MassbankSpectrum.R Sat Sep 03 17:02:01 2016 -0400 @@ -21,13 +21,13 @@ # Define fields regex regex <- character() - regex[[RBIODB.ACCESSION]] <- "^ACCESSION: (.+)$" - regex[[RBIODB.MSDEV]] <- "^AC\\$INSTRUMENT: (.+)$" - regex[[RBIODB.MSDEVTYPE]] <- "^AC\\$INSTRUMENT_TYPE: (.+)$" - regex[[RBIODB.MSTYPE]] <- "^AC\\$MASS_SPECTROMETRY: MS_TYPE (.+)$" - regex[[RBIODB.MSPRECMZ]] <- "^MS\\$FOCUSED_ION: PRECURSOR_M/Z (.+)$" - regex[[RBIODB.NB.PEAKS]] <- "^PK\\$NUM_PEAK: ([0-9]+)$" - regex[[RBIODB.MSPRECANNOT]] <- "^MS\\$FOCUSED_ION: PRECURSOR_TYPE (.+)$" + regex[[BIODB.ACCESSION]] <- "^ACCESSION: (.+)$" + regex[[BIODB.MSDEV]] <- "^AC\\$INSTRUMENT: (.+)$" + regex[[BIODB.MSDEVTYPE]] <- "^AC\\$INSTRUMENT_TYPE: (.+)$" + regex[[BIODB.MSTYPE]] <- "^AC\\$MASS_SPECTROMETRY: MS_TYPE (.+)$" + regex[[BIODB.MSPRECMZ]] <- "^MS\\$FOCUSED_ION: PRECURSOR_M/Z (.+)$" + regex[[BIODB.NB.PEAKS]] <- "^PK\\$NUM_PEAK: ([0-9]+)$" + regex[[BIODB.MSPRECANNOT]] <- "^MS\\$FOCUSED_ION: PRECURSOR_TYPE (.+)$" for (text in contents) { @@ -54,7 +54,7 @@ # MS MODE g <- str_match(s, "^AC\\$MASS_SPECTROMETRY: ION_MODE (.+)$") if ( ! is.na(g[1,1])) { - spectrum$setField(RBIODB.MSMODE, if (g[1,2] == 'POSITIVE') RBIODB.MSMODE.POS else RBIODB.MSMODE.NEG) + spectrum$setField(BIODB.MSMODE, if (g[1,2] == 'POSITIVE') BIODB.MSMODE.POS else BIODB.MSMODE.NEG) next } @@ -67,13 +67,13 @@ } # Replace elements with no accession id by NULL - spectra <- lapply(spectra, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + spectra <- lapply(spectra, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # Set associated compounds compounds <- createMassbankCompoundFromTxt(contents) for (i in seq(spectra)) if ( ! is.null(spectra[[i]])) - spectra[[i]]$setField(RBIODB.COMPOUND, compounds[[i]]) + spectra[[i]]$setField(BIODB.COMPOUND, compounds[[i]]) # If the input was a single element, then output a single object if (drop && length(contents) == 1) @@ -88,26 +88,26 @@ .parse.peak.line <- function(spectrum, line) { - peaks <- RBIODB.PEAK.DF.EXAMPLE + peaks <- BIODB.PEAK.DF.EXAMPLE # Annotation g <- str_match(line, "^\\s+([0-9][0-9.]*) ([A-Z0-9+-]+) ([0-9]+) ([0-9][0-9.]*) ([0-9][0-9.]*)$") if ( ! is.na(g[1,1])) - peaks[1, c(RBIODB.PEAK.MZ, RBIODB.PEAK.FORMULA, RBIODB.PEAK.FORMULA.COUNT, RBIODB.PEAK.MASS, RBIODB.PEAK.ERROR.PPM)] <- list(as.double(g[1,2]), g[1,3], as.integer(g[1,4]), as.double(g[1,5]), as.double(g[1,6])) + peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)] <- list(as.double(g[1,2]), g[1,3], as.integer(g[1,4]), as.double(g[1,5]), as.double(g[1,6])) # Peak g <- str_match(line, "^\\s+([0-9][0-9.]*) ([0-9][0-9.]*) ([0-9]+)$") if ( ! is.na(g[1,1])) - peaks[1, c(RBIODB.PEAK.MZ, RBIODB.PEAK.INTENSITY, RBIODB.PEAK.RELATIVE.INTENSITY)] <- list(as.double(g[1,2]), as.double(g[1,3]), as.integer(g[1,4])) + peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY)] <- list(as.double(g[1,2]), as.double(g[1,3]), as.integer(g[1,4])) if (nrow(peaks) > 0) { # Get curent peaks and merge with new peaks - current.peaks <- spectrum$getField(RBIODB.PEAKS) + current.peaks <- spectrum$getField(BIODB.PEAKS) if ( ! is.null(current.peaks)) peaks <- rbind(current.peaks, peaks) - spectrum$setField(RBIODB.PEAKS, peaks) + spectrum$setField(BIODB.PEAKS, peaks) return(TRUE) }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MassdbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -0,0 +1,31 @@ +if ( ! exists('MassdbConn')) { + + source('BiodbConn.R') + + ##################### + # CLASS DECLARATION # + ##################### + + MassdbConn <- setRefClass("MassdbConn", contains = "BiodbConn") + + ############################### + # GET CHROMATOGRAPHIC COLUMNS # + ############################### + + # Get a list of chromatographic columns contained in this database. + # compound.ids A list of compound IDs used to filter results. + # The returned value is a data.frame with two columns : one for the ID (BIODB.ID) and another one for the title (BIODB.TITLE). + MassdbConn$methods( getChromCol = function(compound.ids = NULL) { + stop("Method getChromCol() is not implemented in concrete class.") + }) + + ################# + # GET MZ VALUES # + ################# + + # Returns a numeric vector of all masses stored inside the database. + MassdbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { + stop("Method getMzValues() not implemented in concrete class.") + }) + +}
--- a/MirbaseCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MirbaseCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -20,9 +20,9 @@ # Define fields regex xpath.expr <- character() - xpath.expr[[RBIODB.ACCESSION]] <- "//td[text()='Accession number']/../td[2]" - xpath.expr[[RBIODB.NAME]] <- "//td[text()='ID']/../td[2]" - xpath.expr[[RBIODB.SEQUENCE]] <- "//td[text()='Sequence']/..//pre" + xpath.expr[[BIODB.ACCESSION]] <- "//td[text()='Accession number']/../td[2]" + xpath.expr[[BIODB.NAME]] <- "//td[text()='ID']/../td[2]" + xpath.expr[[BIODB.SEQUENCE]] <- "//td[text()='Sequence']/..//pre" for (html in contents) { @@ -43,7 +43,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/MirbaseConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MirbaseConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('MirbaseConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('MirbaseCompound.R') ##################### # CLASS DECLARATION # ##################### - MirbaseConn <- setRefClass("MirbaseConn", contains = "BiodbConn") + MirbaseConn <- setRefClass("MirbaseConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## MirbaseConn$methods( getEntryContentType = function(type) { - return(RBIODB.HTML) + return(BIODB.HTML) }) ##################### @@ -23,13 +23,13 @@ MirbaseConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.MIRBASE, x, content.type = RBIODB.HTML)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.MIRBASE, x, content.type = BIODB.HTML)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ MirbaseConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createMirbaseCompoundFromHtml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createMirbaseCompoundFromHtml(content, drop = drop) else NULL) }) ###################
--- a/MsDbOutputDataFrameStream.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MsDbOutputDataFrameStream.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,7 +1,6 @@ if ( ! exists('MsDbOutputDataFrameStream')) { # Do not load again if already loaded library(methods) - library(plyr) source('MsDbOutputStream.R') source('dfhlp.R', chdir = TRUE) @@ -41,6 +40,8 @@ MsDbOutputDataFrameStream$methods( matchedPeaks = function(mz, rt = NULL, unused = NULL, peaks = NULL) { + library(plyr) + # Set input values x <- data.frame(mz = mz) if ( ! is.null(rt))
--- a/MsPeakForestDb.R Tue Jul 12 12:02:37 2016 -0400 +++ b/MsPeakForestDb.R Sat Sep 03 17:02:01 2016 -0400 @@ -8,21 +8,24 @@ # CLASS DECLARATION # ##################### - MsPeakForestDb <- setRefClass("MsPeakForestDb", contains = "MsDb", fields = list(.url = "character", .url.scheduler = "ANY")) + MsPeakForestDb <- setRefClass("MsPeakForestDb", contains = "MsDb", fields = list(.url = "character", .url.scheduler = "ANY", .token = "character")) ############### # CONSTRUCTOR # ############### - MsPeakForestDb$methods( initialize = function(url = NA_character_, useragent = NA_character_, ...) { + MsPeakForestDb$methods( initialize = function(url = NA_character_, useragent = NA_character_, token = NA_character_, ...) { # Check URL if (is.null(url) || is.na(url)) stop("No URL defined for new MsPeakForestDb instance.") + if (substring(url, nchar(url) - 1, 1) == '/') + url <- substring(url, nchar(url) - 1) .url <<- url .url.scheduler <<- UrlRequestScheduler$new(n = 3, useragent = useragent) .self$.url.scheduler$setVerbose(1L) + .token <<- token callSuper(...) }) @@ -35,6 +38,17 @@ res <- NULL + # Add url prefix + if (substring(url, 1, 1) == '/') + url <- substring(url, 2) + url <- paste(.self$.url, url, sep = '/') + + # Add token + if ( ! is.na(.self$.token)) + params <- c(params, token = .self$.token) + param.str <- if (is.null(params)) '' else paste('?', vapply(names(params), function(p) paste(p, params[[p]], sep = '='), FUN.VALUE = ''), collapse = '&', sep = '') + + # Get URL content <- .self$.url.scheduler$getUrl(url = url, params = params) if (ret.type == 'json') { @@ -67,7 +81,7 @@ MsPeakForestDb$methods( getMoleculeIds = function() { - ids <- as.character(.self$.get.url(url = paste0(.self$.url, 'compounds/all/ids'))) + ids <- as.character(.self$.get.url(url = 'compounds/all/ids')) return(ids) }) @@ -78,7 +92,7 @@ MsPeakForestDb$methods( getNbMolecules = function() { - n <- .self$.get.url(url = paste0(.self$.url, 'compounds/all/count'), ret.type = 'integer') + n <- .self$.get.url(url = 'compounds/all/count', ret.type = 'integer') return(n) }) @@ -90,13 +104,12 @@ MsPeakForestDb$methods( getChromCol = function(molid = NULL) { # Set URL - url <- paste0(.self$.url, 'metadata/lc/list-code-columns') params <- NULL if ( ! is.null(molid)) params <- list(molids = paste(molid, collapse = ',')) # Call webservice - wscols <- .self$.get.url(url = url, params = params) + wscols <- .self$.get.url(url = 'metadata/lc/list-code-columns', params = params) # Build data frame cols <- data.frame(id = character(), title = character()) @@ -118,13 +131,12 @@ rt <- list() # Set URL - url <- paste0(.self$.url, 'spectra/lcms/search') params <- NULL if ( ! is.null(molid)) params <- list(molids = paste(molid, collapse = ',')) # Call webservice - spectra <- .self$.get.url(url = url, params = params) + spectra <- .self$.get.url(url = 'spectra/lcms/search', params = params) if (class(spectra) == 'list' && length(spectra) > 0) { for (s in spectra) if (is.na(col) || s$liquidChromatography$columnCode %in% col) { @@ -160,11 +172,10 @@ if (length(non.na.molid) > 0) { # Set URL - url <- paste0(.self$.url, 'compounds/all/names') params <- c(molids = paste(non.na.molid, collapse = ',')) # Call webservice - names[ ! is.na(molid)] <- .self$.get.url(url = url, params = params) + names[ ! is.na(molid)] <- .self$.get.url(url = 'compounds/all/names', params = params) } return(names) @@ -187,8 +198,7 @@ ids <- c(ids, NA_character_) else { - url <- paste0(.self$.url, 'search/compounds/name/', curlEscape(n)) - compounds <- .self$.get.url(url = url)$compoundNames + compounds <- .self$.get.url(url = paste0('search/compounds/name/', curlEscape(n)))$compoundNames ids <- c(ids, list(vapply(compounds, function(c) as.character(c$compound$id), FUN.VALUE = ''))) } } @@ -203,7 +213,6 @@ MsPeakForestDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { # Build URL - url <- paste0(.self$.url, 'spectra/lcms/count-peaks') params <- NULL if ( ! is.na(type)) params <- c(params, mode = if (type == MSDB.TAG.POS) 'pos' else 'neg') @@ -211,7 +220,7 @@ params <- c(params, molids = paste(molid, collapse = ',')) # Run request - n <- .self$.get.url(url = url, params = params, ret.type = 'integer') + n <- .self$.get.url(url = 'spectra/lcms/count-peaks', params = params, ret.type = 'integer') return(sum(n)) }) @@ -222,16 +231,13 @@ MsPeakForestDb$methods( getMzValues = function(mode = NULL) { - # Build URL - url <- paste0(.self$.url, 'spectra/lcms/peaks/list-mz') - # Query params params <- NULL if ( ! is.null(mode)) params <- c(params, mode = if (mode == MSDB.TAG.POS) 'positive' else 'negative') # Get MZ valuels - mz <- .self$.get.url(url = url, params = params) + mz <- .self$.get.url(url = 'spectra/lcms/peaks/list-mz', params = params) return(mz) }) @@ -243,7 +249,7 @@ MsPeakForestDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { # Build URL for mz search - url <- paste0(.self$.url, 'spectra/lcms/peaks/get-range/', mz.low, '/', mz.high) + url <- paste0('spectra/lcms/peaks/get-range/', mz.low, '/', mz.high) # Get spectra spectra <- .self$.get.url(url = url) @@ -265,7 +271,7 @@ if (nrow(results) > 0) { # Build URL for rt search - url <- paste0(.self$.url, 'spectra/lcms/range-rt-min/', rt.low, '/', rt.high) + url <- paste0('spectra/lcms/range-rt-min/', rt.low, '/', rt.high) params <- NULL if ( ! is.null(col)) params <- c(columns = paste(col, collapse = ','))
--- a/NcbiCcdsCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/NcbiCcdsCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -27,15 +27,15 @@ xml <- htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE) if (length(getNodeSet(xml, "//*[starts-with(.,'No results found for CCDS ID ')]")) == 0) { - compound$setField(RBIODB.ACCESSION, xpathSApply(xml, "//input[@id='DATA']", xmlGetAttr, "value")) - compound$setField(RBIODB.SEQUENCE, xpathSApply(xml, "//b[starts-with(.,'Nucleotide Sequence')]/../tt", xmlValue)) + compound$setField(BIODB.ACCESSION, xpathSApply(xml, "//input[@id='DATA']", xmlGetAttr, "value")) + compound$setField(BIODB.SEQUENCE, xpathSApply(xml, "//b[starts-with(.,'Nucleotide Sequence')]/../tt", xmlValue)) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/NcbiCcdsConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/NcbiCcdsConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,13 +1,13 @@ if ( ! exists('NcbiccdsConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('NcbiccdsCompound.R') ##################### # CLASS DECLARATION # ##################### - NcbiccdsConn <- setRefClass("NcbiccdsConn", contains = "BiodbConn") + NcbiccdsConn <- setRefClass("NcbiccdsConn", contains = "RemotedbConn") ############### # CONSTRUCTOR # @@ -23,7 +23,7 @@ ########################## NcbiccdsConn$methods( getEntryContentType = function(type) { - return(RBIODB.HTML) + return(BIODB.HTML) }) ##################### @@ -32,13 +32,13 @@ NcbiccdsConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.NCBICCDS, x)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.NCBICCDS, x)), FUN.VALUE = '') return(content) } @@ -51,6 +51,6 @@ ################ NcbiccdsConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createNcbiccdsCompoundFromHtml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createNcbiccdsCompoundFromHtml(content, drop = drop) else NULL) }) }
--- a/NcbiGeneCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/NcbiGeneCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -21,13 +21,13 @@ # Define xpath expressions xpath.expr <- character() - xpath.expr[[RBIODB.ACCESSION]] <- "//Gene-track_geneid" - xpath.expr[[RBIODB.KEGG.ID]] <- "/Dbtag_db[text()='KEGG']/..//Object-id_str" - xpath.expr[[RBIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str" - xpath.expr[[RBIODB.LOCATION]] <- "//Gene-ref_maploc" - xpath.expr[[RBIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc" - xpath.expr[[RBIODB.SYMBOL]] <- "//Gene-ref_locus" - xpath.expr[[RBIODB.SYNONYMS]] <- "//Gene-ref_syn_E" + xpath.expr[[BIODB.ACCESSION]] <- "//Gene-track_geneid" + xpath.expr[[BIODB.KEGG.ID]] <- "/Dbtag_db[text()='KEGG']/..//Object-id_str" + xpath.expr[[BIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str" + xpath.expr[[BIODB.LOCATION]] <- "//Gene-ref_maploc" + xpath.expr[[BIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc" + xpath.expr[[BIODB.SYMBOL]] <- "//Gene-ref_locus" + xpath.expr[[BIODB.SYNONYMS]] <- "//Gene-ref_syn_E" for (content in contents) { @@ -56,14 +56,14 @@ # CCDS ID ccdsid <- .find.ccds.id(xml) if ( ! is.na(ccdsid)) - compound$setField(RBIODB.NCBI.CCDS.ID, ccdsid) + compound$setField(BIODB.NCBI.CCDS.ID, ccdsid) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/NcbiGeneConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/NcbiGeneConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,13 +1,13 @@ if ( ! exists('NcbigeneConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('NcbigeneCompound.R') ##################### # CLASS DECLARATION # ##################### - NcbigeneConn <- setRefClass("NcbigeneConn", contains = "BiodbConn") + NcbigeneConn <- setRefClass("NcbigeneConn", contains = "RemotedbConn") ############### # CONSTRUCTOR # @@ -23,7 +23,7 @@ ########################## NcbigeneConn$methods( getEntryContentType = function(type) { - return(RBIODB.XML) + return(BIODB.XML) }) ##################### @@ -32,13 +32,13 @@ NcbigeneConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.NCBIGENE, x)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.NCBIGENE, x)), FUN.VALUE = '') return(content) } @@ -51,6 +51,6 @@ ################ NcbigeneConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createNcbigeneCompoundFromXml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createNcbigeneCompoundFromXml(content, drop = drop) else NULL) }) }
--- a/PubchemCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/PubchemCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -23,9 +23,9 @@ # Define xpath expressions xpath.expr <- character() - xpath.expr[[RBIODB.ACCESSION]] <- "//pubchem:RecordType[text()='CID']/../pubchem:RecordNumber" - xpath.expr[[RBIODB.INCHI]] <- "//pubchem:Name[text()='InChI']/../pubchem:StringValue" - xpath.expr[[RBIODB.INCHIKEY]] <- "//pubchem:Name[text()='InChI Key']/../pubchem:StringValue" + xpath.expr[[BIODB.ACCESSION]] <- "//pubchem:RecordType[text()='CID']/../pubchem:RecordNumber" + xpath.expr[[BIODB.INCHI]] <- "//pubchem:Name[text()='InChI']/../pubchem:StringValue" + xpath.expr[[BIODB.INCHIKEY]] <- "//pubchem:Name[text()='InChI Key']/../pubchem:StringValue" for (content in contents) { @@ -52,7 +52,7 @@ if (is.na(name)) tryCatch( { name <- xpathSApply(xml, "//pubchem:Name[text()='Record Title']/../pubchem:StringValue", xmlValue, namespaces = ns) }, warning = function(w) {}) if ( ! is.na(name)) - compound$setField(RBIODB.NAME, name) + compound$setField(BIODB.NAME, name) } @@ -60,7 +60,7 @@ } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/PubchemConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/PubchemConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('get.pubchem.compound.url')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('PubchemCompound.R') ##################### # CLASS DECLARATION # ##################### - PubchemConn <- setRefClass("PubchemConn", contains = "BiodbConn") + PubchemConn <- setRefClass("PubchemConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## PubchemConn$methods( getEntryContentType = function(type) { - return(RBIODB.XML) + return(BIODB.XML) }) ##################### @@ -23,13 +23,13 @@ PubchemConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.PUBCHEM, x, content.type = RBIODB.XML)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.PUBCHEM, x, content.type = BIODB.XML)), FUN.VALUE = '') return(content) } @@ -42,7 +42,7 @@ ################ PubchemConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createPubchemCompoundFromXml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createPubchemCompoundFromXml(content, drop = drop) else NULL) }) #########################
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RemotedbConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -0,0 +1,31 @@ +if ( ! exists('RemotedbConn')) { + + source('BiodbConn.R') + source(file.path('UrlRequestScheduler.R'), chdir = TRUE) + + ##################### + # CLASS DECLARATION # + ##################### + + RemotedbConn <- setRefClass("RemotedbConn", contains = "BiodbConn", fields = list(.scheduler = "UrlRequestScheduler")) + + ############### + # CONSTRUCTOR # + ############### + + RemotedbConn$methods( initialize = function(useragent = NA_character_, scheduler = NULL, ...) { + + # Check useragent + ( ! is.null(useragent) && ! is.na(useragent)) || stop("You must specify a valid useragent string (e.g.: \"myapp ; my.email@address\").") + + # Set scheduler + if (is.null(scheduler)) + scheduler <- UrlRequestScheduler$new(n = 3) + inherits(scheduler, "UrlRequestScheduler") || stop("The scheduler instance must inherit from UrlRequestScheduler class.") + scheduler$setUserAgent(useragent) # set agent + .scheduler <<- scheduler + + callSuper(...) # calls super-class initializer with remaining parameters + }) + +}
--- a/UniProtCompound.R Tue Jul 12 12:02:37 2016 -0400 +++ b/UniProtCompound.R Sat Sep 03 17:02:01 2016 -0400 @@ -23,17 +23,17 @@ # Define xpath expressions xpath.values <- character() - xpath.values[[RBIODB.NAME]] <- "/uniprot:uniprot/uniprot:compound/uniprot:name" - xpath.values[[RBIODB.GENE.SYMBOLS]] <- "//uniprot:gene/uniprot:name" - xpath.values[[RBIODB.FULLNAMES]] <- "//uniprot:protein//uniprot:fullName" - xpath.values[[RBIODB.SEQUENCE]] <- "//uniprot:entry/uniprot:sequence" - xpath.values[[RBIODB.ACCESSION]] <- "//uniprot:accession[1]" + xpath.values[[BIODB.NAME]] <- "/uniprot:uniprot/uniprot:compound/uniprot:name" + xpath.values[[BIODB.GENE.SYMBOLS]] <- "//uniprot:gene/uniprot:name" + xpath.values[[BIODB.FULLNAMES]] <- "//uniprot:protein//uniprot:fullName" + xpath.values[[BIODB.SEQUENCE]] <- "//uniprot:entry/uniprot:sequence" + xpath.values[[BIODB.ACCESSION]] <- "//uniprot:accession[1]" xpath.attr <- list() - xpath.attr[[RBIODB.KEGG.ID]] <- list(path = "//uniprot:dbReference[@type='KEGG']", attr = 'id') - xpath.attr[[RBIODB.NCBI.GENE.ID]] <- list(path = "//uniprot:dbReference[@type='GeneID']", attr = 'id') - xpath.attr[[RBIODB.ENZYME.ID]] <- list(path = "//uniprot:dbReference[@type='EC']", attr = 'id') - xpath.attr[[RBIODB.MASS]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'mass') - xpath.attr[[RBIODB.LENGTH]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'length') + xpath.attr[[BIODB.KEGG.ID]] <- list(path = "//uniprot:dbReference[@type='KEGG']", attr = 'id') + xpath.attr[[BIODB.NCBI.GENE.ID]] <- list(path = "//uniprot:dbReference[@type='GeneID']", attr = 'id') + xpath.attr[[BIODB.ENZYME.ID]] <- list(path = "//uniprot:dbReference[@type='EC']", attr = 'id') + xpath.attr[[BIODB.MASS]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'mass') + xpath.attr[[BIODB.LENGTH]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'length') for (content in contents) { @@ -61,16 +61,16 @@ } # Remove new lines from sequence string - seq <- compound$getField(RBIODB.SEQUENCE) + seq <- compound$getField(BIODB.SEQUENCE) if ( ! is.na(seq)) - compound$setField(RBIODB.SEQUENCE, gsub("\\n", "", seq)) + compound$setField(BIODB.SEQUENCE, gsub("\\n", "", seq)) } compounds <- c(compounds, compound) } # Replace elements with no accession id by NULL - compounds <- lapply(compounds, function(x) if (is.na(x$getField(RBIODB.ACCESSION))) NULL else x) + compounds <- lapply(compounds, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) # If the input was a single element, then output a single object if (drop && length(contents) == 1)
--- a/UniProtConn.R Tue Jul 12 12:02:37 2016 -0400 +++ b/UniProtConn.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,20 +1,20 @@ if ( ! exists('UniprotConn')) { # Do not load again if already loaded - source('BiodbConn.R') + source('RemotedbConn.R') source('UniprotCompound.R') ##################### # CLASS DECLARATION # ##################### - UniprotConn <- setRefClass("UniprotConn", contains = "BiodbConn") + UniprotConn <- setRefClass("UniprotConn", contains = "RemotedbConn") ########################## # GET ENTRY CONTENT TYPE # ########################## UniprotConn$methods( getEntryContentType = function(type) { - return(RBIODB.XML) + return(BIODB.XML) }) ##################### @@ -23,13 +23,13 @@ UniprotConn$methods( getEntryContent = function(type, id) { - if (type == RBIODB.COMPOUND) { + if (type == BIODB.COMPOUND) { # Initialize return values content <- rep(NA_character_, length(id)) # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(RBIODB.UNIPROT, x, content.type = RBIODB.XML)), FUN.VALUE = '') + content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.UNIPROT, x, content.type = BIODB.XML)), FUN.VALUE = '') return(content) } @@ -42,6 +42,6 @@ ################ UniprotConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == RBIODB.COMPOUND) createUniprotCompoundFromXml(content, drop = drop) else NULL) + return(if (type == BIODB.COMPOUND) createUniprotCompoundFromXml(content, drop = drop) else NULL) }) }
--- a/biodb-common.R Tue Jul 12 12:02:37 2016 -0400 +++ b/biodb-common.R Sat Sep 03 17:02:01 2016 -0400 @@ -1,154 +1,189 @@ -if ( ! exists('RBIODB.COMPOUND')) { # Do not load again if already loaded +if ( ! exists('BIODB.COMPOUND')) { # Do not load again if already loaded + + ############### + # ENTRY TYPES # + ############### + + BIODB.COMPOUND <- 'compound' + BIODB.SPECTRUM <- 'spectrum' + + ####################### + # ENTRY CONTENT TYPES # + ####################### + + BIODB.HTML <- 'html' + BIODB.TXT <- 'txt' + BIODB.XML <- 'xml' + BIODB.CSV <- 'csv' + BIODB.DATAFRAME <- 'dataframe' + BIODB.ANY <- 'any' # Value used when we do not care about the type. ############# - # CONSTANTS # + # DATABASES # ############# - - # Entry types - RBIODB.COMPOUND <- 'compound' - RBIODB.SPECTRUM <- 'spectrum' - - # Entry content types - RBIODB.HTML <- 'html' - RBIODB.TXT <- 'txt' - RBIODB.XML <- 'xml' - RBIODB.CSV <- 'csv' - RBIODB.ANY <- 'any' - # Class names - RBIODB.CHEBI <- 'chebi' - RBIODB.KEGG <- 'kegg' - RBIODB.PUBCHEM <- 'pubchem' - RBIODB.HMDB <- 'hmdb' - RBIODB.CHEMSPIDER <- 'chemspider' - RBIODB.ENZYME <- 'enzyme' - RBIODB.LIPIDMAPS <- 'lipidmaps' - RBIODB.MIRBASE <- 'mirbase' - RBIODB.NCBIGENE <- 'ncbigene' - RBIODB.NCBICCDS <- 'ncbiccds' - RBIODB.UNIPROT <- 'uniprot' - RBIODB.MASSBANK <- 'massbank' + BIODB.CHEBI <- 'chebi' + BIODB.KEGG <- 'kegg' + BIODB.PUBCHEM <- 'pubchem' + BIODB.HMDB <- 'hmdb' + BIODB.CHEMSPIDER <- 'chemspider' + BIODB.ENZYME <- 'enzyme' + BIODB.LIPIDMAPS <- 'lipidmaps' + BIODB.MIRBASE <- 'mirbase' + BIODB.NCBIGENE <- 'ncbigene' + BIODB.NCBICCDS <- 'ncbiccds' + BIODB.UNIPROT <- 'uniprot' + BIODB.MASSBANK <- 'massbank' + BIODB.MASSFILEDB <- 'massfiledb' + + ########## + # FIELDS # + ########## - # Fields - RBIODB.COMPOUND <- 'compound' - RBIODB.ACCESSION <- 'accession' - RBIODB.DESCRIPTION <- 'description' - RBIODB.PROTEIN.DESCRIPTION <- 'protdesc' - RBIODB.NAME <- 'name' - RBIODB.FULLNAMES <- 'fullnames' - RBIODB.SYNONYMS <- 'synonyms' - RBIODB.SYMBOL <- 'symbol' - RBIODB.GENE.SYMBOLS <- 'genesymbols' - RBIODB.CHEBI.ID <- 'chebiid' - RBIODB.LIPIDMAPS.ID <- 'lipidmapsid' - RBIODB.KEGG.ID <- 'keggid' - RBIODB.HMDB.ID <- 'hmdbid' - RBIODB.ENZYME.ID <- 'enzymeid' - RBIODB.NCBI.CCDS.ID <- 'ncbiccdsid' - RBIODB.NCBI.GENE.ID <- 'ncbigeneid' - RBIODB.PUBCHEM.ID <- 'pubchemid' - RBIODB.UNIPROT.ID <- 'uniprotid' - RBIODB.INCHI <- 'inchi' - RBIODB.INCHIKEY <- 'inchikey' - RBIODB.MSDEV <- 'msdev' - RBIODB.MSDEVTYPE <- 'msdevtype' - RBIODB.MSTYPE <- 'mstype' - RBIODB.MSMODE <- 'msmode' - RBIODB.MSPRECMZ <- 'msprecmz' # numeric - RBIODB.MSPRECANNOT <- 'msprecannot' - RBIODB.FORMULA <- 'formula' - RBIODB.SUPER.CLASS <- 'superclass' - RBIODB.MASS <- 'mass' - RBIODB.AVERAGE.MASS <- 'averagemass' - RBIODB.MONOISOTOPIC.MASS <- 'monoisotopicmass' - RBIODB.SEQUENCE <- 'sequence' - RBIODB.LOCATION <- 'location' - RBIODB.LENGTH <- 'length' - RBIODB.NB.PEAKS <- 'nbpeaks' - RBIODB.NB.PEAKS <- 'nbpeaks' - RBIODB.PEAKS <- 'peaks' + BIODB.ACCESSION <- 'accession' + BIODB.DESCRIPTION <- 'description' + BIODB.PROTEIN.DESCRIPTION <- 'protdesc' + BIODB.NAME <- 'name' + BIODB.FULLNAMES <- 'fullnames' + BIODB.SYNONYMS <- 'synonyms' + BIODB.SYMBOL <- 'symbol' + BIODB.GENE.SYMBOLS <- 'genesymbols' + BIODB.CHEBI.ID <- 'chebiid' + BIODB.LIPIDMAPS.ID <- 'lipidmapsid' + BIODB.KEGG.ID <- 'keggid' + BIODB.HMDB.ID <- 'hmdbid' + BIODB.ENZYME.ID <- 'enzymeid' + BIODB.NCBI.CCDS.ID <- 'ncbiccdsid' + BIODB.NCBI.GENE.ID <- 'ncbigeneid' + BIODB.PUBCHEM.ID <- 'pubchemid' + BIODB.UNIPROT.ID <- 'uniprotid' + BIODB.INCHI <- 'inchi' + BIODB.INCHIKEY <- 'inchikey' + BIODB.MSDEV <- 'msdev' + BIODB.MSDEVTYPE <- 'msdevtype' + BIODB.MSTYPE <- 'mstype' + BIODB.MSMODE <- 'msmode' + BIODB.MSPRECMZ <- 'msprecmz' # numeric + BIODB.MSPRECANNOT <- 'msprecannot' + BIODB.FORMULA <- 'formula' + BIODB.SUPER.CLASS <- 'superclass' + BIODB.MASS <- 'mass' + BIODB.AVERAGE.MASS <- 'averagemass' + BIODB.MONOISOTOPIC.MASS <- 'monoisotopicmass' + BIODB.SEQUENCE <- 'sequence' + BIODB.LOCATION <- 'location' + BIODB.LENGTH <- 'length' + BIODB.NB.PEAKS <- 'nbpeaks' + BIODB.PEAKS <- 'peaks' + BIODB.COMPOUND.ID <- 'compoundid' + BIODB.PEAK.MZ <- 'peakmz' + BIODB.PEAK.COMP <- 'peakcomp' # Peak composition + BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution + BIODB.CHROM.COL <- 'chromcol' # Chromatographic column + BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column + BIODB.ID <- 'id' + BIODB.TITLE <- 'title' # Mode values - RBIODB.MSMODE.NEG <- 'neg' - RBIODB.MSMODE.POS <- 'pos' + BIODB.MSMODE.NEG <- 'neg' + BIODB.MSMODE.POS <- 'pos' + + ################# + # CARDINALITIES # + ################# - # Cardinalities - RBIODB.CARD.ONE <- '1' - RBIODB.CARD.MANY <- '*' + BIODB.CARD.ONE <- '1' + BIODB.CARD.MANY <- '*' - # Field attributes - RBIODB.FIELDS <- data.frame(matrix(c( + ########################## + # ENTRY FIELD ATTRIBUTES # + ########################## + + BIODB.FIELDS <- data.frame(matrix(c( # FIELD NAME CLASS CARDINALITY - RBIODB.COMPOUND, 'BiodEntry', RBIODB.CARD.ONE, - RBIODB.ACCESSION, 'character', RBIODB.CARD.ONE, - RBIODB.DESCRIPTION, 'character', RBIODB.CARD.ONE, - RBIODB.NAME, 'character', RBIODB.CARD.ONE, - RBIODB.FULLNAMES, 'character', RBIODB.CARD.MANY, - RBIODB.SYNONYMS, 'character', RBIODB.CARD.MANY, - RBIODB.PROTEIN.DESCRIPTION, 'character', RBIODB.CARD.ONE, - RBIODB.SYMBOL, 'character', RBIODB.CARD.ONE, - RBIODB.GENE.SYMBOLS, 'character', RBIODB.CARD.MANY, - RBIODB.CHEBI.ID, 'character', RBIODB.CARD.ONE, - RBIODB.LIPIDMAPS.ID, 'character', RBIODB.CARD.ONE, - RBIODB.KEGG.ID, 'character', RBIODB.CARD.ONE, - RBIODB.HMDB.ID, 'character', RBIODB.CARD.ONE, - RBIODB.ENZYME.ID, 'character', RBIODB.CARD.ONE, - RBIODB.PUBCHEM.ID, 'character', RBIODB.CARD.ONE, - RBIODB.UNIPROT.ID, 'character', RBIODB.CARD.ONE, - RBIODB.NCBI.CCDS.ID, 'character', RBIODB.CARD.ONE, - RBIODB.NCBI.GENE.ID, 'character', RBIODB.CARD.ONE, - RBIODB.INCHI, 'character', RBIODB.CARD.ONE, - RBIODB.INCHIKEY, 'character', RBIODB.CARD.ONE, - RBIODB.MSDEV, 'character', RBIODB.CARD.ONE, - RBIODB.MSDEVTYPE, 'character', RBIODB.CARD.ONE, - RBIODB.MSTYPE, 'character', RBIODB.CARD.ONE, - RBIODB.MSMODE, 'character', RBIODB.CARD.ONE, - RBIODB.MSPRECMZ, 'double', RBIODB.CARD.ONE, - RBIODB.MSPRECANNOT, 'character', RBIODB.CARD.ONE, - RBIODB.FORMULA, 'character', RBIODB.CARD.ONE, - RBIODB.SUPER.CLASS, 'character', RBIODB.CARD.ONE, - RBIODB.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.AVERAGE.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.MONOISOTOPIC.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.SEQUENCE, 'character', RBIODB.CARD.ONE, - RBIODB.LENGTH, 'integer', RBIODB.CARD.ONE, - RBIODB.LOCATION, 'character', RBIODB.CARD.ONE, - RBIODB.NB.PEAKS, 'integer', RBIODB.CARD.ONE, - RBIODB.PEAKS, 'data.frame', RBIODB.CARD.ONE + BIODB.COMPOUND, 'BiodEntry', BIODB.CARD.ONE, + BIODB.ACCESSION, 'character', BIODB.CARD.ONE, + BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE, + BIODB.NAME, 'character', BIODB.CARD.ONE, + BIODB.FULLNAMES, 'character', BIODB.CARD.MANY, + BIODB.SYNONYMS, 'character', BIODB.CARD.MANY, + BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE, + BIODB.SYMBOL, 'character', BIODB.CARD.ONE, + BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY, + BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE, + BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE, + BIODB.KEGG.ID, 'character', BIODB.CARD.ONE, + BIODB.HMDB.ID, 'character', BIODB.CARD.ONE, + BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE, + BIODB.PUBCHEM.ID, 'character', BIODB.CARD.ONE, + BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE, + BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE, + BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE, + BIODB.INCHI, 'character', BIODB.CARD.ONE, + BIODB.INCHIKEY, 'character', BIODB.CARD.ONE, + BIODB.MSDEV, 'character', BIODB.CARD.ONE, + BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE, + BIODB.MSTYPE, 'character', BIODB.CARD.ONE, + BIODB.MSMODE, 'character', BIODB.CARD.ONE, + BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE, + BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE, + BIODB.FORMULA, 'character', BIODB.CARD.ONE, + BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE, + BIODB.MASS, 'double', BIODB.CARD.ONE, + BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE, + BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE, + BIODB.SEQUENCE, 'character', BIODB.CARD.ONE, + BIODB.LENGTH, 'integer', BIODB.CARD.ONE, + BIODB.LOCATION, 'character', BIODB.CARD.ONE, + BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE, + BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE ), byrow = TRUE, ncol = 3), stringsAsFactors = FALSE) - colnames(RBIODB.FIELDS) <- c('name', 'class', 'cardinality') + colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality') - # How to compute a missing field ? - RBIODB.FIELD.COMPUTING <- list() - RBIODB.FIELD.COMPUTING[[RBIODB.INCHI]] <- c(RBIODB.CHEBI) - RBIODB.FIELD.COMPUTING[[RBIODB.INCHIKEY]] <- c(RBIODB.CHEBI) - RBIODB.FIELD.COMPUTING[[RBIODB.SEQUENCE]] <- c(RBIODB.NCBICCDS) + ##################### + # COMPUTABLE FIELDS # + ##################### + + BIODB.FIELD.COMPUTING <- list() + BIODB.FIELD.COMPUTING[[BIODB.INCHI]] <- c(BIODB.CHEBI) + BIODB.FIELD.COMPUTING[[BIODB.INCHIKEY]] <- c(BIODB.CHEBI) + BIODB.FIELD.COMPUTING[[BIODB.SEQUENCE]] <- c(BIODB.NCBICCDS) - # Peaks data frame columns - RBIODB.PEAK.MZ <- 'mz' - RBIODB.PEAK.FORMULA <- 'formula' - RBIODB.PEAK.FORMULA.COUNT <- 'formula.count' - RBIODB.PEAK.MASS <- 'mass' - RBIODB.PEAK.ERROR.PPM <- 'error.ppm' - RBIODB.PEAK.INTENSITY <- 'intensity' - RBIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity' - RBIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) - colnames(RBIODB.PEAK.DF.EXAMPLE) <- c(RBIODB.PEAK.MZ, RBIODB.PEAK.INTENSITY, RBIODB.PEAK.RELATIVE.INTENSITY, RBIODB.PEAK.FORMULA, RBIODB.PEAK.FORMULA.COUNT, RBIODB.PEAK.MASS, RBIODB.PEAK.ERROR.PPM) + #################### + # PEAKS DATA FRAME # + #################### + + # Columns + BIODB.PEAK.MZ <- 'mz' + BIODB.PEAK.FORMULA <- 'formula' + BIODB.PEAK.FORMULA.COUNT <- 'formula.count' + BIODB.PEAK.MASS <- 'mass' + BIODB.PEAK.ERROR.PPM <- 'error.ppm' + BIODB.PEAK.INTENSITY <- 'intensity' + BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity' + + # Example + BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) + colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM) ################# # GET ENTRY URL # ################# # TODO Let the choice to use either jp or eu - RBIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/getRecordInfo" - RBIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/getRecordInfo" + BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/getRecordInfo" + BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/getRecordInfo" - get.entry.url <- function(class, accession, content.type = RBIODB.ANY) { + .do.get.entry.url <- function(class, accession, content.type = BIODB.ANY) { + + # TODO Only Massbank can handle multiple accession ids + if (class != 'massbank' && length(accession) > 1) + stop(paste0("Cannot build a URL for getting multiple entries for class ", class, ".")) url <- switch(class, - chebi = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, - chemspider = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html') else NULL, - enzyme = if (content.type %in% c(RBIODB.ANY, RBIODB.TXT)) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, + chebi = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, + chemspider = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html') else NULL, + enzyme = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, hmdb = switch(content.type, xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'), html = paste0('http://www.hmdb.ca/metabolites/', accession), @@ -159,9 +194,9 @@ html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), any = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), NULL), - lipidmaps = if (content.type %in% c(RBIODB.ANY, RBIODB.CSV)) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, - massbank = if (content.type %in% c(RBIODB.ANY, RBIODB.TXT)) paste0(RBIODB.MASSBANK.EU.WS.URL, '?ids=', paste(accession, collapse = ',')) else NULL, - mirbase = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, + lipidmaps = if (content.type %in% c(BIODB.ANY, BIODB.CSV)) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, + massbank = if (content.type %in% c(BIODB.ANY, BIODB.TXT)) paste0(BIODB.MASSBANK.EU.WS.URL, '?ids=', paste(accession, collapse = ',')) else NULL, + mirbase = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, pubchem = { accession <- gsub(' ', '', accession, perl = TRUE) accession <- gsub('^CID', '', accession, perl = TRUE) @@ -170,12 +205,49 @@ html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession), NULL) }, - ncbigene = if (content.type %in% c(RBIODB.ANY, RBIODB.XML)) paste0('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, - ncbiccds = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), - uniprot = if (content.type %in% c(RBIODB.ANY, RBIODB.XML)) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), + ncbigene = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, + ncbiccds = if (content.type %in% c(BIODB.ANY, BIODB.HTML)) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), + uniprot = if (content.type %in% c(BIODB.ANY, BIODB.XML)) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), NULL ) return(url) } + + get.entry.url <- function(class, accession, content.type = BIODB.ANY, max.length = 0) { + + if (length(accession) == 0) + return(NULL) + + full.url <- .do.get.entry.url(class, accession, content.type = content.type) + if (max.length == 0 || nchar(full.url) <= max.length) + return(if (max.length == 0) full.url else list(url = full.url, n = length(accession))) + + # Find max size URL + a <- 1 + b <- length(accession) + while (a < b) { + m <- as.integer((a + b) / 2) + url <- .do.get.entry.url(class, accession[1:m], content.type = content.type) + if (nchar(url) <= max.length && m != a) + a <- m + else + b <- m + } + url <- .do.get.entry.url(class, accession[1:a], content.type = content.type) + + return(list( url = url, n = a)) + } + + ################# + # PRINT MESSAGE # + ################# + + BIODB.DEBUG <- 1 + BIODB.LEVEL.NAMES <- c('DEBUG') + + .print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) { + cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr()) + } + }
--- a/lcmsmatching.xml Tue Jul 12 12:02:37 2016 -0400 +++ b/lcmsmatching.xml Sat Sep 03 17:02:01 2016 -0400 @@ -1,6 +1,6 @@ -<tool id="lcmsmatching" name="LC/MS matching" version="2.1.3"> +<tool id="lcmsmatching" name="LC/MS matching" version="3.0"> - <description>Matching of mz/rt values onto local reference compound database.</description> + <description>Annotation of MS peaks using matching on a spectra database.</description> <requirements> <requirement type="package" version="3.2.2">R</requirement> @@ -8,17 +8,33 @@ <requirement type="package" version="1.0.0">r-stringr</requirement> <requirement type="package" version="1.8.3">r-plyr</requirement> <requirement type="package" version="3.98">r-xml</requirement> + <requirement type="package" version="1.0_6">r-bitops</requirement> + <requirement type="package" version="1.95">r-rcurl</requirement> + <requirement type="package" version="1.3">r-rjsonio</requirement> </requirements> + <code file="list-chrom-cols.py"/> + <!--~~~~~~~ ~ COMMAND ~ ~~~~~~~~--> - <command><![CDATA[ + <command> + <![CDATA[ + ## @@@BEGIN_CHEETAH@@@ $__tool_directory__/search-mz -i "$mzrtinput" ## Database - -d file --url "$dbfile" + #if $db.dbtype == "inhouse" + -d file + --db-fields "$db.dbfields" + --db-ms-modes "$db.dbmsmodes" + #end if + #if $db.dbtype == "peakforest" + -d peakforest + --db-token "$db.dbtoken" + #end if + --url "$db.dburl" ## M/Z matching -m $mzmode -p $mzprec -s $mzshift @@ -27,16 +43,13 @@ #if $prec.match == "true" --precursor-match --pos-prec "$prec.pos" --neg-prec "$prec.neg" #end if - #if $prec.match == "true" and $rt.match == "true" - --precursor-rt-tol $rt.tolz + #if $prec.match == "true" and $chromcols: + --precursor-rt-tol $tolz #end if ## Chromatographic columns options and retention matching - #if $rt.match == "true" and $rt.cols.all == "true" - --all-cols --rttolx $rt.tolx --rttoly $rt.toly - #end if - #if $rt.match == "true" and $rt.cols.all == "false" and $rt.cols.list != "" - -c "$rt.cols.list" --check-cols --rttolx $rt.tolx --rttoly $rt.toly + #if $chromcols: + -c "$chromcols" --check-cols --rttolx $tolx --rttoly $toly #end if ## Table outputs @@ -45,10 +58,8 @@ ## HTML output --html-output-file "$htmloutput" --no-main-table-in-html-output - ## Fields + ## Fields of input file --input-col-names "$inputfields" - --db-fields "$dbfields" - --db-ms-modes "$dbmsmodes" ## Ouput setting #if $out.enabled == "true" @@ -57,7 +68,7 @@ #else --molids-sep "|" #end if - + ## @@@END_CHEETAH@@@ ]]></command> <!--~~~~~~ @@ -68,73 +79,64 @@ <!-- DATABASE --> - <!-- Database file --> - <param name="dbfile" label="Database file" type="data" format="tabular" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. Retention time values must be in seconds."/> + <conditional name="db"> + + <param name="dbtype" label="Database" type="select" refresh_on_change="true"> + <option value="inhouse">In-house</option> + <option value="peakforest">Peakforest</option> + </param> + + <when value="inhouse"> + <!-- Database file --> + <param name="dburl" label="Database file" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. Retention time values must be in seconds."/> + + <!-- File database field names --> + <param name="dbfields" label="File database column names" type="text" size="256" value="mztheo=mztheo,colrt=colrt,molid=molid,col=col,mode=mode,attr=attr,comp=comp,molnames=molnames,molcomp=molcomp,molmass=molmass,inchi=inchi,inchikey=inchikey,pubchem=pubchem,chebi=chebi,hmdb=hmdb,kegg=kegg" refresh_on_change="true" help=""/> - <!-- File database field names --> - <param name="dbfields" label="File database column names" type="text" size="256" value="mztheo=mztheo,colrt=colrt,molid=molid,col=col,mode=mode,attr=attr,comp=comp,molnames=molnames,molcomp=molcomp,molmass=molmass,inchi=inchi,inchikey=inchikey,pubchem=pubchem,chebi=chebi,hmdb=hmdb,kegg=kegg" help=""/> + <!-- File database MS modes --> + <param name="dbmsmodes" label="File database MS modes" type="text" size="32" value="pos=POS,neg=NEG" help=""/> + + <param name="dbtoken" type="text" size="32" value="" hidden="true"/> + </when> - <!-- File database MS modes --> - <param name="dbmsmodes" label="File database MS modes" type="text" size="32" value="pos=POS,neg=NEG" help=""/> + <when value="peakforest"> + <param name="dburl" type="text" size="128" value="https://peakforest-alpha.inra.fr/rest" refresh_on_change="true"/> + + <param name="dbtoken" label="Peakforest security token" type="text" size="32" value="" refresh_on_change="true" help="If you do not have yet a Peakforest token, go to Peakforest website and request one from your account."/> + + <param name="dbfields" type="text" size="32" value="" hidden="true"/> + </when> + </conditional> <!-- INPUT --> <!-- Input file --> - <param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values must be in seconds."/> + <param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values must be in seconds."/> <!-- Input field names --> <param name="inputfields" label="Input file column names" type="text" size="32" value="mz=mzmed,rt=rtmed" help=""/> <!-- M/Z MATCHING --> -<!-- <conditional name="mz"> - <param name="enabled" label="M/Z matching" type="select"> - <option value="true">Show</option> - <option value="false">Hide</option> + <!-- Mode --> + <param name="mzmode" label="MS mode" type="select" display="radio" multiple="false" help=""> + <option value="pos">Positive</option> + <option value="neg">Negative</option> </param> - <when value="true">--> - <!-- Mode --> - <param name="mzmode" label="MS mode" type="select" display="radio" multiple="false" help=""> - <option value="pos">Positive</option> - <option value="neg">Negative</option> - </param> - - <!-- MZ matching parameters --> - <param name="mzprec" label="M/Z precision (in ppm)" type="float" help="" value="5"/> - <param name="mzshift" label="M/Z shift (in ppm)" type="float" help="" value="0"/> - <!--</when> - <when value="false"></when> - </conditional>--> + <!-- MZ matching parameters --> + <param name="mzprec" label="M/Z precision (in ppm)" type="float" help="" value="5"/> + <param name="mzshift" label="M/Z shift (in ppm)" type="float" help="" value="0"/> <!-- RETENTION TIME PARAMETERS --> - <conditional name="rt"> - <param name="match" label="Retention time match" type="select"> - <option value="false">Off</option> - <option value="true">On</option> - </param> + <!-- List of chromatographic columns --> + <param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], dbfields = db['dbfields'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/> - <when value="false"></when> - <when value="true"> - <!-- Columns --> - <conditional name="cols"> - <param name="all" label="All chromatographic columns" type="select"> - <option value="true">Yes</option> - <option value="false">No</option> - </param> - <when value="true"></when> - <when value="false"> - <param name="list" label="Chromatographic columns" type="text" size="64" value="" help="Set here the list of chromatographic columns against which the retention time matching will be run. This is a comma separated list of the column names as used instead the database file."/> - </when> - </conditional> - - <!-- Tolerances --> - <param name="tolx" label="RTX retention time tolerance, parameter x (in seconds)" type="float" help="" value="5"/> - <param name="toly" label="RTY retention time tolerance, parameter y" type="float" help="" value="0.8"/> - <param name="tolz" label="RTZ retention time tolerance, used when precursor matching is enabled." type="float" help="" value="5"/> - </when> - </conditional> + <!-- Tolerances --> + <param name="tolx" label="RTX retention time tolerance, parameter x (in seconds)" type="float" help="" value="5"/> + <param name="toly" label="RTY retention time tolerance, parameter y" type="float" help="" value="0.8"/> + <param name="tolz" label="RTZ retention time tolerance, used when precursor matching is enabled." type="float" help="" value="5"/> <!-- PRECURSOR MATCH --> <conditional name="prec"> @@ -221,18 +223,35 @@ <tests> - <!-- Simple quick test --> + <!-- File database test --> <test> - <param name="dbfile" value="filedb.tsv"/> + <param name="dbtype" value="inhouse"/> + <param name="dburl" value="filedb.tsv"/> + <param name="dbfields" value=""/> + <param name="dbmsmodes" value=""/> <param name="mzrtinput" value="mz-input-small.tsv"/> <param name="inputfields" value=""/> - <param name="dbfields" value=""/> - <param name="dbmsmodes" value=""/> <param name="mzmode" value="pos"/> <output name="mainoutput" file="filedb-small-mz-match-output.tsv"/> <output name="peaksoutput" file="filedb-small-mz-match-peaks-output.tsv"/> <output name="htmloutput" file="filedb-small-mz-match-html-output.html"/> </test> + + <!-- File database test --> +<!-- + <test> + <param name="dbtype" value="peakforest"/> + <param name="dbtoken" value="@PEAKFOREST_TOKEN@"/> + <param name="mzrtinput" value="mz-input-small.tsv"/> + <param name="inputfields" value=""/> + <param name="mzmode" value="pos"/> + <output name="mainoutput"> + <assert_contents> + <has_text text="mz"/> + </assert_contents> + </output> + </test> +--> </tests> <!--~~~~ @@ -246,7 +265,17 @@ LC/MS matching ============== -This tool performs LC/MS matching on an input list of MZ/RT values, using a provided single file database. +This tool performs LC/MS matching on an input list of MZ/RT values, using either a provided in-house single file database or a connection to Peakforest database. + +-------- +Database +-------- + +When selecting the database, you have the choice between a Peakforest database or an in-house file. + +For the Peakforest database, a default REST web base address is already provided. But you can change it of you want to use a custom database. A field is also available for setting a token key in case the access to the Peakforest database you want to use is restricted. This is the case of the default database. + +For the in-house file, please refer to the paragraph "Single file database" below. ----------- Input files
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/list-chrom-cols.py Sat Sep 03 17:02:01 2016 -0400 @@ -0,0 +1,63 @@ +#!/usr/bin/env python + +import argparse +import subprocess +import re +import urllib2 +import json +import csv + +def get_chrom_cols(dbtype, dburl, dbtoken = None, dbfields = None): + + cols = [] + + if dbtype == 'peakforest': + url = dburl + ( '' if dburl[-1] == '/' else '/' ) + 'metadata/lc/list-code-columns' + if dbtoken is not None: + url += '?token=' + dbtoken + result = urllib2.urlopen(url).read() + v = json.JSONDecoder().decode(result) + i = 0 + for colid, coldesc in v.iteritems(): + cols.append( (coldesc['name'], colid, i == 0) ) + ++i + + elif dbtype == 'inhouse': + # Get field for chromatographic column name + col_field = 'col' + if dbfields is not None: + fields = dict(u.split("=") for u in dbfields.split(",")) + if 'col' in fields: + col_field = fields['col'] + + # Get all column names from file + with open(dburl.get_file_name(), 'rb') as dbfile: + reader = csv.reader(dbfile, delimiter = "\t", quotechar='"') + header = reader.next() + i = header.index(col_field) + allcols = [] + for row in reader: + col = row[i] + if col not in allcols: + allcols.append(col) + for i, c in enumerate(allcols): + cols.append( (c, c, i == 0) ) + + return cols + +######## +# MAIN # +######## + +if __name__ == '__main__': + + # Parse command line arguments + parser = argparse.ArgumentParser(description='Script for getting chromatographic columns of an RMSDB database for Galaxy tool lcmsmatching.') + parser.add_argument('-d', help = 'Database type', dest = 'dbtype', required = True) + parser.add_argument('-u', help = 'Database URL', dest = 'dburl', required = True) + parser.add_argument('-t', help = 'Database token', dest = 'dbtoken', required = False) + parser.add_argument('-f', help = 'Database fields', dest = 'dbfields', required = False) + args = parser.parse_args() + args_dict = vars(args) + + print(get_chrom_cols(**args_dict))
--- a/search-mz Tue Jul 12 12:02:37 2016 -0400 +++ b/search-mz Sat Sep 03 17:02:01 2016 -0400 @@ -17,11 +17,16 @@ source(file.path(dirname(script.path), 'biodb-common.R'), chdir = TRUE) source(file.path(dirname(script.path), 'nethlp.R'), chdir = TRUE) +# Missing paste0() function in R 2.14.1 +if (as.integer(R.Version()$major) == 2 && as.numeric(R.Version()$minor) < 15) + paste0 <- function(...) paste(..., sep = '') + ############# # CONSTANTS # ############# PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE) +USERAGENT <- 'search-mz ; pierrick.roger@gmail.com' # Authorized database types MSDB.XLS <- 'xls' @@ -187,12 +192,12 @@ 'database', 'd', 1, 'character', paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'), 'url', NA_character_, 1, 'character', 'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.', 'cache-dir', NA_character_, 1, 'character', 'Path to directory where to store cache files. Only used when database flag is set to "xls".', - 'useragent', NA_character_, 1, 'character', 'User agent. Used by the "Peakforest" database.', 'db-name', NA_character_, 1, 'character', 'Name of the database. Used by the "4tabsql" database.', - 'db-user', NA_character_, 1, 'character', 'Name of the database. Used by the "4tabsql" database.', - 'db-password', NA_character_, 1, 'character', 'Name of the database. Used by the "4tabsql" database.', + 'db-user', NA_character_, 1, 'character', 'User of the database. Used by the "4tabsql" database.', + 'db-password', NA_character_, 1, 'character', 'Password of the database user. Used by the "4tabsql" database.', 'db-fields', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-fields']], '".'), 'db-ms-modes', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-ms-modes']], '".'), + 'db-token', NA_character_, 1, 'character', 'Database token. Used by Peakforest database.', 'debug', NA_character_, 0, 'logical', 'Set debug mode.' ) @@ -224,7 +229,7 @@ # Check values error <- .check.db.conn.opts(opt) - if (is.null(opt[['output-file']])) { + if (is.null(opt[['output-file']]) && is.null(opt[['list-cols']])) { warning("You must set a path for the output file.") error <- TRUE } @@ -327,10 +332,6 @@ warning("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.") error <- TRUE } - if (is.null(opt$useragent)) { - warning("When using PeakForest database, you must specify a user agent with option --useragent.") - error <- TRUE - } } return(error) @@ -363,10 +364,10 @@ } db <- switch(opt$database, - peakforest = MsPeakForestDb$new(url = opt$url, useragent = opt$useragent), - xls = MsXlsDb(db_dir = opt$url, cache_dir = opt[['cache-dir']]), - '4tabsql' = Ms4TabSqlDb(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]), - file = MsFileDb(file = opt$url), + peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]), + xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]), + '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]), + file = MsFileDb$new(file = opt$url), NULL) db$setPrecursors(precursors) if (db$areDbFieldsSettable()) @@ -389,7 +390,7 @@ for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) { field <- output.fields[[extdb]] if (field %in% colnames(peaks)) - peaks[[field]] <- vapply(peaks[[field]], function(id) paste0('<a href="', get.entry.url(class = extdb, accession = id, content.type = RBIODB.HTML), '">', id, '</a>'), FUN.VALUE = '') + peaks[[field]] <- vapply(peaks[[field]], function(id) paste0('<a href="', get.entry.url(class = extdb, accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '') } # Write HTML @@ -469,7 +470,7 @@ # Print columns if ( ! is.null(opt[['list-cols']])) { cols <- db$getChromCol() - df.write.tsv(cols, file = opt[['output-file']]) + df.write.tsv(cols, file = if (is.null(opt[['output-file']])) stdout() else opt[['output-file']]) q(status = 0) } @@ -506,7 +507,7 @@ # Check chrom columns if ( ! is.null(opt[['check-cols']]) && ! is.null(opt$rtcol)) { - dbcols <- db$getChromCol() + dbcols <- db$getChromCol()[['id']] unknown.cols <- opt$rtcol[ ! opt$rtcol %in% dbcols] if (length(unknown.cols) > 0) { stop(paste0("Unknown chromatographic column", (if (length(unknown.cols) > 1) 's' else ''), ': ', paste(unknown.cols, collapse = ', '), ".\nAllowed chromatographic column names are:\n", paste(dbcols, collapse = "\n")))