Mercurial > repos > prog > lcmsmatching
diff PubchemConn.R @ 2:20d69a062da3 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author | prog |
---|---|
date | Thu, 02 Mar 2017 08:55:00 -0500 |
parents | 253d531a0193 |
children |
line wrap: on
line diff
--- a/PubchemConn.R Sat Sep 03 17:02:01 2016 -0400 +++ b/PubchemConn.R Thu Mar 02 08:55:00 2017 -0500 @@ -1,59 +1,96 @@ -if ( ! exists('get.pubchem.compound.url')) { # Do not load again if already loaded +##################### +# CLASS DECLARATION # +##################### + +PubchemConn <- methods::setRefClass("PubchemConn", contains = "RemotedbConn", fields = list( .db = "character" )) + +############### +# CONSTRUCTOR # +############### + +PubchemConn$methods( initialize = function(db = BIODB.PUBCHEMCOMP, ...) { + .db <<- db + callSuper(...) +}) - source('RemotedbConn.R') - source('PubchemCompound.R') - - ##################### - # CLASS DECLARATION # - ##################### - - PubchemConn <- setRefClass("PubchemConn", contains = "RemotedbConn") +########################## +# GET ENTRY CONTENT TYPE # +########################## + +PubchemConn$methods( getEntryContentType = function() { + return(BIODB.XML) +}) - ########################## - # GET ENTRY CONTENT TYPE # - ########################## +##################### +# GET ENTRY CONTENT # +##################### + +PubchemConn$methods( getEntryContent = function(ids) { + + # Debug + .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)...")) + + URL.MAX.LENGTH <- 2083 - PubchemConn$methods( getEntryContentType = function(type) { - return(BIODB.XML) - }) + # Initialize return values + content <- rep(NA_character_, length(ids)) + + # Loop on all + n <- 0 + while (n < length(ids)) { - ##################### - # GET ENTRY CONTENT # - ##################### - - PubchemConn$methods( getEntryContent = function(type, id) { + # Get list of accession ids to retrieve + accessions <- ids[(n + 1):length(ids)] + + # Create URL request + x <- get.entry.url(class = .self$.db, accession = accessions, content.type = BIODB.XML, max.length = URL.MAX.LENGTH) + + # Debug + .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)...")) - if (type == BIODB.COMPOUND) { + # Send request + xmlstr <- .self$.get.url(x$url) - # Initialize return values - content <- rep(NA_character_, length(id)) + # Increase number of entries retrieved + n <- n + x$n - # Request - content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.PUBCHEM, x, content.type = BIODB.XML)), FUN.VALUE = '') + # TODO When one of the id is wrong, no content is returned. Only a single error is returned, with the first faulty ID: +# <Fault xmlns="http://pubchem.ncbi.nlm.nih.gov/pug_rest" xmlns:xs="http://www.w3.org/2001/XMLSchema-instance" xs:schemaLocation="http://pubchem.ncbi.nlm.nih.gov/pug_rest https://pubchem.ncbi.nlm.nih.gov/pug_rest/pug_rest.xsd"> +# <Code>PUGREST.NotFound</Code> +# <Message>Record not found</Message> +# <Details>No record data for CID 1246452553</Details> +# </Fault> - return(content) + # Parse XML and get included XML + if ( ! is.na(xmlstr)) { + xml <- xmlInternalTreeParse(xmlstr, asText = TRUE) + ns <- c(pcns = "http://www.ncbi.nlm.nih.gov") + returned.ids <- xpathSApply(xml, paste0("//pcns:", if (.self$.db == BIODB.PUBCHEMCOMP) 'PC-CompoundType_id_cid' else 'PC-ID_id'), xmlValue, namespaces = ns) + content[match(returned.ids, ids)] <- vapply(getNodeSet(xml, paste0("//pcns:", if (.self$.db == BIODB.PUBCHEMCOMP) "PC-Compound" else 'PC-Substance'), namespaces = ns), saveXML, FUN.VALUE = '') } - return(NULL) - }) - - ################ - # CREATE ENTRY # - ################ - - PubchemConn$methods( createEntry = function(type, content, drop = TRUE) { - return(if (type == BIODB.COMPOUND) createPubchemCompoundFromXml(content, drop = drop) else NULL) - }) + # Debug + .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved...")) + } + + return(content) +}) + +################ +# CREATE ENTRY # +################ - ######################### - # GET PUBCHEM IMAGE URL # - ######################### - - get.pubchem.image.url <- function(id) { - - url <- paste0('http://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid=', id, '&t=l') +PubchemConn$methods( createEntry = function(content, drop = TRUE) { + return(if (.self$.db == BIODB.PUBCHEMCOMP) createPubchemEntryFromXml(content, drop = drop) else createPubchemSubstanceFromXml(content, drop = drop)) +}) - return(url) - } - -} # end of load safe guard +######################### +# GET PUBCHEM IMAGE URL # +######################### + +get.pubchem.image.url <- function(id, db = BIODB.PUBCHEMCOMP) { + + url <- paste0('http://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?', (if (db == BIODB.PUBCHEMCOMP) 'cid' else 'sid'), '=', id, '&t=l') + + return(url) +}