diff PubchemConn.R @ 2:20d69a062da3 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author prog
date Thu, 02 Mar 2017 08:55:00 -0500
parents 253d531a0193
children
line wrap: on
line diff
--- a/PubchemConn.R	Sat Sep 03 17:02:01 2016 -0400
+++ b/PubchemConn.R	Thu Mar 02 08:55:00 2017 -0500
@@ -1,59 +1,96 @@
-if ( ! exists('get.pubchem.compound.url')) { # Do not load again if already loaded
+#####################
+# CLASS DECLARATION #
+#####################
+
+PubchemConn <- methods::setRefClass("PubchemConn", contains = "RemotedbConn", fields = list( .db = "character" ))
+
+###############
+# CONSTRUCTOR #
+###############
+
+PubchemConn$methods( initialize = function(db = BIODB.PUBCHEMCOMP, ...) {
+	.db <<- db
+	callSuper(...)
+})
 
-	source('RemotedbConn.R')
-	source('PubchemCompound.R')
-	
-	#####################
-	# CLASS DECLARATION #
-	#####################
-	
-	PubchemConn <- setRefClass("PubchemConn", contains = "RemotedbConn")
+##########################
+# GET ENTRY CONTENT TYPE #
+##########################
+
+PubchemConn$methods( getEntryContentType = function() {
+	return(BIODB.XML)
+})
 
-	##########################
-	# GET ENTRY CONTENT TYPE #
-	##########################
+#####################
+# GET ENTRY CONTENT #
+#####################
+
+PubchemConn$methods( getEntryContent = function(ids) {
+
+	# Debug
+	.self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)..."))
+
+	URL.MAX.LENGTH <- 2083
 
-	PubchemConn$methods( getEntryContentType = function(type) {
-		return(BIODB.XML)
-	})
+	# Initialize return values
+	content <- rep(NA_character_, length(ids))
+
+	# Loop on all
+	n <- 0
+	while (n < length(ids)) {
 
-	#####################
-	# GET ENTRY CONTENT #
-	#####################
-	
-	PubchemConn$methods( getEntryContent = function(type, id) {
+		# Get list of accession ids to retrieve
+		accessions <- ids[(n + 1):length(ids)]
+
+		# Create URL request
+		x <- get.entry.url(class = .self$.db, accession = accessions, content.type = BIODB.XML, max.length = URL.MAX.LENGTH)
+
+		# Debug
+		.self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)..."))
 
-		if (type == BIODB.COMPOUND) {
+		# Send request
+		xmlstr <- .self$.get.url(x$url)
 
-			# Initialize return values
-			content <- rep(NA_character_, length(id))
+		# Increase number of entries retrieved
+		n <- n + x$n
 
-			# Request
-			content <- vapply(id, function(x) .self$.scheduler$getUrl(get.entry.url(BIODB.PUBCHEM, x, content.type = BIODB.XML)), FUN.VALUE = '')
+		# TODO When one of the id is wrong, no content is returned. Only a single error is returned, with the first faulty ID:
+#		<Fault xmlns="http://pubchem.ncbi.nlm.nih.gov/pug_rest" xmlns:xs="http://www.w3.org/2001/XMLSchema-instance" xs:schemaLocation="http://pubchem.ncbi.nlm.nih.gov/pug_rest https://pubchem.ncbi.nlm.nih.gov/pug_rest/pug_rest.xsd">
+#		<Code>PUGREST.NotFound</Code>
+#		<Message>Record not found</Message>
+#		<Details>No record data for CID 1246452553</Details>
+#		</Fault>
 
-			return(content)
+		# Parse XML and get included XML
+		if ( ! is.na(xmlstr)) {
+			xml <-  xmlInternalTreeParse(xmlstr, asText = TRUE)
+			ns <- c(pcns = "http://www.ncbi.nlm.nih.gov")
+			returned.ids <- xpathSApply(xml, paste0("//pcns:", if (.self$.db == BIODB.PUBCHEMCOMP) 'PC-CompoundType_id_cid' else 'PC-ID_id'), xmlValue, namespaces = ns)
+			content[match(returned.ids, ids)] <- vapply(getNodeSet(xml, paste0("//pcns:", if (.self$.db == BIODB.PUBCHEMCOMP) "PC-Compound" else 'PC-Substance'), namespaces = ns), saveXML, FUN.VALUE = '')
 		}
 
-		return(NULL)
-	})
-	
-	################
-	# CREATE ENTRY #
-	################
-	
-	PubchemConn$methods( createEntry = function(type, content, drop = TRUE) {
-		return(if (type == BIODB.COMPOUND) createPubchemCompoundFromXml(content, drop = drop) else NULL)
-	})
+		# Debug
+		.self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved..."))
+	}
+
+	return(content)
+})
+
+################
+# CREATE ENTRY #
+################
 
-	#########################
-	# GET PUBCHEM IMAGE URL #
-	#########################
-	
-	get.pubchem.image.url <- function(id) {
-	
-		url <- paste0('http://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid=', id, '&t=l')
+PubchemConn$methods( createEntry = function(content, drop = TRUE) {
+	return(if (.self$.db == BIODB.PUBCHEMCOMP) createPubchemEntryFromXml(content, drop = drop) else createPubchemSubstanceFromXml(content, drop = drop))
+})
 
-		return(url)
-	}
-	
-} # end of load safe guard
+#########################
+# GET PUBCHEM IMAGE URL #
+#########################
+
+get.pubchem.image.url <- function(id, db = BIODB.PUBCHEMCOMP) {
+
+	url <- paste0('http://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?', (if (db == BIODB.PUBCHEMCOMP) 'cid' else 'sid'), '=', id, '&t=l')
+
+	return(url)
+}