view NcbigeneEntry.R @ 3:f61ce21ed17c draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 476a081c0da66822f4e77070f5ce59d9f14511f4-dirty
author prog
date Thu, 02 Mar 2017 11:07:56 -0500
parents 20d69a062da3
children
line wrap: on
line source

#####################
# CLASS DECLARATION #
#####################

NcbigeneEntry <- methods::setRefClass("NcbigeneEntry", contains = "BiodbEntry")

###########
# FACTORY #
###########

createNcbigeneEntryFromXml <- function(contents, drop = TRUE) {

	entries <- list()

	# Define xpath expressions
	xpath.expr <- character()
	xpath.expr[[BIODB.ACCESSION]] <- "//Gene-track_geneid"
	xpath.expr[[BIODB.KEGG.ID]] <- "/Dbtag_db[text()='KEGG']/..//Object-id_str"
	xpath.expr[[BIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str"
	xpath.expr[[BIODB.LOCATION]] <- "//Gene-ref_maploc"
	xpath.expr[[BIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc"
	xpath.expr[[BIODB.SYMBOL]] <- "//Gene-ref_locus"
	xpath.expr[[BIODB.SYNONYMS]] <- "//Gene-ref_syn_E"

	for (content in contents) {

		# Create instance
		entry <- NcbigeneEntry$new()
	
		# Parse HTML
		xml <-  XML::xmlInternalTreeParse(content, asText = TRUE)

		# An error occured
		if (length(XML::getNodeSet(xml, "//Error")) == 0 && length(XML::getNodeSet(xml, "//ERROR")) == 0) {

			# Test generic xpath expressions
			for (field in names(xpath.expr)) {
				v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
				if (length(v) > 0) {

					# Eliminate duplicates
					v <- v[ ! duplicated(v)]

					# Set field
					entry$setField(field, v)
				}
			}
		
			# CCDS ID
			ccdsid <- .find.ccds.id(xml)
			if ( ! is.na(ccdsid))
				entry$setField(BIODB.NCBI.CCDS.ID, ccdsid)
		}

		entries <- c(entries, entry)
	}

	# Replace elements with no accession id by NULL
	entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)

	# If the input was a single element, then output a single object
	if (drop && length(contents) == 1)
		entries <- entries[[1]]

	return(entries)

	# Get data

}

################
# FIND CCDS ID #
################

.find.ccds.id <- function(xml) {

	# 1) Get all CCDS tags.
	ccds_elements <- XML::getNodeSet(xml, "//Dbtag_db[text()='CCDS']/..//Object-id_str")

	# 2) If all CCDS are the same, go to point 4.
	ccds <- NA_character_
	for (e in ccds_elements) {
		current_ccds <- XML::xmlValue(e)
		if (is.na(ccds))
			ccds <- current_ccds
		else {
			if (current_ccds != ccds) {
				ccds <- NA_character_
				break
			}
		}
	}

	# 3) There are several CCDS values, we need to find the best one (i.e.: the most current one).
	if (is.na(ccds)) {
		# For each CCDS, look for the parent Gene-commentary tag. Then look for the text content of the Gene-commentary_label which is situed under. Ignore CCDS that have no Gene-commentary_label associated. Choose the CCDS that has the smallest Gene-commentary_label in alphabetical order.
		version <- NA_character_
		for (e in ccds_elements) {
			versions <- XML::xpathSApply(e, "ancestor::Gene-commentary/Gene-commentary_label", XML::xmlValue)
			if (length(versions) < 1) next
			current_version <- versions[[length(versions)]]
			if (is.na(version) || current_version < version) {
				version <- current_version
				ccds <- XML::xmlValue(e)
			}
		}
	}

	return(ccds)
}