comparison NcbigeneEntry.R @ 2:20d69a062da3 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
author prog
date Thu, 02 Mar 2017 08:55:00 -0500
parents
children
comparison
equal deleted inserted replaced
1:253d531a0193 2:20d69a062da3
1 #####################
2 # CLASS DECLARATION #
3 #####################
4
# Reference class for one NCBI Gene entry. It declares no fields or methods of
# its own here; everything comes from the parent class "BiodbEntry".
NcbigeneEntry <- methods::setRefClass(
  Class = "NcbigeneEntry",
  contains = "BiodbEntry"
)
6
7 ###########
8 # FACTORY #
9 ###########
10
# Build NcbigeneEntry objects from NCBI Gene XML documents.
#
# contents: collection of XML documents given as text. Each element is parsed
#           with XML::xmlInternalTreeParse(asText = TRUE).
# drop:     if TRUE and `contents` holds exactly one element, the single result
#           (an entry or NULL) is returned instead of a one-element list.
#
# Returns a list with one slot per element of `contents`, in the same order.
# A slot is NULL when no accession could be extracted for that element: the
# content was NULL/NA, the document contains an <Error>/<ERROR> node, or no
# "Gene-track_geneid" value was found.
createNcbigeneEntryFromXml <- function(contents, drop = TRUE) {

  # XPath expression used to extract each generic field.
  xpath.expr <- character()
  xpath.expr[[BIODB.ACCESSION]] <- "//Gene-track_geneid"
  # Must start with "//" like the other expressions: with a single leading "/"
  # the path only matches when Dbtag_db is the document root, so the KEGG id
  # was never found.
  xpath.expr[[BIODB.KEGG.ID]] <- "//Dbtag_db[text()='KEGG']/..//Object-id_str"
  xpath.expr[[BIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str"
  xpath.expr[[BIODB.LOCATION]] <- "//Gene-ref_maploc"
  xpath.expr[[BIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc"
  xpath.expr[[BIODB.SYMBOL]] <- "//Gene-ref_locus"
  xpath.expr[[BIODB.SYNONYMS]] <- "//Gene-ref_syn_E"

  # Preallocate one slot per input. Slots stay NULL unless a valid entry is
  # stored, which is how inputs without accession are reported.
  entries <- vector("list", length(contents))

  for (i in seq_along(contents)) {

    content <- contents[[i]]

    # Nothing to parse: leave the slot NULL rather than crashing the parser.
    if (is.null(content) || (length(content) == 1 && is.na(content)))
      next

    # Create instance
    entry <- NcbigeneEntry$new()

    # Parse XML
    xml <- XML::xmlInternalTreeParse(content, asText = TRUE)

    # Only extract fields when the document reports no error.
    has.error <- length(XML::getNodeSet(xml, "//Error")) > 0 ||
      length(XML::getNodeSet(xml, "//ERROR")) > 0

    if ( ! has.error) {

      # Test generic xpath expressions
      for (field in names(xpath.expr)) {
        v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue)
        if (length(v) > 0) {

          # Eliminate duplicates
          v <- v[ ! duplicated(v)]

          # Set field
          entry$setField(field, v)
        }
      }

      # CCDS ID
      ccdsid <- .find.ccds.id(xml)
      if ( ! is.na(ccdsid))
        entry$setField(BIODB.NCBI.CCDS.ID, ccdsid)
    }

    # Keep the entry only if it carries an accession. The test copes with an
    # accession of length 0 or > 1, which a bare `if (is.na(...))` does not.
    accession <- entry$getField(BIODB.ACCESSION)
    if (length(accession) > 0 && ! all(is.na(accession)))
      entries[[i]] <- entry
  }

  # If the input was a single element, then output a single object
  if (drop && length(contents) == 1)
    entries <- entries[[1]]

  return(entries)
}
70
71 ################
72 # FIND CCDS ID #
73 ################
74
# Select the CCDS identifier to report for a parsed NCBI Gene document.
#
# xml: parsed XML document, queried with XML::getNodeSet().
#
# Returns a single string, or NA_character_ when the document has no CCDS
# "Object-id_str" node, or when the values differ and none of them has an
# associated "Gene-commentary_label".
.find.ccds.id <- function(xml) {

  # All identifier nodes located under the parent of a Dbtag_db equal to "CCDS".
  ccds.nodes <- XML::getNodeSet(xml, "//Dbtag_db[text()='CCDS']/..//Object-id_str")

  # First pass: keep the first value as long as every following node repeats
  # it; reset to NA as soon as two values differ.
  chosen <- NA_character_
  for (node in ccds.nodes) {
    value <- XML::xmlValue(node)
    if (is.na(chosen)) {
      chosen <- value
      next
    }
    if (value != chosen) {
      chosen <- NA_character_
      break
    }
  }

  # A unique value was found: no need to arbitrate between candidates.
  if ( ! is.na(chosen))
    return(chosen)

  # Second pass: several distinct values (or none at all). For each node, take
  # the last "Gene-commentary_label" found under its "Gene-commentary"
  # ancestors, and keep the node whose label compares smallest with `<`.
  # Nodes without any label are ignored; on equal labels the first node wins.
  best.label <- NA_character_
  for (node in ccds.nodes) {
    labels <- XML::xpathSApply(node, "ancestor::Gene-commentary/Gene-commentary_label", XML::xmlValue)
    if (length(labels) >= 1) {
      label <- labels[[length(labels)]]
      if (is.na(best.label) || label < best.label) {
        best.label <- label
        chosen <- XML::xmlValue(node)
      }
    }
  }

  chosen
}