Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyMicrodata/utils.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d30785e31577 | 1:56ad4e20f292 |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """ | |
| 3 Various utilities for pyMicrodata | |
| 4 | |
| 5 @organization: U{World Wide Web Consortium<http://www.w3.org>} | |
| 6 @author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} | |
| 7 @license: This software is available for use under the | |
| 8 U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} | |
| 9 """ | |
| 10 | |
| 11 """ | |
| 12 $Id: utils.py,v 1.7 2012/09/01 15:17:28 ivan Exp $ | |
| 13 $Date: 2012/09/01 15:17:28 $ | |
| 14 """ | |
| 15 import os, os.path, sys | |
| 16 (py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info | |
| 17 | |
| 18 if py_v_major >= 3 : | |
| 19 from urllib.request import Request, urlopen | |
| 20 from urllib.parse import urljoin, quote, urlparse | |
| 21 from http.server import BaseHTTPRequestHandler | |
| 22 from urllib.error import HTTPError as urllib_HTTPError | |
| 23 else : | |
| 24 from urllib.request import Request, urlopen | |
| 25 from urllib.error import HTTPError as urllib_HTTPError | |
| 26 from urllib.parse import urljoin, urlparse | |
| 27 from urllib.parse import quote | |
| 28 from http.server import BaseHTTPRequestHandler | |
| 29 | |
| 30 import re | |
| 31 from datetime import datetime | |
| 32 | |
| 33 from rdflib import BNode | |
| 34 import rdflib | |
| 35 if rdflib.__version__ >= "3.0.0" : | |
| 36 from rdflib import RDF as ns_rdf | |
| 37 else : | |
| 38 from rdflib.RDF import RDFNS as ns_rdf | |
| 39 | |
| 40 ################################################################################# | |
def is_absolute_URI(uri):
    """
    Tell whether a URI reference is absolute, i.e. whether it carries a scheme.

    @param uri: the URI reference to inspect
    @return: True if C{urlparse} reports a non-empty scheme, False otherwise
    """
    scheme = urlparse(uri).scheme
    return scheme != ""
| 43 | |
| 44 ################################################################################# | |
| 45 | |
def fragment_escape(name):
    """
    Percent-encode a URI reference, leaving only the characters '/', '~', ':',
    '-' and '.' (plus whatever C{quote} never escapes) untouched.

    @param name: the string to escape
    @return: the percent-encoded string
    """
    keep_unescaped = '/~:-.'
    return quote(name, safe=keep_unescaped)
| 48 | |
| 49 ################################################################################# | |
| 50 | |
def generate_URI(base, v):
    """
    Generate an (absolute) URI; if v is a relative reference it is resolved
    against base, otherwise v is returned untouched.

    @param base: Absolute URI for base
    @param v: relative or absolute URI
    @return: the resulting URI (a string)
    """
    if is_absolute_URI(v):
        return v

    # UGLY!!! There is a bug for a corner case in python version <= 2.5.X:
    # a query-only reference is simply appended to the base there.
    # (The original returned ``base+val``; ``val`` was never defined.)
    if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5):
        return base + v

    # Trust the python library...
    # Well, not quite:-) there is what is, in my view, a bug in urljoin; in some
    # cases it swallows the '#' or '?' character at the end. This is clearly a
    # problem with Semantic Web URI-s, so the trailing character is put back.
    v = fragment_escape(v.strip())
    joined = urljoin(base, v)

    # Explicit emptiness guards replace the former bare ``except``, whose only
    # job was to absorb the IndexError of ``v[-1]`` / ``joined[-1]``.
    if v and joined and v[-1] != joined[-1] and v[-1] in ("#", "?"):
        return joined + v[-1]
    return joined
| 79 | |
| 80 ################################################################################# | |
def generate_RDF_collection(graph, vals):
    """
    Build an RDF List out of vals inside graph and hand back its head.

    @param graph: RDF graph receiving the rdf:first / rdf:rest triples
    @type graph: RDFLib Graph
    @param vals: array of RDF Resources (already in RDF format)
    @return: head of the List (an RDF Resource); this is rdf:nil when vals is empty
    """
    # One fresh blank node per member, closed off by rdf:nil.
    cells = [BNode() for _ in vals]
    cells.append(ns_rdf["nil"])

    # Pair every cell with its member and with the cell that follows it.
    for cell, successor, member in zip(cells, cells[1:], vals):
        graph.add((cell, ns_rdf["first"], member))
        graph.add((cell, ns_rdf["rest"], successor))
    return cells[0]
| 96 | |
| 97 ################################################################################# | |
def get_Literal(Pnode):
    """
    Get (recursively) the full text from a DOM Node.

    Text nodes contribute their data, element nodes contribute the text of
    their subtree, every other node type (comments, etc.) is ignored.

    @param Pnode: DOM Node
    @return: string; white space is kept exactly as found in the DOM
    """
    # Collect the pieces and join once instead of growing a string piece by piece.
    parts = []
    for node in Pnode.childNodes:
        if node.nodeType == node.TEXT_NODE:
            parts.append(node.data)
        elif node.nodeType == node.ELEMENT_NODE:
            parts.append(get_Literal(node))

    # An earlier variant collapsed and stripped white space; at present, the
    # agreement seems to say that white spaces are maintained.
    return "".join(parts)
| 118 | |
| 119 ################################################################################# | |
def get_lang(node):
    """
    Return the language set directly on a DOM element, looking at both the
    C{lang} and the C{xml:lang} attributes.

    @param node: DOM element node
    @return: the value of C{lang}; None when there is no C{lang} attribute or
    when a non-empty C{lang} disagrees (case-insensitively) with C{xml:lang}
    """
    if not node.hasAttribute("lang"):
        return None

    lang = node.getAttribute("lang")
    if not lang or not node.hasAttribute("xml:lang"):
        # Nothing to cross-check against.
        return lang

    xml_lang = node.getAttribute("xml:lang").lower()
    if xml_lang == lang.lower():
        return lang

    # Conflicting lang and xml:lang: this is an error, the value is invalidated.
    return None
| 131 | |
def get_lang_from_hierarchy(document, node):
    """
    Find the language that applies to node: its own language if it has one,
    otherwise the one of the closest ancestor, with document as the last resort.

    @param document: the node at which the upward walk stops
    @param node: DOM element node where the search starts
    @return: a language value, or None if none is found up to and including document
    """
    current = node
    while True:
        lang = get_lang(current)
        if lang is not None:
            return lang

        parent = current.parentNode
        if parent is None or parent == document:
            # Ran out of ancestors below the top: fall back on the top itself.
            return get_lang(document)
        current = parent
| 142 | |
| 143 ################################################################################# | |
# XML Schema datatype URIs that get_time_type() may return.
datetime_type = "http://www.w3.org/2001/XMLSchema#dateTime"
time_type = "http://www.w3.org/2001/XMLSchema#time"
date_type = "http://www.w3.org/2001/XMLSchema#date"
date_gYear = "http://www.w3.org/2001/XMLSchema#gYear"
date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth"
date_gMonthDay = "http://www.w3.org/2001/XMLSchema#gMonthDay"
duration_type = "http://www.w3.org/2001/XMLSchema#duration"

# Datatype URI -> strptime patterns accepted for that datatype. The order of
# the keys matters: get_time_type() returns the first datatype that matches.
# Durations are matched with strptime patterns too, so each component is
# limited to what the corresponding date field accepts.
_formats = {
    date_gMonthDay: ["%m-%d"],
    date_gYearMonth: ["%Y-%m"],
    date_gYear: ["%Y"],
    date_type: ["%Y-%m-%d", "%Y-%m-%dZ"],
    time_type: [
        "%H:%M",
        "%H:%M:%S",
        "%H:%M:%SZ",
        "%H:%M:%S.%f",
    ],
    datetime_type: [
        "%Y-%m-%dT%H:%M",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%dT%H:%MZ",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S.%fZ",
    ],
    duration_type: [
        "P%dD",
        "P%YY%mM%dD",
        "P%YY%mM",
        "P%YY%dD",
        "P%YY",
        "P%mM",
        "P%mM%dD",
    ],
}

# Patterns for the part of a duration that follows the 'T' separator.
_dur_times = ["%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS"]


def _matches_any(value, formats):
    """Return True if value parses under at least one of the strptime formats."""
    for fmt in formats:
        try:
            datetime.strptime(value, fmt)
        except ValueError:
            continue
        return True
    return False


def get_time_type(string):
    """
    Check whether the string abides to one of the accepted time related
    datatypes, and returns that one if yes.

    @param string: the attribute value to be checked
    @return: a datatype URI or None
    """
    # The plain cases: the value matches one of the patterns as it is.
    for key, formats in _formats.items():
        if _matches_any(string, formats):
            return key

    # Now come the special cases:-(
    # Check first for the duration stuff, that is the nastiest.
    # startswith() keeps the empty string and a lone '-' from raising IndexError.
    negative = string.startswith("-P")
    if negative or (len(string) > 2 and string[0] == "P"):
        # This is meant to be a duration type.
        if negative:
            # Get rid of the leading '-' and check again; the sign is dropped
            # for the rest of the duration checks as well.
            string = string[1:]
            if _matches_any(string, _formats[duration_type]):
                return duration_type

        # Let us see if the value contains a separate time portion, and cut that one.
        durs = string.split("T")
        if len(durs) != 2:
            # Well, no more tricks, this is a plain type.
            return None
        dur, tm = durs
        if _matches_any(dur, _formats[duration_type]) and _matches_any(tm, _dur_times):
            return duration_type
        # Something went wrong...
        return None

    # If we got here, we should check the time zone.
    # There is a discrepancy between the python and the HTML5/XSD lexical string,
    # which means that the date and the time zone portions are handled separately.
    # A time zone suffix is a sign followed by hh:mm, i.e. six characters.
    if len(string) < 6 or string[-6] not in "+-":
        return None
    body = string[:-6]   # the time-zone-less portion of the string
    tz = string[-5:]     # the hh:mm part of the time zone
    if not _matches_any(tz, ["%H:%M"]):
        # Bummer, this is not a correct time.
        return None
    # The time zone is fine, the datetime portion has to be checked.
    if _matches_any(body, _formats[datetime_type]):
        return datetime_type
    return None
| 267 | |
| 268 | |
| 269 ######################################################################################################### | |
| 270 # Handling URIs | |
class URIOpener:
    """A wrapper around the urllib method to open a resource. Beyond accessing the data itself, the class
    sets the content location.
    The class also adds an accept header to the outgoing request, namely
    text/html and application/xhtml+xml (unless set explicitly by the caller).

    @ivar data: the real data, ie, a file-like object
    @ivar headers: the return headers as sent back by the server
    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
    """
    CONTENT_LOCATION = 'Content-Location'

    def __init__(self, name, additional_headers=None):
        """
        @param name: URL to be opened
        @keyword additional_headers: additional HTTP request headers to be added to the call
        (a mapping of header names to values); an 'Accept' entry, in any letter case,
        replaces the default accept header
        @raise HTTPError: (from pyMicrodata) if the server answers with an HTTP error status
        @raise MicrodataError: (from pyMicrodata) for any other failure while opening the resource
        """
        try:
            extra = dict(additional_headers) if additional_headers else {}

            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            req = Request(url=name.split('#')[0])

            for key, value in extra.items():
                req.add_header(key, value)
            # Header names are case-insensitive, hence the lower-casing.
            if not any(key.lower() == 'accept' for key in extra):
                req.add_header('Accept', 'text/html, application/xhtml+xml')

            self.data = urlopen(req)
            self.headers = self.data.info()

            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urljoin(self.data.geturl(), self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

        except urllib_HTTPError as e:
            # NOTE(review): absolute import kept as in the original; confirm it
            # resolves when this module lives inside another package.
            from pyMicrodata import HTTPError
            # Non-standard status codes have no entry in the table; fall back on
            # the error's own text instead of failing with a KeyError.
            response = BaseHTTPRequestHandler.responses.get(e.code)
            msg = response[1] if response else str(e)
            raise HTTPError('%s' % msg, e.code)
        except Exception as e:
            from pyMicrodata import MicrodataError
            raise MicrodataError('%s' % e)
| 310 |
