Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/structureddata.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d30785e31577 | 1:56ad4e20f292 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Extraction parsers for structured data embedded into HTML or XML files. | |
| 4 The former may include RDFa or microdata. The syntax and the extraction | |
| 5 procedures are based on: | |
| 6 | |
| 7 * The RDFa specifications: http://www.w3.org/TR/#tr_RDFa | |
| 8 * The microdata specification: http://www.w3.org/TR/microdata/ | |
| 9 * The specification of the microdata to RDF conversion: | |
| 10 http://www.w3.org/TR/microdata-rdf/ | |
| 11 | |
| 12 License: W3C Software License, | |
| 13 http://www.w3.org/Consortium/Legal/copyright-software | |
| 14 Author: Ivan Herman | |
| 15 Copyright: W3C | |
| 16 | |
| 17 """ | |
| 18 | |
| 19 from rdflib.parser import ( | |
| 20 Parser, StringInputSource, URLInputSource, FileInputSource) | |
| 21 | |
# Probe for the optional html5lib dependency.  After this block the name
# ``html5lib`` no longer refers to the module: it is a plain boolean flag
# that the parser classes below test before doing any work.
try:
    import html5lib
except ImportError:
    import warnings
    warnings.warn(
        'html5lib not found! RDFa and Microdata '
        'parsers will not be available.')
    html5lib = False
else:
    # The import is only needed as an availability check.
    assert html5lib
    html5lib = True
| 32 | |
| 33 | |
def _get_orig_source(source):
    """
    Unpack an RDFLib input source into the raw reference the RDFa and
    microdata processors work on.

    A bit of a hack; the RDFa/microdata parsers need more than what the
    upper layers of RDFLib provide.

    @param source: the input source handed over by RDFLib
    @return: a C{(baseURI, orig_source)} tuple. C{baseURI} is the result of
    C{source.getPublicId()}. C{orig_source} is the URL for a
    C{URLInputSource}, the file name for a C{FileInputSource} (the file
    object is closed here), and the byte stream for a C{StringInputSource}
    or any other kind of source.
    """
    located_kinds = (URLInputSource, FileInputSource)
    if (isinstance(source, StringInputSource)
            or not isinstance(source, located_kinds)):
        # String sources and unrecognised source kinds are read as a stream.
        reference = source.getByteStream()
    elif isinstance(source, URLInputSource):
        reference = source.url
    else:
        # Only the file name is passed on; the file object itself is closed.
        opened = source.file
        reference = opened.name
        opened.close()
    return (source.getPublicId(), reference)
| 51 | |
| 52 | |
def _check_error(graph):
    """
    Raise an exception if the given graph holds an RDFa error report.

    The graph is searched for nodes typed as C{RDFA_Error}; the first
    C{dc:description} found on such a node becomes the exception message.
    Error nodes without a description are passed over silently.

    @param graph: the graph to inspect (the processor graph, in RDFa spec.
    parlance)
    @raise Exception: with the text "RDFa parsing Error! <description>"
    """
    from .pyRdfa import RDFA_Error, ns_rdf
    from .pyRdfa.options import ns_dc
    error_pattern = (None, ns_rdf["type"], RDFA_Error)
    for error_node, _, _ in graph.triples(error_pattern):
        description_pattern = (error_node, ns_dc["description"], None)
        for _, _, message in graph.triples(description_pattern):
            raise Exception("RDFa parsing Error! %s" % message)
| 59 | |
| 60 | |
| 61 # This is the parser interface as it would look when called from the | |
| 62 # rest of RDFLib | |
class RDFaParser(Parser):
    """
    Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1
    processing, see the relevant W3C documents at
    http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG
    and, in general, for any XML language.

    The parser can also handle RDFa 1.0 if the extra C{rdfa_version}
    parameter is used and/or the input source uses RDFa 1.0 specific
    @version or DTD-s.
    """
    def parse(self, source, graph,
              pgraph=None,
              media_type="",
              rdfa_version=None,
              embedded_rdf=False,
              space_preserve=True,
              vocab_expansion=False,
              vocab_cache=False,
              refresh_vocab_cache=False,
              vocab_cache_report=False,
              check_lite=False):
        """
        Extract the RDFa content of C{source} into C{graph}.

        @param source: one of the input sources that the RDFLib package
        defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples; processor
        graph, in RDFa spec. parlance. If set to None, these triples are
        ignored
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
        (a.k.a. content type) of the RDFa source. None means the content
        type of the HTTP result is used, or a guess is made based on the
        suffix of a file
        @type media_type: string
        @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by
        default, 1.1 is used unless the source has explicit signals to use
        1.0 (e.g., using a @version attribute, using a DTD set up for 1.0,
        etc). The value is handed to the processor unchanged.
        @type rdfa_version: string
        @keyword embedded_rdf: some formats allow embedding RDF in other
        formats: (X)HTML can contain turtle in a special <script> element,
        SVG can have RDF/XML embedded in a <metadata> element. This flag
        controls whether those triples should be interpreted and added to
        the output graph. Some languages (e.g., SVG) require this, and the
        flag is ignored.
        @type embedded_rdf: Boolean
        @keyword space_preserve: by default, space in the HTML source must
        be preserved in the generated literal; this behavior can be switched
        off
        @type space_preserve: Boolean
        @keyword vocab_expansion: whether the RDFa @vocab attribute should
        also mean vocabulary expansion (see the RDFa 1.1 spec for further
        details)
        @type vocab_expansion: Boolean
        @keyword vocab_cache: in case vocab expansion is used, whether the
        expansion data (i.e., vocabulary) should be cached locally. This
        requires the ability for the local application to write on the
        local file system
        @type vocab_cache: Boolean
        @keyword vocab_cache_report: whether the details of vocabulary file
        caching process should be reported in the processor graph as
        information (mainly useful for debug)
        @type vocab_cache_report: Boolean
        @keyword refresh_vocab_cache: whether the caching checks of vocabs
        should be by-passed, ie, if caches should be re-generated regardless
        of the stored date (important for vocab development)
        @type refresh_vocab_cache: Boolean
        @keyword check_lite: generate extra warnings in case the input
        source is not RDFa 1.1 Lite
        @type check_lite: Boolean
        @raise ImportError: if html5lib could not be imported when this
        module was loaded
        """
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot use '
                'RDFa and Microdata parsers.')

        baseURI, orig_source = _get_orig_source(source)
        # Every keyword is forwarded untouched to the processing step.
        settings = dict(
            media_type=media_type,
            rdfa_version=rdfa_version,
            embedded_rdf=embedded_rdf,
            space_preserve=space_preserve,
            vocab_expansion=vocab_expansion,
            vocab_cache=vocab_cache,
            vocab_cache_report=vocab_cache_report,
            refresh_vocab_cache=refresh_vocab_cache,
            check_lite=check_lite,
        )
        self._process(graph, pgraph, baseURI, orig_source, **settings)

    def _process(self, graph, pgraph, baseURI, orig_source,
                 media_type="",
                 rdfa_version=None,
                 embedded_rdf=False,
                 space_preserve=True,
                 vocab_expansion=False,
                 vocab_cache=False,
                 vocab_cache_report=False,
                 refresh_vocab_cache=False,
                 check_lite=False):
        """
        Run the pyRdfa processor on an already unpacked source.

        The keyword arguments have the same meaning as in L{parse}. The
        options object built here is kept on the instance as
        C{self.options}.

        @param graph: output graph receiving the extracted triples
        @param pgraph: processor graph for error and warning triples; when
        None, a throw-away graph is used instead
        @param baseURI: base URI handed to the processor
        @param orig_source: the source reference given to the processor's
        C{graph_from_source}
        @raise Exception: if the processor graph contains an RDFa error
        with a description after processing
        """
        from .pyRdfa import pyRdfa, Options
        from rdflib import Graph

        # Errors are always collected, even if the caller does not want
        # them, so that they can be checked at the end.
        if pgraph is None:
            processor_graph = Graph()
        else:
            processor_graph = pgraph

        option_values = dict(
            output_processor_graph=True,
            embedded_rdf=embedded_rdf,
            space_preserve=space_preserve,
            vocab_expansion=vocab_expansion,
            vocab_cache=vocab_cache,
            vocab_cache_report=vocab_cache_report,
            refresh_vocab_cache=refresh_vocab_cache,
            check_lite=check_lite,
        )
        self.options = Options(**option_values)

        # A missing media type is passed on as the empty string.
        effective_media_type = "" if media_type is None else media_type
        processor = pyRdfa(self.options,
                           base=baseURI,
                           media_type=effective_media_type,
                           rdfa_version=rdfa_version)
        processor.graph_from_source(
            orig_source, graph=graph, pgraph=processor_graph,
            rdfOutput=False)
        # This may result in an exception if the graph parsing led to an
        # error
        _check_error(processor_graph)
| 179 | |
| 180 | |
class RDFa10Parser(Parser):
    """
    Convenience class that runs the RDFa parser with the RDFa version
    fixed to 1.0.
    """
    def parse(self, source, graph, pgraph=None, media_type=""):
        """
        Delegate to a fresh C{RDFaParser}, requesting RDFa version "1.0".

        @param source: one of the input sources that the RDFLib package
        defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples; processor
        graph, in RDFa spec. parlance. If set to None, these triples are
        ignored
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
        (a.k.a. content type) of the RDFa source. None means the content
        type of the HTTP result is used, or a guess is made based on the
        suffix of a file
        @type media_type: string
        """
        delegate = RDFaParser()
        delegate.parse(source, graph,
                       pgraph=pgraph,
                       media_type=media_type,
                       rdfa_version="1.0")
| 207 | |
| 208 | |
class MicrodataParser(Parser):
    """
    Wrapper around an HTML5 microdata, extracted and converted into RDF. For
    the specification of microdata, see the relevant section of the HTML5
    spec: http://www.w3.org/TR/microdata/; for the algorithm used to extract
    microdata into RDF, see http://www.w3.org/TR/microdata-rdf/.
    """
    def parse(self, source, graph, vocab_expansion=False, vocab_cache=False):
        """
        Extract the microdata content of C{source} into C{graph}.

        @param source: one of the input sources that the RDFLib package
        defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword vocab_expansion: whether vocabulary expansion should be
        performed; handed unchanged to the microdata processor
        @type vocab_expansion: Boolean
        @keyword vocab_cache: in case vocab expansion is used, whether the
        expansion data (i.e., vocabulary) should be cached locally. This
        requires the ability for the local application to write on the
        local file system
        @type vocab_cache: Boolean
        @raise ImportError: if html5lib could not be imported when this
        module was loaded
        """
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot use RDFa '
                'and Microdata parsers.')

        baseURI, orig_source = _get_orig_source(source)
        expansion_settings = dict(vocab_expansion=vocab_expansion,
                                  vocab_cache=vocab_cache)
        self._process(graph, baseURI, orig_source, **expansion_settings)

    def _process(self, graph, baseURI, orig_source,
                 vocab_expansion=False, vocab_cache=False):
        """
        Run the pyMicrodata processor on an already unpacked source.

        @param graph: output graph receiving the extracted triples
        @param baseURI: base URI handed to the processor
        @param orig_source: the source reference given to the processor's
        C{graph_from_source}
        @keyword vocab_expansion: see L{parse}
        @keyword vocab_cache: see L{parse}
        """
        from .pyMicrodata import pyMicrodata
        extractor = pyMicrodata(base=baseURI,
                                vocab_expansion=vocab_expansion,
                                vocab_cache=vocab_cache)
        # rdfOutput is switched off here, exactly as for the RDFa processor.
        extractor.graph_from_source(orig_source, graph=graph, rdfOutput=False)
| 253 | |
| 254 | |
class StructuredDataParser(Parser):
    """
    Convenience parser to extract both RDFa (including embedded Turtle)
    and microdata from an HTML file.
    It is simply a wrapper around the specific parsers.
    """
    def parse(self, source, graph,
              pgraph=None,
              rdfa_version="",
              vocab_expansion=False,
              vocab_cache=False,
              media_type='text/html'
              ):
        """
        Run the RDFa, microdata and HTurtle processors, in that order, on
        the same source, all adding to C{graph}.

        @param source: one of the input sources that the RDFLib package
        defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples; processor
        graph, in RDFa spec. parlance. If set to None, these triples are
        ignored
        @type pgraph: RDFLib Graph
        @keyword rdfa_version: 1.0 or 1.1. The empty string (the default)
        is replaced by "1.1" before the RDFa processor is called
        @type rdfa_version: string
        @keyword vocab_expansion: whether the RDFa @vocab attribute should
        also mean vocabulary expansion (see the RDFa 1.1 spec for further
        details)
        @type vocab_expansion: Boolean
        @keyword vocab_cache: in case vocab expansion is used, whether the
        expansion data (i.e., vocabulary) should be cached locally. This
        requires the ability for the local application to write on the
        local file system
        @type vocab_cache: Boolean
        @keyword media_type: ignored; accepted only to avoid an 'unexpected
        argument' error. This parser works for text/html only anyway.
        @type media_type: string
        @raise ImportError: if html5lib could not be imported when this
        module was loaded
        """
        # The specific parsers are driven through their _process methods,
        # which skips the html5lib check done in their parse methods; do
        # the same check here, before the source is touched.
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot use RDFa '
                'and Microdata parsers.')

        # Note that the media_type argument is ignored, and is here only to
        # avoid an 'unexpected argument' error.
        # This parser works for text/html only anyway...
        (baseURI, orig_source) = _get_orig_source(source)
        if rdfa_version == "":
            rdfa_version = "1.1"

        # NOTE(review): the same orig_source object is handed to all three
        # processors. When it is a byte stream rather than a URL or file
        # name, confirm that the later processors still find data to read.
        RDFaParser()._process(graph, pgraph, baseURI, orig_source,
                              media_type='text/html',
                              rdfa_version=rdfa_version,
                              vocab_expansion=vocab_expansion,
                              vocab_cache=vocab_cache)
        MicrodataParser()._process(graph, baseURI, orig_source,
                                   vocab_expansion=vocab_expansion,
                                   vocab_cache=vocab_cache)
        from .hturtle import HTurtleParser
        HTurtleParser()._process(graph, baseURI, orig_source,
                                 media_type='text/html')
