comparison env/lib/python3.9/site-packages/bs4/builder/__init__.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # Use of this source code is governed by the MIT license.
2 __license__ = "MIT"
3
4 from collections import defaultdict
5 import itertools
6 import sys
7 from bs4.element import (
8 CharsetMetaAttributeValue,
9 ContentMetaAttributeValue,
10 Stylesheet,
11 Script,
12 TemplateString,
13 nonwhitespace_re
14 )
15
# Public names of this module. register_treebuilders_from() appends to
# this list at import time, so it also ends up containing the names of
# the TreeBuilder subclasses copied in from the parser-specific modules.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
# These are plain strings; a TreeBuilder subclass advertises features
# through its `features` list, which TreeBuilderRegistry indexes.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
30
31
class TreeBuilderRegistry(object):
    """Index of TreeBuilder subclasses, searchable by advertised feature.

    Builders registered later take priority over builders registered
    earlier, both overall and within each feature.
    """

    def __init__(self):
        # feature name -> builders advertising it, newest first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Add a treebuilder to the registry under each of its features.

        :param treebuilder_class: A subclass of TreeBuilder. Its .features
           attribute should list its features.
        """
        for advertised in treebuilder_class.features:
            # Front insertion keeps the most recent registration first.
            self.builders_for_feature[advertised].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find a TreeBuilder subclass that offers the requested features.

        :param features: Feature names to look for. If none are
           provided, the most recently registered TreeBuilder subclass
           will be used.
        :return: A TreeBuilder subclass, or None if there's no
           registered subclass with all the requested features.
        """
        if not self.builders:
            # Nothing has been registered yet.
            return None

        if not features:
            # No constraints: hand back the newest builder.
            return self.builders[0]

        # `ranked` is the priority-ordered builder list of the first
        # feature that any builder advertises; `surviving` is narrowed
        # by every later feature that is advertised by some builder.
        # Features nobody advertises are skipped entirely.
        ranked = None
        surviving = None
        for feature in features:
            providers = self.builders_for_feature.get(feature, [])
            if not providers:
                continue
            if ranked is None:
                ranked = providers
                surviving = set(providers)
            else:
                surviving = surviving & set(providers)

        if surviving is None:
            # None of the requested features is known to the registry.
            return None

        # Pick the highest-priority builder that survived every filter.
        return next(
            (builder for builder in ranked if builder in surviving), None)
96
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
# This is the single module-level instance that
# register_treebuilders_from() registers builders with.
builder_registry = TreeBuilderRegistry()
100
class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree.

    This base class holds the configuration shared by all builders and
    provides do-nothing defaults; `feed` must be implemented by
    subclasses.
    """

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "the caller did not pass this argument". It is a
    # unique object so that None/False/{} remain usable as real values.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None

        # Every sentinel check uses identity. An equality check would
        # call the argument's own __eq__, and an object that compares
        # equal to anything would be silently replaced by the default.
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes
        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers
        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None or contains
         `tag_name`; False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :return: None.
        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        # The encoding arguments are ignored here; the markup is passed
        # through untouched as the only strategy.
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            # '*' holds attributes that are multi-valued on every tag.
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            # Iterate over a snapshot of the keys because values are
            # reassigned inside the loop.
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        values = nonwhitespace_re.findall(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
321
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder driven by SAX-style callbacks.

    Nothing currently uses this class; it exists to show what a
    minimal TreeBuilder looks like.
    """

    def feed(self, markup):
        """Not implemented for this builder."""
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Forward the start of an element to the soup object.

        :param name: The element name.
        :param attrs: A mapping of attributes. Each key is indexed
            with [1], so keys are presumably (namespace, name) pairs
            -- TODO confirm against the SAX driver in use.
        """
        by_local_name = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, by_local_name)

    def endElement(self, name):
        """Forward the end of an element to the soup object."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle a namespaced start tag; `nsTuple` is discarded for now."""
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle a namespaced end tag; `nsTuple` is discarded for now."""
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Ignore the beginning of a prefix mapping."""
        pass

    def endPrefixMapping(self, prefix):
        """Ignore the end of a prefix mapping."""
        pass

    def characters(self, content):
        """Forward character data to the soup object."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing at the start of a document."""
        pass

    def endDocument(self):
        """Do nothing at the end of a document."""
        pass
370
371
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = set([
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
    ])

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])

    # The HTML standard defines an unusual content model for these tags.
    # We represent this by using a string class other than NavigableString
    # inside these tags.
    #
    # I made this list by going through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    #
    # Each tag name appears exactly once: a repeated key in a dict
    # literal silently overwrites the earlier entry.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed. True for
            both the HTML 5 (`charset`) and the HTML 4
            (`http-equiv`/`content`) forms.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            return True

        if (content is not None and http_equiv is not None
            and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # This branch does replace an attribute value, so it must
            # report a substitution too.
            return True

        return False
476
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name listed in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of this module, appended to this
    module's `__all__`, and registered with `builder_registry`. Names
    that refer to anything else are skipped.

    :param module: A module object with an `__all__` list.
    """
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not
        # a class, so check that first; non-class exports are ignored.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
488
class ParserRejectedMarkup(Exception):
    """Raised when the underlying parser refuses outright to parse
    the markup it was given.
    """
    def __init__(self, message_or_exception):
        """Record why the markup was rejected.

        :param message_or_exception: Either a textual explanation, or
            another exception, which is rendered as
            "<ExceptionClassName>: <its message>".
        """
        explanation = message_or_exception
        if isinstance(explanation, Exception):
            cause = explanation
            explanation = ": ".join((cause.__class__.__name__, str(cause)))
        super(ParserRejectedMarkup, self).__init__(explanation)
501
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# The registry inserts each new builder at the front of its lists, so
# whichever builder is registered last below is the first one found.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    # Optional: skipped when importing the module raises ImportError.
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    # Optional: skipped when importing the module raises ImportError.
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass