comparison env/lib/python3.9/site-packages/bs4/builder/_html5lib.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # Use of this source code is governed by the MIT license.
2 __license__ = "MIT"
3
4 __all__ = [
5 'HTML5TreeBuilder',
6 ]
7
8 import warnings
9 import re
10 from bs4.builder import (
11 PERMISSIVE,
12 HTML,
13 HTML_5,
14 HTMLTreeBuilder,
15 )
16 from bs4.element import (
17 NamespacedAttribute,
18 nonwhitespace_re,
19 )
20 import html5lib
21 from html5lib.constants import (
22 namespaces,
23 prefixes,
24 )
25 from bs4.element import (
26 Comment,
27 Doctype,
28 NavigableString,
29 Tag,
30 )
31
try:
    # Pre-0.99999999: the tree-builder base module has a private name.
    from html5lib.treebuilders import _base as treebuilder_base
    new_html5lib = False
except ImportError:
    # 0.99999999 and up: the same module is importable as `base`.
    from html5lib.treebuilders import base as treebuilder_base
    new_html5lib = True
40
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Build a Beautiful Soup tree by handing the parsing to html5lib.

    Some features that HTML TreeBuilders commonly offer are missing
    here. A few of them could theoretically be added, but it would be
    quite difficult at the very least, because html5lib moves the
    parse tree around while it is still being built.

    * The subclass of NavigableString used for a string never depends
      on the name of the tag in which the string was found.

    * A SoupStrainer can't be used to parse only part of a document.
    """

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    # html5lib reports the line number and position in the original
    # file that an element came from.
    TRACKS_LINE_NUMBERS = True

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Record the requested encoding and yield the markup untouched.

        :param markup: The markup to be parsed.
        :param user_specified_encoding: Stored on the builder; feed()
            passes it to html5lib when the markup is not a `str`.
        :param document_declared_encoding: Ignored.
        :param exclude_encodings: Unsupported; a truthy value only
            triggers a warning.
        :yield: One `(markup, None, None, False)` tuple.
        """
        # feed() reads this attribute back later on.
        self.user_specified_encoding = user_specified_encoding

        # Neither document_declared_encoding nor exclude_encodings has
        # any effect right now, since this TreeBuilder doesn't go
        # through UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib and record the encoding it used.

        :param markup: A `str`, or some other object (e.g. bytes) that
            html5lib is left to decode.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")

        # Constructing the parser invokes create_treebuilder, which is
        # what sets self.underlying_builder.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = parser

        markup_is_text = isinstance(markup, str)
        parse_options = {}
        if not markup_is_text:
            # The keyword carrying the encoding was renamed between
            # html5lib versions.
            option_name = 'override_encoding' if new_html5lib else 'encoding'
            parse_options[option_name] = self.user_specified_encoding
        document = parser.parse(markup, **parse_options)

        # Set the character encoding detected by the tokenizer.
        if markup_is_text:
            # Special case: html5lib sets charEncoding to UTF-8 when it
            # is given Unicode input.
            document.original_encoding = None
        else:
            detected = parser.tokenizer.stream.charEncoding[0]
            if not isinstance(detected, str):
                # In 0.99999999 and up the encoding is an html5lib
                # Encoding object. Use its name so the value is a
                # string, as with other tree builders.
                detected = detected.name
            document.original_encoding = detected
        self.underlying_builder.parser = None

    def create_treebuilder(self, namespaceHTMLElements):
        """Create the html5lib-facing tree builder and remember it.

        :param namespaceHTMLElements: Forwarded to TreeBuilderForHtml5lib.
        :return: The new TreeBuilderForHtml5lib, which is also stored
            as `self.underlying_builder`.
        """
        builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        self.underlying_builder = builder
        return builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        document_template = '<html><head></head><body>%s</body></html>'
        return document_template % fragment
114
115
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """html5lib TreeBuilder whose nodes wrap Beautiful Soup objects."""

    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Set up the builder around a BeautifulSoup object.

        :param namespaceHTMLElements: Passed to the html5lib base
            TreeBuilder constructor.
        :param soup: The BeautifulSoup object to build into. When it is
            falsy, an empty one is created with the 'html.parser'
            builder.
        :param store_line_numbers: Whether elementClass() records the
            tokenizer position on new tags.
        :param kwargs: Forwarded to the BeautifulSoup constructor, and
            only when no `soup` was supplied.
        """
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Create a Doctype from an html5lib doctype token and add it to the soup.

        :param token: Mapping with "name", "publicId" and "systemId" keys.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag and return it wrapped in an Element.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag.
        """
        kwargs = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            kwargs['sourceline'] = sourceline
            kwargs['sourcepos'] = sourcepos-1
        tag = self.soup.new_tag(name, namespace, **kwargs)

        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap `data` in a Comment and return it as a TextNode."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace self.soup with a fresh, empty soup named as a fragment.

        :return: An Element wrapping the new soup.
        """
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the object wrapped by `node` directly to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the BeautifulSoup object being built."""
        return self.soup

    def getFragment(self):
        """Return the object wrapped by the base class's fragment node."""
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        """Render `element` and its descendants as an indented outline.

        Every node contributes one line starting with '|'. A tag also
        contributes one line per attribute, sorted by attribute name.
        Children are indented two spaces deeper than their parent.

        :param element: The Beautiful Soup object to serialize.
        :return: All lines joined with newlines.
        """
        rv = []
        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            # The string subclasses are tested before NavigableString
            # itself. Anything else -- the soup object included -- is
            # rendered as a tag.
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for attr_name, value in list(element.attrs.items()):
                        if isinstance(attr_name, NamespacedAttribute):
                            attr_name = "%s %s" % (
                                prefixes[attr_name.namespace], attr_name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((attr_name, value))

                    for attr_name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), attr_name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)
235
class AttrList(object):
    """Mapping-like view of an element's attributes.

    Reads are served from a copy of `element.attrs` taken when the
    AttrList is created. Writes go to the element itself and do not
    update that copy.
    """

    def __init__(self, element):
        """
        :param element: Object exposing `.attrs`, `.name`,
            `.cdata_list_attributes` and item assignment.
        """
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over `(name, value)` pairs of the copied attributes."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set attribute `name` on the underlying element."""
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes
        if (name in list_attr['*']
                or name in list_attr.get(self.element.name, ())):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = nonwhitespace_re.findall(value)
        self.element[name] = value

    def items(self):
        """Return the copied attributes as a list of `(name, value)` pairs."""
        return list(self.attrs.items())

    def keys(self):
        """Return the copied attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self):
        """Return the number of copied attributes."""
        return len(self.attrs)

    def __getitem__(self, name):
        """Return the copied value for `name`; raises KeyError if absent."""
        return self.attrs[name]

    def __contains__(self, name):
        """Report whether `name` is among the copied attributes."""
        # Test the dict directly instead of building a list of its keys.
        return name in self.attrs
264
265
class Element(treebuilder_base.Node):
    """html5lib Node that wraps a Beautiful Soup object."""

    def __init__(self, element, soup, namespace):
        """
        :param element: The wrapped Beautiful Soup object. Its `.name`
            is given to the html5lib Node constructor.
        :param soup: The BeautifulSoup object used to create new
            strings and tags.
        :param namespace: Namespace of the element, or None.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of the wrapped element.

        A string appended directly after another NavigableString is
        merged with it instead of becoming a separate child.

        :param node: An Element/TextNode wrapper, or a bare `str` or
            `Tag`.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            # Detach the child from its current parent first. Go
            # through `child` rather than `node.element`: a bare Tag
            # passed in as `node` is not a wrapper and has no wrapped
            # `.element` to extract.
            child.extract()

        if (string_child is not None and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return the element's attributes as an AttrList ({} for a Comment)."""
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple keys are replaced, in `attributes` itself, by
        NamespacedAttribute objects built from the tuple. Nothing
        happens when `attributes` is None or empty.

        :param attributes: A mutable mapping of attribute names to values.
        """
        if attributes is not None and len(attributes) > 0:
            # Iterate over a snapshot, since the mapping is modified
            # inside the loop.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` if given, else at the end.

        :param data: The text to insert.
        :param insertBefore: Optional wrapper node the text should precede.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before `refNode` among the children.

        A string inserted directly after another NavigableString is
        merged with it instead of becoming a separate child.

        :param node: Wrapper of the object to insert.
        :param refNode: Wrapper of an existing child of this element.
        """
        index = self.element.index(refNode.element)
        # Only look at the preceding sibling when there is one. With
        # index == 0, contents[index-1] would wrap around to the *last*
        # child and the text could be merged into the wrong string.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Extract the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag.

        The children are appended after any children `new_parent`
        already has, and the element/sibling links at the seams are
        patched by hand.

        :param new_parent: Wrapper of the tag that receives the children.
        """
        # print("MOVE", self.element.contents)
        # print("FROM", self.element)
        # print("TO", new_parent.element)

        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element is not None:
                # TODO: This code has no test coverage and I'm not sure
                # how to get html5lib to go through this path, but it's
                # just the other side of the previous line.
                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

        # print("DONE WITH MOVE")
        # print("FROM", self.element)
        # print("TO", new_parent_element)

    def cloneNode(self):
        """Return a new Element with the same name, namespace and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's `contents` (truthy when non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return `(namespace, name)`, using the HTML namespace when none is set."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
459
class TextNode(Element):
    """Element wrapper around a string-like Beautiful Soup object."""

    def __init__(self, element, soup):
        """
        :param element: The wrapped object.
        :param soup: The BeautifulSoup object the node belongs to.
        """
        # Element.__init__ is not called: the html5lib Node gets None
        # as its name, and no `namespace` attribute is set.
        treebuilder_base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError