comparison env/lib/python3.9/site-packages/bs4/tests/test_htmlparser.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """Tests to ensure that the html.parser tree builder generates good
2 trees."""
3
4 from pdb import set_trace
5 import pickle
6 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
7 from bs4.builder import HTMLParserTreeBuilder
8 from bs4.builder._htmlparser import BeautifulSoupHTMLParser
9
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Smoke tests for the html.parser-based tree builder.

    Inherits from HTMLTreeBuilderSmokeTest with ``default_builder`` set to
    HTMLParserTreeBuilder, overrides the cases html.parser cannot handle,
    and adds tests for behavior specific to this builder.
    """

    default_builder = HTMLParserTreeBuilder

    def test_namespaced_system_doctype(self):
        """Deliberately a no-op override of the inherited test."""
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_namespaced_public_doctype(self):
        """Deliberately a no-op override of the inherited test."""
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_builder_is_pickled(self):
        """Unlike most tree builders, HTMLParserTreeBuilder is picklable
        and will be restored after pickling.
        """
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        # The restored soup must carry a builder of the same type as the
        # one that produced the original tree.
        self.assertIsInstance(loaded.builder, type(tree.builder))

    def test_redundant_empty_element_closing_tags(self):
        """Check how closing tags for the empty element <br> are handled."""
        # A closing tag right after its opening tag is redundant.
        self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
        # Closing tags with no matching opening tag produce no output.
        self.assertSoupEquals('</br></br></br>', "")

    def test_empty_element(self):
        """Check that an incomplete character reference survives parsing."""
        # This verifies that any buffered data present when the parser
        # finishes working is handled.
        self.assertSoupEquals("foo &# bar", "foo &amp;# bar")

    def test_tracking_line_numbers(self):
        """Check that source line/column tracking works and can be disabled."""
        # The html.parser TreeBuilder keeps track of line number and
        # position of each element.
        #
        # The <p> tag sits on the second line, preceded by exactly three
        # spaces, so its line number is 2 and its column offset is 3. The
        # amount of leading whitespace is significant here.
        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
        soup = self.soup(markup)
        self.assertEqual(2, soup.p.sourceline)
        self.assertEqual(3, soup.p.sourcepos)
        # While tracking is active, a child tag named 'sourceline' has to
        # be looked up with find(), because the attribute holds a number.
        self.assertEqual("sourceline", soup.p.find('sourceline').name)

        # You can deactivate this behavior. Attribute access then resolves
        # to the child tags of the same names.
        soup = self.soup(markup, store_line_numbers=False)
        self.assertEqual("sourceline", soup.p.sourceline.name)
        self.assertEqual("sourcepos", soup.p.sourcepos.name)

    def test_on_duplicate_attribute(self):
        """Check every supported ``on_duplicate_attribute`` strategy."""
        # The html.parser tree builder has a variety of ways of
        # handling a tag that contains the same attribute multiple times.

        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'

        # If you don't provide any particular value for
        # on_duplicate_attribute, later values replace earlier values.
        soup = self.soup(markup)
        self.assertEqual("url3", soup.a['href'])
        self.assertEqual(["cls"], soup.a['class'])
        self.assertEqual("id", soup.a['id'])

        def assert_attribute(on_duplicate_attribute, expected):
            """Parse `markup` with the given strategy and check the
            resulting 'href' value against `expected`.
            """
            soup = self.soup(
                markup, on_duplicate_attribute=on_duplicate_attribute
            )
            self.assertEqual(expected, soup.a['href'])

            # Verify that non-duplicate attributes are treated normally.
            self.assertEqual(["cls"], soup.a['class'])
            self.assertEqual("id", soup.a['id'])

        # You can also get this behavior explicitly.
        assert_attribute(None, "url3")
        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

        # You can ignore subsequent values in favor of the first.
        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")

        # And you can pass in a callable that does whatever you want.
        def accumulate(attrs, key, value):
            """Collect every value seen for `key` into a list."""
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)
        assert_attribute(accumulate, ["url1", "url2", "url3"])
89
90
class TestHTMLParserSubclass(SoupTest):
    """Tests aimed directly at the BeautifulSoupHTMLParser subclass."""

    def test_error(self):
        """Calling error() on a freshly built parser must not raise.

        The test passes as long as the call returns normally; there is
        nothing further to assert.
        """
        BeautifulSoupHTMLParser().error("don't crash")