comparison env/lib/python3.9/site-packages/bs4/tests/test_soup.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # -*- coding: utf-8 -*-
2 """Tests of Beautiful Soup as a whole."""
3
4 from pdb import set_trace
5 import logging
6 import unittest
7 import sys
8 import tempfile
9
10 from bs4 import (
11 BeautifulSoup,
12 BeautifulStoneSoup,
13 GuessedAtParserWarning,
14 MarkupResemblesLocatorWarning,
15 )
16 from bs4.builder import (
17 TreeBuilder,
18 ParserRejectedMarkup,
19 )
20 from bs4.element import (
21 CharsetMetaAttributeValue,
22 Comment,
23 ContentMetaAttributeValue,
24 SoupStrainer,
25 NamespacedAttribute,
26 Tag,
27 NavigableString,
28 )
29
30 import bs4.dammit
31 from bs4.dammit import (
32 EntitySubstitution,
33 UnicodeDammit,
34 EncodingDetector,
35 )
36 from bs4.testing import (
37 default_builder,
38 SoupTest,
39 skipIf,
40 )
41 import warnings
42
# The lxml-based tree builders are an optional dependency: they are only
# importable when lxml itself is installed.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    # Note: the original bound the exception (`as e`) but never used it;
    # the unused binding has been dropped.
    LXML_PRESENT = False

# True on Python 3.0/3.1 only, whose bundled HTMLParser mishandles some
# markup; used to skip tests that would fail for that reason alone.
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
50
class TestConstructor(SoupTest):
    """Tests of the BeautifulSoup constructor and its keyword arguments."""

    def test_short_unicode_input(self):
        # A very short Unicode document survives parsing intact.
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        # Null bytes embedded in text are preserved, not stripped.
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        # Ruling out UTF-8 forces the encoding detector to fall back
        # to its next-best guess, Windows-1252.
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            def __init__(self, **kwargs):
                # Record the constructor arguments so the test can
                # inspect what BeautifulSoup passed through.
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}
            def initialize_soup(self, soup):
                pass
            def feed(self, markup):
                self.fed = markup
            def reset(self):
                pass
            def ignore(self, ignore):
                pass
            set_up_substitutions = can_be_empty_element = ignore
            def prepare_markup(self, *args, **kwargs):
                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        self.assertEqual(dict(var="value"), soup.builder.called_with)
        self.assertEqual("prepared markup", soup.builder.fed)

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        self.assertEqual(builder, soup.builder)
        self.assertEqual(kwargs, builder.called_with)

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            # BUG FIX: `markup` was previously a free variable that was
            # undefined in every enclosing scope, so driving this
            # generator raised NameError instead of exercising the
            # retry logic. It is now taken from the first positional
            # argument, which is how BeautifulSoup calls this method.
            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this markup,
                # but feed() will reject both of them.
                yield markup, None, None, False
                yield markup, None, None, False

        self.assertRaisesRegex(
            ParserRejectedMarkup,
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
            BeautifulSoup, '', builder=Mock,
        )

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class', have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        self.assertEqual(" an id ", a['id'])
        self.assertEqual(["a", "class"], a['class'])

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        self.assertEqual(" a class ", soup.a['class'])

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True) as w:
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            self.assertEqual(["an", "id"], a['id'])
            self.assertEqual(" a class ", a['class'])

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes={
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            }
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers={
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing is complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        self.assertEqual([], soup.string_container_stack)
225
226
class TestWarnings(SoupTest):
    """Tests of the warnings Beautiful Soup issues about questionable input."""

    def _assert_warning(self, warning_list, cls):
        """Return the first warning in `warning_list` whose message is an
        instance of `cls`; raise an Exception if there is none.

        (The parameter was renamed from `warnings`, which shadowed the
        `warnings` module.)
        """
        for w in warning_list:
            if isinstance(w.message, cls):
                return w
        # BUG FIX: the format arguments must be wrapped in a tuple.
        # The original `"..." % cls, warnings` applied `%` to `cls`
        # alone, raising TypeError ("not enough arguments for format
        # string") instead of producing this diagnostic message.
        raise Exception("%s warning not found in %r" % (cls, warning_list))

    def _assert_no_parser_specified(self, w):
        # The GuessedAtParserWarning's message starts with the standard
        # NO_PARSER_SPECIFIED_WARNING text (only a prefix is checked,
        # since the full message is parameterized).
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        self.assertTrue(
            message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
        )

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        # "html" names a markup family, not a specific parser, so the
        # warning is still issued.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # Using the BS3-era name triggers a warning that mentions both
        # the old and the new spelling, but still takes effect.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        # Same deal for the BS3-era `fromEncoding` argument.
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        # An unknown keyword argument is an error, not a warning.
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        # Markup that happens to be the name of an existing file draws
        # a MarkupResemblesLocatorWarning.
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            self.assertTrue("looks like a filename" in str(warning.message))
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual([], w)

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup("http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))
325
326
class TestSelectiveParsing(SoupTest):
    """Verify that a SoupStrainer can restrict parsing to selected tags."""

    def test_parse_with_soupstrainer(self):
        # Only <b> tags — and everything inside them — should survive.
        document = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        only_b_tags = SoupStrainer("b")
        parsed = self.soup(document, parse_only=only_b_tags)
        self.assertEqual(b"<b>Yes</b><b>Yes <c>Yes</c></b>", parsed.encode())
334
335
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        # The class is used directly; its substitution methods are
        # class-level, so no instance is needed.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Characters with named HTML entities are replaced by those
        # entities; everything else is left alone.
        original = "foo\u2200\N{SNOWMAN}\u00f5bar"
        expected = "foo&forall;\N{SNOWMAN}&otilde;bar"
        self.assertEqual(expected, self.sub.substitute_html(original))

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so they
        # get a test of their own.
        smart_quoted = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(smart_quoted)
        self.assertEqual(
            "&lsquo;&rsquo;foo&ldquo;&rdquo;",
            self.sub.substitute_html(dammit.markup))

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        text = 'Welcome to "my bar"'
        self.assertEqual(text, self.sub.substitute_xml(text, False))

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual('"Welcome"',
                         self.sub.substitute_xml("Welcome", True))
        self.assertEqual('"Bob\'s Bar"',
                         self.sub.substitute_xml("Bob's Bar", True))

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        text = 'Welcome to "my bar"'
        self.assertEqual("'Welcome to \"my bar\"'",
                         self.sub.substitute_xml(text, True))

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # With both quote styles present, double quotes are used and the
        # embedded double quotes become &quot; entities.
        text = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            '"Welcome to &quot;Bob\'s Bar&quot;"',
            self.sub.substitute_xml(text, True))

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        text = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(text, self.sub.substitute_xml(text))

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual("foo&lt;bar&gt;",
                         self.sub.substitute_xml("foo<bar>"))

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual("AT&amp;T", self.sub.substitute_xml("AT&T"))

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, even one that is
        # already part of an entity.
        self.assertEqual(
            "&amp;Aacute;T&amp;T",
            self.sub.substitute_xml("&Aacute;T&T"))

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml_containing_entities leaves existing entities alone.
        self.assertEqual(
            "&Aacute;T&amp;T",
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"))

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(text, self.sub.substitute_html(text))
403
404
class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        # The same document kept in both str and UTF-8 bytes form,
        # so tests can convert in either direction.
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        #
        # Save the real chardet hook so it can be restored in `finally`.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, str))
            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            # Restore the module-level state patched above, even if an
            # assertion failed.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        # A non-ASCII attribute name round-trips through parse + encode.
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
464
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        # str input passes through untouched.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, Windows-1252 smart quotes become the equivalent
        # Unicode quotation-mark characters.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        # "ascii" degrades smart quotes to plain ASCII quote characters.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        # UTF-8 input is detected and decoded.
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        # A caller-supplied candidate encoding is tried and accepted.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        # Bytes that merely resemble smart quotes inside valid UTF-8
        # multibyte sequences must not be "fixed".
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested codec that can't cleanly decode the data is
        # skipped in favor of one that can.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Nonsense codec names are silently ignored.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        # A non-ASCII byte inside a declared encoding name is mapped to
        # U+FFFD rather than crashing the detector.
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        # All four quoting styles of <meta charset=...> are recognized.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        # Save the real chardet hook for restoration in `finally`.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
684
class TestNamedspacedAttribute(SoupTest):
    """Tests of NamespacedAttribute, a string subclass that keeps track of
    a namespace prefix. (The class-name spelling is historical.)"""

    def test_name_may_be_none_or_missing(self):
        # With no name, the attribute collapses to just its prefix.
        for attr in (NamespacedAttribute("xmlns", None),
                     NamespacedAttribute("xmlns")):
            self.assertEqual("xmlns", attr)

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attr = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attr)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        reference = NamespacedAttribute("a", "b", "c")
        identical = NamespacedAttribute("a", "b", "c")
        self.assertEqual(reference, identical)

        # The actual namespace plays no part in equality...
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(reference, no_namespace)

        # ...but both the name and the prefix do.
        different_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(reference, different_name)

        different_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(reference, different_prefix)
713
714
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of the attribute-value classes that rewrite their charset
    information when a document is re-encoded."""

    # BUG FIX: this method previously had the same name as the one below
    # (`test_content_meta_attribute_value`), so Python's class machinery
    # silently discarded it and the CharsetMetaAttributeValue case was
    # never run. It now has its own name.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        # The value compares equal to the original charset string...
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # ...but encoding it substitutes the target charset name.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        # The value compares equal to the original content string...
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        # ...but encoding it rewrites the charset= portion.
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))