comparison env/lib/python3.9/site-packages/bs4/dammit.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # -*- coding: utf-8 -*-
2 """Beautiful Soup bonus library: Unicode, Dammit
3
4 This library converts a bytestream to Unicode through any means
5 necessary. It is heavily based on code from Mark Pilgrim's Universal
6 Feed Parser. It works best on XML and HTML, but it does not rewrite the
7 XML or HTML to reflect a new encoding; that's the tree builder's job.
8 """
9 # Use of this source code is governed by the MIT license.
10 __license__ = "MIT"
11
12 import codecs
13 from html.entities import codepoint2name
14 import re
15 import logging
16 import string
17
# Pick an encoding-autodetection backend, preferring the fastest one
# that is installed.
chardet_type = None
try:
    # Preferred: the fast C implementation.
    # PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Fallback: the pure Python implementation.
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
    except ImportError:
        # Neither library is installed, so nothing can be detected.
        def chardet_dammit(s):
            """Return None: no detection library is available."""
            return None
    else:
        def chardet_dammit(s):
            """Return chardet's guess at the encoding of bytestring `s`.

            Unicode strings have no encoding to detect, so they yield None.
            """
            if not isinstance(s, str):
                return chardet.detect(s)['encoding']
            return None
else:
    def chardet_dammit(s):
        """Return cchardet's guess at the encoding of bytestring `s`.

        Unicode strings have no encoding to detect, so they yield None.
        """
        if not isinstance(s, str):
            return cchardet.detect(s)['encoding']
        return None
44
45 # Available from http://cjkpython.i18n.org/.
46 #
47 # TODO: This doesn't work anymore and the closest thing, iconv_codecs,
48 # is GPL-licensed. Check whether this is still necessary.
49 try:
50 import iconv_codec
51 except ImportError:
52 pass
53
# Patterns that locate an encoding declared inside an XML or HTML
# document. Each pattern is compiled twice, once for bytestrings and
# once for Unicode strings, and stored under the matching type.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
67
class EntitySubstitution(object):
    """Replaces special characters with XML or HTML entity references."""

    def _populate_class_variables():
        """Build the HTML entity tables and the regex that matches them.

        Called once while the class body executes.

        :return: A 3-tuple (character -> entity name, entity name ->
            character, compiled regex matching every substitutable
            character).
        """
        # &apos; is an XHTML and HTML 5 entity but not an HTML 4 one.
        # It is never emitted, but it is recognized on the way in.
        #
        # TODO: Ideally all HTML 5 named entities would be recognized,
        # but that's a little tricky.
        known_entities = list(codepoint2name.items())
        known_entities.append((39, 'apos'))

        char_to_name = {}
        name_to_char = {}
        substitutable = []
        for number, entity_name in known_entities:
            char = chr(number)
            # Every entity is recognized on input and converted to
            # its Unicode character.
            name_to_char[entity_name] = char
            if number in (34, 39):
                # Quotation marks only need escaping inside attribute
                # values, which is handled elsewhere.
                continue
            substitutable.append(char)
            char_to_name[char] = entity_name
        pattern = "[%s]" % "".join(substitutable)
        return char_to_name, name_to_char, re.compile(pattern)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches angle brackets, plus ampersands that do not begin a
    # numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        "([<>]|&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;))")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: return the HTML entity reference for the
        matched character."""
        return "&%s;" % cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Regex callback: return the XML entity reference for the
        matched character."""
        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]

    @classmethod
    def quoted_attribute_value(cls, value):
        """Wrap a value in quotes so it can serve as an XML attribute.

        Double quotes are used unless the value itself contains them:

         Bob's Bar -> "Bob's Bar"

        A value containing only double quotes is wrapped in single
        quotes:

         Welcome to "my bar" -> 'Welcome to "my bar"'

        A value containing both kinds has its double quotes turned
        into &quot; and is wrapped in double quotes:

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The quoted (and possibly escaped) string.
        """
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            # Only double quotes are present, so single quotes can
            # delimit the value safely.
            return "'" + value + "'"
        # Both kinds are present. The double quotes are the ones
        # escaped because &quot; is valid in both HTML and XML,
        # whereas &apos; is not an HTML 4 entity.
        return '"' + value.replace('"', "&quot;") + '"'

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Escape every XML special character in a string.

        :param value: A string to be substituted. Each less-than sign
          becomes &lt;, each greater-than sign becomes &gt;, and every
          ampersand becomes &amp;. To leave alone ampersands that look
          like part of an entity, use
          substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, the result is also
          quoted, as befits an attribute value.
        """
        escaped = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Escape XML special characters, preserving existing entities.

        :param value: A string to be substituted. Each less-than sign
          becomes &lt;, each greater-than sign becomes &gt;, and any
          ampersand that is not part of an entity definition becomes
          &amp;.

        :param make_quoted_attribute: If True, the result is also
          quoted, as befits an attribute value.
        """
        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        Unlike data.encode(encoding, 'xmlcharrefreplace'), the aim is
        readability on ASCII displays rather than error recovery. A
        UTF-8 string containing a LATIN SMALL LETTER E WITH ACUTE is
        perfectly valid, but "&eacute;" is easier for some people to
        read.

        :param s: A Unicode string.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
223
224
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be tried, even
            if they otherwise would be.
        """
        self.override_encodings = override_encodings or []
        # Stored lowercased so that _usable() can compare names
        # case-insensitively.
        self.exclude_encodings = {
            name.lower() for name in (exclude_encodings or [])}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding, or None.
        :param tried: Set of lowercased encodings that have already been
            tried. This will be modified as a side effect.
        :return: True if the encoding is named, not excluded, and not
            yet tried.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        The declared and chardet-detected encodings are computed lazily
        and cached on the instance the first time they are needed.

        :yield: A sequence of strings.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding). The
            encoding is None when no byte-order mark is found.
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The two bytes after a UTF-16 mark must be compared against a
        # *bytes* literal. Comparing a bytes slice with a str is always
        # unequal in Python 3, which would make the UTF-32LE mark
        # (FF FE 00 00) look like UTF-16LE.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the patterns compiled for the type of markup in hand.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        # An XML declaration takes precedence, even in an HTML document.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
388
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Constructor.

        :param markup: A bytestring representing markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first,
         before any sniffing code is run.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
         to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
         Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
         will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
         it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
         if the sniffing code thinks they might make sense.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        # The detector treats a None list the same as an empty one.
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == '':
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test against None rather than truthiness: an empty (or
        # BOM-only) bytestring decodes successfully to '', and that
        # result must not be mistaken for a failure.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A regex match whose group 1 is a single byte.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            return self.MS_CHARS_TO_ASCII.get(orig).encode()
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            # Tuples are (HTML entity name, hex code point).
            if self.smart_quotes_to == 'xml':
                return b'&#x' + sub[1].encode() + b';'
            return b'&' + sub[0].encode() + b';'
        # Plain strings are used verbatim.
        return sub.encode()

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        On success, self.markup is replaced by the decoded string and
        self.original_encoding records the codec that worked.

        :param proposed: The name of a character encoding.
        :param errors: The error-handling scheme passed to the decoder.
        :return: The decoded string, or None if the encoding is unknown,
            was already tried with these `errors`, or failed to decode.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this encoding
            # is not the right one, and the caller moves on to the next.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a bytestring and its encoding, decodes it into Unicode.

        :param data: The bytestring to decode.
        :param encoding: The name of an encoding.
        :param errors: The error-handling scheme passed to the decoder.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        Tries, in order: the alias table, the name with hyphens removed,
        the name with hyphens turned into underscores, and finally the
        lowercased name itself.

        :param charset: The name of a character set.
        :return: The lowercased name of a codec, or None if `charset`
            is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hex code point); plain strings
    # are substituted verbatim.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. An empty
                # code point here would produce the invalid reference
                # "&#x;" in XML mode.
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # NOTE(review): this is the only tuple in the table. It is never
        # reached by _sub_ms_char, which only matches \x80-\x9f, but it
        # looks like a leftover -- confirm before relying on it.
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
            characters have been converted to their `main_encoding`
            equivalents.
        :raise NotImplementedError: If either encoding is not one of
            the supported names.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing produced a one-character string rather than
                # an int (Python 2.x behavior).
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
939