Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/bs4/dammit.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| author | shellac |
|---|---|
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4f3585e2f14b |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """Beautiful Soup bonus library: Unicode, Dammit | |
| 3 | |
| 4 This library converts a bytestream to Unicode through any means | |
| 5 necessary. It is heavily based on code from Mark Pilgrim's Universal | |
| 6 Feed Parser. It works best on XML and HTML, but it does not rewrite the | |
| 7 XML or HTML to reflect a new encoding; that's the tree builder's job. | |
| 8 """ | |
| 9 # Use of this source code is governed by the MIT license. | |
| 10 __license__ = "MIT" | |
| 11 | |
| 12 import codecs | |
| 13 from html.entities import codepoint2name | |
| 14 import re | |
| 15 import logging | |
| 16 import string | |
| 17 | |
# Choose a character-encoding autodetection backend, best one first.
chardet_type = None
try:
    # Preferred backend: the fast C implementation.
    # PyPI package: cchardet
    import cchardet

    def chardet_dammit(s):
        """Guess the encoding of `s` with cchardet.

        :param s: The data to analyze.
        :return: None if `s` is already a Unicode string; otherwise the
            value cchardet reports under the 'encoding' key.
        """
        return None if isinstance(s, str) else cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet

        def chardet_dammit(s):
            """Guess the encoding of `s` with chardet.

            :param s: The data to analyze.
            :return: None if `s` is already a Unicode string; otherwise
                the value chardet reports under the 'encoding' key.
            """
            return None if isinstance(s, str) else chardet.detect(s)['encoding']
    except ImportError:
        # Neither detection library is installed.
        def chardet_dammit(s):
            """Stand-in used when no detection library is available.

            :param s: Ignored.
            :return: Always None.
            """
            return None
| 44 | |
| 45 # Available from http://cjkpython.i18n.org/. | |
| 46 # | |
| 47 # TODO: This doesn't work anymore and the closest thing, iconv_codecs, | |
| 48 # is GPL-licensed. Check whether this is still necessary. | |
| 49 try: | |
| 50 import iconv_codec | |
| 51 except ImportError: | |
| 52 pass | |
| 53 | |
# Regular expressions that locate an encoding declared inside an XML or
# HTML document. Each pattern is compiled twice -- once for bytestrings
# and once for Unicode strings -- and stored under the matching type.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
| 67 | |
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables used by this class.

        Called once, while the class body is being executed.

        :return: A 3-tuple: a dict mapping characters to HTML entity
            names, a dict mapping entity names back to characters, and
            a compiled regular expression matching any single character
            that should be replaced with a named entity on output.
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []

        # &apos; is an XHTML entity and an HTML 5, but not an HTML 4
        # entity. We don't want to use it, but we want to recognize it on the way in.
        #
        # TODO: Ideally we would be able to recognize all HTML 5 named
        # entities, but that's a little tricky.
        extra = [(39, 'apos')]
        for codepoint, name in list(codepoint2name.items()) + extra:
            character = chr(codepoint)
            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
                # &quot; or the single quote into &apos;, unless it
                # happens within an attribute value, which is handled
                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to recognize those entities on the way in and
            # convert them to Unicode characters.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined entities in XML.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not begin a
    # decimal, hexadecimal or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    # Matches any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character.

        :param matchobj: A match whose group 0 is the character to replace.
        :return: The entity reference, e.g. "&eacute;".
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character.

        :param matchobj: A match whose group 0 is the character to replace.
        :return: The entity reference, e.g. "&lt;".
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quotes.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: `s` with every character matched by
            CHARACTER_TO_HTML_ENTITY_RE replaced by its named entity.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
| 223 | |
| 224 | |
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be tried, even
            if they otherwise would be.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        # Exclusions are compared case-insensitively (see _usable).
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding, or None.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if `encoding` is not None, not excluded, and has
            not been tried before; False otherwise.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        Each encoding is yielded at most once, and excluded encodings
        are never yielded.

        :yield: A sequence of strings.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks must reject a BOM followed by two NUL bytes:
        # FF FE 00 00 is the UTF-32LE byte-order mark. The comparison has
        # to be against a bytes literal -- bytes never compare equal to
        # str, so a str literal here makes the UTF-32LE branch unreachable.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the patterns compiled for the type of the markup.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        # An XML declaration is looked for first, even in HTML documents.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
| 388 | |
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    # Matches a single byte in the 0x80-0x9f range, the keys of MS_CHARS.
    # Compiled once here rather than on every _convert_from() call.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Constructor.

        :param markup: A bytestring representing markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first,
           before any sniffing code is run.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
           will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        # EncodingDetector treats None the same as an empty list.
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        # (No comparison against '' here: for bytes it is always False
        # and it raises BytesWarning under `python -bb`.)
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test for None rather than falsiness: an empty document decodes
        # successfully to '', and retrying would feed the already-decoded
        # text back into the bytes-only conversion code.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match whose group 1 is a single byte found in
            MS_CHARS / MS_CHARS_TO_ASCII.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (HTML entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = b'&#x' + sub[1].encode() + b';'
                else:
                    sub = b'&' + sub[0].encode() + b';'
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        On success, self.markup is replaced by the decoded text and
        self.original_encoding is set to the codec that worked.

        :param proposed: The name of a character encoding.
        :param errors: The error-handling scheme passed to the decoder.
        :return: The decoded markup, or None if the codec is unknown,
            was already tried with these `errors`, or decoding failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Any failure means this encoding is not the right one;
            # the caller moves on to the next candidate.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.

        :param data: The bytestring to decode.
        :param encoding: The name of an encoding.
        :param errors: The error-handling scheme passed to the decoder.
        :return: The decoded string.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.

        :return: The detector's declared encoding, or None if the
            markup is not being treated as HTML.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name itself),
        the name with hyphens removed, the name with hyphens turned into
        underscores, and finally the name as given.

        :param charset: The name of a character set.
        :return: The name of a codec, lowercased, or None if `charset`
            is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Check whether Python has a codec registered under this name.

        :param charset: A candidate codec name.
        :return: `charset` itself if codecs.lookup() accepts it, None if
            the lookup fails, or `charset` unchanged if it is empty.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point); string
    # values are used verbatim.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178; an empty code here would produce "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # A plain string like every other value, so .encode() works on it.
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents. Keys are byte values
    # 0x80-0xfe; values are the UTF-8 encoding of the character that
    # byte represents in Windows-1252.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252, so they have no entry.
    #
    # The table is derived from the codec rather than typed out by hand,
    # which rules out transcription slips in individual entries.
    WINDOWS_1252_TO_UTF8 = {
        byte: bytes([byte]).decode('windows-1252').encode('utf-8')
        for byte in range(0x80, 0xff)
        if byte not in (0x81, 0x8d, 0x8f, 0x90, 0x9d)
    }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
            characters have been converted to their `main_encoding`
            equivalents.
        :raise NotImplementedError: If either encoding is not one of
            the supported names.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing bytes yields an int; this only triggers for
                # input whose items are single characters.
                byte = ord(byte)
            if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if start <= byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
| 939 |
