planemo/lib/python3.7/site-packages/bleach/html5lib_shim.py @ changeset 0:d30785e31577 (draft)
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 |
| parents | |
| children | |
```python
# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string

import six

from bleach._vendor.html5lib import (
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import constants
from bleach._vendor.html5lib.constants import (
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
from bleach._vendor.html5lib.filters.base import Filter
from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
from bleach._vendor.html5lib._inputstream import HTMLInputStream
from bleach._vendor.html5lib.serializer import HTMLSerializer
from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib._trie import Trie


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes['StartTag'],
    constants.tokenTypes['EndTag'],
    constants.tokenTypes['EmptyTag']
}
CHARACTERS_TYPE = constants.tokenTypes['Characters']
PARSEERROR_TYPE = constants.tokenTypes['ParseError']

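# Illustrative sketch, not part of the original file: ENTITIES maps entity
# names -- with and without the trailing ';' for the legacy ones -- to their
# expansions, e.g. ENTITIES['amp'] == '&' and ENTITIES['amp;'] == '&'.
# ENTITIES_TRIE supports the prefix queries used by match_entity() below,
# e.g. ENTITIES_TRIE.has_keys_with_prefix('am') is True.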

#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    'a',
    'abbr',
    'address',
    'area',
    'article',
    'aside',
    'audio',
    'b',
    'base',
    'bdi',
    'bdo',
    'blockquote',
    'body',
    'br',
    'button',
    'canvas',
    'caption',
    'cite',
    'code',
    'col',
    'colgroup',
    'data',
    'datalist',
    'dd',
    'del',
    'details',
    'dfn',
    'dialog',
    'div',
    'dl',
    'dt',
    'em',
    'embed',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'head',
    'header',
    'hgroup',
    'hr',
    'html',
    'i',
    'iframe',
    'img',
    'input',
    'ins',
    'kbd',
    'keygen',
    'label',
    'legend',
    'li',
    'link',
    'map',
    'mark',
    'menu',
    'meta',
    'meter',
    'nav',
    'noscript',
    'object',
    'ol',
    'optgroup',
    'option',
    'output',
    'p',
    'param',
    'picture',
    'pre',
    'progress',
    'q',
    'rp',
    'rt',
    'ruby',
    's',
    'samp',
    'script',
    'section',
    'select',
    'slot',
    'small',
    'source',
    'span',
    'strong',
    'style',
    'sub',
    'summary',
    'sup',
    'table',
    'tbody',
    'td',
    'template',
    'textarea',
    'tfoot',
    'th',
    'thead',
    'time',
    'title',
    'tr',
    'track',
    'u',
    'ul',
    'var',
    'video',
    'wbr',
]


class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """
    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type('').join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ['<']

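# Illustrative sketch, not part of the original file: the wrapper is only
# exercised through the tokenizer, but its contract is small. Assuming an
# inner stream fed '<div class=x>':
#
#     stream.start_tag()   # called by tagOpenState() on seeing '<'
#     ...                  # tokenizer consumes 'div class=x' via char()/charsUntil()
#     stream.get_tag()     # -> '<div class=x' (raw text since the last '<')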

class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""
    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
                     token['type'] in TAG_TOKEN_TYPES and
                     token.get('data'))):
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token['data'] = [
                        item for item in token['data']
                        if ('"' not in item[0] and
                            "'" not in item[0] and
                            '<' not in item[0])
                    ]
                    last_error_token = None
                    yield token

                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
                       self.parser.tags is not None and
                       token['data'].lower().strip() not in self.parser.tags)):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token['data'] = self.stream.get_tag()
                    token['type'] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token['type'] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token['type'] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken['data'][-1][1] += '&'

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})

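    # Illustrative note, not part of the original file: with
    # consume_entities=False the override above makes entity handling a no-op,
    # so '&amp;' in text content survives tokenization as the literal five
    # characters '&amp;' (the consumed '&' is re-queued as a Characters
    # token); in attribute values the '&' is appended back onto the current
    # attribute's value instead.
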
    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if ((self.parser.tags is not None and
             token['type'] in TAG_TOKEN_TYPES and
             token['name'].lower() not in self.parser.tags)):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ''

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {
                'type': CHARACTERS_TYPE,
                'data': new_data
            }

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()

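# Illustrative note, not part of the original file: for a tag outside
# parser.tags with strip=False, emitCurrentToken() swaps the tag token for a
# Characters token holding the raw text recovered via stream.get_tag() --
# e.g. '<blink>' becomes the literal text '<blink>', which the serializer
# will later escape.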

class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""
    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == '#':
        if value[1] in ('x', 'X'):
            return six.unichr(int(value[2:], 16))
        return six.unichr(int(value[1:], 10))

    return ENTITIES.get(value, None)

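# Illustrative sketch, not part of the original file:
#
#     convert_entity('amp')    -> '&'     (named entity)
#     convert_entity('#38')    -> '&'     (decimal numeric entity)
#     convert_entity('#x26')   -> '&'     (hex numeric entity)
#     convert_entity('bogus')  -> None    (ambiguous ampersand)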

def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if '&' not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith('&'):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return ''.join(new_text)

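# Illustrative sketch, not part of the original file: recognized entities are
# expanded while ambiguous ampersands are left alone, e.g.
#
#     convert_entities('&amp; &bogus; &')  ->  '& &bogus; &'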

def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != '&':
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ''
    end_characters = '<&=;' + string.whitespace

    # Handle number entities
    if stream and stream[0] == '#':
        possible_entity = '#'
        stream.pop(0)

        if stream and stream[0] in ('x', 'X'):
            allowed = '0123456789abcdefABCDEF'
            possible_entity += stream.pop(0)
        else:
            allowed = '0123456789'

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ';':
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ';':
        return possible_entity

    return None

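# Illustrative sketch, not part of the original file: the stream must begin
# with '&', and only ';'-terminated candidates match:
#
#     match_entity('&amp; tail')   -> 'amp'
#     match_entity('&amp tail')    -> None   (no terminating ';')
#     match_entity('&#x26;')       -> '#x26'
#     match_entity('no ampersand') -> raises ValueError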

AMP_SPLIT_RE = re.compile('(&)')


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield '&' + part

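# Illustrative sketch, not part of the original file: the capturing split
# keeps each '&' attached to the text that follows it, e.g.
#
#     list(next_possible_entity('this &amp; that'))
#     ->  ['this ', '&amp; that']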

class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    #     Whether to escape characters that need to be
    #     escaped within normal elements within rcdata elements such as
    #     style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace('&amp;', '&')

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith('&'):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield '&' + entity + ';'

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2:]
                    if part:
                        yield part
                    continue

            yield part.replace('&', '&amp;')

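    # Illustrative sketch, not part of the original file: bare ampersands get
    # escaped while unambiguous entities are preserved, e.g.
    #
    #     ''.join(serializer.escape_base_amp('a & b &amp; c'))
    #     ->  'a &amp; b &amp; c'
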
    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == '>':
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == '=':
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith('<'):
                    in_tag = True
                yield stoken
```
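
The classes above are meant to be used together: bleach's `clean()` builds a `BleachHTMLParser` (which installs the custom tokenizer) and re-serializes the sanitized tree with `BleachHTMLSerializer`. The round-trip sketch below is an editor's illustration of that wiring, not code from this file; the `'etree'` treewalker name and the serializer options shown are html5lib defaults chosen for the example, not values this module mandates.

```python
from bleach import html5lib_shim

# Parse a fragment, escaping (not stripping) tags outside the allowed list
# and leaving character entities such as '&amp;' unexpanded.
parser = html5lib_shim.BleachHTMLParser(
    tags=['b', 'i'],
    strip=False,
    consume_entities=False,
    namespaceHTMLElements=False,
)
dom = parser.parseFragment('<b>ok</b> &amp; <blink>not ok</blink>')

# Re-serialize; the disallowed <blink> tag comes out escaped as literal text.
walker = html5lib_shim.getTreeWalker('etree')
serializer = html5lib_shim.BleachHTMLSerializer(
    quote_attr_values='always',
    omit_optional_tags=False,
)
print(serializer.render(walker(dom)))
```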
