comparison env/lib/python3.9/site-packages/bleach/html5lib_shim.py @ 0:4f3585e2f14b

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author: shellac
date: Mon, 22 Mar 2021 18:12:50 +0000
# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string
import warnings

import six

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
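
# Illustrative lookups (a sketch; exact values come from html5lib's entity
# table, vendored above):
#
#   ENTITIES["amp"]                            # -> "&"
#   ENTITIES_TRIE.has_keys_with_prefix("am")   # -> True ("amp", among others)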


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "map",
    "mark",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
]


class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)
    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type("").join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
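
# A minimal sketch of the buffer behavior (illustrative, not part of the
# module's API):
#
#   stream = InputStreamWithMemory(HTMLInputStream("<span x=1>"))
#   stream.char()           # "<" -- tagOpenState() then calls start_tag()
#   stream.start_tag()      # buffer reset to ["<"]
#   stream.charsUntil(">")  # consumes "span x=1" into the buffer
#   stream.get_tag()        # -> "<span x=1"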


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token
                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() would drop, so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token
                elif token["type"] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token
    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(
                allowedChar, fromAttribute
            )

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, by the time this gets called, the tokenizer has already
        # consumed an &, so we put that back in the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": CHARACTERS_TYPE, "data": new_data}

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()
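
# Illustrative effect of emitCurrentToken (a sketch): with tags=["b"] and
# strip=False, "<i>" is not an allowed tag, so its tag token is replaced by a
# Characters token whose data is the original "<i>" text (recovered via
# InputStreamWithMemory.get_tag()), which the sanitizer then escapes to
# "&lt;i&gt;".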


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
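
# Minimal usage sketch (assumes html5lib's standard HTMLParser/parseFragment
# API; the kwargs shown are html5lib's, not additions from this module):
#
#   parser = BleachHTMLParser(
#       tags=["b", "i"],
#       strip=False,
#       consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment("<b>bold</b> &amp; <blink>nope</blink>")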


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return six.unichr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
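
# Doctest-style sketch of convert_entity (illustrative):
#
#   >>> convert_entity("#x41")   # hex numeric entity
#   'A'
#   >>> convert_entity("#65")    # decimal numeric entity
#   'A'
#   >>> convert_entity("amp")    # named entity
#   '&'
#   >>> convert_entity("bogus") is None   # ambiguous ampersand
#   True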


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)
                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue
        new_text.append(part)

    return "".join(new_text)
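
# Doctest-style sketch of convert_entities (illustrative):
#
#   >>> convert_entities("tag &amp; ball")
#   'tag & ball'
#   >>> convert_entities("&xx;")   # no matching entity, so left alone
#   '&xx;'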


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
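
# Doctest-style sketch of match_entity (illustrative):
#
#   >>> match_entity("&amp; rest")    # named entity ending with ";"
#   'amp'
#   >>> match_entity("&#x41;")        # numeric entity
#   '#x41'
#   >>> match_entity("&b rest") is None   # no terminating ";"
#   True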

AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates parts of it that could contain entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part
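
# Doctest-style sketch of next_possible_entity (illustrative):
#
#   >>> list(next_possible_entity("this &amp; that &other"))
#   ['this ', '&amp; that ', '&other']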


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    #     Whether to escape characters that need to be
    #     escaped within normal elements within rcdata elements such as
    #     style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expects the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")
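
    # Illustrative behavior of escape_base_amp (a sketch):
    #
    #   "".join(serializer.escape_base_amp("&nbsp; & x"))
    #   # -> '&nbsp; &amp; x'  (recognized entity kept; bare "&" escaped)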
    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
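
# End-to-end sketch (illustrative; assumes html5lib's "etree" tree walker and
# standard HTMLSerializer options):
#
#   parser = BleachHTMLParser(
#       tags=["b"], strip=False, consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment("<b>a &amp; b</b>")
#   walker = getTreeWalker("etree")
#   serializer = BleachHTMLSerializer(
#       quote_attr_values="always",
#       omit_optional_tags=False,
#       resolve_entities=False,
#   )
#   serializer.render(walker(dom))   # -> '<b>a &amp; b</b>'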