Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/bleach/html5lib_shim.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 # flake8: noqa | |
2 """ | |
3 Shim module between Bleach and html5lib. This makes it easier to upgrade the | |
4 html5lib library without having to change a lot of code. | |
5 """ | |
6 | |
7 from __future__ import unicode_literals | |
8 | |
9 import re | |
10 import string | |
11 import warnings | |
12 | |
13 import six | |
14 | |
15 # ignore html5lib deprecation warnings to use bleach; we are bleach | |
16 # apply before we import submodules that import html5lib | |
17 warnings.filterwarnings( | |
18 "ignore", | |
19 message="html5lib's sanitizer is deprecated", | |
20 category=DeprecationWarning, | |
21 module="bleach._vendor.html5lib", | |
22 ) | |
23 | |
24 from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file | |
25 HTMLParser, | |
26 getTreeWalker, | |
27 ) | |
28 from bleach._vendor.html5lib import ( | |
29 constants, | |
30 ) # noqa: E402 module level import not at top of file | |
31 from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file | |
32 namespaces, | |
33 prefixes, | |
34 ) | |
35 from bleach._vendor.html5lib.constants import ( | |
36 _ReparseException as ReparseException, | |
37 ) # noqa: E402 module level import not at top of file | |
38 from bleach._vendor.html5lib.filters.base import ( | |
39 Filter, | |
40 ) # noqa: E402 module level import not at top of file | |
41 from bleach._vendor.html5lib.filters.sanitizer import ( | |
42 allowed_protocols, | |
43 ) # noqa: E402 module level import not at top of file | |
44 from bleach._vendor.html5lib.filters.sanitizer import ( | |
45 Filter as SanitizerFilter, | |
46 ) # noqa: E402 module level import not at top of file | |
47 from bleach._vendor.html5lib._inputstream import ( | |
48 HTMLInputStream, | |
49 ) # noqa: E402 module level import not at top of file | |
50 from bleach._vendor.html5lib.serializer import ( | |
51 escape, | |
52 HTMLSerializer, | |
53 ) # noqa: E402 module level import not at top of file | |
54 from bleach._vendor.html5lib._tokenizer import ( | |
55 attributeMap, | |
56 HTMLTokenizer, | |
57 ) # noqa: E402 module level import not at top of file | |
58 from bleach._vendor.html5lib._trie import ( | |
59 Trie, | |
60 ) # noqa: E402 module level import not at top of file | |
61 | |
62 | |
#: Map of entity name to its expanded (unicode) form
ENTITIES = constants.entities

#: Trie of html entity string -> character representation, used for
#: prefix matching while scanning possible entities
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes[token_kind]
    for token_kind in ("StartTag", "EndTag", "EmptyTag")
}
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
77 | |
78 | |
#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = (
    "a abbr address area article aside audio b base bdi bdo blockquote body "
    "br button canvas caption cite code col colgroup data datalist dd del "
    "details dfn dialog div dl dt em embed fieldset figcaption figure footer "
    "form h1 h2 h3 h4 h5 h6 head header hgroup hr html i iframe img input "
    "ins kbd keygen label legend li link map mark menu meta meter nav "
    "noscript object ol optgroup option output p param picture pre progress "
    "q rp rt ruby s samp script section select slot small source span strong "
    "style sub summary sup table tbody td template textarea tfoot th thead "
    "time title tr track u ul var video wbr"
).split()
195 | |
196 | |
class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    Records everything consumed from the wrapped stream since the most
    recent < that opened a tag, so the exact original text of a tag can
    be recovered later (e.g. when escaping a disallowed tag).

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        # Delegate reset/position straight through to the wrapped stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        # Characters consumed since the last tag-opening <
        self._history = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        """Consume one character, recording it unless it is EOF (None)."""
        consumed = self._inner_stream.char()
        if consumed:
            self._history.append(consumed)
        return consumed

    def charsUntil(self, characters, opposite=False):
        """Consume a run of characters, recording each one."""
        consumed = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._history.extend(consumed)
        return consumed

    def unget(self, char):
        """Push a character back onto the stream, dropping it from history."""
        if self._history:
            self._history.pop()
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the history starts at the last '<' as seen by tagOpenState(),
        everything from that point to when this method is called is the
        "tag" that is being tokenized.

        """
        return "".join(self._history)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes
        an open tag. Any time we see that, we reset the history.

        """
        self._history = ["<"]
259 | |
class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        """
        :arg consume_entities: whether to consume entities like the stock
            html5lib tokenizer does (True) or leave them in the stream (False)

        """
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        """Yield tokens, fixing up tokens that follow certain ParseErrors

        A ParseError token is buffered so the token right after it can be
        inspected; for two specific error cases the following token is
        rewritten before being emitted.

        """
        # Most recent ParseError token, held back until we see what follows it
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token["type"] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    # Neither fix-up case applied--emit both tokens unchanged
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        # Flush a trailing buffered ParseError (stream ended right after it)
        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Override html5lib's consumeEntity; a no-op unless consume_entities"""
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(
                allowedChar, fromAttribute
            )

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        """Emit the current token, stripping/escaping disallowed tags first"""
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": CHARACTERS_TYPE, "data": new_data}

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()
405 | |
406 | |
class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        # Normalize the allowed-tag list to lowercase; leave None alone
        if tags is not None:
            tags = [tag.lower() for tag in tags]
        self.tags = tags
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            # The parser asked for a restart (e.g. encoding change);
            # reset and run the main loop once more.
            self.reset()
            self.mainLoop()
447 | |
448 | |
def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character, or None if it's an ambiguous ampersand that
        doesn't match a character entity or is an invalid numeric reference

    """
    if value[0] == "#":
        # Bare "#" with nothing after it
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        # int() raises ValueError for digits outside the base (e.g. "#xZZ");
        # treat that the same as an out-of-range code point
        try:
            code_point = int(int_as_string, base)
        except ValueError:
            return None

        # chr() only accepts valid Unicode code points: 1..0x10FFFF
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
481 | |
482 | |
def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    # Fast path: no ampersand means no entities to convert
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Skip the entity itself: +2 covers the leading & and
                    # the trailing ;
                    remainder = part[len(entity) + 2 :]
                    # Fix: check remainder, not part--part is always truthy
                    # here, so the old check appended empty strings
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
516 | |
517 | |
def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    :raises ValueError: if the stream doesn't begin with "&"

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    # Characters that can never appear inside an entity name/number
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        # Hex entities start with x/X; otherwise decimal digits only
        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        # Only a match if something was collected and a ";" follows
        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        # NOTE(review): the trie prefix check runs *before* c is appended,
        # so the loop pops one character past the last matching prefix
        # before breaking--presumably intentional upstream behavior;
        # confirm before changing.
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    # Only a match if a named entity was collected and a ";" follows
    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
574 | |
575 | |
#: Splits on ampersands, keeping them (capture group) in the result
AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    # re.split with a capturing group yields
    # [text, "&", text, "&", text, ...]; odd indices are the "&"
    # separators themselves, which get glued back onto the part that
    # followed them.
    pieces = AMP_SPLIT_RE.split(text)
    yield pieces[0]
    for chunk in pieces[2::2]:
        yield "&" + chunk
593 | |
594 | |
class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values

        :arg stoken: the attribute-value text to process

        :returns: generator of text parts with bare "&" escaped to "&amp;"
            and unambiguous entities left intact

        """
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        #
        # Fix: the replacement pair here had been garbled to the no-op
        # ("&", "&"), which left "&amp;" double-escaped.
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            # Fix: this had been garbled to the no-op replace("&", "&");
            # bare ampersands must become "&amp;"
            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        # Tiny state machine over the serialized token stream: track whether
        # we're inside a tag and whether the previous token was "=" (meaning
        # the current token is an attribute value).
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    # The token right after "=" (unless it's just the quote
                    # character) is an attribute value--escape bare & in it
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken