Mercurial > repos > shellac > sam_consensus_v3
diff env/lib/python3.9/site-packages/bleach/html5lib_shim.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/env/lib/python3.9/site-packages/bleach/html5lib_shim.py Mon Mar 22 18:12:50 2021 +0000 @@ -0,0 +1,669 @@ +# flake8: noqa +""" +Shim module between Bleach and html5lib. This makes it easier to upgrade the +html5lib library without having to change a lot of code. +""" + +from __future__ import unicode_literals + +import re +import string +import warnings + +import six + +# ignore html5lib deprecation warnings to use bleach; we are bleach +# apply before we import submodules that import html5lib +warnings.filterwarnings( + "ignore", + message="html5lib's sanitizer is deprecated", + category=DeprecationWarning, + module="bleach._vendor.html5lib", +) + +from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file + HTMLParser, + getTreeWalker, +) +from bleach._vendor.html5lib import ( + constants, +) # noqa: E402 module level import not at top of file +from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file + namespaces, + prefixes, +) +from bleach._vendor.html5lib.constants import ( + _ReparseException as ReparseException, +) # noqa: E402 module level import not at top of file +from bleach._vendor.html5lib.filters.base import ( + Filter, +) # noqa: E402 module level import not at top of file +from bleach._vendor.html5lib.filters.sanitizer import ( + allowed_protocols, +) # noqa: E402 module level import not at top of file +from bleach._vendor.html5lib.filters.sanitizer import ( + Filter as SanitizerFilter, +) # noqa: E402 module level import not at top of file +from bleach._vendor.html5lib._inputstream import ( + HTMLInputStream, +) # noqa: E402 module level import not at top of file +from bleach._vendor.html5lib.serializer import ( + escape, + HTMLSerializer, +) # noqa: E402 module level import not at top of file +from bleach._vendor.html5lib._tokenizer import ( + attributeMap, + HTMLTokenizer, +) # noqa: E402 module level import not at top of file +from bleach._vendor.html5lib._trie import ( + Trie, +) # noqa: E402 module level import not at top of file + + +#: Map of entity name to expanded entity +ENTITIES = constants.entities + +#: Trie of html entity string -> character representation +ENTITIES_TRIE = Trie(ENTITIES) + +#: Token type constants--these never change +TAG_TOKEN_TYPES = { + constants.tokenTypes["StartTag"], + constants.tokenTypes["EndTag"], + constants.tokenTypes["EmptyTag"], +} +CHARACTERS_TYPE = constants.tokenTypes["Characters"] +PARSEERROR_TYPE = constants.tokenTypes["ParseError"] + + +#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17 +#: https://html.spec.whatwg.org/multipage/indices.html#elements-3 +HTML_TAGS = [ + "a", + "abbr", + "address", + "area", + "article", + "aside", + "audio", + "b", + "base", + "bdi", + "bdo", + "blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "embed", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hgroup", + "hr", + "html", + "i", + "iframe", + "img", + "input", + "ins", + "kbd", + "keygen", + "label", + "legend", + "li", + "link", + "map", + "mark", + "menu", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "slot", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", +] + + +class InputStreamWithMemory(object): + """Wraps an HTMLInputStream to remember characters since last < + + This wraps existing HTMLInputStream classes to keep track of the stream + since the last < which marked an open tag state. + + """ + + def __init__(self, inner_stream): + self._inner_stream = inner_stream + self.reset = self._inner_stream.reset + self.position = self._inner_stream.position + self._buffer = [] + + @property + def errors(self): + return self._inner_stream.errors + + @property + def charEncoding(self): + return self._inner_stream.charEncoding + + @property + def changeEncoding(self): + return self._inner_stream.changeEncoding + + def char(self): + c = self._inner_stream.char() + # char() can return None if EOF, so ignore that + if c: + self._buffer.append(c) + return c + + def charsUntil(self, characters, opposite=False): + chars = self._inner_stream.charsUntil(characters, opposite=opposite) + self._buffer.extend(list(chars)) + return chars + + def unget(self, char): + if self._buffer: + self._buffer.pop(-1) + return self._inner_stream.unget(char) + + def get_tag(self): + """Returns the stream history since last '<' + + Since the buffer starts at the last '<' as as seen by tagOpenState(), + we know that everything from that point to when this method is called + is the "tag" that is being tokenized. + + """ + return six.text_type("").join(self._buffer) + + def start_tag(self): + """Resets stream history to just '<' + + This gets called by tagOpenState() which marks a '<' that denotes an + open tag. Any time we see that, we reset the buffer. + + """ + self._buffer = ["<"] + + +class BleachHTMLTokenizer(HTMLTokenizer): + """Tokenizer that doesn't consume character entities""" + + def __init__(self, consume_entities=False, **kwargs): + super(BleachHTMLTokenizer, self).__init__(**kwargs) + + self.consume_entities = consume_entities + + # Wrap the stream with one that remembers the history + self.stream = InputStreamWithMemory(self.stream) + + def __iter__(self): + last_error_token = None + + for token in super(BleachHTMLTokenizer, self).__iter__(): + if last_error_token is not None: + if ( + last_error_token["data"] == "invalid-character-in-attribute-name" + and token["type"] in TAG_TOKEN_TYPES + and token.get("data") + ): + # token["data"] is an html5lib attributeMap + # (OrderedDict 3.7+ and dict otherwise) + # of attr name to attr value + # + # Remove attribute names that have ', " or < in them + # because those characters are invalid for attribute names. + token["data"] = attributeMap( + (attr_name, attr_value) + for attr_name, attr_value in token["data"].items() + if ( + '"' not in attr_name + and "'" not in attr_name + and "<" not in attr_name + ) + ) + last_error_token = None + yield token + + elif ( + last_error_token["data"] == "expected-closing-tag-but-got-char" + and self.parser.tags is not None + and token["data"].lower().strip() not in self.parser.tags + ): + # We've got either a malformed tag or a pseudo-tag or + # something that html5lib wants to turn into a malformed + # comment which Bleach clean() will drop so we interfere + # with the token stream to handle it more correctly. + # + # If this is an allowed tag, it's malformed and we just let + # the html5lib parser deal with it--we don't enter into this + # block. + # + # If this is not an allowed tag, then we convert it to + # characters and it'll get escaped in the sanitizer. + token["data"] = self.stream.get_tag() + token["type"] = CHARACTERS_TYPE + + last_error_token = None + yield token + + elif token["type"] == PARSEERROR_TYPE: + # If the token is a parse error, then let the last_error_token + # go, and make token the new last_error_token + yield last_error_token + last_error_token = token + + else: + yield last_error_token + yield token + last_error_token = None + + continue + + # If the token is a ParseError, we hold on to it so we can get the + # next token and potentially fix it. + if token["type"] == PARSEERROR_TYPE: + last_error_token = token + continue + + yield token + + if last_error_token: + yield last_error_token + + def consumeEntity(self, allowedChar=None, fromAttribute=False): + # If this tokenizer is set to consume entities, then we can let the + # superclass do its thing. + if self.consume_entities: + return super(BleachHTMLTokenizer, self).consumeEntity( + allowedChar, fromAttribute + ) + + # If this tokenizer is set to not consume entities, then we don't want + # to consume and convert them, so this overrides the html5lib tokenizer's + # consumeEntity so that it's now a no-op. + # + # However, when that gets called, it's consumed an &, so we put that back in + # the stream. + if fromAttribute: + self.currentToken["data"][-1][1] += "&" + + else: + self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"}) + + def tagOpenState(self): + # This state marks a < that is either a StartTag, EndTag, EmptyTag, + # or ParseError. In all cases, we want to drop any stream history + # we've collected so far and we do that by calling start_tag() on + # the input stream wrapper. + self.stream.start_tag() + return super(BleachHTMLTokenizer, self).tagOpenState() + + def emitCurrentToken(self): + token = self.currentToken + + if ( + self.parser.tags is not None + and token["type"] in TAG_TOKEN_TYPES + and token["name"].lower() not in self.parser.tags + ): + # If this is a start/end/empty tag for a tag that's not in our + # allowed list, then it gets stripped or escaped. In both of these + # cases it gets converted to a Characters token. + if self.parser.strip: + # If we're stripping the token, we just throw in an empty + # string token. + new_data = "" + + else: + # If we're escaping the token, we want to escape the exact + # original string. Since tokenizing also normalizes data + # and this is a tag-like thing, we've lost some information. + # So we go back through the stream to get the original + # string and use that. + new_data = self.stream.get_tag() + + new_token = {"type": CHARACTERS_TYPE, "data": new_data} + + self.currentToken = new_token + self.tokenQueue.append(new_token) + self.state = self.dataState + return + + super(BleachHTMLTokenizer, self).emitCurrentToken() + + +class BleachHTMLParser(HTMLParser): + """Parser that uses BleachHTMLTokenizer""" + + def __init__(self, tags, strip, consume_entities, **kwargs): + """ + :arg tags: list of allowed tags--everything else is either stripped or + escaped; if None, then this doesn't look at tags at all + :arg strip: whether to strip disallowed tags (True) or escape them (False); + if tags=None, then this doesn't have any effect + :arg consume_entities: whether to consume entities (default behavior) or + leave them as is when tokenizing (BleachHTMLTokenizer-added behavior) + + """ + self.tags = [tag.lower() for tag in tags] if tags is not None else None + self.strip = strip + self.consume_entities = consume_entities + super(BleachHTMLParser, self).__init__(**kwargs) + + def _parse( + self, stream, innerHTML=False, container="div", scripting=True, **kwargs + ): + # set scripting=True to parse <noscript> as though JS is enabled to + # match the expected context in browsers + # + # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element + # + # Override HTMLParser so we can swap out the tokenizer for our own. + self.innerHTMLMode = innerHTML + self.container = container + self.scripting = scripting + self.tokenizer = BleachHTMLTokenizer( + stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs + ) + self.reset() + + try: + self.mainLoop() + except ReparseException: + self.reset() + self.mainLoop() + + +def convert_entity(value): + """Convert an entity (minus the & and ; part) into what it represents + + This handles numeric, hex, and text entities. + + :arg value: the string (minus the ``&`` and ``;`` part) to convert + + :returns: unicode character or None if it's an ambiguous ampersand that + doesn't match a character entity + + """ + if value[0] == "#": + if len(value) < 2: + return None + + if value[1] in ("x", "X"): + # hex-encoded code point + int_as_string, base = value[2:], 16 + else: + # decimal code point + int_as_string, base = value[1:], 10 + + if int_as_string == "": + return None + + code_point = int(int_as_string, base) + if 0 < code_point < 0x110000: + return six.unichr(code_point) + else: + return None + + return ENTITIES.get(value, None) + + +def convert_entities(text): + """Converts all found entities in the text + + :arg text: the text to convert entities in + + :returns: unicode text with converted entities + + """ + if "&" not in text: + return text + + new_text = [] + for part in next_possible_entity(text): + if not part: + continue + + if part.startswith("&"): + entity = match_entity(part) + if entity is not None: + converted = convert_entity(entity) + + # If it's not an ambiguous ampersand, then replace with the + # unicode character. Otherwise, we leave the entity in. + if converted is not None: + new_text.append(converted) + remainder = part[len(entity) + 2 :] + if part: + new_text.append(remainder) + continue + + new_text.append(part) + + return "".join(new_text) + + +def match_entity(stream): + """Returns first entity in stream or None if no entity exists + + Note: For Bleach purposes, entities must start with a "&" and end with + a ";". This ignoresambiguous character entities that have no ";" at the + end. + + :arg stream: the character stream + + :returns: ``None`` or the entity string without "&" or ";" + + """ + # Nix the & at the beginning + if stream[0] != "&": + raise ValueError('Stream should begin with "&"') + + stream = stream[1:] + + stream = list(stream) + possible_entity = "" + end_characters = "<&=;" + string.whitespace + + # Handle number entities + if stream and stream[0] == "#": + possible_entity = "#" + stream.pop(0) + + if stream and stream[0] in ("x", "X"): + allowed = "0123456789abcdefABCDEF" + possible_entity += stream.pop(0) + else: + allowed = "0123456789" + + # FIXME(willkg): Do we want to make sure these are valid number + # entities? This doesn't do that currently. + while stream and stream[0] not in end_characters: + c = stream.pop(0) + if c not in allowed: + break + possible_entity += c + + if possible_entity and stream and stream[0] == ";": + return possible_entity + return None + + # Handle character entities + while stream and stream[0] not in end_characters: + c = stream.pop(0) + if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity): + break + possible_entity += c + + if possible_entity and stream and stream[0] == ";": + return possible_entity + + return None + + +AMP_SPLIT_RE = re.compile("(&)") + + +def next_possible_entity(text): + """Takes a text and generates a list of possible entities + + :arg text: the text to look at + + :returns: generator where each part (except the first) starts with an + "&" + + """ + for i, part in enumerate(AMP_SPLIT_RE.split(text)): + if i == 0: + yield part + elif i % 2 == 0: + yield "&" + part + + +class BleachHTMLSerializer(HTMLSerializer): + """HTMLSerializer that undoes & -> & in attributes and sets + escape_rcdata to True + """ + + # per the HTMLSerializer.__init__ docstring: + # + # Whether to escape characters that need to be + # escaped within normal elements within rcdata elements such as + # style. + # + escape_rcdata = True + + def escape_base_amp(self, stoken): + """Escapes just bare & in HTML attribute values""" + # First, undo escaping of &. We need to do this because html5lib's + # HTMLSerializer expected the tokenizer to consume all the character + # entities and convert them to their respective characters, but the + # BleachHTMLTokenizer doesn't do that. For example, this fixes + # &entity; back to &entity; . + stoken = stoken.replace("&", "&") + + # However, we do want all bare & that are not marking character + # entities to be changed to &, so let's do that carefully here. + for part in next_possible_entity(stoken): + if not part: + continue + + if part.startswith("&"): + entity = match_entity(part) + # Only leave entities in that are not ambiguous. If they're + # ambiguous, then we escape the ampersand. + if entity is not None and convert_entity(entity) is not None: + yield "&" + entity + ";" + + # Length of the entity plus 2--one for & at the beginning + # and one for ; at the end + part = part[len(entity) + 2 :] + if part: + yield part + continue + + yield part.replace("&", "&") + + def serialize(self, treewalker, encoding=None): + """Wrap HTMLSerializer.serialize and conver & to & in attribute values + + Note that this converts & to & in attribute values where the & isn't + already part of an unambiguous character entity. + + """ + in_tag = False + after_equals = False + + for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding): + if in_tag: + if stoken == ">": + in_tag = False + + elif after_equals: + if stoken != '"': + for part in self.escape_base_amp(stoken): + yield part + + after_equals = False + continue + + elif stoken == "=": + after_equals = True + + yield stoken + else: + if stoken.startswith("<"): + in_tag = True + yield stoken