diff env/lib/python3.9/site-packages/bleach/html5lib_shim.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/env/lib/python3.9/site-packages/bleach/html5lib_shim.py	Mon Mar 22 18:12:50 2021 +0000
@@ -0,0 +1,669 @@
+# flake8: noqa
+"""
+Shim module between Bleach and html5lib. This makes it easier to upgrade the
+html5lib library without having to change a lot of code.
+"""
+
+from __future__ import unicode_literals
+
+import re
+import string
+import warnings
+
+import six
+
+# Ignore html5lib's "sanitizer is deprecated" warnings--we are bleach, so
+# they don't apply to us. The filter must be applied before we import
+# submodules that import html5lib.
+warnings.filterwarnings(
+    "ignore",
+    message="html5lib's sanitizer is deprecated",
+    category=DeprecationWarning,
+    module="bleach._vendor.html5lib",
+)
+
+from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
+    HTMLParser,
+    getTreeWalker,
+)
+from bleach._vendor.html5lib import (
+    constants,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
+    namespaces,
+    prefixes,
+)
+from bleach._vendor.html5lib.constants import (
+    _ReparseException as ReparseException,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.filters.base import (
+    Filter,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.filters.sanitizer import (
+    allowed_protocols,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.filters.sanitizer import (
+    Filter as SanitizerFilter,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib._inputstream import (
+    HTMLInputStream,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.serializer import (
+    escape,
+    HTMLSerializer,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib._tokenizer import (
+    attributeMap,
+    HTMLTokenizer,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib._trie import (
+    Trie,
+)  # noqa: E402 module level import not at top of file
+
+
+#: Map of entity name to expanded entity
+ENTITIES = constants.entities
+
+#: Trie of html entity string -> character representation
+ENTITIES_TRIE = Trie(ENTITIES)
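+# For example (illustrative): ENTITIES["amp"] == "&", and
+# ENTITIES_TRIE.has_keys_with_prefix("am") is True, which is how
+# match_entity() below decides whether to keep consuming characters.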
+
+#: Token type constants--these never change
+TAG_TOKEN_TYPES = {
+    constants.tokenTypes["StartTag"],
+    constants.tokenTypes["EndTag"],
+    constants.tokenTypes["EmptyTag"],
+}
+CHARACTERS_TYPE = constants.tokenTypes["Characters"]
+PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
+
+
+#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
+#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
+HTML_TAGS = [
+    "a",
+    "abbr",
+    "address",
+    "area",
+    "article",
+    "aside",
+    "audio",
+    "b",
+    "base",
+    "bdi",
+    "bdo",
+    "blockquote",
+    "body",
+    "br",
+    "button",
+    "canvas",
+    "caption",
+    "cite",
+    "code",
+    "col",
+    "colgroup",
+    "data",
+    "datalist",
+    "dd",
+    "del",
+    "details",
+    "dfn",
+    "dialog",
+    "div",
+    "dl",
+    "dt",
+    "em",
+    "embed",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "head",
+    "header",
+    "hgroup",
+    "hr",
+    "html",
+    "i",
+    "iframe",
+    "img",
+    "input",
+    "ins",
+    "kbd",
+    "keygen",
+    "label",
+    "legend",
+    "li",
+    "link",
+    "map",
+    "mark",
+    "menu",
+    "meta",
+    "meter",
+    "nav",
+    "noscript",
+    "object",
+    "ol",
+    "optgroup",
+    "option",
+    "output",
+    "p",
+    "param",
+    "picture",
+    "pre",
+    "progress",
+    "q",
+    "rp",
+    "rt",
+    "ruby",
+    "s",
+    "samp",
+    "script",
+    "section",
+    "select",
+    "slot",
+    "small",
+    "source",
+    "span",
+    "strong",
+    "style",
+    "sub",
+    "summary",
+    "sup",
+    "table",
+    "tbody",
+    "td",
+    "template",
+    "textarea",
+    "tfoot",
+    "th",
+    "thead",
+    "time",
+    "title",
+    "tr",
+    "track",
+    "u",
+    "ul",
+    "var",
+    "video",
+    "wbr",
+]
+
+
+class InputStreamWithMemory(object):
+    """Wraps an HTMLInputStream to remember characters since last <
+
+    This wraps existing HTMLInputStream classes to keep track of the stream
+    since the last < which marked an open tag state.
+
+    """
+
+    def __init__(self, inner_stream):
+        self._inner_stream = inner_stream
+        self.reset = self._inner_stream.reset
+        self.position = self._inner_stream.position
+        self._buffer = []
+
+    @property
+    def errors(self):
+        return self._inner_stream.errors
+
+    @property
+    def charEncoding(self):
+        return self._inner_stream.charEncoding
+
+    @property
+    def changeEncoding(self):
+        return self._inner_stream.changeEncoding
+
+    def char(self):
+        c = self._inner_stream.char()
+        # char() returns None at EOF, so don't add that to the buffer
+        if c:
+            self._buffer.append(c)
+        return c
+
+    def charsUntil(self, characters, opposite=False):
+        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
+        self._buffer.extend(list(chars))
+        return chars
+
+    def unget(self, char):
+        if self._buffer:
+            self._buffer.pop(-1)
+        return self._inner_stream.unget(char)
+
+    def get_tag(self):
+        """Returns the stream history since last '<'
+
+        Since the buffer starts at the last '<' as seen by tagOpenState(),
+        we know that everything from that point to when this method is called
+        is the "tag" that is being tokenized.
+
+        """
+        return six.text_type("").join(self._buffer)
+
+    def start_tag(self):
+        """Resets stream history to just '<'
+
+        This gets called by tagOpenState() which marks a '<' that denotes an
+        open tag. Any time we see that, we reset the buffer.
+
+        """
+        self._buffer = ["<"]
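+
+
+# A minimal sketch of the wrapper's buffer behavior (illustrative; it mirrors
+# what BleachHTMLTokenizer does below and is not part of this module):
+#
+#     stream = InputStreamWithMemory(HTMLInputStream("<py-tag x=1>more"))
+#     stream.char()           # the tokenizer consumes "<"
+#     stream.start_tag()      # tagOpenState() resets the buffer to ["<"]
+#     stream.charsUntil(">")  # consumes "py-tag x=1"
+#     stream.char()           # consumes ">"
+#     stream.get_tag()        # -> "<py-tag x=1>"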
+
+
+class BleachHTMLTokenizer(HTMLTokenizer):
+    """Tokenizer that doesn't consume character entities"""
+
+    def __init__(self, consume_entities=False, **kwargs):
+        super(BleachHTMLTokenizer, self).__init__(**kwargs)
+
+        self.consume_entities = consume_entities
+
+        # Wrap the stream with one that remembers the history
+        self.stream = InputStreamWithMemory(self.stream)
+
+    def __iter__(self):
+        last_error_token = None
+
+        for token in super(BleachHTMLTokenizer, self).__iter__():
+            if last_error_token is not None:
+                if (
+                    last_error_token["data"] == "invalid-character-in-attribute-name"
+                    and token["type"] in TAG_TOKEN_TYPES
+                    and token.get("data")
+                ):
+                    # token["data"] is an html5lib attributeMap
+                    # (OrderedDict 3.7+ and dict otherwise)
+                    # of attr name to attr value
+                    #
+                    # Remove attribute names that have ', " or < in them
+                    # because those characters are invalid for attribute names.
+                    token["data"] = attributeMap(
+                        (attr_name, attr_value)
+                        for attr_name, attr_value in token["data"].items()
+                        if (
+                            '"' not in attr_name
+                            and "'" not in attr_name
+                            and "<" not in attr_name
+                        )
+                    )
+                    last_error_token = None
+                    yield token
+
+                elif (
+                    last_error_token["data"] == "expected-closing-tag-but-got-char"
+                    and self.parser.tags is not None
+                    and token["data"].lower().strip() not in self.parser.tags
+                ):
+                    # We've got either a malformed tag or a pseudo-tag or
+                    # something that html5lib wants to turn into a malformed
+                    # comment, which Bleach clean() would drop, so we step in
+                    # and adjust the token stream to handle it more correctly.
+                    #
+                    # If this is an allowed tag, it's malformed and we just let
+                    # the html5lib parser deal with it--we don't enter into this
+                    # block.
+                    #
+                    # If this is not an allowed tag, then we convert it to
+                    # characters and it'll get escaped in the sanitizer.
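+                    #
+                    # For example (illustrative): clean("</3") recovers "</3"
+                    # via stream.get_tag() and emits it as Characters, so it
+                    # comes out escaped as "&lt;/3" instead of being dropped
+                    # as a bogus comment.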
+                    token["data"] = self.stream.get_tag()
+                    token["type"] = CHARACTERS_TYPE
+
+                    last_error_token = None
+                    yield token
+
+                elif token["type"] == PARSEERROR_TYPE:
+                    # If the token is a parse error, then let the last_error_token
+                    # go, and make token the new last_error_token
+                    yield last_error_token
+                    last_error_token = token
+
+                else:
+                    yield last_error_token
+                    yield token
+                    last_error_token = None
+
+                continue
+
+            # If the token is a ParseError, we hold on to it so we can get the
+            # next token and potentially fix it.
+            if token["type"] == PARSEERROR_TYPE:
+                last_error_token = token
+                continue
+
+            yield token
+
+        if last_error_token:
+            yield last_error_token
+
+    def consumeEntity(self, allowedChar=None, fromAttribute=False):
+        # If this tokenizer is set to consume entities, then we can let the
+        # superclass do its thing.
+        if self.consume_entities:
+            return super(BleachHTMLTokenizer, self).consumeEntity(
+                allowedChar, fromAttribute
+            )
+
+        # If this tokenizer is set to not consume entities, then we don't want
+        # to consume and convert them, so this overrides the html5lib tokenizer's
+        # consumeEntity so that it's now a no-op.
+        #
+        # However, by the time this gets called, the tokenizer has already
+        # consumed an &, so we put it back in the stream.
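+        #
+        # Net effect (illustrative): with consume_entities=False, the text
+        # "AT&amp;T" flows through tokenization as the literal string
+        # "AT&amp;T" rather than being collapsed to "AT&T".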
+        if fromAttribute:
+            self.currentToken["data"][-1][1] += "&"
+
+        else:
+            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})
+
+    def tagOpenState(self):
+        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
+        # or ParseError. In all cases, we want to drop any stream history
+        # we've collected so far and we do that by calling start_tag() on
+        # the input stream wrapper.
+        self.stream.start_tag()
+        return super(BleachHTMLTokenizer, self).tagOpenState()
+
+    def emitCurrentToken(self):
+        token = self.currentToken
+
+        if (
+            self.parser.tags is not None
+            and token["type"] in TAG_TOKEN_TYPES
+            and token["name"].lower() not in self.parser.tags
+        ):
+            # If this is a start/end/empty tag for a tag that's not in our
+            # allowed list, then it gets stripped or escaped. In both of these
+            # cases it gets converted to a Characters token.
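+            #
+            # For example (illustrative): with tags=["b"] and strip=False,
+            # "<blink>" is re-emitted as the Characters token "<blink>"
+            # (recovered via stream.get_tag()) and is later escaped to
+            # "&lt;blink&gt;" downstream.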
+            if self.parser.strip:
+                # If we're stripping the token, we just throw in an empty
+                # string token.
+                new_data = ""
+
+            else:
+                # If we're escaping the token, we want to escape the exact
+                # original string. Since tokenizing also normalizes data
+                # and this is a tag-like thing, we've lost some information.
+                # So we go back through the stream to get the original
+                # string and use that.
+                new_data = self.stream.get_tag()
+
+            new_token = {"type": CHARACTERS_TYPE, "data": new_data}
+
+            self.currentToken = new_token
+            self.tokenQueue.append(new_token)
+            self.state = self.dataState
+            return
+
+        super(BleachHTMLTokenizer, self).emitCurrentToken()
+
+
+class BleachHTMLParser(HTMLParser):
+    """Parser that uses BleachHTMLTokenizer"""
+
+    def __init__(self, tags, strip, consume_entities, **kwargs):
+        """
+        :arg tags: list of allowed tags--everything else is either stripped or
+            escaped; if None, then this doesn't look at tags at all
+        :arg strip: whether to strip disallowed tags (True) or escape them (False);
+            if tags=None, then this doesn't have any effect
+        :arg consume_entities: whether to consume entities (default behavior) or
+            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)
+
+        """
+        self.tags = [tag.lower() for tag in tags] if tags is not None else None
+        self.strip = strip
+        self.consume_entities = consume_entities
+        super(BleachHTMLParser, self).__init__(**kwargs)
+
+    def _parse(
+        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
+    ):
+        # set scripting=True to parse <noscript> as though JS is enabled to
+        # match the expected context in browsers
+        #
+        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
+        #
+        # Override HTMLParser so we can swap out the tokenizer for our own.
+        self.innerHTMLMode = innerHTML
+        self.container = container
+        self.scripting = scripting
+        self.tokenizer = BleachHTMLTokenizer(
+            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
+        )
+        self.reset()
+
+        try:
+            self.mainLoop()
+        except ReparseException:
+            self.reset()
+            self.mainLoop()
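+
+
+# Usage sketch (illustrative; this mirrors how bleach's sanitizer builds its
+# parser, but the exact keyword arguments shown here are assumptions):
+#
+#     parser = BleachHTMLParser(
+#         tags=["b", "i"],
+#         strip=False,
+#         consume_entities=False,
+#         namespaceHTMLElements=False,
+#     )
+#     dom = parser.parseFragment("<b>hi</b><blink>x</blink>")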
+
+
+def convert_entity(value):
+    """Convert an entity (minus the & and ; part) into what it represents
+
+    This handles numeric, hex, and text entities.
+
+    :arg value: the string (minus the ``&`` and ``;`` part) to convert
+
+    :returns: unicode character or None if it's an ambiguous ampersand that
+        doesn't match a character entity
+
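+    Examples (illustrative)::
+
+        >>> convert_entity("#x41")   # hex numeric entity
+        'A'
+        >>> convert_entity("amp")    # named entity
+        '&'
+        >>> convert_entity("notanentity") is None
+        True
+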
+    """
+    if value[0] == "#":
+        if len(value) < 2:
+            return None
+
+        if value[1] in ("x", "X"):
+            # hex-encoded code point
+            int_as_string, base = value[2:], 16
+        else:
+            # decimal code point
+            int_as_string, base = value[1:], 10
+
+        if int_as_string == "":
+            return None
+
+        code_point = int(int_as_string, base)
+        if 0 < code_point < 0x110000:
+            return six.unichr(code_point)
+        else:
+            return None
+
+    return ENTITIES.get(value, None)
+
+
+def convert_entities(text):
+    """Converts all found entities in the text
+
+    :arg text: the text to convert entities in
+
+    :returns: unicode text with converted entities
+
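+    Example (illustrative)::
+
+        >>> convert_entities("tag &amp; &#x41;")
+        'tag & A'
+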
+    """
+    if "&" not in text:
+        return text
+
+    new_text = []
+    for part in next_possible_entity(text):
+        if not part:
+            continue
+
+        if part.startswith("&"):
+            entity = match_entity(part)
+            if entity is not None:
+                converted = convert_entity(entity)
+
+                # If it's not an ambiguous ampersand, then replace with the
+                # unicode character. Otherwise, we leave the entity in.
+                if converted is not None:
+                    new_text.append(converted)
+                    # len(entity) + 2 skips the leading "&" and trailing ";"
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_text.append(remainder)
+                    continue
+
+        new_text.append(part)
+
+    return "".join(new_text)
+
+
+def match_entity(stream):
+    """Returns first entity in stream or None if no entity exists
+
+    Note: For Bleach purposes, entities must start with a "&" and end with
+    a ";". This ignoresambiguous character entities that have no ";" at the
+    end.
+
+    :arg stream: the character stream
+
+    :returns: ``None`` or the entity string without "&" or ";"
+
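+    Examples (illustrative; note the required trailing ";")::
+
+        >>> match_entity("&amp; more")
+        'amp'
+        >>> match_entity("&amp more") is None
+        True
+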
+    """
+    # Nix the & at the beginning
+    if stream[0] != "&":
+        raise ValueError('Stream should begin with "&"')
+
+    stream = stream[1:]
+
+    stream = list(stream)
+    possible_entity = ""
+    end_characters = "<&=;" + string.whitespace
+
+    # Handle number entities
+    if stream and stream[0] == "#":
+        possible_entity = "#"
+        stream.pop(0)
+
+        if stream and stream[0] in ("x", "X"):
+            allowed = "0123456789abcdefABCDEF"
+            possible_entity += stream.pop(0)
+        else:
+            allowed = "0123456789"
+
+        # FIXME(willkg): Do we want to make sure these are valid number
+        # entities? This doesn't do that currently.
+        while stream and stream[0] not in end_characters:
+            c = stream.pop(0)
+            if c not in allowed:
+                break
+            possible_entity += c
+
+        if possible_entity and stream and stream[0] == ";":
+            return possible_entity
+        return None
+
+    # Handle character entities
+    while stream and stream[0] not in end_characters:
+        c = stream.pop(0)
+        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
+            break
+        possible_entity += c
+
+    if possible_entity and stream and stream[0] == ";":
+        return possible_entity
+
+    return None
+
+
+AMP_SPLIT_RE = re.compile("(&)")
+
+
+def next_possible_entity(text):
+    """Takes a text and generates a list of possible entities
+
+    :arg text: the text to look at
+
+    :returns: generator where each part (except the first) starts with an
+        "&"
+
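+    Example (illustrative)::
+
+        >>> list(next_possible_entity("a &amp; b"))
+        ['a ', '&amp; b']
+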
+    """
+    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
+        if i == 0:
+            yield part
+        elif i % 2 == 0:
+            yield "&" + part
+
+
+class BleachHTMLSerializer(HTMLSerializer):
+    """HTMLSerializer that undoes & -> &amp; in attributes and sets
+    escape_rcdata to True
+    """
+
+    # per the HTMLSerializer.__init__ docstring:
+    #
+    # Whether to escape characters that need to be
+    # escaped within normal elements within rcdata elements such as
+    # style.
+    #
+    escape_rcdata = True
+
+    def escape_base_amp(self, stoken):
+        """Escapes just bare & in HTML attribute values"""
+        # First, undo escaping of &. We need to do this because html5lib's
+        # HTMLSerializer expects the tokenizer to consume all the character
+        # entities and convert them to their respective characters, but the
+        # BleachHTMLTokenizer doesn't do that. For example, this fixes
+        # &amp;entity; back to &entity; .
+        stoken = stoken.replace("&amp;", "&")
+
+        # However, we do want all bare & that are not marking character
+        # entities to be changed to &amp;, so let's do that carefully here.
+        for part in next_possible_entity(stoken):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = match_entity(part)
+                # Only leave entities in that are not ambiguous. If they're
+                # ambiguous, then we escape the ampersand.
+                if entity is not None and convert_entity(entity) is not None:
+                    yield "&" + entity + ";"
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    part = part[len(entity) + 2 :]
+                    if part:
+                        yield part
+                    continue
+
+            yield part.replace("&", "&amp;")
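+
+    # Net effect (illustrative): joining the generator's output,
+    #
+    #     "".join(serializer.escape_base_amp("AT&T &amp; &#x26;"))
+    #
+    # gives "AT&amp;T &amp; &#x26;": the bare "&" gets escaped, while the
+    # unambiguous entities pass through intact.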
+
+    def serialize(self, treewalker, encoding=None):
+        """Wrap HTMLSerializer.serialize and conver & to &amp; in attribute values
+
+        Note that this converts & to &amp; in attribute values where the & isn't
+        already part of an unambiguous character entity.
+
+        """
+        in_tag = False
+        after_equals = False
+
+        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
+            if in_tag:
+                if stoken == ">":
+                    in_tag = False
+
+                elif after_equals:
+                    if stoken != '"':
+                        for part in self.escape_base_amp(stoken):
+                            yield part
+
+                        after_equals = False
+                        continue
+
+                elif stoken == "=":
+                    after_equals = True
+
+                yield stoken
+            else:
+                if stoken.startswith("<"):
+                    in_tag = True
+                yield stoken
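+
+
+# End-to-end sketch (illustrative; a hypothetical driver, not part of this
+# module -- the serializer options shown are assumptions):
+#
+#     parser = BleachHTMLParser(tags=["a"], strip=False, consume_entities=False,
+#                               namespaceHTMLElements=False)
+#     dom = parser.parseFragment('<a href="?a=1&b=2">x</a>')
+#     serializer = BleachHTMLSerializer(quote_attr_values="always",
+#                                       omit_optional_tags=False)
+#     serializer.render(getTreeWalker("etree")(dom))
+#     # -> roughly '<a href="?a=1&amp;b=2">x</a>'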