diff env/lib/python3.9/site-packages/bleach/sanitizer.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/env/lib/python3.9/site-packages/bleach/sanitizer.py	Mon Mar 22 18:12:50 2021 +0000
@@ -0,0 +1,650 @@
+from __future__ import unicode_literals
+
+from itertools import chain
+import re
+import warnings
+
+import six
+from six.moves.urllib.parse import urlparse
+from xml.sax.saxutils import unescape
+
+from bleach import html5lib_shim
+from bleach.utils import alphabetize_attributes, force_unicode
+
+
+#: List of allowed tags
+ALLOWED_TAGS = [
+    "a",
+    "abbr",
+    "acronym",
+    "b",
+    "blockquote",
+    "code",
+    "em",
+    "i",
+    "li",
+    "ol",
+    "strong",
+    "ul",
+]
+
+
+#: Map of allowed attributes by tag
+ALLOWED_ATTRIBUTES = {
+    "a": ["href", "title"],
+    "abbr": ["title"],
+    "acronym": ["title"],
+}
+
+#: List of allowed styles
+ALLOWED_STYLES = []
+
+#: List of allowed protocols
+ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
+
+#: Invisible characters--0 through 31 inclusive, except 9 (tab), 10 (LF), and 13 (CR)
+INVISIBLE_CHARACTERS = "".join(
+    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
+)
+
+#: Regexp for characters that are invisible
+INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)
+
+#: String to replace invisible characters with. This can be a character, a
+#: string, or even a function that takes a Python re matchobj
+INVISIBLE_REPLACEMENT_CHAR = "?"
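+
+# Illustrative sketch (not part of the module): the regexp above replaces each
+# invisible character with the replacement char, e.g.
+#
+#     INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, "a\x01\x02b")
+#     # -> 'a??b'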
+
+
+class Cleaner(object):
+    """Cleaner for cleaning HTML fragments of malicious content
+
+    This cleaner is a security-focused tool whose sole purpose is to remove
+    malicious content from a string such that it can be displayed as content in
+    a web page.
+
+    To use::
+
+        from bleach.sanitizer import Cleaner
+
+        cleaner = Cleaner()
+
+        for text in all_the_yucky_things:
+            sanitized = cleaner.clean(text)
+
+    .. Note::
+
+       This cleaner is not designed to transform content for use in
+       non-web-page contexts.
+
+    .. Warning::
+
+       This cleaner is not thread-safe--the html parser has internal state.
+       Create a separate cleaner per thread!
+
+
+    """
+
+    def __init__(
+        self,
+        tags=ALLOWED_TAGS,
+        attributes=ALLOWED_ATTRIBUTES,
+        styles=ALLOWED_STYLES,
+        protocols=ALLOWED_PROTOCOLS,
+        strip=False,
+        strip_comments=True,
+        filters=None,
+    ):
+        """Initializes a Cleaner
+
+        :arg list tags: allowed list of tags; defaults to
+            ``bleach.sanitizer.ALLOWED_TAGS``
+
+        :arg dict attributes: allowed attributes; can be a callable, list or dict;
+            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+        :arg list styles: allowed list of css styles; defaults to
+            ``bleach.sanitizer.ALLOWED_STYLES``
+
+        :arg list protocols: allowed list of protocols for links; defaults
+            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+        :arg bool strip: whether or not to strip disallowed elements
+
+        :arg bool strip_comments: whether or not to strip HTML comments
+
+        :arg list filters: list of html5lib Filter classes to pass streamed content through
+
+            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
+
+            .. Warning::
+
+               Using filters changes the output of ``bleach.Cleaner.clean``.
+               Make sure the way the filters change the output is secure.
+
+        """
+        self.tags = tags
+        self.attributes = attributes
+        self.styles = styles
+        self.protocols = protocols
+        self.strip = strip
+        self.strip_comments = strip_comments
+        self.filters = filters or []
+
+        self.parser = html5lib_shim.BleachHTMLParser(
+            tags=self.tags,
+            strip=self.strip,
+            consume_entities=False,
+            namespaceHTMLElements=False,
+        )
+        self.walker = html5lib_shim.getTreeWalker("etree")
+        self.serializer = html5lib_shim.BleachHTMLSerializer(
+            quote_attr_values="always",
+            omit_optional_tags=False,
+            escape_lt_in_attrs=True,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
+            # Bleach has its own sanitizer, so don't use the html5lib one
+            sanitize=False,
+            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+            alphabetical_attributes=False,
+        )
+
+    def clean(self, text):
+        """Cleans text and returns sanitized result as unicode
+
+        :arg str text: text to be cleaned
+
+        :returns: sanitized text as unicode
+
+        :raises TypeError: if ``text`` is not a text type
+
+        """
+        if not isinstance(text, six.string_types):
+            message = (
+                "argument cannot be of '{name}' type, must be of text type".format(
+                    name=text.__class__.__name__
+                )
+            )
+            raise TypeError(message)
+
+        if not text:
+            return ""
+
+        text = force_unicode(text)
+
+        dom = self.parser.parseFragment(text)
+        filtered = BleachSanitizerFilter(
+            source=self.walker(dom),
+            # Bleach-sanitizer-specific things
+            attributes=self.attributes,
+            strip_disallowed_elements=self.strip,
+            strip_html_comments=self.strip_comments,
+            # html5lib-sanitizer things
+            allowed_elements=self.tags,
+            allowed_css_properties=self.styles,
+            allowed_protocols=self.protocols,
+            allowed_svg_properties=[],
+        )
+
+        # Apply any filters after the BleachSanitizerFilter
+        for filter_class in self.filters:
+            filtered = filter_class(source=filtered)
+
+        return self.serializer.render(filtered)
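+
+    # Illustrative sketch (not part of bleach): with the default allow lists,
+    # disallowed tags are escaped rather than dropped, because strip=False:
+    #
+    #     Cleaner().clean('<script>evil()</script><b>hi</b>')
+    #     # -> '&lt;script&gt;evil()&lt;/script&gt;<b>hi</b>'
+    #
+    # whereas Cleaner(strip=True).clean('<script>evil()</script>') -> 'evil()'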
+
+
+def attribute_filter_factory(attributes):
+    """Generates attribute filter function for the given attributes value
+
+    The attributes value can take one of several shapes. This returns a filter
+    function appropriate to the attributes value. One nice thing about this is
+    that there are fewer if/then shenanigans in the ``allow_token`` method.
+
+    """
+    if callable(attributes):
+        return attributes
+
+    if isinstance(attributes, dict):
+
+        def _attr_filter(tag, attr, value):
+            if tag in attributes:
+                attr_val = attributes[tag]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                if attr in attr_val:
+                    return True
+
+            if "*" in attributes:
+                attr_val = attributes["*"]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                return attr in attr_val
+
+            return False
+
+        return _attr_filter
+
+    if isinstance(attributes, list):
+
+        def _attr_filter(tag, attr, value):
+            return attr in attributes
+
+        return _attr_filter
+
+    raise ValueError("attributes needs to be a callable, a list or a dict")
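+
+# Illustrative sketch (not part of bleach) of the three accepted shapes:
+#
+#     f = attribute_filter_factory(["title"])                     # list
+#     f("a", "title", "x"), f("a", "href", "x")     # -> (True, False)
+#
+#     f = attribute_filter_factory({"a": ["href"], "*": ["id"]})  # dict
+#     f("a", "href", "x"), f("img", "id", "x")      # -> (True, True)
+#
+#     f = attribute_filter_factory(lambda tag, attr, value: attr == "href")
+#     f("a", "href", "x")                           # -> True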
+
+
+class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
+    """html5lib Filter that sanitizes text
+
+    This filter can be used anywhere html5lib filters can be used.
+
+    """
+
+    def __init__(
+        self,
+        source,
+        attributes=ALLOWED_ATTRIBUTES,
+        strip_disallowed_elements=False,
+        strip_html_comments=True,
+        **kwargs
+    ):
+        """Creates a BleachSanitizerFilter instance
+
+        :arg Treewalker source: stream
+
+        :arg list tags: allowed list of tags; defaults to
+            ``bleach.sanitizer.ALLOWED_TAGS``
+
+        :arg dict attributes: allowed attributes; can be a callable, list or dict;
+            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+        :arg list styles: allowed list of css styles; defaults to
+            ``bleach.sanitizer.ALLOWED_STYLES``
+
+        :arg list protocols: allowed list of protocols for links; defaults
+            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+        :arg bool strip_disallowed_elements: whether or not to strip disallowed
+            elements
+
+        :arg bool strip_html_comments: whether or not to strip HTML comments
+
+        """
+        self.attr_filter = attribute_filter_factory(attributes)
+        self.strip_disallowed_elements = strip_disallowed_elements
+        self.strip_html_comments = strip_html_comments
+
+        # filter out the html5lib sanitizer deprecation warning that
+        # subclassing SanitizerFilter would otherwise trigger
+        warnings.filterwarnings(
+            "ignore",
+            message="html5lib's sanitizer is deprecated",
+            category=DeprecationWarning,
+            module="bleach._vendor.html5lib",
+        )
+        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
+
+    def sanitize_stream(self, token_iterator):
+        for token in token_iterator:
+            ret = self.sanitize_token(token)
+
+            if not ret:
+                continue
+
+            if isinstance(ret, list):
+                for subtoken in ret:
+                    yield subtoken
+            else:
+                yield ret
+
+    def merge_characters(self, token_iterator):
+        """Merge consecutive Characters tokens in a stream"""
+        characters_buffer = []
+
+        for token in token_iterator:
+            if characters_buffer:
+                if token["type"] == "Characters":
+                    characters_buffer.append(token)
+                    continue
+                else:
+                    # Merge all the characters tokens together into one and then
+                    # operate on it.
+                    new_token = {
+                        "data": "".join(
+                            [char_token["data"] for char_token in characters_buffer]
+                        ),
+                        "type": "Characters",
+                    }
+                    characters_buffer = []
+                    yield new_token
+
+            elif token["type"] == "Characters":
+                characters_buffer.append(token)
+                continue
+
+            yield token
+
+        new_token = {
+            "data": "".join([char_token["data"] for char_token in characters_buffer]),
+            "type": "Characters",
+        }
+        yield new_token
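+
+    # Illustrative sketch: two adjacent tokens
+    #     {"type": "Characters", "data": "AT"}, {"type": "Characters", "data": "&T"}
+    # come out of merge_characters as a single
+    #     {"type": "Characters", "data": "AT&T"}
+    # followed by a trailing empty Characters token, which serializes to nothing.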
+
+    def __iter__(self):
+        return self.merge_characters(
+            self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
+        )
+
+    def sanitize_token(self, token):
+        """Sanitize a token either by HTML-encoding or dropping.
+
+        Unlike sanitizer.Filter, allowed_attributes can be a dict of
+        {'tag': ['attribute', 'list'], 'other_tag': callable}.
+
+        Here the callable is a function taking the tag, attribute name, and
+        attribute value. It should return ``True`` or ``False``.
+
+        Also gives the option to strip tags instead of encoding.
+
+        :arg dict token: token to sanitize
+
+        :returns: token or list of tokens
+
+        """
+        token_type = token["type"]
+        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
+            if token["name"] in self.allowed_elements:
+                return self.allow_token(token)
+
+            elif self.strip_disallowed_elements:
+                return None
+
+            else:
+                if "data" in token:
+                    # Alphabetize the attributes before calling .disallowed_token()
+                    # so that the resulting string is stable
+                    token["data"] = alphabetize_attributes(token["data"])
+                return self.disallowed_token(token)
+
+        elif token_type == "Comment":
+            if not self.strip_html_comments:
+                # use html5lib_shim.escape (xml.sax.saxutils.escape) to escape
+                # &, <, and > in addition to " and '
+                token["data"] = html5lib_shim.escape(
+                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
+                )
+                return token
+            else:
+                return None
+
+        elif token_type == "Characters":
+            return self.sanitize_characters(token)
+
+        else:
+            return token
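+
+    # Illustrative sketch: with strip_comments=False, a comment like
+    # <!-- "hi" & <bye> --> is kept but its data is escaped, so it
+    # serializes as <!-- &quot;hi&quot; &amp; &lt;bye&gt; -->.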
+
+    def sanitize_characters(self, token):
+        """Handles Characters tokens
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: a list of tokens
+
+        """
+        data = token.get("data", "")
+
+        if not data:
+            return token
+
+        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
+        token["data"] = data
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            return token
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match urls across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerFilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        return new_tokens
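+
+    # Illustrative sketch: the Characters data 'x &lt; y &amp; z' comes back as
+    #     [{"type": "Characters", "data": "x "},
+    #      {"type": "Entity", "name": "lt"},
+    #      {"type": "Characters", "data": " y "},
+    #      {"type": "Characters", "data": "&"},
+    #      {"type": "Characters", "data": " z"}]
+    # with &amp; special-cased to a plain "&" Characters token as noted above.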
+
+    def sanitize_uri_value(self, value, allowed_protocols):
+        """Checks a uri value to see if it's allowed
+
+        :arg value: the uri value to sanitize
+        :arg allowed_protocols: list of allowed protocols
+
+        :returns: allowed value or None
+
+        """
+        # NOTE(willkg): This transforms the value into one that's easier to
+        # match and verify, but shouldn't get returned since it's vastly
+        # different than the original value.
+
+        # Convert all character entities in the value
+        new_value = html5lib_shim.convert_entities(value)
+
+        # Nix backtick, space characters, and control characters
+        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)
+
+        # Remove REPLACEMENT characters
+        new_value = new_value.replace("\ufffd", "")
+
+        # Lowercase it--this breaks the value, but makes it easier to match
+        # against
+        new_value = new_value.lower()
+
+        try:
+            # Drop attributes with uri values that have protocols that aren't
+            # allowed
+            parsed = urlparse(new_value)
+        except ValueError:
+            # URI is impossible to parse, therefore it's not allowed
+            return None
+
+        if parsed.scheme:
+            # If urlparse found a scheme, check that
+            if parsed.scheme in allowed_protocols:
+                return value
+
+        else:
+            # Allow uris that are just an anchor
+            if new_value.startswith("#"):
+                return value
+
+            # Handle protocols that urlparse doesn't recognize like "myprotocol"
+            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
+                return value
+
+            # If there's no protocol/scheme specified, then assume it's "http"
+            # and see if that's allowed
+            if "http" in allowed_protocols:
+                return value
+
+        return None
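+
+    # Illustrative sketch, with the default allowed protocols:
+    #     "https://example.com/"    -> allowed (scheme is in the list)
+    #     "javascript:alert(1)"     -> None (disallowed scheme)
+    #     "java\tscript:alert(1)"   -> None (control chars are stripped first,
+    #                                  exposing the javascript: scheme)
+    #     "#fragment"               -> allowed (bare anchor)
+    #     "example.com/page"        -> allowed, treated as implicit "http"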
+
+    def allow_token(self, token):
+        """Handles the case where we're allowing the tag"""
+        if "data" in token:
+            # Loop through all the attributes and drop the ones that are not
+            # allowed, are unsafe or break other rules. Additionally, fix
+            # attribute values that need fixing.
+            #
+            # At the end of this loop, we have the final set of attributes
+            # we're keeping.
+            attrs = {}
+            for namespaced_name, val in token["data"].items():
+                namespace, name = namespaced_name
+
+                # Drop attributes that are not explicitly allowed
+                #
+                # NOTE(willkg): We pass in the attribute name--not a namespaced
+                # name.
+                if not self.attr_filter(token["name"], name, val):
+                    continue
+
+                # Drop attributes with uri values that use a disallowed protocol
+                # Sanitize attributes with uri values
+                if namespaced_name in self.attr_val_is_uri:
+                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
+                    if new_value is None:
+                        continue
+                    val = new_value
+
+                # Drop values in svg attrs with non-local IRIs
+                if namespaced_name in self.svg_attr_val_allows_ref:
+                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
+                    new_val = new_val.strip()
+                    if not new_val:
+                        continue
+
+                    else:
+                        # Replace the val with the unescaped version because
+                        # it's an IRI
+                        val = new_val
+
+                # Drop href and xlink:href attr for svg elements with non-local IRIs
+                if (None, token["name"]) in self.svg_allow_local_href:
+                    if namespaced_name in [
+                        (None, "href"),
+                        (html5lib_shim.namespaces["xlink"], "href"),
+                    ]:
+                        if re.search(r"^\s*[^#\s]", val):
+                            continue
+
+                # If it's a style attribute, sanitize it
+                if namespaced_name == (None, "style"):
+                    val = self.sanitize_css(val)
+
+                # At this point, we want to keep the attribute, so add it in
+                attrs[namespaced_name] = val
+
+            token["data"] = alphabetize_attributes(attrs)
+
+        return token
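+
+    # Illustrative sketch: for '<a href="javascript:alert(1)" title="t">x</a>'
+    # under the default allow lists, href fails sanitize_uri_value and is
+    # dropped while title is kept, yielding '<a title="t">x</a>'.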
+
+    def disallowed_token(self, token):
+        token_type = token["type"]
+        if token_type == "EndTag":
+            token["data"] = "</%s>" % token["name"]
+
+        elif token["data"]:
+            assert token_type in ("StartTag", "EmptyTag")
+            attrs = []
+            for (ns, name), v in token["data"].items():
+                # If we end up with a namespace, but no name, switch them so we
+                # have a valid name to use.
+                if ns and not name:
+                    ns, name = name, ns
+
+                # Figure out namespaced name if the namespace is appropriate
+                # and exists; if the ns isn't in prefixes, then drop it.
+                if ns is None or ns not in html5lib_shim.prefixes:
+                    namespaced_name = name
+                else:
+                    namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)
+
+                attrs.append(
+                    ' %s="%s"'
+                    % (
+                        namespaced_name,
+                        # NOTE(willkg): HTMLSerializer escapes attribute values
+                        # already, so if we do it here (like HTMLSerializer does),
+                        # then we end up double-escaping.
+                        v,
+                    )
+                )
+            token["data"] = "<%s%s>" % (token["name"], "".join(attrs))
+
+        else:
+            token["data"] = "<%s>" % token["name"]
+
+        if token.get("selfClosing"):
+            token["data"] = token["data"][:-1] + "/>"
+
+        token["type"] = "Characters"
+
+        del token["name"]
+        return token
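+
+    # Illustrative sketch: a disallowed start tag token for <span class="x">
+    # becomes {"type": "Characters", "data": '<span class="x">'}, which the
+    # serializer then escapes to &lt;span class="x"&gt; in the final output.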
+
+    def sanitize_css(self, style):
+        """Sanitizes css in style tags"""
+        # Convert entities in the style so that it can be parsed as CSS
+        style = html5lib_shim.convert_entities(style)
+
+        # Drop any url values before we do anything else
+        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)
+
+        # The gauntlet of sanitization
+
+        # Validate the css in the style tag and if it's not valid, then drop
+        # the whole thing.
+        parts = style.split(";")
+        gauntlet = re.compile(
+            r"""^(  # consider a style attribute value as composed of:
+[/:,#%!.\s\w]    # a non-newline character
+|\w-\w           # 3 characters in the form \w-\w
+|'[\s\w]+'\s*    # a single-quoted string of [\s\w]+, plus optional whitespace
+|"[\s\w]+"       # a double quoted string of [\s\w]+
+|\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, ...
+)*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
+            flags=re.U | re.VERBOSE,
+        )
+
+        for part in parts:
+            if not gauntlet.match(part):
+                return ""
+
+        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ""
+
+        clean = []
+        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
+            if not value:
+                continue
+
+            if prop.lower() in self.allowed_css_properties:
+                clean.append(prop + ": " + value + ";")
+
+            elif prop.lower() in self.allowed_svg_properties:
+                clean.append(prop + ": " + value + ";")
+
+        return " ".join(clean)