diff env/lib/python3.7/site-packages/bleach/html5lib_shim.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
--- a/env/lib/python3.7/site-packages/bleach/html5lib_shim.py	Thu May 14 16:47:39 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,610 +0,0 @@
-# flake8: noqa
-"""
-Shim module between Bleach and html5lib. This makes it easier to upgrade the
-html5lib library without having to change a lot of code.
-"""
-
-from __future__ import unicode_literals
-
-import re
-import string
-
-import six
-
-from bleach._vendor.html5lib import (
-    HTMLParser,
-    getTreeWalker,
-)
-from bleach._vendor.html5lib import constants
-from bleach._vendor.html5lib.constants import (
-    namespaces,
-    prefixes,
-)
-from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
-from bleach._vendor.html5lib.filters.base import Filter
-from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
-from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
-from bleach._vendor.html5lib._inputstream import HTMLInputStream
-from bleach._vendor.html5lib.serializer import HTMLSerializer
-from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
-from bleach._vendor.html5lib._trie import Trie
-
-
-#: Map of entity name to expanded entity
-ENTITIES = constants.entities
-
-#: Trie of html entity string -> character representation
-ENTITIES_TRIE = Trie(ENTITIES)
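
(Illustrative aside, not part of the vendored file: a quick sketch of how these
two tables behave, assuming the module is importable as bleach.html5lib_shim.)

    from bleach.html5lib_shim import ENTITIES, ENTITIES_TRIE

    ENTITIES['amp']                           # expected: '&' (legacy name, no ';')
    ENTITIES_TRIE.has_keys_with_prefix('am')  # expected: True ('amp', 'amp;', ...)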
-
-#: Token type constants--these never change
-TAG_TOKEN_TYPES = {
-    constants.tokenTypes['StartTag'],
-    constants.tokenTypes['EndTag'],
-    constants.tokenTypes['EmptyTag']
-}
-CHARACTERS_TYPE = constants.tokenTypes['Characters']
-PARSEERROR_TYPE = constants.tokenTypes['ParseError']
-
-
-#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
-#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
-HTML_TAGS = [
-    'a',
-    'abbr',
-    'address',
-    'area',
-    'article',
-    'aside',
-    'audio',
-    'b',
-    'base',
-    'bdi',
-    'bdo',
-    'blockquote',
-    'body',
-    'br',
-    'button',
-    'canvas',
-    'caption',
-    'cite',
-    'code',
-    'col',
-    'colgroup',
-    'data',
-    'datalist',
-    'dd',
-    'del',
-    'details',
-    'dfn',
-    'dialog',
-    'div',
-    'dl',
-    'dt',
-    'em',
-    'embed',
-    'fieldset',
-    'figcaption',
-    'figure',
-    'footer',
-    'form',
-    'h1',
-    'h2',
-    'h3',
-    'h4',
-    'h5',
-    'h6',
-    'head',
-    'header',
-    'hgroup',
-    'hr',
-    'html',
-    'i',
-    'iframe',
-    'img',
-    'input',
-    'ins',
-    'kbd',
-    'keygen',
-    'label',
-    'legend',
-    'li',
-    'link',
-    'map',
-    'mark',
-    'menu',
-    'meta',
-    'meter',
-    'nav',
-    'noscript',
-    'object',
-    'ol',
-    'optgroup',
-    'option',
-    'output',
-    'p',
-    'param',
-    'picture',
-    'pre',
-    'progress',
-    'q',
-    'rp',
-    'rt',
-    'ruby',
-    's',
-    'samp',
-    'script',
-    'section',
-    'select',
-    'slot',
-    'small',
-    'source',
-    'span',
-    'strong',
-    'style',
-    'sub',
-    'summary',
-    'sup',
-    'table',
-    'tbody',
-    'td',
-    'template',
-    'textarea',
-    'tfoot',
-    'th',
-    'thead',
-    'time',
-    'title',
-    'tr',
-    'track',
-    'u',
-    'ul',
-    'var',
-    'video',
-    'wbr',
-]
-
-
-class InputStreamWithMemory(object):
-    """Wraps an HTMLInputStream to remember characters since last <
-
-    This wraps existing HTMLInputStream classes to keep track of the stream
-    since the last < which marked an open tag state.
-
-    """
-    def __init__(self, inner_stream):
-        self._inner_stream = inner_stream
-        self.reset = self._inner_stream.reset
-        self.position = self._inner_stream.position
-        self._buffer = []
-
-    @property
-    def errors(self):
-        return self._inner_stream.errors
-
-    @property
-    def charEncoding(self):
-        return self._inner_stream.charEncoding
-
-    @property
-    def changeEncoding(self):
-        return self._inner_stream.changeEncoding
-
-    def char(self):
-        c = self._inner_stream.char()
-        # char() can return None if EOF, so ignore that
-        if c:
-            self._buffer.append(c)
-        return c
-
-    def charsUntil(self, characters, opposite=False):
-        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
-        self._buffer.extend(list(chars))
-        return chars
-
-    def unget(self, char):
-        if self._buffer:
-            self._buffer.pop(-1)
-        return self._inner_stream.unget(char)
-
-    def get_tag(self):
-        """Returns the stream history since last '<'
-
-    Since the buffer starts at the last '<' as seen by tagOpenState(),
-        we know that everything from that point to when this method is called
-        is the "tag" that is being tokenized.
-
-        """
-        return six.text_type('').join(self._buffer)
-
-    def start_tag(self):
-        """Resets stream history to just '<'
-
-        This gets called by tagOpenState() which marks a '<' that denotes an
-        open tag. Any time we see that, we reset the buffer.
-
-        """
-        self._buffer = ['<']
-
-
-class BleachHTMLTokenizer(HTMLTokenizer):
-    """Tokenizer that doesn't consume character entities"""
-    def __init__(self, consume_entities=False, **kwargs):
-        super(BleachHTMLTokenizer, self).__init__(**kwargs)
-
-        self.consume_entities = consume_entities
-
-        # Wrap the stream with one that remembers the history
-        self.stream = InputStreamWithMemory(self.stream)
-
-    def __iter__(self):
-        last_error_token = None
-
-        for token in super(BleachHTMLTokenizer, self).__iter__():
-            if last_error_token is not None:
-                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
-                     token['type'] in TAG_TOKEN_TYPES and
-                     token.get('data'))):
-                    # Remove attribute names that have ', " or < in them
-                    # because those characters are invalid for attribute names.
-                    token['data'] = [
-                        item for item in token['data']
-                        if ('"' not in item[0] and
-                            "'" not in item[0] and
-                            '<' not in item[0])
-                    ]
-                    last_error_token = None
-                    yield token
-
-                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
-                       self.parser.tags is not None and
-                       token['data'].lower().strip() not in self.parser.tags)):
-                    # We've got either a malformed tag or a pseudo-tag or
-                    # something that html5lib wants to turn into a malformed
-                    # comment which Bleach clean() will drop so we interfere
-                    # with the token stream to handle it more correctly.
-                    #
-                    # If this is an allowed tag, it's malformed and we just let
-                    # the html5lib parser deal with it--we don't enter into this
-                    # block.
-                    #
-                    # If this is not an allowed tag, then we convert it to
-                    # characters and it'll get escaped in the sanitizer.
-                    token['data'] = self.stream.get_tag()
-                    token['type'] = CHARACTERS_TYPE
-
-                    last_error_token = None
-                    yield token
-
-                elif token['type'] == PARSEERROR_TYPE:
-                    # If the token is a parse error, then let the last_error_token
-                    # go, and make token the new last_error_token
-                    yield last_error_token
-                    last_error_token = token
-
-                else:
-                    yield last_error_token
-                    yield token
-                    last_error_token = None
-
-                continue
-
-            # If the token is a ParseError, we hold on to it so we can get the
-            # next token and potentially fix it.
-            if token['type'] == PARSEERROR_TYPE:
-                last_error_token = token
-                continue
-
-            yield token
-
-        if last_error_token:
-            yield last_error_token
-
-    def consumeEntity(self, allowedChar=None, fromAttribute=False):
-        # If this tokenizer is set to consume entities, then we can let the
-        # superclass do its thing.
-        if self.consume_entities:
-            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)
-
-        # If this tokenizer is set to not consume entities, then we don't want
-        # to consume and convert them, so this overrides the html5lib tokenizer's
-        # consumeEntity so that it's now a no-op.
-        #
-        # However, by the time this gets called, the tokenizer has already
-        # consumed an '&', so we put that back into the stream.
-        if fromAttribute:
-            self.currentToken['data'][-1][1] += '&'
-
-        else:
-            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})
-
-    def tagOpenState(self):
-        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
-        # or ParseError. In all cases, we want to drop any stream history
-        # we've collected so far and we do that by calling start_tag() on
-        # the input stream wrapper.
-        self.stream.start_tag()
-        return super(BleachHTMLTokenizer, self).tagOpenState()
-
-    def emitCurrentToken(self):
-        token = self.currentToken
-
-        if ((self.parser.tags is not None and
-             token['type'] in TAG_TOKEN_TYPES and
-             token['name'].lower() not in self.parser.tags)):
-            # If this is a start/end/empty tag for a tag that's not in our
-            # allowed list, then it gets stripped or escaped. In both of these
-            # cases it gets converted to a Characters token.
-            if self.parser.strip:
-                # If we're stripping the token, we just throw in an empty
-                # string token.
-                new_data = ''
-
-            else:
-                # If we're escaping the token, we want to escape the exact
-                # original string. Since tokenizing also normalizes data
-                # and this is a tag-like thing, we've lost some information.
-                # So we go back through the stream to get the original
-                # string and use that.
-                new_data = self.stream.get_tag()
-
-            new_token = {
-                'type': CHARACTERS_TYPE,
-                'data': new_data
-            }
-
-            self.currentToken = new_token
-            self.tokenQueue.append(new_token)
-            self.state = self.dataState
-            return
-
-        super(BleachHTMLTokenizer, self).emitCurrentToken()
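
(Illustrative aside, not part of the vendored file: the strip/escape branch
above is what bleach.clean() exercises for disallowed tags; expected outputs
shown as comments.)

    import bleach

    bleach.clean('<b><fooo>hi</fooo></b>', tags=['b'])
    # expected: '<b>&lt;fooo&gt;hi&lt;/fooo&gt;</b>'  (escaped)

    bleach.clean('<b><fooo>hi</fooo></b>', tags=['b'], strip=True)
    # expected: '<b>hi</b>'  (stripped)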
-
-
-class BleachHTMLParser(HTMLParser):
-    """Parser that uses BleachHTMLTokenizer"""
-    def __init__(self, tags, strip, consume_entities, **kwargs):
-        """
-        :arg tags: list of allowed tags--everything else is either stripped or
-            escaped; if None, then this doesn't look at tags at all
-        :arg strip: whether to strip disallowed tags (True) or escape them (False);
-            if tags=None, then this doesn't have any effect
-        :arg consume_entities: whether to consume entities (default behavior) or
-            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)
-
-        """
-        self.tags = [tag.lower() for tag in tags] if tags is not None else None
-        self.strip = strip
-        self.consume_entities = consume_entities
-        super(BleachHTMLParser, self).__init__(**kwargs)
-
-    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
-        # set scripting=True to parse <noscript> as though JS is enabled to
-        # match the expected context in browsers
-        #
-        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
-        #
-        # Override HTMLParser so we can swap out the tokenizer for our own.
-        self.innerHTMLMode = innerHTML
-        self.container = container
-        self.scripting = scripting
-        self.tokenizer = BleachHTMLTokenizer(
-            stream=stream,
-            consume_entities=self.consume_entities,
-            parser=self,
-            **kwargs
-        )
-        self.reset()
-
-        try:
-            self.mainLoop()
-        except ReparseException:
-            self.reset()
-            self.mainLoop()
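
(Illustrative aside, not part of the vendored file: a sketch of constructing
the parser directly, mirroring how bleach's Cleaner wires it up; keyword
arguments after consume_entities pass through to html5lib's HTMLParser.)

    from bleach import html5lib_shim

    parser = html5lib_shim.BleachHTMLParser(
        tags=['p'],
        strip=False,
        consume_entities=False,
        namespaceHTMLElements=False,
    )
    dom = parser.parseFragment('<p>one &amp; two</p>')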
-
-
-def convert_entity(value):
-    """Convert an entity (minus the & and ; part) into what it represents
-
-    This handles numeric, hex, and text entities.
-
-    :arg value: the string (minus the ``&`` and ``;`` part) to convert
-
-    :returns: unicode character or None if it's an ambiguous ampersand that
-        doesn't match a character entity
-
-    """
-    if value[0] == '#':
-        if value[1] in ('x', 'X'):
-            return six.unichr(int(value[2:], 16))
-        return six.unichr(int(value[1:], 10))
-
-    return ENTITIES.get(value, None)
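
(Illustrative aside, not part of the vendored file: expected results for the
three entity forms plus the ambiguous case.)

    from bleach.html5lib_shim import convert_entity

    convert_entity('amp')    # expected: '&'   (named)
    convert_entity('#62')    # expected: '>'   (decimal numeric)
    convert_entity('#x3E')   # expected: '>'   (hex numeric)
    convert_entity('bogus')  # expected: None  (ambiguous ampersand)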
-
-
-def convert_entities(text):
-    """Converts all found entities in the text
-
-    :arg text: the text to convert entities in
-
-    :returns: unicode text with converted entities
-
-    """
-    if '&' not in text:
-        return text
-
-    new_text = []
-    for part in next_possible_entity(text):
-        if not part:
-            continue
-
-        if part.startswith('&'):
-            entity = match_entity(part)
-            if entity is not None:
-                converted = convert_entity(entity)
-
-                # If it's not an ambiguous ampersand, then replace with the
-                # unicode character. Otherwise, we leave the entity in.
-                if converted is not None:
-                    new_text.append(converted)
-                    remainder = part[len(entity) + 2:]
-                    if remainder:
-                        new_text.append(remainder)
-                    continue
-
-        new_text.append(part)
-
-    return ''.join(new_text)
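
(Illustrative aside, not part of the vendored file: expected round-trip
behavior, including the ambiguous-ampersand case that is left untouched.)

    from bleach.html5lib_shim import convert_entities

    convert_entities('&#34;text&#34;')   # expected: '"text"'
    convert_entities('this &amp; that')  # expected: 'this & that'
    convert_entities('5 & 6')            # expected: '5 & 6' (ambiguous & kept)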
-
-
-def match_entity(stream):
-    """Returns first entity in stream or None if no entity exists
-
-    Note: For Bleach purposes, entities must start with a "&" and end with
-    a ";". This ignoresambiguous character entities that have no ";" at the
-    end.
-
-    :arg stream: the character stream
-
-    :returns: ``None`` or the entity string without "&" or ";"
-
-    """
-    # Nix the & at the beginning
-    if stream[0] != '&':
-        raise ValueError('Stream should begin with "&"')
-
-    stream = stream[1:]
-
-    stream = list(stream)
-    possible_entity = ''
-    end_characters = '<&=;' + string.whitespace
-
-    # Handle number entities
-    if stream and stream[0] == '#':
-        possible_entity = '#'
-        stream.pop(0)
-
-        if stream and stream[0] in ('x', 'X'):
-            allowed = '0123456789abcdefABCDEF'
-            possible_entity += stream.pop(0)
-        else:
-            allowed = '0123456789'
-
-        # FIXME(willkg): Do we want to make sure these are valid number
-        # entities? This doesn't do that currently.
-        while stream and stream[0] not in end_characters:
-            c = stream.pop(0)
-            if c not in allowed:
-                break
-            possible_entity += c
-
-        if possible_entity and stream and stream[0] == ';':
-            return possible_entity
-        return None
-
-    # Handle character entities
-    while stream and stream[0] not in end_characters:
-        c = stream.pop(0)
-        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
-            break
-        possible_entity += c
-
-    if possible_entity and stream and stream[0] == ';':
-        return possible_entity
-
-    return None
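
(Illustrative aside, not part of the vendored file: note that the returned
entity has the '&' and ';' trimmed, and that a stream not starting with '&'
raises ValueError.)

    from bleach.html5lib_shim import match_entity

    match_entity('&amp; more')  # expected: 'amp'
    match_entity('&#x2603;')    # expected: '#x2603'
    match_entity('& more')      # expected: None (no entity at the start)
    match_entity('no amp')      # expected: raises ValueError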
-
-
-AMP_SPLIT_RE = re.compile('(&)')
-
-
-def next_possible_entity(text):
-    """Takes a text and generates a list of possible entities
-
-    :arg text: the text to look at
-
-    :returns: generator where each part (except the first) starts with an
-        "&"
-
-    """
-    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
-        if i == 0:
-            yield part
-        elif i % 2 == 0:
-            yield '&' + part
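
(Illustrative aside, not part of the vendored file: every yielded part after
the first begins with '&', which is what convert_entities() and
escape_base_amp() rely on.)

    from bleach.html5lib_shim import next_possible_entity

    list(next_possible_entity('this &amp; that & other'))
    # expected: ['this ', '&amp; that ', '& other']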
-
-
-class BleachHTMLSerializer(HTMLSerializer):
-    """HTMLSerializer that undoes & -> &amp; in attributes and sets
-    escape_rcdata to True
-    """
-
-    # per the HTMLSerializer.__init__ docstring:
-    #
-    # Whether to escape characters that need to be
-    # escaped within normal elements within rcdata elements such as
-    # style.
-    #
-    escape_rcdata = True
-
-    def escape_base_amp(self, stoken):
-        """Escapes just bare & in HTML attribute values"""
-        # First, undo escaping of &. We need to do this because html5lib's
-        # HTMLSerializer expects the tokenizer to consume all the character
-        # entities and convert them to their respective characters, but the
-        # BleachHTMLTokenizer doesn't do that. For example, this fixes
-        # &amp;entity; back to &entity; .
-        stoken = stoken.replace('&amp;', '&')
-
-        # However, we do want all bare & that are not marking character
-        # entities to be changed to &amp;, so let's do that carefully here.
-        for part in next_possible_entity(stoken):
-            if not part:
-                continue
-
-            if part.startswith('&'):
-                entity = match_entity(part)
-                # Only leave entities in that are not ambiguous. If they're
-                # ambiguous, then we escape the ampersand.
-                if entity is not None and convert_entity(entity) is not None:
-                    yield '&' + entity + ';'
-
-                    # Length of the entity plus 2--one for & at the beginning
-                    # and one for ; at the end
-                    part = part[len(entity) + 2:]
-                    if part:
-                        yield part
-                    continue
-
-            yield part.replace('&', '&amp;')
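
(Illustrative aside, not part of the vendored file: a direct call showing a
real entity surviving while a bare '&' gets escaped; escape_base_amp is a
generator, hence the join.)

    from bleach.html5lib_shim import BleachHTMLSerializer

    serializer = BleachHTMLSerializer()
    ''.join(serializer.escape_base_amp('?q=a&amp;lt;b&amp;x=1'))
    # expected: '?q=a&lt;b&amp;x=1'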
-
-    def serialize(self, treewalker, encoding=None):
-        """Wrap HTMLSerializer.serialize and conver & to &amp; in attribute values
-
-        Note that this converts & to &amp; in attribute values where the & isn't
-        already part of an unambiguous character entity.
-
-        """
-        in_tag = False
-        after_equals = False
-
-        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
-            if in_tag:
-                if stoken == '>':
-                    in_tag = False
-
-                elif after_equals:
-                    if stoken != '"':
-                        for part in self.escape_base_amp(stoken):
-                            yield part
-
-                        after_equals = False
-                        continue
-
-                elif stoken == '=':
-                    after_equals = True
-
-                yield stoken
-            else:
-                if stoken.startswith('<'):
-                    in_tag = True
-                yield stoken
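
(Illustrative aside, not part of the vendored file: a sketch of the full
parse/walk/serialize round trip, loosely following how bleach's Cleaner wires
these pieces together; the expected output shows the bare '&' in the attribute
value escaped to '&amp;'.)

    from bleach import html5lib_shim

    parser = html5lib_shim.BleachHTMLParser(
        tags=None, strip=False, consume_entities=False,
        namespaceHTMLElements=False,
    )
    dom = parser.parseFragment('<a href="?a=1&b=2">x</a>')
    walker = html5lib_shim.getTreeWalker('etree')
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values='always', omit_optional_tags=False,
    )
    serializer.render(walker(dom))
    # expected: '<a href="?a=1&amp;b=2">x</a>'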