comparison env/lib/python3.9/site-packages/bleach/html5lib_shim.py @ 0:4f3585e2f14b

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author: shellac
date: Mon, 22 Mar 2021 18:12:50 +0000
# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string
import warnings

import six

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
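
# Illustrative lookups (a sketch; exact values come from html5lib's entity
# table, vendored above):
#
#   ENTITIES["amp"]                            # -> "&"
#   ENTITIES_TRIE.has_keys_with_prefix("am")   # -> True ("amp", among others)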


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "map",
    "mark",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
]


class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)
    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type("").join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
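
# A minimal sketch of the buffer behavior (illustrative, not part of the
# module's API):
#
#   stream = InputStreamWithMemory(HTMLInputStream("<span x=1>"))
#   stream.char()           # "<" -- tagOpenState() then calls start_tag()
#   stream.start_tag()      # buffer reset to ["<"]
#   stream.charsUntil(">")  # consumes "span x=1" into the buffer
#   stream.get_tag()        # -> "<span x=1"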


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token
                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() would drop, so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token
                elif token["type"] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token
    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(
                allowedChar, fromAttribute
            )

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, by the time this gets called, the tokenizer has already
        # consumed an &, so we put that back in the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": CHARACTERS_TYPE, "data": new_data}

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()
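
# Illustrative effect of emitCurrentToken (a sketch): with tags=["b"] and
# strip=False, "<i>" is not an allowed tag, so its tag token is replaced by a
# Characters token whose data is the original "<i>" text (recovered via
# InputStreamWithMemory.get_tag()), which the sanitizer then escapes to
# "&lt;i&gt;".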


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
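
# Minimal usage sketch (assumes html5lib's standard HTMLParser/parseFragment
# API; the kwargs shown are html5lib's, not additions from this module):
#
#   parser = BleachHTMLParser(
#       tags=["b", "i"],
#       strip=False,
#       consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment("<b>bold</b> &amp; <blink>nope</blink>")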


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return six.unichr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
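
# Doctest-style sketch of convert_entity (illustrative):
#
#   >>> convert_entity("#x41")   # hex numeric entity
#   'A'
#   >>> convert_entity("#65")    # decimal numeric entity
#   'A'
#   >>> convert_entity("amp")    # named entity
#   '&'
#   >>> convert_entity("bogus") is None   # ambiguous ampersand
#   True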


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)
                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue
        new_text.append(part)

    return "".join(new_text)
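
# Doctest-style sketch of convert_entities (illustrative):
#
#   >>> convert_entities("tag &amp; ball")
#   'tag & ball'
#   >>> convert_entities("&xx;")   # no matching entity, so left alone
#   '&xx;'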


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
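
# Doctest-style sketch of match_entity (illustrative):
#
#   >>> match_entity("&amp; rest")    # named entity ending with ";"
#   'amp'
#   >>> match_entity("&#x41;")        # numeric entity
#   '#x41'
#   >>> match_entity("&b rest") is None   # no terminating ";"
#   True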

AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates parts of it that could contain entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part
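
# Doctest-style sketch of next_possible_entity (illustrative):
#
#   >>> list(next_possible_entity("this &amp; that &other"))
#   ['this ', '&amp; that ', '&other']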


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    #     Whether to escape characters that need to be
    #     escaped within normal elements within rcdata elements such as
    #     style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expects the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")
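
    # Illustrative behavior of escape_base_amp (a sketch):
    #
    #   "".join(serializer.escape_base_amp("&nbsp; & x"))
    #   # -> '&nbsp; &amp; x'  (recognized entity kept; bare "&" escaped)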
    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
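
# End-to-end sketch (illustrative; assumes html5lib's "etree" tree walker and
# standard HTMLSerializer options):
#
#   parser = BleachHTMLParser(
#       tags=["b"], strip=False, consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment("<b>a &amp; b</b>")
#   walker = getTreeWalker("etree")
#   serializer = BleachHTMLSerializer(
#       quote_attr_values="always",
#       omit_optional_tags=False,
#       resolve_entities=False,
#   )
#   serializer.render(walker(dom))   # -> '<b>a &amp; b</b>'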