Mercurial > repos > shellac > sam_consensus_v3
view env/lib/python3.9/site-packages/bleach/html5lib_shim.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
line wrap: on
line source
# flake8: noqa """ Shim module between Bleach and html5lib. This makes it easier to upgrade the html5lib library without having to change a lot of code. """ from __future__ import unicode_literals import re import string import warnings import six # ignore html5lib deprecation warnings to use bleach; we are bleach # apply before we import submodules that import html5lib warnings.filterwarnings( "ignore", message="html5lib's sanitizer is deprecated", category=DeprecationWarning, module="bleach._vendor.html5lib", ) from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file HTMLParser, getTreeWalker, ) from bleach._vendor.html5lib import ( constants, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file namespaces, prefixes, ) from bleach._vendor.html5lib.constants import ( _ReparseException as ReparseException, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.base import ( Filter, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.sanitizer import ( allowed_protocols, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.sanitizer import ( Filter as SanitizerFilter, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib._inputstream import ( HTMLInputStream, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.serializer import ( escape, HTMLSerializer, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib._tokenizer import ( attributeMap, HTMLTokenizer, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib._trie import ( Trie, ) # noqa: E402 module level import not at top of file #: Map of entity name to expanded entity ENTITIES = constants.entities #: Trie of html entity string -> character representation ENTITIES_TRIE = Trie(ENTITIES) #: Token type constants--these never change TAG_TOKEN_TYPES = { constants.tokenTypes["StartTag"], constants.tokenTypes["EndTag"], constants.tokenTypes["EmptyTag"], } CHARACTERS_TYPE = constants.tokenTypes["Characters"] PARSEERROR_TYPE = constants.tokenTypes["ParseError"] #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17 #: https://html.spec.whatwg.org/multipage/indices.html#elements-3 HTML_TAGS = [ "a", "abbr", "address", "area", "article", "aside", "audio", "b", "base", "bdi", "bdo", "blockquote", "body", "br", "button", "canvas", "caption", "cite", "code", "col", "colgroup", "data", "datalist", "dd", "del", "details", "dfn", "dialog", "div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i", "iframe", "img", "input", "ins", "kbd", "keygen", "label", "legend", "li", "link", "map", "mark", "menu", "meta", "meter", "nav", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "picture", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp", "script", "section", "select", "slot", "small", "source", "span", "strong", "style", "sub", "summary", "sup", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "u", "ul", "var", "video", "wbr", ] class InputStreamWithMemory(object): """Wraps an HTMLInputStream to remember characters since last < This wraps existing HTMLInputStream classes to keep track of the stream since the last < which marked an open tag state. """ def __init__(self, inner_stream): self._inner_stream = inner_stream self.reset = self._inner_stream.reset self.position = self._inner_stream.position self._buffer = [] @property def errors(self): return self._inner_stream.errors @property def charEncoding(self): return self._inner_stream.charEncoding @property def changeEncoding(self): return self._inner_stream.changeEncoding def char(self): c = self._inner_stream.char() # char() can return None if EOF, so ignore that if c: self._buffer.append(c) return c def charsUntil(self, characters, opposite=False): chars = self._inner_stream.charsUntil(characters, opposite=opposite) self._buffer.extend(list(chars)) return chars def unget(self, char): if self._buffer: self._buffer.pop(-1) return self._inner_stream.unget(char) def get_tag(self): """Returns the stream history since last '<' Since the buffer starts at the last '<' as as seen by tagOpenState(), we know that everything from that point to when this method is called is the "tag" that is being tokenized. """ return six.text_type("").join(self._buffer) def start_tag(self): """Resets stream history to just '<' This gets called by tagOpenState() which marks a '<' that denotes an open tag. Any time we see that, we reset the buffer. """ self._buffer = ["<"] class BleachHTMLTokenizer(HTMLTokenizer): """Tokenizer that doesn't consume character entities""" def __init__(self, consume_entities=False, **kwargs): super(BleachHTMLTokenizer, self).__init__(**kwargs) self.consume_entities = consume_entities # Wrap the stream with one that remembers the history self.stream = InputStreamWithMemory(self.stream) def __iter__(self): last_error_token = None for token in super(BleachHTMLTokenizer, self).__iter__(): if last_error_token is not None: if ( last_error_token["data"] == "invalid-character-in-attribute-name" and token["type"] in TAG_TOKEN_TYPES and token.get("data") ): # token["data"] is an html5lib attributeMap # (OrderedDict 3.7+ and dict otherwise) # of attr name to attr value # # Remove attribute names that have ', " or < in them # because those characters are invalid for attribute names. token["data"] = attributeMap( (attr_name, attr_value) for attr_name, attr_value in token["data"].items() if ( '"' not in attr_name and "'" not in attr_name and "<" not in attr_name ) ) last_error_token = None yield token elif ( last_error_token["data"] == "expected-closing-tag-but-got-char" and self.parser.tags is not None and token["data"].lower().strip() not in self.parser.tags ): # We've got either a malformed tag or a pseudo-tag or # something that html5lib wants to turn into a malformed # comment which Bleach clean() will drop so we interfere # with the token stream to handle it more correctly. # # If this is an allowed tag, it's malformed and we just let # the html5lib parser deal with it--we don't enter into this # block. # # If this is not an allowed tag, then we convert it to # characters and it'll get escaped in the sanitizer. token["data"] = self.stream.get_tag() token["type"] = CHARACTERS_TYPE last_error_token = None yield token elif token["type"] == PARSEERROR_TYPE: # If the token is a parse error, then let the last_error_token # go, and make token the new last_error_token yield last_error_token last_error_token = token else: yield last_error_token yield token last_error_token = None continue # If the token is a ParseError, we hold on to it so we can get the # next token and potentially fix it. if token["type"] == PARSEERROR_TYPE: last_error_token = token continue yield token if last_error_token: yield last_error_token def consumeEntity(self, allowedChar=None, fromAttribute=False): # If this tokenizer is set to consume entities, then we can let the # superclass do its thing. if self.consume_entities: return super(BleachHTMLTokenizer, self).consumeEntity( allowedChar, fromAttribute ) # If this tokenizer is set to not consume entities, then we don't want # to consume and convert them, so this overrides the html5lib tokenizer's # consumeEntity so that it's now a no-op. # # However, when that gets called, it's consumed an &, so we put that back in # the stream. if fromAttribute: self.currentToken["data"][-1][1] += "&" else: self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"}) def tagOpenState(self): # This state marks a < that is either a StartTag, EndTag, EmptyTag, # or ParseError. In all cases, we want to drop any stream history # we've collected so far and we do that by calling start_tag() on # the input stream wrapper. self.stream.start_tag() return super(BleachHTMLTokenizer, self).tagOpenState() def emitCurrentToken(self): token = self.currentToken if ( self.parser.tags is not None and token["type"] in TAG_TOKEN_TYPES and token["name"].lower() not in self.parser.tags ): # If this is a start/end/empty tag for a tag that's not in our # allowed list, then it gets stripped or escaped. In both of these # cases it gets converted to a Characters token. if self.parser.strip: # If we're stripping the token, we just throw in an empty # string token. new_data = "" else: # If we're escaping the token, we want to escape the exact # original string. Since tokenizing also normalizes data # and this is a tag-like thing, we've lost some information. # So we go back through the stream to get the original # string and use that. new_data = self.stream.get_tag() new_token = {"type": CHARACTERS_TYPE, "data": new_data} self.currentToken = new_token self.tokenQueue.append(new_token) self.state = self.dataState return super(BleachHTMLTokenizer, self).emitCurrentToken() class BleachHTMLParser(HTMLParser): """Parser that uses BleachHTMLTokenizer""" def __init__(self, tags, strip, consume_entities, **kwargs): """ :arg tags: list of allowed tags--everything else is either stripped or escaped; if None, then this doesn't look at tags at all :arg strip: whether to strip disallowed tags (True) or escape them (False); if tags=None, then this doesn't have any effect :arg consume_entities: whether to consume entities (default behavior) or leave them as is when tokenizing (BleachHTMLTokenizer-added behavior) """ self.tags = [tag.lower() for tag in tags] if tags is not None else None self.strip = strip self.consume_entities = consume_entities super(BleachHTMLParser, self).__init__(**kwargs) def _parse( self, stream, innerHTML=False, container="div", scripting=True, **kwargs ): # set scripting=True to parse <noscript> as though JS is enabled to # match the expected context in browsers # # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element # # Override HTMLParser so we can swap out the tokenizer for our own. self.innerHTMLMode = innerHTML self.container = container self.scripting = scripting self.tokenizer = BleachHTMLTokenizer( stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs ) self.reset() try: self.mainLoop() except ReparseException: self.reset() self.mainLoop() def convert_entity(value): """Convert an entity (minus the & and ; part) into what it represents This handles numeric, hex, and text entities. :arg value: the string (minus the ``&`` and ``;`` part) to convert :returns: unicode character or None if it's an ambiguous ampersand that doesn't match a character entity """ if value[0] == "#": if len(value) < 2: return None if value[1] in ("x", "X"): # hex-encoded code point int_as_string, base = value[2:], 16 else: # decimal code point int_as_string, base = value[1:], 10 if int_as_string == "": return None code_point = int(int_as_string, base) if 0 < code_point < 0x110000: return six.unichr(code_point) else: return None return ENTITIES.get(value, None) def convert_entities(text): """Converts all found entities in the text :arg text: the text to convert entities in :returns: unicode text with converted entities """ if "&" not in text: return text new_text = [] for part in next_possible_entity(text): if not part: continue if part.startswith("&"): entity = match_entity(part) if entity is not None: converted = convert_entity(entity) # If it's not an ambiguous ampersand, then replace with the # unicode character. Otherwise, we leave the entity in. if converted is not None: new_text.append(converted) remainder = part[len(entity) + 2 :] if part: new_text.append(remainder) continue new_text.append(part) return "".join(new_text) def match_entity(stream): """Returns first entity in stream or None if no entity exists Note: For Bleach purposes, entities must start with a "&" and end with a ";". This ignoresambiguous character entities that have no ";" at the end. :arg stream: the character stream :returns: ``None`` or the entity string without "&" or ";" """ # Nix the & at the beginning if stream[0] != "&": raise ValueError('Stream should begin with "&"') stream = stream[1:] stream = list(stream) possible_entity = "" end_characters = "<&=;" + string.whitespace # Handle number entities if stream and stream[0] == "#": possible_entity = "#" stream.pop(0) if stream and stream[0] in ("x", "X"): allowed = "0123456789abcdefABCDEF" possible_entity += stream.pop(0) else: allowed = "0123456789" # FIXME(willkg): Do we want to make sure these are valid number # entities? This doesn't do that currently. while stream and stream[0] not in end_characters: c = stream.pop(0) if c not in allowed: break possible_entity += c if possible_entity and stream and stream[0] == ";": return possible_entity return None # Handle character entities while stream and stream[0] not in end_characters: c = stream.pop(0) if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity): break possible_entity += c if possible_entity and stream and stream[0] == ";": return possible_entity return None AMP_SPLIT_RE = re.compile("(&)") def next_possible_entity(text): """Takes a text and generates a list of possible entities :arg text: the text to look at :returns: generator where each part (except the first) starts with an "&" """ for i, part in enumerate(AMP_SPLIT_RE.split(text)): if i == 0: yield part elif i % 2 == 0: yield "&" + part class BleachHTMLSerializer(HTMLSerializer): """HTMLSerializer that undoes & -> & in attributes and sets escape_rcdata to True """ # per the HTMLSerializer.__init__ docstring: # # Whether to escape characters that need to be # escaped within normal elements within rcdata elements such as # style. # escape_rcdata = True def escape_base_amp(self, stoken): """Escapes just bare & in HTML attribute values""" # First, undo escaping of &. We need to do this because html5lib's # HTMLSerializer expected the tokenizer to consume all the character # entities and convert them to their respective characters, but the # BleachHTMLTokenizer doesn't do that. For example, this fixes # &entity; back to &entity; . stoken = stoken.replace("&", "&") # However, we do want all bare & that are not marking character # entities to be changed to &, so let's do that carefully here. for part in next_possible_entity(stoken): if not part: continue if part.startswith("&"): entity = match_entity(part) # Only leave entities in that are not ambiguous. If they're # ambiguous, then we escape the ampersand. if entity is not None and convert_entity(entity) is not None: yield "&" + entity + ";" # Length of the entity plus 2--one for & at the beginning # and one for ; at the end part = part[len(entity) + 2 :] if part: yield part continue yield part.replace("&", "&") def serialize(self, treewalker, encoding=None): """Wrap HTMLSerializer.serialize and conver & to & in attribute values Note that this converts & to & in attribute values where the & isn't already part of an unambiguous character entity. """ in_tag = False after_equals = False for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding): if in_tag: if stoken == ">": in_tag = False elif after_equals: if stoken != '"': for part in self.escape_base_amp(stoken): yield part after_equals = False continue elif stoken == "=": after_equals = True yield stoken else: if stoken.startswith("<"): in_tag = True yield stoken