diff env/lib/python3.7/site-packages/bleach/html5lib_shim.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
| | |
|---|---|
| author | shellac |
| date | Mon, 01 Jun 2020 08:59:25 -0400 |
| parents | 79f47841a781 |
| children | |
```diff
--- a/env/lib/python3.7/site-packages/bleach/html5lib_shim.py	Thu May 14 16:47:39 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,610 +0,0 @@
-# flake8: noqa
-"""
-Shim module between Bleach and html5lib. This makes it easier to upgrade the
-html5lib library without having to change a lot of code.
-"""
-
-from __future__ import unicode_literals
-
-import re
-import string
-
-import six
-
-from bleach._vendor.html5lib import (
-    HTMLParser,
-    getTreeWalker,
-)
-from bleach._vendor.html5lib import constants
-from bleach._vendor.html5lib.constants import (
-    namespaces,
-    prefixes,
-)
-from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
-from bleach._vendor.html5lib.filters.base import Filter
-from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
-from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
-from bleach._vendor.html5lib._inputstream import HTMLInputStream
-from bleach._vendor.html5lib.serializer import HTMLSerializer
-from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
-from bleach._vendor.html5lib._trie import Trie
-
-
-#: Map of entity name to expanded entity
-ENTITIES = constants.entities
-
-#: Trie of html entity string -> character representation
-ENTITIES_TRIE = Trie(ENTITIES)
-
-#: Token type constants--these never change
-TAG_TOKEN_TYPES = {
-    constants.tokenTypes['StartTag'],
-    constants.tokenTypes['EndTag'],
-    constants.tokenTypes['EmptyTag']
-}
-CHARACTERS_TYPE = constants.tokenTypes['Characters']
-PARSEERROR_TYPE = constants.tokenTypes['ParseError']
-
-
-#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
-#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
-HTML_TAGS = [
-    'a',
-    'abbr',
-    'address',
-    'area',
-    'article',
-    'aside',
-    'audio',
-    'b',
-    'base',
-    'bdi',
-    'bdo',
-    'blockquote',
-    'body',
-    'br',
-    'button',
-    'canvas',
-    'caption',
-    'cite',
-    'code',
-    'col',
-    'colgroup',
-    'data',
-    'datalist',
-    'dd',
-    'del',
-    'details',
-    'dfn',
-    'dialog',
-    'div',
-    'dl',
-    'dt',
-    'em',
-    'embed',
-    'fieldset',
-    'figcaption',
-    'figure',
-    'footer',
-    'form',
-    'h1',
-    'h2',
-    'h3',
-    'h4',
-    'h5',
-    'h6',
-    'head',
-    'header',
-    'hgroup',
-    'hr',
-    'html',
-    'i',
-    'iframe',
-    'img',
-    'input',
-    'ins',
-    'kbd',
-    'keygen',
-    'label',
-    'legend',
-    'li',
-    'link',
-    'map',
-    'mark',
-    'menu',
-    'meta',
-    'meter',
-    'nav',
-    'noscript',
-    'object',
-    'ol',
-    'optgroup',
-    'option',
-    'output',
-    'p',
-    'param',
-    'picture',
-    'pre',
-    'progress',
-    'q',
-    'rp',
-    'rt',
-    'ruby',
-    's',
-    'samp',
-    'script',
-    'section',
-    'select',
-    'slot',
-    'small',
-    'source',
-    'span',
-    'strong',
-    'style',
-    'sub',
-    'summary',
-    'sup',
-    'table',
-    'tbody',
-    'td',
-    'template',
-    'textarea',
-    'tfoot',
-    'th',
-    'thead',
-    'time',
-    'title',
-    'tr',
-    'track',
-    'u',
-    'ul',
-    'var',
-    'video',
-    'wbr',
-]
-
-
-class InputStreamWithMemory(object):
-    """Wraps an HTMLInputStream to remember characters since last <
-
-    This wraps existing HTMLInputStream classes to keep track of the stream
-    since the last < which marked an open tag state.
- - """ - def __init__(self, inner_stream): - self._inner_stream = inner_stream - self.reset = self._inner_stream.reset - self.position = self._inner_stream.position - self._buffer = [] - - @property - def errors(self): - return self._inner_stream.errors - - @property - def charEncoding(self): - return self._inner_stream.charEncoding - - @property - def changeEncoding(self): - return self._inner_stream.changeEncoding - - def char(self): - c = self._inner_stream.char() - # char() can return None if EOF, so ignore that - if c: - self._buffer.append(c) - return c - - def charsUntil(self, characters, opposite=False): - chars = self._inner_stream.charsUntil(characters, opposite=opposite) - self._buffer.extend(list(chars)) - return chars - - def unget(self, char): - if self._buffer: - self._buffer.pop(-1) - return self._inner_stream.unget(char) - - def get_tag(self): - """Returns the stream history since last '<' - - Since the buffer starts at the last '<' as as seen by tagOpenState(), - we know that everything from that point to when this method is called - is the "tag" that is being tokenized. - - """ - return six.text_type('').join(self._buffer) - - def start_tag(self): - """Resets stream history to just '<' - - This gets called by tagOpenState() which marks a '<' that denotes an - open tag. Any time we see that, we reset the buffer. - - """ - self._buffer = ['<'] - - -class BleachHTMLTokenizer(HTMLTokenizer): - """Tokenizer that doesn't consume character entities""" - def __init__(self, consume_entities=False, **kwargs): - super(BleachHTMLTokenizer, self).__init__(**kwargs) - - self.consume_entities = consume_entities - - # Wrap the stream with one that remembers the history - self.stream = InputStreamWithMemory(self.stream) - - def __iter__(self): - last_error_token = None - - for token in super(BleachHTMLTokenizer, self).__iter__(): - if last_error_token is not None: - if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and - token['type'] in TAG_TOKEN_TYPES and - token.get('data'))): - # Remove attribute names that have ', " or < in them - # because those characters are invalid for attribute names. - token['data'] = [ - item for item in token['data'] - if ('"' not in item[0] and - "'" not in item[0] and - '<' not in item[0]) - ] - last_error_token = None - yield token - - elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and - self.parser.tags is not None and - token['data'].lower().strip() not in self.parser.tags)): - # We've got either a malformed tag or a pseudo-tag or - # something that html5lib wants to turn into a malformed - # comment which Bleach clean() will drop so we interfere - # with the token stream to handle it more correctly. - # - # If this is an allowed tag, it's malformed and we just let - # the html5lib parser deal with it--we don't enter into this - # block. - # - # If this is not an allowed tag, then we convert it to - # characters and it'll get escaped in the sanitizer. - token['data'] = self.stream.get_tag() - token['type'] = CHARACTERS_TYPE - - last_error_token = None - yield token - - elif token['type'] == PARSEERROR_TYPE: - # If the token is a parse error, then let the last_error_token - # go, and make token the new last_error_token - yield last_error_token - last_error_token = token - - else: - yield last_error_token - yield token - last_error_token = None - - continue - - # If the token is a ParseError, we hold on to it so we can get the - # next token and potentially fix it. 
-            if token['type'] == PARSEERROR_TYPE:
-                last_error_token = token
-                continue
-
-            yield token
-
-        if last_error_token:
-            yield last_error_token
-
-    def consumeEntity(self, allowedChar=None, fromAttribute=False):
-        # If this tokenizer is set to consume entities, then we can let the
-        # superclass do its thing.
-        if self.consume_entities:
-            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)
-
-        # If this tokenizer is set to not consume entities, then we don't want
-        # to consume and convert them, so this overrides the html5lib tokenizer's
-        # consumeEntity so that it's now a no-op.
-        #
-        # However, when that gets called, it's consumed an &, so we put that back in
-        # the stream.
-        if fromAttribute:
-            self.currentToken['data'][-1][1] += '&'
-
-        else:
-            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})
-
-    def tagOpenState(self):
-        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
-        # or ParseError. In all cases, we want to drop any stream history
-        # we've collected so far and we do that by calling start_tag() on
-        # the input stream wrapper.
-        self.stream.start_tag()
-        return super(BleachHTMLTokenizer, self).tagOpenState()
-
-    def emitCurrentToken(self):
-        token = self.currentToken
-
-        if ((self.parser.tags is not None and
-             token['type'] in TAG_TOKEN_TYPES and
-             token['name'].lower() not in self.parser.tags)):
-            # If this is a start/end/empty tag for a tag that's not in our
-            # allowed list, then it gets stripped or escaped. In both of these
-            # cases it gets converted to a Characters token.
-            if self.parser.strip:
-                # If we're stripping the token, we just throw in an empty
-                # string token.
-                new_data = ''
-
-            else:
-                # If we're escaping the token, we want to escape the exact
-                # original string. Since tokenizing also normalizes data
-                # and this is a tag-like thing, we've lost some information.
-                # So we go back through the stream to get the original
-                # string and use that.
-                new_data = self.stream.get_tag()
-
-            new_token = {
-                'type': CHARACTERS_TYPE,
-                'data': new_data
-            }
-
-            self.currentToken = new_token
-            self.tokenQueue.append(new_token)
-            self.state = self.dataState
-            return
-
-        super(BleachHTMLTokenizer, self).emitCurrentToken()
-
-
-class BleachHTMLParser(HTMLParser):
-    """Parser that uses BleachHTMLTokenizer"""
-    def __init__(self, tags, strip, consume_entities, **kwargs):
-        """
-        :arg tags: list of allowed tags--everything else is either stripped or
-            escaped; if None, then this doesn't look at tags at all
-        :arg strip: whether to strip disallowed tags (True) or escape them (False);
-            if tags=None, then this doesn't have any effect
-        :arg consume_entities: whether to consume entities (default behavior) or
-            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)
-
-        """
-        self.tags = [tag.lower() for tag in tags] if tags is not None else None
-        self.strip = strip
-        self.consume_entities = consume_entities
-        super(BleachHTMLParser, self).__init__(**kwargs)
-
-    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
-        # set scripting=True to parse <noscript> as though JS is enabled to
-        # match the expected context in browsers
-        #
-        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
-        #
-        # Override HTMLParser so we can swap out the tokenizer for our own.
-        self.innerHTMLMode = innerHTML
-        self.container = container
-        self.scripting = scripting
-        self.tokenizer = BleachHTMLTokenizer(
-            stream=stream,
-            consume_entities=self.consume_entities,
-            parser=self,
-            **kwargs
-        )
-        self.reset()
-
-        try:
-            self.mainLoop()
-        except ReparseException:
-            self.reset()
-            self.mainLoop()
-
-
-def convert_entity(value):
-    """Convert an entity (minus the & and ; part) into what it represents
-
-    This handles numeric, hex, and text entities.
-
-    :arg value: the string (minus the ``&`` and ``;`` part) to convert
-
-    :returns: unicode character or None if it's an ambiguous ampersand that
-        doesn't match a character entity
-
-    """
-    if value[0] == '#':
-        if value[1] in ('x', 'X'):
-            return six.unichr(int(value[2:], 16))
-        return six.unichr(int(value[1:], 10))
-
-    return ENTITIES.get(value, None)
-
-
-def convert_entities(text):
-    """Converts all found entities in the text
-
-    :arg text: the text to convert entities in
-
-    :returns: unicode text with converted entities
-
-    """
-    if '&' not in text:
-        return text
-
-    new_text = []
-    for part in next_possible_entity(text):
-        if not part:
-            continue
-
-        if part.startswith('&'):
-            entity = match_entity(part)
-            if entity is not None:
-                converted = convert_entity(entity)
-
-                # If it's not an ambiguous ampersand, then replace with the
-                # unicode character. Otherwise, we leave the entity in.
-                if converted is not None:
-                    new_text.append(converted)
-                    remainder = part[len(entity) + 2:]
-                    if part:
-                        new_text.append(remainder)
-                    continue
-
-        new_text.append(part)
-
-    return ''.join(new_text)
-
-
-def match_entity(stream):
-    """Returns first entity in stream or None if no entity exists
-
-    Note: For Bleach purposes, entities must start with a "&" and end with
-    a ";". This ignores ambiguous character entities that have no ";" at the
-    end.
-
-    :arg stream: the character stream
-
-    :returns: ``None`` or the entity string without "&" or ";"
-
-    """
-    # Nix the & at the beginning
-    if stream[0] != '&':
-        raise ValueError('Stream should begin with "&"')
-
-    stream = stream[1:]
-
-    stream = list(stream)
-    possible_entity = ''
-    end_characters = '<&=;' + string.whitespace
-
-    # Handle number entities
-    if stream and stream[0] == '#':
-        possible_entity = '#'
-        stream.pop(0)
-
-        if stream and stream[0] in ('x', 'X'):
-            allowed = '0123456789abcdefABCDEF'
-            possible_entity += stream.pop(0)
-        else:
-            allowed = '0123456789'
-
-        # FIXME(willkg): Do we want to make sure these are valid number
-        # entities? This doesn't do that currently.
-        while stream and stream[0] not in end_characters:
-            c = stream.pop(0)
-            if c not in allowed:
-                break
-            possible_entity += c
-
-        if possible_entity and stream and stream[0] == ';':
-            return possible_entity
-        return None
-
-    # Handle character entities
-    while stream and stream[0] not in end_characters:
-        c = stream.pop(0)
-        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
-            break
-        possible_entity += c
-
-    if possible_entity and stream and stream[0] == ';':
-        return possible_entity
-
-    return None
-
-
-AMP_SPLIT_RE = re.compile('(&)')
-
-
-def next_possible_entity(text):
-    """Takes a text and generates a list of possible entities
-
-    :arg text: the text to look at
-
-    :returns: generator where each part (except the first) starts with an
-        "&"
-
-    """
-    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
-        if i == 0:
-            yield part
-        elif i % 2 == 0:
-            yield '&' + part
-
-
-class BleachHTMLSerializer(HTMLSerializer):
-    """HTMLSerializer that undoes & -> &amp; in attributes and sets
-    escape_rcdata to True
-    """
-
-    # per the HTMLSerializer.__init__ docstring:
-    #
-    # Whether to escape characters that need to be
-    # escaped within normal elements within rcdata elements such as
-    # style.
-    #
-    escape_rcdata = True
-
-    def escape_base_amp(self, stoken):
-        """Escapes just bare & in HTML attribute values"""
-        # First, undo escaping of &. We need to do this because html5lib's
-        # HTMLSerializer expected the tokenizer to consume all the character
-        # entities and convert them to their respective characters, but the
-        # BleachHTMLTokenizer doesn't do that. For example, this fixes
-        # &amp;entity; back to &entity;.
-        stoken = stoken.replace('&amp;', '&')
-
-        # However, we do want all bare & that are not marking character
-        # entities to be changed to &amp;, so let's do that carefully here.
-        for part in next_possible_entity(stoken):
-            if not part:
-                continue
-
-            if part.startswith('&'):
-                entity = match_entity(part)
-                # Only leave entities in that are not ambiguous. If they're
-                # ambiguous, then we escape the ampersand.
-                if entity is not None and convert_entity(entity) is not None:
-                    yield '&' + entity + ';'
-
-                    # Length of the entity plus 2--one for & at the beginning
-                    # and one for ; at the end
-                    part = part[len(entity) + 2:]
-                    if part:
-                        yield part
-                    continue
-
-            yield part.replace('&', '&amp;')
-
-    def serialize(self, treewalker, encoding=None):
-        """Wraps HTMLSerializer.serialize and converts & to &amp; in attribute values
-
-        Note that this converts & to &amp; in attribute values where the & isn't
-        already part of an unambiguous character entity.
-
-        """
-        in_tag = False
-        after_equals = False
-
-        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
-            if in_tag:
-                if stoken == '>':
-                    in_tag = False
-
-                elif after_equals:
-                    if stoken != '"':
-                        for part in self.escape_base_amp(stoken):
-                            yield part
-
-                        after_equals = False
-                        continue
-
-                elif stoken == '=':
-                    after_equals = True
-
-                yield stoken
-            else:
-                if stoken.startswith('<'):
-                    in_tag = True
-                yield stoken
```
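
The deleted shim above is bleach's compatibility layer over its vendored html5lib. Its entity helpers have a deliberate quirk worth calling out: `match_entity` only recognizes entities terminated by `;`, so ambiguous ampersands pass through `convert_entities` untouched. A minimal sketch of that behavior, assuming a bleach 3.x install where the module is importable as `bleach.html5lib_shim`:

```python
from bleach import html5lib_shim

# Named, numeric, and hex entities convert once they are ';'-terminated.
assert html5lib_shim.convert_entities('&amp;') == '&'
assert html5lib_shim.convert_entities('&#97;&#x62;') == 'ab'

# An ambiguous ampersand (no trailing ';') is left alone.
assert html5lib_shim.convert_entities('AT&T') == 'AT&T'
assert html5lib_shim.match_entity('&amp stuff') is None

# match_entity() returns the entity minus the '&' and ';'.
assert html5lib_shim.match_entity('&amp; stuff') == 'amp'
```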
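`next_possible_entity` is likewise easiest to read by example: it splits on `&` and re-attaches the ampersand to every chunk after the first, so each yielded part after the first is a candidate entity start:

```python
from bleach import html5lib_shim

parts = list(html5lib_shim.next_possible_entity('this &amp; that &x'))
# parts == ['this ', '&amp; that ', '&x']
```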
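Finally, a rough sketch of how `BleachHTMLParser` and `BleachHTMLSerializer` compose into a `clean()`-style pipeline. This mirrors the general shape of what `bleach.sanitizer.Cleaner` does, but the serializer options below are illustrative, not bleach's exact configuration:

```python
from bleach import html5lib_shim

# Parse untrusted markup, allowing only <b>. consume_entities=False keeps
# entity text in the token stream instead of decoding it while tokenizing.
parser = html5lib_shim.BleachHTMLParser(
    tags=['b'],
    strip=False,              # escape disallowed tags rather than drop them
    consume_entities=False,
    namespaceHTMLElements=False,
)
dom = parser.parseFragment('<b>bold</b> <i>nope</i>')

# Walk the tree and serialize; BleachHTMLSerializer.escape_base_amp()
# re-escapes bare '&' in attribute values on the way out.
walker = html5lib_shim.getTreeWalker('etree')
serializer = html5lib_shim.BleachHTMLSerializer(
    quote_attr_values='always',   # illustrative; bleach sets more options
    omit_optional_tags=False,
    resolve_entities=False,
)
print(serializer.render(walker(dom)))
# Expected, roughly: <b>bold</b> &lt;i&gt;nope&lt;/i&gt;
```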
