sam_consensus_v3: env/lib/python3.9/site-packages/bleach/_vendor/html5lib/serializer.py comparison

comparison env/lib/python3.9/site-packages/bleach/_vendor/html5lib/serializer.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"

author	shellac
date	Mon, 22 Mar 2021 18:12:50 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:4f3585e2f14b
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+import re
+from codecs import register_error, xmlcharrefreplace_errors
+from .constants import voidElements, booleanAttributes, spaceCharacters
+from .constants import rcdataElements, entities, xmlEntities
+from . import treewalkers, _utils
+from xml.sax.saxutils import escape
# Characters whose presence in an attribute value triggers quoting in "spec"
# mode: every member of ``spaceCharacters`` plus quote marks, "=", "<", ">"
# and the backtick.
_quoteAttributeSpecChars = "%s\"'=<>`" % "".join(spaceCharacters)
_quoteAttributeSpec = re.compile("[%s]" % _quoteAttributeSpecChars)

# Character class used in "legacy" mode: the spec characters extended with
# the C0 control characters, "/", and an assortment of non-ASCII
# space/separator code points.
_quoteAttributeLegacy = re.compile(
    "[%s%s]" % (
        _quoteAttributeSpecChars,
        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
        "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
        "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
        "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
        "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
        "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
        "\u3000",
    )
)
# Lookup table from a code point (int) to the name of an entity that
# represents it; consulted when a character cannot be encoded directly.
_encode_entity_map = {}

# True when a character outside the BMP occupies a single code unit.
_is_ucs4 = len("\U0010FFFF") == 1

for _entity_name, _entity_text in entities.items():
    # skip multi-character entities (on narrow builds a lone astral
    # character is stored as a two-unit surrogate pair, so allow length 2)
    if len(_entity_text) > (1 if _is_ucs4 else 2):
        continue
    # the ampersand itself never gets an entry in the map
    if _entity_text == "&":
        continue
    if len(_entity_text) == 2:
        _entity_codepoint = _utils.surrogatePairToCodepoint(_entity_text)
    else:
        _entity_codepoint = ord(_entity_text)
    # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
    if _entity_codepoint not in _encode_entity_map or _entity_name.islower():
        _encode_entity_map[_entity_codepoint] = _entity_name
def htmlentityreplace_errors(exc):
    """Codec error handler that substitutes HTML entity references.

    For encode/translate errors, every character in the failing range
    ``exc.object[exc.start:exc.end]`` is replaced by a named entity when
    ``_encode_entity_map`` has one for its code point, otherwise by a
    hexadecimal numeric character reference. Surrogate pairs are combined
    into a single code point first.

    :arg exc: the Unicode error passed in by the codec machinery

    :returns: a ``(replacement_text, resume_position)`` tuple; any other
        kind of error is delegated to ``xmlcharrefreplace_errors``
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    source = exc.object
    pieces = []
    low_surrogate_pending = False
    for offset, char in enumerate(source[exc.start:exc.end]):
        if low_surrogate_pending:
            # second half of a pair that was already consumed
            low_surrogate_pending = False
            continue
        position = exc.start + offset
        # never look past the end of the failing range when probing for a pair
        if _utils.isSurrogatePair(source[position:min([exc.end, position + 2])]):
            codepoint = _utils.surrogatePairToCodepoint(source[position:position + 2])
            low_surrogate_pending = True
        else:
            codepoint = ord(char)

        entity_name = _encode_entity_map.get(codepoint)
        if entity_name:
            pieces.append("&")
            pieces.append(entity_name)
            if not entity_name.endswith(";"):
                pieces.append(";")
        else:
            pieces.append("&#x%s;" % (hex(codepoint)[2:]))
    return ("".join(pieces), exc.end)


register_error("htmlentityreplace", htmlentityreplace_errors)
def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serialize a tree through a treewalker with a fresh HTMLSerializer

    Convenience wrapper: looks up the treewalker named by ``tree``, builds an
    :py:class:`html5lib.serializer.HTMLSerializer` from ``serializer_opts``
    and renders the walked input.

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: whatever ``HTMLSerializer.render`` returns for the walked input

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker_factory = treewalkers.getTreeWalker(tree)
    return HTMLSerializer(**serializer_opts).render(walker_factory(input), encoding)
class HTMLSerializer(object):
    """Turns a treewalker token stream into HTML text (or encoded bytes).

    Behaviour is controlled by the class-level option attributes below; each
    name listed in ``options`` may be overridden per instance through a
    keyword argument to ``__init__``.
    """

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # names accepted as keyword arguments by __init__
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.
            Passing this option turns ``use_best_quote_char`` off unless that
            option is passed explicitly as well.

            Defaults to ``"``; with ``use_best_quote_char`` left on, single
            quotes are used instead when the value contains a double quote
            but no single quote.

        :arg use_best_quote_char: Whether to pick the quote character that
            requires no escaping inside the attribute value when one exists.

            Defaults to ``True``.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute name, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        :raises TypeError: if a keyword argument is not listed in ``options``

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if unexpected_args:
            # sorted() so the argument named in the message is deterministic
            raise TypeError("__init__() got an unexpected keyword argument '%s'"
                            % sorted(unexpected_args)[0])
        if 'quote_char' in kwargs:
            # An explicit quote character wins over automatic selection; the
            # loop below still lets an explicit use_best_quote_char override.
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` leniently for output.

        With an encoding set, unencodable characters go through the
        ``"htmlentityreplace"`` error handler; with no encoding the text is
        returned unchanged.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        return string

    def encodeStrict(self, string):
        """Encode ``string`` for output using the ``"strict"`` error handler.

        With no encoding set the text is returned unchanged.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        return string

    def serialize(self, treewalker, encoding=None):
        """Generate the serialized output piece by piece.

        This is a generator: nothing happens (including resetting
        ``self.errors``) until it is iterated.

        :arg treewalker: iterable of token dicts, each with a ``"type"`` key

        :arg encoding: output encoding; when falsy the pieces are text,
            otherwise each piece is encoded with it

        :returns: generator of output fragments

        :raises ValueError: if ``quote_attr_values`` is not one of
            ``"always"``, ``"spec"`` or ``"legacy"`` when an attribute value
            has to be written

        :raises SerializeError: for any reported problem when ``self.strict``
            is true; otherwise problems are collected in ``self.errors``
        """
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    # Quote the system identifier with whichever quote
                    # character it does not itself contain.
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifier contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and rcdata content are written unescaped.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # A minimized boolean attribute is written as its bare
                    # name; everything else gets "=" and a value.
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Pick the quote the value does not contain,
                                # when exactly one kind is present.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % data)

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                recognized = key in entities
                if not recognized:
                    self.serializeError("Entity %s not recognized" % name)
                # Only resolve entities that are actually known: in non-strict
                # mode an unknown name has just been recorded as an error and
                # is written back out as a reference instead of raising
                # KeyError on the lookup.
                if recognized and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                # Tokens of any other type produce no output. Report their
                # "data" payload when present; otherwise report the unknown
                # type rather than failing with KeyError.
                self.serializeError(
                    token.get("data", "Unknown token type: %s" % token_type))

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree; bytes when ``encoding`` is given,
            text otherwise

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization problem and, in strict mode, raise it.

        :arg data: description of the problem; appended to ``self.errors``

        :raises SerializeError: carrying ``data`` when ``self.strict`` is true
        """
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError(data)
class SerializeError(Exception):
    """Error in serialized tree

    Raised by ``HTMLSerializer.serializeError`` when the serializer's
    ``strict`` flag is set.
    """

Mercurial > repos > shellac > sam_consensus_v3

comparison env/lib/python3.9/site-packages/bleach/_vendor/html5lib/serializer.py @ 0:4f3585e2f14b draft default tip