diff env/lib/python3.9/site-packages/bleach/_vendor/html5lib/treewalkers/base.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/env/lib/python3.9/site-packages/bleach/_vendor/html5lib/treewalkers/base.py	Mon Mar 22 18:12:50 2021 +0000
@@ -0,0 +1,252 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.dom import Node
+from ..constants import namespaces, voidElements, spaceCharacters
+
+__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
+           "TreeWalker", "NonRecursiveTreeWalker"]
+
+DOCUMENT = Node.DOCUMENT_NODE
+DOCTYPE = Node.DOCUMENT_TYPE_NODE
+TEXT = Node.TEXT_NODE
+ELEMENT = Node.ELEMENT_NODE
+COMMENT = Node.COMMENT_NODE
+ENTITY = Node.ENTITY_NODE
+UNKNOWN = "<#UNKNOWN#>"
+
+spaceCharacters = "".join(spaceCharacters)
+
+
+class TreeWalker(object):
+    """Walks a tree yielding tokens
+
+    Tokens are dicts that all have a ``type`` field specifying the type of the
+    token.
+
+    """
+    def __init__(self, tree):
+        """Creates a TreeWalker
+
+        :arg tree: the tree to walk
+
+        """
+        self.tree = tree
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    def error(self, msg):
+        """Generates an error token with the given message
+
+        :arg msg: the error message
+
+        :returns: SerializeError token
+
+        """
+        return {"type": "SerializeError", "data": msg}
+
+    def emptyTag(self, namespace, name, attrs, hasChildren=False):
+        """Generates an EmptyTag token
+
+        :arg namespace: the namespace of the token--can be ``None``
+
+        :arg name: the name of the element
+
+        :arg attrs: the attributes of the element as a dict
+
+        :arg hasChildren: whether or not to yield a SerializationError because
+            this tag shouldn't have children
+
+        :returns: EmptyTag token
+
+        """
+        yield {"type": "EmptyTag", "name": name,
+               "namespace": namespace,
+               "data": attrs}
+        if hasChildren:
+            yield self.error("Void element has children")
+
+    def startTag(self, namespace, name, attrs):
+        """Generates a StartTag token
+
+        :arg namespace: the namespace of the token--can be ``None``
+
+        :arg name: the name of the element
+
+        :arg attrs: the attributes of the element as a dict
+
+        :returns: StartTag token
+
+        """
+        return {"type": "StartTag",
+                "name": name,
+                "namespace": namespace,
+                "data": attrs}
+
+    def endTag(self, namespace, name):
+        """Generates an EndTag token
+
+        :arg namespace: the namespace of the token--can be ``None``
+
+        :arg name: the name of the element
+
+        :returns: EndTag token
+
+        """
+        return {"type": "EndTag",
+                "name": name,
+                "namespace": namespace}
+
+    def text(self, data):
+        """Generates SpaceCharacters and Characters tokens
+
+        Depending on what's in the data, this generates one or more
+        ``SpaceCharacters`` and ``Characters`` tokens.
+
+        For example:
+
+            >>> from html5lib.treewalkers.base import TreeWalker
+            >>> # Give it an empty tree just so it instantiates
+            >>> walker = TreeWalker([])
+            >>> list(walker.text(''))
+            []
+            >>> list(walker.text('  '))
+            [{u'data': '  ', u'type': u'SpaceCharacters'}]
+            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
+            [{u'data': ' ', u'type': u'SpaceCharacters'},
+            {u'data': u'abc', u'type': u'Characters'},
+            {u'data': u' ', u'type': u'SpaceCharacters'}]
+
+        :arg data: the text data
+
+        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
+
+        """
+        data = data
+        middle = data.lstrip(spaceCharacters)
+        left = data[:len(data) - len(middle)]
+        if left:
+            yield {"type": "SpaceCharacters", "data": left}
+        data = middle
+        middle = data.rstrip(spaceCharacters)
+        right = data[len(middle):]
+        if middle:
+            yield {"type": "Characters", "data": middle}
+        if right:
+            yield {"type": "SpaceCharacters", "data": right}
+
+    def comment(self, data):
+        """Generates a Comment token
+
+        :arg data: the comment
+
+        :returns: Comment token
+
+        """
+        return {"type": "Comment", "data": data}
+
+    def doctype(self, name, publicId=None, systemId=None):
+        """Generates a Doctype token
+
+        :arg name:
+
+        :arg publicId:
+
+        :arg systemId:
+
+        :returns: the Doctype token
+
+        """
+        return {"type": "Doctype",
+                "name": name,
+                "publicId": publicId,
+                "systemId": systemId}
+
+    def entity(self, name):
+        """Generates an Entity token
+
+        :arg name: the entity name
+
+        :returns: an Entity token
+
+        """
+        return {"type": "Entity", "name": name}
+
+    def unknown(self, nodeType):
+        """Handles unknown node types"""
+        return self.error("Unknown node type: " + nodeType)
+
+
+class NonRecursiveTreeWalker(TreeWalker):
+    def getNodeDetails(self, node):
+        raise NotImplementedError
+
+    def getFirstChild(self, node):
+        raise NotImplementedError
+
+    def getNextSibling(self, node):
+        raise NotImplementedError
+
+    def getParentNode(self, node):
+        raise NotImplementedError
+
+    def __iter__(self):
+        currentNode = self.tree
+        while currentNode is not None:
+            details = self.getNodeDetails(currentNode)
+            type, details = details[0], details[1:]
+            hasChildren = False
+
+            if type == DOCTYPE:
+                yield self.doctype(*details)
+
+            elif type == TEXT:
+                for token in self.text(*details):
+                    yield token
+
+            elif type == ELEMENT:
+                namespace, name, attributes, hasChildren = details
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    for token in self.emptyTag(namespace, name, attributes,
+                                               hasChildren):
+                        yield token
+                    hasChildren = False
+                else:
+                    yield self.startTag(namespace, name, attributes)
+
+            elif type == COMMENT:
+                yield self.comment(details[0])
+
+            elif type == ENTITY:
+                yield self.entity(details[0])
+
+            elif type == DOCUMENT:
+                hasChildren = True
+
+            else:
+                yield self.unknown(details[0])
+
+            if hasChildren:
+                firstChild = self.getFirstChild(currentNode)
+            else:
+                firstChild = None
+
+            if firstChild is not None:
+                currentNode = firstChild
+            else:
+                while currentNode is not None:
+                    details = self.getNodeDetails(currentNode)
+                    type, details = details[0], details[1:]
+                    if type == ELEMENT:
+                        namespace, name, attributes, hasChildren = details
+                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
+                            yield self.endTag(namespace, name)
+                    if self.tree is currentNode:
+                        currentNode = None
+                        break
+                    nextSibling = self.getNextSibling(currentNode)
+                    if nextSibling is not None:
+                        currentNode = nextSibling
+                        break
+                    else:
+                        currentNode = self.getParentNode(currentNode)