comparison env/lib/python3.9/site-packages/bleach/_vendor/html5lib/treewalkers/base.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 from __future__ import absolute_import, division, unicode_literals
2
3 from xml.dom import Node
4 from ..constants import namespaces, voidElements, spaceCharacters
5
# Public names exported by ``from ... import *``.
__all__ = [
    "DOCUMENT",
    "DOCTYPE",
    "TEXT",
    "ELEMENT",
    "COMMENT",
    "ENTITY",
    "UNKNOWN",
    "TreeWalker",
    "NonRecursiveTreeWalker",
]

# Node-type tags used by the tree walkers; the first six are aliases of the
# corresponding ``xml.dom.Node`` constants.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
# Sentinel tag for any node that fits none of the categories above.
UNKNOWN = "<#UNKNOWN#>"

# Collapse the imported collection of space characters into a single string
# so it can be handed directly to ``str.lstrip`` / ``str.rstrip``.
spaceCharacters = "".join(spaceCharacters)
18
19
class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    This base class only provides the token-building helpers; subclasses must
    implement ``__iter__`` to actually traverse ``self.tree``.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        """Iterates over the tokens of the tree

        Must be overridden by subclasses.

        :raises NotImplementedError: always, in this base class

        """
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token

        This is a generator: it always yields the EmptyTag token and, when
        ``hasChildren`` is true, follows it with a SerializeError token.

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializeError because
            this tag shouldn't have children

        :yields: the EmptyTag token, optionally followed by a SerializeError
            token

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens: leading space
        characters, then the remaining middle part, then trailing space
        characters. Empty parts are skipped.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text('  '))
            [{u'data': '  ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :yields: zero or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        # Split off the leading run of space characters.
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        # Split the trailing run of space characters off what remains.
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: stored under the token's ``name`` key

        :arg publicId: stored under the token's ``publicId`` key

        :arg systemId: stored under the token's ``systemId`` key

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types

        :arg nodeType: the unrecognized node type; may be any object (for
            example an integer node-type constant), not only a string

        :returns: SerializeError token naming the node type

        """
        # Use formatting rather than ``+`` so a non-string node type (such as
        # an integer constant) does not raise TypeError. The one-element tuple
        # keeps a tuple-valued nodeType from being unpacked as format args.
        return self.error("Unknown node type: %s" % (nodeType,))
178
179
class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses the tree with an explicit loop

    Subclasses supply the four navigation hooks below; ``__iter__`` uses
    them to visit nodes depth-first starting at ``self.tree`` and yields the
    corresponding tokens.

    """
    def getNodeDetails(self, node):
        """Describes ``node``; must be overridden.

        ``__iter__`` expects an indexable result whose first item is a
        node-type tag and whose remaining items are the type-specific
        details.
        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Returns the first child of ``node`` or ``None``; must be overridden."""
        raise NotImplementedError

    def getNextSibling(self, node):
        """Returns the next sibling of ``node`` or ``None``; must be overridden."""
        raise NotImplementedError

    def getParentNode(self, node):
        """Returns the parent of ``node`` or ``None``; must be overridden."""
        raise NotImplementedError

    def __iter__(self):
        """Yields the tokens for ``self.tree`` in document order"""
        node = self.tree
        while node is not None:
            info = self.getNodeDetails(node)
            kind, rest = info[0], info[1:]
            descend = False

            if kind == DOCTYPE:
                yield self.doctype(*rest)

            elif kind == TEXT:
                for tok in self.text(*rest):
                    yield tok

            elif kind == ELEMENT:
                namespace, name, attributes, descend = rest
                inHtml = not namespace or namespace == namespaces["html"]
                if inHtml and name in voidElements:
                    for tok in self.emptyTag(namespace, name, attributes,
                                             descend):
                        yield tok
                    # Void elements are never descended into.
                    descend = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif kind == COMMENT:
                yield self.comment(rest[0])

            elif kind == ENTITY:
                yield self.entity(rest[0])

            elif kind == DOCUMENT:
                descend = True

            else:
                yield self.unknown(rest[0])

            child = self.getFirstChild(node) if descend else None
            if child is not None:
                node = child
                continue

            # No child to visit: close elements while climbing until a
            # sibling is found or the root of the walk has been closed.
            while node is not None:
                info = self.getNodeDetails(node)
                kind, rest = info[0], info[1:]
                if kind == ELEMENT:
                    namespace, name, _attributes, _hasChildren = rest
                    foreign = namespace and namespace != namespaces["html"]
                    if foreign or name not in voidElements:
                        yield self.endTag(namespace, name)
                if self.tree is node:
                    node = None
                    break
                sibling = self.getNextSibling(node)
                if sibling is not None:
                    node = sibling
                    break
                node = self.getParentNode(node)