Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/bleach/_vendor/html5lib/treewalkers/base.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 from __future__ import absolute_import, division, unicode_literals | |
2 | |
3 from xml.dom import Node | |
4 from ..constants import namespaces, voidElements, spaceCharacters | |
5 | |
# Names exported by ``from <this module> import *``.
__all__ = [
    "DOCUMENT",
    "DOCTYPE",
    "TEXT",
    "ELEMENT",
    "COMMENT",
    "ENTITY",
    "UNKNOWN",
    "TreeWalker",
    "NonRecursiveTreeWalker",
]

# Node-type tags compared against the first item returned by
# ``getNodeDetails``.  The known kinds reuse the ``xml.dom.Node`` constants.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
# Tag for any node kind that has no dedicated constant above.
UNKNOWN = "<#UNKNOWN#>"

# Join the imported iterable of whitespace characters into a single string so
# that it can be handed directly to ``lstrip`` / ``rstrip``.
spaceCharacters = "".join(spaceCharacters)
18 | |
19 | |
class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    This base class only supplies the token-building helpers; ``__iter__``
    raises ``NotImplementedError`` and must be provided by a subclass.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token, optionally followed by an error token

        This is a generator: iterate over the result to get the tokens.

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to also yield a SerializeError
            token because this tag shouldn't have children

        :returns: generator yielding the EmptyTag token and, when
            ``hasChildren`` is true, a SerializeError token after it

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens: leading whitespace,
        then the remaining text with surrounding whitespace removed, then
        trailing whitespace.  Empty pieces are skipped.

        For example:

        >>> from html5lib.treewalkers.base import TreeWalker
        >>> # Give it an empty tree just so it instantiates
        >>> walker = TreeWalker([])
        >>> list(walker.text(''))
        []
        >>> list(walker.text(' ')) == [
        ...     {'type': 'SpaceCharacters', 'data': ' '}]
        True
        >>> list(walker.text(' abc ')) == [
        ...     {'type': 'SpaceCharacters', 'data': ' '},
        ...     {'type': 'Characters', 'data': 'abc'},
        ...     {'type': 'SpaceCharacters', 'data': ' '}]
        True

        :arg data: the text data

        :returns: generator of zero or more ``SpaceCharacters`` and
            ``Characters`` tokens

        """
        # Whatever lstrip removed is the leading run of space characters.
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        # Split the remainder into the text proper and its trailing spaces.
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: stored under the token's ``name`` key

        :arg publicId: stored under the token's ``publicId`` key

        :arg systemId: stored under the token's ``systemId`` key

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types

        :arg nodeType: the unrecognised node type; any object is accepted
            and is rendered into the message with ``%s``

        :returns: SerializeError token

        """
        # Format rather than concatenate: a non-string node type (for example
        # an integer) would otherwise raise TypeError instead of producing an
        # error token.  The one-element tuple keeps tuple arguments intact.
        return self.error("Unknown node type: %s" % (nodeType,))
178 | |
179 | |
class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses ``self.tree`` with loops, not recursion

    Subclasses supply the four navigation hooks below; ``__iter__`` uses them
    to visit every node and yield the corresponding tokens.

    """

    def getNodeDetails(self, node):
        """Describes ``node`` as a sequence whose first item is its type

        ``__iter__`` interprets the remaining items according to the type:

        * ``DOCTYPE``: passed as positional arguments to ``doctype``
        * ``TEXT``: passed as positional arguments to ``text``
        * ``ELEMENT``: exactly ``namespace, name, attributes, hasChildren``
        * ``COMMENT`` / ``ENTITY``: first item is the comment data / name
        * ``DOCUMENT``: remaining items are ignored
        * anything else: first item is reported through ``unknown``

        Must be implemented by subclasses.

        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Returns the first child of ``node`` or ``None``

        Must be implemented by subclasses.

        """
        raise NotImplementedError

    def getNextSibling(self, node):
        """Returns the node following ``node`` at the same level or ``None``

        Must be implemented by subclasses.

        """
        raise NotImplementedError

    def getParentNode(self, node):
        """Returns the parent of ``node`` or ``None``

        Must be implemented by subclasses.

        """
        raise NotImplementedError

    def __iter__(self):
        """Yields the tokens for every node reachable from ``self.tree``"""
        node = self.tree
        while node is not None:
            info = self.getNodeDetails(node)
            kind = info[0]
            info = info[1:]
            # Only elements that report children, and documents, are entered.
            descend = False

            if kind == DOCTYPE:
                yield self.doctype(*info)

            elif kind == TEXT:
                for tok in self.text(*info):
                    yield tok

            elif kind == ELEMENT:
                namespace, name, attributes, descend = info
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    # Void HTML element: a single EmptyTag token (plus an
                    # error token if it claims children); never entered.
                    for tok in self.emptyTag(namespace, name, attributes,
                                             descend):
                        yield tok
                    descend = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif kind == COMMENT:
                yield self.comment(info[0])

            elif kind == ENTITY:
                yield self.entity(info[0])

            elif kind == DOCUMENT:
                descend = True

            else:
                yield self.unknown(info[0])

            child = self.getFirstChild(node) if descend else None
            if child is not None:
                node = child
                continue

            # Nothing below this node: close it, then move to the next
            # sibling, climbing (and closing ancestors) until one is found
            # or the root of the walk has been closed.
            while node is not None:
                info = self.getNodeDetails(node)
                kind = info[0]
                info = info[1:]
                if kind == ELEMENT:
                    namespace, name, _attributes, _hasChildren = info
                    isVoid = (not namespace or namespace == namespaces["html"]) and name in voidElements
                    # Void HTML elements were emitted as EmptyTag and get no
                    # EndTag.
                    if not isVoid:
                        yield self.endTag(namespace, name)
                if node is self.tree:
                    # Back at the starting node: the walk is complete.
                    node = None
                    break
                sibling = self.getNextSibling(node)
                if sibling is not None:
                    node = sibling
                    break
                node = self.getParentNode(node)