Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/bleach/_vendor/html5lib/html5parser.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 from __future__ import absolute_import, division, unicode_literals | |
2 from six import with_metaclass, viewkeys | |
3 | |
4 import types | |
5 | |
6 from . import _inputstream | |
7 from . import _tokenizer | |
8 | |
9 from . import treebuilders | |
10 from .treebuilders.base import Marker | |
11 | |
12 from . import _utils | |
13 from .constants import ( | |
14 spaceCharacters, asciiUpper2Lower, | |
15 specialElements, headingElements, cdataElements, rcdataElements, | |
16 tokenTypes, tagTokenTypes, | |
17 namespaces, | |
18 htmlIntegrationPointElements, mathmlTextIntegrationPointElements, | |
19 adjustForeignAttributes as adjustForeignAttributesMap, | |
20 adjustMathMLAttributes, adjustSVGAttributes, | |
21 E, | |
22 _ReparseException | |
23 ) | |
24 | |
25 | |
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML document into a tree using a freshly created parser

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: name of the treebuilder to use when parsing; it is
        resolved through ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :arg kwargs: forwarded unchanged to ``HTMLParser.parse``

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder_cls = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_cls, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, **kwargs)
47 | |
48 | |
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :arg kwargs: forwarded unchanged to ``HTMLParser.parseFragment``

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    builder_cls = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_cls, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parseFragment(doc, container=container, **kwargs)
72 | |
73 | |
def method_decorator_metaclass(function):
    """Build a metaclass that decorates every plain function of a class body

    :arg function: decorator applied to each ``types.FunctionType`` attribute
        found in the class namespace

    :returns: a ``type`` subclass that can be used as a metaclass

    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Compute the decorated replacements first, then write them back
            # into the same namespace dict; attributes that are not plain
            # functions are left untouched.
            decorated = {
                attr_name: function(attr)
                for attr_name, attr in classDict.items()
                if isinstance(attr, types.FunctionType)
            }
            classDict.update(decorated)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
84 | |
85 | |
class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib import treebuilders
        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()                     # generates parser with etree builder
        >>> lxml_builder = treebuilders.getTreeBuilder('lxml')
        >>> parser = HTMLParser(lxml_builder, strict=True)  # lxml builder, strict mode

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        # ``tree`` must be a class (or other callable): it is instantiated here
        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        # One phase instance per phase name, all sharing this parser and tree
        self.phases = {name: cls(self, self.tree) for name, cls in
                       getPhases(debug).items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Tokenize ``stream`` and run the main loop, retrying once on reparse

        :arg stream: input handed to the tokenizer

        :arg innerHTML: True when parsing a fragment rather than a document

        :arg container: name of the fragment's context element

        :arg scripting: stored on the parser and consulted by the phases

        :arg kwargs: forwarded to ``_tokenizer.HTMLTokenizer``

        """
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            # Start over from a clean state; a second _ReparseException is
            # not caught and propagates to the caller.
            self.reset()
            self.mainLoop()

    def reset(self):
        """Return the parser and its tree to the state expected before parsing"""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            self.innerHTML = self.container.lower()

            # NOTE(review): the constant names look crossed relative to the
            # tokenizer states they select (cdataElements -> rcdataState,
            # rcdataElements -> rawtextState); confirm against the constants
            # module before "fixing" either side.
            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Return whether ``element`` is an HTML integration point

        A MathML ``annotation-xml`` element qualifies only when its
        ``encoding`` attribute (compared ASCII case-insensitively) is
        ``text/html`` or ``application/xhtml+xml``; any other element is
        looked up in ``htmlIntegrationPointElements``.

        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether ``element`` is listed as a MathML text integration point"""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token from the tokenizer to the right phase, then handle EOF

        A phase method may return a token, in which case that token is
        processed again (possibly by a different phase) until a phase
        returns ``None``.

        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        # Start tags that are still routed to the foreign-content phase when
        # the current node is a MathML text integration point. Built once
        # instead of once per processed token.
        mathmlForeignStartTags = frozenset(["mglyph", "malignmark"])

        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                tokenType = new_token["type"]

                if tokenType == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # Decide between the current insertion-mode phase and the
                    # foreign-content phase. The name checks read the token
                    # actually being processed (new_token), the same one the
                    # type was taken from.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((tokenType == StartTagToken and
                           new_token["name"] not in mathmlForeignStartTags) or
                          tokenType in (CharactersToken, SpaceCharactersToken))) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         tokenType == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         tokenType in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if tokenType == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif tokenType == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif tokenType == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif tokenType == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif tokenType == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif tokenType == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

            # prev_token is the last token handed to a phase for this
            # tokenizer token; report a self-closing flag nobody acknowledged.
            if (tokenType == StartTagToken and prev_token["selfClosing"] and
                    not prev_token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # A phase asking for reprocessing must have switched phase
                assert self.phase not in phases

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        # innerHTML=False, container=None: whole-document mode
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property if set to None, default to 'div'

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        # innerHTML=True; container/scripting arrive through args/kwargs
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error and, in strict mode, raise ``ParseError``

        :arg errorcode: key identifying the error (looked up in ``E`` when
            strict mode builds the exception message)

        :arg datavars: mapping used to format the message; defaults to ``{}``

        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def adjustMathMLAttributes(self, token):
        """Apply the MathML attribute adjustment table to ``token``"""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Apply the SVG attribute adjustment table to ``token``"""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Apply the foreign attribute adjustment table to ``token``"""
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing in this class assigns ``self.parser``, so
        # calling this method as written would raise AttributeError. It looks
        # like leftover dead code; confirm there are no callers before
        # removing or repairing it.
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose ``self.phase`` from the stack of open elements

        Walks the open elements from the innermost outwards and picks the
        phase mapped to the first recognised element name; the outermost
        element is treated as the fragment's context element.

        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            # Elements outside the default namespace never pick the mode,
            # except for the outermost (context) element.
            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert ``token``'s element and switch to text-content parsing

        :arg token: the start tag token whose element is inserted

        :arg contentType: ``"RAWTEXT"`` or ``"RCDATA"``; selects the
            tokenizer state

        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        # Remember where to come back to once the text phase is done
        self.originalPhase = self.phase

        self.phase = self.phases["text"]
394 | |
395 | |
396 @_utils.memoize | |
397 def getPhases(debug): | |
def log(function):
    """Logger that records which phase processes each token"""
    # Reverse lookup: token type value -> token type name
    type_names = {type_value: type_name
                  for type_name, type_value in tokenTypes.items()}

    def wrapped(self, *args, **kwargs):
        # Only process* methods that were handed a token are recorded; every
        # call is forwarded to the wrapped function unchanged.
        if args and function.__name__.startswith("process"):
            token = args[0]
            token_type = token['type']
            info = {"type": type_names[token_type]}
            if token_type in tagTokenTypes:
                info["name"] = token['name']

            record = (self.parser.tokenizer.state.__name__,
                      self.parser.phase.__class__.__name__,
                      self.__class__.__name__,
                      function.__name__,
                      info)
            self.parser.log.append(record)
        return function(self, *args, **kwargs)
    return wrapped
418 | |
def getMetaclass(use_metaclass, metaclass_func):
    """Return a decorating metaclass when requested, plain ``type`` otherwise

    :arg use_metaclass: truthy to wrap class functions with ``metaclass_func``

    :arg metaclass_func: decorator handed to ``method_decorator_metaclass``

    """
    return method_decorator_metaclass(metaclass_func) if use_metaclass else type
424 | |
425 # pylint:disable=unused-argument | |
class Phase(with_metaclass(getMetaclass(debug, log))):
    """Base class for helper object that implements each phase of processing

    Subclasses are expected to provide ``startTagHandler`` and
    ``endTagHandler`` lookups when they rely on the default
    ``processStartTag`` / ``processEndTag`` implementations.
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        # Per-instance caches: tag name -> handler
        self.__startTagCache = {}
        self.__endTagCache = {}

    def processEOF(self):
        raise NotImplementedError

    def processComment(self, token):
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        current_node = self.tree.openElements[-1]
        self.tree.insertComment(token, current_node)

    def processDoctype(self, token):
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tag_name = token["name"]
        cache = self.__startTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tag_name not in cache:
            handler = cache[tag_name] = self.startTagHandler[tag_name]
            # bound the cache size in case we get loads of unknown tags
            size_limit = len(self.startTagHandler) * 1.1
            while len(cache) > size_limit:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        else:
            handler = cache[tag_name]
        return handler(token)

    def startTagHtml(self, token):
        if token["name"] == "html" and not self.parser.firstStartTag:
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        # Copy over only the attributes the root element does not have yet
        root_attributes = self.tree.openElements[0].attributes
        for attr, value in token["data"].items():
            if attr not in root_attributes:
                self.tree.openElements[0].attributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tag_name = token["name"]
        cache = self.__endTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tag_name not in cache:
            handler = cache[tag_name] = self.endTagHandler[tag_name]
            # bound the cache size in case we get loads of unknown tags
            size_limit = len(self.endTagHandler) * 1.1
            while len(cache) > size_limit:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        else:
            handler = cache[tag_name]
        return handler(token)
497 | |
class InitialPhase(Phase):
    """Phase active before any doctype has been seen

    Decides the parser's compat mode from the doctype (or its absence) and
    then hands over to the "beforeHtml" phase.
    """
    __slots__ = tuple()

    def processSpaceCharacters(self, token):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        # Public identifier prefixes that force quirks mode
        quirksPrefixes = (
            "+//silmaril//dtd html pro v0r11 19970101//",
            "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
            "-//as//dtd html 3.0 aswedit + extensions//",
            "-//ietf//dtd html 2.0 level 1//",
            "-//ietf//dtd html 2.0 level 2//",
            "-//ietf//dtd html 2.0 strict level 1//",
            "-//ietf//dtd html 2.0 strict level 2//",
            "-//ietf//dtd html 2.0 strict//",
            "-//ietf//dtd html 2.0//",
            "-//ietf//dtd html 2.1e//",
            "-//ietf//dtd html 3.0//",
            "-//ietf//dtd html 3.2 final//",
            "-//ietf//dtd html 3.2//",
            "-//ietf//dtd html 3//",
            "-//ietf//dtd html level 0//",
            "-//ietf//dtd html level 1//",
            "-//ietf//dtd html level 2//",
            "-//ietf//dtd html level 3//",
            "-//ietf//dtd html strict level 0//",
            "-//ietf//dtd html strict level 1//",
            "-//ietf//dtd html strict level 2//",
            "-//ietf//dtd html strict level 3//",
            "-//ietf//dtd html strict//",
            "-//ietf//dtd html//",
            "-//metrius//dtd metrius presentational//",
            "-//microsoft//dtd internet explorer 2.0 html strict//",
            "-//microsoft//dtd internet explorer 2.0 html//",
            "-//microsoft//dtd internet explorer 2.0 tables//",
            "-//microsoft//dtd internet explorer 3.0 html strict//",
            "-//microsoft//dtd internet explorer 3.0 html//",
            "-//microsoft//dtd internet explorer 3.0 tables//",
            "-//netscape comm. corp.//dtd html//",
            "-//netscape comm. corp.//dtd strict html//",
            "-//o'reilly and associates//dtd html 2.0//",
            "-//o'reilly and associates//dtd html extended 1.0//",
            "-//o'reilly and associates//dtd html extended relaxed 1.0//",
            "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
            "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
            "-//spyglass//dtd html 2.0 extended//",
            "-//sq//dtd html 2.0 hotmetal + extensions//",
            "-//sun microsystems corp.//dtd hotjava html//",
            "-//sun microsystems corp.//dtd hotjava strict html//",
            "-//w3c//dtd html 3 1995-03-24//",
            "-//w3c//dtd html 3.2 draft//",
            "-//w3c//dtd html 3.2 final//",
            "-//w3c//dtd html 3.2//",
            "-//w3c//dtd html 3.2s draft//",
            "-//w3c//dtd html 4.0 frameset//",
            "-//w3c//dtd html 4.0 transitional//",
            "-//w3c//dtd html experimental 19960712//",
            "-//w3c//dtd html experimental 970421//",
            "-//w3c//dtd w3 html//",
            "-//w3o//dtd w3 html 3.0//",
            "-//webtechs//dtd mozilla html 2.0//",
            "-//webtechs//dtd mozilla html//",
        )
        # Public identifiers that force quirks mode on an exact match
        quirksExact = (
            "-//w3o//dtd w3 html strict 3.0//en//",
            "-/w3c/dtd html 4.0 transitional/en",
            "html",
        )
        # Quirks without a system identifier, limited quirks with one
        html401Prefixes = (
            "-//w3c//dtd html 4.01 frameset//",
            "-//w3c//dtd html 4.01 transitional//",
        )
        limitedQuirksPrefixes = (
            "-//w3c//dtd xhtml 1.0 frameset//",
            "-//w3c//dtd xhtml 1.0 transitional//",
        )
        ibmSystemId = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

        if (name != "html" or publicId is not None or
                (systemId is not None and systemId != "about:legacy-compat")):
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        # The comparison tables above are lowercase, so fold ASCII case first
        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        isHtml401 = publicId.startswith(html401Prefixes)

        if (not correct or
                token["name"] != "html" or
                publicId.startswith(quirksPrefixes) or
                publicId in quirksExact or
                (isHtml401 and systemId is None) or
                (systemId and systemId.lower() == ibmSystemId)):
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(limitedQuirksPrefixes) or
              (isHtml401 and systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        # No doctype was seen: quirks mode, then move on
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        # Returning the token asks the main loop to reprocess it
        return token

    def processStartTag(self, token):
        tag_info = {"name": token["name"]}
        self.parser.parseError("expected-doctype-but-got-start-tag", tag_info)
        self.anythingElse()
        return token

    def processEndTag(self, token):
        tag_info = {"name": token["name"]}
        self.parser.parseError("expected-doctype-but-got-end-tag", tag_info)
        self.anythingElse()
        return token

    def processEOF(self):
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True
627 | |
class BeforeHtmlPhase(Phase):
    """Phase that creates the root ``html`` element before moving to "beforeHead" """
    __slots__ = tuple()

    # helper methods
    def insertHtmlElement(self):
        root_token = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(root_token)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        tag_name = token["name"]
        if tag_name in ("head", "body", "html", "br"):
            # These end tags imply the root element; reprocess afterwards
            self.insertHtmlElement()
            return token
        # Any other end tag is reported and dropped (nothing to reprocess)
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": tag_name})
664 | |
class BeforeHeadPhase(Phase):
    """Phase between the root element and ``head``; implies ``head`` when needed"""
    __slots__ = tuple()

    def _implyHead(self):
        # Shared by every path that has to open <head> on the author's behalf
        self.startTagHead(impliedTagToken("head", "StartTag"))

    def processEOF(self):
        self._implyHead()
        return True

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self._implyHead()
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        self.tree.insertElement(token)
        # The element just inserted is now the current node
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        self._implyHead()
        return token

    def endTagImplyHead(self, token):
        self._implyHead()
        return token

    def endTagOther(self, token):
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("head", "body", "html", "br"), endTagImplyHead)
    ])
    endTagHandler.default = endTagOther
709 | |
class InHeadPhase(Phase):
    """Phase handling tokens while the ``head`` element is open"""
    __slots__ = tuple()

    # the real thing
    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        # Insert the element and close it again right away
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        # An encoding declared here only matters while the stream's current
        # encoding is still marked "tentative"
        if self.parser.tokenizer.stream.charEncoding[1] != "tentative":
            return

        attributes = token["data"]
        if "charset" in attributes:
            self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
        elif ("content" in attributes and
              "http-equiv" in attributes and
              attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            content_bytes = attributes["content"].encode("utf-8")
            data = _inputstream.EncodingBytes(content_bytes)
            codec = _inputstream.ContentAttrParser(data).parse()
            self.parser.tokenizer.stream.changeEncoding(codec)

    def startTagTitle(self, token):
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        if not self.parser.scripting:
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inHeadNoscript"]
        else:
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        parser = self.parser
        self.tree.insertElement(token)
        parser.tokenizer.state = parser.tokenizer.scriptDataState
        # Remember the phase to return to after the script's text
        parser.originalPhase = parser.phase
        parser.phase = parser.phases["text"]

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHead(self, token):
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "head", "Expected head got %s" % popped.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        # Act as if </head> had been seen
        self.endTagHead(impliedTagToken("head"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("title", startTagTitle),
        (("noframes", "style"), startTagNoFramesStyle),
        ("noscript", startTagNoscript),
        ("script", startTagScript),
        (("base", "basefont", "bgsound", "command", "link"),
         startTagBaseLinkCommand),
        ("meta", startTagMeta),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("head", endTagHead),
        (("br", "html", "body"), endTagHtmlBodyBr)
    ])
    endTagHandler.default = endTagOther
811 | |
class InHeadNoscriptPhase(Phase):
    """Phase handling tokens inside a ``noscript`` element within ``head``"""
    __slots__ = tuple()

    def processEOF(self):
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processComment(token)

    def processCharacters(self, token):
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processSpaceCharacters(token)

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagHeadNoscript(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        tag_info = {"name": token["name"]}
        self.parser.parseError("unexpected-inhead-noscript-tag", tag_info)
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "noscript", "Expected noscript got %s" % popped.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        tag_info = {"name": token["name"]}
        self.parser.parseError("unexpected-inhead-noscript-tag", tag_info)
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        # Caller must raise parse error first!
        # Act as if </noscript> had been seen
        self.endTagNoscript(impliedTagToken("noscript"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
        (("head", "noscript"), startTagHeadNoscript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("noscript", endTagNoscript),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther
874 | |
class AfterHeadPhase(Phase):
    """Token handlers for the phase between the head and the body.

    ``body`` and ``frameset`` start tags open their element and switch
    phase.  A fixed list of head-level start tags is processed by the
    "inHead" phase with the head element temporarily back on the stack.
    Most remaining tokens insert an implied ``body`` and are handed back
    to the caller.
    """
    __slots__ = ()

    def processEOF(self):
        """Insert the implied ``body`` and return True."""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Insert the implied ``body`` and hand the token back."""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBody(self, token):
        """Insert the explicit ``body``, clear framesetOK, go to "inBody"."""
        parser = self.parser
        parser.framesetOK = False
        self.tree.insertElement(token)
        parser.phase = parser.phases["inBody"]

    def startTagFrameset(self, token):
        """Insert the ``frameset`` element and switch to "inFrameset"."""
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Process a head-level start tag that arrived after the head closed.

        The head pointer is pushed back onto the open-element stack, the
        token is processed by the "inHead" phase, and afterwards the
        topmost open element named ``head`` (if any) is removed again.
        """
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        stack = self.tree.openElements
        stack.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        # Re-read the stack attribute: look for the topmost "head" entry.
        stack = self.tree.openElements
        head_node = next(
            (entry for entry in stack[::-1] if entry.name == "head"), None)
        if head_node is not None:
            stack.remove(head_node)

    def startTagHead(self, token):
        """Record a parse error for a second ``head`` start tag."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag", details)

    def startTagOther(self, token):
        """Insert the implied ``body`` and hand the token back."""
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        """Insert the implied ``body`` and hand the end tag token back."""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Record an ``unexpected-end-tag`` parse error for the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def anythingElse(self):
        """Insert an implied ``body`` start tag and switch to "inBody".

        Unlike an explicit ``body`` start tag, this sets framesetOK to True.
        """
        implied_body = impliedTagToken("body", "StartTag")
        self.tree.insertElement(implied_body)
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True

    # Start tags with dedicated handling; everything else -> startTagOther.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"),
         startTagFromHead),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    # End tags with dedicated handling; everything else -> endTagOther.
    endTagHandler = _utils.MethodDispatcher([
        (("body", "html", "br"), endTagHtmlBodyBr),
    ])
    endTagHandler.default = endTagOther
940 | |
941 class InBodyPhase(Phase): | |
942 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody | |
943 # the really-really-really-very crazy mode | |
944 __slots__ = ("processSpaceCharacters",) | |
945 | |
def __init__(self, *args, **kwargs):
    """Initialise the phase and install the default whitespace handler.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super(InBodyPhase, self).__init__(*args, **kwargs)
    # Set this to the default handler
    # (startTagPreListing and startTagTextarea temporarily swap in
    # processSpaceCharactersDropNewline; it restores this default itself.)
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
950 | |
def isMatchingFormattingElement(self, node1, node2):
    """Tell whether two nodes agree on name, namespace and attributes.

    :arg node1: first node to compare
    :arg node2: second node to compare

    :returns: falsy as soon as one of the three comparisons fails,
        otherwise the result of comparing the attributes
    """
    if not (node1.name == node2.name):
        return False
    if not (node1.namespace == node2.namespace):
        return False
    return node1.attributes == node2.attributes
955 | |
956 # helper | |
def addFormattingElement(self, token):
    """Insert the element for ``token`` and register it as active formatting.

    Entries of the active formatting list are scanned from the end back to
    the last ``Marker``.  When three entries already match the new element
    (see ``isMatchingFormattingElement``), the one collected last during
    that backwards scan is removed before the new element is appended.

    :arg token: start tag token of the formatting element
    """
    tree = self.tree
    tree.insertElement(token)
    inserted = tree.openElements[-1]
    active = tree.activeFormattingElements

    duplicates = []
    for candidate in active[::-1]:
        if candidate is Marker:
            break
        if self.isMatchingFormattingElement(candidate, inserted):
            duplicates.append(candidate)

    assert len(duplicates) <= 3
    if len(duplicates) == 3:
        active.remove(duplicates[-1])
    active.append(inserted)
972 | |
973 # the real deal | |
def processEOF(self):
    """Handle end of input while in the body.

    Records a single ``expected-closing-tag-but-got-eof`` parse error when
    any open element has a name outside the allowed set below.  Returns
    nothing.
    """
    may_stay_open = frozenset((
        "body", "dd", "dt", "html", "li", "p",
        "tbody", "td", "tfoot", "th", "thead", "tr",
    ))
    # Scan from the top of the stack; one offending element is enough.
    offending = any(entry.name not in may_stay_open
                    for entry in self.tree.openElements[::-1])
    if offending:
        self.parser.parseError("expected-closing-tag-but-got-eof")
    # Stop parsing
983 | |
def processSpaceCharactersDropNewline(self, token):
    """Insert whitespace, dropping one leading newline where required.

    Sometimes (start of <pre>, <listing>, and <textarea> blocks) we want to
    drop leading newlines.  The handler restores the default whitespace
    handler on every call.  The newline is dropped only when the current
    node is one of those three elements and has no content yet.

    :arg token: space characters token; its ``"data"`` entry is read
    """
    text = token["data"]
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if text.startswith("\n"):
        current = self.tree.openElements[-1]
        if (current.name in ("pre", "listing", "textarea") and
                not current.hasContent()):
            text = text[1:]
    if not text:
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(text)
996 | |
def processCharacters(self, token):
    """Insert character data into the tree.

    A token whose data is exactly one NUL character is ignored.  Otherwise
    the active formatting elements are reconstructed and the text is
    inserted.  If framesetOK is still set and the text contains any
    character outside ``spaceCharacters``, framesetOK is cleared.

    :arg token: characters token; its ``"data"`` entry is read
    """
    data = token["data"]
    if data == "\u0000":
        # The tokenizer should always emit null on its own
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(data)
    # A generator lets any() stop at the first non-space character instead
    # of building a full list of booleans for every text token.
    if (self.parser.framesetOK and
            any(char not in spaceCharacters for char in data)):
        self.parser.framesetOK = False
1008 | |
def processSpaceCharactersNonPre(self, token):
    """Default whitespace handler: insert the whitespace unchanged.

    :arg token: space characters token; its ``"data"`` entry is inserted
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertText(token["data"])
1012 | |
def startTagProcessInHead(self, token):
    """Delegate the start tag to the "inHead" phase and return its result."""
    in_head = self.parser.phases["inHead"]
    return in_head.processStartTag(token)
1015 | |
def startTagBody(self, token):
    """Handle a ``body`` start tag seen while already in the body.

    Always records a parse error.  When the second open element is a
    ``body``, framesetOK is cleared and every attribute of the token that
    the existing body element lacks is copied onto it.  Otherwise the code
    asserts that the parser is in innerHTML mode and does nothing else.

    :arg token: start tag token; its ``"data"`` mapping holds the attributes
    """
    self.parser.parseError("unexpected-start-tag", {"name": "body"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    self.parser.framesetOK = False
    for attr_name, attr_value in token["data"].items():
        # Attributes already present on the body element win.
        if attr_name not in self.tree.openElements[1].attributes:
            self.tree.openElements[1].attributes[attr_name] = attr_value
1026 | |
def startTagFrameset(self, token):
    """Handle a ``frameset`` start tag seen while in the body.

    Always records a parse error.  Without a ``body`` as second open
    element the code only asserts innerHTML mode.  When framesetOK is
    false the token is ignored.  Otherwise the body element is detached
    from its parent, the stack is popped back to ``html``, the frameset is
    inserted and the parser switches to "inFrameset".

    :arg token: the ``frameset`` start tag token
    """
    self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    if not self.parser.framesetOK:
        return

    body = self.tree.openElements[1]
    if body.parent:
        body.parent.removeChild(self.tree.openElements[1])
    while self.tree.openElements[-1].name != "html":
        self.tree.openElements.pop()
    self.tree.insertElement(token)
    self.parser.phase = self.parser.phases["inFrameset"]
1040 | |
def startTagCloseP(self, token):
    """Insert the element, first closing a ``p`` that is in button scope.

    :arg token: start tag token of the element to insert
    """
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
1045 | |
def startTagPreListing(self, token):
    """Open a ``pre``/``listing`` element.

    Closes a ``p`` in button scope, inserts the element, clears framesetOK
    and installs the whitespace handler that drops a leading newline.

    :arg token: start tag token of the element to insert
    """
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    self.parser.framesetOK = False
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1052 | |
def startTagForm(self, token):
    """Open a ``form`` element unless the form pointer is already set.

    With a form pointer set, only a parse error is recorded.  Otherwise a
    ``p`` in button scope is closed, the form is inserted and the form
    pointer is set to the newly inserted element.

    :arg token: the ``form`` start tag token
    """
    if self.tree.formPointer:
        self.parser.parseError("unexpected-start-tag", {"name": "form"})
        return
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    self.tree.formPointer = self.tree.openElements[-1]
1061 | |
def startTagListItem(self, token):
    """Open an ``li``, ``dd`` or ``dt`` element.

    Clears framesetOK, then walks the open elements from the top.  The
    first element whose name is in the stop list for this tag is closed
    with an implied end tag.  The walk also stops at a special element
    other than ``address``, ``div`` or ``p``.  Finally a ``p`` in button
    scope is closed and the new element is inserted.

    :arg token: start tag token; ``"name"`` must be li, dd or dt
    """
    self.parser.framesetOK = False

    closes_by_tag = {"li": ["li"],
                     "dt": ["dt", "dd"],
                     "dd": ["dt", "dd"]}
    names_to_close = closes_by_tag[token["name"]]
    for open_node in reversed(self.tree.openElements):
        if open_node.name in names_to_close:
            implied_end = impliedTagToken(open_node.name, "EndTag")
            self.parser.phase.processEndTag(implied_end)
            break
        blocks_walk = (open_node.nameTuple in specialElements and
                       open_node.name not in ("address", "div", "p"))
        if blocks_walk:
            break

    if self.tree.elementInScope("p", variant="button"):
        self.parser.phase.processEndTag(impliedTagToken("p", "EndTag"))

    self.tree.insertElement(token)
1083 | |
def startTagPlaintext(self, token):
    """Open ``plaintext`` and put the tokenizer into its plaintext state.

    :arg token: the ``plaintext`` start tag token
    """
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.plaintextState
1089 | |
def startTagHeading(self, token):
    """Open a heading element.

    Closes a ``p`` in button scope.  If the current node is itself a
    heading, a parse error is recorded and that node is popped before the
    new heading is inserted.

    :arg token: start tag token of the heading
    """
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    current = self.tree.openElements[-1]
    if current.name in headingElements:
        self.parser.parseError("unexpected-start-tag",
                               {"name": token["name"]})
        self.tree.openElements.pop()
    self.tree.insertElement(token)
1097 | |
def startTagA(self, token):
    """Open an ``a`` element, first closing an ``a`` that is still active.

    When an ``a`` is found in the active formatting elements, a parse
    error is recorded, an implied ``a`` end tag is run through
    ``endTagFormatting``, and the old element is removed from the open
    elements and the active formatting list if still present in either.

    :arg token: the ``a`` start tag token
    """
    stale_anchor = self.tree.elementInActiveFormattingElements("a")
    if stale_anchor:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "a", "endName": "a"})
        self.endTagFormatting(impliedTagToken("a"))
        open_elements = self.tree.openElements
        if stale_anchor in open_elements:
            open_elements.remove(stale_anchor)
        active = self.tree.activeFormattingElements
        if stale_anchor in active:
            active.remove(stale_anchor)
    self.tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1110 | |
def startTagFormatting(self, token):
    """Reconstruct the active formatting elements, then add this one.

    :arg token: start tag token of the formatting element
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1114 | |
def startTagNobr(self, token):
    """Open a ``nobr`` element, closing a ``nobr`` already in scope.

    :arg token: the ``nobr`` start tag token
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    if tree.elementInScope("nobr"):
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "nobr", "endName": "nobr"})
        self.processEndTag(impliedTagToken("nobr"))
        # XXX Need tests that trigger the following
        self.tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1124 | |
def startTagButton(self, token):
    """Open a ``button`` element.

    When a ``button`` is already in scope, a parse error is recorded, an
    implied ``button`` end tag is processed and the token is returned to
    the caller.  Otherwise the element is inserted and framesetOK cleared.

    :arg token: the ``button`` start tag token

    :returns: the token when a button had to be closed first, else None
    """
    if not self.tree.elementInScope("button"):
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        return None
    self.parser.parseError("unexpected-start-tag-implies-end-tag",
                           {"startName": "button", "endName": "button"})
    self.processEndTag(impliedTagToken("button"))
    return token
1135 | |
def startTagAppletMarqueeObject(self, token):
    """Insert the element and push a ``Marker`` onto the formatting list.

    Also clears framesetOK.

    :arg token: start tag token of the element to insert
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.activeFormattingElements.append(Marker)
    self.parser.framesetOK = False
1141 | |
def startTagXmp(self, token):
    """Handle ``xmp``: close a ``p`` in button scope, then parse as RAWTEXT.

    :arg token: the ``xmp`` start tag token
    """
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.framesetOK = False
    parser.parseRCDataRawtext(token, "RAWTEXT")
1148 | |
def startTagTable(self, token):
    """Open a ``table`` and switch to the "inTable" phase.

    Outside quirks mode, a ``p`` in button scope is closed first.

    :arg token: the ``table`` start tag token
    """
    if (self.parser.compatMode != "quirks" and
            self.tree.elementInScope("p", variant="button")):
        self.processEndTag(impliedTagToken("p"))
    self.tree.insertElement(token)
    self.parser.framesetOK = False
    self.parser.phase = self.parser.phases["inTable"]
1156 | |
def startTagVoidFormatting(self, token):
    """Insert an element and pop it from the stack straight away.

    Marks the token's self-closing flag as acknowledged and clears
    framesetOK.

    :arg token: start tag token of the element to insert
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False
1163 | |
def startTagInput(self, token):
    """Insert an ``input`` element like any other void element.

    ``startTagVoidFormatting`` clears framesetOK; for an input whose
    ``type`` attribute is "hidden" (ASCII case-insensitively) the previous
    value is restored afterwards.

    :arg token: the ``input`` start tag token
    """
    previous_frameset_ok = self.parser.framesetOK
    self.startTagVoidFormatting(token)
    attrs = token["data"]
    is_hidden = ("type" in attrs and
                 attrs["type"].translate(asciiUpper2Lower) == "hidden")
    if is_hidden:
        # input type=hidden doesn't change framesetOK
        self.parser.framesetOK = previous_frameset_ok
1171 | |
def startTagParamSource(self, token):
    """Insert the element, pop it at once and acknowledge self-closing.

    :arg token: start tag token of the element to insert
    """
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1176 | |
def startTagHr(self, token):
    """Insert an ``hr`` element after closing a ``p`` in button scope.

    The element is popped immediately, the self-closing flag is
    acknowledged and framesetOK is cleared.

    :arg token: the ``hr`` start tag token
    """
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False
1184 | |
def startTagImage(self, token):
    """Treat an ``image`` start tag as ``img``.

    Records a parse error, then processes an implied ``img`` start tag
    carrying the original attributes and self-closing flag.

    :arg token: the ``image`` start tag token
    """
    # No really...
    self.parser.parseError("unexpected-start-tag-treated-as",
                           {"originalName": "image", "newName": "img"})
    replacement = impliedTagToken("img", "StartTag",
                                  attributes=token["data"],
                                  selfClosing=token["selfClosing"])
    self.processStartTag(replacement)
1192 | |
def startTagIsIndex(self, token):
    """Expand the deprecated ``isindex`` tag into a small form.

    Always records a parse error; does nothing more when a form pointer is
    set.  Otherwise emits, in order: form start (carrying ``action`` if
    given), hr, label start, the prompt text, an input named "isindex"
    carrying the remaining attributes, label end, hr, form end.

    :arg token: the ``isindex`` start tag token
    """
    self.parser.parseError("deprecated-tag", {"name": "isindex"})
    if self.tree.formPointer:
        return

    source_attrs = token["data"]
    if "action" in source_attrs:
        form_attrs = {"action": source_attrs["action"]}
    else:
        form_attrs = {}
    self.processStartTag(impliedTagToken("form", "StartTag",
                                         attributes=form_attrs))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processStartTag(impliedTagToken("label", "StartTag"))

    # XXX Localization ...
    prompt = token["data"].get(
        "prompt", "This is a searchable index. Enter search keywords: ")
    self.processCharacters(
        {"type": tokenTypes["Characters"], "data": prompt})

    # The input receives every attribute except action and prompt.
    input_attrs = token["data"].copy()
    input_attrs.pop("action", None)
    input_attrs.pop("prompt", None)
    input_attrs["name"] = "isindex"
    self.processStartTag(impliedTagToken("input", "StartTag",
                                         attributes=input_attrs,
                                         selfClosing=token["selfClosing"]))
    self.processEndTag(impliedTagToken("label"))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processEndTag(impliedTagToken("form"))
1223 | |
def startTagTextarea(self, token):
    """Open a ``textarea`` and switch the tokenizer to its RCDATA state.

    Also installs the whitespace handler that drops a leading newline and
    clears framesetOK.

    :arg token: the ``textarea`` start tag token
    """
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.rcdataState
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
    self.parser.framesetOK = False
1229 | |
def startTagIFrame(self, token):
    """Clear framesetOK, then handle the ``iframe`` as raw text.

    :arg token: the ``iframe`` start tag token
    """
    parser = self.parser
    parser.framesetOK = False
    self.startTagRawtext(token)
1233 | |
def startTagNoscript(self, token):
    """Handle ``noscript`` as raw text when scripting is on, else normally.

    :arg token: the ``noscript`` start tag token
    """
    if self.parser.scripting:
        handler = self.startTagRawtext
    else:
        handler = self.startTagOther
    handler(token)
1239 | |
def startTagRawtext(self, token):
    """Parse the element's content as RAWTEXT.

    Used for iframe, noembed, noframes and noscript (if scripting is
    enabled).

    :arg token: start tag token of the raw-text element
    """
    parser = self.parser
    parser.parseRCDataRawtext(token, "RAWTEXT")
1243 | |
def startTagOpt(self, token):
    """Open an ``option``/``optgroup``, closing a currently open ``option``.

    :arg token: start tag token of the element to insert
    """
    current = self.tree.openElements[-1]
    if current.name == "option":
        self.parser.phase.processEndTag(impliedTagToken("option"))
    self.tree.reconstructActiveFormattingElements()
    self.parser.tree.insertElement(token)
1249 | |
def startTagSelect(self, token):
    """Open a ``select`` element and pick the matching select phase.

    Clears framesetOK.  The parser moves to "inSelectInTable" when its
    current phase is one of the table-related phases listed below, and to
    "inSelect" otherwise.

    :arg token: the ``select`` start tag token
    """
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertElement(token)
    parser = self.parser
    parser.framesetOK = False

    phases = parser.phases
    table_phases = tuple(phases[key] for key in (
        "inTable", "inCaption", "inColumnGroup",
        "inTableBody", "inRow", "inCell"))
    if parser.phase in table_phases:
        parser.phase = phases["inSelectInTable"]
    else:
        parser.phase = phases["inSelect"]
1263 | |
def startTagRpRt(self, token):
    """Open an ``rp``/``rt`` element.

    With a ``ruby`` element in scope, implied end tags are generated and a
    parse error (without an error code) is recorded if the current node is
    then not ``ruby``.  The new element is inserted in every case.

    :arg token: start tag token of the element to insert
    """
    tree = self.tree
    if tree.elementInScope("ruby"):
        tree.generateImpliedEndTags()
        current = tree.openElements[-1]
        if current.name != "ruby":
            self.parser.parseError()
    tree.insertElement(token)
1270 | |
def startTagMath(self, token):
    """Insert a ``math`` element in the MathML namespace.

    The token's attributes are adjusted for MathML and for foreign
    content before insertion.  A self-closing token is popped again at
    once and its self-closing flag acknowledged.

    :arg token: the ``math`` start tag token
    """
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.adjustMathMLAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["mathml"]
    self.tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    self.tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1282 | |
def startTagSvg(self, token):
    """Insert an ``svg`` element in the SVG namespace.

    The token's attributes are adjusted for SVG and for foreign content
    before insertion.  A self-closing token is popped again at once and
    its self-closing flag acknowledged.

    :arg token: the ``svg`` start tag token
    """
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.adjustSVGAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["svg"]
    self.tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    self.tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1294 | |
def startTagMisplaced(self, token):
    """Ignore a start tag that belongs to a different insertion mode.

    Elements that should be children of other elements that have a
    different insertion mode; here they are ignored:
    "caption", "col", "colgroup", "frame", "frameset", "head",
    "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
    "tr", "noscript"

    Only a parse error is recorded.

    :arg token: the misplaced start tag token
    """
    details = {"name": token["name"]}
    self.parser.parseError("unexpected-start-tag-ignored", details)
1303 | |
def startTagOther(self, token):
    """Default start tag handler: reconstruct formatting, insert element.

    :arg token: start tag token of the element to insert
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
1307 | |
def endTagP(self, token):
    """Close the nearest ``p`` element in button scope.

    Without such a ``p``, an empty one is opened first, a parse error is
    recorded and the method calls itself to close it.  Otherwise implied
    end tags (except ``p``) are generated, a parse error is recorded if
    the current node is not ``p``, and the stack is popped through ``p``.

    :arg token: the ``p`` end tag token (not inspected)
    """
    if self.tree.elementInScope("p", variant="button"):
        self.tree.generateImpliedEndTags("p")
        if self.tree.openElements[-1].name != "p":
            self.parser.parseError("unexpected-end-tag", {"name": "p"})
        popped = self.tree.openElements.pop()
        while popped.name != "p":
            popped = self.tree.openElements.pop()
        return

    self.startTagCloseP(impliedTagToken("p", "StartTag"))
    self.parser.parseError("unexpected-end-tag", {"name": "p"})
    self.endTagP(impliedTagToken("p", "EndTag"))
1320 | |
def endTagBody(self, token):
    """Handle a ``body`` end tag.

    When no ``body`` is in scope, a parse error (without an error code) is
    recorded and the token is ignored.  Otherwise, if the current node is
    not ``body``, the open elements from index 2 onwards are scanned and a
    parse error is recorded for the first one whose name is outside the
    allowed set.  The parser then switches to the "afterBody" phase.

    :arg token: the ``body`` end tag token (not inspected)
    """
    if not self.tree.elementInScope("body"):
        self.parser.parseError()
        return
    elif self.tree.openElements[-1].name != "body":
        # Built once here rather than on every loop iteration.
        allowed_open = frozenset(("dd", "dt", "li", "optgroup",
                                  "option", "p", "rp", "rt",
                                  "tbody", "td", "tfoot",
                                  "th", "thead", "tr", "body",
                                  "html"))
        for node in self.tree.openElements[2:]:
            if node.name not in allowed_open:
                # Not sure this is the correct name for the parse error
                self.parser.parseError(
                    "expected-one-end-tag-but-got-another",
                    {"gotName": "body", "expectedName": node.name})
                break
    self.parser.phase = self.parser.phases["afterBody"]
1338 | |
def endTagHtml(self, token):
    """Handle an ``html`` end tag by first closing the body.

    :arg token: the ``html`` end tag token

    :returns: the token when a ``body`` was in scope, else None
    """
    # We repeat the test for the body end tag token being ignored here
    if not self.tree.elementInScope("body"):
        return None
    self.endTagBody(impliedTagToken("body"))
    return token
1344 | |
def endTagBlock(self, token):
    """Close a block-level element.

    For ``pre`` the default whitespace handler is restored first.  When
    the element is in scope, implied end tags are generated.  A parse
    error is recorded if the current node's name differs from the token's.
    When in scope, the stack is popped through the named element.

    :arg token: end tag token; its ``"name"`` entry is read
    """
    tag = token["name"]
    # Put us back in the right whitespace handling mode
    if tag == "pre":
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
    tree = self.tree
    in_scope = tree.elementInScope(tag)
    if in_scope:
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != tag:
        self.parser.parseError("end-tag-too-early", {"name": tag})
    if not in_scope:
        return
    popped = tree.openElements.pop()
    while popped.name != tag:
        popped = tree.openElements.pop()
1358 | |
def endTagForm(self, token):
    """Close the form referenced by the form pointer.

    The form pointer is always reset to None.  If it was unset, or the
    form element is not in scope, a parse error is recorded.  Otherwise
    implied end tags are generated, a parse error is recorded if the form
    is not the current node, and the form is removed from the stack.

    :arg token: the ``form`` end tag token (not inspected)
    """
    tree = self.tree
    form = tree.formPointer
    tree.formPointer = None
    if form is None or not tree.elementInScope(form):
        self.parser.parseError("unexpected-end-tag",
                               {"name": "form"})
        return
    tree.generateImpliedEndTags()
    if tree.openElements[-1] != form:
        self.parser.parseError("end-tag-too-early-ignored",
                               {"name": "form"})
    tree.openElements.remove(form)
1371 | |
def endTagListItem(self, token):
    """Close an ``li``, ``dd`` or ``dt`` element.

    ``li`` is looked up with the "list" scope variant, the others with the
    default scope.  Out of scope, only a parse error is recorded.  In
    scope, implied end tags (except for this tag) are generated, a parse
    error is recorded if the current node differs, and the stack is popped
    through the named element.

    :arg token: end tag token; its ``"name"`` entry is read
    """
    tag = token["name"]
    scope_variant = "list" if tag == "li" else None
    tree = self.tree
    if not tree.elementInScope(tag, variant=scope_variant):
        self.parser.parseError("unexpected-end-tag", {"name": tag})
        return
    tree.generateImpliedEndTags(exclude=tag)
    if tree.openElements[-1].name != tag:
        self.parser.parseError("end-tag-too-early", {"name": tag})
    popped = tree.openElements.pop()
    while popped.name != tag:
        popped = tree.openElements.pop()
1388 | |
def endTagHeading(self, token):
    """Close a heading element.

    If any heading element is in scope, implied end tags are generated.  A
    parse error is recorded when the current node's name differs from the
    token's.  Then, if any heading is in scope, the stack is popped through
    the first heading element encountered, whatever its level.

    :arg token: end tag token; its ``"name"`` entry is read
    """
    tree = self.tree
    if any(tree.elementInScope(heading) for heading in headingElements):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != token["name"]:
        self.parser.parseError("end-tag-too-early", {"name": token["name"]})

    if any(tree.elementInScope(heading) for heading in headingElements):
        popped = tree.openElements.pop()
        while popped.name not in headingElements:
            popped = tree.openElements.pop()
1403 | |
def endTagFormatting(self, token):
    """The much-feared adoption agency algorithm

    Handles the end tag of a formatting element.  The outer loop runs at
    most 8 times and the inner loop at most 3 times per outer iteration.
    The method returns early when there is no usable formatting element
    (delegating to ``endTagOther``), when the element is no longer on the
    stack of open elements, or when there is no furthest block.  Otherwise
    each iteration re-parents nodes between the formatting element and the
    furthest block and replaces the formatting element by a clone placed
    inside the furthest block.

    :arg token: end tag token; its ``"name"`` entry is read
    """
    # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
    # XXX Better parseError messages appreciated.

    # Step 1
    outerLoopCounter = 0

    # Step 2
    while outerLoopCounter < 8:

        # Step 3
        outerLoopCounter += 1

        # Step 4:

        # Let the formatting element be the last element in
        # the list of active formatting elements that:
        # - is between the end of the list and the last scope
        # marker in the list, if any, or the start of the list
        # otherwise, and
        # - has the same tag name as the token.
        formattingElement = self.tree.elementInActiveFormattingElements(
            token["name"])
        if (not formattingElement or
            (formattingElement in self.tree.openElements and
             not self.tree.elementInScope(formattingElement.name))):
            # If there is no such node, then abort these steps
            # and instead act as described in the "any other
            # end tag" entry below.
            self.endTagOther(token)
            return

        # Otherwise, if there is such a node, but that node is
        # not in the stack of open elements, then this is a
        # parse error; remove the element from the list, and
        # abort these steps.
        elif formattingElement not in self.tree.openElements:
            self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
            self.tree.activeFormattingElements.remove(formattingElement)
            return

        # Otherwise, if there is such a node, and that node is
        # also in the stack of open elements, but the element
        # is not in scope, then this is a parse error; ignore
        # the token, and abort these steps.
        # NOTE(review): this branch looks unreachable -- the first
        # condition above already catches "in the stack but not in
        # scope" and routes it to endTagOther. Confirm before relying
        # on the "adoption-agency-4.4" error being reported.
        elif not self.tree.elementInScope(formattingElement.name):
            self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
            return

        # Otherwise, there is a formatting element and that
        # element is in the stack and is in scope. If the
        # element is not the current node, this is a parse
        # error. In any case, proceed with the algorithm as
        # written in the following steps.
        else:
            if formattingElement != self.tree.openElements[-1]:
                self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

        # Step 5:

        # Let the furthest block be the topmost node in the
        # stack of open elements that is lower in the stack
        # than the formatting element, and is an element in
        # the special category. There might not be one.
        afeIndex = self.tree.openElements.index(formattingElement)
        furthestBlock = None
        for element in self.tree.openElements[afeIndex:]:
            if element.nameTuple in specialElements:
                furthestBlock = element
                break

        # Step 6:

        # If there is no furthest block, then the UA must
        # first pop all the nodes from the bottom of the stack
        # of open elements, from the current node up to and
        # including the formatting element, then remove the
        # formatting element from the list of active
        # formatting elements, and finally abort these steps.
        if furthestBlock is None:
            element = self.tree.openElements.pop()
            while element != formattingElement:
                element = self.tree.openElements.pop()
            self.tree.activeFormattingElements.remove(element)
            return

        # Step 7
        # The common ancestor is the stack entry just before the
        # formatting element.
        commonAncestor = self.tree.openElements[afeIndex - 1]

        # Step 8:
        # The bookmark is supposed to help us identify where to reinsert
        # nodes in step 15. We have to ensure that we reinsert nodes after
        # the node before the active formatting element. Note the bookmark
        # can move in step 9.7
        bookmark = self.tree.activeFormattingElements.index(formattingElement)

        # Step 9
        lastNode = node = furthestBlock
        innerLoopCounter = 0

        index = self.tree.openElements.index(node)
        while innerLoopCounter < 3:
            innerLoopCounter += 1
            # Node is element before node in open elements
            index -= 1
            node = self.tree.openElements[index]
            if node not in self.tree.activeFormattingElements:
                self.tree.openElements.remove(node)
                continue
            # Step 9.6
            if node == formattingElement:
                break
            # Step 9.7
            if lastNode == furthestBlock:
                bookmark = self.tree.activeFormattingElements.index(node) + 1
            # Step 9.8
            clone = node.cloneNode()
            # Replace node with clone
            self.tree.activeFormattingElements[
                self.tree.activeFormattingElements.index(node)] = clone
            self.tree.openElements[
                self.tree.openElements.index(node)] = clone
            node = clone
            # Step 9.9
            # Remove lastNode from its parents, if any
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)
            node.appendChild(lastNode)
            # Step 9.10
            lastNode = node

        # Step 10
        # Foster parent lastNode if commonAncestor is a
        # table, tbody, tfoot, thead, or tr we need to foster
        # parent the lastNode
        if lastNode.parent:
            lastNode.parent.removeChild(lastNode)

        if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
            parent, insertBefore = self.tree.getTableMisnestedNodePosition()
            parent.insertBefore(lastNode, insertBefore)
        else:
            commonAncestor.appendChild(lastNode)

        # Step 11
        clone = formattingElement.cloneNode()

        # Step 12
        furthestBlock.reparentChildren(clone)

        # Step 13
        furthestBlock.appendChild(clone)

        # Step 14
        # Swap the formatting element for its clone at the bookmark.
        self.tree.activeFormattingElements.remove(formattingElement)
        self.tree.activeFormattingElements.insert(bookmark, clone)

        # Step 15
        # On the stack the clone goes directly after the furthest block.
        self.tree.openElements.remove(formattingElement)
        self.tree.openElements.insert(
            self.tree.openElements.index(furthestBlock) + 1, clone)
1566 | |
def endTagAppletMarqueeObject(self, token):
    """Close an ``applet``, ``marquee`` or ``object`` element.

    When the element is in scope, implied end tags are generated.  A parse
    error is recorded if the current node's name differs from the token's.
    When the element is in scope, the stack is popped through it and the
    active formatting elements are cleared via
    ``clearActiveFormattingElements``.

    :arg token: end tag token; its ``"name"`` entry is read
    """
    tag = token["name"]
    tree = self.tree
    if tree.elementInScope(tag):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != tag:
        self.parser.parseError("end-tag-too-early", {"name": tag})

    if not tree.elementInScope(tag):
        return
    popped = tree.openElements.pop()
    while popped.name != tag:
        popped = tree.openElements.pop()
    tree.clearActiveFormattingElements()
1578 | |
def endTagBr(self, token):
    """Treat a ``br`` end tag as a ``br`` element.

    Records a parse error, inserts an implied ``br`` start tag and pops
    the new element from the stack at once.

    :arg token: the ``br`` end tag token (not inspected)
    """
    self.parser.parseError("unexpected-end-tag-treated-as",
                           {"originalName": "br", "newName": "br element"})
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    implied_br = impliedTagToken("br", "StartTag")
    tree.insertElement(implied_br)
    tree.openElements.pop()
1585 | |
def endTagOther(self, token):
    """Default end tag handler.

    Walks a snapshot of the open elements from the top.  On the first
    element with the token's name, implied end tags (except for that name)
    are generated, a parse error is recorded if the current node differs,
    and the stack is popped through that element.  If a special element is
    met first, a parse error is recorded and the token is ignored.

    :arg token: end tag token; its ``"name"`` entry is read
    """
    tag = token["name"]
    for candidate in self.tree.openElements[::-1]:
        if candidate.name == tag:
            self.tree.generateImpliedEndTags(exclude=tag)
            if self.tree.openElements[-1].name != tag:
                self.parser.parseError("unexpected-end-tag", {"name": tag})
            # Pop up to and including the matched element.
            while self.tree.openElements.pop() != candidate:
                pass
            break
        if candidate.nameTuple in specialElements:
            self.parser.parseError("unexpected-end-tag", {"name": tag})
            break
1599 | |
# Pairs of (tag name or tuple of tag names, handler) for start tags.
# Any name not listed here is handled by ``startTagOther`` (see ``default``).
startTagHandler = _utils.MethodDispatcher([
    ("html", Phase.startTagHtml),
    (
        ("base", "basefont", "bgsound", "command", "link", "meta",
         "script", "style", "title"),
        startTagProcessInHead,
    ),
    ("body", startTagBody),
    ("frameset", startTagFrameset),
    (
        ("address", "article", "aside", "blockquote", "center", "details",
         "dir", "div", "dl", "fieldset", "figcaption", "figure",
         "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
         "section", "summary", "ul"),
        startTagCloseP,
    ),
    (headingElements, startTagHeading),
    (("pre", "listing"), startTagPreListing),
    ("form", startTagForm),
    (("li", "dd", "dt"), startTagListItem),
    ("plaintext", startTagPlaintext),
    ("a", startTagA),
    (
        ("b", "big", "code", "em", "font", "i", "s", "small", "strike",
         "strong", "tt", "u"),
        startTagFormatting,
    ),
    ("nobr", startTagNobr),
    ("button", startTagButton),
    (("applet", "marquee", "object"), startTagAppletMarqueeObject),
    ("xmp", startTagXmp),
    ("table", startTagTable),
    (
        ("area", "br", "embed", "img", "keygen", "wbr"),
        startTagVoidFormatting,
    ),
    (("param", "source", "track"), startTagParamSource),
    ("input", startTagInput),
    ("hr", startTagHr),
    ("image", startTagImage),
    ("isindex", startTagIsIndex),
    ("textarea", startTagTextarea),
    ("iframe", startTagIFrame),
    ("noscript", startTagNoscript),
    (("noembed", "noframes"), startTagRawtext),
    ("select", startTagSelect),
    (("rp", "rt"), startTagRpRt),
    (("option", "optgroup"), startTagOpt),
    # Single names are plain strings; no wrapping parentheses are needed.
    ("math", startTagMath),
    ("svg", startTagSvg),
    (
        ("caption", "col", "colgroup", "frame", "head",
         "tbody", "td", "tfoot", "th", "thead",
         "tr"),
        startTagMisplaced,
    ),
])
startTagHandler.default = startTagOther
1646 | |
# Pairs of (tag name or tuple of tag names, handler) for end tags.
# Any name not listed here is handled by ``endTagOther`` (see ``default``).
endTagHandler = _utils.MethodDispatcher([
    ("body", endTagBody),
    ("html", endTagHtml),
    (
        ("address", "article", "aside", "blockquote", "button", "center",
         "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption",
         "figure", "footer", "header", "hgroup", "listing", "main", "menu",
         "nav", "ol", "pre", "section", "summary", "ul"),
        endTagBlock,
    ),
    ("form", endTagForm),
    ("p", endTagP),
    (("dd", "dt", "li"), endTagListItem),
    (headingElements, endTagHeading),
    (
        ("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
         "strike", "strong", "tt", "u"),
        endTagFormatting,
    ),
    (("applet", "marquee", "object"), endTagAppletMarqueeObject),
    ("br", endTagBr),
])
endTagHandler.default = endTagOther
1664 | |
class TextPhase(Phase):
    """Phase active while the content of an RCDATA/RAWTEXT element is read.

    Character data is appended to the tree; any end tag (or EOF) pops the
    current element and hands control back to ``parser.originalPhase``.
    """
    __slots__ = ()

    def processCharacters(self, token):
        """Insert the token's character data into the tree."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Report the unclosed element, pop it and return to the original phase.

        :returns: ``True``
        """
        unclosedName = self.tree.openElements[-1].name
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosedName})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are not expected in this phase; fail the assertion."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Pop the ``script`` element and return to the original phase."""
        closed = self.tree.openElements.pop()
        assert closed.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Pop the current element and return to the original phase."""
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase

    # No start tag has a dedicated handler; every one goes to startTagOther.
    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("script", endTagScript),
    ])
    endTagHandler.default = endTagOther
1697 | |
class InTablePhase(Phase):
    """Phase for the "in table" insertion mode.

    Spec: http://www.whatwg.org/specs/web-apps/current-work/#in-table
    """
    __slots__ = tuple()

    # helper methods
    def clearStackToTableContext(self):
        """Pop open elements until the current node is ``table`` or ``html``."""
        # "clear the stack back to a table context"
        while self.tree.openElements[-1].name not in ("table", "html"):
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name": self.tree.openElements[-1].name})
            self.tree.openElements.pop()
        # When the current node is <html> it's an innerHTML case

    def _switchToTableText(self):
        """Make ``inTableText`` the current phase and return it.

        The phase that was current before the switch is stored on the
        ``inTableText`` phase as ``originalPhase``.
        """
        previousPhase = self.parser.phase
        tableTextPhase = self.parser.phases["inTableText"]
        self.parser.phase = tableTextPhase
        tableTextPhase.originalPhase = previousPhase
        return tableTextPhase

    def _processInBodyFromTable(self, methodName, token):
        """Run an ``inBody`` handler with ``tree.insertFromTable`` set.

        The flag is cleared in a ``finally`` clause so that an exception
        raised by the delegated handler cannot leave the tree builder with
        ``insertFromTable`` stuck on ``True``.

        :arg methodName: name of the ``inBody`` phase method to call
        :arg token: the token passed to that method
        """
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            getattr(self.parser.phases["inBody"], methodName)(token)
        finally:
            self.tree.insertFromTable = False

    # processing methods
    def processEOF(self):
        """Report EOF inside a table unless the current node is ``html``."""
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-table")
        else:
            assert self.parser.innerHTML
        # Stop parsing

    def processSpaceCharacters(self, token):
        """Switch to ``inTableText`` and let it process the whitespace."""
        self._switchToTableText().processSpaceCharacters(token)

    def processCharacters(self, token):
        """Switch to ``inTableText`` and let it process the characters."""
        self._switchToTableText().processCharacters(token)

    def insertText(self, token):
        """Insert character data via ``inBody`` with ``insertFromTable`` set."""
        # If we get here there must be at least one non-whitespace character
        self._processInBodyFromTable("processCharacters", token)

    def startTagCaption(self, token):
        """Insert ``caption`` after a formatting marker; go to ``inCaption``."""
        self.clearStackToTableContext()
        self.tree.activeFormattingElements.append(Marker)
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCaption"]

    def startTagColgroup(self, token):
        """Insert ``colgroup`` and switch to ``inColumnGroup``."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inColumnGroup"]

    def startTagCol(self, token):
        """Imply a ``colgroup`` start tag, then return the token."""
        self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
        return token

    def startTagRowGroup(self, token):
        """Insert the row-group element and switch to ``inTableBody``."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inTableBody"]

    def startTagImplyTbody(self, token):
        """Imply a ``tbody`` start tag, then return the token."""
        self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
        return token

    def startTagTable(self, token):
        """Treat a nested ``<table>`` as an implied ``</table>`` first.

        :returns: the token, unless ``parser.innerHTML`` is set
        """
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "table", "endName": "table"})
        self.parser.phase.processEndTag(impliedTagToken("table"))
        if not self.parser.innerHTML:
            return token

    def startTagStyleScript(self, token):
        """Delegate ``style``/``script`` start tags to the ``inHead`` phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagInput(self, token):
        """Insert ``<input type=hidden>`` directly; other inputs go to
        ``startTagOther``.
        """
        if ("type" in token["data"] and
                token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
            self.parser.parseError("unexpected-hidden-input-in-table")
            self.tree.insertElement(token)
            # XXX associate with form
            self.tree.openElements.pop()
        else:
            self.startTagOther(token)

    def startTagForm(self, token):
        """Insert a ``form`` (only if no form pointer is set) and pop it."""
        self.parser.parseError("unexpected-form-in-table")
        if self.tree.formPointer is None:
            self.tree.insertElement(token)
            self.tree.formPointer = self.tree.openElements[-1]
            self.tree.openElements.pop()

    def startTagOther(self, token):
        """Report the error and process the start tag through ``inBody``."""
        self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
        self._processInBodyFromTable("processStartTag", token)

    def endTagTable(self, token):
        """Close the open ``table`` element and reset the insertion mode."""
        if self.tree.elementInScope("table", variant="table"):
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1].name != "table":
                self.parser.parseError("end-tag-too-early-named",
                                       {"gotName": "table",
                                        "expectedName": self.tree.openElements[-1].name})
            while self.tree.openElements[-1].name != "table":
                self.tree.openElements.pop()
            self.tree.openElements.pop()
            self.parser.resetInsertionMode()
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Report the error and process the end tag through ``inBody``."""
        self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
        self._processInBodyFromTable("processEndTag", token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("caption", startTagCaption),
        ("colgroup", startTagColgroup),
        ("col", startTagCol),
        (("tbody", "tfoot", "thead"), startTagRowGroup),
        (("td", "th", "tr"), startTagImplyTbody),
        ("table", startTagTable),
        (("style", "script"), startTagStyleScript),
        ("input", startTagInput),
        ("form", startTagForm)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "tbody", "td",
          "tfoot", "th", "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
1842 | |
class InTableTextPhase(Phase):
    """Phase that buffers character tokens seen while inside a table.

    Tokens are collected in ``characterTokens`` and flushed as soon as a
    non-character token (or EOF) arrives, at which point control returns
    to ``originalPhase``.
    """
    __slots__ = ("originalPhase", "characterTokens")

    def __init__(self, *args, **kwargs):
        super(InTableTextPhase, self).__init__(*args, **kwargs)
        # Phase to return to once the buffered text has been flushed.
        self.originalPhase = None
        # Character tokens collected since the last flush.
        self.characterTokens = []

    def flushCharacters(self):
        """Emit the buffered text and empty the buffer.

        Text containing any non-space character is passed to the inTable
        phase's ``insertText``; text made only of space characters is
        inserted directly into the tree.
        """
        pending = "".join(tok["data"] for tok in self.characterTokens)
        hasNonSpace = any(ch not in spaceCharacters for ch in pending)
        if hasNonSpace:
            merged = {"type": tokenTypes["Characters"], "data": pending}
            self.parser.phases["inTable"].insertText(merged)
        elif pending:
            self.tree.insertText(pending)
        self.characterTokens = []

    def _flushAndLeave(self):
        """Flush buffered text, then make ``originalPhase`` current again."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase

    def processComment(self, token):
        """Leave this phase and return the comment token."""
        self._flushAndLeave()
        return token

    def processEOF(self):
        """Leave this phase and return ``True``."""
        self._flushAndLeave()
        return True

    def processCharacters(self, token):
        """Buffer the token unless its data is exactly U+0000."""
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a whitespace token."""
        # pretty sure we should never reach here
        self.characterTokens.append(token)
        # assert False

    def processStartTag(self, token):
        """Leave this phase and return the start tag token."""
        self._flushAndLeave()
        return token

    def processEndTag(self, token):
        """Leave this phase and return the end tag token."""
        self._flushAndLeave()
        return token
1889 | |
class InCaptionPhase(Phase):
    """Phase for the "in caption" insertion mode.

    Spec: http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    """
    __slots__ = ()

    def ignoreEndTagCaption(self):
        """Return True when no ``caption`` element is in table scope."""
        captionOpen = self.tree.elementInScope("caption", variant="table")
        return not captionOpen

    def _impliedCaptionEnd(self):
        """Process an implied ``</caption>`` through the current phase.

        :returns: True when the end tag was not ignored
        """
        # XXX Have to duplicate logic here to find out if the tag is ignored
        ignored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        return not ignored

    def processEOF(self):
        """Delegate EOF handling to the ``inBody`` phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the ``inBody`` phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableElement(self, token):
        """Close the caption implicitly; return the token if it was closed."""
        self.parser.parseError()
        if self._impliedCaptionEnd():
            return token

    def startTagOther(self, token):
        """Delegate other start tags to the ``inBody`` phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagCaption(self, token):
        """Close the open ``caption`` element and switch back to ``inTable``."""
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return

        # AT this code is quite similar to endTagTable in "InTable"
        self.tree.generateImpliedEndTags()
        currentName = self.tree.openElements[-1].name
        if currentName != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": currentName})
        # Pop everything above the caption, then the caption itself.
        while self.tree.openElements[-1].name != "caption":
            self.tree.openElements.pop()
        self.tree.openElements.pop()
        self.tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Close the caption implicitly; return the token if it was closed."""
        self.parser.parseError()
        if self._impliedCaptionEnd():
            return token

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate other end tags to the ``inBody`` phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (
            ("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
             "thead", "tr"),
            startTagTableElement,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("caption", endTagCaption),
        ("table", endTagTable),
        (
            ("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
             "thead", "tr"),
            endTagIgnore,
        ),
    ])
    endTagHandler.default = endTagOther
1959 | |
class InColumnGroupPhase(Phase):
    """Phase for the "in column group" insertion mode.

    Spec: http://www.whatwg.org/specs/web-apps/current-work/#in-column
    """
    __slots__ = ()

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the ``html`` element."""
        currentNode = self.tree.openElements[-1]
        return currentNode.name == "html"

    def _impliedColgroupEnd(self):
        """Act as if ``</colgroup>`` had been seen.

        :returns: True when the end tag was not ignored
        """
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return not ignored

    def processEOF(self):
        """Close the column group at EOF.

        :returns: ``True`` when a colgroup was closed, otherwise ``None``
        """
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return
        if self._impliedColgroupEnd():
            return True

    def processCharacters(self, token):
        """Close the column group; return the token if it was closed."""
        if self._impliedColgroupEnd():
            return token

    def startTagCol(self, token):
        """Insert a ``col`` element, pop it and acknowledge self-closing."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        """Close the column group; return the token if it was closed."""
        if self._impliedColgroupEnd():
            return token

    def endTagColgroup(self, token):
        """Pop the ``colgroup`` element and switch back to ``inTable``."""
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, token):
        """Report that ``col`` takes no end tag."""
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        """Close the column group; return the token if it was closed."""
        if self._impliedColgroupEnd():
            return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("col", startTagCol),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("colgroup", endTagColgroup),
        ("col", endTagCol),
    ])
    endTagHandler.default = endTagOther
2023 | |
class InTableBodyPhase(Phase):
    """Phase for the "in table body" insertion mode.

    Spec: http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    """
    __slots__ = ()

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until a row group or ``html`` is current."""
        stopNames = ("tbody", "tfoot", "thead", "html")
        while self.tree.openElements[-1].name not in stopNames:
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name": self.tree.openElements[-1].name})
            self.tree.openElements.pop()
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML

    def _rowGroupInTableScope(self):
        """Return True when a tbody, thead or tfoot is in table scope."""
        return any(self.tree.elementInScope(groupName, variant="table")
                   for groupName in ("tbody", "thead", "tfoot"))

    def _closeRowGroupAndReprocess(self, token):
        """Close the current row group and return ``token``.

        When no row group is in table scope a parse error is reported and
        ``None`` is returned instead.
        """
        # XXX AT Any ideas on how to share this with endTagTable?
        if not self._rowGroupInTableScope():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the ``inTable`` phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace tokens to the ``inTable`` phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character tokens to the ``inTable`` phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTr(self, token):
        """Insert a ``tr`` element and switch to ``inRow``."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """Imply a ``tr`` start tag for a stray cell, then return the token."""
        self.parser.parseError("unexpected-cell-in-table-body",
                               {"name": token["name"]})
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        """Close the current row group; return the token if it was closed."""
        return self._closeRowGroupAndReprocess(token)

    def startTagOther(self, token):
        """Delegate other start tags to the ``inTable`` phase."""
        return self.parser.phases["inTable"].processStartTag(token)

    def endTagTableRowGroup(self, token):
        """Pop the named row group and switch back to ``inTable``."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": token["name"]})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Close the current row group; return the token if it was closed."""
        return self._closeRowGroupAndReprocess(token)

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag-in-table-body",
                               {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate other end tags to the ``inTable`` phase."""
        return self.parser.phases["inTable"].processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("tr", startTagTr),
        (("td", "th"), startTagTableCell),
        (
            ("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
            startTagTableOther,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        ("table", endTagTable),
        (
            ("body", "caption", "col", "colgroup", "html", "td", "th",
             "tr"),
            endTagIgnore,
        ),
    ])
    endTagHandler.default = endTagOther
2121 | |
class InRowPhase(Phase):
    """Phase for the "in row" insertion mode.

    Spec: http://www.whatwg.org/specs/web-apps/current-work/#in-row
    """
    __slots__ = ()

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until ``tr`` or ``html`` is the current node.

        A parse error is reported for every element that gets popped.
        """
        while True:
            currentName = self.tree.openElements[-1].name
            if currentName in ("tr", "html"):
                break
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": currentName})
            self.tree.openElements.pop()

    def ignoreEndTagTr(self):
        """Return True when no ``tr`` element is in table scope."""
        rowOpen = self.tree.elementInScope("tr", variant="table")
        return not rowOpen

    def _impliedRowEnd(self, token):
        """Act as if ``</tr>`` had been seen.

        :returns: ``token`` when the end tag was not ignored, else ``None``
        """
        ignored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        if not ignored:
            return token

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the ``inTable`` phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace tokens to the ``inTable`` phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character tokens to the ``inTable`` phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTableCell(self, token):
        """Insert a cell, switch to ``inCell`` and add a formatting marker."""
        self.clearStackToTableRowContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        self.tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        """Close the row implicitly; return the token if it was closed."""
        return self._impliedRowEnd(token)

    def startTagOther(self, token):
        """Delegate other start tags to the ``inTable`` phase."""
        return self.parser.phases["inTable"].processStartTag(token)

    def endTagTr(self, token):
        """Pop the ``tr`` element and switch back to ``inTableBody``."""
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        """Close the row implicitly; return the token if it was closed."""
        # Reprocess the current tag if the tr end tag was not ignored
        return self._impliedRowEnd(token)

    def endTagTableRowGroup(self, token):
        """Close the row when the named row group is in table scope.

        :returns: the token when the row group is in scope, otherwise
            ``None`` after reporting a parse error
        """
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag-in-table-row",
                               {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate other end tags to the ``inTable`` phase."""
        return self.parser.phases["inTable"].processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("td", "th"), startTagTableCell),
        (
            ("caption", "col", "colgroup", "tbody", "tfoot", "thead",
             "tr"),
            startTagTableOther,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("tr", endTagTr),
        ("table", endTagTable),
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        (
            ("body", "caption", "col", "colgroup", "html", "td", "th"),
            endTagIgnore,
        ),
    ])
    endTagHandler.default = endTagOther
2210 | |
class InCellPhase(Phase):
    """Phase for the "in cell" insertion mode.

    Spec: http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    """
    __slots__ = ()

    # helper
    def closeCell(self):
        """Close the open cell: ``td`` if in table scope, otherwise ``th``."""
        for cellName in ("td", "th"):
            if self.tree.elementInScope(cellName, variant="table"):
                self.endTagTableCell(impliedTagToken(cellName))
                break

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the ``inBody`` phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the ``inBody`` phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableOther(self, token):
        """Close the open cell and return the token.

        When neither ``td`` nor ``th`` is in table scope a parse error is
        reported and ``None`` is returned.
        """
        cellOpen = (self.tree.elementInScope("td", variant="table") or
                    self.tree.elementInScope("th", variant="table"))
        if not cellOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.closeCell()
        return token

    def startTagOther(self, token):
        """Delegate other start tags to the ``inBody`` phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and switch back to ``inRow``."""
        cellName = token["name"]
        if not self.tree.elementInScope(cellName, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
            return

        self.tree.generateImpliedEndTags(cellName)
        if self.tree.openElements[-1].name != cellName:
            self.parser.parseError("unexpected-cell-end-tag",
                                   {"name": cellName})
            # Pop up to and including the first element named like the cell.
            popped = self.tree.openElements.pop()
            while popped.name != cellName:
                popped = self.tree.openElements.pop()
        else:
            self.tree.openElements.pop()
        self.tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagImply(self, token):
        """Close the cell when the named element is in table scope.

        :returns: the token when the element is in scope, otherwise
            ``None`` after reporting a parse error
        """
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return
        self.closeCell()
        return token

    def endTagOther(self, token):
        """Delegate other end tags to the ``inBody`` phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (
            ("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
             "thead", "tr"),
            startTagTableOther,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("td", "th"), endTagTableCell),
        (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
        (("table", "tbody", "tfoot", "thead", "tr"), endTagImply),
    ])
    endTagHandler.default = endTagOther
2286 | |
class InSelectPhase(Phase):
    """Phase for the "in select" insertion mode.

    Spec: http://www.whatwg.org/specs/web-apps/current-work/#in-select
    """
    __slots__ = ()

    def processEOF(self):
        """Report EOF inside a select unless the current node is ``html``."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        """Insert character data, dropping a token that is exactly U+0000."""
        if token["data"] != "\u0000":
            self.tree.insertText(token["data"])

    def startTagOption(self, token):
        """Insert an ``option``, closing a currently open ``option`` first."""
        # We need to imply </option> if <option> is the current node.
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        """Insert an ``optgroup``, closing an open ``option``/``optgroup``."""
        # The two checks run in sequence: an option may sit in an optgroup.
        for impliedClose in ("option", "optgroup"):
            if self.tree.openElements[-1].name == impliedClose:
                self.tree.openElements.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        """Treat a nested ``<select>`` as ``</select>``."""
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """Close the select and return the token.

        When no ``select`` is in select scope, nothing is closed and
        ``None`` is returned.
        """
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        """Delegate ``script`` start tags to the ``inHead`` phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagOther(self, token):
        """Report any other start tag as unexpected and ignore it."""
        self.parser.parseError("unexpected-start-tag-in-select",
                               {"name": token["name"]})

    def endTagOption(self, token):
        """Pop the current ``option``; otherwise report a parse error."""
        if self.tree.openElements[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        self.tree.openElements.pop()

    def endTagOptgroup(self, token):
        """Pop the current ``optgroup`` (and an ``option`` directly in it)."""
        # </optgroup> implicitly closes <option>
        if (self.tree.openElements[-1].name == "option" and
                self.tree.openElements[-2].name == "optgroup"):
            self.tree.openElements.pop()
        # It also closes </optgroup>
        if self.tree.openElements[-1].name == "optgroup":
            self.tree.openElements.pop()
        # But nothing else
        else:
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

    def endTagSelect(self, token):
        """Pop up to and including ``select``, then reset the insertion mode."""
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        while self.tree.openElements.pop().name != "select":
            pass
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        """Report any other end tag as unexpected and ignore it."""
        self.parser.parseError("unexpected-end-tag-in-select",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("option", startTagOption),
        ("optgroup", startTagOptgroup),
        ("select", startTagSelect),
        (("input", "keygen", "textarea"), startTagInput),
        ("script", startTagScript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("option", endTagOption),
        ("optgroup", endTagOptgroup),
        ("select", endTagSelect),
    ])
    endTagHandler.default = endTagOther
2385 | |
class InSelectInTablePhase(Phase):
    """Phase used for a ``select`` that is being parsed inside a table.

    Table-related tags close the select; everything else is delegated to
    the ``inSelect`` phase.
    """
    __slots__ = ()

    def processEOF(self):
        """Delegate EOF handling to the ``inSelect`` phase."""
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the ``inSelect`` phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processCharacters(token)

    def startTagTable(self, token):
        """Close the select on a table-related start tag; return the token."""
        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
        impliedSelectEnd = impliedTagToken("select")
        self.endTagOther(impliedSelectEnd)
        return token

    def startTagOther(self, token):
        """Delegate other start tags to the ``inSelect`` phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processStartTag(token)

    def endTagTable(self, token):
        """Close the select on a table-related end tag.

        :returns: the token when the named element is in table scope,
            otherwise ``None``
        """
        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
        if not self.tree.elementInScope(token["name"], variant="table"):
            return
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        """Delegate other end tags to the ``inSelect`` phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        (
            ("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
            startTagTable,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (
            ("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
            endTagTable,
        ),
    ])
    endTagHandler.default = endTagOther
2423 | |
class InForeignContentPhase(Phase):
    """Phase that handles tokens while the current node is foreign content.

    Start tags are either treated as HTML "breakout" elements (which unwind
    the stack of open elements) or inserted in the namespace of the current
    node after MathML/SVG name and attribute adjustments.
    """
    __slots__ = tuple()

    # Start tag names that are reported as
    # "unexpected-html-element-in-foreign-content" and cause foreign
    # elements to be popped off the stack (see processStartTag).
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # All-lowercase SVG tag names mapped to their mixed-case spelling.
    # Built once when the class is created instead of on every call to
    # adjustSVGTagNames.
    svgTagNameReplacements = {
        "altglyph": "altGlyph",
        "altglyphdef": "altGlyphDef",
        "altglyphitem": "altGlyphItem",
        "animatecolor": "animateColor",
        "animatemotion": "animateMotion",
        "animatetransform": "animateTransform",
        "clippath": "clipPath",
        "feblend": "feBlend",
        "fecolormatrix": "feColorMatrix",
        "fecomponenttransfer": "feComponentTransfer",
        "fecomposite": "feComposite",
        "feconvolvematrix": "feConvolveMatrix",
        "fediffuselighting": "feDiffuseLighting",
        "fedisplacementmap": "feDisplacementMap",
        "fedistantlight": "feDistantLight",
        "feflood": "feFlood",
        "fefunca": "feFuncA",
        "fefuncb": "feFuncB",
        "fefuncg": "feFuncG",
        "fefuncr": "feFuncR",
        "fegaussianblur": "feGaussianBlur",
        "feimage": "feImage",
        "femerge": "feMerge",
        "femergenode": "feMergeNode",
        "femorphology": "feMorphology",
        "feoffset": "feOffset",
        "fepointlight": "fePointLight",
        "fespecularlighting": "feSpecularLighting",
        "fespotlight": "feSpotLight",
        "fetile": "feTile",
        "feturbulence": "feTurbulence",
        "foreignobject": "foreignObject",
        "glyphref": "glyphRef",
        "lineargradient": "linearGradient",
        "radialgradient": "radialGradient",
        "textpath": "textPath",
    }

    def adjustSVGTagNames(self, token):
        """Rewrite ``token["name"]`` in place to its mixed-case SVG spelling.

        Names without an entry in ``svgTagNameReplacements`` are left
        untouched.

        :arg token: the tag token whose name may be adjusted
        """
        replacement = self.svgTagNameReplacements.get(token["name"])
        if replacement is not None:
            token["name"] = replacement

    def processCharacters(self, token):
        """Process a character token in foreign content.

        A token whose data is exactly U+0000 has its data replaced with
        U+FFFD. Otherwise, any non-space character clears the parser's
        ``framesetOK`` flag. The token is then passed on to
        ``Phase.processCharacters``.

        :arg token: the character token
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Process a start tag in foreign content.

        Breakout elements (and ``font`` tags carrying a ``color``, ``face``
        or ``size`` attribute) raise a parse error, pop open elements until
        the current node is in the default namespace or is an HTML/MathML
        text integration point, and the token is handed back to the caller.

        Any other start tag is adjusted for the current node's namespace and
        inserted; a self-closing tag is popped again immediately and its
        self-closing flag is acknowledged.

        :arg token: the start tag token

        :returns: ``token`` for the breakout case, otherwise ``None``
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
                (token["name"] == "font" and
                 set(token["data"].keys()) & {"color", "face", "size"})):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        if currentNode.namespace == namespaces["mathml"]:
            self.parser.adjustMathMLAttributes(token)
        elif currentNode.namespace == namespaces["svg"]:
            self.adjustSVGTagNames(token)
            self.parser.adjustSVGAttributes(token)
        self.parser.adjustForeignAttributes(token)
        # The new element lives in the same namespace as the current node.
        token["namespace"] = currentNode.namespace
        self.tree.insertElement(token)
        if token["selfClosing"]:
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Process an end tag in foreign content.

        Walks the stack of open elements from the current node downwards.
        If a node whose lower-cased name equals the token's name is found,
        everything up to and including that node is popped. If a node in the
        default namespace is reached first, the token is delegated to the
        parser's current phase.

        :arg token: the end tag token

        :returns: ``None`` when a matching node was popped, otherwise the
            result of the current phase's ``processEndTag``
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                # Pop until the matched node itself has been removed.
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
2537 | |
class AfterBodyPhase(Phase):
    """Phase entered once the body has been closed.

    Apart from comments and the ``html`` start/end tags, every token is a
    parse error that switches the parser back to the "inBody" phase and
    hands the token back to the caller.
    """
    __slots__ = ()

    def processEOF(self):
        """Stop parsing; nothing is done at end of file."""

    def processComment(self, token):
        """Insert the comment as a child of the first open element."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        rootElement = self.tree.openElements[0]
        self.tree.insertComment(token, rootElement)

    def processCharacters(self, token):
        """Report a parse error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("unexpected-char-after-body")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("unexpected-start-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    def endTagHtml(self, name):
        """Handle the ``html`` end tag.

        With ``innerHTML`` set this is only a parse error; otherwise the
        parser moves on to the "afterAfterBody" phase.
        """
        parser = self.parser
        if parser.innerHTML:
            parser.parseError("unexpected-end-tag-after-body-innerhtml")
            return
        parser.phase = parser.phases["afterAfterBody"]

    def endTagOther(self, token):
        """Report a parse error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("unexpected-end-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([("html", startTagHtml)])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml)
    ])
    endTagHandler.default = endTagOther
2583 | |
class InFramesetPhase(Phase):
    """Phase that handles tokens while inside a ``frameset`` element."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    __slots__ = tuple()

    def processEOF(self):
        """Report a parse error unless the current node is ``html``.

        When the current node is ``html`` the parser is asserted to be in
        ``innerHTML`` mode.
        """
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-frameset")
        else:
            assert self.parser.innerHTML

    def processCharacters(self, token):
        """Report a parse error; the character token is otherwise dropped."""
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        """Insert a nested ``frameset`` element."""
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert a ``frame`` element and pop it again immediately.

        :arg token: the ``frame`` start tag token
        """
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        # The "in frameset" rules of the HTML Standard acknowledge the
        # token's self-closing flag here, so <frame/> is accepted. Setting
        # the flag is harmless when the token is not self-closing.
        # NOTE(review): presumably an unacknowledged flag is reported as a
        # parse error by the parser's main loop -- confirm against caller.
        token["selfClosingAcknowledged"] = True

    def startTagNoframes(self, token):
        """Delegate a ``noframes`` start tag to the "inBody" phase."""
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error; the start tag is otherwise dropped."""
        self.parser.parseError("unexpected-start-tag-in-frameset",
                               {"name": token["name"]})

    def endTagFrameset(self, token):
        """Close the current ``frameset`` element.

        If the current node is ``html`` only a parse error is reported;
        otherwise the current node is popped. Afterwards, outside of
        ``innerHTML`` mode, the parser switches to the "afterFrameset" phase
        once the current node is no longer a ``frameset``.
        """
        if self.tree.openElements[-1].name == "html":
            # innerHTML case
            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        else:
            self.tree.openElements.pop()
        if (not self.parser.innerHTML and
                self.tree.openElements[-1].name != "frameset"):
            # If we're not in innerHTML mode and the current node is not a
            # "frameset" element (anymore) then switch.
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagOther(self, token):
        """Report a parse error; the end tag is otherwise dropped."""
        self.parser.parseError("unexpected-end-tag-in-frameset",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("frameset", startTagFrameset),
        ("frame", startTagFrame),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("frameset", endTagFrameset)
    ])
    endTagHandler.default = endTagOther
2639 | |
class AfterFramesetPhase(Phase):
    """Phase entered after the outermost ``frameset`` has been closed.

    Only ``html`` and ``noframes`` start tags and the ``html`` end tag are
    acted upon; characters and all other tags are reported as parse errors
    and dropped.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    __slots__ = ()

    def processEOF(self):
        """Stop parsing; nothing is done at end of file."""

    def processCharacters(self, token):
        """Report a parse error; the character token is otherwise dropped."""
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        """Delegate a ``noframes`` start tag to the "inHead" phase."""
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error; the start tag is otherwise dropped."""
        errorData = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset",
                               errorData)

    def endTagHtml(self, token):
        """Switch the parser to the "afterAfterFrameset" phase."""
        parser = self.parser
        parser.phase = parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        """Report a parse error; the end tag is otherwise dropped."""
        errorData = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset",
                               errorData)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("noframes", startTagNoframes),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
    endTagHandler.default = endTagOther
2675 | |
class AfterAfterBodyPhase(Phase):
    """Phase entered after the ``html`` end tag that follows the body.

    Comments are attached to the document, space characters and ``html``
    start tags are delegated to "inBody", and anything else is a parse
    error that switches back to "inBody" and hands the token back.
    """
    __slots__ = ()

    def processEOF(self):
        """Nothing is done at end of file."""

    def processComment(self, token):
        """Insert the comment as a child of the document."""
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report a parse error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-char")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-start-tag",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    def processEndTag(self, token):
        """Report a parse error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-end-tag",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([("html", startTagHtml)])
    startTagHandler.default = startTagOther
2712 | |
class AfterAfterFramesetPhase(Phase):
    """Phase entered after the ``html`` end tag that follows a frameset.

    Comments are attached to the document; space characters and ``html``
    start tags are delegated to "inBody", ``noframes`` start tags to
    "inHead". Everything else is reported as a parse error and dropped.
    """
    __slots__ = ()

    def processEOF(self):
        """Nothing is done at end of file."""

    def processComment(self, token):
        """Insert the comment as a child of the document."""
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report a parse error; the character token is otherwise dropped."""
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagNoFrames(self, token):
        """Delegate a ``noframes`` start tag to the "inHead" phase."""
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error; the start tag is otherwise dropped."""
        errorData = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", errorData)

    def processEndTag(self, token):
        """Report a parse error; the end tag is otherwise dropped."""
        errorData = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", errorData)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("noframes", startTagNoFrames),
    ])
    startTagHandler.default = startTagOther
2747 | |
2748 # pylint:enable=unused-argument | |
2749 | |
2750 return { | |
2751 "initial": InitialPhase, | |
2752 "beforeHtml": BeforeHtmlPhase, | |
2753 "beforeHead": BeforeHeadPhase, | |
2754 "inHead": InHeadPhase, | |
2755 "inHeadNoscript": InHeadNoscriptPhase, | |
2756 "afterHead": AfterHeadPhase, | |
2757 "inBody": InBodyPhase, | |
2758 "text": TextPhase, | |
2759 "inTable": InTablePhase, | |
2760 "inTableText": InTableTextPhase, | |
2761 "inCaption": InCaptionPhase, | |
2762 "inColumnGroup": InColumnGroupPhase, | |
2763 "inTableBody": InTableBodyPhase, | |
2764 "inRow": InRowPhase, | |
2765 "inCell": InCellPhase, | |
2766 "inSelect": InSelectPhase, | |
2767 "inSelectInTable": InSelectInTablePhase, | |
2768 "inForeignContent": InForeignContentPhase, | |
2769 "afterBody": AfterBodyPhase, | |
2770 "inFrameset": InFramesetPhase, | |
2771 "afterFrameset": AfterFramesetPhase, | |
2772 "afterAfterBody": AfterAfterBodyPhase, | |
2773 "afterAfterFrameset": AfterAfterFramesetPhase, | |
2774 # XXX after after frameset | |
2775 } | |
2776 | |
2777 | |
def adjust_attributes(token, replacements):
    """Rename attributes of a token according to a replacement mapping

    :arg token: tag token whose ``'data'`` entry maps attribute names to
        values; it is replaced by a new mapping of the same type when at
        least one attribute name has a replacement

    :arg replacements: mapping of attribute name to the name that should
        be used instead

    """
    attributes = token['data']
    if not (viewkeys(attributes) & viewkeys(replacements)):
        # No attribute needs renaming; leave the token untouched.
        return
    renamed = ((replacements.get(name, name), value)
               for name, value in attributes.items())
    # Rebuild with the same mapping type as the original attributes.
    token['data'] = type(attributes)(renamed)
2783 | |
2784 | |
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token that was not produced from the input

    :arg name: the tag name

    :arg type: key into ``tokenTypes`` selecting the token type

    :arg attributes: attribute mapping stored under ``"data"``; a fresh
        empty dict is used when ``None``

    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: a dict with the keys ``"type"``, ``"name"``, ``"data"`` and
        ``"selfClosing"``

    """
    token = {
        "type": tokenTypes[type],
        "name": name,
        "data": {} if attributes is None else attributes,
        "selfClosing": selfClosing,
    }
    return token
2791 | |
2792 | |
class ParseError(Exception):
    """Exception type representing an error in the parsed document"""