env/lib/python3.9/site-packages/bleach/_vendor/html5lib/_tokenizer.py @ 0:4f3585e2f14b

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
1 from __future__ import absolute_import, division, unicode_literals
2
3 from six import unichr as chr
4
5 from collections import deque, OrderedDict
6 from sys import version_info
7
8 from .constants import spaceCharacters
9 from .constants import entities
10 from .constants import asciiLetters, asciiUpper2Lower
11 from .constants import digits, hexDigits, EOF
12 from .constants import tokenTypes, tagTokenTypes
13 from .constants import replacementCharacters
14
15 from ._inputstream import HTMLInputStream
16
17 from ._trie import Trie
18
19 entitiesTrie = Trie(entities)
20
21 if version_info >= (3, 7):
22 attributeMap = dict
23 else:
24 attributeMap = OrderedDict
25
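# From Python 3.7 the language guarantees insertion-ordered dicts, so a
# plain dict can stand in for OrderedDict when building attribute maps; e.g.:
#
#     attrs = attributeMap([("href", "/"), ("id", "x")])
#     assert list(attrs) == ["href", "id"]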
26
27 class HTMLTokenizer(object):
28 """ This class takes care of tokenizing HTML.
29
30 * self.currentToken
31 Holds the token that is currently being processed.
32
33 * self.state
34 Holds a reference to the method to be invoked for the current state.
35
36 * self.stream
37 Points to HTMLInputStream object.
38 """
39
40 def __init__(self, stream, parser=None, **kwargs):
41
42 self.stream = HTMLInputStream(stream, **kwargs)
43 self.parser = parser
44
45 # Setup the initial tokenizer state
46 self.escapeFlag = False
47 self.lastFourChars = []
48 self.state = self.dataState
49 self.escape = False
50
51 # The current token being created
52 self.currentToken = None
53 super(HTMLTokenizer, self).__init__()
54
55 def __iter__(self):
56 """ This is where the magic happens.
57
58 We do our usual processing through the states and when we have a token
59 to return we yield the token which pauses processing until the next token
60 is requested.
61 """
62 self.tokenQueue = deque([])
63 # Start processing. When EOF is reached self.state will return False
64 # instead of True and the loop will terminate.
65 while self.state():
66 while self.stream.errors:
67 yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
68 while self.tokenQueue:
69 yield self.tokenQueue.popleft()
70
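# A minimal usage sketch: iterating the tokenizer yields token dicts whose
# "type" values index into tokenTypes (imported above), e.g.:
#
#     names = {v: k for k, v in tokenTypes.items()}
#     for token in HTMLTokenizer("<p class=x>hi</p>"):
#         print(names[token["type"]], token.get("name", token.get("data")))
#
# which prints a StartTag for "p", a Characters token for "hi", and an
# EndTag for "p".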
71 def consumeNumberEntity(self, isHex):
72 """This function returns either U+FFFD or the character based on the
73 decimal or hexadecimal representation. It also discards ";" if present.
74 If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
75 """
76
77 allowed = digits
78 radix = 10
79 if isHex:
80 allowed = hexDigits
81 radix = 16
82
83 charStack = []
84
85 # Consume all the characters that are in range while making sure we
86 # don't hit an EOF.
87 c = self.stream.char()
88 while c in allowed and c is not EOF:
89 charStack.append(c)
90 c = self.stream.char()
91
92 # Convert the set of characters consumed to an int.
93 charAsInt = int("".join(charStack), radix)
94
95 # Certain characters get replaced with others
96 if charAsInt in replacementCharacters:
97 char = replacementCharacters[charAsInt]
98 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
99 "illegal-codepoint-for-numeric-entity",
100 "datavars": {"charAsInt": charAsInt}})
101 elif ((0xD800 <= charAsInt <= 0xDFFF) or
102 (charAsInt > 0x10FFFF)):
103 char = "\uFFFD"
104 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
105 "illegal-codepoint-for-numeric-entity",
106 "datavars": {"charAsInt": charAsInt}})
107 else:
108 # Should speed up this check somehow (e.g. move the set to a constant)
109 if ((0x0001 <= charAsInt <= 0x0008) or
110 (0x000E <= charAsInt <= 0x001F) or
111 (0x007F <= charAsInt <= 0x009F) or
112 (0xFDD0 <= charAsInt <= 0xFDEF) or
113 charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
114 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
115 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
116 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
117 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
118 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
119 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
120 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
121 0xFFFFF, 0x10FFFE, 0x10FFFF])):
122 self.tokenQueue.append({"type": tokenTypes["ParseError"],
123 "data":
124 "illegal-codepoint-for-numeric-entity",
125 "datavars": {"charAsInt": charAsInt}})
126 try:
127 # Try/except needed as UCS-2 Python builds' unichr only works
128 # within the BMP.
129 char = chr(charAsInt)
130 except ValueError:
131 v = charAsInt - 0x10000
132 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
133
134 # Discard the ";" if present. Otherwise, unget it back onto the stream
135 # and queue a parse error.
136 if c != ";":
137 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
138 "numeric-entity-without-semicolon"})
139 self.stream.unget(c)
140
141 return char
142
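# Behaviour sketch for the numeric paths above (stream positioned just
# past "&#"):
#
#     "&#65;"    -> "A"        decimal
#     "&#x41;"   -> "A"        hexadecimal
#     "&#x80;"   -> "\u20AC"   remapped via replacementCharacters
#     "&#xD800;" -> "\uFFFD"   surrogate, plus an illegal-codepoint ParseError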
143 def consumeEntity(self, allowedChar=None, fromAttribute=False):
144 # Initialise to the default output for when no entity is matched
145 output = "&"
146
147 charStack = [self.stream.char()]
148 if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
149 (allowedChar is not None and allowedChar == charStack[0])):
150 self.stream.unget(charStack[0])
151
152 elif charStack[0] == "#":
153 # Read the next character to see if it's hex or decimal
154 hex = False
155 charStack.append(self.stream.char())
156 if charStack[-1] in ("x", "X"):
157 hex = True
158 charStack.append(self.stream.char())
159
160 # charStack[-1] should be the first digit
161 if (hex and charStack[-1] in hexDigits) \
162 or (not hex and charStack[-1] in digits):
163 # At least one digit found, so consume the whole number
164 self.stream.unget(charStack[-1])
165 output = self.consumeNumberEntity(hex)
166 else:
167 # No digits found
168 self.tokenQueue.append({"type": tokenTypes["ParseError"],
169 "data": "expected-numeric-entity"})
170 self.stream.unget(charStack.pop())
171 output = "&" + "".join(charStack)
172
173 else:
174 # At this point in the process we might have a named entity. Entities
175 # are stored in the global variable "entities".
176 #
177 # Consume characters and compare them to a substring of the
178 # entity names in the list until the substring no longer matches.
179 while (charStack[-1] is not EOF):
180 if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
181 break
182 charStack.append(self.stream.char())
183
184 # At this point we have a string that starts with some characters
185 # that may match an entity.
186 # Try to find the longest entity the string will match, to handle
187 # cases like &noti (which should resolve to &not followed by "i").
188 try:
189 entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
190 entityLength = len(entityName)
191 except KeyError:
192 entityName = None
193
194 if entityName is not None:
195 if entityName[-1] != ";":
196 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
197 "named-entity-without-semicolon"})
198 if (entityName[-1] != ";" and fromAttribute and
199 (charStack[entityLength] in asciiLetters or
200 charStack[entityLength] in digits or
201 charStack[entityLength] == "=")):
202 self.stream.unget(charStack.pop())
203 output = "&" + "".join(charStack)
204 else:
205 output = entities[entityName]
206 self.stream.unget(charStack.pop())
207 output += "".join(charStack[entityLength:])
208 else:
209 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
210 "expected-named-entity"})
211 self.stream.unget(charStack.pop())
212 output = "&" + "".join(charStack)
213
214 if fromAttribute:
215 self.currentToken["data"][-1][1] += output
216 else:
217 if output in spaceCharacters:
218 tokenType = "SpaceCharacters"
219 else:
220 tokenType = "Characters"
221 self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
222
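# Longest-prefix sketch: entitiesTrie resolves the longest matching name,
# so "&notin;" becomes U+2209 while "&noti" falls back to &not (U+00AC)
# followed by a literal "i", together with a
# named-entity-without-semicolon ParseError.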
223 def processEntityInAttribute(self, allowedChar):
224 """This method replaces the need for "entityInAttributeValueState".
225 """
226 self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
227
228 def emitCurrentToken(self):
229 """This method is a generic handler for emitting the tags. It also sets
230 the state to "data" because that's what's needed after a token has been
231 emitted.
232 """
233 token = self.currentToken
234 # Add token to the queue to be yielded
235 if (token["type"] in tagTokenTypes):
236 token["name"] = token["name"].translate(asciiUpper2Lower)
237 if token["type"] == tokenTypes["StartTag"]:
238 raw = token["data"]
239 data = attributeMap(raw)
240 if len(raw) > len(data):
241 # we had some duplicated attribute, fix so first wins
242 data.update(raw[::-1])
243 token["data"] = data
244
245 if token["type"] == tokenTypes["EndTag"]:
246 if token["data"]:
247 self.tokenQueue.append({"type": tokenTypes["ParseError"],
248 "data": "attributes-in-end-tag"})
249 if token["selfClosing"]:
250 self.tokenQueue.append({"type": tokenTypes["ParseError"],
251 "data": "self-closing-flag-on-end-tag"})
252 self.tokenQueue.append(token)
253 self.state = self.dataState
254
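# Duplicate-attribute sketch: token["data"] arrives as [name, value] pairs
# and attributeMap(raw) alone would let the LAST duplicate win, so updating
# with the reversed list restores first-wins semantics:
#
#     raw = [["id", "a"], ["id", "b"]]
#     data = attributeMap(raw)   # {"id": "b"}
#     data.update(raw[::-1])     # {"id": "a"} -- first occurrence wins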
255 # Below are the various tokenizer states worked out.
256 def dataState(self):
257 data = self.stream.char()
258 if data == "&":
259 self.state = self.entityDataState
260 elif data == "<":
261 self.state = self.tagOpenState
262 elif data == "\u0000":
263 self.tokenQueue.append({"type": tokenTypes["ParseError"],
264 "data": "invalid-codepoint"})
265 self.tokenQueue.append({"type": tokenTypes["Characters"],
266 "data": "\u0000"})
267 elif data is EOF:
268 # Tokenization ends.
269 return False
270 elif data in spaceCharacters:
271 # Directly after emitting a token you switch back to the "data
272 # state". At that point spaceCharacters are important so they are
273 # emitted separately.
274 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
275 data + self.stream.charsUntil(spaceCharacters, True)})
276 # No need to update lastFourChars here, since the first space will
277 # have already been appended to lastFourChars and will have broken
278 # any <!-- or --> sequences
279 else:
280 chars = self.stream.charsUntil(("&", "<", "\u0000"))
281 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
282 data + chars})
283 return True
284
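# Data-state sketch: plain text is batched via charsUntil, so "abc<p>"
# emits one Characters token "abc" before tagOpenState runs; note that a
# bare NUL is passed through here as Characters "\u0000" (plus an
# invalid-codepoint ParseError), unlike later states which substitute
# U+FFFD.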
285 def entityDataState(self):
286 self.consumeEntity()
287 self.state = self.dataState
288 return True
289
290 def rcdataState(self):
291 data = self.stream.char()
292 if data == "&":
293 self.state = self.characterReferenceInRcdata
294 elif data == "<":
295 self.state = self.rcdataLessThanSignState
296 elif data == EOF:
297 # Tokenization ends.
298 return False
299 elif data == "\u0000":
300 self.tokenQueue.append({"type": tokenTypes["ParseError"],
301 "data": "invalid-codepoint"})
302 self.tokenQueue.append({"type": tokenTypes["Characters"],
303 "data": "\uFFFD"})
304 elif data in spaceCharacters:
305 # Directly after emitting a token you switch back to the "data
306 # state". At that point spaceCharacters are important so they are
307 # emitted separately.
308 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
309 data + self.stream.charsUntil(spaceCharacters, True)})
310 # No need to update lastFourChars here, since the first space will
311 # have already been appended to lastFourChars and will have broken
312 # any <!-- or --> sequences
313 else:
314 chars = self.stream.charsUntil(("&", "<", "\u0000"))
315 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
316 data + chars})
317 return True
318
319 def characterReferenceInRcdata(self):
320 self.consumeEntity()
321 self.state = self.rcdataState
322 return True
323
324 def rawtextState(self):
325 data = self.stream.char()
326 if data == "<":
327 self.state = self.rawtextLessThanSignState
328 elif data == "\u0000":
329 self.tokenQueue.append({"type": tokenTypes["ParseError"],
330 "data": "invalid-codepoint"})
331 self.tokenQueue.append({"type": tokenTypes["Characters"],
332 "data": "\uFFFD"})
333 elif data == EOF:
334 # Tokenization ends.
335 return False
336 else:
337 chars = self.stream.charsUntil(("<", "\u0000"))
338 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
339 data + chars})
340 return True
341
342 def scriptDataState(self):
343 data = self.stream.char()
344 if data == "<":
345 self.state = self.scriptDataLessThanSignState
346 elif data == "\u0000":
347 self.tokenQueue.append({"type": tokenTypes["ParseError"],
348 "data": "invalid-codepoint"})
349 self.tokenQueue.append({"type": tokenTypes["Characters"],
350 "data": "\uFFFD"})
351 elif data == EOF:
352 # Tokenization ends.
353 return False
354 else:
355 chars = self.stream.charsUntil(("<", "\u0000"))
356 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
357 data + chars})
358 return True
359
360 def plaintextState(self):
361 data = self.stream.char()
362 if data == EOF:
363 # Tokenization ends.
364 return False
365 elif data == "\u0000":
366 self.tokenQueue.append({"type": tokenTypes["ParseError"],
367 "data": "invalid-codepoint"})
368 self.tokenQueue.append({"type": tokenTypes["Characters"],
369 "data": "\uFFFD"})
370 else:
371 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
372 data + self.stream.charsUntil("\u0000")})
373 return True
374
375 def tagOpenState(self):
376 data = self.stream.char()
377 if data == "!":
378 self.state = self.markupDeclarationOpenState
379 elif data == "/":
380 self.state = self.closeTagOpenState
381 elif data in asciiLetters:
382 self.currentToken = {"type": tokenTypes["StartTag"],
383 "name": data, "data": [],
384 "selfClosing": False,
385 "selfClosingAcknowledged": False}
386 self.state = self.tagNameState
387 elif data == ">":
388 # XXX In theory it could be something besides a tag name. But
389 # do we really care?
390 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
391 "expected-tag-name-but-got-right-bracket"})
392 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
393 self.state = self.dataState
394 elif data == "?":
395 # XXX In theory it could be something besides a tag name. But
396 # do we really care?
397 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
398 "expected-tag-name-but-got-question-mark"})
399 self.stream.unget(data)
400 self.state = self.bogusCommentState
401 else:
402 # XXX
403 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
404 "expected-tag-name"})
405 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
406 self.stream.unget(data)
407 self.state = self.dataState
408 return True
409
410 def closeTagOpenState(self):
411 data = self.stream.char()
412 if data in asciiLetters:
413 self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
414 "data": [], "selfClosing": False}
415 self.state = self.tagNameState
416 elif data == ">":
417 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
418 "expected-closing-tag-but-got-right-bracket"})
419 self.state = self.dataState
420 elif data is EOF:
421 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
422 "expected-closing-tag-but-got-eof"})
423 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
424 self.state = self.dataState
425 else:
426 # XXX data can be _'_...
427 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
428 "expected-closing-tag-but-got-char",
429 "datavars": {"data": data}})
430 self.stream.unget(data)
431 self.state = self.bogusCommentState
432 return True
433
434 def tagNameState(self):
435 data = self.stream.char()
436 if data in spaceCharacters:
437 self.state = self.beforeAttributeNameState
438 elif data == ">":
439 self.emitCurrentToken()
440 elif data is EOF:
441 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
442 "eof-in-tag-name"})
443 self.state = self.dataState
444 elif data == "/":
445 self.state = self.selfClosingStartTagState
446 elif data == "\u0000":
447 self.tokenQueue.append({"type": tokenTypes["ParseError"],
448 "data": "invalid-codepoint"})
449 self.currentToken["name"] += "\uFFFD"
450 else:
451 self.currentToken["name"] += data
452 # (Don't use charsUntil here, because tag names are
453 # very short and it's faster to not do anything fancy)
454 return True
455
456 def rcdataLessThanSignState(self):
457 data = self.stream.char()
458 if data == "/":
459 self.temporaryBuffer = ""
460 self.state = self.rcdataEndTagOpenState
461 else:
462 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
463 self.stream.unget(data)
464 self.state = self.rcdataState
465 return True
466
467 def rcdataEndTagOpenState(self):
468 data = self.stream.char()
469 if data in asciiLetters:
470 self.temporaryBuffer += data
471 self.state = self.rcdataEndTagNameState
472 else:
473 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
474 self.stream.unget(data)
475 self.state = self.rcdataState
476 return True
477
478 def rcdataEndTagNameState(self):
479 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
480 data = self.stream.char()
481 if data in spaceCharacters and appropriate:
482 self.currentToken = {"type": tokenTypes["EndTag"],
483 "name": self.temporaryBuffer,
484 "data": [], "selfClosing": False}
485 self.state = self.beforeAttributeNameState
486 elif data == "/" and appropriate:
487 self.currentToken = {"type": tokenTypes["EndTag"],
488 "name": self.temporaryBuffer,
489 "data": [], "selfClosing": False}
490 self.state = self.selfClosingStartTagState
491 elif data == ">" and appropriate:
492 self.currentToken = {"type": tokenTypes["EndTag"],
493 "name": self.temporaryBuffer,
494 "data": [], "selfClosing": False}
495 self.emitCurrentToken()
496 self.state = self.dataState
497 elif data in asciiLetters:
498 self.temporaryBuffer += data
499 else:
500 self.tokenQueue.append({"type": tokenTypes["Characters"],
501 "data": "</" + self.temporaryBuffer})
502 self.stream.unget(data)
503 self.state = self.rcdataState
504 return True
505
506 def rawtextLessThanSignState(self):
507 data = self.stream.char()
508 if data == "/":
509 self.temporaryBuffer = ""
510 self.state = self.rawtextEndTagOpenState
511 else:
512 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
513 self.stream.unget(data)
514 self.state = self.rawtextState
515 return True
516
517 def rawtextEndTagOpenState(self):
518 data = self.stream.char()
519 if data in asciiLetters:
520 self.temporaryBuffer += data
521 self.state = self.rawtextEndTagNameState
522 else:
523 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
524 self.stream.unget(data)
525 self.state = self.rawtextState
526 return True
527
528 def rawtextEndTagNameState(self):
529 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
530 data = self.stream.char()
531 if data in spaceCharacters and appropriate:
532 self.currentToken = {"type": tokenTypes["EndTag"],
533 "name": self.temporaryBuffer,
534 "data": [], "selfClosing": False}
535 self.state = self.beforeAttributeNameState
536 elif data == "/" and appropriate:
537 self.currentToken = {"type": tokenTypes["EndTag"],
538 "name": self.temporaryBuffer,
539 "data": [], "selfClosing": False}
540 self.state = self.selfClosingStartTagState
541 elif data == ">" and appropriate:
542 self.currentToken = {"type": tokenTypes["EndTag"],
543 "name": self.temporaryBuffer,
544 "data": [], "selfClosing": False}
545 self.emitCurrentToken()
546 self.state = self.dataState
547 elif data in asciiLetters:
548 self.temporaryBuffer += data
549 else:
550 self.tokenQueue.append({"type": tokenTypes["Characters"],
551 "data": "</" + self.temporaryBuffer})
552 self.stream.unget(data)
553 self.state = self.rawtextState
554 return True
555
556 def scriptDataLessThanSignState(self):
557 data = self.stream.char()
558 if data == "/":
559 self.temporaryBuffer = ""
560 self.state = self.scriptDataEndTagOpenState
561 elif data == "!":
562 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
563 self.state = self.scriptDataEscapeStartState
564 else:
565 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
566 self.stream.unget(data)
567 self.state = self.scriptDataState
568 return True
569
570 def scriptDataEndTagOpenState(self):
571 data = self.stream.char()
572 if data in asciiLetters:
573 self.temporaryBuffer += data
574 self.state = self.scriptDataEndTagNameState
575 else:
576 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
577 self.stream.unget(data)
578 self.state = self.scriptDataState
579 return True
580
581 def scriptDataEndTagNameState(self):
582 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
583 data = self.stream.char()
584 if data in spaceCharacters and appropriate:
585 self.currentToken = {"type": tokenTypes["EndTag"],
586 "name": self.temporaryBuffer,
587 "data": [], "selfClosing": False}
588 self.state = self.beforeAttributeNameState
589 elif data == "/" and appropriate:
590 self.currentToken = {"type": tokenTypes["EndTag"],
591 "name": self.temporaryBuffer,
592 "data": [], "selfClosing": False}
593 self.state = self.selfClosingStartTagState
594 elif data == ">" and appropriate:
595 self.currentToken = {"type": tokenTypes["EndTag"],
596 "name": self.temporaryBuffer,
597 "data": [], "selfClosing": False}
598 self.emitCurrentToken()
599 self.state = self.dataState
600 elif data in asciiLetters:
601 self.temporaryBuffer += data
602 else:
603 self.tokenQueue.append({"type": tokenTypes["Characters"],
604 "data": "</" + self.temporaryBuffer})
605 self.stream.unget(data)
606 self.state = self.scriptDataState
607 return True
608
609 def scriptDataEscapeStartState(self):
610 data = self.stream.char()
611 if data == "-":
612 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
613 self.state = self.scriptDataEscapeStartDashState
614 else:
615 self.stream.unget(data)
616 self.state = self.scriptDataState
617 return True
618
619 def scriptDataEscapeStartDashState(self):
620 data = self.stream.char()
621 if data == "-":
622 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
623 self.state = self.scriptDataEscapedDashDashState
624 else:
625 self.stream.unget(data)
626 self.state = self.scriptDataState
627 return True
628
629 def scriptDataEscapedState(self):
630 data = self.stream.char()
631 if data == "-":
632 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
633 self.state = self.scriptDataEscapedDashState
634 elif data == "<":
635 self.state = self.scriptDataEscapedLessThanSignState
636 elif data == "\u0000":
637 self.tokenQueue.append({"type": tokenTypes["ParseError"],
638 "data": "invalid-codepoint"})
639 self.tokenQueue.append({"type": tokenTypes["Characters"],
640 "data": "\uFFFD"})
641 elif data == EOF:
642 self.state = self.dataState
643 else:
644 chars = self.stream.charsUntil(("<", "-", "\u0000"))
645 self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
646 data + chars})
647 return True
648
649 def scriptDataEscapedDashState(self):
650 data = self.stream.char()
651 if data == "-":
652 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
653 self.state = self.scriptDataEscapedDashDashState
654 elif data == "<":
655 self.state = self.scriptDataEscapedLessThanSignState
656 elif data == "\u0000":
657 self.tokenQueue.append({"type": tokenTypes["ParseError"],
658 "data": "invalid-codepoint"})
659 self.tokenQueue.append({"type": tokenTypes["Characters"],
660 "data": "\uFFFD"})
661 self.state = self.scriptDataEscapedState
662 elif data == EOF:
663 self.state = self.dataState
664 else:
665 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
666 self.state = self.scriptDataEscapedState
667 return True
668
669 def scriptDataEscapedDashDashState(self):
670 data = self.stream.char()
671 if data == "-":
672 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
673 elif data == "<":
674 self.state = self.scriptDataEscapedLessThanSignState
675 elif data == ">":
676 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
677 self.state = self.scriptDataState
678 elif data == "\u0000":
679 self.tokenQueue.append({"type": tokenTypes["ParseError"],
680 "data": "invalid-codepoint"})
681 self.tokenQueue.append({"type": tokenTypes["Characters"],
682 "data": "\uFFFD"})
683 self.state = self.scriptDataEscapedState
684 elif data == EOF:
685 self.state = self.dataState
686 else:
687 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
688 self.state = self.scriptDataEscapedState
689 return True
690
691 def scriptDataEscapedLessThanSignState(self):
692 data = self.stream.char()
693 if data == "/":
694 self.temporaryBuffer = ""
695 self.state = self.scriptDataEscapedEndTagOpenState
696 elif data in asciiLetters:
697 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
698 self.temporaryBuffer = data
699 self.state = self.scriptDataDoubleEscapeStartState
700 else:
701 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
702 self.stream.unget(data)
703 self.state = self.scriptDataEscapedState
704 return True
705
706 def scriptDataEscapedEndTagOpenState(self):
707 data = self.stream.char()
708 if data in asciiLetters:
709 self.temporaryBuffer = data
710 self.state = self.scriptDataEscapedEndTagNameState
711 else:
712 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
713 self.stream.unget(data)
714 self.state = self.scriptDataEscapedState
715 return True
716
717 def scriptDataEscapedEndTagNameState(self):
718 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
719 data = self.stream.char()
720 if data in spaceCharacters and appropriate:
721 self.currentToken = {"type": tokenTypes["EndTag"],
722 "name": self.temporaryBuffer,
723 "data": [], "selfClosing": False}
724 self.state = self.beforeAttributeNameState
725 elif data == "/" and appropriate:
726 self.currentToken = {"type": tokenTypes["EndTag"],
727 "name": self.temporaryBuffer,
728 "data": [], "selfClosing": False}
729 self.state = self.selfClosingStartTagState
730 elif data == ">" and appropriate:
731 self.currentToken = {"type": tokenTypes["EndTag"],
732 "name": self.temporaryBuffer,
733 "data": [], "selfClosing": False}
734 self.emitCurrentToken()
735 self.state = self.dataState
736 elif data in asciiLetters:
737 self.temporaryBuffer += data
738 else:
739 self.tokenQueue.append({"type": tokenTypes["Characters"],
740 "data": "</" + self.temporaryBuffer})
741 self.stream.unget(data)
742 self.state = self.scriptDataEscapedState
743 return True
744
745 def scriptDataDoubleEscapeStartState(self):
746 data = self.stream.char()
747 if data in (spaceCharacters | frozenset(("/", ">"))):
748 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
749 if self.temporaryBuffer.lower() == "script":
750 self.state = self.scriptDataDoubleEscapedState
751 else:
752 self.state = self.scriptDataEscapedState
753 elif data in asciiLetters:
754 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
755 self.temporaryBuffer += data
756 else:
757 self.stream.unget(data)
758 self.state = self.scriptDataEscapedState
759 return True
760
761 def scriptDataDoubleEscapedState(self):
762 data = self.stream.char()
763 if data == "-":
764 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
765 self.state = self.scriptDataDoubleEscapedDashState
766 elif data == "<":
767 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
768 self.state = self.scriptDataDoubleEscapedLessThanSignState
769 elif data == "\u0000":
770 self.tokenQueue.append({"type": tokenTypes["ParseError"],
771 "data": "invalid-codepoint"})
772 self.tokenQueue.append({"type": tokenTypes["Characters"],
773 "data": "\uFFFD"})
774 elif data == EOF:
775 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
776 "eof-in-script-in-script"})
777 self.state = self.dataState
778 else:
779 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
780 return True
781
782 def scriptDataDoubleEscapedDashState(self):
783 data = self.stream.char()
784 if data == "-":
785 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
786 self.state = self.scriptDataDoubleEscapedDashDashState
787 elif data == "<":
788 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
789 self.state = self.scriptDataDoubleEscapedLessThanSignState
790 elif data == "\u0000":
791 self.tokenQueue.append({"type": tokenTypes["ParseError"],
792 "data": "invalid-codepoint"})
793 self.tokenQueue.append({"type": tokenTypes["Characters"],
794 "data": "\uFFFD"})
795 self.state = self.scriptDataDoubleEscapedState
796 elif data == EOF:
797 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
798 "eof-in-script-in-script"})
799 self.state = self.dataState
800 else:
801 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
802 self.state = self.scriptDataDoubleEscapedState
803 return True
804
805 def scriptDataDoubleEscapedDashDashState(self):
806 data = self.stream.char()
807 if data == "-":
808 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
809 elif data == "<":
810 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
811 self.state = self.scriptDataDoubleEscapedLessThanSignState
812 elif data == ">":
813 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
814 self.state = self.scriptDataState
815 elif data == "\u0000":
816 self.tokenQueue.append({"type": tokenTypes["ParseError"],
817 "data": "invalid-codepoint"})
818 self.tokenQueue.append({"type": tokenTypes["Characters"],
819 "data": "\uFFFD"})
820 self.state = self.scriptDataDoubleEscapedState
821 elif data == EOF:
822 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
823 "eof-in-script-in-script"})
824 self.state = self.dataState
825 else:
826 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
827 self.state = self.scriptDataDoubleEscapedState
828 return True
829
830 def scriptDataDoubleEscapedLessThanSignState(self):
831 data = self.stream.char()
832 if data == "/":
833 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
834 self.temporaryBuffer = ""
835 self.state = self.scriptDataDoubleEscapeEndState
836 else:
837 self.stream.unget(data)
838 self.state = self.scriptDataDoubleEscapedState
839 return True
840
841 def scriptDataDoubleEscapeEndState(self):
842 data = self.stream.char()
843 if data in (spaceCharacters | frozenset(("/", ">"))):
844 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
845 if self.temporaryBuffer.lower() == "script":
846 self.state = self.scriptDataEscapedState
847 else:
848 self.state = self.scriptDataDoubleEscapedState
849 elif data in asciiLetters:
850 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
851 self.temporaryBuffer += data
852 else:
853 self.stream.unget(data)
854 self.state = self.scriptDataDoubleEscapedState
855 return True
856
857 def beforeAttributeNameState(self):
858 data = self.stream.char()
859 if data in spaceCharacters:
860 self.stream.charsUntil(spaceCharacters, True)
861 elif data in asciiLetters:
862 self.currentToken["data"].append([data, ""])
863 self.state = self.attributeNameState
864 elif data == ">":
865 self.emitCurrentToken()
866 elif data == "/":
867 self.state = self.selfClosingStartTagState
868 elif data in ("'", '"', "=", "<"):
869 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
870 "invalid-character-in-attribute-name"})
871 self.currentToken["data"].append([data, ""])
872 self.state = self.attributeNameState
873 elif data == "\u0000":
874 self.tokenQueue.append({"type": tokenTypes["ParseError"],
875 "data": "invalid-codepoint"})
876 self.currentToken["data"].append(["\uFFFD", ""])
877 self.state = self.attributeNameState
878 elif data is EOF:
879 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
880 "expected-attribute-name-but-got-eof"})
881 self.state = self.dataState
882 else:
883 self.currentToken["data"].append([data, ""])
884 self.state = self.attributeNameState
885 return True
886
887 def attributeNameState(self):
888 data = self.stream.char()
889 leavingThisState = True
890 emitToken = False
891 if data == "=":
892 self.state = self.beforeAttributeValueState
893 elif data in asciiLetters:
894 self.currentToken["data"][-1][0] += data +\
895 self.stream.charsUntil(asciiLetters, True)
896 leavingThisState = False
897 elif data == ">":
898 # XXX If we emit here, the attributes are converted to a dict
899 # without being checked, and when the code below runs we error
900 # because data is a dict, not a list
901 emitToken = True
902 elif data in spaceCharacters:
903 self.state = self.afterAttributeNameState
904 elif data == "/":
905 self.state = self.selfClosingStartTagState
906 elif data == "\u0000":
907 self.tokenQueue.append({"type": tokenTypes["ParseError"],
908 "data": "invalid-codepoint"})
909 self.currentToken["data"][-1][0] += "\uFFFD"
910 leavingThisState = False
911 elif data in ("'", '"', "<"):
912 self.tokenQueue.append({"type": tokenTypes["ParseError"],
913 "data":
914 "invalid-character-in-attribute-name"})
915 self.currentToken["data"][-1][0] += data
916 leavingThisState = False
917 elif data is EOF:
918 self.tokenQueue.append({"type": tokenTypes["ParseError"],
919 "data": "eof-in-attribute-name"})
920 self.state = self.dataState
921 else:
922 self.currentToken["data"][-1][0] += data
923 leavingThisState = False
924
925 if leavingThisState:
926 # Attributes are not dropped at this stage. That happens when the
927 # start tag token is emitted so values can still be safely appended
928 # to attributes, but we do want to report the parse error in time.
929 self.currentToken["data"][-1][0] = (
930 self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
931 for name, _ in self.currentToken["data"][:-1]:
932 if self.currentToken["data"][-1][0] == name:
933 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
934 "duplicate-attribute"})
935 break
936 # XXX Fix for above XXX
937 if emitToken:
938 self.emitCurrentToken()
939 return True
940
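# Duplicate-name sketch: on leaving attributeNameState the name is
# lower-cased and checked against earlier attributes, so "<p ID=a id=b>"
# queues a duplicate-attribute ParseError here; emitCurrentToken later
# keeps the first value.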
941 def afterAttributeNameState(self):
942 data = self.stream.char()
943 if data in spaceCharacters:
944 self.stream.charsUntil(spaceCharacters, True)
945 elif data == "=":
946 self.state = self.beforeAttributeValueState
947 elif data == ">":
948 self.emitCurrentToken()
949 elif data in asciiLetters:
950 self.currentToken["data"].append([data, ""])
951 self.state = self.attributeNameState
952 elif data == "/":
953 self.state = self.selfClosingStartTagState
954 elif data == "\u0000":
955 self.tokenQueue.append({"type": tokenTypes["ParseError"],
956 "data": "invalid-codepoint"})
957 self.currentToken["data"].append(["\uFFFD", ""])
958 self.state = self.attributeNameState
959 elif data in ("'", '"', "<"):
960 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
961 "invalid-character-after-attribute-name"})
962 self.currentToken["data"].append([data, ""])
963 self.state = self.attributeNameState
964 elif data is EOF:
965 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
966 "expected-end-of-tag-but-got-eof"})
967 self.state = self.dataState
968 else:
969 self.currentToken["data"].append([data, ""])
970 self.state = self.attributeNameState
971 return True
972
973 def beforeAttributeValueState(self):
974 data = self.stream.char()
975 if data in spaceCharacters:
976 self.stream.charsUntil(spaceCharacters, True)
977 elif data == "\"":
978 self.state = self.attributeValueDoubleQuotedState
979 elif data == "&":
980 self.state = self.attributeValueUnQuotedState
981 self.stream.unget(data)
982 elif data == "'":
983 self.state = self.attributeValueSingleQuotedState
984 elif data == ">":
985 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
986 "expected-attribute-value-but-got-right-bracket"})
987 self.emitCurrentToken()
988 elif data == "\u0000":
989 self.tokenQueue.append({"type": tokenTypes["ParseError"],
990 "data": "invalid-codepoint"})
991 self.currentToken["data"][-1][1] += "\uFFFD"
992 self.state = self.attributeValueUnQuotedState
993 elif data in ("=", "<", "`"):
994 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
995 "equals-in-unquoted-attribute-value"})
996 self.currentToken["data"][-1][1] += data
997 self.state = self.attributeValueUnQuotedState
998 elif data is EOF:
999 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1000 "expected-attribute-value-but-got-eof"})
1001 self.state = self.dataState
1002 else:
1003 self.currentToken["data"][-1][1] += data
1004 self.state = self.attributeValueUnQuotedState
1005 return True
1006
1007 def attributeValueDoubleQuotedState(self):
1008 data = self.stream.char()
1009 if data == "\"":
1010 self.state = self.afterAttributeValueState
1011 elif data == "&":
1012 self.processEntityInAttribute('"')
1013 elif data == "\u0000":
1014 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1015 "data": "invalid-codepoint"})
1016 self.currentToken["data"][-1][1] += "\uFFFD"
1017 elif data is EOF:
1018 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1019 "eof-in-attribute-value-double-quote"})
1020 self.state = self.dataState
1021 else:
1022 self.currentToken["data"][-1][1] += data +\
1023 self.stream.charsUntil(("\"", "&", "\u0000"))
1024 return True
1025
1026 def attributeValueSingleQuotedState(self):
1027 data = self.stream.char()
1028 if data == "'":
1029 self.state = self.afterAttributeValueState
1030 elif data == "&":
1031 self.processEntityInAttribute("'")
1032 elif data == "\u0000":
1033 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1034 "data": "invalid-codepoint"})
1035 self.currentToken["data"][-1][1] += "\uFFFD"
1036 elif data is EOF:
1037 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1038 "eof-in-attribute-value-single-quote"})
1039 self.state = self.dataState
1040 else:
1041 self.currentToken["data"][-1][1] += data +\
1042 self.stream.charsUntil(("'", "&", "\u0000"))
1043 return True
1044
1045 def attributeValueUnQuotedState(self):
1046 data = self.stream.char()
1047 if data in spaceCharacters:
1048 self.state = self.beforeAttributeNameState
1049 elif data == "&":
1050 self.processEntityInAttribute(">")
1051 elif data == ">":
1052 self.emitCurrentToken()
1053 elif data in ('"', "'", "=", "<", "`"):
1054 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1055 "unexpected-character-in-unquoted-attribute-value"})
1056 self.currentToken["data"][-1][1] += data
1057 elif data == "\u0000":
1058 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1059 "data": "invalid-codepoint"})
1060 self.currentToken["data"][-1][1] += "\uFFFD"
1061 elif data is EOF:
1062 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1063 "eof-in-attribute-value-no-quotes"})
1064 self.state = self.dataState
1065 else:
1066 self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1067 frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1068 return True
1069
1070 def afterAttributeValueState(self):
1071 data = self.stream.char()
1072 if data in spaceCharacters:
1073 self.state = self.beforeAttributeNameState
1074 elif data == ">":
1075 self.emitCurrentToken()
1076 elif data == "/":
1077 self.state = self.selfClosingStartTagState
1078 elif data is EOF:
1079 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1080 "unexpected-EOF-after-attribute-value"})
1081 self.stream.unget(data)
1082 self.state = self.dataState
1083 else:
1084 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1085 "unexpected-character-after-attribute-value"})
1086 self.stream.unget(data)
1087 self.state = self.beforeAttributeNameState
1088 return True
1089
1090 def selfClosingStartTagState(self):
1091 data = self.stream.char()
1092 if data == ">":
1093 self.currentToken["selfClosing"] = True
1094 self.emitCurrentToken()
1095 elif data is EOF:
1096 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1097 "data":
1098 "unexpected-EOF-after-solidus-in-tag"})
1099 self.stream.unget(data)
1100 self.state = self.dataState
1101 else:
1102 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1103 "unexpected-character-after-solidus-in-tag"})
1104 self.stream.unget(data)
1105 self.state = self.beforeAttributeNameState
1106 return True
1107
1108 def bogusCommentState(self):
1109 # Make a new comment token and give it as value all the characters
1110 # until the first > or EOF (charsUntil checks for EOF automatically)
1111 # and emit it.
1112 data = self.stream.charsUntil(">")
1113 data = data.replace("\u0000", "\uFFFD")
1114 self.tokenQueue.append(
1115 {"type": tokenTypes["Comment"], "data": data})
1116
1117 # Eat the character directly after the bogus comment which is either a
1118 # ">" or an EOF.
1119 self.stream.char()
1120 self.state = self.dataState
1121 return True
1122
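# Bogus-comment sketch: "<?php echo 1 ?>" arrives here via tagOpenState
# (which ungets the "?") and is emitted as a Comment token with data
# "?php echo 1 ?", the closing ">" being swallowed above.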
1123 def markupDeclarationOpenState(self):
1124 charStack = [self.stream.char()]
1125 if charStack[-1] == "-":
1126 charStack.append(self.stream.char())
1127 if charStack[-1] == "-":
1128 self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
1129 self.state = self.commentStartState
1130 return True
1131 elif charStack[-1] in ('d', 'D'):
1132 matched = True
1133 for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
1134 ('y', 'Y'), ('p', 'P'), ('e', 'E')):
1135 charStack.append(self.stream.char())
1136 if charStack[-1] not in expected:
1137 matched = False
1138 break
1139 if matched:
1140 self.currentToken = {"type": tokenTypes["Doctype"],
1141 "name": "",
1142 "publicId": None, "systemId": None,
1143 "correct": True}
1144 self.state = self.doctypeState
1145 return True
1146 elif (charStack[-1] == "[" and
1147 self.parser is not None and
1148 self.parser.tree.openElements and
1149 self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
1150 matched = True
1151 for expected in ["C", "D", "A", "T", "A", "["]:
1152 charStack.append(self.stream.char())
1153 if charStack[-1] != expected:
1154 matched = False
1155 break
1156 if matched:
1157 self.state = self.cdataSectionState
1158 return True
1159
1160 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1161 "expected-dashes-or-doctype"})
1162
1163 while charStack:
1164 self.stream.unget(charStack.pop())
1165 self.state = self.bogusCommentState
1166 return True
1167
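# "<!" dispatch sketch for the method above:
#
#     "<!--..."       -> comment states
#     "<!DOCTYPE..."  -> doctype states (matched case-insensitively)
#     "<![CDATA[..."  -> cdataSectionState, only inside foreign
#                        (non-HTML-namespace) content
#     anything else   -> bogusCommentState + expected-dashes-or-doctype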
1168 def commentStartState(self):
1169 data = self.stream.char()
1170 if data == "-":
1171 self.state = self.commentStartDashState
1172 elif data == "\u0000":
1173 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1174 "data": "invalid-codepoint"})
1175 self.currentToken["data"] += "\uFFFD"
1176 elif data == ">":
1177 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1178 "incorrect-comment"})
1179 self.tokenQueue.append(self.currentToken)
1180 self.state = self.dataState
1181 elif data is EOF:
1182 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1183 "eof-in-comment"})
1184 self.tokenQueue.append(self.currentToken)
1185 self.state = self.dataState
1186 else:
1187 self.currentToken["data"] += data
1188 self.state = self.commentState
1189 return True
1190
1191 def commentStartDashState(self):
1192 data = self.stream.char()
1193 if data == "-":
1194 self.state = self.commentEndState
1195 elif data == "\u0000":
1196 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1197 "data": "invalid-codepoint"})
1198 self.currentToken["data"] += "-\uFFFD"
1199 elif data == ">":
1200 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1201 "incorrect-comment"})
1202 self.tokenQueue.append(self.currentToken)
1203 self.state = self.dataState
1204 elif data is EOF:
1205 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1206 "eof-in-comment"})
1207 self.tokenQueue.append(self.currentToken)
1208 self.state = self.dataState
1209 else:
1210 self.currentToken["data"] += "-" + data
1211 self.state = self.commentState
1212 return True
1213
1214 def commentState(self):
1215 data = self.stream.char()
1216 if data == "-":
1217 self.state = self.commentEndDashState
1218 elif data == "\u0000":
1219 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1220 "data": "invalid-codepoint"})
1221 self.currentToken["data"] += "\uFFFD"
1222 elif data is EOF:
1223 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1224 "data": "eof-in-comment"})
1225 self.tokenQueue.append(self.currentToken)
1226 self.state = self.dataState
1227 else:
1228 self.currentToken["data"] += data + \
1229 self.stream.charsUntil(("-", "\u0000"))
1230 return True
1231
1232 def commentEndDashState(self):
1233 data = self.stream.char()
1234 if data == "-":
1235 self.state = self.commentEndState
1236 elif data == "\u0000":
1237 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1238 "data": "invalid-codepoint"})
1239 self.currentToken["data"] += "-\uFFFD"
1240 self.state = self.commentState
1241 elif data is EOF:
1242 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1243 "eof-in-comment-end-dash"})
1244 self.tokenQueue.append(self.currentToken)
1245 self.state = self.dataState
1246 else:
1247 self.currentToken["data"] += "-" + data
1248 self.state = self.commentState
1249 return True
1250
1251 def commentEndState(self):
1252 data = self.stream.char()
1253 if data == ">":
1254 self.tokenQueue.append(self.currentToken)
1255 self.state = self.dataState
1256 elif data == "\u0000":
1257 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1258 "data": "invalid-codepoint"})
1259 self.currentToken["data"] += "--\uFFFD"
1260 self.state = self.commentState
1261 elif data == "!":
1262 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1263 "unexpected-bang-after-double-dash-in-comment"})
1264 self.state = self.commentEndBangState
1265 elif data == "-":
1266 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1267 "unexpected-dash-after-double-dash-in-comment"})
1268 self.currentToken["data"] += data
1269 elif data is EOF:
1270 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1271 "eof-in-comment-double-dash"})
1272 self.tokenQueue.append(self.currentToken)
1273 self.state = self.dataState
1274 else:
1275 # XXX
1276 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1277 "unexpected-char-in-comment"})
1278 self.currentToken["data"] += "--" + data
1279 self.state = self.commentState
1280 return True
1281
1282 def commentEndBangState(self):
1283 data = self.stream.char()
1284 if data == ">":
1285 self.tokenQueue.append(self.currentToken)
1286 self.state = self.dataState
1287 elif data == "-":
1288 self.currentToken["data"] += "--!"
1289 self.state = self.commentEndDashState
1290 elif data == "\u0000":
1291 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1292 "data": "invalid-codepoint"})
1293 self.currentToken["data"] += "--!\uFFFD"
1294 self.state = self.commentState
1295 elif data is EOF:
1296 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1297 "eof-in-comment-end-bang-state"})
1298 self.tokenQueue.append(self.currentToken)
1299 self.state = self.dataState
1300 else:
1301 self.currentToken["data"] += "--!" + data
1302 self.state = self.commentState
1303 return True
1304
1305 def doctypeState(self):
1306 data = self.stream.char()
1307 if data in spaceCharacters:
1308 self.state = self.beforeDoctypeNameState
1309 elif data is EOF:
1310 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1311 "expected-doctype-name-but-got-eof"})
1312 self.currentToken["correct"] = False
1313 self.tokenQueue.append(self.currentToken)
1314 self.state = self.dataState
1315 else:
1316 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1317 "need-space-after-doctype"})
1318 self.stream.unget(data)
1319 self.state = self.beforeDoctypeNameState
1320 return True
1321
1322 def beforeDoctypeNameState(self):
1323 data = self.stream.char()
1324 if data in spaceCharacters:
1325 pass
1326 elif data == ">":
1327 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1328 "expected-doctype-name-but-got-right-bracket"})
1329 self.currentToken["correct"] = False
1330 self.tokenQueue.append(self.currentToken)
1331 self.state = self.dataState
1332 elif data == "\u0000":
1333 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1334 "data": "invalid-codepoint"})
1335 self.currentToken["name"] = "\uFFFD"
1336 self.state = self.doctypeNameState
1337 elif data is EOF:
1338 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1339 "expected-doctype-name-but-got-eof"})
1340 self.currentToken["correct"] = False
1341 self.tokenQueue.append(self.currentToken)
1342 self.state = self.dataState
1343 else:
1344 self.currentToken["name"] = data
1345 self.state = self.doctypeNameState
1346 return True
1347
1348 def doctypeNameState(self):
1349 data = self.stream.char()
1350 if data in spaceCharacters:
1351 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1352 self.state = self.afterDoctypeNameState
1353 elif data == ">":
1354 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1355 self.tokenQueue.append(self.currentToken)
1356 self.state = self.dataState
1357 elif data == "\u0000":
1358 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1359 "data": "invalid-codepoint"})
1360 self.currentToken["name"] += "\uFFFD"
1361 self.state = self.doctypeNameState
1362 elif data is EOF:
1363 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1364 "eof-in-doctype-name"})
1365 self.currentToken["correct"] = False
1366 self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1367 self.tokenQueue.append(self.currentToken)
1368 self.state = self.dataState
1369 else:
1370 self.currentToken["name"] += data
1371 return True
1372
1373 def afterDoctypeNameState(self):
1374 data = self.stream.char()
1375 if data in spaceCharacters:
1376 pass
1377 elif data == ">":
1378 self.tokenQueue.append(self.currentToken)
1379 self.state = self.dataState
1380 elif data is EOF:
1381 self.currentToken["correct"] = False
1382 self.stream.unget(data)
1383 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1384 "eof-in-doctype"})
1385 self.tokenQueue.append(self.currentToken)
1386 self.state = self.dataState
1387 else:
1388 if data in ("p", "P"):
1389 matched = True
1390 for expected in (("u", "U"), ("b", "B"), ("l", "L"),
1391 ("i", "I"), ("c", "C")):
1392 data = self.stream.char()
1393 if data not in expected:
1394 matched = False
1395 break
1396 if matched:
1397 self.state = self.afterDoctypePublicKeywordState
1398 return True
1399 elif data in ("s", "S"):
1400 matched = True
1401 for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
1402 ("e", "E"), ("m", "M")):
1403 data = self.stream.char()
1404 if data not in expected:
1405 matched = False
1406 break
1407 if matched:
1408 self.state = self.afterDoctypeSystemKeywordState
1409 return True
1410
1411 # All the characters read before the current 'data' will be
1412 # [a-zA-Z], so they're garbage in the bogus doctype and can be
1413 # discarded; only the latest character might be '>' or EOF
1414 # and needs to be ungetted
1415 self.stream.unget(data)
1416 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1417 "expected-space-or-right-bracket-in-doctype", "datavars":
1418 {"data": data}})
1419 self.currentToken["correct"] = False
1420 self.state = self.bogusDoctypeState
1421
1422 return True
1423
1424 def afterDoctypePublicKeywordState(self):
1425 data = self.stream.char()
1426 if data in spaceCharacters:
1427 self.state = self.beforeDoctypePublicIdentifierState
1428 elif data in ("'", '"'):
1429 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1430 "unexpected-char-in-doctype"})
1431 self.stream.unget(data)
1432 self.state = self.beforeDoctypePublicIdentifierState
1433 elif data is EOF:
1434 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1435 "eof-in-doctype"})
1436 self.currentToken["correct"] = False
1437 self.tokenQueue.append(self.currentToken)
1438 self.state = self.dataState
1439 else:
1440 self.stream.unget(data)
1441 self.state = self.beforeDoctypePublicIdentifierState
1442 return True
1443
1444 def beforeDoctypePublicIdentifierState(self):
1445 data = self.stream.char()
1446 if data in spaceCharacters:
1447 pass
1448 elif data == "\"":
1449 self.currentToken["publicId"] = ""
1450 self.state = self.doctypePublicIdentifierDoubleQuotedState
1451 elif data == "'":
1452 self.currentToken["publicId"] = ""
1453 self.state = self.doctypePublicIdentifierSingleQuotedState
1454 elif data == ">":
1455 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1456 "unexpected-end-of-doctype"})
1457 self.currentToken["correct"] = False
1458 self.tokenQueue.append(self.currentToken)
1459 self.state = self.dataState
1460 elif data is EOF:
1461 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1462 "eof-in-doctype"})
1463 self.currentToken["correct"] = False
1464 self.tokenQueue.append(self.currentToken)
1465 self.state = self.dataState
1466 else:
1467 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1468 "unexpected-char-in-doctype"})
1469 self.currentToken["correct"] = False
1470 self.state = self.bogusDoctypeState
1471 return True
1472
1473 def doctypePublicIdentifierDoubleQuotedState(self):
1474 data = self.stream.char()
1475 if data == "\"":
1476 self.state = self.afterDoctypePublicIdentifierState
1477 elif data == "\u0000":
1478 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1479 "data": "invalid-codepoint"})
1480 self.currentToken["publicId"] += "\uFFFD"
1481 elif data == ">":
1482 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1483 "unexpected-end-of-doctype"})
1484 self.currentToken["correct"] = False
1485 self.tokenQueue.append(self.currentToken)
1486 self.state = self.dataState
1487 elif data is EOF:
1488 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1489 "eof-in-doctype"})
1490 self.currentToken["correct"] = False
1491 self.tokenQueue.append(self.currentToken)
1492 self.state = self.dataState
1493 else:
1494 self.currentToken["publicId"] += data
1495 return True
1496
1497 def doctypePublicIdentifierSingleQuotedState(self):
1498 data = self.stream.char()
1499 if data == "'":
1500 self.state = self.afterDoctypePublicIdentifierState
1501 elif data == "\u0000":
1502 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1503 "data": "invalid-codepoint"})
1504 self.currentToken["publicId"] += "\uFFFD"
1505 elif data == ">":
1506 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1507 "unexpected-end-of-doctype"})
1508 self.currentToken["correct"] = False
1509 self.tokenQueue.append(self.currentToken)
1510 self.state = self.dataState
1511 elif data is EOF:
1512 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1513 "eof-in-doctype"})
1514 self.currentToken["correct"] = False
1515 self.tokenQueue.append(self.currentToken)
1516 self.state = self.dataState
1517 else:
1518 self.currentToken["publicId"] += data
1519 return True
1520
1521 def afterDoctypePublicIdentifierState(self):
1522 data = self.stream.char()
1523 if data in spaceCharacters:
1524 self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1525 elif data == ">":
1526 self.tokenQueue.append(self.currentToken)
1527 self.state = self.dataState
1528 elif data == '"':
1529 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1530 "unexpected-char-in-doctype"})
1531 self.currentToken["systemId"] = ""
1532 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1533 elif data == "'":
1534 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1535 "unexpected-char-in-doctype"})
1536 self.currentToken["systemId"] = ""
1537 self.state = self.doctypeSystemIdentifierSingleQuotedState
1538 elif data is EOF:
1539 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1540 "eof-in-doctype"})
1541 self.currentToken["correct"] = False
1542 self.tokenQueue.append(self.currentToken)
1543 self.state = self.dataState
1544 else:
1545 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1546 "unexpected-char-in-doctype"})
1547 self.currentToken["correct"] = False
1548 self.state = self.bogusDoctypeState
1549 return True
1550
1551 def betweenDoctypePublicAndSystemIdentifiersState(self):
1552 data = self.stream.char()
1553 if data in spaceCharacters:
1554 pass
1555 elif data == ">":
1556 self.tokenQueue.append(self.currentToken)
1557 self.state = self.dataState
1558 elif data == '"':
1559 self.currentToken["systemId"] = ""
1560 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1561 elif data == "'":
1562 self.currentToken["systemId"] = ""
1563 self.state = self.doctypeSystemIdentifierSingleQuotedState
1564 elif data == EOF:
1565 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1566 "eof-in-doctype"})
1567 self.currentToken["correct"] = False
1568 self.tokenQueue.append(self.currentToken)
1569 self.state = self.dataState
1570 else:
1571 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1572 "unexpected-char-in-doctype"})
1573 self.currentToken["correct"] = False
1574 self.state = self.bogusDoctypeState
1575 return True
1576
1577 def afterDoctypeSystemKeywordState(self):
1578 data = self.stream.char()
1579 if data in spaceCharacters:
1580 self.state = self.beforeDoctypeSystemIdentifierState
1581 elif data in ("'", '"'):
1582 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1583 "unexpected-char-in-doctype"})
1584 self.stream.unget(data)
1585 self.state = self.beforeDoctypeSystemIdentifierState
1586 elif data is EOF:
1587 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1588 "eof-in-doctype"})
1589 self.currentToken["correct"] = False
1590 self.tokenQueue.append(self.currentToken)
1591 self.state = self.dataState
1592 else:
1593 self.stream.unget(data)
1594 self.state = self.beforeDoctypeSystemIdentifierState
1595 return True
1596
1597 def beforeDoctypeSystemIdentifierState(self):
1598 data = self.stream.char()
1599 if data in spaceCharacters:
1600 pass
1601 elif data == "\"":
1602 self.currentToken["systemId"] = ""
1603 self.state = self.doctypeSystemIdentifierDoubleQuotedState
1604 elif data == "'":
1605 self.currentToken["systemId"] = ""
1606 self.state = self.doctypeSystemIdentifierSingleQuotedState
1607 elif data == ">":
1608 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1609 "unexpected-char-in-doctype"})
1610 self.currentToken["correct"] = False
1611 self.tokenQueue.append(self.currentToken)
1612 self.state = self.dataState
1613 elif data is EOF:
1614 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1615 "eof-in-doctype"})
1616 self.currentToken["correct"] = False
1617 self.tokenQueue.append(self.currentToken)
1618 self.state = self.dataState
1619 else:
1620 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1621 "unexpected-char-in-doctype"})
1622 self.currentToken["correct"] = False
1623 self.state = self.bogusDoctypeState
1624 return True
1625
1626 def doctypeSystemIdentifierDoubleQuotedState(self):
1627 data = self.stream.char()
1628 if data == "\"":
1629 self.state = self.afterDoctypeSystemIdentifierState
1630 elif data == "\u0000":
1631 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1632 "data": "invalid-codepoint"})
1633 self.currentToken["systemId"] += "\uFFFD"
1634 elif data == ">":
1635 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1636 "unexpected-end-of-doctype"})
1637 self.currentToken["correct"] = False
1638 self.tokenQueue.append(self.currentToken)
1639 self.state = self.dataState
1640 elif data is EOF:
1641 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1642 "eof-in-doctype"})
1643 self.currentToken["correct"] = False
1644 self.tokenQueue.append(self.currentToken)
1645 self.state = self.dataState
1646 else:
1647 self.currentToken["systemId"] += data
1648 return True
1649
1650 def doctypeSystemIdentifierSingleQuotedState(self):
1651 data = self.stream.char()
1652 if data == "'":
1653 self.state = self.afterDoctypeSystemIdentifierState
1654 elif data == "\u0000":
1655 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1656 "data": "invalid-codepoint"})
1657 self.currentToken["systemId"] += "\uFFFD"
1658 elif data == ">":
1659 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1660 "unexpected-end-of-doctype"})
1661 self.currentToken["correct"] = False
1662 self.tokenQueue.append(self.currentToken)
1663 self.state = self.dataState
1664 elif data is EOF:
1665 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1666 "eof-in-doctype"})
1667 self.currentToken["correct"] = False
1668 self.tokenQueue.append(self.currentToken)
1669 self.state = self.dataState
1670 else:
1671 self.currentToken["systemId"] += data
1672 return True
1673
1674 def afterDoctypeSystemIdentifierState(self):
1675 data = self.stream.char()
1676 if data in spaceCharacters:
1677 pass
1678 elif data == ">":
1679 self.tokenQueue.append(self.currentToken)
1680 self.state = self.dataState
1681 elif data is EOF:
1682 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1683 "eof-in-doctype"})
1684 self.currentToken["correct"] = False
1685 self.tokenQueue.append(self.currentToken)
1686 self.state = self.dataState
1687 else:
1688 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1689 "unexpected-char-in-doctype"})
1690 self.state = self.bogusDoctypeState
1691 return True
1692
1693 def bogusDoctypeState(self):
1694 data = self.stream.char()
1695 if data == ">":
1696 self.tokenQueue.append(self.currentToken)
1697 self.state = self.dataState
1698 elif data is EOF:
1699 # XXX EMIT
1700 self.stream.unget(data)
1701 self.tokenQueue.append(self.currentToken)
1702 self.state = self.dataState
1703 else:
1704 pass
1705 return True
1706
1707 def cdataSectionState(self):
1708 data = []
1709 while True:
1710 data.append(self.stream.charsUntil("]"))
1711 data.append(self.stream.charsUntil(">"))
1712 char = self.stream.char()
1713 if char == EOF:
1714 break
1715 else:
1716 assert char == ">"
1717 if data[-1][-2:] == "]]":
1718 data[-1] = data[-1][:-2]
1719 break
1720 else:
1721 data.append(char)
1722
1723 data = "".join(data) # pylint:disable=redefined-variable-type
1724 # Deal with null here rather than in the parser
1725 nullCount = data.count("\u0000")
1726 if nullCount > 0:
1727 for _ in range(nullCount):
1728 self.tokenQueue.append({"type": tokenTypes["ParseError"],
1729 "data": "invalid-codepoint"})
1730 data = data.replace("\u0000", "\uFFFD")
1731 if data:
1732 self.tokenQueue.append({"type": tokenTypes["Characters"],
1733 "data": data})
1734 self.state = self.dataState
1735 return True
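# CDATA sketch: in foreign content "<![CDATA[x]]>" reaches the state above
# and is emitted as a Characters token "x"; any NUL inside the section is
# replaced with U+FFFD, each occurrence adding an invalid-codepoint
# ParseError.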