Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/bleach/_vendor/html5lib/_tokenizer.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 from __future__ import absolute_import, division, unicode_literals | |
2 | |
3 from six import unichr as chr | |
4 | |
5 from collections import deque, OrderedDict | |
6 from sys import version_info | |
7 | |
8 from .constants import spaceCharacters | |
9 from .constants import entities | |
10 from .constants import asciiLetters, asciiUpper2Lower | |
11 from .constants import digits, hexDigits, EOF | |
12 from .constants import tokenTypes, tagTokenTypes | |
13 from .constants import replacementCharacters | |
14 | |
15 from ._inputstream import HTMLInputStream | |
16 | |
17 from ._trie import Trie | |
18 | |
# Precomputed trie over all named character references so consumeEntity can
# do fast longest-prefix matching.
entitiesTrie = Trie(entities)

# Plain dicts preserve insertion order from CPython 3.7 onward; older
# interpreters need OrderedDict to keep attribute order stable.
attributeMap = dict if version_info >= (3, 7) else OrderedDict
27 class HTMLTokenizer(object): | |
28 """ This class takes care of tokenizing HTML. | |
29 | |
30 * self.currentToken | |
31 Holds the token that is currently being processed. | |
32 | |
33 * self.state | |
34 Holds a reference to the method to be invoked... XXX | |
35 | |
36 * self.stream | |
37 Points to HTMLInputStream object. | |
38 """ | |
39 | |
40 def __init__(self, stream, parser=None, **kwargs): | |
41 | |
42 self.stream = HTMLInputStream(stream, **kwargs) | |
43 self.parser = parser | |
44 | |
45 # Setup the initial tokenizer state | |
46 self.escapeFlag = False | |
47 self.lastFourChars = [] | |
48 self.state = self.dataState | |
49 self.escape = False | |
50 | |
51 # The current token being created | |
52 self.currentToken = None | |
53 super(HTMLTokenizer, self).__init__() | |
54 | |
    def __iter__(self):
        """ This is where the magic happens.

        We do our usual processing through the states and when we have a token
        to return we yield the token which pauses processing until the next token
        is requested.
        """
        # Fresh queue for this iteration; state methods append tokens to it.
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            # Surface input-stream errors first, in the order recorded.
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            # Then drain whatever tokens the state method produced.
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
70 | |
    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.

        :arg isHex: True when the reference was introduced by "&#x"/"&#X"
        :returns: the decoded character (a one- or two-code-unit string)
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        # NOTE(review): the caller guarantees at least one digit was seen, so
        # int() cannot get an empty string here.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            # Surrogate code points and out-of-range values become U+FFFD.
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Should speed up this check somehow (e.g. move the set to a constant)
            # Control characters and Unicode non-characters are a parse error
            # but are still emitted as-is.
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                # Narrow build: assemble the surrogate pair by hand.
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
142 | |
    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume a character reference that starts at the current position.

        :arg allowedChar: extra character after "&" that suppresses entity
            processing (used for attribute values, e.g. the quote character)
        :arg fromAttribute: when True, append the result to the current
            attribute value instead of emitting a Characters token
        """
        # Initialise to the default output for when no entity is matched
        output = "&"

        charStack = [self.stream.char()]
        # A bare "&" (followed by whitespace, "<", "&", EOF or the allowed
        # character) is not an entity at all; put the character back.
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
                (allowedChar is not None and allowedChar == charStack[0])):
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
            # Read the next character to see if it's hex or decimal
            # (note: `hex` shadows the builtin; kept for byte-compatibility)
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found, so consume the whole number
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits found
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        else:
            # At this point in the process might have named entity. Entities
            # are stored in the global variable "entities".
            #
            # Consume characters and compare to these to a substring of the
            # entity names in the list until the substring no longer matches.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity
            # Try to find the longest entity the string will match to take care
            # of &noti for instance.
            # NOTE: entityLength is only bound on success; it is never read on
            # the KeyError path below.
            try:
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "named-entity-without-semicolon"})
                # In attributes, an unterminated entity followed by an
                # alphanumeric or "=" is treated as literal text (HTML spec).
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    self.stream.unget(charStack.pop())
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            # Append to the value of the attribute currently being built.
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
222 | |
223 def processEntityInAttribute(self, allowedChar): | |
224 """This method replaces the need for "entityInAttributeValueState". | |
225 """ | |
226 self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) | |
227 | |
    def emitCurrentToken(self):
        """This method is a generic handler for emitting the tags. It also sets
        the state to "data" because that's what's needed after a token has been
        emitted.
        """
        token = self.currentToken
        # Add token to the queue to be yielded
        if (token["type"] in tagTokenTypes):
            # Tag names are ASCII case-insensitive; normalise to lowercase.
            token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == tokenTypes["StartTag"]:
                # Attributes were accumulated as [name, value] pairs; convert
                # to a mapping.  attributeMap keeps the *last* duplicate, so
                # when duplicates were present re-update with the reversed
                # list to make the first occurrence win (as the spec requires).
                raw = token["data"]
                data = attributeMap(raw)
                if len(raw) > len(data):
                    # we had some duplicated attribute, fix so first wins
                    data.update(raw[::-1])
                token["data"] = data

            if token["type"] == tokenTypes["EndTag"]:
                # End tags may carry neither attributes nor a self-closing
                # flag; both are parse errors.
                if token["data"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "attributes-in-end-tag"})
                if token["selfClosing"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "self-closing-flag-on-end-tag"})
        self.tokenQueue.append(token)
        self.state = self.dataState
254 | |
255 # Below are the various tokenizer states worked out. | |
256 def dataState(self): | |
257 data = self.stream.char() | |
258 if data == "&": | |
259 self.state = self.entityDataState | |
260 elif data == "<": | |
261 self.state = self.tagOpenState | |
262 elif data == "\u0000": | |
263 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
264 "data": "invalid-codepoint"}) | |
265 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
266 "data": "\u0000"}) | |
267 elif data is EOF: | |
268 # Tokenization ends. | |
269 return False | |
270 elif data in spaceCharacters: | |
271 # Directly after emitting a token you switch back to the "data | |
272 # state". At that point spaceCharacters are important so they are | |
273 # emitted separately. | |
274 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": | |
275 data + self.stream.charsUntil(spaceCharacters, True)}) | |
276 # No need to update lastFourChars here, since the first space will | |
277 # have already been appended to lastFourChars and will have broken | |
278 # any <!-- or --> sequences | |
279 else: | |
280 chars = self.stream.charsUntil(("&", "<", "\u0000")) | |
281 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | |
282 data + chars}) | |
283 return True | |
284 | |
285 def entityDataState(self): | |
286 self.consumeEntity() | |
287 self.state = self.dataState | |
288 return True | |
289 | |
290 def rcdataState(self): | |
291 data = self.stream.char() | |
292 if data == "&": | |
293 self.state = self.characterReferenceInRcdata | |
294 elif data == "<": | |
295 self.state = self.rcdataLessThanSignState | |
296 elif data == EOF: | |
297 # Tokenization ends. | |
298 return False | |
299 elif data == "\u0000": | |
300 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
301 "data": "invalid-codepoint"}) | |
302 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
303 "data": "\uFFFD"}) | |
304 elif data in spaceCharacters: | |
305 # Directly after emitting a token you switch back to the "data | |
306 # state". At that point spaceCharacters are important so they are | |
307 # emitted separately. | |
308 self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": | |
309 data + self.stream.charsUntil(spaceCharacters, True)}) | |
310 # No need to update lastFourChars here, since the first space will | |
311 # have already been appended to lastFourChars and will have broken | |
312 # any <!-- or --> sequences | |
313 else: | |
314 chars = self.stream.charsUntil(("&", "<", "\u0000")) | |
315 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | |
316 data + chars}) | |
317 return True | |
318 | |
319 def characterReferenceInRcdata(self): | |
320 self.consumeEntity() | |
321 self.state = self.rcdataState | |
322 return True | |
323 | |
324 def rawtextState(self): | |
325 data = self.stream.char() | |
326 if data == "<": | |
327 self.state = self.rawtextLessThanSignState | |
328 elif data == "\u0000": | |
329 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
330 "data": "invalid-codepoint"}) | |
331 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
332 "data": "\uFFFD"}) | |
333 elif data == EOF: | |
334 # Tokenization ends. | |
335 return False | |
336 else: | |
337 chars = self.stream.charsUntil(("<", "\u0000")) | |
338 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | |
339 data + chars}) | |
340 return True | |
341 | |
342 def scriptDataState(self): | |
343 data = self.stream.char() | |
344 if data == "<": | |
345 self.state = self.scriptDataLessThanSignState | |
346 elif data == "\u0000": | |
347 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
348 "data": "invalid-codepoint"}) | |
349 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
350 "data": "\uFFFD"}) | |
351 elif data == EOF: | |
352 # Tokenization ends. | |
353 return False | |
354 else: | |
355 chars = self.stream.charsUntil(("<", "\u0000")) | |
356 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | |
357 data + chars}) | |
358 return True | |
359 | |
360 def plaintextState(self): | |
361 data = self.stream.char() | |
362 if data == EOF: | |
363 # Tokenization ends. | |
364 return False | |
365 elif data == "\u0000": | |
366 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
367 "data": "invalid-codepoint"}) | |
368 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
369 "data": "\uFFFD"}) | |
370 else: | |
371 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | |
372 data + self.stream.charsUntil("\u0000")}) | |
373 return True | |
374 | |
375 def tagOpenState(self): | |
376 data = self.stream.char() | |
377 if data == "!": | |
378 self.state = self.markupDeclarationOpenState | |
379 elif data == "/": | |
380 self.state = self.closeTagOpenState | |
381 elif data in asciiLetters: | |
382 self.currentToken = {"type": tokenTypes["StartTag"], | |
383 "name": data, "data": [], | |
384 "selfClosing": False, | |
385 "selfClosingAcknowledged": False} | |
386 self.state = self.tagNameState | |
387 elif data == ">": | |
388 # XXX In theory it could be something besides a tag name. But | |
389 # do we really care? | |
390 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
391 "expected-tag-name-but-got-right-bracket"}) | |
392 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) | |
393 self.state = self.dataState | |
394 elif data == "?": | |
395 # XXX In theory it could be something besides a tag name. But | |
396 # do we really care? | |
397 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
398 "expected-tag-name-but-got-question-mark"}) | |
399 self.stream.unget(data) | |
400 self.state = self.bogusCommentState | |
401 else: | |
402 # XXX | |
403 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
404 "expected-tag-name"}) | |
405 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | |
406 self.stream.unget(data) | |
407 self.state = self.dataState | |
408 return True | |
409 | |
410 def closeTagOpenState(self): | |
411 data = self.stream.char() | |
412 if data in asciiLetters: | |
413 self.currentToken = {"type": tokenTypes["EndTag"], "name": data, | |
414 "data": [], "selfClosing": False} | |
415 self.state = self.tagNameState | |
416 elif data == ">": | |
417 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
418 "expected-closing-tag-but-got-right-bracket"}) | |
419 self.state = self.dataState | |
420 elif data is EOF: | |
421 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
422 "expected-closing-tag-but-got-eof"}) | |
423 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | |
424 self.state = self.dataState | |
425 else: | |
426 # XXX data can be _'_... | |
427 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
428 "expected-closing-tag-but-got-char", | |
429 "datavars": {"data": data}}) | |
430 self.stream.unget(data) | |
431 self.state = self.bogusCommentState | |
432 return True | |
433 | |
434 def tagNameState(self): | |
435 data = self.stream.char() | |
436 if data in spaceCharacters: | |
437 self.state = self.beforeAttributeNameState | |
438 elif data == ">": | |
439 self.emitCurrentToken() | |
440 elif data is EOF: | |
441 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
442 "eof-in-tag-name"}) | |
443 self.state = self.dataState | |
444 elif data == "/": | |
445 self.state = self.selfClosingStartTagState | |
446 elif data == "\u0000": | |
447 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
448 "data": "invalid-codepoint"}) | |
449 self.currentToken["name"] += "\uFFFD" | |
450 else: | |
451 self.currentToken["name"] += data | |
452 # (Don't use charsUntil here, because tag names are | |
453 # very short and it's faster to not do anything fancy) | |
454 return True | |
455 | |
456 def rcdataLessThanSignState(self): | |
457 data = self.stream.char() | |
458 if data == "/": | |
459 self.temporaryBuffer = "" | |
460 self.state = self.rcdataEndTagOpenState | |
461 else: | |
462 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | |
463 self.stream.unget(data) | |
464 self.state = self.rcdataState | |
465 return True | |
466 | |
467 def rcdataEndTagOpenState(self): | |
468 data = self.stream.char() | |
469 if data in asciiLetters: | |
470 self.temporaryBuffer += data | |
471 self.state = self.rcdataEndTagNameState | |
472 else: | |
473 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | |
474 self.stream.unget(data) | |
475 self.state = self.rcdataState | |
476 return True | |
477 | |
478 def rcdataEndTagNameState(self): | |
479 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | |
480 data = self.stream.char() | |
481 if data in spaceCharacters and appropriate: | |
482 self.currentToken = {"type": tokenTypes["EndTag"], | |
483 "name": self.temporaryBuffer, | |
484 "data": [], "selfClosing": False} | |
485 self.state = self.beforeAttributeNameState | |
486 elif data == "/" and appropriate: | |
487 self.currentToken = {"type": tokenTypes["EndTag"], | |
488 "name": self.temporaryBuffer, | |
489 "data": [], "selfClosing": False} | |
490 self.state = self.selfClosingStartTagState | |
491 elif data == ">" and appropriate: | |
492 self.currentToken = {"type": tokenTypes["EndTag"], | |
493 "name": self.temporaryBuffer, | |
494 "data": [], "selfClosing": False} | |
495 self.emitCurrentToken() | |
496 self.state = self.dataState | |
497 elif data in asciiLetters: | |
498 self.temporaryBuffer += data | |
499 else: | |
500 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
501 "data": "</" + self.temporaryBuffer}) | |
502 self.stream.unget(data) | |
503 self.state = self.rcdataState | |
504 return True | |
505 | |
506 def rawtextLessThanSignState(self): | |
507 data = self.stream.char() | |
508 if data == "/": | |
509 self.temporaryBuffer = "" | |
510 self.state = self.rawtextEndTagOpenState | |
511 else: | |
512 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | |
513 self.stream.unget(data) | |
514 self.state = self.rawtextState | |
515 return True | |
516 | |
517 def rawtextEndTagOpenState(self): | |
518 data = self.stream.char() | |
519 if data in asciiLetters: | |
520 self.temporaryBuffer += data | |
521 self.state = self.rawtextEndTagNameState | |
522 else: | |
523 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | |
524 self.stream.unget(data) | |
525 self.state = self.rawtextState | |
526 return True | |
527 | |
528 def rawtextEndTagNameState(self): | |
529 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | |
530 data = self.stream.char() | |
531 if data in spaceCharacters and appropriate: | |
532 self.currentToken = {"type": tokenTypes["EndTag"], | |
533 "name": self.temporaryBuffer, | |
534 "data": [], "selfClosing": False} | |
535 self.state = self.beforeAttributeNameState | |
536 elif data == "/" and appropriate: | |
537 self.currentToken = {"type": tokenTypes["EndTag"], | |
538 "name": self.temporaryBuffer, | |
539 "data": [], "selfClosing": False} | |
540 self.state = self.selfClosingStartTagState | |
541 elif data == ">" and appropriate: | |
542 self.currentToken = {"type": tokenTypes["EndTag"], | |
543 "name": self.temporaryBuffer, | |
544 "data": [], "selfClosing": False} | |
545 self.emitCurrentToken() | |
546 self.state = self.dataState | |
547 elif data in asciiLetters: | |
548 self.temporaryBuffer += data | |
549 else: | |
550 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
551 "data": "</" + self.temporaryBuffer}) | |
552 self.stream.unget(data) | |
553 self.state = self.rawtextState | |
554 return True | |
555 | |
556 def scriptDataLessThanSignState(self): | |
557 data = self.stream.char() | |
558 if data == "/": | |
559 self.temporaryBuffer = "" | |
560 self.state = self.scriptDataEndTagOpenState | |
561 elif data == "!": | |
562 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"}) | |
563 self.state = self.scriptDataEscapeStartState | |
564 else: | |
565 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | |
566 self.stream.unget(data) | |
567 self.state = self.scriptDataState | |
568 return True | |
569 | |
570 def scriptDataEndTagOpenState(self): | |
571 data = self.stream.char() | |
572 if data in asciiLetters: | |
573 self.temporaryBuffer += data | |
574 self.state = self.scriptDataEndTagNameState | |
575 else: | |
576 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | |
577 self.stream.unget(data) | |
578 self.state = self.scriptDataState | |
579 return True | |
580 | |
581 def scriptDataEndTagNameState(self): | |
582 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | |
583 data = self.stream.char() | |
584 if data in spaceCharacters and appropriate: | |
585 self.currentToken = {"type": tokenTypes["EndTag"], | |
586 "name": self.temporaryBuffer, | |
587 "data": [], "selfClosing": False} | |
588 self.state = self.beforeAttributeNameState | |
589 elif data == "/" and appropriate: | |
590 self.currentToken = {"type": tokenTypes["EndTag"], | |
591 "name": self.temporaryBuffer, | |
592 "data": [], "selfClosing": False} | |
593 self.state = self.selfClosingStartTagState | |
594 elif data == ">" and appropriate: | |
595 self.currentToken = {"type": tokenTypes["EndTag"], | |
596 "name": self.temporaryBuffer, | |
597 "data": [], "selfClosing": False} | |
598 self.emitCurrentToken() | |
599 self.state = self.dataState | |
600 elif data in asciiLetters: | |
601 self.temporaryBuffer += data | |
602 else: | |
603 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
604 "data": "</" + self.temporaryBuffer}) | |
605 self.stream.unget(data) | |
606 self.state = self.scriptDataState | |
607 return True | |
608 | |
609 def scriptDataEscapeStartState(self): | |
610 data = self.stream.char() | |
611 if data == "-": | |
612 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | |
613 self.state = self.scriptDataEscapeStartDashState | |
614 else: | |
615 self.stream.unget(data) | |
616 self.state = self.scriptDataState | |
617 return True | |
618 | |
619 def scriptDataEscapeStartDashState(self): | |
620 data = self.stream.char() | |
621 if data == "-": | |
622 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | |
623 self.state = self.scriptDataEscapedDashDashState | |
624 else: | |
625 self.stream.unget(data) | |
626 self.state = self.scriptDataState | |
627 return True | |
628 | |
629 def scriptDataEscapedState(self): | |
630 data = self.stream.char() | |
631 if data == "-": | |
632 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | |
633 self.state = self.scriptDataEscapedDashState | |
634 elif data == "<": | |
635 self.state = self.scriptDataEscapedLessThanSignState | |
636 elif data == "\u0000": | |
637 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
638 "data": "invalid-codepoint"}) | |
639 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
640 "data": "\uFFFD"}) | |
641 elif data == EOF: | |
642 self.state = self.dataState | |
643 else: | |
644 chars = self.stream.charsUntil(("<", "-", "\u0000")) | |
645 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | |
646 data + chars}) | |
647 return True | |
648 | |
649 def scriptDataEscapedDashState(self): | |
650 data = self.stream.char() | |
651 if data == "-": | |
652 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | |
653 self.state = self.scriptDataEscapedDashDashState | |
654 elif data == "<": | |
655 self.state = self.scriptDataEscapedLessThanSignState | |
656 elif data == "\u0000": | |
657 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
658 "data": "invalid-codepoint"}) | |
659 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
660 "data": "\uFFFD"}) | |
661 self.state = self.scriptDataEscapedState | |
662 elif data == EOF: | |
663 self.state = self.dataState | |
664 else: | |
665 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | |
666 self.state = self.scriptDataEscapedState | |
667 return True | |
668 | |
669 def scriptDataEscapedDashDashState(self): | |
670 data = self.stream.char() | |
671 if data == "-": | |
672 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | |
673 elif data == "<": | |
674 self.state = self.scriptDataEscapedLessThanSignState | |
675 elif data == ">": | |
676 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) | |
677 self.state = self.scriptDataState | |
678 elif data == "\u0000": | |
679 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
680 "data": "invalid-codepoint"}) | |
681 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
682 "data": "\uFFFD"}) | |
683 self.state = self.scriptDataEscapedState | |
684 elif data == EOF: | |
685 self.state = self.dataState | |
686 else: | |
687 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | |
688 self.state = self.scriptDataEscapedState | |
689 return True | |
690 | |
691 def scriptDataEscapedLessThanSignState(self): | |
692 data = self.stream.char() | |
693 if data == "/": | |
694 self.temporaryBuffer = "" | |
695 self.state = self.scriptDataEscapedEndTagOpenState | |
696 elif data in asciiLetters: | |
697 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) | |
698 self.temporaryBuffer = data | |
699 self.state = self.scriptDataDoubleEscapeStartState | |
700 else: | |
701 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | |
702 self.stream.unget(data) | |
703 self.state = self.scriptDataEscapedState | |
704 return True | |
705 | |
706 def scriptDataEscapedEndTagOpenState(self): | |
707 data = self.stream.char() | |
708 if data in asciiLetters: | |
709 self.temporaryBuffer = data | |
710 self.state = self.scriptDataEscapedEndTagNameState | |
711 else: | |
712 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | |
713 self.stream.unget(data) | |
714 self.state = self.scriptDataEscapedState | |
715 return True | |
716 | |
717 def scriptDataEscapedEndTagNameState(self): | |
718 appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | |
719 data = self.stream.char() | |
720 if data in spaceCharacters and appropriate: | |
721 self.currentToken = {"type": tokenTypes["EndTag"], | |
722 "name": self.temporaryBuffer, | |
723 "data": [], "selfClosing": False} | |
724 self.state = self.beforeAttributeNameState | |
725 elif data == "/" and appropriate: | |
726 self.currentToken = {"type": tokenTypes["EndTag"], | |
727 "name": self.temporaryBuffer, | |
728 "data": [], "selfClosing": False} | |
729 self.state = self.selfClosingStartTagState | |
730 elif data == ">" and appropriate: | |
731 self.currentToken = {"type": tokenTypes["EndTag"], | |
732 "name": self.temporaryBuffer, | |
733 "data": [], "selfClosing": False} | |
734 self.emitCurrentToken() | |
735 self.state = self.dataState | |
736 elif data in asciiLetters: | |
737 self.temporaryBuffer += data | |
738 else: | |
739 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
740 "data": "</" + self.temporaryBuffer}) | |
741 self.stream.unget(data) | |
742 self.state = self.scriptDataEscapedState | |
743 return True | |
744 | |
745 def scriptDataDoubleEscapeStartState(self): | |
746 data = self.stream.char() | |
747 if data in (spaceCharacters | frozenset(("/", ">"))): | |
748 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | |
749 if self.temporaryBuffer.lower() == "script": | |
750 self.state = self.scriptDataDoubleEscapedState | |
751 else: | |
752 self.state = self.scriptDataEscapedState | |
753 elif data in asciiLetters: | |
754 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | |
755 self.temporaryBuffer += data | |
756 else: | |
757 self.stream.unget(data) | |
758 self.state = self.scriptDataEscapedState | |
759 return True | |
760 | |
761 def scriptDataDoubleEscapedState(self): | |
762 data = self.stream.char() | |
763 if data == "-": | |
764 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | |
765 self.state = self.scriptDataDoubleEscapedDashState | |
766 elif data == "<": | |
767 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | |
768 self.state = self.scriptDataDoubleEscapedLessThanSignState | |
769 elif data == "\u0000": | |
770 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
771 "data": "invalid-codepoint"}) | |
772 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
773 "data": "\uFFFD"}) | |
774 elif data == EOF: | |
775 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
776 "eof-in-script-in-script"}) | |
777 self.state = self.dataState | |
778 else: | |
779 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | |
780 return True | |
781 | |
782 def scriptDataDoubleEscapedDashState(self): | |
783 data = self.stream.char() | |
784 if data == "-": | |
785 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | |
786 self.state = self.scriptDataDoubleEscapedDashDashState | |
787 elif data == "<": | |
788 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | |
789 self.state = self.scriptDataDoubleEscapedLessThanSignState | |
790 elif data == "\u0000": | |
791 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
792 "data": "invalid-codepoint"}) | |
793 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
794 "data": "\uFFFD"}) | |
795 self.state = self.scriptDataDoubleEscapedState | |
796 elif data == EOF: | |
797 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
798 "eof-in-script-in-script"}) | |
799 self.state = self.dataState | |
800 else: | |
801 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | |
802 self.state = self.scriptDataDoubleEscapedState | |
803 return True | |
804 | |
805 def scriptDataDoubleEscapedDashDashState(self): | |
806 data = self.stream.char() | |
807 if data == "-": | |
808 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | |
809 elif data == "<": | |
810 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | |
811 self.state = self.scriptDataDoubleEscapedLessThanSignState | |
812 elif data == ">": | |
813 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) | |
814 self.state = self.scriptDataState | |
815 elif data == "\u0000": | |
816 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
817 "data": "invalid-codepoint"}) | |
818 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
819 "data": "\uFFFD"}) | |
820 self.state = self.scriptDataDoubleEscapedState | |
821 elif data == EOF: | |
822 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
823 "eof-in-script-in-script"}) | |
824 self.state = self.dataState | |
825 else: | |
826 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | |
827 self.state = self.scriptDataDoubleEscapedState | |
828 return True | |
829 | |
830 def scriptDataDoubleEscapedLessThanSignState(self): | |
831 data = self.stream.char() | |
832 if data == "/": | |
833 self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) | |
834 self.temporaryBuffer = "" | |
835 self.state = self.scriptDataDoubleEscapeEndState | |
836 else: | |
837 self.stream.unget(data) | |
838 self.state = self.scriptDataDoubleEscapedState | |
839 return True | |
840 | |
    def scriptDataDoubleEscapeEndState(self):
        """Decide whether "</..." terminates double-escaped script data.

        Letters accumulate in the temporary buffer; on a delimiter the
        buffer is compared case-insensitively against "script" to choose
        the next state.  All consumed characters are re-emitted.
        """
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                # "</script" seen: leave double-escaped mode.
                self.state = self.scriptDataEscapedState
            else:
                self.state = self.scriptDataDoubleEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True
856 | |
    def beforeAttributeNameState(self):
        """Skip whitespace inside a tag and start a new attribute.

        New attributes are appended to ``self.currentToken["data"]`` as
        ``[name, value]`` pairs.  ">" emits the tag; "/" may start a
        self-closing tag; quotes, "=" and "<" are parse errors but still
        begin an attribute name.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            # Consume the whole run of whitespace in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            # Parse error, but the character still starts an attribute name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
886 | |
    def attributeNameState(self):
        """Accumulate the name of the current attribute.

        ``leavingThisState`` tracks whether this call transitions away from
        the state; only then is the finished name lower-cased and checked
        against earlier attributes for duplicates.  ``emitToken`` defers
        token emission until after that post-processing (see the XXX note
        below).
        """
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            # Bulk-consume the rest of the letter run in one call.
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            # Parse error, but the character is still part of the name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            # Report (but keep) attributes whose name repeats an earlier one.
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True
940 | |
    def afterAttributeNameState(self):
        """Handle the character after an attribute name and its whitespace.

        "=" introduces a value; any name-start character begins a brand-new
        attribute; ">" emits the tag; "/" may self-close it.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            # Consume the whole run of whitespace in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            # Parse error, but the character still starts a new attribute.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
972 | |
    def beforeAttributeValueState(self):
        """Dispatch on the first character of an attribute value.

        Quotes select the corresponding quoted-value state; most other
        characters start an unquoted value.  "&" is ungot so the unquoted
        state can run the entity machinery on it.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            # Consume the whole run of whitespace in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            # Let the unquoted-value state re-read the "&" and handle the
            # character reference itself.
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            # Parse error, but the character still joins the unquoted value.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True
1006 | |
    def attributeValueDoubleQuotedState(self):
        """Accumulate a double-quoted attribute value until the closing '"'.

        "&" starts a character reference (with '"' as the allowed extra
        character); U+0000 is a parse error replaced by U+FFFD.
        """
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute('"')
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-double-quote"})
            self.state = self.dataState
        else:
            # Bulk-consume ordinary value characters in one call.
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("\"", "&", "\u0000"))
        return True
1025 | |
1026 def attributeValueSingleQuotedState(self): | |
1027 data = self.stream.char() | |
1028 if data == "'": | |
1029 self.state = self.afterAttributeValueState | |
1030 elif data == "&": | |
1031 self.processEntityInAttribute("'") | |
1032 elif data == "\u0000": | |
1033 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
1034 "data": "invalid-codepoint"}) | |
1035 self.currentToken["data"][-1][1] += "\uFFFD" | |
1036 elif data is EOF: | |
1037 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1038 "eof-in-attribute-value-single-quote"}) | |
1039 self.state = self.dataState | |
1040 else: | |
1041 self.currentToken["data"][-1][1] += data +\ | |
1042 self.stream.charsUntil(("'", "&", "\u0000")) | |
1043 return True | |
1044 | |
    def attributeValueUnQuotedState(self):
        """Accumulate an unquoted attribute value.

        Whitespace ends the value; ">" emits the tag; "&" starts a
        character reference (with ">" as the allowed extra character).
        Quotes, "=", "<" and "`" are parse errors but still join the value.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "&":
            self.processEntityInAttribute(">")
        elif data == ">":
            self.emitCurrentToken()
        elif data in ('"', "'", "=", "<", "`"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-no-quotes"})
            self.state = self.dataState
        else:
            # Bulk-consume until any character that needs special handling.
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
        return True
1069 | |
    def afterAttributeValueState(self):
        """Handle the character immediately after a quoted attribute value.

        Only whitespace, ">" or "/" may legally follow; anything else is a
        parse error and is re-processed before the next attribute name.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-EOF-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            # Re-process the offending character as a new attribute name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True
1089 | |
1090 def selfClosingStartTagState(self): | |
1091 data = self.stream.char() | |
1092 if data == ">": | |
1093 self.currentToken["selfClosing"] = True | |
1094 self.emitCurrentToken() | |
1095 elif data is EOF: | |
1096 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
1097 "data": | |
1098 "unexpected-EOF-after-solidus-in-tag"}) | |
1099 self.stream.unget(data) | |
1100 self.state = self.dataState | |
1101 else: | |
1102 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1103 "unexpected-character-after-solidus-in-tag"}) | |
1104 self.stream.unget(data) | |
1105 self.state = self.beforeAttributeNameState | |
1106 return True | |
1107 | |
1108 def bogusCommentState(self): | |
1109 # Make a new comment token and give it as value all the characters | |
1110 # until the first > or EOF (charsUntil checks for EOF automatically) | |
1111 # and emit it. | |
1112 data = self.stream.charsUntil(">") | |
1113 data = data.replace("\u0000", "\uFFFD") | |
1114 self.tokenQueue.append( | |
1115 {"type": tokenTypes["Comment"], "data": data}) | |
1116 | |
1117 # Eat the character directly after the bogus comment which is either a | |
1118 # ">" or an EOF. | |
1119 self.stream.char() | |
1120 self.state = self.dataState | |
1121 return True | |
1122 | |
    def markupDeclarationOpenState(self):
        """Dispatch "<!" to a comment, DOCTYPE, CDATA section, or bogus comment.

        Characters are read into ``charStack`` so that, on a failed match,
        every consumed character can be ungot and re-parsed as a bogus
        comment.  "[CDATA[" is only honoured in foreign (non-HTML
        namespace) content.
        """
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                # "<!--": start a comment token.
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            # Case-insensitive match of the remaining "OCTYPE" letters.
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            # "[CDATA[" — case-sensitive, and only in foreign content.
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        # Push back everything we consumed and re-parse it as a bogus comment.
        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
1167 | |
    def commentStartState(self):
        """Handle the first character after "<!--".

        ">" here means an empty, incorrectly closed comment ("<!-->"),
        which is a parse error but still emits the comment token.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True
1190 | |
    def commentStartDashState(self):
        """Handle the character after "<!---" (one dash into the comment).

        A second "-" goes to the comment-end state; otherwise the pending
        "-" is flushed into the comment data before continuing.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            # The pending "-" plus the U+FFFD replacement.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
1213 | |
    def commentState(self):
        """Accumulate comment text until a "-" (possible end) or EOF.

        Ordinary characters are bulk-consumed with charsUntil; U+0000 is a
        parse error replaced by U+FFFD.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Bulk-consume up to the next character that needs handling.
            self.currentToken["data"] += data + \
                self.stream.charsUntil(("-", "\u0000"))
        return True
1231 | |
    def commentEndDashState(self):
        """Handle the character after a single "-" inside comment text.

        A second "-" may end the comment; otherwise the pending "-" is
        flushed into the data and normal comment parsing resumes.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
1250 | |
    def commentEndState(self):
        """Handle the character after "--" inside a comment.

        ">" properly closes the comment.  Extra "-", "!", or other
        characters are parse errors; the pending "--" is flushed back into
        the comment data where required.
        """
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            # Keep consuming dashes; only ">" can end the comment from here.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True
1281 | |
    def commentEndBangState(self):
        """Handle the character after "--!" inside a comment.

        ">" closes the comment; anything else flushes the pending "--!"
        into the comment data and resumes normal comment parsing.
        """
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True
1304 | |
1305 def doctypeState(self): | |
1306 data = self.stream.char() | |
1307 if data in spaceCharacters: | |
1308 self.state = self.beforeDoctypeNameState | |
1309 elif data is EOF: | |
1310 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1311 "expected-doctype-name-but-got-eof"}) | |
1312 self.currentToken["correct"] = False | |
1313 self.tokenQueue.append(self.currentToken) | |
1314 self.state = self.dataState | |
1315 else: | |
1316 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1317 "need-space-after-doctype"}) | |
1318 self.stream.unget(data) | |
1319 self.state = self.beforeDoctypeNameState | |
1320 return True | |
1321 | |
    def beforeDoctypeNameState(self):
        """Skip whitespace before the doctype name and start collecting it.

        ">" or EOF here mean a missing name: the doctype is flagged
        incorrect and emitted anyway.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True
1347 | |
    def doctypeNameState(self):
        """Accumulate the doctype name, lower-casing it when it completes.

        The name is normalised with asciiUpper2Lower on every exit path
        (whitespace, ">", or EOF).
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True
1372 | |
    def afterDoctypeNameState(self):
        """After the doctype name: look for "PUBLIC" or "SYSTEM" keywords.

        Both keywords are matched case-insensitively by reading ahead one
        character at a time (``data`` is rebound inside the loops).  On a
        failed match only the last character needs to be ungot; the
        preceding letters are garbage in what becomes a bogus doctype.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                # Try to match the rest of "PUBLIC" case-insensitively.
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                # Try to match the rest of "SYSTEM" case-insensitively.
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True
1423 | |
    def afterDoctypePublicKeywordState(self):
        """Handle the character right after the "PUBLIC" keyword.

        A quote here (missing the required space) is a parse error; in all
        non-EOF cases control ends up in the before-public-identifier
        state, with non-space characters ungot for re-processing.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypePublicIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        return True
1443 | |
    def beforeDoctypePublicIdentifierState(self):
        """Skip whitespace and open the quoted doctype public identifier.

        Only '"' or "'" may start the identifier; ">", EOF, or any other
        character makes the doctype incorrect (bogus from here on).
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
1472 | |
    def doctypePublicIdentifierDoubleQuotedState(self):
        """Accumulate a double-quoted doctype public identifier.

        ">" or EOF before the closing quote makes the doctype incorrect;
        U+0000 is a parse error replaced by U+FFFD.
        """
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True
1496 | |
    def doctypePublicIdentifierSingleQuotedState(self):
        """Accumulate a single-quoted doctype public identifier.

        Mirrors the double-quoted variant with "'" as the terminator.
        """
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True
1520 | |
    def afterDoctypePublicIdentifierState(self):
        """Handle the character after the doctype public identifier.

        A quote here (with no separating space) is a parse error but still
        opens the system identifier; ">" emits the doctype as-is.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
1550 | |
1551 def betweenDoctypePublicAndSystemIdentifiersState(self): | |
1552 data = self.stream.char() | |
1553 if data in spaceCharacters: | |
1554 pass | |
1555 elif data == ">": | |
1556 self.tokenQueue.append(self.currentToken) | |
1557 self.state = self.dataState | |
1558 elif data == '"': | |
1559 self.currentToken["systemId"] = "" | |
1560 self.state = self.doctypeSystemIdentifierDoubleQuotedState | |
1561 elif data == "'": | |
1562 self.currentToken["systemId"] = "" | |
1563 self.state = self.doctypeSystemIdentifierSingleQuotedState | |
1564 elif data == EOF: | |
1565 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1566 "eof-in-doctype"}) | |
1567 self.currentToken["correct"] = False | |
1568 self.tokenQueue.append(self.currentToken) | |
1569 self.state = self.dataState | |
1570 else: | |
1571 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1572 "unexpected-char-in-doctype"}) | |
1573 self.currentToken["correct"] = False | |
1574 self.state = self.bogusDoctypeState | |
1575 return True | |
1576 | |
    def afterDoctypeSystemKeywordState(self):
        """Handle the character right after the "SYSTEM" keyword.

        Mirrors afterDoctypePublicKeywordState: a quote with no separating
        space is a parse error; non-space characters are ungot and
        re-processed before the system identifier.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypeSystemIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypeSystemIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypeSystemIdentifierState
        return True
1596 | |
1597 def beforeDoctypeSystemIdentifierState(self): | |
1598 data = self.stream.char() | |
1599 if data in spaceCharacters: | |
1600 pass | |
1601 elif data == "\"": | |
1602 self.currentToken["systemId"] = "" | |
1603 self.state = self.doctypeSystemIdentifierDoubleQuotedState | |
1604 elif data == "'": | |
1605 self.currentToken["systemId"] = "" | |
1606 self.state = self.doctypeSystemIdentifierSingleQuotedState | |
1607 elif data == ">": | |
1608 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1609 "unexpected-char-in-doctype"}) | |
1610 self.currentToken["correct"] = False | |
1611 self.tokenQueue.append(self.currentToken) | |
1612 self.state = self.dataState | |
1613 elif data is EOF: | |
1614 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1615 "eof-in-doctype"}) | |
1616 self.currentToken["correct"] = False | |
1617 self.tokenQueue.append(self.currentToken) | |
1618 self.state = self.dataState | |
1619 else: | |
1620 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1621 "unexpected-char-in-doctype"}) | |
1622 self.currentToken["correct"] = False | |
1623 self.state = self.bogusDoctypeState | |
1624 return True | |
1625 | |
1626 def doctypeSystemIdentifierDoubleQuotedState(self): | |
1627 data = self.stream.char() | |
1628 if data == "\"": | |
1629 self.state = self.afterDoctypeSystemIdentifierState | |
1630 elif data == "\u0000": | |
1631 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
1632 "data": "invalid-codepoint"}) | |
1633 self.currentToken["systemId"] += "\uFFFD" | |
1634 elif data == ">": | |
1635 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1636 "unexpected-end-of-doctype"}) | |
1637 self.currentToken["correct"] = False | |
1638 self.tokenQueue.append(self.currentToken) | |
1639 self.state = self.dataState | |
1640 elif data is EOF: | |
1641 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1642 "eof-in-doctype"}) | |
1643 self.currentToken["correct"] = False | |
1644 self.tokenQueue.append(self.currentToken) | |
1645 self.state = self.dataState | |
1646 else: | |
1647 self.currentToken["systemId"] += data | |
1648 return True | |
1649 | |
1650 def doctypeSystemIdentifierSingleQuotedState(self): | |
1651 data = self.stream.char() | |
1652 if data == "'": | |
1653 self.state = self.afterDoctypeSystemIdentifierState | |
1654 elif data == "\u0000": | |
1655 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
1656 "data": "invalid-codepoint"}) | |
1657 self.currentToken["systemId"] += "\uFFFD" | |
1658 elif data == ">": | |
1659 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1660 "unexpected-end-of-doctype"}) | |
1661 self.currentToken["correct"] = False | |
1662 self.tokenQueue.append(self.currentToken) | |
1663 self.state = self.dataState | |
1664 elif data is EOF: | |
1665 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1666 "eof-in-doctype"}) | |
1667 self.currentToken["correct"] = False | |
1668 self.tokenQueue.append(self.currentToken) | |
1669 self.state = self.dataState | |
1670 else: | |
1671 self.currentToken["systemId"] += data | |
1672 return True | |
1673 | |
1674 def afterDoctypeSystemIdentifierState(self): | |
1675 data = self.stream.char() | |
1676 if data in spaceCharacters: | |
1677 pass | |
1678 elif data == ">": | |
1679 self.tokenQueue.append(self.currentToken) | |
1680 self.state = self.dataState | |
1681 elif data is EOF: | |
1682 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1683 "eof-in-doctype"}) | |
1684 self.currentToken["correct"] = False | |
1685 self.tokenQueue.append(self.currentToken) | |
1686 self.state = self.dataState | |
1687 else: | |
1688 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | |
1689 "unexpected-char-in-doctype"}) | |
1690 self.state = self.bogusDoctypeState | |
1691 return True | |
1692 | |
1693 def bogusDoctypeState(self): | |
1694 data = self.stream.char() | |
1695 if data == ">": | |
1696 self.tokenQueue.append(self.currentToken) | |
1697 self.state = self.dataState | |
1698 elif data is EOF: | |
1699 # XXX EMIT | |
1700 self.stream.unget(data) | |
1701 self.tokenQueue.append(self.currentToken) | |
1702 self.state = self.dataState | |
1703 else: | |
1704 pass | |
1705 return True | |
1706 | |
1707 def cdataSectionState(self): | |
1708 data = [] | |
1709 while True: | |
1710 data.append(self.stream.charsUntil("]")) | |
1711 data.append(self.stream.charsUntil(">")) | |
1712 char = self.stream.char() | |
1713 if char == EOF: | |
1714 break | |
1715 else: | |
1716 assert char == ">" | |
1717 if data[-1][-2:] == "]]": | |
1718 data[-1] = data[-1][:-2] | |
1719 break | |
1720 else: | |
1721 data.append(char) | |
1722 | |
1723 data = "".join(data) # pylint:disable=redefined-variable-type | |
1724 # Deal with null here rather than in the parser | |
1725 nullCount = data.count("\u0000") | |
1726 if nullCount > 0: | |
1727 for _ in range(nullCount): | |
1728 self.tokenQueue.append({"type": tokenTypes["ParseError"], | |
1729 "data": "invalid-codepoint"}) | |
1730 data = data.replace("\u0000", "\uFFFD") | |
1731 if data: | |
1732 self.tokenQueue.append({"type": tokenTypes["Characters"], | |
1733 "data": data}) | |
1734 self.state = self.dataState | |
1735 return True |