comparison env/lib/python3.9/site-packages/ruamel/yaml/scanner.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000 (2021-03-22)
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # coding: utf-8
2
3 from __future__ import print_function, absolute_import, division, unicode_literals
4
5 # Scanner produces tokens of the following types:
6 # STREAM-START
7 # STREAM-END
8 # DIRECTIVE(name, value)
9 # DOCUMENT-START
10 # DOCUMENT-END
11 # BLOCK-SEQUENCE-START
12 # BLOCK-MAPPING-START
13 # BLOCK-END
14 # FLOW-SEQUENCE-START
15 # FLOW-MAPPING-START
16 # FLOW-SEQUENCE-END
17 # FLOW-MAPPING-END
18 # BLOCK-ENTRY
19 # FLOW-ENTRY
20 # KEY
21 # VALUE
22 # ALIAS(value)
23 # ANCHOR(value)
24 # TAG(value)
25 # SCALAR(value, plain, style)
26 #
27 # RoundTripScanner
28 # COMMENT(value)
29 #
30 # Read comments in the Scanner code for more details.
31 #
32
33 from ruamel.yaml.error import MarkedYAMLError
34 from ruamel.yaml.tokens import * # NOQA
35 from ruamel.yaml.compat import utf8, unichr, PY3, check_anchorname_char, nprint # NOQA
36
37 if False: # MYPY
38 from typing import Any, Dict, Optional, List, Union, Text # NOQA
39 from ruamel.yaml.compat import VersionType # NOQA
40
__all__ = ['Scanner', 'RoundTripScanner', 'ScannerError']


# Characters that end a line or the stream: '\0' is the reader's
# end-of-stream sentinel; '\x85', '\u2028', '\u2029' are the additional
# YAML line-break characters (NEL, LS, PS).
_THE_END = '\n\0\r\x85\u2028\u2029'
# Same set plus space and tab, i.e. "blank or end of line/stream".
_THE_END_SPACE_TAB = ' \n\0\t\r\x85\u2028\u2029'
# Just the in-line blank characters.
_SPACE_TAB = ' \t'
47
48
class ScannerError(MarkedYAMLError):
    """Raised when the scanner meets malformed input.

    Inherits the context/problem mark formatting from MarkedYAMLError.
    """

    pass
51
52
class SimpleKey(object):
    """Bookkeeping record for a potential simple key.

    See the simple keys treatment in the Scanner below; one of these is
    stored per flow level in ``possible_simple_keys``.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        # type: (Any, Any, int, int, int, Any) -> None
        # Remember everything needed to later decide whether this key is
        # still viable and where the KEY token has to be inserted.
        (self.token_number, self.required, self.index,
         self.line, self.column, self.mark) = (
            token_number, required, index, line, column, mark)
64
65
66 class Scanner(object):
    def __init__(self, loader=None):
        # type: (Any) -> None
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)     # peek the next i-th character
        #   self.prefix(l=1)   # peek the next l characters
        #   self.forward(l=1)  # read the next l characters and move the pointer

        self.loader = loader
        # Register ourselves on the loader unless it already has a scanner.
        if self.loader is not None and getattr(self.loader, '_scanner', None) is None:
            self.loader._scanner = self
        self.reset_scanner()
        self.first_time = False
        # Set by scan_yaml_directive_value when a %YAML directive is seen.
        self.yaml_version = None  # type: Any
85
86 @property
87 def flow_level(self):
88 # type: () -> int
89 return len(self.flow_context)
90
    def reset_scanner(self):
        # type: () -> None
        """Reset all per-stream scanner state (called from __init__)."""
        # Had we reached the end of the stream?
        self.done = False

        # flow_context is an expanding/shrinking list consisting of '{' and '['
        # for each unclosed flow context. If empty list that means block context
        self.flow_context = []  # type: List[Text]

        # List of processed tokens that are not yet emitted.
        self.tokens = []  # type: List[Any]

        # Add the STREAM-START token.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []  # type: List[int]

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}  # type: Dict[Any, Any]
144
145 @property
146 def reader(self):
147 # type: () -> Any
148 try:
149 return self._scanner_reader # type: ignore
150 except AttributeError:
151 if hasattr(self.loader, 'typ'):
152 self._scanner_reader = self.loader.reader
153 else:
154 self._scanner_reader = self.loader._reader
155 return self._scanner_reader
156
157 @property
158 def scanner_processing_version(self): # prefix until un-composited
159 # type: () -> Any
160 if hasattr(self.loader, 'typ'):
161 return self.loader.resolver.processing_version
162 return self.loader.processing_version
163
164 # Public methods.
165
166 def check_token(self, *choices):
167 # type: (Any) -> bool
168 # Check if the next token is one of the given types.
169 while self.need_more_tokens():
170 self.fetch_more_tokens()
171 if bool(self.tokens):
172 if not choices:
173 return True
174 for choice in choices:
175 if isinstance(self.tokens[0], choice):
176 return True
177 return False
178
179 def peek_token(self):
180 # type: () -> Any
181 # Return the next token, but do not delete if from the queue.
182 while self.need_more_tokens():
183 self.fetch_more_tokens()
184 if bool(self.tokens):
185 return self.tokens[0]
186
187 def get_token(self):
188 # type: () -> Any
189 # Return the next token.
190 while self.need_more_tokens():
191 self.fetch_more_tokens()
192 if bool(self.tokens):
193 self.tokens_taken += 1
194 return self.tokens.pop(0)
195
196 # Private methods.
197
198 def need_more_tokens(self):
199 # type: () -> bool
200 if self.done:
201 return False
202 if not self.tokens:
203 return True
204 # The current token may be a potential simple key, so we
205 # need to look further.
206 self.stale_possible_simple_keys()
207 if self.next_possible_simple_key() == self.tokens_taken:
208 return True
209 return False
210
211 def fetch_comment(self, comment):
212 # type: (Any) -> None
213 raise NotImplementedError
214
    def fetch_more_tokens(self):
        # type: () -> Any
        """Scan the next token from the reader and append it to self.tokens.

        Dispatches on the next character; the first four checks (stream
        end, directive, document start/end) are order-sensitive.
        """
        # Eat whitespaces and comments until we reach the next token.
        comment = self.scan_to_next_token()
        if comment is not None:  # never happens for base scanner
            return self.fetch_comment(comment)
        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)

        # Peek the next character.
        ch = self.reader.peek()

        # Is it the end of stream?
        if ch == '\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        # if ch == u'\uFEFF':
        #     return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == "'":
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == '"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError(
            'while scanning for the next token',
            None,
            'found character %r that cannot start any token' % utf8(ch),
            self.reader.get_mark(),
        )
324
325 # Simple keys treatment.
326
327 def next_possible_simple_key(self):
328 # type: () -> Any
329 # Return the number of the nearest possible simple key. Actually we
330 # don't need to loop through the whole dictionary. We may replace it
331 # with the following code:
332 # if not self.possible_simple_keys:
333 # return None
334 # return self.possible_simple_keys[
335 # min(self.possible_simple_keys.keys())].token_number
336 min_token_number = None
337 for level in self.possible_simple_keys:
338 key = self.possible_simple_keys[level]
339 if min_token_number is None or key.token_number < min_token_number:
340 min_token_number = key.token_number
341 return min_token_number
342
343 def stale_possible_simple_keys(self):
344 # type: () -> None
345 # Remove entries that are no longer possible simple keys. According to
346 # the YAML specification, simple keys
347 # - should be limited to a single line,
348 # - should be no longer than 1024 characters.
349 # Disabling this procedure will allow simple keys of any length and
350 # height (may cause problems if indentation is broken though).
351 for level in list(self.possible_simple_keys):
352 key = self.possible_simple_keys[level]
353 if key.line != self.reader.line or self.reader.index - key.index > 1024:
354 if key.required:
355 raise ScannerError(
356 'while scanning a simple key',
357 key.mark,
358 "could not find expected ':'",
359 self.reader.get_mark(),
360 )
361 del self.possible_simple_keys[level]
362
363 def save_possible_simple_key(self):
364 # type: () -> None
365 # The next token may start a simple key. We check if it's possible
366 # and save its position. This function is called for
367 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
368
369 # Check if a simple key is required at the current position.
370 required = not self.flow_level and self.indent == self.reader.column
371
372 # The next token might be a simple key. Let's save it's number and
373 # position.
374 if self.allow_simple_key:
375 self.remove_possible_simple_key()
376 token_number = self.tokens_taken + len(self.tokens)
377 key = SimpleKey(
378 token_number,
379 required,
380 self.reader.index,
381 self.reader.line,
382 self.reader.column,
383 self.reader.get_mark(),
384 )
385 self.possible_simple_keys[self.flow_level] = key
386
387 def remove_possible_simple_key(self):
388 # type: () -> None
389 # Remove the saved possible key position at the current flow level.
390 if self.flow_level in self.possible_simple_keys:
391 key = self.possible_simple_keys[self.flow_level]
392
393 if key.required:
394 raise ScannerError(
395 'while scanning a simple key',
396 key.mark,
397 "could not find expected ':'",
398 self.reader.get_mark(),
399 )
400
401 del self.possible_simple_keys[self.flow_level]
402
403 # Indentation functions.
404
405 def unwind_indent(self, column):
406 # type: (Any) -> None
407 # In flow context, tokens should respect indentation.
408 # Actually the condition should be `self.indent >= column` according to
409 # the spec. But this condition will prohibit intuitively correct
410 # constructions such as
411 # key : {
412 # }
413 # ####
414 # if self.flow_level and self.indent > column:
415 # raise ScannerError(None, None,
416 # "invalid intendation or unclosed '[' or '{'",
417 # self.reader.get_mark())
418
419 # In the flow context, indentation is ignored. We make the scanner less
420 # restrictive then specification requires.
421 if bool(self.flow_level):
422 return
423
424 # In block context, we may need to issue the BLOCK-END tokens.
425 while self.indent > column:
426 mark = self.reader.get_mark()
427 self.indent = self.indents.pop()
428 self.tokens.append(BlockEndToken(mark, mark))
429
430 def add_indent(self, column):
431 # type: (int) -> bool
432 # Check if we need to increase indentation.
433 if self.indent < column:
434 self.indents.append(self.indent)
435 self.indent = column
436 return True
437 return False
438
439 # Fetchers.
440
441 def fetch_stream_start(self):
442 # type: () -> None
443 # We always add STREAM-START as the first token and STREAM-END as the
444 # last token.
445 # Read the token.
446 mark = self.reader.get_mark()
447 # Add STREAM-START.
448 self.tokens.append(StreamStartToken(mark, mark, encoding=self.reader.encoding))
449
450 def fetch_stream_end(self):
451 # type: () -> None
452 # Set the current intendation to -1.
453 self.unwind_indent(-1)
454 # Reset simple keys.
455 self.remove_possible_simple_key()
456 self.allow_simple_key = False
457 self.possible_simple_keys = {}
458 # Read the token.
459 mark = self.reader.get_mark()
460 # Add STREAM-END.
461 self.tokens.append(StreamEndToken(mark, mark))
462 # The steam is finished.
463 self.done = True
464
465 def fetch_directive(self):
466 # type: () -> None
467 # Set the current intendation to -1.
468 self.unwind_indent(-1)
469
470 # Reset simple keys.
471 self.remove_possible_simple_key()
472 self.allow_simple_key = False
473
474 # Scan and add DIRECTIVE.
475 self.tokens.append(self.scan_directive())
476
477 def fetch_document_start(self):
478 # type: () -> None
479 self.fetch_document_indicator(DocumentStartToken)
480
481 def fetch_document_end(self):
482 # type: () -> None
483 self.fetch_document_indicator(DocumentEndToken)
484
485 def fetch_document_indicator(self, TokenClass):
486 # type: (Any) -> None
487 # Set the current intendation to -1.
488 self.unwind_indent(-1)
489
490 # Reset simple keys. Note that there could not be a block collection
491 # after '---'.
492 self.remove_possible_simple_key()
493 self.allow_simple_key = False
494
495 # Add DOCUMENT-START or DOCUMENT-END.
496 start_mark = self.reader.get_mark()
497 self.reader.forward(3)
498 end_mark = self.reader.get_mark()
499 self.tokens.append(TokenClass(start_mark, end_mark))
500
501 def fetch_flow_sequence_start(self):
502 # type: () -> None
503 self.fetch_flow_collection_start(FlowSequenceStartToken, to_push='[')
504
505 def fetch_flow_mapping_start(self):
506 # type: () -> None
507 self.fetch_flow_collection_start(FlowMappingStartToken, to_push='{')
508
509 def fetch_flow_collection_start(self, TokenClass, to_push):
510 # type: (Any, Text) -> None
511 # '[' and '{' may start a simple key.
512 self.save_possible_simple_key()
513 # Increase the flow level.
514 self.flow_context.append(to_push)
515 # Simple keys are allowed after '[' and '{'.
516 self.allow_simple_key = True
517 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
518 start_mark = self.reader.get_mark()
519 self.reader.forward()
520 end_mark = self.reader.get_mark()
521 self.tokens.append(TokenClass(start_mark, end_mark))
522
523 def fetch_flow_sequence_end(self):
524 # type: () -> None
525 self.fetch_flow_collection_end(FlowSequenceEndToken)
526
527 def fetch_flow_mapping_end(self):
528 # type: () -> None
529 self.fetch_flow_collection_end(FlowMappingEndToken)
530
531 def fetch_flow_collection_end(self, TokenClass):
532 # type: (Any) -> None
533 # Reset possible simple key on the current level.
534 self.remove_possible_simple_key()
535 # Decrease the flow level.
536 try:
537 popped = self.flow_context.pop() # NOQA
538 except IndexError:
539 # We must not be in a list or object.
540 # Defer error handling to the parser.
541 pass
542 # No simple keys after ']' or '}'.
543 self.allow_simple_key = False
544 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
545 start_mark = self.reader.get_mark()
546 self.reader.forward()
547 end_mark = self.reader.get_mark()
548 self.tokens.append(TokenClass(start_mark, end_mark))
549
550 def fetch_flow_entry(self):
551 # type: () -> None
552 # Simple keys are allowed after ','.
553 self.allow_simple_key = True
554 # Reset possible simple key on the current level.
555 self.remove_possible_simple_key()
556 # Add FLOW-ENTRY.
557 start_mark = self.reader.get_mark()
558 self.reader.forward()
559 end_mark = self.reader.get_mark()
560 self.tokens.append(FlowEntryToken(start_mark, end_mark))
561
562 def fetch_block_entry(self):
563 # type: () -> None
564 # Block context needs additional checks.
565 if not self.flow_level:
566 # Are we allowed to start a new entry?
567 if not self.allow_simple_key:
568 raise ScannerError(
569 None, None, 'sequence entries are not allowed here', self.reader.get_mark()
570 )
571 # We may need to add BLOCK-SEQUENCE-START.
572 if self.add_indent(self.reader.column):
573 mark = self.reader.get_mark()
574 self.tokens.append(BlockSequenceStartToken(mark, mark))
575 # It's an error for the block entry to occur in the flow context,
576 # but we let the parser detect this.
577 else:
578 pass
579 # Simple keys are allowed after '-'.
580 self.allow_simple_key = True
581 # Reset possible simple key on the current level.
582 self.remove_possible_simple_key()
583
584 # Add BLOCK-ENTRY.
585 start_mark = self.reader.get_mark()
586 self.reader.forward()
587 end_mark = self.reader.get_mark()
588 self.tokens.append(BlockEntryToken(start_mark, end_mark))
589
590 def fetch_key(self):
591 # type: () -> None
592 # Block context needs additional checks.
593 if not self.flow_level:
594
595 # Are we allowed to start a key (not nessesary a simple)?
596 if not self.allow_simple_key:
597 raise ScannerError(
598 None, None, 'mapping keys are not allowed here', self.reader.get_mark()
599 )
600
601 # We may need to add BLOCK-MAPPING-START.
602 if self.add_indent(self.reader.column):
603 mark = self.reader.get_mark()
604 self.tokens.append(BlockMappingStartToken(mark, mark))
605
606 # Simple keys are allowed after '?' in the block context.
607 self.allow_simple_key = not self.flow_level
608
609 # Reset possible simple key on the current level.
610 self.remove_possible_simple_key()
611
612 # Add KEY.
613 start_mark = self.reader.get_mark()
614 self.reader.forward()
615 end_mark = self.reader.get_mark()
616 self.tokens.append(KeyToken(start_mark, end_mark))
617
    def fetch_value(self):
        # type: () -> None
        # ':' has been seen: retroactively emit the KEY token (and possibly
        # BLOCK-MAPPING-START) for a pending simple key, then emit VALUE.
        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:
            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            # Insert at the position recorded when the key started, not at
            # the end, since other tokens may have been queued since then.
            self.tokens.insert(
                key.token_number - self.tokens_taken, KeyToken(key.mark, key.mark)
            )

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(
                        key.token_number - self.tokens_taken,
                        BlockMappingStartToken(key.mark, key.mark),
                    )

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(
                        None,
                        None,
                        'mapping values are not allowed here',
                        self.reader.get_mark(),
                    )

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.reader.column):
                    mark = self.reader.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
678
679 def fetch_alias(self):
680 # type: () -> None
681 # ALIAS could be a simple key.
682 self.save_possible_simple_key()
683 # No simple keys after ALIAS.
684 self.allow_simple_key = False
685 # Scan and add ALIAS.
686 self.tokens.append(self.scan_anchor(AliasToken))
687
688 def fetch_anchor(self):
689 # type: () -> None
690 # ANCHOR could start a simple key.
691 self.save_possible_simple_key()
692 # No simple keys after ANCHOR.
693 self.allow_simple_key = False
694 # Scan and add ANCHOR.
695 self.tokens.append(self.scan_anchor(AnchorToken))
696
697 def fetch_tag(self):
698 # type: () -> None
699 # TAG could start a simple key.
700 self.save_possible_simple_key()
701 # No simple keys after TAG.
702 self.allow_simple_key = False
703 # Scan and add TAG.
704 self.tokens.append(self.scan_tag())
705
706 def fetch_literal(self):
707 # type: () -> None
708 self.fetch_block_scalar(style='|')
709
710 def fetch_folded(self):
711 # type: () -> None
712 self.fetch_block_scalar(style='>')
713
714 def fetch_block_scalar(self, style):
715 # type: (Any) -> None
716 # A simple key may follow a block scalar.
717 self.allow_simple_key = True
718 # Reset possible simple key on the current level.
719 self.remove_possible_simple_key()
720 # Scan and add SCALAR.
721 self.tokens.append(self.scan_block_scalar(style))
722
723 def fetch_single(self):
724 # type: () -> None
725 self.fetch_flow_scalar(style="'")
726
727 def fetch_double(self):
728 # type: () -> None
729 self.fetch_flow_scalar(style='"')
730
731 def fetch_flow_scalar(self, style):
732 # type: (Any) -> None
733 # A flow scalar could be a simple key.
734 self.save_possible_simple_key()
735 # No simple keys after flow scalars.
736 self.allow_simple_key = False
737 # Scan and add SCALAR.
738 self.tokens.append(self.scan_flow_scalar(style))
739
740 def fetch_plain(self):
741 # type: () -> None
742 # A plain scalar could be a simple key.
743 self.save_possible_simple_key()
744 # No simple keys after plain scalars. But note that `scan_plain` will
745 # change this flag if the scan is finished at the beginning of the
746 # line.
747 self.allow_simple_key = False
748 # Scan and add SCALAR. May change `allow_simple_key`.
749 self.tokens.append(self.scan_plain())
750
751 # Checkers.
752
753 def check_directive(self):
754 # type: () -> Any
755 # DIRECTIVE: ^ '%' ...
756 # The '%' indicator is already checked.
757 if self.reader.column == 0:
758 return True
759 return None
760
761 def check_document_start(self):
762 # type: () -> Any
763 # DOCUMENT-START: ^ '---' (' '|'\n')
764 if self.reader.column == 0:
765 if self.reader.prefix(3) == '---' and self.reader.peek(3) in _THE_END_SPACE_TAB:
766 return True
767 return None
768
769 def check_document_end(self):
770 # type: () -> Any
771 # DOCUMENT-END: ^ '...' (' '|'\n')
772 if self.reader.column == 0:
773 if self.reader.prefix(3) == '...' and self.reader.peek(3) in _THE_END_SPACE_TAB:
774 return True
775 return None
776
777 def check_block_entry(self):
778 # type: () -> Any
779 # BLOCK-ENTRY: '-' (' '|'\n')
780 return self.reader.peek(1) in _THE_END_SPACE_TAB
781
782 def check_key(self):
783 # type: () -> Any
784 # KEY(flow context): '?'
785 if bool(self.flow_level):
786 return True
787 # KEY(block context): '?' (' '|'\n')
788 return self.reader.peek(1) in _THE_END_SPACE_TAB
789
790 def check_value(self):
791 # type: () -> Any
792 # VALUE(flow context): ':'
793 if self.scanner_processing_version == (1, 1):
794 if bool(self.flow_level):
795 return True
796 else:
797 if bool(self.flow_level):
798 if self.flow_context[-1] == '[':
799 if self.reader.peek(1) not in _THE_END_SPACE_TAB:
800 return False
801 elif self.tokens and isinstance(self.tokens[-1], ValueToken):
802 # mapping flow context scanning a value token
803 if self.reader.peek(1) not in _THE_END_SPACE_TAB:
804 return False
805 return True
806 # VALUE(block context): ':' (' '|'\n')
807 return self.reader.peek(1) in _THE_END_SPACE_TAB
808
    def check_plain(self):
        # type: () -> Any
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        srp = self.reader.peek
        ch = srp()
        if self.scanner_processing_version == (1, 1):
            return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`' or (
                srp(1) not in _THE_END_SPACE_TAB
                and (ch == '-' or (not self.flow_level and ch in '?:'))
            )
        # YAML 1.2
        if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`':
            # ###################  ^ ???
            return True
        ch1 = srp(1)
        # A '-' followed by a non-blank starts a plain scalar.
        if ch == '-' and ch1 not in _THE_END_SPACE_TAB:
            return True
        # In the 1.2 flow context a ':' directly followed by a non-blank
        # (other than space/tab) may start a plain scalar.
        if ch == ':' and bool(self.flow_level) and ch1 not in _SPACE_TAB:
            return True

        # Remaining indicator cases: '?' or ':' in block context followed
        # by a non-blank.
        return srp(1) not in _THE_END_SPACE_TAB and (
            ch == '-' or (not self.flow_level and ch in '?:')
        )
843
844 # Scanners.
845
    def scan_to_next_token(self):
        # type: () -> Any
        """Skip spaces, line breaks and comments up to the next token.

        Returns None for the base scanner (the RoundTripScanner override
        returns collected comments instead).
        """
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        srp = self.reader.peek
        srf = self.reader.forward
        # Strip a leading BOM only at the very start of the stream.
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        _the_end = _THE_END
        while not found:
            while srp() == ' ':
                srf()
            if srp() == '#':
                # Skip the comment to the end of the line.
                while srp() not in _the_end:
                    srf()
            if self.scan_line_break():
                # A line break in block context re-enables simple keys.
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
        return None
884
885 def scan_directive(self):
886 # type: () -> Any
887 # See the specification for details.
888 srp = self.reader.peek
889 srf = self.reader.forward
890 start_mark = self.reader.get_mark()
891 srf()
892 name = self.scan_directive_name(start_mark)
893 value = None
894 if name == 'YAML':
895 value = self.scan_yaml_directive_value(start_mark)
896 end_mark = self.reader.get_mark()
897 elif name == 'TAG':
898 value = self.scan_tag_directive_value(start_mark)
899 end_mark = self.reader.get_mark()
900 else:
901 end_mark = self.reader.get_mark()
902 while srp() not in _THE_END:
903 srf()
904 self.scan_directive_ignored_line(start_mark)
905 return DirectiveToken(name, value, start_mark, end_mark)
906
907 def scan_directive_name(self, start_mark):
908 # type: (Any) -> Any
909 # See the specification for details.
910 length = 0
911 srp = self.reader.peek
912 ch = srp(length)
913 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_:.':
914 length += 1
915 ch = srp(length)
916 if not length:
917 raise ScannerError(
918 'while scanning a directive',
919 start_mark,
920 'expected alphabetic or numeric character, but found %r' % utf8(ch),
921 self.reader.get_mark(),
922 )
923 value = self.reader.prefix(length)
924 self.reader.forward(length)
925 ch = srp()
926 if ch not in '\0 \r\n\x85\u2028\u2029':
927 raise ScannerError(
928 'while scanning a directive',
929 start_mark,
930 'expected alphabetic or numeric character, but found %r' % utf8(ch),
931 self.reader.get_mark(),
932 )
933 return value
934
935 def scan_yaml_directive_value(self, start_mark):
936 # type: (Any) -> Any
937 # See the specification for details.
938 srp = self.reader.peek
939 srf = self.reader.forward
940 while srp() == ' ':
941 srf()
942 major = self.scan_yaml_directive_number(start_mark)
943 if srp() != '.':
944 raise ScannerError(
945 'while scanning a directive',
946 start_mark,
947 "expected a digit or '.', but found %r" % utf8(srp()),
948 self.reader.get_mark(),
949 )
950 srf()
951 minor = self.scan_yaml_directive_number(start_mark)
952 if srp() not in '\0 \r\n\x85\u2028\u2029':
953 raise ScannerError(
954 'while scanning a directive',
955 start_mark,
956 "expected a digit or ' ', but found %r" % utf8(srp()),
957 self.reader.get_mark(),
958 )
959 self.yaml_version = (major, minor)
960 return self.yaml_version
961
962 def scan_yaml_directive_number(self, start_mark):
963 # type: (Any) -> Any
964 # See the specification for details.
965 srp = self.reader.peek
966 srf = self.reader.forward
967 ch = srp()
968 if not ('0' <= ch <= '9'):
969 raise ScannerError(
970 'while scanning a directive',
971 start_mark,
972 'expected a digit, but found %r' % utf8(ch),
973 self.reader.get_mark(),
974 )
975 length = 0
976 while '0' <= srp(length) <= '9':
977 length += 1
978 value = int(self.reader.prefix(length))
979 srf(length)
980 return value
981
982 def scan_tag_directive_value(self, start_mark):
983 # type: (Any) -> Any
984 # See the specification for details.
985 srp = self.reader.peek
986 srf = self.reader.forward
987 while srp() == ' ':
988 srf()
989 handle = self.scan_tag_directive_handle(start_mark)
990 while srp() == ' ':
991 srf()
992 prefix = self.scan_tag_directive_prefix(start_mark)
993 return (handle, prefix)
994
995 def scan_tag_directive_handle(self, start_mark):
996 # type: (Any) -> Any
997 # See the specification for details.
998 value = self.scan_tag_handle('directive', start_mark)
999 ch = self.reader.peek()
1000 if ch != ' ':
1001 raise ScannerError(
1002 'while scanning a directive',
1003 start_mark,
1004 "expected ' ', but found %r" % utf8(ch),
1005 self.reader.get_mark(),
1006 )
1007 return value
1008
1009 def scan_tag_directive_prefix(self, start_mark):
1010 # type: (Any) -> Any
1011 # See the specification for details.
1012 value = self.scan_tag_uri('directive', start_mark)
1013 ch = self.reader.peek()
1014 if ch not in '\0 \r\n\x85\u2028\u2029':
1015 raise ScannerError(
1016 'while scanning a directive',
1017 start_mark,
1018 "expected ' ', but found %r" % utf8(ch),
1019 self.reader.get_mark(),
1020 )
1021 return value
1022
1023 def scan_directive_ignored_line(self, start_mark):
1024 # type: (Any) -> None
1025 # See the specification for details.
1026 srp = self.reader.peek
1027 srf = self.reader.forward
1028 while srp() == ' ':
1029 srf()
1030 if srp() == '#':
1031 while srp() not in _THE_END:
1032 srf()
1033 ch = srp()
1034 if ch not in _THE_END:
1035 raise ScannerError(
1036 'while scanning a directive',
1037 start_mark,
1038 'expected a comment or a line break, but found %r' % utf8(ch),
1039 self.reader.get_mark(),
1040 )
1041 self.scan_line_break()
1042
    def scan_anchor(self, TokenClass):
        # type: (Any) -> Any
        """Scan an '&anchor' or '*alias' token and return a TokenClass
        instance carrying the name.

        The specification does not restrict characters for anchors and
        aliases. This may lead to problems, for instance, the document:
          [ *alias, value ]
        can be interpreted in two ways, as
          [ "value" ]
        and
          [ *alias , "value" ]
        Therefore we restrict aliases to numbers and ASCII letters.
        """
        srp = self.reader.peek
        start_mark = self.reader.get_mark()
        # '*' introduces an alias; anything else (i.e. '&') an anchor —
        # the name only affects error messages.
        indicator = srp()
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = srp(length)
        # while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
        #         or ch in u'-_':
        while check_anchorname_char(ch):
            length += 1
            ch = srp(length)
        if not length:
            raise ScannerError(
                'while scanning an %s' % (name,),
                start_mark,
                'expected alphabetic or numeric character, but found %r' % utf8(ch),
                self.reader.get_mark(),
            )
        value = self.reader.prefix(length)
        self.reader.forward(length)
        # ch1 = ch
        # ch = srp()  # no need to peek, ch is already set
        # assert ch1 == ch
        # the name must be terminated by a character that can legally follow it
        if ch not in '\0 \t\r\n\x85\u2028\u2029?:,[]{}%@`':
            raise ScannerError(
                'while scanning an %s' % (name,),
                start_mark,
                'expected alphabetic or numeric character, but found %r' % utf8(ch),
                self.reader.get_mark(),
            )
        end_mark = self.reader.get_mark()
        return TokenClass(value, start_mark, end_mark)
1089
1090 def scan_tag(self):
1091 # type: () -> Any
1092 # See the specification for details.
1093 srp = self.reader.peek
1094 start_mark = self.reader.get_mark()
1095 ch = srp(1)
1096 if ch == '<':
1097 handle = None
1098 self.reader.forward(2)
1099 suffix = self.scan_tag_uri('tag', start_mark)
1100 if srp() != '>':
1101 raise ScannerError(
1102 'while parsing a tag',
1103 start_mark,
1104 "expected '>', but found %r" % utf8(srp()),
1105 self.reader.get_mark(),
1106 )
1107 self.reader.forward()
1108 elif ch in _THE_END_SPACE_TAB:
1109 handle = None
1110 suffix = '!'
1111 self.reader.forward()
1112 else:
1113 length = 1
1114 use_handle = False
1115 while ch not in '\0 \r\n\x85\u2028\u2029':
1116 if ch == '!':
1117 use_handle = True
1118 break
1119 length += 1
1120 ch = srp(length)
1121 handle = '!'
1122 if use_handle:
1123 handle = self.scan_tag_handle('tag', start_mark)
1124 else:
1125 handle = '!'
1126 self.reader.forward()
1127 suffix = self.scan_tag_uri('tag', start_mark)
1128 ch = srp()
1129 if ch not in '\0 \r\n\x85\u2028\u2029':
1130 raise ScannerError(
1131 'while scanning a tag',
1132 start_mark,
1133 "expected ' ', but found %r" % utf8(ch),
1134 self.reader.get_mark(),
1135 )
1136 value = (handle, suffix)
1137 end_mark = self.reader.get_mark()
1138 return TagToken(value, start_mark, end_mark)
1139
    def scan_block_scalar(self, style, rt=False):
        # type: (Any, Optional[bool]) -> Any
        """Scan a literal ('|') or folded ('>') block scalar.

        Returns a ScalarToken; a comment on the header line becomes a
        pre-comment, trailing chomped breaks plus following comments become
        a post-comment. With rt=True (round-trip mode) a '\\a' marker is
        inserted where a fold occurred so folds can be reproduced on output.
        """
        srp = self.reader.peek
        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []  # type: List[Any]
        start_mark = self.reader.get_mark()

        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        # block scalar comment e.g. : |+ # comment text
        block_scalar_comment = self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent + 1
        if increment is None:
            # no increment and top level, min_indent could be 0
            if min_indent < 1 and (
                style not in '|>'
                or (self.scanner_processing_version == (1, 1))
                and getattr(
                    self.loader, 'top_level_block_style_scalar_no_indent_error_1_1', False
                )
            ):
                min_indent = 1
            # auto-detect the indent from the first non-empty line
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            if min_indent < 1:
                min_indent = 1
            # explicit indentation indicator: indent is relative to the parent
            indent = min_indent + increment - 1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ""

        # Scan the inner part of the block scalar.
        while self.reader.column == indent and srp() != '\0':
            chunks.extend(breaks)
            # a more-indented or tab-led line suppresses folding below
            leading_non_space = srp() not in ' \t'
            length = 0
            while srp(length) not in _THE_END:
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if style in '|>' and min_indent == 0:
                # at the beginning of a line, if in block style see if
                # end of document/start_new_document
                if self.check_document_start() or self.check_document_end():
                    break
            if self.reader.column == indent and srp() != '\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if rt and folded and line_break == '\n':
                    # round-trip marker: remember where a fold happened
                    chunks.append('\a')
                if folded and line_break == '\n' and leading_non_space and srp() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                # if folded and line_break == u'\n':
                #     if not breaks:
                #         if srp() not in ' \t':
                #             chunks.append(u' ')
                #         else:
                #             chunks.append(line_break)
                # else:
                #     chunks.append(line_break)
            else:
                break

        # Process trailing line breaks. The 'chomping' setting determines
        # whether they are included in the value.
        trailing = []  # type: List[Any]
        if chomping in [None, True]:
            # clip (None) and keep (True) both retain the final line break
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)
        elif chomping in [None, False]:
            # clipped/stripped breaks are preserved separately for round-trip
            trailing.extend(breaks)

        # We are done.
        token = ScalarToken("".join(chunks), False, start_mark, end_mark, style)
        if block_scalar_comment is not None:
            token.add_pre_comments([block_scalar_comment])
        if len(trailing) > 0:
            # nprint('trailing 1', trailing) # XXXXX
            # Eat whitespaces and comments until we reach the next token.
            comment = self.scan_to_next_token()
            while comment:
                trailing.append(' ' * comment[1].column + comment[0])
                comment = self.scan_to_next_token()

            # Keep track of the trailing whitespace and following comments
            # as a comment token, if isn't all included in the actual value.
            comment_end_mark = self.reader.get_mark()
            comment = CommentToken("".join(trailing), end_mark, comment_end_mark)
            token.add_post_comment(comment)
        return token
1251
    def scan_block_scalar_indicators(self, start_mark):
        # type: (Any) -> Any
        """Scan the optional chomping ('+'/'-') and indentation (1-9)
        indicators after a block scalar introducer; they may appear in
        either order.

        Returns (chomping, increment): chomping is True for '+', False for
        '-', None if absent; increment is an int in 1-9 or None if absent.
        """
        srp = self.reader.peek
        chomping = None
        increment = None
        ch = srp()
        if ch in '+-':
            # chomping indicator first, optionally followed by an indent digit
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.reader.forward()
            ch = srp()
            if ch in '0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError(
                        'while scanning a block scalar',
                        start_mark,
                        'expected indentation indicator in the range 1-9, ' 'but found 0',
                        self.reader.get_mark(),
                    )
                self.reader.forward()
        elif ch in '0123456789':
            # indent digit first, optionally followed by a chomping indicator
            increment = int(ch)
            if increment == 0:
                raise ScannerError(
                    'while scanning a block scalar',
                    start_mark,
                    'expected indentation indicator in the range 1-9, ' 'but found 0',
                    self.reader.get_mark(),
                )
            self.reader.forward()
            ch = srp()
            if ch in '+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.reader.forward()
        # whatever follows the indicators must end the header (a comment on
        # this line is handled later by scan_block_scalar_ignored_line)
        ch = srp()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a block scalar',
                start_mark,
                'expected chomping or indentation indicators, but found %r' % utf8(ch),
                self.reader.get_mark(),
            )
        return chomping, increment
1302
    def scan_block_scalar_ignored_line(self, start_mark):
        # type: (Any) -> Any
        """Consume the remainder of a block scalar header line.

        Returns the comment text (leading spaces plus '#...' content) if a
        comment is present, otherwise None; always consumes the line break.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        prefix = ''
        comment = None
        while srp() == ' ':
            # collect the spaces so they can be kept as part of the comment
            prefix += srp()
            srf()
        if srp() == '#':
            comment = prefix
            while srp() not in _THE_END:
                comment += srp()
                srf()
        ch = srp()
        if ch not in _THE_END:
            raise ScannerError(
                'while scanning a block scalar',
                start_mark,
                'expected a comment or a line break, but found %r' % utf8(ch),
                self.reader.get_mark(),
            )
        self.scan_line_break()
        return comment
1328
1329 def scan_block_scalar_indentation(self):
1330 # type: () -> Any
1331 # See the specification for details.
1332 srp = self.reader.peek
1333 srf = self.reader.forward
1334 chunks = []
1335 max_indent = 0
1336 end_mark = self.reader.get_mark()
1337 while srp() in ' \r\n\x85\u2028\u2029':
1338 if srp() != ' ':
1339 chunks.append(self.scan_line_break())
1340 end_mark = self.reader.get_mark()
1341 else:
1342 srf()
1343 if self.reader.column > max_indent:
1344 max_indent = self.reader.column
1345 return chunks, max_indent, end_mark
1346
1347 def scan_block_scalar_breaks(self, indent):
1348 # type: (int) -> Any
1349 # See the specification for details.
1350 chunks = []
1351 srp = self.reader.peek
1352 srf = self.reader.forward
1353 end_mark = self.reader.get_mark()
1354 while self.reader.column < indent and srp() == ' ':
1355 srf()
1356 while srp() in '\r\n\x85\u2028\u2029':
1357 chunks.append(self.scan_line_break())
1358 end_mark = self.reader.get_mark()
1359 while self.reader.column < indent and srp() == ' ':
1360 srf()
1361 return chunks, end_mark
1362
1363 def scan_flow_scalar(self, style):
1364 # type: (Any) -> Any
1365 # See the specification for details.
1366 # Note that we loose indentation rules for quoted scalars. Quoted
1367 # scalars don't need to adhere indentation because " and ' clearly
1368 # mark the beginning and the end of them. Therefore we are less
1369 # restrictive then the specification requires. We only need to check
1370 # that document separators are not included in scalars.
1371 if style == '"':
1372 double = True
1373 else:
1374 double = False
1375 srp = self.reader.peek
1376 chunks = [] # type: List[Any]
1377 start_mark = self.reader.get_mark()
1378 quote = srp()
1379 self.reader.forward()
1380 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1381 while srp() != quote:
1382 chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
1383 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1384 self.reader.forward()
1385 end_mark = self.reader.get_mark()
1386 return ScalarToken("".join(chunks), False, start_mark, end_mark, style)
1387
    # Mapping from the character following a backslash in a double-quoted
    # scalar to its single-character replacement (YAML escape set).
    ESCAPE_REPLACEMENTS = {
        '0': '\0',
        'a': '\x07',
        'b': '\x08',
        't': '\x09',
        '\t': '\x09',
        'n': '\x0A',
        'v': '\x0B',
        'f': '\x0C',
        'r': '\x0D',
        'e': '\x1B',
        ' ': '\x20',
        '"': '"',
        '/': '/',  # as per http://www.json.org/
        '\\': '\\',
        'N': '\x85',
        '_': '\xA0',
        'L': '\u2028',
        'P': '\u2029',
    }

    # Escape letters that introduce fixed-width hexadecimal escapes,
    # mapped to the number of hex digits that must follow (\xXX, \uXXXX,
    # \UXXXXXXXX).
    ESCAPE_CODES = {'x': 2, 'u': 4, 'U': 8}
1410
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # type: (Any, Any) -> Any
        """Scan a run of non-whitespace content inside a quoted scalar,
        decoding quote doubling and (for double quotes) escape sequences.

        Returns a list of decoded text chunks; stops (returning) when the
        next character is whitespace, a line break, or the closing quote.
        """
        chunks = []  # type: List[Any]
        srp = self.reader.peek
        srf = self.reader.forward
        while True:
            # take the longest run of ordinary characters in one slice
            length = 0
            while srp(length) not in ' \n\'"\\\0\t\r\x85\u2028\u2029':
                length += 1
            if length != 0:
                chunks.append(self.reader.prefix(length))
                srf(length)
            ch = srp()
            if not double and ch == "'" and srp(1) == "'":
                # '' inside a single-quoted scalar is an escaped quote
                chunks.append("'")
                srf(2)
            elif (double and ch == "'") or (not double and ch in '"\\'):
                # these characters are literal in the opposite quoting style
                chunks.append(ch)
                srf()
            elif double and ch == '\\':
                srf()
                ch = srp()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    srf()
                elif ch in self.ESCAPE_CODES:
                    # \xXX, \uXXXX or \UXXXXXXXX numeric escape
                    length = self.ESCAPE_CODES[ch]
                    srf()
                    for k in range(length):
                        if srp(k) not in '0123456789ABCDEFabcdef':
                            raise ScannerError(
                                'while scanning a double-quoted scalar',
                                start_mark,
                                'expected escape sequence of %d hexdecimal '
                                'numbers, but found %r' % (length, utf8(srp(k))),
                                self.reader.get_mark(),
                            )
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    srf(length)
                elif ch in '\n\r\x85\u2028\u2029':
                    # escaped line break: join the lines without adding a space
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError(
                        'while scanning a double-quoted scalar',
                        start_mark,
                        'found unknown escape character %r' % utf8(ch),
                        self.reader.get_mark(),
                    )
            else:
                return chunks
1464
    def scan_flow_scalar_spaces(self, double, start_mark):
        # type: (Any, Any) -> Any
        """Scan spaces/tabs (possibly followed by line breaks) inside a
        quoted scalar, applying flow folding.

        A single '\\n' break folds to one space; further blank lines are
        kept as breaks; non-'\\n' breaks are kept literally.
        """
        srp = self.reader.peek
        chunks = []
        length = 0
        while srp(length) in ' \t':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        if ch == '\0':
            # the stream may not end inside a quoted scalar
            raise ScannerError(
                'while scanning a quoted scalar',
                start_mark,
                'found unexpected end of stream',
                self.reader.get_mark(),
            )
        elif ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            breaks = self.scan_flow_scalar_breaks(double, start_mark)
            if line_break != '\n':
                # '\u2028'/'\u2029' breaks are preserved as-is
                chunks.append(line_break)
            elif not breaks:
                # a lone '\n' folds into a single space
                chunks.append(' ')
            chunks.extend(breaks)
        else:
            # no line break: the collected whitespace is literal content
            chunks.append(whitespaces)
        return chunks
1494
    def scan_flow_scalar_breaks(self, double, start_mark):
        # type: (Any, Any) -> Any
        """Consume consecutive blank lines inside a quoted scalar.

        Returns their normalized line breaks; raises if a document
        separator ('---'/'...') is found, since a scalar may not span
        documents.
        """
        chunks = []  # type: List[Any]
        srp = self.reader.peek
        srf = self.reader.forward
        while True:
            # Instead of checking indentation, we check for document
            # separators.
            prefix = self.reader.prefix(3)
            if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                raise ScannerError(
                    'while scanning a quoted scalar',
                    start_mark,
                    'found unexpected document separator',
                    self.reader.get_mark(),
                )
            # leading whitespace on a continuation line is discarded
            while srp() in ' \t':
                srf()
            if srp() in '\r\n\x85\u2028\u2029':
                chunks.append(self.scan_line_break())
            else:
                return chunks
1518
    def scan_plain(self):
        # type: () -> Any
        """Scan a plain (unquoted) scalar and return a ScalarToken.

        We add an additional restriction for the flow context:
        plain scalars in the flow context cannot contain ',', ': ' and '?'.
        We also keep track of the `allow_simple_key` flag here.
        Indentation rules are loosened for the flow context.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []  # type: List[Any]
        start_mark = self.reader.get_mark()
        end_mark = start_mark
        indent = self.indent + 1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        # if indent == 0:
        #     indent = 1
        spaces = []  # type: List[Any]
        while True:
            length = 0
            if srp() == '#':
                # a comment terminates the scalar
                break
            # find where the current run of plain characters ends
            while True:
                ch = srp(length)
                if ch == ':' and srp(length + 1) not in _THE_END_SPACE_TAB:
                    # ':' not followed by space/end is plain content
                    pass
                elif ch == '?' and self.scanner_processing_version != (1, 1):
                    # YAML 1.2: '?' is allowed in plain scalars
                    pass
                elif (
                    ch in _THE_END_SPACE_TAB
                    or (
                        not self.flow_level
                        and ch == ':'
                        and srp(length + 1) in _THE_END_SPACE_TAB
                    )
                    or (self.flow_level and ch in ',:?[]{}')
                ):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (
                self.flow_level
                and ch == ':'
                and srp(length + 1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'
            ):
                srf(length)
                raise ScannerError(
                    'while scanning a plain scalar',
                    start_mark,
                    "found unexpected ':'",
                    self.reader.get_mark(),
                    'Please check '
                    'http://pyyaml.org/wiki/YAMLColonInFlowContext '
                    'for details.',
                )
            if length == 0:
                break
            # a plain scalar can never serve as a simple key continuation
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.reader.prefix(length))
            srf(length)
            end_mark = self.reader.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            if (
                not spaces
                or srp() == '#'
                or (not self.flow_level and self.reader.column < indent)
            ):
                break

        token = ScalarToken("".join(chunks), True, start_mark, end_mark)
        if spaces and spaces[0] == '\n':
            # Create a comment token to preserve the trailing line breaks.
            comment = CommentToken("".join(spaces) + '\n', start_mark, end_mark)
            token.add_post_comment(comment)
        return token
1595
    def scan_plain_spaces(self, indent, start_mark):
        # type: (Any, Any) -> Any
        """Scan the whitespace/line breaks that may continue a plain scalar.

        The specification is really confusing about tabs in plain scalars.
        We just forbid them completely. Do not use tabs in YAML!

        Returns the folded whitespace chunks, or None (bare return) when a
        document separator follows — the scalar ends there.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []
        length = 0
        while srp(length) in ' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        if ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            # a line break makes a following simple key legal again
            self.allow_simple_key = True
            prefix = self.reader.prefix(3)
            if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                return
            breaks = []
            while srp() in ' \r\n\x85\u2028\u2029':
                if srp() == ' ':
                    srf()
                else:
                    breaks.append(self.scan_line_break())
                    prefix = self.reader.prefix(3)
                    if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                        return
            # fold: lone '\n' becomes a space, extra blank lines stay breaks
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
1633
    def scan_tag_handle(self, name, start_mark):
        # type: (Any, Any) -> Any
        """Scan a tag handle: '!', '!!' or '!word!'.

        For some strange reasons, the specification does not allow '_' in
        tag handles. I have allowed it anyway.
        """
        srp = self.reader.peek
        ch = srp()
        if ch != '!':
            raise ScannerError(
                'while scanning a %s' % (name,),
                start_mark,
                "expected '!', but found %r" % utf8(ch),
                self.reader.get_mark(),
            )
        length = 1
        ch = srp(length)
        if ch != ' ':
            # a named handle: scan the word characters, which must be
            # terminated by a second '!'
            while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_':
                length += 1
                ch = srp(length)
            if ch != '!':
                self.reader.forward(length)
                raise ScannerError(
                    'while scanning a %s' % (name,),
                    start_mark,
                    "expected '!', but found %r" % utf8(ch),
                    self.reader.get_mark(),
                )
            length += 1
        value = self.reader.prefix(length)
        self.reader.forward(length)
        return value
1666
    def scan_tag_uri(self, name, start_mark):
        # type: (Any, Any) -> Any
        """Scan a tag URI (or tag suffix), decoding %XX escapes.

        Note: we do not check if the URI is well-formed.
        """
        srp = self.reader.peek
        chunks = []
        length = 0
        ch = srp(length)
        while (
            '0' <= ch <= '9'
            or 'A' <= ch <= 'Z'
            or 'a' <= ch <= 'z'
            or ch in "-;/?:@&=+$,_.!~*'()[]%"
            # '#' is only a URI character when processing YAML 1.2 or later
            or ((self.scanner_processing_version > (1, 1)) and ch == '#')
        ):
            if ch == '%':
                # flush the literal part scanned so far, then decode the
                # %XX escape run in one go
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
                length = 0
                chunks.append(self.scan_uri_escapes(name, start_mark))
            else:
                length += 1
            ch = srp(length)
        if length != 0:
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            length = 0
        if not chunks:
            raise ScannerError(
                'while parsing a %s' % (name,),
                start_mark,
                'expected URI, but found %r' % utf8(ch),
                self.reader.get_mark(),
            )
        return "".join(chunks)
1702
    def scan_uri_escapes(self, name, start_mark):
        # type: (Any, Any) -> Any
        """Decode a run of %XX escapes and return the UTF-8 text they encode.

        Raises ScannerError on a malformed escape or invalid UTF-8.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        code_bytes = []  # type: List[Any]
        mark = self.reader.get_mark()
        while srp() == '%':
            srf()
            for k in range(2):
                if srp(k) not in '0123456789ABCDEFabcdef':
                    raise ScannerError(
                        'while scanning a %s' % (name,),
                        start_mark,
                        'expected URI escape sequence of 2 hexdecimal numbers,'
                        ' but found %r' % utf8(srp(k)),
                        self.reader.get_mark(),
                    )
            if PY3:
                # Python 3: collect ints, assemble with bytes() below
                code_bytes.append(int(self.reader.prefix(2), 16))
            else:
                # Python 2: collect one-character byte strings
                code_bytes.append(chr(int(self.reader.prefix(2), 16)))
            srf(2)
        try:
            if PY3:
                value = bytes(code_bytes).decode('utf-8')
            else:
                value = unicode(b"".join(code_bytes), 'utf-8')
        except UnicodeDecodeError as exc:
            raise ScannerError('while scanning a %s' % (name,), start_mark, str(exc), mark)
        return value
1734
1735 def scan_line_break(self):
1736 # type: () -> Any
1737 # Transforms:
1738 # '\r\n' : '\n'
1739 # '\r' : '\n'
1740 # '\n' : '\n'
1741 # '\x85' : '\n'
1742 # '\u2028' : '\u2028'
1743 # '\u2029 : '\u2029'
1744 # default : ''
1745 ch = self.reader.peek()
1746 if ch in '\r\n\x85':
1747 if self.reader.prefix(2) == '\r\n':
1748 self.reader.forward(2)
1749 else:
1750 self.reader.forward()
1751 return '\n'
1752 elif ch in '\u2028\u2029':
1753 self.reader.forward()
1754 return ch
1755 return ""
1756
1757
class RoundTripScanner(Scanner):
    """Scanner that additionally produces CommentToken objects and attaches
    comments to neighboring tokens, so documents can be round-tripped."""

    def check_token(self, *choices):
        # type: (Any) -> bool
        # Check if the next token is one of the given types.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if bool(self.tokens):
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek_token(self):
        # type: () -> Any
        # Return the next token, but do not delete it from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if bool(self.tokens):
            return self.tokens[0]
        return None

    def _gather_comments(self):
        # type: () -> Any
        """Combine multiple comment lines into pre-comments of the next
        real token; returns the collected CommentTokens."""
        comments = []  # type: List[Any]
        if not self.tokens:
            return comments
        if isinstance(self.tokens[0], CommentToken):
            comment = self.tokens.pop(0)
            self.tokens_taken += 1
            comments.append(comment)
        while self.need_more_tokens():
            self.fetch_more_tokens()
            if not self.tokens:
                return comments
            if isinstance(self.tokens[0], CommentToken):
                self.tokens_taken += 1
                comment = self.tokens.pop(0)
                # nprint('dropping2', comment)
                comments.append(comment)
        if len(comments) >= 1:
            # everything gathered becomes a pre-comment of the next token
            self.tokens[0].add_pre_comments(comments)
        # pull in post comment on e.g. ':'
        if not self.done and len(self.tokens) < 2:
            self.fetch_more_tokens()

    def get_token(self):
        # type: () -> Any
        # Return the next token, attaching any following comment tokens to
        # it as a post-comment when appropriate.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if bool(self.tokens):
            # nprint('tk', self.tokens)
            # only add post comment to single line tokens:
            # scalar, value token. FlowXEndToken, otherwise
            # hidden streamtokens could get them (leave them and they will be
            # pre comments for the next map/seq
            if (
                len(self.tokens) > 1
                and isinstance(
                    self.tokens[0],
                    (ScalarToken, ValueToken, FlowSequenceEndToken, FlowMappingEndToken),
                )
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line == self.tokens[1].start_mark.line
            ):
                # same-line comment: merge any further comment lines into it
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
                self.tokens[0].add_post_comment(c)
            elif (
                len(self.tokens) > 1
                and isinstance(self.tokens[0], ScalarToken)
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line != self.tokens[1].start_mark.line
            ):
                # comment on a later line: preserve the intervening blank
                # lines and the comment's column in its value
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                c.value = (
                    '\n' * (c.start_mark.line - self.tokens[0].end_mark.line)
                    + (' ' * c.start_mark.column)
                    + c.value
                )
                self.tokens[0].add_post_comment(c)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
            self.tokens_taken += 1
            return self.tokens.pop(0)
        return None

    def fetch_comment(self, comment):
        # type: (Any) -> None
        # Turn a (value, start_mark, end_mark) triple from
        # scan_to_next_token into a CommentToken on the queue.
        value, start_mark, end_mark = comment
        while value and value[-1] == ' ':
            # empty line within indented key context
            # no need to update end-mark, that is not used
            value = value[:-1]
        self.tokens.append(CommentToken(value, start_mark, end_mark))

    # scanner

    def scan_to_next_token(self):
        # type: () -> Any
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        #
        # Unlike the base class version, this returns the comment (with its
        # marks) instead of discarding it, so it can be turned into a token.

        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        while not found:
            while srp() == ' ':
                srf()
            ch = srp()
            if ch == '#':
                start_mark = self.reader.get_mark()
                comment = ch
                srf()
                while ch not in _THE_END:
                    ch = srp()
                    if ch == '\0':  # don't gobble the end-of-stream character
                        # but add an explicit newline as "YAML processors should terminate
                        # the stream with an explicit line break
                        # https://yaml.org/spec/1.2/spec.html#id2780069
                        comment += '\n'
                        break
                    comment += ch
                    srf()
                # gather any blank lines following the comment too
                ch = self.scan_line_break()
                while len(ch) > 0:
                    comment += ch
                    ch = self.scan_line_break()
                end_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                return comment, start_mark, end_mark
            if bool(self.scan_line_break()):
                start_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                ch = srp()
                if ch == '\n':  # empty toplevel lines
                    start_mark = self.reader.get_mark()
                    comment = ""
                    while ch:
                        ch = self.scan_line_break(empty_line=True)
                        comment += ch
                    if srp() == '#':
                        # empty line followed by indented real comment
                        comment = comment.rsplit('\n', 1)[0] + '\n'
                    end_mark = self.reader.get_mark()
                    return comment, start_mark, end_mark
            else:
                found = True
        return None

    def scan_line_break(self, empty_line=False):
        # type: (bool) -> Text
        # Transforms:
        #   '\r\n'      :   '\n'
        #   '\r'        :   '\n'
        #   '\n'        :   '\n'
        #   '\x85'      :   '\n'
        #   '\u2028'    :   '\u2028'
        #   '\u2029     :   '\u2029'
        #   default     :   ''
        # With empty_line=True, also consumes and returns a single tab or
        # space (used when gathering empty top-level lines).
        ch = self.reader.peek()  # type: Text
        if ch in '\r\n\x85':
            if self.reader.prefix(2) == '\r\n':
                self.reader.forward(2)
            else:
                self.reader.forward()
            return '\n'
        elif ch in '\u2028\u2029':
            self.reader.forward()
            return ch
        elif empty_line and ch in '\t ':
            self.reader.forward()
            return ch
        return ""

    def scan_block_scalar(self, style, rt=True):
        # type: (Any, Optional[bool]) -> Any
        # round-trip mode: delegate with rt=True so fold markers are kept
        return Scanner.scan_block_scalar(self, style, rt=rt)
1974
1975
1976 # try:
1977 # import psyco
1978 # psyco.bind(Scanner)
1979 # except ImportError:
1980 # pass