Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/yaml/scanner.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 | |
| 2 # Scanner produces tokens of the following types: | |
| 3 # STREAM-START | |
| 4 # STREAM-END | |
| 5 # DIRECTIVE(name, value) | |
| 6 # DOCUMENT-START | |
| 7 # DOCUMENT-END | |
| 8 # BLOCK-SEQUENCE-START | |
| 9 # BLOCK-MAPPING-START | |
| 10 # BLOCK-END | |
| 11 # FLOW-SEQUENCE-START | |
| 12 # FLOW-MAPPING-START | |
| 13 # FLOW-SEQUENCE-END | |
| 14 # FLOW-MAPPING-END | |
| 15 # BLOCK-ENTRY | |
| 16 # FLOW-ENTRY | |
| 17 # KEY | |
| 18 # VALUE | |
| 19 # ALIAS(value) | |
| 20 # ANCHOR(value) | |
| 21 # TAG(value) | |
| 22 # SCALAR(value, plain, style) | |
| 23 # | |
| 24 # Read comments in the Scanner code for more details. | |
| 25 # | |
| 26 | |
| 27 __all__ = ['Scanner', 'ScannerError'] | |
| 28 | |
| 29 from .error import MarkedYAMLError | |
| 30 from .tokens import * | |
| 31 | |
class ScannerError(MarkedYAMLError):
    """Error raised by the Scanner for malformed input.

    Inherits the context/problem mark reporting from MarkedYAMLError.
    """
    pass
| 34 | |
class SimpleKey:
    """Record of a position where a simple key candidate starts.

    See the simple-key treatment in Scanner: one such record may exist
    per flow level, and it is matched against a later ':' indicator or
    discarded by `stale_possible_simple_keys`.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        # Number (in emission order) of the token the key would become,
        # plus whether a ':' MUST follow for the input to be valid.
        self.token_number, self.required = token_number, required
        # Stream position where the candidate starts.
        self.index, self.line, self.column = index, line, column
        # Mark used for error reporting.
        self.mark = mark
| 45 | |
| 46 class Scanner: | |
| 47 | |
    def __init__(self):
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)    # peek the next i-th character
        #   self.prefix(l=1)  # peek the next l characters
        #   self.forward(l=1) # read the next l characters and move the pointer

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token. (Must happen before `tokens_taken`
        # bookkeeping below only because STREAM-START needs no simple-key
        # state; the token lands in `self.tokens`.)
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
| 110 | |
| 111 # Public methods. | |
| 112 | |
| 113 def check_token(self, *choices): | |
| 114 # Check if the next token is one of the given types. | |
| 115 while self.need_more_tokens(): | |
| 116 self.fetch_more_tokens() | |
| 117 if self.tokens: | |
| 118 if not choices: | |
| 119 return True | |
| 120 for choice in choices: | |
| 121 if isinstance(self.tokens[0], choice): | |
| 122 return True | |
| 123 return False | |
| 124 | |
| 125 def peek_token(self): | |
| 126 # Return the next token, but do not delete if from the queue. | |
| 127 # Return None if no more tokens. | |
| 128 while self.need_more_tokens(): | |
| 129 self.fetch_more_tokens() | |
| 130 if self.tokens: | |
| 131 return self.tokens[0] | |
| 132 else: | |
| 133 return None | |
| 134 | |
| 135 def get_token(self): | |
| 136 # Return the next token. | |
| 137 while self.need_more_tokens(): | |
| 138 self.fetch_more_tokens() | |
| 139 if self.tokens: | |
| 140 self.tokens_taken += 1 | |
| 141 return self.tokens.pop(0) | |
| 142 | |
| 143 # Private methods. | |
| 144 | |
    def need_more_tokens(self):
        # Decide whether another fetch pass is required before the next
        # token may be handed out. Returns False at end of stream, True
        # when the queue is empty or when the head token could still turn
        # into a simple KEY; otherwise falls through and returns None
        # (callers only use the result in boolean context).
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True
| 155 | |
    def fetch_more_tokens(self):
        # Scan one token from the character stream and append it (plus any
        # implicit BLOCK-END tokens produced by unwinding) to `self.tokens`.

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.column)

        # Peek the next character.
        ch = self.peek()

        # Is it the end of stream? (Reader appends NUL at the end.)
        if ch == '\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == '\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()

        # Is it a literal scalar? (Block scalars only exist in block context.)
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == '\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == '\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token" % ch,
                self.get_mark())
| 261 | |
| 262 # Simple keys treatment. | |
| 263 | |
| 264 def next_possible_simple_key(self): | |
| 265 # Return the number of the nearest possible simple key. Actually we | |
| 266 # don't need to loop through the whole dictionary. We may replace it | |
| 267 # with the following code: | |
| 268 # if not self.possible_simple_keys: | |
| 269 # return None | |
| 270 # return self.possible_simple_keys[ | |
| 271 # min(self.possible_simple_keys.keys())].token_number | |
| 272 min_token_number = None | |
| 273 for level in self.possible_simple_keys: | |
| 274 key = self.possible_simple_keys[level] | |
| 275 if min_token_number is None or key.token_number < min_token_number: | |
| 276 min_token_number = key.token_number | |
| 277 return min_token_number | |
| 278 | |
    def stale_possible_simple_keys(self):
        # Remove entries that are no longer possible simple keys. According to
        # the YAML specification, simple keys
        # - should be limited to a single line,
        # - should be no longer than 1024 characters.
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.line  \
                    or self.index-key.index > 1024:
                if key.required:
                    # A required candidate (block context, at the indent
                    # column) that never found its ':' is a hard error.
                    raise ScannerError("while scanning a simple key", key.mark,
                            "could not find expected ':'", self.get_mark())
                del self.possible_simple_keys[level]
| 294 | |
    def save_possible_simple_key(self):
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position: only
        # in block context, exactly at the current indentation column.
        required = not self.flow_level and self.indent == self.column

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            key = SimpleKey(token_number, required,
                    self.index, self.line, self.column, self.get_mark())
            self.possible_simple_keys[self.flow_level] = key
| 311 | |
| 312 def remove_possible_simple_key(self): | |
| 313 # Remove the saved possible key position at the current flow level. | |
| 314 if self.flow_level in self.possible_simple_keys: | |
| 315 key = self.possible_simple_keys[self.flow_level] | |
| 316 | |
| 317 if key.required: | |
| 318 raise ScannerError("while scanning a simple key", key.mark, | |
| 319 "could not find expected ':'", self.get_mark()) | |
| 320 | |
| 321 del self.possible_simple_keys[self.flow_level] | |
| 322 | |
| 323 # Indentation functions. | |
| 324 | |
    def unwind_indent(self, column):
        # Pop indentation levels deeper than `column`, emitting a BLOCK-END
        # token for each popped level. No-op in flow context.

        ## In flow context, tokens should respect indentation.
        ## Actually the condition should be `self.indent >= column` according to
        ## the spec. But this condition will prohibit intuitively correct
        ## constructions such as
        ## key : {
        ## }
        #if self.flow_level and self.indent > column:
        #    raise ScannerError(None, None,
        #            "invalid indentation or unclosed '[' or '{'",
        #            self.get_mark())

        # In the flow context, indentation is ignored. We make the scanner less
        # restrictive than the specification requires.
        if self.flow_level:
            return

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))
| 348 | |
| 349 def add_indent(self, column): | |
| 350 # Check if we need to increase indentation. | |
| 351 if self.indent < column: | |
| 352 self.indents.append(self.indent) | |
| 353 self.indent = column | |
| 354 return True | |
| 355 return False | |
| 356 | |
| 357 # Fetchers. | |
| 358 | |
    def fetch_stream_start(self):
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-START. `self.encoding` is supplied by the Reader
        # mixin (presumably the detected input encoding — see Reader).
        self.tokens.append(StreamStartToken(mark, mark,
                encoding=self.encoding))
| 369 | |
| 370 | |
    def fetch_stream_end(self):
        # Emit all pending BLOCK-END tokens, then STREAM-END, and mark the
        # scanner as finished.

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))

        # The stream is finished.
        self.done = True
| 389 | |
    def fetch_directive(self):
        # A '%' directive starts a fresh line at column 0, so close any
        # open block collections first.

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
| 401 | |
    def fetch_document_start(self):
        # '---' — delegate to the shared document-indicator handler.
        self.fetch_document_indicator(DocumentStartToken)
| 404 | |
    def fetch_document_end(self):
        # '...' — delegate to the shared document-indicator handler.
        self.fetch_document_indicator(DocumentEndToken)
| 407 | |
    def fetch_document_indicator(self, TokenClass):
        # Shared handler for '---' and '...': close open blocks, consume
        # the three indicator characters, append the token.

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.get_mark()
        self.forward(3)
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
| 423 | |
    def fetch_flow_sequence_start(self):
        # '[' — delegate to the shared flow-collection-start handler.
        self.fetch_flow_collection_start(FlowSequenceStartToken)
| 426 | |
    def fetch_flow_mapping_start(self):
        # '{' — delegate to the shared flow-collection-start handler.
        self.fetch_flow_collection_start(FlowMappingStartToken)
| 429 | |
    def fetch_flow_collection_start(self, TokenClass):
        # Shared handler for '[' and '{'.

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
| 446 | |
    def fetch_flow_sequence_end(self):
        # ']' — delegate to the shared flow-collection-end handler.
        self.fetch_flow_collection_end(FlowSequenceEndToken)
| 449 | |
    def fetch_flow_mapping_end(self):
        # '}' — delegate to the shared flow-collection-end handler.
        self.fetch_flow_collection_end(FlowMappingEndToken)
| 452 | |
    def fetch_flow_collection_end(self, TokenClass):
        # Shared handler for ']' and '}'.

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
| 469 | |
    def fetch_flow_entry(self):
        # ',' separator inside a flow collection.

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
| 483 | |
    def fetch_block_entry(self):
        # '-' block-sequence entry indicator.

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
| 516 | |
    def fetch_key(self):
        # '?' explicit (complex) key indicator.

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
| 544 | |
    def fetch_value(self):
        # ':' value indicator. If a simple-key candidate is pending at this
        # flow level, retroactively insert a KEY token (and possibly
        # BLOCK-MAPPING-START) at the candidate's recorded position.

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY. `token_number - tokens_taken` converts the absolute
            # token number into an index in the pending queue.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.get_mark())

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.column):
                    mark = self.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
| 600 | |
    def fetch_alias(self):
        # '*' alias indicator.

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))
| 611 | |
    def fetch_anchor(self):
        # '&' anchor indicator.

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))
| 622 | |
    def fetch_tag(self):
        # '!' tag indicator.

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
| 633 | |
    def fetch_literal(self):
        # '|' — literal block scalar.
        self.fetch_block_scalar(style='|')
| 636 | |
    def fetch_folded(self):
        # '>' — folded block scalar.
        self.fetch_block_scalar(style='>')
| 639 | |
    def fetch_block_scalar(self, style):
        # Shared handler for '|' and '>' block scalars.

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))
| 650 | |
    def fetch_single(self):
        # "'" — single-quoted flow scalar.
        self.fetch_flow_scalar(style='\'')
| 653 | |
    def fetch_double(self):
        # '"' — double-quoted flow scalar.
        self.fetch_flow_scalar(style='"')
| 656 | |
    def fetch_flow_scalar(self, style):
        # Shared handler for single- and double-quoted scalars.

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))
| 667 | |
    def fetch_plain(self):
        # Unquoted (plain) scalar.

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
| 680 | |
| 681 # Checkers. | |
| 682 | |
| 683 def check_directive(self): | |
| 684 | |
| 685 # DIRECTIVE: ^ '%' ... | |
| 686 # The '%' indicator is already checked. | |
| 687 if self.column == 0: | |
| 688 return True | |
| 689 | |
| 690 def check_document_start(self): | |
| 691 | |
| 692 # DOCUMENT-START: ^ '---' (' '|'\n') | |
| 693 if self.column == 0: | |
| 694 if self.prefix(3) == '---' \ | |
| 695 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': | |
| 696 return True | |
| 697 | |
| 698 def check_document_end(self): | |
| 699 | |
| 700 # DOCUMENT-END: ^ '...' (' '|'\n') | |
| 701 if self.column == 0: | |
| 702 if self.prefix(3) == '...' \ | |
| 703 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': | |
| 704 return True | |
| 705 | |
    def check_block_entry(self):
        # BLOCK-ENTRY: '-' (' '|'\n')
        # The '-' itself was already seen by the caller; it only counts as
        # a block entry when followed by whitespace, a break, or EOF.
        return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
| 710 | |
| 711 def check_key(self): | |
| 712 | |
| 713 # KEY(flow context): '?' | |
| 714 if self.flow_level: | |
| 715 return True | |
| 716 | |
| 717 # KEY(block context): '?' (' '|'\n') | |
| 718 else: | |
| 719 return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029' | |
| 720 | |
| 721 def check_value(self): | |
| 722 | |
| 723 # VALUE(flow context): ':' | |
| 724 if self.flow_level: | |
| 725 return True | |
| 726 | |
| 727 # VALUE(block context): ':' (' '|'\n') | |
| 728 else: | |
| 729 return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029' | |
| 730 | |
    def check_plain(self):
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.peek()
        return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
                        and (ch == '-' or (not self.flow_level and ch in '?:')))
| 749 | |
| 750 # Scanners. | |
| 751 | |
    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        # Strip a leading BOM only at the very start of the stream.
        if self.index == 0 and self.peek() == '\uFEFF':
            self.forward()
        found = False
        while not found:
            # Skip spaces, then a trailing '#' comment, then a line break;
            # stop once the cursor rests on a non-space, non-break character.
            while self.peek() == ' ':
                self.forward()
            if self.peek() == '#':
                while self.peek() not in '\0\r\n\x85\u2028\u2029':
                    self.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
| 786 | |
    def scan_directive(self):
        # Scan a '%NAME value' directive line into a DirectiveToken.
        # See the specification for details.
        start_mark = self.get_mark()
        self.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == 'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.get_mark()
        elif name == 'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.get_mark()
        else:
            # Unknown directives are tolerated: skip the rest of the line,
            # keeping value=None.
            end_mark = self.get_mark()
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
| 805 | |
    def scan_directive_name(self, start_mark):
        # Scan the alphanumeric ('-'/'_' allowed) directive name after '%'.
        # See the specification for details.
        length = 0
        ch = self.peek(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'  \
                or ch in '-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        # The name must be terminated by whitespace, a break, or EOF.
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        return value
| 826 | |
    def scan_yaml_directive_value(self, start_mark):
        # Scan the 'major.minor' version pair of a %YAML directive.
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        major = self.scan_yaml_directive_number(start_mark)
        if self.peek() != '.':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or '.', but found %r" % self.peek(),
                    self.get_mark())
        self.forward()
        minor = self.scan_yaml_directive_number(start_mark)
        if self.peek() not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or ' ', but found %r" % self.peek(),
                    self.get_mark())
        return (major, minor)
| 843 | |
    def scan_yaml_directive_number(self, start_mark):
        # Scan one non-empty run of ASCII digits and return it as an int.
        # See the specification for details.
        ch = self.peek()
        if not ('0' <= ch <= '9'):
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit, but found %r" % ch, self.get_mark())
        length = 0
        while '0' <= self.peek(length) <= '9':
            length += 1
        value = int(self.prefix(length))
        self.forward(length)
        return value
| 856 | |
    def scan_tag_directive_value(self, start_mark):
        # Scan the 'handle prefix' pair of a %TAG directive.
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        handle = self.scan_tag_directive_handle(start_mark)
        while self.peek() == ' ':
            self.forward()
        prefix = self.scan_tag_directive_prefix(start_mark)
        return (handle, prefix)
| 866 | |
    def scan_tag_directive_handle(self, start_mark):
        # Scan the tag handle of a %TAG directive; it must be followed by
        # a space (the prefix comes next on the same line).
        # See the specification for details.
        value = self.scan_tag_handle('directive', start_mark)
        ch = self.peek()
        if ch != ' ':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        return value
| 875 | |
def scan_tag_directive_prefix(self, start_mark):
    """Scan the prefix (URI) part of a %TAG directive.

    The prefix must be terminated by a space, a line break, or the
    end of the stream.
    """
    value = self.scan_tag_uri('directive', start_mark)
    terminator = self.peek()
    if terminator not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % terminator, self.get_mark())
    return value
| 884 | |
def scan_directive_ignored_line(self, start_mark):
    """Skip trailing spaces and an optional comment after a directive.

    The terminating line break is consumed as well; any other
    character raises ScannerError.
    """
    while self.peek() == ' ':
        self.forward()
    if self.peek() == '#':
        # A comment runs to the end of the line.
        while self.peek() not in '\0\r\n\x85\u2028\u2029':
            self.forward()
    ch = self.peek()
    if ch not in '\0\r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a comment or a line break, but found %r"
                % ch, self.get_mark())
    self.scan_line_break()
| 898 | |
def scan_anchor(self, TokenClass):
    """Scan an anchor (&name) or alias (*name) and wrap it in TokenClass.

    The specification does not restrict anchor/alias characters, which
    would make documents like '[ *alias, value ]' ambiguous, so names
    are restricted here to ASCII alphanumerics plus '-' and '_'.
    """
    start_mark = self.get_mark()
    name = 'alias' if self.peek() == '*' else 'anchor'
    self.forward()  # step over the '*' or '&' indicator
    length = 0
    while True:
        ch = self.peek(length)
        if not ('0' <= ch <= '9' or 'A' <= ch <= 'Z'
                or 'a' <= ch <= 'z' or ch in '-_'):
            break
        length += 1
    if not length:
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    value = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    # The name must be followed by a blank, a break, or a flow/scalar
    # terminator character.
    if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    return TokenClass(value, start_mark, self.get_mark())
| 934 | |
def scan_tag(self):
    """Scan a tag token: verbatim !<uri>, a bare '!', or handle+suffix.

    Returns a TagToken whose value is a (handle, suffix) pair; the
    handle is None for verbatim and non-specific tags.
    """
    start_mark = self.get_mark()
    ch = self.peek(1)
    if ch == '<':
        # Verbatim tag: !<tag:example.com,2000:app/foo>
        handle = None
        self.forward(2)
        suffix = self.scan_tag_uri('tag', start_mark)
        if self.peek() != '>':
            raise ScannerError("while parsing a tag", start_mark,
                    "expected '>', but found %r" % self.peek(),
                    self.get_mark())
        self.forward()
    elif ch in '\0 \t\r\n\x85\u2028\u2029':
        # A lone '!' is the non-specific tag.
        handle = None
        suffix = '!'
        self.forward()
    else:
        # Look ahead for a second '!' to decide between a named handle
        # ('!name!suffix') and the primary handle ('!suffix').
        length = 1
        use_handle = False
        while ch not in '\0 \r\n\x85\u2028\u2029':
            if ch == '!':
                use_handle = True
                break
            length += 1
            ch = self.peek(length)
        # Bug fix: the original assigned handle = '!' here and then
        # immediately overwrote it on both branches below (dead code).
        if use_handle:
            handle = self.scan_tag_handle('tag', start_mark)
        else:
            handle = '!'
            self.forward()
        suffix = self.scan_tag_uri('tag', start_mark)
    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a tag", start_mark,
                "expected ' ', but found %r" % ch, self.get_mark())
    value = (handle, suffix)
    end_mark = self.get_mark()
    return TagToken(value, start_mark, end_mark)
| 975 | |
def scan_block_scalar(self, style):
    # Scan a literal ('|') or folded ('>') block scalar and return it
    # as a non-plain ScalarToken.  See the specification for details.

    # '>' folds single line breaks into spaces; '|' keeps them.
    if style == '>':
        folded = True
    else:
        folded = False

    chunks = []
    start_mark = self.get_mark()

    # Scan the header: the indicator character itself, then optional
    # chomping/indentation indicators and an optional comment line.
    self.forward()
    chomping, increment = self.scan_block_scalar_indicators(start_mark)
    self.scan_block_scalar_ignored_line(start_mark)

    # Determine the indentation level and go to the first non-empty line.
    min_indent = self.indent+1
    if min_indent < 1:
        min_indent = 1
    if increment is None:
        # No explicit indentation indicator: auto-detect it from the
        # first non-empty line.
        breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
        indent = max(min_indent, max_indent)
    else:
        indent = min_indent+increment-1
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
    line_break = ''

    # Scan the inner part of the block scalar.
    while self.column == indent and self.peek() != '\0':
        chunks.extend(breaks)
        # A line starting with a space or tab is "more indented" and
        # must not be folded.
        leading_non_space = self.peek() not in ' \t'
        length = 0
        while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
            length += 1
        chunks.append(self.prefix(length))
        self.forward(length)
        line_break = self.scan_line_break()
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
        if self.column == indent and self.peek() != '\0':

            # Unfortunately, folding rules are ambiguous.
            #
            # This is the folding according to the specification:

            if folded and line_break == '\n' \
                    and leading_non_space and self.peek() not in ' \t':
                # Fold a single break between two plain lines into a
                # space; a break followed by blank lines is dropped.
                if not breaks:
                    chunks.append(' ')
            else:
                chunks.append(line_break)

            # This is Clark Evans's interpretation (also in the spec
            # examples):
            #
            #if folded and line_break == '\n':
            #    if not breaks:
            #        if self.peek() not in ' \t':
            #            chunks.append(' ')
            #        else:
            #            chunks.append(line_break)
            #else:
            #    chunks.append(line_break)
        else:
            break

    # Chomp the tail: keep the final break unless chomping is '-' ;
    # keep trailing blank lines only when chomping is '+'.
    if chomping is not False:
        chunks.append(line_break)
    if chomping is True:
        chunks.extend(breaks)

    # We are done.
    return ScalarToken(''.join(chunks), False, start_mark, end_mark,
            style)
| 1051 | |
def scan_block_scalar_indicators(self, start_mark):
    """Scan optional chomping ('+'/'-') and indentation (1-9) indicators.

    They may follow '|' or '>' in either order.  Returns a
    (chomping, increment) pair; either element may be None when the
    corresponding indicator is absent.
    """
    chomping = None
    increment = None
    ch = self.peek()
    if ch in '+-':
        chomping = (ch == '+')
        self.forward()
        ch = self.peek()
        if ch in '0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_mark,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.get_mark())
            self.forward()
    elif ch in '0123456789':
        increment = int(ch)
        if increment == 0:
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected indentation indicator in the range 1-9, but found 0",
                    self.get_mark())
        self.forward()
        ch = self.peek()
        if ch in '+-':
            chomping = (ch == '+')
            self.forward()
    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected chomping or indentation indicators, but found %r"
                % ch, self.get_mark())
    return chomping, increment
| 1091 | |
def scan_block_scalar_ignored_line(self, start_mark):
    """Skip spaces and an optional comment after a block scalar header.

    The terminating line break is consumed; anything else raises
    ScannerError.
    """
    while self.peek() == ' ':
        self.forward()
    if self.peek() == '#':
        # A comment runs to the end of the line.
        while self.peek() not in '\0\r\n\x85\u2028\u2029':
            self.forward()
    ch = self.peek()
    if ch not in '\0\r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected a comment or a line break, but found %r" % ch,
                self.get_mark())
    self.scan_line_break()
| 1105 | |
def scan_block_scalar_indentation(self):
    """Skip leading blank lines, tracking the deepest column reached.

    Returns (breaks, max_indent, end_mark): the collected line breaks,
    the maximum indentation seen on any line, and the mark after the
    last break.
    """
    breaks = []
    max_indent = 0
    end_mark = self.get_mark()
    while True:
        ch = self.peek()
        if ch == ' ':
            self.forward()
            max_indent = max(max_indent, self.column)
        elif ch in '\r\n\x85\u2028\u2029':
            breaks.append(self.scan_line_break())
            end_mark = self.get_mark()
        else:
            break
    return breaks, max_indent, end_mark
| 1120 | |
def scan_block_scalar_breaks(self, indent):
    """Consume blank lines, skipping up to `indent` leading spaces each.

    Returns (breaks, end_mark).
    """
    def skip_indent():
        # Eat at most `indent` columns of leading spaces.
        while self.column < indent and self.peek() == ' ':
            self.forward()
    chunks = []
    end_mark = self.get_mark()
    skip_indent()
    while self.peek() in '\r\n\x85\u2028\u2029':
        chunks.append(self.scan_line_break())
        end_mark = self.get_mark()
        skip_indent()
    return chunks, end_mark
| 1133 | |
def scan_flow_scalar(self, style):
    """Scan a single- or double-quoted scalar into a ScalarToken.

    Quoted scalars are exempt from indentation rules because the
    quotes delimit them unambiguously; only document separators are
    rejected (inside the helper methods).
    """
    double = (style == '"')
    start_mark = self.get_mark()
    quote = self.peek()
    self.forward()
    pieces = self.scan_flow_scalar_non_spaces(double, start_mark)
    while self.peek() != quote:
        pieces.extend(self.scan_flow_scalar_spaces(double, start_mark))
        pieces.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
    self.forward()  # step over the closing quote
    return ScalarToken(''.join(pieces), False, start_mark,
            self.get_mark(), style)
| 1157 | |
# Map of single-character escape sequences recognized in double-quoted
# scalars to the characters they denote.
ESCAPE_REPLACEMENTS = {
    '0': '\0',
    'a': '\x07',
    'b': '\x08',
    't': '\x09',
    '\t': '\x09',
    'n': '\x0A',
    'v': '\x0B',
    'f': '\x0C',
    'r': '\x0D',
    'e': '\x1B',
    ' ': '\x20',
    '\"': '\"',
    '\\': '\\',
    '/': '/',
    'N': '\x85',
    '_': '\xA0',
    'L': '\u2028',
    'P': '\u2029',
}

# Escapes that introduce a fixed-width hexadecimal character code
# (\xXX, \uXXXX, \UXXXXXXXX); the value is the number of hex digits.
ESCAPE_CODES = {
    'x': 2,
    'u': 4,
    'U': 8,
}
| 1184 | |
def scan_flow_scalar_non_spaces(self, double, start_mark):
    # Scan a run of non-blank characters inside a quoted scalar,
    # resolving quote doubling ('') and, for double quotes, backslash
    # escapes.  Returns the collected text chunks; stops (and returns)
    # at a blank, a line break, or the closing quote.
    chunks = []
    while True:
        # Grab the longest run of ordinary characters in one slice.
        length = 0
        while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
            length += 1
        if length:
            chunks.append(self.prefix(length))
            self.forward(length)
        ch = self.peek()
        if not double and ch == '\'' and self.peek(1) == '\'':
            # In single quotes, '' is an escaped single quote.
            chunks.append('\'')
            self.forward(2)
        elif (double and ch == '\'') or (not double and ch in '\"\\'):
            # A quote/backslash with no special meaning in this style.
            chunks.append(ch)
            self.forward()
        elif double and ch == '\\':
            # Backslash escapes are recognized only in double quotes.
            self.forward()
            ch = self.peek()
            if ch in self.ESCAPE_REPLACEMENTS:
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                self.forward()
            elif ch in self.ESCAPE_CODES:
                # \xXX, \uXXXX or \UXXXXXXXX: fixed-width hex code.
                length = self.ESCAPE_CODES[ch]
                self.forward()
                for k in range(length):
                    if self.peek(k) not in '0123456789ABCDEFabcdef':
                        raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                (length, self.peek(k)), self.get_mark())
                code = int(self.prefix(length), 16)
                chunks.append(chr(code))
                self.forward(length)
            elif ch in '\r\n\x85\u2028\u2029':
                # An escaped line break is removed together with any
                # following blank lines.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError("while scanning a double-quoted scalar", start_mark,
                        "found unknown escape character %r" % ch, self.get_mark())
        else:
            # A blank, a break, or the closing quote: caller's turn.
            return chunks
| 1227 | |
def scan_flow_scalar_spaces(self, double, start_mark):
    # Scan blanks and line breaks between non-blank runs of a quoted
    # scalar, applying the flow folding rules.
    chunks = []
    length = 0
    while self.peek(length) in ' \t':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch == '\0':
        # The stream ended before the closing quote.
        raise ScannerError("while scanning a quoted scalar", start_mark,
                "found unexpected end of stream", self.get_mark())
    elif ch in '\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        breaks = self.scan_flow_scalar_breaks(double, start_mark)
        if line_break != '\n':
            chunks.append(line_break)
        elif not breaks:
            # A single '\n' folds into a space; the blanks before it
            # are dropped.
            chunks.append(' ')
        chunks.extend(breaks)
    else:
        # No break follows: the blanks are significant.
        chunks.append(whitespaces)
    return chunks
| 1251 | |
def scan_flow_scalar_breaks(self, double, start_mark):
    """Consume blank lines inside a quoted scalar, returning the breaks.

    A bare document separator ('---' or '...') on its own line raises
    ScannerError, since the scalar would otherwise swallow it.
    """
    chunks = []
    while True:
        # Instead of checking indentation, we only refuse document
        # separators inside the scalar.
        head = self.prefix(3)
        if head in ('---', '...') \
                and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected document separator", self.get_mark())
        while self.peek() in ' \t':
            self.forward()
        if self.peek() not in '\r\n\x85\u2028\u2029':
            return chunks
        chunks.append(self.scan_line_break())
| 1269 | |
def scan_plain(self):
    # Scan a plain (unquoted) scalar and return it as a ScalarToken.
    # We add an additional restriction for the flow context:
    #   plain scalars in the flow context cannot contain ',' or '?'.
    # We also keep track of the `allow_simple_key` flag here.
    # Indentation rules are loosed for the flow context.
    chunks = []
    start_mark = self.get_mark()
    end_mark = start_mark
    indent = self.indent+1
    # We allow zero indentation for scalars, but then we need to check for
    # document separators at the beginning of the line.
    #if indent == 0:
    #    indent = 1
    spaces = []
    while True:
        length = 0
        if self.peek() == '#':
            # A comment terminates the scalar.
            break
        while True:
            ch = self.peek(length)
            # The scalar stops before a blank/break, before ':' that is
            # followed by a blank (or, in flow context, a flow
            # indicator), and before any flow indicator in flow context.
            if ch in '\0 \t\r\n\x85\u2028\u2029' \
                    or (ch == ':' and
                        self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029'
                              + (u',[]{}' if self.flow_level else u''))\
                    or (self.flow_level and ch in ',?[]{}'):
                break
            length += 1
        if length == 0:
            break
        # Once scalar content is consumed, no simple key may start here.
        self.allow_simple_key = False
        chunks.extend(spaces)
        chunks.append(self.prefix(length))
        self.forward(length)
        end_mark = self.get_mark()
        spaces = self.scan_plain_spaces(indent, start_mark)
        # Stop at end of spaces scan (document separator -> None), at a
        # comment, or when a block-context line is under-indented.
        if not spaces or self.peek() == '#' \
                or (not self.flow_level and self.column < indent):
            break
    return ScalarToken(''.join(chunks), True, start_mark, end_mark)
| 1310 | |
def scan_plain_spaces(self, indent, start_mark):
    # Scan spaces and line breaks following a plain scalar line and
    # fold them.  Returns None (bare `return`) when a document
    # separator is found, which tells scan_plain to stop.
    # The specification is really confusing about tabs in plain scalars.
    # We just forbid them completely. Do not use tabs in YAML!
    chunks = []
    length = 0
    while self.peek(length) in ' ':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch in '\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        # After a line break, a simple key may start again.
        self.allow_simple_key = True
        prefix = self.prefix(3)
        if (prefix == '---' or prefix == '...') \
                and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
            return
        breaks = []
        while self.peek() in ' \r\n\x85\u2028\u2029':
            if self.peek() == ' ':
                self.forward()
            else:
                breaks.append(self.scan_line_break())
                # Re-check for a document separator after every break.
                prefix = self.prefix(3)
                if (prefix == '---' or prefix == '...') \
                        and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                    return
        # Folding: a single '\n' becomes a space; other breaks and
        # trailing blank lines are kept as-is.
        if line_break != '\n':
            chunks.append(line_break)
        elif not breaks:
            chunks.append(' ')
        chunks.extend(breaks)
    elif whitespaces:
        chunks.append(whitespaces)
    return chunks
| 1347 | |
def scan_tag_handle(self, name, start_mark):
    """Scan a tag handle: '!', or '!word!' with word of alnum/'-'/'_'.

    For some strange reasons, the specification does not allow '_' in
    tag handles; it is allowed here anyway.
    """
    if self.peek() != '!':
        raise ScannerError("while scanning a %s" % name, start_mark,
                "expected '!', but found %r" % self.peek(), self.get_mark())
    length = 1
    ch = self.peek(length)
    if ch != ' ':
        # A named handle: consume the word and require a closing '!'.
        while ('0' <= ch <= '9' or 'A' <= ch <= 'Z'
                or 'a' <= ch <= 'z' or ch in '-_'):
            length += 1
            ch = self.peek(length)
        if ch != '!':
            self.forward(length)
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch, self.get_mark())
        length += 1
    value = self.prefix(length)
    self.forward(length)
    return value
| 1371 | |
def scan_tag_uri(self, name, start_mark):
    """Scan a tag URI (a handle suffix or a verbatim tag body).

    %-escape runs are decoded via scan_uri_escapes; otherwise the URI
    is not validated for well-formedness.
    """
    parts = []
    run = 0
    ch = self.peek(run)
    while ('0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
            or ch in '-;/?:@&=+$,_.!~*\'()[]%'):
        if ch == '%':
            # Flush the literal run, then decode the escape run.
            parts.append(self.prefix(run))
            self.forward(run)
            run = 0
            parts.append(self.scan_uri_escapes(name, start_mark))
        else:
            run += 1
        ch = self.peek(run)
    if run:
        parts.append(self.prefix(run))
        self.forward(run)
    if not parts:
        raise ScannerError("while parsing a %s" % name, start_mark,
                "expected URI, but found %r" % ch, self.get_mark())
    return ''.join(parts)
| 1396 | |
def scan_uri_escapes(self, name, start_mark):
    """Decode a run of %XX escape sequences as UTF-8 text."""
    mark = self.get_mark()
    raw = bytearray()
    while self.peek() == '%':
        self.forward()
        # Each escape must supply exactly two hex digits.
        for k in range(2):
            if self.peek(k) not in '0123456789ABCDEFabcdef':
                raise ScannerError("while scanning a %s" % name, start_mark,
                        "expected URI escape sequence of 2 hexdecimal numbers, but found %r"
                        % self.peek(k), self.get_mark())
        raw.append(int(self.prefix(2), 16))
        self.forward(2)
    try:
        return bytes(raw).decode('utf-8')
    except UnicodeDecodeError as exc:
        raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
| 1415 | |
def scan_line_break(self):
    """Consume one line break and return its normalized form.

    '\\r\\n', '\\r', '\\n' and '\\x85' all normalize to '\\n';
    '\\u2028' and '\\u2029' are returned unchanged; any other
    character consumes nothing and yields ''.
    """
    ch = self.peek()
    if ch in '\u2028\u2029':
        self.forward()
        return ch
    if ch in '\r\n\x85':
        # '\r\n' is a single break and is consumed as a pair.
        self.forward(2 if self.prefix(2) == '\r\n' else 1)
        return '\n'
    return ''
