Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/docutils/parsers/rst/tableparser.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 # $Id: tableparser.py 8373 2019-08-27 12:11:30Z milde $ | |
| 2 # Author: David Goodger <goodger@python.org> | |
| 3 # Copyright: This module has been placed in the public domain. | |
| 4 | |
| 5 """ | |
| 6 This module defines table parser classes, which parse plaintext-graphic tables | |
| 7 and produce a well-formed data structure suitable for building a CALS table. | |
| 8 | |
| 9 :Classes: | |
| 10 - `GridTableParser`: Parse fully-formed tables represented with a grid. | |
| 11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom | |
| 12 borders. | |
| 13 | |
| 14 :Exception class: `TableMarkupError` | |
| 15 | |
| 16 :Function: | |
| 17 `update_dict_of_lists()`: Merge two dictionaries containing list values. | |
| 18 """ | |
| 19 | |
| 20 __docformat__ = 'reStructuredText' | |
| 21 | |
| 22 | |
| 23 import re | |
| 24 import sys | |
| 25 from docutils import DataError | |
| 26 from docutils.utils import strip_combining_chars | |
| 27 | |
| 28 | |
class TableMarkupError(DataError):

    """
    Signal a problem with the markup of a table.

    An optional keyword argument `offset` tells how many lines below the
    table's start line the problem was found.  It is stored on the instance
    as ``offset`` and defaults to 0.  Positional arguments are handed to
    `DataError` unchanged; any other keyword argument is ignored.
    """

    def __init__(self, *args, **kwargs):
        # `kwargs` is private to this call, so reading the value is
        # equivalent to popping it.
        self.offset = kwargs.get('offset', 0)
        DataError.__init__(self, *args)
| 41 | |
| 42 | |
class TableParser(object):

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    `parse()` drives the work and relies on the methods `setup()`,
    `parse_table()` and `structure_from_cells()`, none of which is defined
    here, as well as on a compiled `head_body_separator_pat`.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        The index is stored in `self.head_body_sep` (left untouched when no
        separator is found) and the separator line itself is rewritten in
        `self.block` with '-' in place of '=', so that later stages can treat
        it like an ordinary row separator.

        Raise `TableMarkupError` if more than one separator is present, or
        if the separator is the first or the last line of the table.  The
        error's `offset` is the index of the offending line.
        """
        for i, line in enumerate(self.block):
            if not self.head_body_separator_pat.match(line):
                continue
            # Compare against None: index 0 is a legitimate "already found"
            # value and must not be mistaken for "no separator yet".
            if self.head_body_sep is not None:
                raise TableMarkupError(
                    'Multiple head/body row separators '
                    '(table lines %s and %s); only one allowed.'
                    % (self.head_body_sep+1, i+1), offset=i)
            self.head_body_sep = i
            self.block[i] = line.replace('=', '-')
        if (self.head_body_sep == 0
            or self.head_body_sep == len(self.block) - 1):
            # Report the separator's own line, not the last line scanned.
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.',
                                   offset=self.head_body_sep)
| 89 | |
| 90 | |
class GridTableParser(TableParser):

    """
    Parser for grid tables; use it through `parse()`.

    A grid table looks like this::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    '+' marks an intersection, '-' draws a row separator (the single,
    optional head/body row separator is drawn with '=' instead), and '|'
    draws a column separator.

    Feeding the table above to `parse()` yields this data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    Item one lists the column widths (colspecs), item two the head rows and
    item three the body rows.  A row is a list of cells.  A cell is None
    when its position is covered by the span of another cell; otherwise it
    is a 4-tuple: extra rows taken up by a vertical span (morerows), extra
    columns taken up by a horizontal span (morecols), line offset of the
    first line of the cell's contents, and the contents themselves as a list
    of text lines.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block):
        """Copy `block` and reset all per-table parsing state."""
        # Work on a detached copy: the lines may be altered while parsing
        # and those changes must not reach the parent.
        self.block = block[:]
        self.block.disconnect()
        height = len(block)
        width = len(block[0])
        self.bottom = height - 1
        self.right = width - 1
        self.head_body_sep = None
        self.done = [-1] * width
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Trace out every cell of the table.

        A queue of candidate upper-left corners is seeded with the table's
        own upper-left corner.  For each candidate one rectangular cell is
        traced and recorded, and the cell's upper-right and lower-left
        corners are queued as further candidates.  Candidates are handled in
        top-to-bottom order while `self.done` records how far down each text
        column has been covered.

        Afterwards all row and column boundaries, cell positions and cell
        dimensions are known.  Raise `TableMarkupError` if some text column
        was not covered completely.
        """
        pending = [(0, 0)]
        while pending:
            top, left = pending.pop(0)
            on_outer_edge = top == self.bottom or left == self.right
            if on_outer_edge or top <= self.done[left]:
                continue
            traced = self.scan_cell(top, left)
            if traced:
                bottom, right, rowseps, colseps = traced
                update_dict_of_lists(self.rowseps, rowseps)
                update_dict_of_lists(self.colseps, colseps)
                self.mark_done(top, left, bottom, right)
                cellblock = self.block.get_2D_block(top + 1, left + 1,
                                                    bottom, right)
                # The cell's lines must not stay synchronised with the
                # parent block.
                cellblock.disconnect()
                cellblock.replace(self.double_width_pad_char, '')
                self.cells.append((top, left, bottom, right, cellblock))
                pending += [(top, right), (bottom, left)]
                pending.sort()
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right):
        """Record that text columns `left`..`right`-1 are covered down to
        line `bottom`-1; each must so far be covered down to `top`-1."""
        expected = top - 1
        for col in range(left, right):
            assert self.done[col] == expected
            self.done[col] = bottom - 1

    def check_parse_complete(self):
        """Return True if every text column was covered down to the line
        just above the bottom border."""
        last = self.bottom - 1
        return all(self.done[col] == last for col in range(self.right))

    def scan_cell(self, top, left):
        """Trace a cell whose top-left corner is at (`top`, `left`).

        Return whatever `scan_right()` returns."""
        assert self.block[top][left] == '+'
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Walk right along line `top` in search of the cell's top-right
        corner, collecting every column boundary ('+') passed on the way.

        Return ``(bottom, right, rowseps, colseps)`` for the first '+' from
        which the rest of the cell can be traced, or None.
        """
        colseps = {}
        border = self.block[top]
        for right in range(left + 1, self.right + 1):
            char = border[right]
            if char == '-':
                continue
            if char != '+':
                return None
            colseps[right] = [top]
            traced = self.scan_down(top, left, right)
            if traced:
                bottom, rowseps, newcolseps = traced
                update_dict_of_lists(colseps, newcolseps)
                return bottom, right, rowseps, colseps
        return None

    def scan_down(self, top, left, right):
        """
        Walk down text column `right` in search of the cell's bottom-right
        corner, collecting every row boundary ('+') passed on the way.

        Return ``(bottom, rowseps, colseps)`` for the first '+' from which
        the rest of the cell can be traced, or None.
        """
        rowseps = {}
        for bottom in range(top + 1, self.bottom + 1):
            char = self.block[bottom][right]
            if char == '|':
                continue
            if char != '+':
                return None
            rowseps[bottom] = [right]
            traced = self.scan_left(top, left, bottom, right)
            if traced:
                newrowseps, colseps = traced
                update_dict_of_lists(rowseps, newrowseps)
                return bottom, rowseps, colseps
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Walk left along line `bottom`, collecting column boundaries, towards
        the cell's bottom-left corner, which has to sit in text column
        `left`, directly below the starting point.

        Return ``(rowseps, colseps)``, or None if the border is broken.
        """
        colseps = {}
        border = self.block[bottom]
        for col in range(right - 1, left, -1):
            char = border[col]
            if char == '+':
                colseps[col] = [bottom]
            elif char != '-':
                return None
        if border[left] != '+':
            return None
        rowseps = self.scan_up(top, left, bottom, right)
        if rowseps is None:
            return None
        return rowseps, colseps

    def scan_up(self, top, left, bottom, right):
        """
        Walk up text column `left`, collecting row boundaries, to check that
        the trace gets back to where it started.

        Return the row boundaries found (possibly an empty dictionary), or
        None if the border is broken.
        """
        rowseps = {}
        for row in range(bottom - 1, top, -1):
            char = self.block[row][left]
            if char == '+':
                rowseps[row] = [left]
            elif char != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        Turn the cells and boundaries gathered while scanning into the final
        ``(colspecs, headrows, bodyrows)`` data structure.
        """
        row_bounds = sorted(self.rowseps)     # line indices of row borders
        col_bounds = sorted(self.colseps)     # text columns of col borders
        # boundary position -> row/column number
        rowindex = {bound: num for num, bound in enumerate(row_bounds)}
        colindex = {bound: num for num, bound in enumerate(col_bounds)}
        # a column's width is the gap between two neighbouring boundaries
        colspecs = [after - before - 1
                    for before, after in zip(col_bounds, col_bounds[1:])]
        numrows = len(row_bounds) - 1
        numcols = len(col_bounds) - 1
        # empty grid of the right size; every row is a list of its own
        rows = [[None] * numcols for _ in range(numrows)]
        # grid positions not yet accounted for; has to end up at zero
        remaining = numrows * numcols
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                  'Cell (row %s, column %s) already used.'
                  % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if not self.head_body_sep:
            # no separator: the whole table is body
            return (colspecs, [], rows)
        numheadrows = rowindex[self.head_body_sep]
        return (colspecs, rows[:numheadrows], rows[numheadrows:])
| 324 | |
| 325 | |
class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile(r'=[ =]*$')
    span_pat = re.compile(r'-[ -]*$')

    def setup(self, block):
        """Copy `block`, normalize its borders and reset parsing state."""
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        start = 1                       # first line of the pending row
        text_found = False
        for offset in range(1, len(self.block)):    # skip top border
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = False
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = True
            elif not text_found:
                # Nothing accumulated yet: skip over leading blank lines.
                start = offset + 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        `offset` is the index of `line` within the table; it is only used
        for error reporting.  Once the table's columns are known
        (`self.columns` is non-empty), raise `TableMarkupError` if the
        underline does not end where the top border ends, and stretch the
        last pair to the current end of the table's last column.
        """
        cols = []
        end = 0
        while True:
            begin = line.find('-', end)
            end = line.find(' ', begin)
            if begin < 0:
                break
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete in table '
                                       'line %s.' % (offset+1),
                                       offset=offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells, one per (start, end) pair in `colspec`.

        Each cell is a list ``[0, morecols, offset, []]`` where `morecols`
        is the number of additional table columns the cell spans.

        Raise `TableMarkupError` if a pair does not begin at the start of a
        table column or does not end at the end of one.
        """
        i = 0
        cells = []
        for start, end in colspec:
            morecols = 0
            try:
                # Checked explicitly rather than with `assert`, so that the
                # validation also takes place when Python runs with -O.
                aligned = (start == self.columns[i][0])
                while aligned and end != self.columns[i][1]:
                    i += 1
                    morecols += 1
            except IndexError:
                # Ran past the last table column without a matching edge.
                aligned = False
            if not aligned:
                raise TableMarkupError('Column span alignment problem '
                                       'in table line %s.' % (offset+2),
                                       offset=offset+1)
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None):
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.

        `start` is the index of the row's first line within the table;
        `spanline`, if given, is a ``(text, offset)`` pair describing the
        column span underline that closes the row.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for cell, (begin, end) in zip(row, columns):
            cellblock = lines.get_2D_block(0, begin, len(lines), end)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            cell[3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is updated in place; `self.columns[-1]` is widened as well
        when the overflowing text reaches beyond its current end.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow:
        columns.append((sys.maxsize, None))
        lastcol = len(columns) - 2
        # combining characters do not contribute to the column width
        lines = [strip_combining_chars(line) for line in lines]

        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            for offset, line in enumerate(lines):
                if i == lastcol and line[end:].strip():
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    main_start, main_end = self.columns[-1]
                    columns[i] = (start, max(main_end, new_end))
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin '
                        'in table line %s.' % (first_line+offset+1),
                        offset=first_line+offset)
        columns.pop()                   # drop the dummy column again

    def structure_from_cells(self):
        """
        Return ``(colspecs, headrows, bodyrows)`` built from `self.columns`
        and `self.table`.

        Without a head/body separator every row is a body row; otherwise
        the body begins with the first row whose line offset lies beyond
        the separator.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            for i, row in enumerate(self.table):
                if row[0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])
| 533 | |
| 534 | |
def update_dict_of_lists(master, newdata):
    """
    Merge `newdata` into `master`, list by list.

    Both arguments have to be dictionaries whose values are lists.  For
    each key of `newdata`, its list is appended to the list stored under
    the same key in `master`; a key missing from `master` starts out with
    an empty list.  `master` is modified in place.
    """
    for key in newdata:
        target = master.setdefault(key, [])
        target.extend(newdata[key])
