Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/boltons/jsonutils.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d30785e31577 |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """``jsonutils`` aims to provide various helpers for working with | |
| 3 JSON. Currently it focuses on providing a reliable and intuitive means | |
| 4 of working with `JSON Lines`_-formatted files. | |
| 5 | |
| 6 .. _JSON Lines: http://jsonlines.org/ | |
| 7 | |
| 8 """ | |
| 9 | |
| 10 from __future__ import print_function | |
| 11 | |
| 12 import os | |
| 13 import json | |
| 14 | |
| 15 | |
| 16 DEFAULT_BLOCKSIZE = 4096 | |
| 17 | |
| 18 # reverse_iter_lines algorithm, applied to each block read from the end: | |
| 19 # | |
| 20 # - if the block ends in a newline, add an empty string to the line list | |
| 21 # - if there's one item, prepend it to the buffer and continue | |
| 22 # - if there's more than one item, pop the last item, join it with | |
| 23 # the buffer (item first), and yield the combined line | |
| 24 # - yield all remaining items in reverse, except for the first | |
| 25 # - the first item becomes the new buffer | |
| 26 # | |
| 27 # - when the outer loop completes, yield the buffer if it is non-empty | |
| 28 | |
| 29 | |
| 30 __all__ = ['JSONLIterator', 'reverse_iter_lines'] | |
| 31 | |
| 32 | |
def reverse_iter_lines(file_obj, blocksize=DEFAULT_BLOCKSIZE, preseek=True):
    """Returns an iterator over the lines from a file object, in
    reverse order, i.e., last line first, first line last. Uses the
    ``seek()``/``tell()``/``read()`` methods of the file object and
    works with both text-yielding objects (e.g., :class:`io.StringIO`)
    and bytes-yielding objects (e.g., :class:`io.BytesIO` or a file
    opened in ``'rb'`` mode).

    Lines are yielded without their line endings and in the same type
    (text or bytes) that ``file_obj.read()`` returns. If the data ends
    with a newline, the first item yielded is an empty line.

    Args:
        file_obj (file): An open, seekable file object. Note that
            ``reverse_iter_lines`` mutably reads from the file and other
            functions should not mutably interact with the file object.
        blocksize (int): The block size to pass to :meth:`file.read()`.
            Must be a positive integer.
        preseek (bool): Tells the function whether or not to automatically
            seek to the end of the file. Defaults to ``True``.
            ``preseek=False`` is useful in cases when the
            file cursor is already in position, either at the end of
            the file or in the middle for relative reverse line
            generation.

    Raises:
        ValueError: If *blocksize* is not positive (raised when iteration
            starts, because this is a generator).
    """
    if blocksize < 1:
        # A zero/negative block size would never move the cursor backwards
        # and the loop below would spin forever.
        raise ValueError('blocksize must be a positive integer, not %r'
                         % (blocksize,))
    if preseek:
        file_obj.seek(0, os.SEEK_END)
    cur_pos = file_obj.tell()
    # The carry-over buffer is typed lazily (text vs. bytes) from the first
    # block actually read, so both kinds of file objects are supported.
    buff = None
    while 0 < cur_pos:
        read_size = min(blocksize, cur_pos)
        cur_pos -= read_size
        file_obj.seek(cur_pos, os.SEEK_SET)
        cur = file_obj.read(read_size)
        if not cur:
            # Nothing came back for this block; there is nothing to split.
            continue
        empty = cur[:0]  # '' or b'', matching the type of the data
        newline = b'\n' if isinstance(cur, bytes) else '\n'
        if buff is None:
            buff = empty
        lines = cur.splitlines()
        if cur.endswith(newline):
            # The block ends exactly on a line boundary: the (possibly
            # empty) buffer is a complete line of its own.
            lines.append(empty)
        if len(lines) == 1:
            # No line boundary inside this block; keep accumulating.
            buff = lines[0] + buff
            continue
        # The last piece of this block completes the buffered partial line.
        yield lines.pop() + buff
        # Everything but the first piece is a complete line.
        for line in lines[:0:-1]:
            yield line
        # The first piece may continue into the preceding block.
        buff = lines[0]
    if buff:
        yield buff
| 74 | |
| 75 | |
| 76 """ | |
| 77 TODO: allow passthroughs for: | |
| 78 | |
| 79 json.load(fp[, encoding[, cls[, object_hook[, parse_float[, parse_int[, parse_constant[, object_pairs_hook[, **kw]]]]]]]]) | |
| 80 """ | |
| 81 | |
| 82 | |
class JSONLIterator(object):
    """The ``JSONLIterator`` is used to iterate over JSON-encoded objects
    stored in the `JSON Lines format`_ (one object per line).

    Most notably it has the ability to efficiently read from the
    bottom of files, making it very effective for reading in simple
    append-only JSONL use cases. It also has the ability to start from
    anywhere in the file and ignore corrupted lines.

    Args:
        file_obj (file): An open file object.
        ignore_errors (bool): Whether to skip over lines that raise an error on
            deserialization (:func:`json.loads`).
        reverse (bool): Controls the direction of the iteration.
            Defaults to ``False``. If set to ``True`` and *rel_seek*
            is unset, seeks to the end of the file before iteration
            begins.
        rel_seek (float): Used to preseek the start position of
            iteration. Set to 0.0 for the start of the file, 1.0 for the
            end, and anything in between. Negative values are measured
            back from the end of the file, e.g., ``-0.25`` is equivalent
            to ``0.75``.

    Raises:
        ValueError: If *rel_seek* is outside the range -1.0 to 1.0.

    .. _JSON Lines format: http://jsonlines.org/
    """
    def __init__(self, file_obj,
                 ignore_errors=False, reverse=False, rel_seek=None):
        self._reverse = bool(reverse)
        self._file_obj = file_obj
        self.ignore_errors = ignore_errors

        if rel_seek is None:
            if reverse:
                rel_seek = 1.0
        elif not -1.0 <= rel_seek <= 1.0:
            # Bounds are inclusive: 1.0 (the end) is a documented value.
            raise ValueError("'rel_seek' expected a float between"
                             " -1.0 and 1.0, not %r" % rel_seek)
        elif rel_seek < 0:
            # Negative offsets count back from the end of the file.
            rel_seek = 1.0 + rel_seek
        self._rel_seek = rel_seek
        self._blocksize = 4096
        if rel_seek is not None:
            self._init_rel_seek()
        if self._reverse:
            # The cursor is already positioned, so no preseek to the end.
            self._line_iter = reverse_iter_lines(self._file_obj,
                                                 blocksize=self._blocksize,
                                                 preseek=False)
        else:
            self._line_iter = iter(self._file_obj)

    @property
    def cur_byte_pos(self):
        "A property representing where in the file the iterator is reading."
        return self._file_obj.tell()

    def _align_to_newline(self):
        """Aligns the file object's position to the next newline.

        Reads forward block by block from the current position and seeks
        to the first newline found. If the end of the file is reached
        without finding one, the position is left at the end of the file.
        """
        fo, bsize = self._file_obj, self._blocksize
        start_pos = fo.tell()
        total_read = 0
        while True:
            cur = fo.read(bsize)
            if not cur:
                # EOF without a newline: stop instead of reading forever.
                fo.seek(0, os.SEEK_END)
                return
            # Match the newline type to the data so bytes files work too.
            newline = b'\n' if isinstance(cur, bytes) else '\n'
            newline_offset = cur.find(newline)
            if newline_offset >= 0:
                fo.seek(start_pos + total_read + newline_offset)
                return
            # Count what was actually read; the last block may be short.
            total_read += len(cur)

    def _init_rel_seek(self):
        "Sets the file object's position to the relative location set above."
        rs, fo = self._rel_seek, self._file_obj
        if rs == 0.0:
            fo.seek(0, os.SEEK_SET)
        else:
            fo.seek(0, os.SEEK_END)
            size = fo.tell()
            if rs == 1.0:
                self._cur_pos = size
            else:
                target = int(size * rs)
                fo.seek(target, os.SEEK_SET)
                # The target most likely lands mid-line; move to a boundary.
                self._align_to_newline()
                self._cur_pos = fo.tell()

    def __iter__(self):
        return self

    def next(self):
        """Returns one object loaded with :func:`json.loads`, advancing
        the file object by one line. Raises :exc:`StopIteration` upon reaching
        the end of the file (or beginning, if ``reverse`` was set to ``True``).

        Blank lines are skipped. Lines that fail to deserialize are skipped
        when ``ignore_errors`` is set; otherwise the error propagates.
        """
        while True:
            line = next(self._line_iter).lstrip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                # Deliberately broad: ``ignore_errors`` means skipping any
                # line that cannot be deserialized.
                if not self.ignore_errors:
                    raise
                continue
            return obj

    __next__ = next
| 187 | |
| 188 | |
if __name__ == '__main__':
    def _main():
        """Loads each JSON Lines file named on the command line as a
        basic validation pass.

        Flags: ``-h``/``--help`` prints a usage line and exits;
        ``-v``/``--verbose`` prints progress (a dot every 100 objects,
        the running total every 10000) and a final summary. Validation
        stops at the first object that fails to load.
        """
        import sys
        flags = ('-h', '--help', '-v', '--verbose')
        if '-h' in sys.argv or '--help' in sys.argv:
            print('loads one or more JSON Line files for basic validation.')
            return
        verbose = '-v' in sys.argv or '--verbose' in sys.argv
        file_count, obj_count = 0, 0
        for filename in sys.argv[1:]:
            if filename in flags:
                continue
            file_count += 1
            with open(filename, 'rb') as file_obj:
                iterator = JSONLIterator(file_obj)
                cur_obj_count = 0
                while True:
                    try:
                        next(iterator)
                    except ValueError:
                        print('error reading object #%s around byte %s in %s'
                              % (cur_obj_count + 1, iterator.cur_byte_pos, filename))
                        return
                    except StopIteration:
                        break
                    obj_count += 1
                    cur_obj_count += 1
                    if verbose and obj_count % 100 == 0:
                        sys.stdout.write('.')
                        # End the row of dots with the running total once
                        # every 10000 objects (i.e., every 100 dots).
                        if obj_count % 10000 == 0:
                            sys.stdout.write('%s\n' % obj_count)
        if verbose:
            print('files checked: %s' % file_count)
            print('objects loaded: %s' % obj_count)
        return

    _main()
