Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/future/utils/surrogateescape.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d30785e31577 |
|---|---|
| 1 """ | |
| 2 This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error | |
| 3 handler of Python 3. | |
| 4 | |
| 5 Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc | |
| 6 """ | |
| 7 | |
| 8 # This code is released under the Python license and the BSD 2-clause license | |
| 9 | |
| 10 import codecs | |
| 11 import sys | |
| 12 | |
| 13 from future import utils | |
| 14 | |
| 15 | |
# Name of the codec error handler used by encodefilename(), decodefilename()
# and register_surrogateescape().
FS_ERRORS = 'surrogateescape'

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'
| 20 | |
def u(text):
    """
    Return *text* as a unicode string.

    When ``utils.PY3`` is true the argument is returned unchanged; otherwise
    it is decoded with the ``unicode_escape`` codec.
    """
    return text if utils.PY3 else text.decode('unicode_escape')
| 26 | |
def b(data):
    """
    Return *data* as a byte string.

    When ``utils.PY3`` is true the argument is encoded as latin1; otherwise
    it is returned unchanged.
    """
    return data.encode('latin1') if utils.PY3 else data
| 32 | |
# Version-dependent primitives:
#   _unichr(code)   -> one-character unicode string for the code point
#   bytes_chr(code) -> one-byte byte string for the integer value
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        """Return the byte string of length one holding the value *code*."""
        return bytes((code,))
| 39 | |
def surrogateescape_handler(exc):
    """
    Codec error handler implementing PEP 383 ("surrogateescape") in pure
    Python.

    On decoding, undecodable bytes are replaced by U+DCxx characters; on
    encoding, those characters are translated back into the original bytes.

    Returns a ``(replacement, exc.end)`` tuple. The original exception *exc*
    is re-raised when it is neither a UnicodeDecodeError nor a
    UnicodeEncodeError, or when the offending slice cannot be converted.
    """
    fragment = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # Note: for u'\udcc3'.encode('ascii', <this handler>), both Python 2.x
        # and 3.x still raise after this function returns. It seems the
        # strict encoder is run over the unicode string returned here.
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(fragment)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
| 65 | |
| 66 | |
class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers for input they cannot map."""
| 69 | |
| 70 | |
def replace_surrogate_encode(mystring):
    """
    Translate escaped characters in U+DC00..U+DCFF back to their byte values.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this. Each character U+DCxx becomes
    the character with code point 0xxx.

    Raises NotASurrogateError if any character lies outside U+DC00..U+DCFF.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only U+DC00..U+DCFF can be mapped back to a byte value. Anything
        # else must fail with NotASurrogateError so the handler re-raises the
        # original exception. The previous guard let U+D800..U+DBFF through,
        # which then crashed with ValueError on a negative code point.
        # NOTE: PEP 383 only defines U+DC80..U+DCFF; U+DC00..U+DC7F is still
        # accepted here to keep the existing behaviour of this module.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
| 98 | |
| 99 | |
def replace_surrogate_decode(mybytes):
    """
    Map a run of bytes to a (unicode) string.

    Values 0x80..0xFF become the characters U+DC80..U+DCFF; values up to 0x7F
    become the character with that code point. Any value above 0xFF raises
    NotASurrogateError.
    """
    pieces = []
    for item in mybytes:
        # Items are ints when iterating newbytes, or one-character native
        # strings on Py2.
        value = item if isinstance(item, int) else ord(item)
        if value > 0xFF:
            raise NotASurrogateError
        offset = 0xDC00 if value >= 0x80 else 0
        pieces.append(_unichr(value + offset))
    return ''.join(pieces)
| 123 | |
| 124 | |
def encodefilename(fn):
    """
    Encode the unicode filename *fn* to bytes according to ``FS_ENCODING``.

    The 'ascii' and 'utf-8' encodings are handled character by character so
    that U+DC80..U+DCFF are written back as the single bytes 0x80..0xFF; any
    other encoding is delegated to ``fn.encode`` with the ``FS_ERRORS``
    handler.

    Raises UnicodeEncodeError for a character that cannot be encoded in the
    'ascii' or 'utf-8' branches.
    """
    if FS_ENCODING not in ('ascii', 'utf-8'):
        return fn.encode(FS_ENCODING, FS_ERRORS)

    chunks = []
    if FS_ENCODING == 'ascii':
        # The ASCII encoder of Python 2 expects the error handler to return a
        # unicode string encodable to ASCII, whereas surrogateescape has to
        # produce bytes in the 0x80-0xFF range, so encode by hand.
        for pos, char in enumerate(fn):
            ordinal = ord(char)
            if ordinal < 128:
                chunks.append(bytes_chr(ordinal))
            elif 0xDC80 <= ordinal <= 0xDCFF:
                chunks.append(bytes_chr(ordinal - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'ordinal not in range(128)')
    else:
        # The UTF-8 encoder of Python 2 encodes surrogates itself, so
        # U+DC80-U+DCFF never reach the error handler; encode by hand.
        for pos, char in enumerate(fn):
            ordinal = ord(char)
            if 0xDC80 <= ordinal <= 0xDCFF:
                chunks.append(bytes_chr(ordinal - 0xDC00))
            elif 0xD800 <= ordinal <= 0xDFFF:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'surrogates not allowed')
            else:
                chunks.append(char.encode('utf-8'))
    return b''.join(chunks)
| 163 | |
def decodefilename(fn):
    """
    Decode the byte filename *fn* using ``FS_ENCODING`` with the
    ``FS_ERRORS`` error handler and return the result.
    """
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name
| 166 | |
# Sample configuration: the encoding in use, a raw byte filename (fn) and its
# expected surrogate-escaped unicode form (encoded).
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative sample configurations:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name to the codec's canonical name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
| 175 | |
| 176 | |
def register_surrogateescape():
    """
    Register surrogateescape_handler under the name ``FS_ERRORS``.

    Does nothing when ``utils.PY3`` is true, or when an error handler with
    that name can already be looked up.
    """
    if not utils.PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
| 187 | |
| 188 | |
if __name__ == '__main__':
    # Running the module as a script does nothing; the round-trip self-test
    # below is kept disabled.
    #
    # # Tests:
    # register_surrogateescape()
    #
    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")
    pass
