Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/future/utils/surrogateescape.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
| author | shellac |
|---|---|
| date | Mon, 01 Jun 2020 08:59:25 -0400 |
| parents | 79f47841a781 |
| children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/future/utils/surrogateescape.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,198 +0,0 @@ -""" -This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error -handler of Python 3. - -Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc -""" - -# This code is released under the Python license and the BSD 2-clause license - -import codecs -import sys - -from future import utils - - -FS_ERRORS = 'surrogateescape' - -# # -- Python 2/3 compatibility ------------------------------------- -# FS_ERRORS = 'my_surrogateescape' - -def u(text): - if utils.PY3: - return text - else: - return text.decode('unicode_escape') - -def b(data): - if utils.PY3: - return data.encode('latin1') - else: - return data - -if utils.PY3: - _unichr = chr - bytes_chr = lambda code: bytes((code,)) -else: - _unichr = unichr - bytes_chr = chr - -def surrogateescape_handler(exc): - """ - Pure Python implementation of the PEP 383: the "surrogateescape" error - handler of Python 3. Undecodable bytes will be replaced by a Unicode - character U+DCxx on decoding, and these are translated into the - original bytes on encoding. - """ - mystring = exc.object[exc.start:exc.end] - - try: - if isinstance(exc, UnicodeDecodeError): - # mystring is a byte-string in this case - decoded = replace_surrogate_decode(mystring) - elif isinstance(exc, UnicodeEncodeError): - # In the case of u'\udcc3'.encode('ascii', - # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an - # exception anyway after this function is called, even though I think - # it's doing what it should. It seems that the strict encoder is called - # to encode the unicode string that this function returns ... - decoded = replace_surrogate_encode(mystring) - else: - raise exc - except NotASurrogateError: - raise exc - return (decoded, exc.end) - - -class NotASurrogateError(Exception): - pass - - -def replace_surrogate_encode(mystring): - """ - Returns a (unicode) string, not the more logical bytes, because the codecs - register_error functionality expects this. - """ - decoded = [] - for ch in mystring: - # if utils.PY3: - # code = ch - # else: - code = ord(ch) - - # The following magic comes from Py3.3's Python/codecs.c file: - if not 0xD800 <= code <= 0xDCFF: - # Not a surrogate. Fail with the original exception. - raise NotASurrogateError - # mybytes = [0xe0 | (code >> 12), - # 0x80 | ((code >> 6) & 0x3f), - # 0x80 | (code & 0x3f)] - # Is this a good idea? - if 0xDC00 <= code <= 0xDC7F: - decoded.append(_unichr(code - 0xDC00)) - elif code <= 0xDCFF: - decoded.append(_unichr(code - 0xDC00)) - else: - raise NotASurrogateError - return str().join(decoded) - - -def replace_surrogate_decode(mybytes): - """ - Returns a (unicode) string - """ - decoded = [] - for ch in mybytes: - # We may be parsing newbytes (in which case ch is an int) or a native - # str on Py2 - if isinstance(ch, int): - code = ch - else: - code = ord(ch) - if 0x80 <= code <= 0xFF: - decoded.append(_unichr(0xDC00 + code)) - elif code <= 0x7F: - decoded.append(_unichr(code)) - else: - # # It may be a bad byte - # # Try swallowing it. - # continue - # print("RAISE!") - raise NotASurrogateError - return str().join(decoded) - - -def encodefilename(fn): - if FS_ENCODING == 'ascii': - # ASCII encoder of Python 2 expects that the error handler returns a - # Unicode string encodable to ASCII, whereas our surrogateescape error - # handler has to return bytes in 0x80-0xFF range. - encoded = [] - for index, ch in enumerate(fn): - code = ord(ch) - if code < 128: - ch = bytes_chr(code) - elif 0xDC80 <= code <= 0xDCFF: - ch = bytes_chr(code - 0xDC00) - else: - raise UnicodeEncodeError(FS_ENCODING, - fn, index, index+1, - 'ordinal not in range(128)') - encoded.append(ch) - return bytes().join(encoded) - elif FS_ENCODING == 'utf-8': - # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF - # doesn't go through our error handler - encoded = [] - for index, ch in enumerate(fn): - code = ord(ch) - if 0xD800 <= code <= 0xDFFF: - if 0xDC80 <= code <= 0xDCFF: - ch = bytes_chr(code - 0xDC00) - encoded.append(ch) - else: - raise UnicodeEncodeError( - FS_ENCODING, - fn, index, index+1, 'surrogates not allowed') - else: - ch_utf8 = ch.encode('utf-8') - encoded.append(ch_utf8) - return bytes().join(encoded) - else: - return fn.encode(FS_ENCODING, FS_ERRORS) - -def decodefilename(fn): - return fn.decode(FS_ENCODING, FS_ERRORS) - -FS_ENCODING = 'ascii'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]') -# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]') -# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]') - - -# normalize the filesystem encoding name. -# For example, we expect "utf-8", not "UTF8". -FS_ENCODING = codecs.lookup(FS_ENCODING).name - - -def register_surrogateescape(): - """ - Registers the surrogateescape error handler on Python 2 (only) - """ - if utils.PY3: - return - try: - codecs.lookup_error(FS_ERRORS) - except LookupError: - codecs.register_error(FS_ERRORS, surrogateescape_handler) - - -if __name__ == '__main__': - pass - # # Tests: - # register_surrogateescape() - - # b = decodefilename(fn) - # assert b == encoded, "%r != %r" % (b, encoded) - # c = encodefilename(b) - # assert c == fn, '%r != %r' % (c, fn) - # # print("ok")
