comparison env/lib/python3.9/site-packages/idna/core.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 from . import idnadata
2 import bisect
3 import unicodedata
4 import re
5 import sys
6 from .intranges import intranges_contain
7
8 _virama_combining_class = 9
9 _alabel_prefix = b'xn--'
10 _unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]')
11
12 if sys.version_info[0] >= 3:
13 unicode = str
14 unichr = chr
15
16 class IDNAError(UnicodeError):
17 """ Base exception for all IDNA-encoding related problems """
18 pass
19
20
21 class IDNABidiError(IDNAError):
22 """ Exception when bidirectional requirements are not satisfied """
23 pass
24
25
26 class InvalidCodepoint(IDNAError):
27 """ Exception when a disallowed or unallocated codepoint is used """
28 pass
29
30
31 class InvalidCodepointContext(IDNAError):
32 """ Exception when the codepoint is not valid in the context it is used """
33 pass
34
35
36 def _combining_class(cp):
37 v = unicodedata.combining(unichr(cp))
38 if v == 0:
39 if not unicodedata.name(unichr(cp)):
40 raise ValueError("Unknown character in unicodedata")
41 return v
42
43 def _is_script(cp, script):
44 return intranges_contain(ord(cp), idnadata.scripts[script])
45
46 def _punycode(s):
47 return s.encode('punycode')
48
49 def _unot(s):
50 return 'U+{0:04X}'.format(s)
51
52
53 def valid_label_length(label):
54
55 if len(label) > 63:
56 return False
57 return True
58
59
60 def valid_string_length(label, trailing_dot):
61
62 if len(label) > (254 if trailing_dot else 253):
63 return False
64 return True
65
66
67 def check_bidi(label, check_ltr=False):
68
69 # Bidi rules should only be applied if string contains RTL characters
70 bidi_label = False
71 for (idx, cp) in enumerate(label, 1):
72 direction = unicodedata.bidirectional(cp)
73 if direction == '':
74 # String likely comes from a newer version of Unicode
75 raise IDNABidiError('Unknown directionality in label {0} at position {1}'.format(repr(label), idx))
76 if direction in ['R', 'AL', 'AN']:
77 bidi_label = True
78 if not bidi_label and not check_ltr:
79 return True
80
81 # Bidi rule 1
82 direction = unicodedata.bidirectional(label[0])
83 if direction in ['R', 'AL']:
84 rtl = True
85 elif direction == 'L':
86 rtl = False
87 else:
88 raise IDNABidiError('First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))
89
90 valid_ending = False
91 number_type = False
92 for (idx, cp) in enumerate(label, 1):
93 direction = unicodedata.bidirectional(cp)
94
95 if rtl:
96 # Bidi rule 2
97 if not direction in ['R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
98 raise IDNABidiError('Invalid direction for codepoint at position {0} in a right-to-left label'.format(idx))
99 # Bidi rule 3
100 if direction in ['R', 'AL', 'EN', 'AN']:
101 valid_ending = True
102 elif direction != 'NSM':
103 valid_ending = False
104 # Bidi rule 4
105 if direction in ['AN', 'EN']:
106 if not number_type:
107 number_type = direction
108 else:
109 if number_type != direction:
110 raise IDNABidiError('Can not mix numeral types in a right-to-left label')
111 else:
112 # Bidi rule 5
113 if not direction in ['L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
114 raise IDNABidiError('Invalid direction for codepoint at position {0} in a left-to-right label'.format(idx))
115 # Bidi rule 6
116 if direction in ['L', 'EN']:
117 valid_ending = True
118 elif direction != 'NSM':
119 valid_ending = False
120
121 if not valid_ending:
122 raise IDNABidiError('Label ends with illegal codepoint directionality')
123
124 return True
125
126
127 def check_initial_combiner(label):
128
129 if unicodedata.category(label[0])[0] == 'M':
130 raise IDNAError('Label begins with an illegal combining character')
131 return True
132
133
134 def check_hyphen_ok(label):
135
136 if label[2:4] == '--':
137 raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
138 if label[0] == '-' or label[-1] == '-':
139 raise IDNAError('Label must not start or end with a hyphen')
140 return True
141
142
143 def check_nfc(label):
144
145 if unicodedata.normalize('NFC', label) != label:
146 raise IDNAError('Label must be in Normalization Form C')
147
148
149 def valid_contextj(label, pos):
150
151 cp_value = ord(label[pos])
152
153 if cp_value == 0x200c:
154
155 if pos > 0:
156 if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
157 return True
158
159 ok = False
160 for i in range(pos-1, -1, -1):
161 joining_type = idnadata.joining_types.get(ord(label[i]))
162 if joining_type == ord('T'):
163 continue
164 if joining_type in [ord('L'), ord('D')]:
165 ok = True
166 break
167
168 if not ok:
169 return False
170
171 ok = False
172 for i in range(pos+1, len(label)):
173 joining_type = idnadata.joining_types.get(ord(label[i]))
174 if joining_type == ord('T'):
175 continue
176 if joining_type in [ord('R'), ord('D')]:
177 ok = True
178 break
179 return ok
180
181 if cp_value == 0x200d:
182
183 if pos > 0:
184 if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
185 return True
186 return False
187
188 else:
189
190 return False
191
192
193 def valid_contexto(label, pos, exception=False):
194
195 cp_value = ord(label[pos])
196
197 if cp_value == 0x00b7:
198 if 0 < pos < len(label)-1:
199 if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
200 return True
201 return False
202
203 elif cp_value == 0x0375:
204 if pos < len(label)-1 and len(label) > 1:
205 return _is_script(label[pos + 1], 'Greek')
206 return False
207
208 elif cp_value == 0x05f3 or cp_value == 0x05f4:
209 if pos > 0:
210 return _is_script(label[pos - 1], 'Hebrew')
211 return False
212
213 elif cp_value == 0x30fb:
214 for cp in label:
215 if cp == u'\u30fb':
216 continue
217 if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
218 return True
219 return False
220
221 elif 0x660 <= cp_value <= 0x669:
222 for cp in label:
223 if 0x6f0 <= ord(cp) <= 0x06f9:
224 return False
225 return True
226
227 elif 0x6f0 <= cp_value <= 0x6f9:
228 for cp in label:
229 if 0x660 <= ord(cp) <= 0x0669:
230 return False
231 return True
232
233
234 def check_label(label):
235
236 if isinstance(label, (bytes, bytearray)):
237 label = label.decode('utf-8')
238 if len(label) == 0:
239 raise IDNAError('Empty Label')
240
241 check_nfc(label)
242 check_hyphen_ok(label)
243 check_initial_combiner(label)
244
245 for (pos, cp) in enumerate(label):
246 cp_value = ord(cp)
247 if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
248 continue
249 elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
250 try:
251 if not valid_contextj(label, pos):
252 raise InvalidCodepointContext('Joiner {0} not allowed at position {1} in {2}'.format(
253 _unot(cp_value), pos+1, repr(label)))
254 except ValueError:
255 raise IDNAError('Unknown codepoint adjacent to joiner {0} at position {1} in {2}'.format(
256 _unot(cp_value), pos+1, repr(label)))
257 elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
258 if not valid_contexto(label, pos):
259 raise InvalidCodepointContext('Codepoint {0} not allowed at position {1} in {2}'.format(_unot(cp_value), pos+1, repr(label)))
260 else:
261 raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
262
263 check_bidi(label)
264
265
266 def alabel(label):
267
268 try:
269 label = label.encode('ascii')
270 ulabel(label)
271 if not valid_label_length(label):
272 raise IDNAError('Label too long')
273 return label
274 except UnicodeEncodeError:
275 pass
276
277 if not label:
278 raise IDNAError('No Input')
279
280 label = unicode(label)
281 check_label(label)
282 label = _punycode(label)
283 label = _alabel_prefix + label
284
285 if not valid_label_length(label):
286 raise IDNAError('Label too long')
287
288 return label
289
290
291 def ulabel(label):
292
293 if not isinstance(label, (bytes, bytearray)):
294 try:
295 label = label.encode('ascii')
296 except UnicodeEncodeError:
297 check_label(label)
298 return label
299
300 label = label.lower()
301 if label.startswith(_alabel_prefix):
302 label = label[len(_alabel_prefix):]
303 if not label:
304 raise IDNAError('Malformed A-label, no Punycode eligible content found')
305 if label.decode('ascii')[-1] == '-':
306 raise IDNAError('A-label must not end with a hyphen')
307 else:
308 check_label(label)
309 return label.decode('ascii')
310
311 label = label.decode('punycode')
312 check_label(label)
313 return label
314
315
316 def uts46_remap(domain, std3_rules=True, transitional=False):
317 """Re-map the characters in the string according to UTS46 processing."""
318 from .uts46data import uts46data
319 output = u""
320 try:
321 for pos, char in enumerate(domain):
322 code_point = ord(char)
323 uts46row = uts46data[code_point if code_point < 256 else
324 bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
325 status = uts46row[1]
326 replacement = uts46row[2] if len(uts46row) == 3 else None
327 if (status == "V" or
328 (status == "D" and not transitional) or
329 (status == "3" and not std3_rules and replacement is None)):
330 output += char
331 elif replacement is not None and (status == "M" or
332 (status == "3" and not std3_rules) or
333 (status == "D" and transitional)):
334 output += replacement
335 elif status != "I":
336 raise IndexError()
337 return unicodedata.normalize("NFC", output)
338 except IndexError:
339 raise InvalidCodepoint(
340 "Codepoint {0} not allowed at position {1} in {2}".format(
341 _unot(code_point), pos + 1, repr(domain)))
342
343
344 def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
345
346 if isinstance(s, (bytes, bytearray)):
347 s = s.decode("ascii")
348 if uts46:
349 s = uts46_remap(s, std3_rules, transitional)
350 trailing_dot = False
351 result = []
352 if strict:
353 labels = s.split('.')
354 else:
355 labels = _unicode_dots_re.split(s)
356 if not labels or labels == ['']:
357 raise IDNAError('Empty domain')
358 if labels[-1] == '':
359 del labels[-1]
360 trailing_dot = True
361 for label in labels:
362 s = alabel(label)
363 if s:
364 result.append(s)
365 else:
366 raise IDNAError('Empty label')
367 if trailing_dot:
368 result.append(b'')
369 s = b'.'.join(result)
370 if not valid_string_length(s, trailing_dot):
371 raise IDNAError('Domain too long')
372 return s
373
374
375 def decode(s, strict=False, uts46=False, std3_rules=False):
376
377 if isinstance(s, (bytes, bytearray)):
378 s = s.decode("ascii")
379 if uts46:
380 s = uts46_remap(s, std3_rules, False)
381 trailing_dot = False
382 result = []
383 if not strict:
384 labels = _unicode_dots_re.split(s)
385 else:
386 labels = s.split(u'.')
387 if not labels or labels == ['']:
388 raise IDNAError('Empty domain')
389 if not labels[-1]:
390 del labels[-1]
391 trailing_dot = True
392 for label in labels:
393 s = ulabel(label)
394 if s:
395 result.append(s)
396 else:
397 raise IDNAError('Empty label')
398 if trailing_dot:
399 result.append(u'')
400 return u'.'.join(result)