Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
core.py
Go to the documentation of this file.
1from . import idnadata
2import bisect
3import unicodedata
4import re
5from typing import Union, Optional
6from .intranges import intranges_contain
7
# Canonical combining class value used to recognise a virama; the CONTEXTJ
# joiner rules compare the result of _combining_class() against it.
_virama_combining_class = 9
# Prefix that marks a label as a Punycode-encoded A-label.
_alabel_prefix = b'xn--'
# Label separators accepted when splitting a domain in non-strict mode:
# U+002E, U+3002, U+FF0E and U+FF61.
_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')
11
class IDNAError(UnicodeError):
    """Base exception for all IDNA-encoding related problems.

    Derives from UnicodeError, so it is also caught by handlers written
    for UnicodeError or ValueError.
    """
    pass
15
16
class IDNABidiError(IDNAError):
    """ Exception when bidirectional requirements are not satisfied """
    pass
20
21
class InvalidCodepoint(IDNAError):
    """ Exception when a disallowed or unallocated codepoint is used """
    pass
25
26
class InvalidCodepointContext(IDNAError):
    """ Exception when the codepoint is not valid in the context it is used """
    pass
30
31
def _combining_class(cp: int) -> int:
    """Return the canonical combining class of the code point ``cp``.

    A result of 0 is double-checked by asking unicodedata for the
    character's name, so that a character unknown to the local Unicode
    database is reported instead of silently being treated as class 0.

    Raises:
        ValueError: if the combining class is 0 and unicodedata has no name
            for the character.
    """
    char = chr(cp)
    v = unicodedata.combining(char)
    if v == 0:
        # unicodedata.name() raises ValueError on its own for a nameless
        # character; the explicit raise covers an empty name.
        if not unicodedata.name(char):
            raise ValueError('Unknown character in unicodedata')
    return v
38
def _is_script(cp: str, script: str) -> bool:
    """Return whether the single character ``cp`` belongs to ``script``.

    The code point of ``cp`` is tested with intranges_contain() against the
    ranges stored under ``idnadata.scripts[script]``.
    """
    return intranges_contain(ord(cp), idnadata.scripts[script])
41
def _punycode(s: str) -> bytes:
    """Encode ``s`` with Python's built-in ``punycode`` codec."""
    return s.encode('punycode')
44
def _unot(s: int) -> str:
    """Format the code point ``s`` in ``U+XXXX`` notation.

    The value is rendered as upper-case hexadecimal, zero-padded to at
    least four digits.
    """
    return 'U+{:04X}'.format(s)
47
48
def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when ``label`` is at most 63 units long.

    The length is whatever ``len()`` reports: bytes for a bytes label,
    characters for a str label.
    """
    return len(label) <= 63
53
54
def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when a whole domain string is within the length limit.

    The limit is 253, or 254 when ``trailing_dot`` is true so that the
    final separator is not counted against the name.
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit
59
60
def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate ``label`` against the six bidirectional rules.

    Args:
        label: The label to check.
        check_ltr: When false (the default) a label without any R, AL or AN
            character is accepted without further checks. When true the
            left-to-right rules are applied to such labels as well.

    Returns:
        True when the label passes.

    Raises:
        IDNABidiError: if a character has no known directionality, or any
            of the rules is violated.
    """
    rtl_markers = ('R', 'AL', 'AN')
    rtl_allowed = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
    ltr_allowed = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')

    # Bidi rules should only be applied if string contains RTL characters
    bidi_label = False
    for (idx, cp) in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)
        if direction == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), idx))
        if direction in rtl_markers:
            bidi_label = True
    if not bidi_label and not check_ltr:
        return True

    # Bidi rule 1
    direction = unicodedata.bidirectional(label[0])
    if direction in ('R', 'AL'):
        rtl = True
    elif direction == 'L':
        rtl = False
    else:
        raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))

    # valid_ending tracks whether the last non-NSM character seen so far has
    # a directionality that is allowed to end the label.
    valid_ending = False
    number_type = None  # type: Optional[str]
    for (idx, cp) in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)

        if rtl:
            # Bidi rule 2
            if direction not in rtl_allowed:
                raise IDNABidiError('Invalid direction for codepoint at position {} in a right-to-left label'.format(idx))
            # Bidi rule 3
            if direction in ('R', 'AL', 'EN', 'AN'):
                valid_ending = True
            elif direction != 'NSM':
                valid_ending = False
            # Bidi rule 4
            if direction in ('AN', 'EN'):
                if not number_type:
                    number_type = direction
                elif number_type != direction:
                    raise IDNABidiError('Can not mix numeral types in a right-to-left label')
        else:
            # Bidi rule 5
            if direction not in ltr_allowed:
                raise IDNABidiError('Invalid direction for codepoint at position {} in a left-to-right label'.format(idx))
            # Bidi rule 6
            if direction in ('L', 'EN'):
                valid_ending = True
            elif direction != 'NSM':
                valid_ending = False

    if not valid_ending:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
118
119
def check_initial_combiner(label: str) -> bool:
    """Reject a label whose first character is a combining mark.

    A character counts as a mark when its Unicode general category starts
    with ``M``. An empty label has no first character and passes.

    Returns:
        True when the label passes.

    Raises:
        IDNAError: if the label begins with a mark.
    """
    if label and unicodedata.category(label[0]).startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
124
125
def check_hyphen_ok(label: str) -> bool:
    """Check the hyphen restrictions on a label.

    A label must not have ``--`` in its third and fourth positions and must
    not start or end with a hyphen. An empty label contains no hyphen and
    passes.

    Returns:
        True when the label passes.

    Raises:
        IDNAError: if either restriction is violated.
    """
    if label[2:4] == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    # startswith/endswith are safe on an empty string, unlike label[0].
    if label.startswith('-') or label.endswith('-'):
        raise IDNAError('Label must not start or end with a hyphen')
    return True
132
133
def check_nfc(label: str) -> None:
    """Ensure ``label`` is already in Unicode Normalization Form C.

    Raises:
        IDNAError: if normalising the label to NFC would change it.
    """
    if unicodedata.normalize('NFC', label) != label:
        raise IDNAError('Label must be in Normalization Form C')
137
138
def valid_contextj(label: str, pos: int) -> bool:
    """Apply the contextual rule for the joiner character at ``label[pos]``.

    U+200C is accepted when it directly follows a virama, or when it sits
    between a Left/Dual-joining character before it and a Right/Dual-joining
    character after it, with only Transparent characters in between.
    U+200D is accepted only when it directly follows a virama. Any other
    character at ``pos`` yields False.

    Args:
        label: The full label being validated.
        pos: Zero-based index of the joiner inside ``label``.

    Returns:
        True if the joiner is allowed in this context, otherwise False.

    Raises:
        ValueError: propagated from _combining_class() when the preceding
            character is unknown to unicodedata.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x200c:

        if pos > 0:
            if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                return True

        # Scan backwards over Transparent characters; the first character
        # that is not Transparent decides the outcome.
        ok = False
        for i in range(pos - 1, -1, -1):
            joining_type = idnadata.joining_types.get(ord(label[i]))
            if joining_type == ord('T'):
                continue
            if joining_type in (ord('L'), ord('D')):
                ok = True
            # Stop at the first non-Transparent character either way: only
            # Transparent characters may separate the joiner from its
            # joining neighbour.
            break

        if not ok:
            return False

        # Same scan forwards, looking for a Right- or Dual-joining neighbour.
        ok = False
        for i in range(pos + 1, len(label)):
            joining_type = idnadata.joining_types.get(ord(label[i]))
            if joining_type == ord('T'):
                continue
            if joining_type in (ord('R'), ord('D')):
                ok = True
            break
        return ok

    if cp_value == 0x200d:

        if pos > 0:
            if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                return True
        return False

    return False
180
181
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Apply the contextual rule for the CONTEXTO character at ``label[pos]``.

    Rules, by code point:

    * U+00B7: must sit between two U+006C characters.
    * U+0375: the following character must be in the Greek script.
    * U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
    * U+30FB: some other character of the label must be Hiragana, Katakana
      or Han.
    * U+0660..U+0669: the label must contain no U+06F0..U+06F9 character.
    * U+06F0..U+06F9: the label must contain no U+0660..U+0669 character.

    Any other code point yields False.

    Args:
        label: The full label being validated.
        pos: Zero-based index of the character inside ``label``.
        exception: Accepted for interface compatibility; not used here.

    Returns:
        True if the character is allowed in this context, otherwise False.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x00b7:
        if 0 < pos < len(label) - 1:
            if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
                return True
        return False

    elif cp_value == 0x0375:
        if pos < len(label) - 1 and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    elif cp_value == 0x05f3 or cp_value == 0x05f4:
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    elif cp_value == 0x30fb:
        return any(
            _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han')
            for cp in label
            if cp != '\u30fb'
        )

    elif 0x660 <= cp_value <= 0x669:
        return not any(0x6f0 <= ord(cp) <= 0x06f9 for cp in label)

    elif 0x6f0 <= cp_value <= 0x6f9:
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    return False
222
223
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    Bytes input is decoded as UTF-8 first. The label must be non-empty, in
    NFC, pass the hyphen and initial-combiner checks, consist only of
    code points that are PVALID or that satisfy their CONTEXTJ / CONTEXTO
    rule, and finally pass check_bidi().

    Raises:
        IDNAError: for an empty label, a failed structural check, or an
            unknown code point next to a joiner.
        InvalidCodepointContext: if a CONTEXTJ or CONTEXTO character is used
            in a context where it is not allowed.
        InvalidCodepoint: if a code point is in none of the allowed classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Guard only the call itself. IDNAError is a ValueError (via
            # UnicodeError), so an exception raised for a rejected joiner
            # must stay outside this handler or it would be re-reported
            # with the "unknown codepoint" message.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
            if not joiner_ok:
                raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value), pos+1, repr(label)))
        else:
            raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

    check_bidi(label)
253
254
def alabel(label: str) -> bytes:
    """Convert a single label to its ASCII (A-label) form.

    A label that is already pure ASCII is validated through ulabel() and
    returned as bytes unchanged. Any other label is validated with
    check_label(), Punycode-encoded and given the ``xn--`` prefix.

    Raises:
        IDNAError: if the label is invalid, or the resulting label is longer
            than valid_label_length() allows.
    """
    try:
        label_bytes = label.encode('ascii')
        # Round-trip through ulabel() purely for validation; its return
        # value is not needed.
        ulabel(label_bytes)
        if not valid_label_length(label_bytes):
            raise IDNAError('Label too long')
        return label_bytes
    except UnicodeEncodeError:
        # Not ASCII: fall through to the Punycode path.
        pass

    if not label:
        raise IDNAError('No Input')

    label = str(label)
    check_label(label)
    label_bytes = _punycode(label)
    label_bytes = _alabel_prefix + label_bytes

    if not valid_label_length(label_bytes):
        raise IDNAError('Label too long')

    return label_bytes
277
278
def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert a single label to its Unicode (U-label) form.

    * A str label containing non-ASCII characters is validated with
      check_label() and returned unchanged.
    * An ASCII label (str or bytes) is lower-cased. Without the ``xn--``
      prefix it is validated and returned as str. With the prefix, the
      remainder is Punycode-decoded, validated and returned.

    Raises:
        IDNAError: if the A-label has no content after the prefix, ends with
            a hyphen, cannot be Punycode-decoded, or the label fails
            check_label().
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label_bytes = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label
    else:
        label_bytes = label

    label_bytes = label_bytes.lower()
    if label_bytes.startswith(_alabel_prefix):
        label_bytes = label_bytes[len(_alabel_prefix):]
        if not label_bytes:
            raise IDNAError('Malformed A-label, no Punycode eligible content found')
        if label_bytes.decode('ascii')[-1] == '-':
            raise IDNAError('A-label must not end with a hyphen')
    else:
        check_label(label_bytes)
        return label_bytes.decode('ascii')

    try:
        label = label_bytes.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(label)
    return label
306
307
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in the ``uts46data`` table and handled by
    its status:

    * ``V``: kept as is.
    * ``D``: kept, or replaced by its mapping when ``transitional`` is true.
    * ``3``: rejected when ``std3_rules`` is true; otherwise kept, or
      replaced when the row carries a mapping.
    * ``M``: replaced by its mapping.
    * ``I``: dropped.

    Args:
        domain: The string to re-map.
        std3_rules: Whether status ``3`` characters are rejected.
        transitional: Whether status ``D`` characters are replaced.

    Returns:
        The re-mapped string, normalised to NFC.

    Raises:
        InvalidCodepoint: if a character is not permitted under the
            selected options.
    """
    from .uts46data import uts46data
    # Collect the pieces and join once rather than growing a str with +=.
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # Rows for code points below 256 are addressed directly; higher
            # code points are located by bisection on the table.
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, 'Z')) - 1]
            status = uts46row[1]
            replacement = None  # type: Optional[str]
            if len(uts46row) == 3:
                replacement = uts46row[2]  # type: ignore
            if (status == 'V' or
                    (status == 'D' and not transitional) or
                    (status == '3' and not std3_rules and replacement is None)):
                pieces.append(char)
            elif replacement is not None and (status == 'M' or
                    (status == '3' and not std3_rules) or
                    (status == 'D' and transitional)):
                pieces.append(replacement)
            elif status != 'I':
                # Funnel "not allowed" into the same handler as a failed
                # table lookup.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                'Codepoint {} not allowed at position {} in {}'.format(
                _unot(code_point), pos + 1, repr(domain)))

    return unicodedata.normalize('NFC', ''.join(pieces))
338
339
def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
    """Encode a domain name to its ASCII form, label by label.

    Args:
        s: The domain. Bytes input must be pure ASCII.
        strict: When true only ``.`` separates labels; otherwise every
            separator matched by ``_unicode_dots_re`` is accepted.
        uts46: When true the input is first passed through uts46_remap().
        std3_rules: Forwarded to uts46_remap().
        transitional: Forwarded to uts46_remap().

    Returns:
        The encoded labels joined with ``.``. A trailing separator in the
        input is preserved.

    Raises:
        IDNAError: for non-ASCII bytes input, an empty domain, an empty
            label, a domain that is too long, or any error raised while
            encoding a label.
    """
    if isinstance(s, (bytes, bytearray)):
        try:
            s = s.decode('ascii')
        except UnicodeDecodeError:
            raise IDNAError('should pass a unicode string to the function rather than a byte string.')
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)
    trailing_dot = False
    result = []
    if strict:
        labels = s.split('.')
    else:
        labels = _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')
    if labels[-1] == '':
        # A final empty label means the domain ended with a separator.
        del labels[-1]
        trailing_dot = True
    for label in labels:
        encoded_label = alabel(label)
        if encoded_label:
            result.append(encoded_label)
        else:
            raise IDNAError('Empty label')
    if trailing_dot:
        # An empty final element makes join() emit the trailing dot.
        result.append(b'')
    encoded = b'.'.join(result)
    if not valid_string_length(encoded, trailing_dot):
        raise IDNAError('Domain too long')
    return encoded
371
372
def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
    """Decode a domain name to its Unicode form, label by label.

    Args:
        s: The domain. Bytes input must be pure ASCII.
        strict: When true only ``.`` separates labels; otherwise every
            separator matched by ``_unicode_dots_re`` is accepted.
        uts46: When true the input is first passed through uts46_remap(),
            always with transitional processing disabled.
        std3_rules: Forwarded to uts46_remap().

    Returns:
        The decoded labels joined with ``.``. A trailing separator in the
        input is preserved.

    Raises:
        IDNAError: for non-ASCII bytes input, an empty domain, an empty
            label, or any error raised while decoding a label.
    """
    try:
        if isinstance(s, (bytes, bytearray)):
            s = s.decode('ascii')
    except UnicodeDecodeError:
        raise IDNAError('Invalid ASCII in A-label')
    if uts46:
        s = uts46_remap(s, std3_rules, False)
    trailing_dot = False
    result = []
    if not strict:
        labels = _unicode_dots_re.split(s)
    else:
        labels = s.split('.')
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')
    if not labels[-1]:
        # A final empty label means the domain ended with a separator.
        del labels[-1]
        trailing_dot = True
    for label in labels:
        decoded_label = ulabel(label)
        if decoded_label:
            result.append(decoded_label)
        else:
            raise IDNAError('Empty label')
    if trailing_dot:
        # An empty final element makes join() emit the trailing dot.
        result.append('')
    return '.'.join(result)
bytes _punycode(str s)
Definition core.py:42
str _unot(int s)
Definition core.py:45
bool _is_script(str cp, str script)
Definition core.py:39
int _combining_class(int cp)
Definition core.py:32
for i