Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
utf1632prober.py
Go to the documentation of this file.
from typing import List, Union

from .charsetprober import CharSetProber
from .enums import ProbingState
25
26
"""
This class simply looks for occurrences of zero bytes, and infers
whether the file is UTF-16 or UTF-32 (little-endian or big-endian).
For instance, files looking like ( \0 \0 \0 [nonzero] )+
have a good probability to be UTF32BE. Files looking like ( \0 [nonzero] )+
may be guessed to be UTF16BE, and inversely for little-endian varieties.
"""
35
# How many logical characters must be scanned before a prediction is
# trusted; the is_likely_* checks return False below this count.
MIN_CHARS_FOR_DETECTION = 20
# Fixed ratio of expected zero / non-zero bytes at a given position (mod 4);
# the observed ratio must be strictly greater than this value.
EXPECTED_RATIO = 0.94
40
def __init__(self) -> None:
    """Create the prober with empty counters and no invalid flags set."""
    super().__init__()
    # Number of bytes consumed so far; position % 4 selects the counter slot.
    self.position = 0
    # Per-slot (byte offset mod 4) counts of zero and non-zero bytes.
    self.zeros_at_mod = [0] * 4
    self.nonzeros_at_mod = [0] * 4
    # NOTE(review): the ProbingState member name is not visible in this
    # listing; confirm it against the .enums module.
    self._state = ProbingState.DETECTING
    # The four most recent bytes, indexed by position % 4.
    self.quad = [0, 0, 0, 0]
    # Set once a byte sequence is seen that cannot be the given encoding.
    self.invalid_utf16be = False
    self.invalid_utf16le = False
    self.invalid_utf32be = False
    self.invalid_utf32le = False
    # True while a high surrogate has been seen and a low one must follow.
    self.first_half_surrogate_pair_detected_16be = False
    self.first_half_surrogate_pair_detected_16le = False
    self.reset()
55
def reset(self) -> None:
    """Return the prober to its initial state so it can scan new input."""
    super().reset()
    self.position = 0
    self.zeros_at_mod = [0] * 4
    self.nonzeros_at_mod = [0] * 4
    # NOTE(review): the ProbingState member name is not visible in this
    # listing; confirm it against the .enums module.
    self._state = ProbingState.DETECTING
    self.invalid_utf16be = False
    self.invalid_utf16le = False
    self.invalid_utf32be = False
    self.invalid_utf32le = False
    # Forget any half-seen surrogate pair from the previous input.
    self.first_half_surrogate_pair_detected_16be = False
    self.first_half_surrogate_pair_detected_16le = False
    self.quad = [0, 0, 0, 0]
69
@property
def charset_name(self) -> str:
    """
    Name of the encoding that currently looks most likely.

    The checks run in the order UTF-32BE, UTF-32LE, UTF-16BE, UTF-16LE and
    the first one that reports True wins. When none does, "utf-16" is
    returned so that callers always get a valid encoding name.
    """
    candidates = (
        (self.is_likely_utf32be, "utf-32be"),
        (self.is_likely_utf32le, "utf-32le"),
        (self.is_likely_utf16be, "utf-16be"),
        (self.is_likely_utf16le, "utf-16le"),
    )
    for is_likely, name in candidates:
        if is_likely():
            return name
    # default to something valid
    return "utf-16"
82
@property
def language(self) -> str:
    """Always the empty string: this prober does not report a language."""
    return ""
86
def approx_32bit_chars(self) -> float:
    """
    Approximate number of 4-byte characters consumed so far.

    Never less than 1.0, so the result is safe to use as a divisor.
    """
    return max(1.0, self.position / 4.0)
89
def approx_16bit_chars(self) -> float:
    """
    Approximate number of 2-byte characters consumed so far.

    Never less than 1.0, so the result is safe to use as a divisor.
    """
    return max(1.0, self.position / 2.0)
92
def is_likely_utf32be(self) -> bool:
    """
    Report whether the bytes seen so far look like big-endian UTF-32.

    True only when at least MIN_CHARS_FOR_DETECTION 4-byte characters have
    been scanned, no quad was flagged invalid for UTF-32BE, and the ratios
    of zero bytes at offsets 0, 1, 2 and of non-zero bytes at offset 3 all
    exceed EXPECTED_RATIO.
    """
    approx_chars = self.approx_32bit_chars()
    return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
        self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
        and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
        and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
        and self.nonzeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
        and not self.invalid_utf32be
    )
102
def is_likely_utf32le(self) -> bool:
    """
    Report whether the bytes seen so far look like little-endian UTF-32.

    True only when at least MIN_CHARS_FOR_DETECTION 4-byte characters have
    been scanned, no quad was flagged invalid for UTF-32LE, and the ratios
    of non-zero bytes at offset 0 and of zero bytes at offsets 1, 2, 3 all
    exceed EXPECTED_RATIO.
    """
    approx_chars = self.approx_32bit_chars()
    return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
        self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
        and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
        and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
        and self.zeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
        and not self.invalid_utf32le
    )
112
def is_likely_utf16be(self) -> bool:
    """
    Report whether the bytes seen so far look like big-endian UTF-16.

    True only when at least MIN_CHARS_FOR_DETECTION 2-byte characters have
    been scanned, no pair was flagged invalid for UTF-16BE, and the ratios
    of non-zero bytes at odd offsets (1, 3) and of zero bytes at even
    offsets (0, 2) both exceed EXPECTED_RATIO.
    """
    approx_chars = self.approx_16bit_chars()
    return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
        (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
        > self.EXPECTED_RATIO
        and (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars
        > self.EXPECTED_RATIO
        and not self.invalid_utf16be
    )
122
def is_likely_utf16le(self) -> bool:
    """
    Report whether the bytes seen so far look like little-endian UTF-16.

    True only when at least MIN_CHARS_FOR_DETECTION 2-byte characters have
    been scanned, no pair was flagged invalid for UTF-16LE, and the ratios
    of non-zero bytes at even offsets (0, 2) and of zero bytes at odd
    offsets (1, 3) both exceed EXPECTED_RATIO.
    """
    approx_chars = self.approx_16bit_chars()
    return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
        (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
        > self.EXPECTED_RATIO
        and (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars
        > self.EXPECTED_RATIO
        and not self.invalid_utf16le
    )
132
def validate_utf32_characters(self, quad: List[int]) -> None:
    """
    Validate if the quad of bytes is valid UTF-32.

    UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
    excluding 0x0000D800 - 0x0000DFFF

    The quad is read both as big-endian and as little-endian; each reading
    that falls outside the valid range sets the matching ``invalid_utf32*``
    flag. The flags are only ever set here, never cleared.

    https://en.wikipedia.org/wiki/UTF-32
    """
    # Big-endian: quad[0] is the most significant byte. The surrogate test
    # only matters when quad[0] is already known to be zero.
    if (
        quad[0] != 0
        or quad[1] > 0x10
        or (quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
    ):
        self.invalid_utf32be = True
    # Little-endian: quad[3] is the most significant byte.
    if (
        quad[3] != 0
        or quad[2] > 0x10
        or (quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
    ):
        self.invalid_utf32le = True
154
def validate_utf16_characters(self, pair: List[int]) -> None:
    """
    Validate if the pair of bytes is valid UTF-16.

    UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
    with an exception for surrogate pairs, which must be in the range
    0xD800-0xDBFF followed by 0xDC00-0xDFFF

    The pair is read both as big-endian (high byte ``pair[0]``) and as
    little-endian (high byte ``pair[1]``). For each reading a flag records
    that a high surrogate was seen, so the next pair must be a low
    surrogate; any other sequence sets the matching ``invalid_utf16*`` flag.

    https://en.wikipedia.org/wiki/UTF-16
    """
    # NOTE(review): the two first_half_surrogate_pair_detected_* attributes
    # must be initialised to False before the first call; their names are
    # not visible in this listing, confirm them against __init__/reset.
    if not self.first_half_surrogate_pair_detected_16be:
        if 0xD8 <= pair[0] <= 0xDB:
            # High surrogate: remember that a low surrogate must follow.
            self.first_half_surrogate_pair_detected_16be = True
        elif 0xDC <= pair[0] <= 0xDF:
            # Low surrogate without a preceding high surrogate.
            self.invalid_utf16be = True
    else:
        if 0xDC <= pair[0] <= 0xDF:
            # The pair is complete.
            self.first_half_surrogate_pair_detected_16be = False
        else:
            # High surrogate not followed by a low surrogate.
            self.invalid_utf16be = True

    # Same state machine for the little-endian reading.
    if not self.first_half_surrogate_pair_detected_16le:
        if 0xD8 <= pair[1] <= 0xDB:
            self.first_half_surrogate_pair_detected_16le = True
        elif 0xDC <= pair[1] <= 0xDF:
            self.invalid_utf16le = True
    else:
        if 0xDC <= pair[1] <= 0xDF:
            self.first_half_surrogate_pair_detected_16le = False
        else:
            self.invalid_utf16le = True
186
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
    """
    Consume ``byte_str`` one byte at a time and return the resulting state.

    Each byte is stored in ``self.quad`` at its offset modulo 4 and counted
    as zero or non-zero for that offset. Whenever a quad is complete
    (offset 3) it is validated once as UTF-32 and, split into two pairs, as
    UTF-16. The offset continues across calls via ``self.position``.
    """
    for c in byte_str:
        mod4 = self.position % 4
        self.quad[mod4] = c
        if mod4 == 3:
            self.validate_utf32_characters(self.quad)
            self.validate_utf16_characters(self.quad[0:2])
            self.validate_utf16_characters(self.quad[2:4])
        if c == 0:
            self.zeros_at_mod[mod4] += 1
        else:
            self.nonzeros_at_mod[mod4] += 1
        self.position += 1
    return self.state
201
@property
def state(self) -> ProbingState:
    """
    Current probing state, re-evaluated on every access.

    Once a terminal state has been reached it is returned unchanged.
    Otherwise the state becomes "found" as soon as get_confidence() exceeds
    0.80, or "not me" once more than 4 KiB have been consumed without
    reaching that confidence.
    """
    # NOTE(review): the ProbingState member names are not visible in this
    # listing; confirm them against the .enums module.
    if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
        # terminal, decided states
        return self._state
    if self.get_confidence() > 0.80:
        self._state = ProbingState.FOUND_IT
    elif self.position > 4 * 1024:
        # if we get to 4kb into the file, and we can't conclude it's UTF,
        # let's give up
        self._state = ProbingState.NOT_ME
    return self._state
214
def get_confidence(self) -> float:
    """
    Confidence that the input is UTF-16 or UTF-32.

    Returns 0.85 when any of the four is_likely_* checks succeeds and 0.0
    otherwise; there are no intermediate values.
    """
    looks_like_utf = (
        self.is_likely_utf16le()
        or self.is_likely_utf16be()
        or self.is_likely_utf32le()
        or self.is_likely_utf32be()
    )
    return 0.85 if looks_like_utf else 0.00
None validate_utf32_characters(self, List[int] quad)
None validate_utf16_characters(self, List[int] pair)
ProbingState feed(self, Union[bytes, bytearray] byte_str)
for i