21from typing
import List, Union
23from .charsetprober
import CharSetProber
24from .enums
import ProbingState
29 This class simply looks for occurrences of zero bytes, and infers
30 whether the file is UTF16 or UTF32 (little-endian or big-endian).
31 For instance, files looking like ( \0 \0 \0 [nonzero] )+
32 have a good probability of being UTF32BE. Files looking like ( \0 [nonzero] )+
33 may be guessed to be UTF16BE, and inversely for little-endian varieties.
37 MIN_CHARS_FOR_DETECTION = 20
68 self.
quad = [0, 0, 0, 0]
135 Validate if the quad of bytes is valid UTF-32.
137 UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
138 excluding 0x0000D800 - 0x0000DFFF
140 https://en.wikipedia.org/wiki/UTF-32
145 or (quad[0] == 0
and quad[1] == 0
and 0xD8 <= quad[2] <= 0xDF)
151 or (quad[3] == 0
and quad[2] == 0
and 0xD8 <= quad[1] <= 0xDF)
157 Validate if the pair of bytes is valid UTF-16.
159 UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
160 with an exception for surrogate pairs, which must be in the range
161 0xD800-0xDBFF followed by 0xDC00-0xDFFF
163 https://en.wikipedia.org/wiki/UTF-16
166 if 0xD8 <= pair[0] <= 0xDB:
168 elif 0xDC <= pair[0] <= 0xDF:
171 if 0xDC <= pair[0] <= 0xDF:
177 if 0xD8 <= pair[1] <= 0xDB:
179 elif 0xDC <= pair[1] <= 0xDF:
182 if 0xDC <= pair[1] <= 0xDF:
187 def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
float get_confidence(self)
float approx_32bit_chars(self)
int MIN_CHARS_FOR_DETECTION
None validate_utf32_characters(self, List[int] quad)
bool is_likely_utf16le(self)
bool is_likely_utf16be(self)
float approx_16bit_chars(self)
None validate_utf16_characters(self, List[int] pair)
bool is_likely_utf32be(self)
first_half_surrogate_pair_detected_16be
float get_confidence(self)
ProbingState feed(self, Union[bytes, bytearray] byte_str)
first_half_surrogate_pair_detected_16le
bool is_likely_utf32le(self)