universaldetector.py
28"""
29Module containing the UniversalDetector detector class, which is the primary
30class a user of ``chardet`` should use.
31
32:author: Mark Pilgrim (initial port to Python)
33:author: Shy Shalom (original C code)
34:author: Dan Blanchard (major refactoring for 3.0)
35:author: Ian Cordasco
36"""
37
38
39import codecs
40import logging
41import re
42from typing import List, Optional, Union
43
44from .charsetgroupprober import CharSetGroupProber
45from .charsetprober import CharSetProber
46from .enums import InputState, LanguageFilter, ProbingState
47from .escprober import EscCharSetProber
48from .latin1prober import Latin1Prober
49from .macromanprober import MacRomanProber
50from .mbcsgroupprober import MBCSGroupProber
51from .resultdict import ResultDict
52from .sbcsgroupprober import SBCSGroupProber
53from .utf1632prober import UTF1632Prober
54

class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    MINIMUM_THRESHOLD = 0.20
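    # HIGH_BYTE_DETECTOR matches any byte with the high bit set (non-ASCII);
    # ESC_DETECTOR matches the ESC byte that opens ISO-2022 escape sequences
    # and the "~{" marker that begins HZ-GB-2312; WIN_BYTE_DETECTOR matches
    # the C1 range 0x80-0x9F, where Windows code pages place printable
    # characters that the ISO-8859 encodings reserve for control codes.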
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }
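    # For example, with should_rename_legacy=True a document detected as
    # EUC-KR is reported as CP949, the Windows superset of EUC-KR (an
    # illustrative case; see LEGACY_MAP above).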

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states. This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
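
        For instance, to reuse a single detector across two documents (a
        sketch; ``detector`` and the ``*_bytes`` variables are placeholders):

        .. code::

                detector.feed(first_doc_bytes)
                detector.close()
                detector.reset()
                detector.feed(second_doc_bytes)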
140 """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
            You should always call ``close`` when you're done feeding in your
            document if ``done`` is not already ``True``.
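
        A minimal incremental-feeding sketch (``detector`` and ``chunks``, an
        iterable of ``bytes``, are placeholders):

        .. code::

                for chunk in chunks:
                    detector.feed(chunk)
                    if detector.done:
                        break
                detector.close()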
167 """
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {
                    "encoding": "UTF-8-SIG",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
            elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-3412",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-2143",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}

            self._got_data = True
            if self.result["encoding"] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            elif (
                self._input_state == InputState.PURE_ASCII
                and self.ESC_DETECTOR.search(self._last_char + byte_str)
            ):
                self._input_state = InputState.ESC_ASCII

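        # Keep the final byte of this chunk so that a two-byte "~{" marker
        # split across consecutive feed() calls is still visible to
        # ESC_DETECTOR above.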
        self._last_char = byte_str[-1:]

        # Next we will look to see if the data appears to be either a UTF-16
        # or UTF-32 encoding
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left. The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns: The ``result`` attribute, a ``dict`` with the keys
            `encoding`, `confidence`, and `language`.
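
        A typical successful result looks like this (the values here are
        illustrative):

        .. code::

                {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}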
292 """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            prober_confidence = None
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                assert charset_name is not None
                lower_charset_name = charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859"):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name
                        )
                # Rename legacy encodings with superset encodings if asked
                if self.should_rename_legacy:
                    charset_name = self.LEGACY_MAP.get(
                        (charset_name or "").lower(), charset_name
                    )
                self.result = {
                    "encoding": charset_name,
                    "confidence": confidence,
                    "language": max_prober.language,
                }

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result["encoding"] is None:
                self.logger.debug("no probers hit minimum threshold")
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug(
                                "%s %s confidence = %s",
                                prober.charset_name,
                                prober.language,
                                prober.get_confidence(),
                            )
                    else:
                        self.logger.debug(
                            "%s %s confidence = %s",
                            group_prober.charset_name,
                            group_prober.language,
                            group_prober.get_confidence(),
                        )
        return self.result