Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
sbcharsetprober.py
Go to the documentation of this file.
28
29from typing import Dict, List, NamedTuple, Optional, Union
30
31from .charsetprober import CharSetProber
32from .enums import CharacterCategory, ProbingState, SequenceLikelihood
33
34
# Immutable record (typing.NamedTuple) bundling the data that one single-byte
# charset/language model hands to the prober code shown in this listing.
35class SingleByteCharSetModel(NamedTuple):
36 charset_name: str  # returned by the prober's charset_name property when no name prober is set
37 language: str  # returned by the prober's language property when no name prober is set
38 char_to_order_map: Dict[int, int]  # feed() binds this to a local before its byte loop; presumably the byte -> order lookup (that line is missing from this listing)
39 language_model: Dict[int, Dict[int, int]]  # feed() indexes it with two orders; the result selects which sequence counter to increment
40 typical_positive_ratio: float  # divisor applied to the sequence ratio in get_confidence()
41 keep_ascii_letters: bool  # False: feed() applies filter_international_words; True: feed() applies remove_xml_tags
42 alphabet: str  # not read by any code visible here (see the TODO in feed); purpose to be confirmed
43
44
46 SAMPLE_SIZE = 64
47 SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
48 POSITIVE_SHORTCUT_THRESHOLD = 0.95
49 NEGATIVE_SHORTCUT_THRESHOLD = 0.05
50
52 self,
53 model: SingleByteCharSetModel,
54 is_reversed: bool = False,
55 name_prober: Optional[CharSetProber] = None,
56 ) -> None:
57 super().__init__()
58 self._model = model
59 # TRUE if we need to reverse every pair in the model lookup
60 self._reversed = is_reversed
61 # Optional auxiliary prober for name decision
62 self._name_prober = name_prober
63 self._last_order = 255
64 self._seq_counters: List[int] = []
65 self._total_seqs = 0
66 self._total_char = 0
68 self._freq_char = 0
69 self.reset()
70
71 def reset(self) -> None:
72 super().reset()
73 # char order of last character
74 self._last_order = 255
76 self._total_seqs = 0
77 self._total_char = 0
78 self._control_char = 0
79 # characters that fall in our sampling range
80 self._freq_char = 0
81
# Charset name reported by this prober: the auxiliary name prober's answer
# when one was supplied (and is truthy), otherwise the model's own name.
82 @property
83 def charset_name(self) -> Optional[str]:
84 if self._name_prober:  # delegate to the optional auxiliary prober
85 return self._name_prober.charset_name
86 return self._model.charset_name  # fall back to the model's charset_name
87
# Language reported by this prober: the auxiliary name prober's answer
# when one was supplied (and is truthy), otherwise the model's own language.
88 @property
89 def language(self) -> Optional[str]:
90 if self._name_prober:  # delegate to the optional auxiliary prober
91 return self._name_prober.language
92 return self._model.language  # fall back to the model's language
93
94 def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
95 # TODO: Make filter_international_words keep things in self.alphabet
96 if not self._model.keep_ascii_letters:
97 byte_str = self.filter_international_words(byte_str)
98 else:
99 byte_str = self.remove_xml_tags(byte_str)
100 if not byte_str:
101 return self.state
102 char_to_order_map = self._model.char_to_order_map
103 language_model = self._model.language_model
104 for char in byte_str:
106 # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
107 # CharacterCategory.SYMBOL is actually 253, so we use CONTROL
108 # to make it closer to the original intent. The only difference
109 # is whether or not we count digits and control characters for
110 # _total_char purposes.
111 if order < CharacterCategory.CONTROL:
112 self._total_char += 1
113 if order < self.SAMPLE_SIZE:
114 self._freq_char += 1
115 if self._last_order < self.SAMPLE_SIZE:
116 self._total_seqs += 1
117 if not self._reversed:
118 lm_cat = language_model[self._last_order][order]
119 else:
120 lm_cat = language_model[order][self._last_order]
121 self._seq_counters[lm_cat] += 1
122 self._last_order = order
123
124 charset_name = self._model.charset_name
127 confidence = self.get_confidence()
128 if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
129 self.logger.debug(
130 "%s confidence = %s, we have a winner", charset_name, confidence
131 )
134 self.logger.debug(
135 "%s confidence = %s, below negative shortcut threshold %s",
136 charset_name,
137 confidence,
139 )
141
142 return self.state
143
144 def get_confidence(self) -> float:
145 r = 0.01
146 if self._total_seqs > 0:
147 r = (
148 (
151 )
152 / self._total_seqs
153 / self._model.typical_positive_ratio
154 )
155 # The more control characters (proportionally to the size
156 # of the text), the less confident we become in the current
157 # charset.
158 r = r * (self._total_char - self._control_char) / self._total_char
159 r = r * self._freq_char / self._total_char
160 if r >= 1.0:
161 r = 0.99
162 return r
bytearray filter_international_words(Union[bytes, bytearray] buf)
bytes remove_xml_tags(Union[bytes, bytearray] buf)
None __init__(self, SingleByteCharSetModel model, bool is_reversed=False, Optional[CharSetProber] name_prober=None)
ProbingState feed(self, Union[bytes, bytearray] byte_str)
for i