Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
hebrewprober.py
Go to the documentation of this file.
27
28from typing import Optional, Union
29
30from .charsetprober import CharSetProber
31from .enums import ProbingState
32from .sbcharsetprober import SingleByteCharSetProber
33
34# This prober doesn't actually recognize a language or a charset.
35# It is a helper prober for the use of the Hebrew model probers
36
37
130
131
133 SPACE = 0x20
134 # windows-1255 / ISO-8859-8 code points of interest
135 FINAL_KAF = 0xEA
136 NORMAL_KAF = 0xEB
137 FINAL_MEM = 0xED
138 NORMAL_MEM = 0xEE
139 FINAL_NUN = 0xEF
140 NORMAL_NUN = 0xF0
141 FINAL_PE = 0xF3
142 NORMAL_PE = 0xF4
143 FINAL_TSADI = 0xF5
144 NORMAL_TSADI = 0xF6
145
146 # Minimum Visual vs Logical final letter score difference.
147 # If the difference is below this, don't rely solely on the final letter score
148 # distance.
149 MIN_FINAL_CHAR_DISTANCE = 5
150
151 # Minimum Visual vs Logical model score difference.
152 # If the difference is below this, don't rely at all on the model score
153 # distance.
154 MIN_MODEL_DISTANCE = 0.01
155
156 VISUAL_HEBREW_NAME = "ISO-8859-8"
157 LOGICAL_HEBREW_NAME = "windows-1255"
158
159 def __init__(self) -> None:
160 super().__init__()
163 self._prev = self.SPACE
164 self._before_prev = self.SPACE
165 self._logical_prober: Optional[SingleByteCharSetProber] = None
166 self._visual_prober: Optional[SingleByteCharSetProber] = None
167 self.reset()
168
169 def reset(self) -> None:
172 # The two last characters seen in the previous buffer,
173 # self._prev and self._before_prev, are initialized to space in order to
174 # simulate a word delimiter at the beginning of the data
175 self._prev = self.SPACE
176 self._before_prev = self.SPACE
177 # These probers are owned by the group prober.
178
180 self,
181 logical_prober: SingleByteCharSetProber,
182 visual_prober: SingleByteCharSetProber,
183 ) -> None:
184 self._logical_prober = logical_prober
185 self._visual_prober = visual_prober
186
187 def is_final(self, c: int) -> bool:
188 return c in [
194 ]
195
196 def is_non_final(self, c: int) -> bool:
197 # The normal Tsadi is not a good Non-Final letter due to words like
198 # 'lechotet' (to chat) containing an apostrophe after the tsadi. This
199 # apostrophe is converted to a space by filter_high_byte_only()
200 # causing the Non-Final tsadi to appear at an end of a word even
201 # though this is not the case in the original text.
202 # The letters Pe and Kaf rarely display a related behavior of not being
203 # a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
204 # for example legally end with a Non-Final Pe or Kaf. However, the
205 # benefit of these letters as Non-Final letters outweighs the damage
206 # since these words are quite rare.
208
209 def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
210 # Final letter analysis for logical-visual decision.
211 # Look for evidence that the received buffer is either logical Hebrew
212 # or visual Hebrew.
213 # The following cases are checked:
214 # 1) A word longer than 1 letter, ending with a final letter. This is
215 # an indication that the text is laid out "naturally" since the
216 # final letter really appears at the end. +1 for logical score.
217 # 2) A word longer than 1 letter, ending with a Non-Final letter. In
218 # normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
219 # should not end with the Non-Final form of that letter. Exceptions
220 # to this rule are mentioned above in is_non_final(). This is an
221 # indication that the text is laid out backwards. +1 for visual
222 # score
223 # 3) A word longer than 1 letter, starting with a final letter. Final
224 # letters should not appear at the beginning of a word. This is an
225 # indication that the text is laid out backwards. +1 for visual
226 # score.
227 #
228 # The visual score and logical score are accumulated throughout the
229 # text and are finally checked against each other in charset_name.
230 # No checking for final letters in the middle of words is done since
231 # that case is not an indication for either Logical or Visual text.
232 #
233 # We automatically filter out all 7-bit characters (replace them with
234 # spaces) so the word boundary detection works properly. [MAP]
235
237 # Both model probers say it's not them. No reason to continue.
239
240 byte_str = self.filter_high_byte_only(byte_str)
241
242 for cur in byte_str:
243 if cur == self.SPACE:
244 # We stand on a space - a word just ended
245 if self._before_prev != self.SPACE:
246 # next-to-last char was not a space so self._prev is not a
247 # 1 letter word
248 if self.is_final(self._prev):
249 # case (1) [-2:not space][-1:final letter][cur:space]
251 elif self.is_non_final(self._prev):
252 # case (2) [-2:not space][-1:Non-Final letter][
253 # cur:space]
255 else:
256 # Not standing on a space
257 if (
258 (self._before_prev == self.SPACE)
259 and (self.is_final(self._prev))
260 and (cur != self.SPACE)
261 ):
262 # case (3) [-2:space][-1:final letter][cur:not space]
264 self._before_prev = self._prev
265 self._prev = cur
266
267 # Forever detecting, till the end or until both model probers return
268 # ProbingState.NOT_ME (handled above)
270
271 @property
272 def charset_name(self) -> str:
273 assert self._logical_prober is not None
274 assert self._visual_prober is not None
275
276 # Make the decision: is it Logical or Visual?
277 # If the final letter score distance is dominant enough, rely on it.
279 if finalsub >= self.MIN_FINAL_CHAR_DISTANCE:
280 return self.LOGICAL_HEBREW_NAME
281 if finalsub <= -self.MIN_FINAL_CHAR_DISTANCE:
282 return self.VISUAL_HEBREW_NAME
283
284 # It's not dominant enough, try to rely on the model scores instead.
285 modelsub = (
287 )
288 if modelsub > self.MIN_MODEL_DISTANCE:
289 return self.LOGICAL_HEBREW_NAME
290 if modelsub < -self.MIN_MODEL_DISTANCE:
291 return self.VISUAL_HEBREW_NAME
292
293 # Still no good, back to final letter distance, maybe it'll save the
294 # day.
295 if finalsub < 0.0:
296 return self.VISUAL_HEBREW_NAME
297
298 # (finalsub > 0 - Logical) or (don't know what to do) default to
299 # Logical.
300 return self.LOGICAL_HEBREW_NAME
301
302 @property
303 def language(self) -> str:
304 return "Hebrew"
305
306 @property
307 def state(self) -> ProbingState:
308 assert self._logical_prober is not None
309 assert self._visual_prober is not None
310
311 # Remain active as long as any of the model probers are active.
312 if (self._logical_prober.state == ProbingState.NOT_ME) and (
314 ):
bytes filter_high_byte_only(Union[bytes, bytearray] buf)
General ideas of the Hebrew charset recognition.
None set_model_probers(self, SingleByteCharSetProber logical_prober, SingleByteCharSetProber visual_prober)
ProbingState feed(self, Union[bytes, bytearray] byte_str)
for i