# Source listing of latin1prober.py, extracted from the generated
# documentation page titled "Let us walk on the 3-isogeny graph".
28
from typing import List, Union

from .charsetprober import CharSetProber
from .enums import ProbingState
33
# Number of frequency categories used by Latin1ClassModel (values 0..3);
# the prober keeps one counter per category.
FREQ_CAT_NUM = 4

# Character classes assigned to each byte value by Latin1_CharToClass.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
CLASS_NUM = 8  # total classes

# Byte value (0x00-0xFF) -> character class; one entry per byte, 8 per row.
# fmt: off
Latin1_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   # F8 - FF
)

# Likelihood of one character class following another, stored flat and
# indexed as (previous_class * CLASS_NUM) + current_class: each row is the
# previous class, each column the current class.
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = (
#   UDF OTH ASC ASS ACV ACO ASV ASO
    0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  # ASO
)
# fmt: on
98
99
# NOTE(review): the class statement was missing from this listing; the name
# is restored from the module name (latin1prober.py) and the base class from
# the CharSetProber import -- confirm against the upstream file.
class Latin1Prober(CharSetProber):
    """Charset prober that scores input against a Latin-1 class-transition model.

    Each byte is mapped to a character class via ``Latin1_CharToClass``; the
    transition from the previous class to the current one is looked up in
    ``Latin1ClassModel`` and tallied per frequency category.
    """

    def __init__(self) -> None:
        """Create the prober and put it into its initial state."""
        super().__init__()
        # Declared here so the attributes exist before reset() assigns them.
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Clear the per-category counters and the remembered character class."""
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM
        super().reset()

    @property
    def charset_name(self) -> str:
        """Name of the charset this prober reports."""
        return "ISO-8859-1"

    @property
    def language(self) -> str:
        """Language reported by this prober; always the empty string."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume *byte_str*, update the transition counters, return the state.

        Processing stops at the first transition the model marks as
        illegal (category 0).
        """
        byte_str = self.remove_xml_tags(byte_str)
        for c in byte_str:
            char_class = Latin1_CharToClass[c]
            freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class]
            if freq == 0:
                # Illegal transition: this input cannot be Latin-1.
                # NOTE(review): assumes the base class backs its `state`
                # property with `_state` -- confirm against CharSetProber.
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[freq] += 1
            self._last_char_class = char_class

        return self.state

    def get_confidence(self) -> float:
        """Return a confidence score derived from the transition counters.

        Returns 0.01 once the prober has ruled itself out. Otherwise the
        score is (very-likely count - 20 * very-unlikely count) / total,
        clamped at zero and then scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        total = sum(self._freq_counter)
        confidence = (
            0.0
            if total < 0.01  # nothing counted yet; avoids dividing by zero
            else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
        )
        confidence = max(confidence, 0.0)
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
        confidence *= 0.73
        return confidence
# Cross-referenced members from the documentation page:
#   remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes
#   feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState
for i