Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
macromanprober.py
Go to the documentation of this file.
30
31from typing import List, Union
32
33from .charsetprober import CharSetProber
34from .enums import ProbingState
35
# Number of frequency categories a class-pair transition can fall into
# (the values 0..3 stored in MacRomanClassModel below).
FREQ_CAT_NUM = 4

# Character classes assigned to each byte value by MacRoman_CharToClass.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
ODD = 8  # character that is unlikely to appear
CLASS_NUM = 9  # total classes

# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable. This should let MacRoman get out of the way of more likely
# encodings in most situations.

# Byte value (0x00 - 0xFF) -> character class; one row per eight bytes.
# fmt: off
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)

# Likelihood of one character class following another, stored as a flat
# CLASS_NUM x CLASS_NUM matrix: the row is the previous character's class,
# the column is the current character's class.
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
#   UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0,  0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  1,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  1,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  1,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  1,  # ASO
    0,  1,  1,  1,  1,  1,  1,  1,  1,  # ODD
)
# fmt: on
107
108
class MacRomanProber(CharSetProber):
    """Prober that scores how plausible a byte stream is as MacRoman text.

    Each byte is mapped to a character class through MacRoman_CharToClass,
    and every (previous class, current class) pair is looked up in
    MacRomanClassModel. The resulting frequency categories are tallied and
    turned into a confidence value by get_confidence().
    """

    def __init__(self) -> None:
        super().__init__()
        # Both attributes are given their real starting values by reset();
        # they are declared here so they always exist on the instance.
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Return the prober to its initial state so it can be reused."""
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM

        # express the prior that MacRoman is a somewhat rare encoding;
        # this can be done by starting out in a slightly improbable state
        # that must be overcome
        self._freq_counter[2] = 10

        super().reset()

    @property
    def charset_name(self) -> str:
        """Name of the encoding this prober detects."""
        return "MacRoman"

    @property
    def language(self) -> str:
        """Language tied to the encoding; empty because none is reported."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of input and update the transition tallies.

        Args:
            byte_str: Raw bytes to analyse. They are first passed through
                the inherited remove_xml_tags() helper.

        Returns:
            The prober state after processing the chunk. It becomes
            ProbingState.NOT_ME as soon as an illegal class transition
            (model value 0) is encountered.
        """
        byte_str = self.remove_xml_tags(byte_str)
        for c in byte_str:
            char_class = MacRoman_CharToClass[c]
            # The model is a flat CLASS_NUM x CLASS_NUM matrix indexed by
            # (previous class, current class).
            freq = MacRomanClassModel[(self._last_char_class * CLASS_NUM) + char_class]
            if freq == 0:
                # An illegal transition rules this encoding out; record that
                # and stop scanning the rest of the chunk.
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[freq] += 1
            self._last_char_class = char_class

        return self.state

    def get_confidence(self) -> float:
        """Return the confidence that the data seen so far is MacRoman.

        Returns:
            0.01 once the prober has been ruled out; otherwise a
            non-negative score derived from the transition tallies and
            scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        total = sum(self._freq_counter)
        # Each "very unlikely" transition cancels out twenty "very likely"
        # ones; the total < 0.01 check avoids dividing by zero.
        confidence = (
            0.0
            if total < 0.01
            else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
        )
        confidence = max(confidence, 0.0)
        # lower the confidence of MacRoman so that other more accurate
        # detector can take priority.
        confidence *= 0.73
        return confidence
bytes remove_xml_tags(Union[bytes, bytearray] buf)
ProbingState feed(self, Union[bytes, bytearray] byte_str)
for i