Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
charsetprober.py
Go to the documentation of this file.
28
29import logging
30import re
31from typing import Optional, Union
32
33from .enums import LanguageFilter, ProbingState
34
# Matches one "international word": optional ASCII letters, at least one
# extended-ASCII byte (\x80-\xFF), optional trailing ASCII letters, and at
# most one trailing "marker" byte (anything that is neither a letter nor an
# extended-ASCII byte).  Used by filter_international_words() below.
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
)
38
39
41
42 SHORTCUT_THRESHOLD = 0.95
43
44 def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
46 self.active = True
47 self.lang_filter = lang_filter
48 self.logger = logging.getLogger(__name__)
49
50 def reset(self) -> None:
52
53 @property
54 def charset_name(self) -> Optional[str]:
55 return None
56
57 @property
58 def language(self) -> Optional[str]:
59 raise NotImplementedError
60
61 def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
62 raise NotImplementedError
63
64 @property
65 def state(self) -> ProbingState:
66 return self._state
67
68 def get_confidence(self) -> float:
69 return 0.0
70
71 @staticmethod
72 def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
73 buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
74 return buf
75
76 @staticmethod
77 def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
78 """
79 We define three types of bytes:
80 alphabet: english alphabets [a-zA-Z]
81 international: international characters [\x80-\xFF]
82 marker: everything else [^a-zA-Z\x80-\xFF]
83 The input buffer can be thought to contain a series of words delimited
84 by markers. This function works to filter all words that contain at
85 least one international character. All contiguous sequences of markers
86 are replaced by a single space ascii character.
87 This filter applies to all scripts which do not use English characters.
88 """
89 filtered = bytearray()
90
91 # This regex expression filters out only words that have at-least one
92 # international character. The word may include one marker character at
93 # the end.
95
96 for word in words:
97 filtered.extend(word[:-1])
98
99 # If the last character in the word is a marker, replace it with a
100 # space as markers shouldn't affect our analysis (they are used
101 # similarly across all languages and may thus have similar
102 # frequencies).
103 last_char = word[-1:]
104 if not last_char.isalpha() and last_char < b"\x80":
105 last_char = b" "
106 filtered.extend(last_char)
107
108 return filtered
109
110 @staticmethod
111 def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
112 """
113 Returns a copy of ``buf`` that retains only the sequences of English
114 alphabet and high byte characters that are not between <> characters.
115 This filter can be applied to all scripts which contain both English
116 characters and extended ASCII characters, but is currently only used by
117 ``Latin1Prober``.
118 """
119 filtered = bytearray()
120 in_tag = False
121 prev = 0
122 buf = memoryview(buf).cast("c")
123
124 for curr, buf_char in enumerate(buf):
125 # Check if we're coming out of or entering an XML tag
126
127 # https://github.com/python/typeshed/issues/8182
128 if buf_char == b">": # type: ignore[comparison-overlap]
129 prev = curr + 1
130 in_tag = False
131 # https://github.com/python/typeshed/issues/8182
132 elif buf_char == b"<": # type: ignore[comparison-overlap]
133 if curr > prev and not in_tag:
134 # Keep everything after last non-extended-ASCII,
135 # non-alphabetic character
136 filtered.extend(buf[prev:curr])
137 # Output a space to delimit stretch we kept
138 filtered.extend(b" ")
139 in_tag = True
140
141 # If we're not in a tag...
142 if not in_tag:
143 # Keep everything after last non-extended-ASCII, non-alphabetic
144 # character
145 filtered.extend(buf[prev:])
146
147 return filtered
bytearray filter_international_words(Union[bytes, bytearray] buf)
bytes filter_high_byte_only(Union[bytes, bytearray] buf)
bytes remove_xml_tags(Union[bytes, bytearray] buf)
None __init__(self, LanguageFilter lang_filter=LanguageFilter.NONE)
ProbingState feed(self, Union[bytes, bytearray] byte_str)
for i