31from typing
import Optional, Union
33from .enums
import LanguageFilter, ProbingState
36 b
"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
42 SHORTCUT_THRESHOLD = 0.95
59 raise NotImplementedError
61 def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
62 raise NotImplementedError
65 def state(self) -> ProbingState:
73 buf =
re.sub(b
"([\x00-\x7F])+", b
" ", buf)
79 We define three types of bytes:
80 alphabet: English letters [a-zA-Z]
81 international: international characters [\x80-\xFF]
82 marker: everything else [^a-zA-Z\x80-\xFF]
83 The input buffer can be thought of as a series of words delimited
84 by markers. This function keeps only the words that contain at
85 least one international character. All contiguous sequences of markers
86 are replaced by a single ASCII space character.
87 This filter applies to all scripts which do not use English characters.
103 last_char = word[-1:]
113 Returns a copy of ``buf`` that retains only the sequences of English
114 letters and high-byte characters that are not between <> characters.
115 This filter can be applied to all scripts which contain both English
116 characters and extended ASCII characters, but is currently only used by
132 elif buf_char == b
"<":
133 if curr > prev
and not in_tag:
bytearray filter_international_words(Union[bytes, bytearray] buf)
bytes filter_high_byte_only(Union[bytes, bytearray] buf)
bytes remove_xml_tags(Union[bytes, bytearray] buf)
Optional[str] language(self)
Optional[str] charset_name(self)
None __init__(self, LanguageFilter lang_filter=LanguageFilter.NONE)
float get_confidence(self)
ProbingState feed(self, Union[bytes, bytearray] byte_str)