d5/d4b/utf1632prober_8py_source.html

from typing import List, Union


from .charsetprober import CharSetProber

from .enums import ProbingState


class UTF1632Prober(CharSetProber):
    r"""
    Guess UTF-16 / UTF-32 (either byte order) from where zero bytes occur.

    This class simply looks for occurrences of zero bytes, and infers
    whether the file is UTF16 or UTF32 (little-endian or big-endian).
    For instance, files looking like ( \0 \0 \0 [nonzero] )+
    have a good probability to be UTF32BE.  Files looking like ( \0 [nonzero] )+
    may be guessed to be UTF16BE, and inversely for little-endian varieties.

    Bytes are tallied by their offset modulo 4, and every completed group of
    four bytes is additionally checked for code units that would be illegal
    in each candidate encoding.
    """

    # how many logical characters to scan before feeling confident of prediction
    MIN_CHARS_FOR_DETECTION = 20
    # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
    EXPECTED_RATIO = 0.94

    def __init__(self) -> None:
        """Create the prober with all counters and validity flags cleared."""
        super().__init__()
        # total number of bytes consumed by feed() so far
        self.position = 0
        # zeros_at_mod[i] / nonzeros_at_mod[i]: count of zero / non-zero bytes
        # seen at offsets where (offset % 4) == i
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self._state = ProbingState.DETECTING
        # the four most recent bytes, indexed by (offset % 4)
        self.quad = [0, 0, 0, 0]
        # set once a byte sequence illegal for the given encoding was seen
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        # True while a high surrogate is waiting for its low surrogate
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.reset()

    def reset(self) -> None:
        """Return the prober to its initial DETECTING state."""
        super().reset()
        self.position = 0
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self._state = ProbingState.DETECTING
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.quad = [0, 0, 0, 0]

    @property
    def charset_name(self) -> str:
        """Name of the most likely encoding; UTF-32 is tested before UTF-16."""
        if self.is_likely_utf32be():
            return "utf-32be"
        if self.is_likely_utf32le():
            return "utf-32le"
        if self.is_likely_utf16be():
            return "utf-16be"
        if self.is_likely_utf16le():
            return "utf-16le"
        # default to something valid
        return "utf-16"

    @property
    def language(self) -> str:
        """This prober does not identify a language; always the empty string."""
        return ""

    def approx_32bit_chars(self) -> float:
        """Estimated count of 4-byte characters seen so far (never below 1.0)."""
        # the 1.0 floor keeps the ratio divisions below away from zero
        return max(1.0, self.position / 4.0)

    def approx_16bit_chars(self) -> float:
        """Estimated count of 2-byte characters seen so far (never below 1.0)."""
        return max(1.0, self.position / 2.0)

    def is_likely_utf32be(self) -> bool:
        """True if offsets 0-2 are mostly zero, offset 3 mostly non-zero,
        enough characters were seen and no invalid UTF-32BE unit was found."""
        approx_chars = self.approx_32bit_chars()
        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
            self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
            and self.nonzeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
            and not self.invalid_utf32be
        )

    def is_likely_utf32le(self) -> bool:
        """True if offset 0 is mostly non-zero, offsets 1-3 mostly zero,
        enough characters were seen and no invalid UTF-32LE unit was found."""
        approx_chars = self.approx_32bit_chars()
        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
            self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
            and not self.invalid_utf32le
        )

    def is_likely_utf16be(self) -> bool:
        """True if even offsets are mostly zero, odd offsets mostly non-zero,
        enough characters were seen and no invalid UTF-16BE unit was found."""
        approx_chars = self.approx_16bit_chars()
        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
            (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
            > self.EXPECTED_RATIO
            and (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars
            > self.EXPECTED_RATIO
            and not self.invalid_utf16be
        )

    def is_likely_utf16le(self) -> bool:
        """True if even offsets are mostly non-zero, odd offsets mostly zero,
        enough characters were seen and no invalid UTF-16LE unit was found."""
        approx_chars = self.approx_16bit_chars()
        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
            (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
            > self.EXPECTED_RATIO
            and (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars
            > self.EXPECTED_RATIO
            and not self.invalid_utf16le
        )

    def validate_utf32_characters(self, quad: List[int]) -> None:
        """
        Validate if the quad of bytes is valid UTF-32.

        UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
        excluding 0x0000D800 - 0x0000DFFF

        The quad is checked under both byte orders; a violation sets
        ``invalid_utf32be`` and/or ``invalid_utf32le``.  Flags are never
        cleared here, only by ``reset()``.

        https://en.wikipedia.org/wiki/UTF-32
        """
        # big-endian reading: quad[0] is the most significant byte
        if (
            quad[0] != 0
            or quad[1] > 0x10
            or (quad[0] == 0 and quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
        ):
            self.invalid_utf32be = True
        # little-endian reading: quad[3] is the most significant byte
        if (
            quad[3] != 0
            or quad[2] > 0x10
            or (quad[3] == 0 and quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
        ):
            self.invalid_utf32le = True

    def validate_utf16_characters(self, pair: List[int]) -> None:
        """
        Validate if the pair of bytes is valid UTF-16.

        UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
        with an exception for surrogate pairs, which must be in the range
        0xD800-0xDBFF followed by 0xDC00-0xDFFF

        The pair is checked under both byte orders: ``pair[0]`` is treated as
        the high byte for big-endian and ``pair[1]`` for little-endian.  A
        violation sets ``invalid_utf16be`` and/or ``invalid_utf16le``.

        https://en.wikipedia.org/wiki/UTF-16
        """
        if not self.first_half_surrogate_pair_detected_16be:
            if 0xD8 <= pair[0] <= 0xDB:
                # high surrogate: the next unit must be a low surrogate
                self.first_half_surrogate_pair_detected_16be = True
            elif 0xDC <= pair[0] <= 0xDF:
                # low surrogate without a preceding high surrogate
                self.invalid_utf16be = True
        else:
            if 0xDC <= pair[0] <= 0xDF:
                self.first_half_surrogate_pair_detected_16be = False
            else:
                # high surrogate not followed by a low surrogate
                self.invalid_utf16be = True

        if not self.first_half_surrogate_pair_detected_16le:
            if 0xD8 <= pair[1] <= 0xDB:
                self.first_half_surrogate_pair_detected_16le = True
            elif 0xDC <= pair[1] <= 0xDF:
                self.invalid_utf16le = True
        else:
            if 0xDC <= pair[1] <= 0xDF:
                self.first_half_surrogate_pair_detected_16le = False
            else:
                self.invalid_utf16le = True

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Consume ``byte_str`` one byte at a time and return the updated state.

        Each byte is tallied as zero / non-zero at its offset modulo 4.
        Whenever a group of four bytes is complete it is validated as one
        UTF-32 unit and as two consecutive UTF-16 units.
        """
        for c in byte_str:
            mod4 = self.position % 4
            self.quad[mod4] = c
            if mod4 == 3:
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[0:2])
                self.validate_utf16_characters(self.quad[2:4])
            if c == 0:
                self.zeros_at_mod[mod4] += 1
            else:
                self.nonzeros_at_mod[mod4] += 1
            self.position += 1
        return self.state

    @property
    def state(self) -> ProbingState:
        """
        Current probing state, re-evaluated on every access.

        NOT_ME and FOUND_IT are sticky.  Otherwise the state becomes FOUND_IT
        once the confidence exceeds 0.80, or NOT_ME once more than 4 KiB has
        been consumed without reaching that confidence.
        """
        if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
            # terminal, decided states
            return self._state
        if self.get_confidence() > 0.80:
            self._state = ProbingState.FOUND_IT
        elif self.position > 4 * 1024:
            # if we get to 4kb into the file, and we can't conclude it's UTF,
            # let's give up
            self._state = ProbingState.NOT_ME
        return self._state

    def get_confidence(self) -> float:
        """Return 0.85 if any UTF-16/UTF-32 variant looks likely, else 0.0."""
        return (
            0.85
            if (
                self.is_likely_utf16le()
                or self.is_likely_utf16be()
                or self.is_likely_utf32le()
                or self.is_likely_utf32be()
            )
            else 0.00
        )


pip._vendor.chardet.charsetprober.CharSetProber
Definition charsetprober.py:40

pip._vendor.chardet.charsetprober.CharSetProber.reset
None reset(self)
Definition charsetprober.py:50

pip._vendor.chardet.charsetprober.CharSetProber.state
ProbingState state(self)
Definition charsetprober.py:65

pip._vendor.chardet.charsetprober.CharSetProber.get_confidence
float get_confidence(self)
Definition charsetprober.py:68

pip._vendor.chardet.charsetprober.CharSetProber._state
_state
Definition charsetprober.py:45

pip._vendor.chardet.utf1632prober.UTF1632Prober
Definition utf1632prober.py:27

pip._vendor.chardet.utf1632prober.UTF1632Prober.zeros_at_mod
zeros_at_mod
Definition utf1632prober.py:44

pip._vendor.chardet.utf1632prober.UTF1632Prober.approx_32bit_chars
float approx_32bit_chars(self)
Definition utf1632prober.py:87

pip._vendor.chardet.utf1632prober.UTF1632Prober.MIN_CHARS_FOR_DETECTION
int MIN_CHARS_FOR_DETECTION
Definition utf1632prober.py:37

pip._vendor.chardet.utf1632prober.UTF1632Prober.invalid_utf32be
invalid_utf32be
Definition utf1632prober.py:50

pip._vendor.chardet.utf1632prober.UTF1632Prober.charset_name
str charset_name(self)
Definition utf1632prober.py:71

pip._vendor.chardet.utf1632prober.UTF1632Prober.position
position
Definition utf1632prober.py:43

pip._vendor.chardet.utf1632prober.UTF1632Prober.validate_utf32_characters
None validate_utf32_characters(self, List[int] quad)
Definition utf1632prober.py:133

pip._vendor.chardet.utf1632prober.UTF1632Prober.is_likely_utf16le
bool is_likely_utf16le(self)
Definition utf1632prober.py:123

pip._vendor.chardet.utf1632prober.UTF1632Prober.invalid_utf32le
invalid_utf32le
Definition utf1632prober.py:51

pip._vendor.chardet.utf1632prober.UTF1632Prober.reset
None reset(self)
Definition utf1632prober.py:56

pip._vendor.chardet.utf1632prober.UTF1632Prober.is_likely_utf16be
bool is_likely_utf16be(self)
Definition utf1632prober.py:113

pip._vendor.chardet.utf1632prober.UTF1632Prober.nonzeros_at_mod
nonzeros_at_mod
Definition utf1632prober.py:45

pip._vendor.chardet.utf1632prober.UTF1632Prober.approx_16bit_chars
float approx_16bit_chars(self)
Definition utf1632prober.py:90

pip._vendor.chardet.utf1632prober.UTF1632Prober.invalid_utf16le
invalid_utf16le
Definition utf1632prober.py:49

pip._vendor.chardet.utf1632prober.UTF1632Prober.validate_utf16_characters
None validate_utf16_characters(self, List[int] pair)
Definition utf1632prober.py:155

pip._vendor.chardet.utf1632prober.UTF1632Prober.state
ProbingState state(self)
Definition utf1632prober.py:203

pip._vendor.chardet.utf1632prober.UTF1632Prober.__init__
None __init__(self)
Definition utf1632prober.py:41

pip._vendor.chardet.utf1632prober.UTF1632Prober.EXPECTED_RATIO
float EXPECTED_RATIO
Definition utf1632prober.py:39

pip._vendor.chardet.utf1632prober.UTF1632Prober.language
str language(self)
Definition utf1632prober.py:84

pip._vendor.chardet.utf1632prober.UTF1632Prober.is_likely_utf32be
bool is_likely_utf32be(self)
Definition utf1632prober.py:93

pip._vendor.chardet.utf1632prober.UTF1632Prober.first_half_surrogate_pair_detected_16be
first_half_surrogate_pair_detected_16be
Definition utf1632prober.py:52

pip._vendor.chardet.utf1632prober.UTF1632Prober.get_confidence
float get_confidence(self)
Definition utf1632prober.py:215

pip._vendor.chardet.utf1632prober.UTF1632Prober.invalid_utf16be
invalid_utf16be
Definition utf1632prober.py:48

pip._vendor.chardet.utf1632prober.UTF1632Prober.feed
ProbingState feed(self, Union[bytes, bytearray] byte_str)
Definition utf1632prober.py:187

pip._vendor.chardet.utf1632prober.UTF1632Prober.first_half_surrogate_pair_detected_16le
first_half_surrogate_pair_detected_16le
Definition utf1632prober.py:53

pip._vendor.chardet.utf1632prober.UTF1632Prober.is_likely_utf32le
bool is_likely_utf32le(self)
Definition utf1632prober.py:103

pip._vendor.chardet.utf1632prober.UTF1632Prober._state
_state
Definition utf1632prober.py:46

pip._vendor.chardet.utf1632prober.UTF1632Prober.quad
quad
Definition utf1632prober.py:47

i
for i
Definition prime_search.m:10