Namespaces
namespace	big5freq

namespace	big5prober

namespace	chardistribution

namespace	charsetgroupprober

namespace	charsetprober

namespace	cli

namespace	codingstatemachine

namespace	codingstatemachinedict

namespace	cp949prober

namespace	enums

namespace	escprober

namespace	escsm

namespace	eucjpprober

namespace	euckrfreq

namespace	euckrprober

namespace	euctwfreq

namespace	euctwprober

namespace	gb2312freq

namespace	gb2312prober

namespace	hebrewprober

namespace	jisfreq

namespace	johabfreq

namespace	johabprober

namespace	jpcntx

namespace	langbulgarianmodel

namespace	langgreekmodel

namespace	langhebrewmodel

namespace	langhungarianmodel

namespace	langrussianmodel

namespace	langthaimodel

namespace	langturkishmodel

namespace	latin1prober

namespace	macromanprober

namespace	mbcharsetprober

namespace	mbcsgroupprober

namespace	mbcssm

namespace	metadata

namespace	resultdict

namespace	sbcharsetprober

namespace	sbcsgroupprober

namespace	sjisprober

namespace	universaldetector

namespace	utf1632prober

namespace	utf8prober

namespace	version

Functions
ResultDict	detect (Union[bytes, bytearray] byte_str, bool should_rename_legacy=False)

List[ResultDict]	detect_all (Union[bytes, bytearray] byte_str, bool ignore_threshold=False, bool should_rename_legacy=False)

Function Documentation

◆ detect()

ResultDict detect	(	Union[bytes, bytearray]	byte_str,
		bool	should_rename_legacy = `False`
	)

Detect the encoding of the given byte string.

:param byte_str:     The byte sequence to examine.
:type byte_str:      ``bytes`` or ``bytearray``
:param should_rename_legacy:  Whether to rename legacy encodings
                              to their more modern equivalents.
:type should_rename_legacy:   ``bool``

Definition at line 30 of file __init__.py.

) -> ResultDict:
    """
    Detect the encoding of the given byte string.
 
    :param byte_str:     The byte sequence to examine.
    :type byte_str:      ``bytes`` or ``bytearray``
    :param should_rename_legacy:  Should we rename legacy encodings
                                  to their more modern equivalents?
    :type should_rename_legacy:   ``bool``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError(
                f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
            )
        byte_str = bytearray(byte_str)
    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
    detector.feed(byte_str)
    return detector.close()
 
 

References UniversalDetector, UniversalDetector.feed(), and UniversalDetector.close().

◆ detect_all()

List[ResultDict] detect_all	(	Union[bytes, bytearray]	byte_str,
		bool	ignore_threshold = `False`,
		bool	should_rename_legacy = `False`
	)

Detect all the possible encodings of the given byte string.

:param byte_str:          The byte sequence to examine.
:type byte_str:           ``bytes`` or ``bytearray``
:param ignore_threshold:  Also include encodings whose confidence is
                          at or below ``UniversalDetector.MINIMUM_THRESHOLD``
                          in the results.
:type ignore_threshold:   ``bool``
:param should_rename_legacy:  Whether to rename legacy encodings
                              to their more modern equivalents.
:type should_rename_legacy:   ``bool``

Definition at line 53 of file __init__.py.

) -> List[ResultDict]:
    """
    Detect all the possible encodings of the given byte string.
 
    :param byte_str:          The byte sequence to examine.
    :type byte_str:           ``bytes`` or ``bytearray``
    :param ignore_threshold:  Include encodings that are below
                              ``UniversalDetector.MINIMUM_THRESHOLD``
                              in results.
    :type ignore_threshold:   ``bool``
    :param should_rename_legacy:  Should we rename legacy encodings
                                  to their more modern equivalents?
    :type should_rename_legacy:   ``bool``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError(
                f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
            )
        byte_str = bytearray(byte_str)
 
    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
    detector.feed(byte_str)
    detector.close()
 
    if detector.input_state == InputState.HIGH_BYTE:
        results: List[ResultDict] = []
        probers: List[CharSetProber] = []
        for prober in detector.charset_probers:
            if isinstance(prober, CharSetGroupProber):
                probers.extend(p for p in prober.probers)
            else:
                probers.append(prober)
        for prober in probers:
            if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD:
                charset_name = prober.charset_name or ""
                lower_charset_name = charset_name.lower()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes:
                    charset_name = detector.ISO_WIN_MAP.get(
                        lower_charset_name, charset_name
                    )
                # Rename legacy encodings with superset encodings if asked
                if should_rename_legacy:
                    charset_name = detector.LEGACY_MAP.get(
                        charset_name.lower(), charset_name
                    )
                results.append(
                    {
                        "encoding": charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                )
        if len(results) > 0:
            return sorted(results, key=lambda result: -result["confidence"])
 
    return [detector.result]

References CharSetGroupProber, CharSetProber, InputState, and UniversalDetector.

Namespaces

Functions

Function Documentation

◆ detect()

◆ detect_all()