Add should_rename_legacy flag
dan-blanchard committed Jul 17, 2022
1 parent 023e7ea commit 5b5e841
Showing 4 changed files with 120 additions and 14 deletions.
23 changes: 19 additions & 4 deletions chardet/__init__.py
@@ -27,26 +27,33 @@
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]


-def detect(byte_str: Union[bytes, bytearray]) -> FinalResultDict:
+def detect(
+    byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
+) -> FinalResultDict:
"""
Detect the encoding of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
+    :param should_rename_legacy: Should we rename legacy encodings
+                                 to their more modern equivalents?
+    :type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
return detector.close()


def detect_all(
-    byte_str: Union[bytes, bytearray], ignore_threshold: bool = False
+    byte_str: Union[bytes, bytearray],
+    ignore_threshold: bool = False,
+    should_rename_legacy: bool = False,
) -> List[IntermediateResultDict]:
"""
Detect all the possible encodings of the given byte string.
@@ -57,6 +64,9 @@ def detect_all(
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
+    :param should_rename_legacy: Should we rename legacy encodings
+                                 to their more modern equivalents?
+    :type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
@@ -65,7 +75,7 @@
)
byte_str = bytearray(byte_str)

-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
detector.close()

@@ -87,6 +97,11 @@ def detect_all(
charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
+                # Rename legacy encodings with superset encodings if asked
+                if should_rename_legacy:
+                    charset_name = detector.LEGACY_MAP.get(
+                        charset_name.lower(), charset_name
+                    )
results.append(
{
"encoding": charset_name,
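
Taken together, the changes to chardet/__init__.py add one new keyword argument to both detect() and detect_all(). A minimal usage sketch of the new flag (the printed results are illustrative; exact confidence values depend on the input):

    import chardet

    data = "Hello, world".encode("ascii")

    # Default behavior is unchanged: pure ASCII is still reported as ascii.
    print(chardet.detect(data))
    # e.g. {'encoding': 'ascii', 'confidence': 1.0, 'language': ''}

    # With the flag, legacy names are renamed to their modern supersets
    # via UniversalDetector.LEGACY_MAP, so ascii becomes Windows-1252.
    print(chardet.detect(data, should_rename_legacy=True))
    # e.g. {'encoding': 'Windows-1252', 'confidence': 1.0, 'language': ''}
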
22 changes: 19 additions & 3 deletions chardet/cli/chardetect.py
@@ -22,7 +22,10 @@


def description_of(
-    lines: Iterable[bytes], name: str = "stdin", minimal: bool = False
+    lines: Iterable[bytes],
+    name: str = "stdin",
+    minimal: bool = False,
+    should_rename_legacy: bool = False,
) -> Optional[str]:
"""
Return a string describing the probable encoding of a file or
@@ -32,8 +35,11 @@ def description_of(
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
+    :param should_rename_legacy: Should we rename legacy encodings to
+                                 their more modern equivalents?
+    :type should_rename_legacy: ``bool``
"""
-    u = UniversalDetector()
+    u = UniversalDetector(should_rename_legacy=should_rename_legacy)
for line in lines:
line = bytearray(line)
u.feed(line)
@@ -75,6 +81,12 @@ def main(argv: Optional[List[str]] = None) -> None:
help="Print only the encoding to standard output",
action="store_true",
)
+    parser.add_argument(
+        "-l",
+        "--legacy",
+        help="Rename legacy encodings to more modern ones.",
+        action="store_true",
+    )
parser.add_argument(
"--version", action="version", version=f"%(prog)s {__version__}"
)
@@ -89,7 +101,11 @@ def main(argv: Optional[List[str]] = None) -> None:
"--help\n",
file=sys.stderr,
)
-        print(description_of(f, f.name, minimal=args.minimal))
+        print(
+            description_of(
+                f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
+            )
+        )


if __name__ == "__main__":
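
The new -l/--legacy command-line flag feeds args.legacy through to description_of(), so `chardetect --legacy somefile.txt` reports renamed encodings. The helper can also be called directly; a short sketch, assuming somefile.txt exists and contains ASCII bytes (the printed line is illustrative):

    from chardet.cli.chardetect import description_of

    with open("somefile.txt", "rb") as f:
        print(description_of(f, f.name, should_rename_legacy=True))
    # e.g. somefile.txt: Windows-1252 with confidence 1.0
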
26 changes: 25 additions & 1 deletion chardet/universaldetector.py
@@ -84,8 +84,24 @@ class UniversalDetector:
"iso-8859-9": "Windows-1254",
"iso-8859-13": "Windows-1257",
}
+    # Based on https://encoding.spec.whatwg.org/#names-and-labels
+    # but altered to match Python names for encodings and remove mappings
+    # that break tests.
+    LEGACY_MAP = {
+        "ascii": "Windows-1252",
+        "iso-8859-1": "Windows-1252",
+        "tis-620": "ISO-8859-11",
+        "iso-8859-9": "Windows-1254",
+        "gb2312": "GB18030",
+        "euc-kr": "CP949",
+        "utf-16le": "UTF-16",
+    }

-    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
+    def __init__(
+        self,
+        lang_filter: LanguageFilter = LanguageFilter.ALL,
+        should_rename_legacy: bool = False,
+    ) -> None:
self._esc_charset_prober: Optional[EscCharSetProber] = None
self._utf1632_prober: Optional[UTF1632Prober] = None
self._charset_probers: List[CharSetProber] = []
@@ -101,6 +117,7 @@ def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
self._has_win_bytes = False
+        self.should_rename_legacy = should_rename_legacy
self.reset()

@property
@@ -174,13 +191,15 @@ def feed(self, byte_str: Union[bytes, bytearray]) -> None:
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {
+                    # TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-3412",
"confidence": 1.0,
"language": "",
}
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {
+                    # TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-2143",
"confidence": 1.0,
"language": "",
@@ -307,6 +326,11 @@ def close(self) -> FinalResultDict:
charset_name = self.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
+                # Rename legacy encodings with superset encodings if asked
+                if self.should_rename_legacy:
+                    charset_name = self.LEGACY_MAP.get(
+                        charset_name.lower(), charset_name
+                    )
self.result = {
"encoding": charset_name,
"confidence": confidence,
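
At the detector level, the flag is stored on the instance in __init__() and applied as a final LEGACY_MAP lookup in close(), mirroring the existing ISO_WIN_MAP renaming. A sketch of the incremental API with the flag enabled (the file name is a placeholder):

    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector(should_rename_legacy=True)
    with open("somefile.txt", "rb") as f:
        for line in f:
            detector.feed(line)
    result = detector.close()
    print(result["encoding"])  # e.g. Windows-1252 where ascii was reported before
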
63 changes: 57 additions & 6 deletions test.py
@@ -11,6 +11,7 @@
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext
from pprint import pformat
+from unicodedata import normalize

try:
import hypothesis.strategies as st
@@ -19,7 +20,7 @@
HAVE_HYPOTHESIS = True
except ImportError:
HAVE_HYPOTHESIS = False
-    import pytest  # pylint: disable=import-error
+    import pytest

import chardet
from chardet.metadata.languages import LANGUAGES
@@ -34,7 +35,6 @@
"windows-1256",
}
EXPECTED_FAILURES = {
"tests/iso-8859-7-greek/disabled.gr.xml",
"tests/iso-8859-9-turkish/_ude_1.txt",
"tests/iso-8859-9-turkish/_ude_2.txt",
"tests/iso-8859-9-turkish/divxplanet.com.xml",
@@ -92,21 +92,72 @@ def test_encoding_detection(file_name, encoding):
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
+    expected_unicode = normalize("NFKC", expected_unicode)
+    detected_unicode = normalize("NFKC", detected_unicode)
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
[
line
for line in ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
if not line.startswith(" ")
][:20]
)
all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
else:
diff = ""
encoding_match = True
all_encodings = [result]
assert encoding_match, (
f"Expected {encoding}, but got {result} for {file_name}. First 20 "
f"lines with character differences: \n{diff}\n"
f"All encodings: {pformat(all_encodings)}"
)


+@pytest.mark.parametrize("file_name, encoding", gen_test_params())
+def test_encoding_detection_rename_legacy(file_name, encoding):
+    with open(file_name, "rb") as f:
+        input_bytes = f.read()
+        result = chardet.detect(input_bytes, should_rename_legacy=True)
+    try:
+        expected_unicode = input_bytes.decode(encoding)
+    except LookupError:
+        expected_unicode = ""
+    try:
+        detected_unicode = input_bytes.decode(result["encoding"])
+    except (LookupError, UnicodeDecodeError, TypeError):
+        detected_unicode = ""
+    if result:
+        encoding_match = (result["encoding"] or "").lower() == encoding
+    else:
+        encoding_match = False
+    # Only care about mismatches that would actually result in different
+    # behavior when decoding
+    expected_unicode = normalize("NFKD", expected_unicode)
+    detected_unicode = normalize("NFKD", detected_unicode)
+    if not encoding_match and expected_unicode != detected_unicode:
+        wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
+        wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
+        diff = "".join(
+            [
+                line
+                for line in ndiff(
+                    wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
+                )
+                if not line.startswith(" ")
+            ][:20]
+        )
+        all_encodings = chardet.detect_all(
+            input_bytes, ignore_threshold=True, should_rename_legacy=True
+        )
+    else:
+        diff = ""
+        encoding_match = True
+        all_encodings = [result]
+    assert encoding_match, (
+        f"Expected {encoding}, but got {result} for {file_name}. First 20 "
+        f"lines of character differences: \n{diff}\n"
@@ -146,7 +197,7 @@ def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
with pytest.raises(JustALengthIssue):

@given(st.text(), random=rnd)
-        @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
+        @settings(verbosity=Verbosity.quiet, max_examples=50)
def string_poisons_following_text(suffix):
try:
extended = (txt + suffix).encode(enc)
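
Because the new test reuses gen_test_params(), the whole detection corpus now runs a second time with should_rename_legacy=True; an invocation like `pytest test.py -k rename_legacy` would run only the new variant.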
