Add should_rename_legacy flag
dan-blanchard committed Jul 17, 2022
1 parent 023e7ea commit 5b5e841
Showing 4 changed files with 120 additions and 14 deletions.
23 changes: 19 additions & 4 deletions chardet/__init__.py
@@ -27,26 +27,33 @@
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]


-def detect(byte_str: Union[bytes, bytearray]) -> FinalResultDict:
+def detect(
+    byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
+) -> FinalResultDict:
"""
Detect the encoding of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
+    :param should_rename_legacy: Should we rename legacy encodings
+                                 to their more modern equivalents?
+    :type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
return detector.close()


def detect_all(
-    byte_str: Union[bytes, bytearray], ignore_threshold: bool = False
+    byte_str: Union[bytes, bytearray],
+    ignore_threshold: bool = False,
+    should_rename_legacy: bool = False,
) -> List[IntermediateResultDict]:
"""
Detect all the possible encodings of the given byte string.
@@ -57,6 +64,9 @@ def detect_all(
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
+    :param should_rename_legacy: Should we rename legacy encodings
+                                 to their more modern equivalents?
+    :type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
@@ -65,7 +75,7 @@
)
byte_str = bytearray(byte_str)

-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
detector.close()

@@ -87,6 +97,11 @@ def detect_all(
charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
+                # Rename legacy encodings with superset encodings if asked
+                if should_rename_legacy:
+                    charset_name = detector.LEGACY_MAP.get(
+                        charset_name.lower(), charset_name
+                    )
results.append(
{
"encoding": charset_name,
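
Taken together, the changes to chardet/__init__.py add one new keyword argument to both detect() and detect_all(). A minimal usage sketch of the new flag (the printed results are illustrative; exact confidence values depend on the input):

    import chardet

    data = "Hello, world".encode("ascii")

    # Default behavior is unchanged: pure ASCII is still reported as ascii.
    print(chardet.detect(data))
    # e.g. {'encoding': 'ascii', 'confidence': 1.0, 'language': ''}

    # With the flag, legacy names are renamed to their modern supersets
    # via UniversalDetector.LEGACY_MAP, so ascii becomes Windows-1252.
    print(chardet.detect(data, should_rename_legacy=True))
    # e.g. {'encoding': 'Windows-1252', 'confidence': 1.0, 'language': ''}
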
22 changes: 19 additions & 3 deletions chardet/cli/chardetect.py
@@ -22,7 +22,10 @@


def description_of(
-    lines: Iterable[bytes], name: str = "stdin", minimal: bool = False
+    lines: Iterable[bytes],
+    name: str = "stdin",
+    minimal: bool = False,
+    should_rename_legacy: bool = False,
) -> Optional[str]:
"""
Return a string describing the probable encoding of a file or
@@ -32,8 +35,11 @@ def description_of(
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
+    :param should_rename_legacy: Should we rename legacy encodings to
+                                 their more modern equivalents?
+    :type should_rename_legacy: ``bool``
"""
-    u = UniversalDetector()
+    u = UniversalDetector(should_rename_legacy=should_rename_legacy)
for line in lines:
line = bytearray(line)
u.feed(line)
@@ -75,6 +81,12 @@ def main(argv: Optional[List[str]] = None) -> None:
help="Print only the encoding to standard output",
action="store_true",
)
+    parser.add_argument(
+        "-l",
+        "--legacy",
+        help="Rename legacy encodings to more modern ones.",
+        action="store_true",
+    )
parser.add_argument(
"--version", action="version", version=f"%(prog)s {__version__}"
)
@@ -89,7 +101,11 @@ def main(argv: Optional[List[str]] = None) -> None:
"--help\n",
file=sys.stderr,
)
-        print(description_of(f, f.name, minimal=args.minimal))
+        print(
+            description_of(
+                f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
+            )
+        )


if __name__ == "__main__":
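
The new -l/--legacy command-line flag feeds args.legacy through to description_of(), so `chardetect --legacy somefile.txt` reports renamed encodings. The helper can also be called directly; a short sketch, assuming somefile.txt exists and contains ASCII bytes (the printed line is illustrative):

    from chardet.cli.chardetect import description_of

    with open("somefile.txt", "rb") as f:
        print(description_of(f, f.name, should_rename_legacy=True))
    # e.g. somefile.txt: Windows-1252 with confidence 1.0
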
26 changes: 25 additions & 1 deletion chardet/universaldetector.py
@@ -84,8 +84,24 @@ class UniversalDetector:
"iso-8859-9": "Windows-1254",
"iso-8859-13": "Windows-1257",
}
+    # Based on https://encoding.spec.whatwg.org/#names-and-labels
+    # but altered to match Python names for encodings and remove mappings
+    # that break tests.
+    LEGACY_MAP = {
+        "ascii": "Windows-1252",
+        "iso-8859-1": "Windows-1252",
+        "tis-620": "ISO-8859-11",
+        "iso-8859-9": "Windows-1254",
+        "gb2312": "GB18030",
+        "euc-kr": "CP949",
+        "utf-16le": "UTF-16",
+    }

-    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
+    def __init__(
+        self,
+        lang_filter: LanguageFilter = LanguageFilter.ALL,
+        should_rename_legacy: bool = False,
+    ) -> None:
self._esc_charset_prober: Optional[EscCharSetProber] = None
self._utf1632_prober: Optional[UTF1632Prober] = None
self._charset_probers: List[CharSetProber] = []
@@ -101,6 +117,7 @@ def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
self._has_win_bytes = False
+        self.should_rename_legacy = should_rename_legacy
self.reset()

@property
@@ -174,13 +191,15 @@ def feed(self, byte_str: Union[bytes, bytearray]) -> None:
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {
+                    # TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-3412",
"confidence": 1.0,
"language": "",
}
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {
+                    # TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-2143",
"confidence": 1.0,
"language": "",
@@ -307,6 +326,11 @@ def close(self) -> FinalResultDict:
charset_name = self.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
+                # Rename legacy encodings with superset encodings if asked
+                if self.should_rename_legacy:
+                    charset_name = self.LEGACY_MAP.get(
+                        charset_name.lower(), charset_name
+                    )
self.result = {
"encoding": charset_name,
"confidence": confidence,
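
At the detector level, the flag is stored on the instance in __init__() and applied as a final LEGACY_MAP lookup in close(), mirroring the existing ISO_WIN_MAP renaming. A sketch of the incremental API with the flag enabled (the file name is a placeholder):

    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector(should_rename_legacy=True)
    with open("somefile.txt", "rb") as f:
        for line in f:
            detector.feed(line)
    result = detector.close()
    print(result["encoding"])  # e.g. Windows-1252 where ascii was reported before
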
63 changes: 57 additions & 6 deletions test.py
@@ -11,6 +11,7 @@
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext
from pprint import pformat
+from unicodedata import normalize

try:
import hypothesis.strategies as st
@@ -19,7 +20,7 @@
HAVE_HYPOTHESIS = True
except ImportError:
HAVE_HYPOTHESIS = False
-    import pytest  # pylint: disable=import-error
+    import pytest

import chardet
from chardet.metadata.languages import LANGUAGES
@@ -34,7 +35,6 @@
"windows-1256",
}
EXPECTED_FAILURES = {
"tests/iso-8859-7-greek/disabled.gr.xml",
"tests/iso-8859-9-turkish/_ude_1.txt",
"tests/iso-8859-9-turkish/_ude_2.txt",
"tests/iso-8859-9-turkish/divxplanet.com.xml",
@@ -92,21 +92,72 @@ def test_encoding_detection(file_name, encoding):
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
+    expected_unicode = normalize("NFKC", expected_unicode)
+    detected_unicode = normalize("NFKC", detected_unicode)
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
[
line
for line in ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
if not line.startswith(" ")
][:20]
)
all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
else:
diff = ""
encoding_match = True
all_encodings = [result]
assert encoding_match, (
f"Expected {encoding}, but got {result} for {file_name}. First 20 "
f"lines with character differences: \n{diff}\n"
f"All encodings: {pformat(all_encodings)}"
)


+@pytest.mark.parametrize("file_name, encoding", gen_test_params())
+def test_encoding_detection_rename_legacy(file_name, encoding):
+    with open(file_name, "rb") as f:
+        input_bytes = f.read()
+        result = chardet.detect(input_bytes, should_rename_legacy=True)
+    try:
+        expected_unicode = input_bytes.decode(encoding)
+    except LookupError:
+        expected_unicode = ""
+    try:
+        detected_unicode = input_bytes.decode(result["encoding"])
+    except (LookupError, UnicodeDecodeError, TypeError):
+        detected_unicode = ""
+    if result:
+        encoding_match = (result["encoding"] or "").lower() == encoding
+    else:
+        encoding_match = False
+    # Only care about mismatches that would actually result in different
+    # behavior when decoding
+    expected_unicode = normalize("NFKD", expected_unicode)
+    detected_unicode = normalize("NFKD", detected_unicode)
+    if not encoding_match and expected_unicode != detected_unicode:
+        wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
+        wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
+        diff = "".join(
+            [
+                line
+                for line in ndiff(
+                    wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
+                )
+                if not line.startswith(" ")
+            ][:20]
+        )
+        all_encodings = chardet.detect_all(
+            input_bytes, ignore_threshold=True, should_rename_legacy=True
+        )
+    else:
+        diff = ""
+        encoding_match = True
+        all_encodings = [result]
+    assert encoding_match, (
+        f"Expected {encoding}, but got {result} for {file_name}. First 20 "
+        f"lines of character differences: \n{diff}\n"
@@ -146,7 +197,7 @@ def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
with pytest.raises(JustALengthIssue):

@given(st.text(), random=rnd)
-        @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
+        @settings(verbosity=Verbosity.quiet, max_examples=50)
def string_poisons_following_text(suffix):
try:
extended = (txt + suffix).encode(enc)
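
Because the new test reuses gen_test_params(), the whole detection corpus now runs a second time with should_rename_legacy=True; an invocation like `pytest test.py -k rename_legacy` would run only the new variant.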
