From 46d5ae5ba8dfcc6963fed1fc63339bd62754f533 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 28 Sep 2024 08:09:51 +0200 Subject: [PATCH 1/2] :heavy_check_mark: Add tests for previous issue 520, 509 and 498 --- tests/test_edge_case.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_edge_case.py b/tests/test_edge_case.py index f324664d..6caa1c48 100644 --- a/tests/test_edge_case.py +++ b/tests/test_edge_case.py @@ -10,3 +10,33 @@ def test_unicode_edge_case(): assert best_guess is not None, "Payload should have given something, detection failure" assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected" + + +def test_issue_gh520(): + """Verify that minorities does not strip basic latin characters!""" + payload = b"/includes/webform.compon\xd2\xaants.inc/" + + best_guess = from_bytes(payload).best() + + assert best_guess is not None, "Payload should have given something, detection failure" + assert "Basic Latin" in best_guess.alphabets + + +def test_issue_gh509(): + """Two common ASCII punctuations should render as-is.""" + payload = b");" + + best_guess = from_bytes(payload).best() + + assert best_guess is not None, "Payload should have given something, detection failure" + assert "ascii" == best_guess.encoding + + +def test_issue_gh498(): + """This case was mistaken for utf-16-le, this should never happen again.""" + payload = b'\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx' + + best_guess = from_bytes(payload).best() + + assert best_guess is not None, "Payload should have given something, detection failure" + assert "Cyrillic" in best_guess.alphabets From bf920e17d1c50ab7e7e7ca2195dedffc816ceab5 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 28 Sep 2024 08:14:46 +0200 Subject: [PATCH 2/2] :sparkle: Patch declared charset when explicitly converting a CharsetMatch to Unicode bytes. 
--- CHANGELOG.md | 4 +++ charset_normalizer/api.py | 9 ++++++- charset_normalizer/cli/__main__.py | 36 ++++++++++++++++++++++----- charset_normalizer/models.py | 25 +++++++++++++++++-- docs/community/featured.rst | 5 +--- tests/test_preemptive_detection.py | 40 ++++++++++++++++++++++++++++++ 6 files changed, 106 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59fe33ff..a8a53576 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-09-??) +### Added +- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints. + ### Fixed - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch. - Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) +- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. 
(#381) ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index b5e4dd51..a51ee35e 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -371,7 +371,13 @@ def from_bytes( and not lazy_str_hard_failure ): fallback_entry = CharsetMatch( - sequences, encoding_iana, threshold, False, [], decoded_payload + sequences, + encoding_iana, + threshold, + False, + [], + decoded_payload, + preemptive_declaration=specified_encoding, ) if encoding_iana == specified_encoding: fallback_specified = fallback_entry @@ -433,6 +439,7 @@ def from_bytes( bom_or_sig_available, cd_ratios_merged, decoded_payload, + preemptive_declaration=specified_encoding, ) ) diff --git a/charset_normalizer/cli/__main__.py b/charset_normalizer/cli/__main__.py index f4bcbaac..e7edd0fc 100644 --- a/charset_normalizer/cli/__main__.py +++ b/charset_normalizer/cli/__main__.py @@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: dest="force", help="Replace file without asking if you are sure, use this flag with caution.", ) + parser.add_argument( + "-i", + "--no-preemptive", + action="store_true", + default=False, + dest="no_preemptive", + help="Disable looking at a charset declaration to hint the detector.", + ) parser.add_argument( "-t", "--threshold", @@ -133,21 +141,35 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: args = parser.parse_args(argv) if args.replace is True and args.normalize is False: + if args.files: + for my_file in args.files: + my_file.close() print("Use --replace in addition of --normalize only.", file=sys.stderr) return 1 if args.force is True and args.replace is False: + if args.files: + for my_file in args.files: + my_file.close() print("Use --force in addition of --replace only.", file=sys.stderr) return 1 if args.threshold < 0.0 or args.threshold > 1.0: + if args.files: + for my_file in args.files: + my_file.close() 
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr) return 1 x_ = [] for my_file in args.files: - matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose) + matches = from_fp( + my_file, + threshold=args.threshold, + explain=args.verbose, + preemptive_behaviour=args.no_preemptive is False, + ) best_guess = matches.best() @@ -155,9 +177,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: print( 'Unable to identify originating encoding for "{}". {}'.format( my_file.name, - "Maybe try increasing maximum amount of chaos." - if args.threshold < 1.0 - else "", + ( + "Maybe try increasing maximum amount of chaos." + if args.threshold < 1.0 + else "" + ), ), file=sys.stderr, ) @@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: try: x_[0].unicode_path = join(dir_path, ".".join(o_)) - with open(x_[0].unicode_path, "w", encoding="utf-8") as fp: - fp.write(str(best_guess)) + with open(x_[0].unicode_path, "wb") as fp: + fp.write(best_guess.output()) except IOError as e: print(str(e), file=sys.stderr) if my_file.closed is False: diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py index 4d2ce867..ee5681ca 100644 --- a/charset_normalizer/models.py +++ b/charset_normalizer/models.py @@ -1,9 +1,10 @@ from encodings.aliases import aliases from hashlib import sha256 from json import dumps +from re import sub from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from .constant import TOO_BIG_SEQUENCE +from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE from .utils import iana_name, is_multi_byte_encoding, unicode_range @@ -16,6 +17,7 @@ def __init__( has_sig_or_bom: bool, languages: "CoherenceMatches", decoded_payload: Optional[str] = None, + preemptive_declaration: Optional[str] = None, ): self._payload: bytes = payload @@ -33,6 +35,8 @@ def __init__( self._string: Optional[str] = decoded_payload + self._preemptive_declaration: Optional[str] = 
preemptive_declaration + def __eq__(self, other: object) -> bool: if not isinstance(other, CharsetMatch): if isinstance(other, str): @@ -208,7 +212,24 @@ def output(self, encoding: str = "utf_8") -> bytes: """ if self._output_encoding is None or self._output_encoding != encoding: self._output_encoding = encoding - self._output_payload = str(self).encode(encoding, "replace") + decoded_string = str(self) + if ( + self._preemptive_declaration is not None + and self._preemptive_declaration.lower() + not in ["utf-8", "utf8", "utf_8"] + ): + patched_header = sub( + RE_POSSIBLE_ENCODING_INDICATION, + lambda m: m.string[m.span()[0] : m.span()[1]].replace( + m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type] + ), + decoded_string[:8192], + 1, + ) + + decoded_string = patched_header + decoded_string[8192:] + + self._output_payload = decoded_string.encode(encoding, "replace") return self._output_payload # type: ignore diff --git a/docs/community/featured.rst b/docs/community/featured.rst index 8d1814c5..a704a0bc 100644 --- a/docs/community/featured.rst +++ b/docs/community/featured.rst @@ -9,10 +9,7 @@ your level or opinions. Niquests -------- -Started as a simple though.. - -.. image:: https://i.imgflip.com/7xet0f.jpg - :width: 200 +Started as a simple thought.. IE 11 has built-in HTTP/2 support while Requests 2.32 does not! Most of our programs that interact with HTTP server are built with ``requests`` and we aren't likely to switch without a substantial effort. 
diff --git a/tests/test_preemptive_detection.py b/tests/test_preemptive_detection.py index 042415b6..411bf45f 100644 --- a/tests/test_preemptive_detection.py +++ b/tests/test_preemptive_detection.py @@ -1,6 +1,7 @@ import pytest from charset_normalizer.utils import any_specified_encoding +from charset_normalizer import CharsetMatch @pytest.mark.parametrize( @@ -24,3 +25,42 @@ def test_detect_most_common_body_encoding(payload, expected_encoding): ) assert specified_encoding == expected_encoding, "Unable to determine properly encoding from given body" + + +@pytest.mark.parametrize( + "payload, expected_outcome", + [ + (b'', b''), + (b'', b''), + (b'', b''), + (b'# coding: utf-8', b'# coding: utf-8'), + (b'', b''), + (b'', b''), + (b'', b''), + (b'', b''), + (b'', b''), + ] +) +def test_preemptive_mark_replacement(payload, expected_outcome): + """ + When generating (to Unicode converted) bytes, we want to change any potential declarative charset + to utf-8. This test that. + """ + specified_encoding = any_specified_encoding( + payload + ) + + detected_encoding = specified_encoding if specified_encoding is not None else "utf-8" + + m = CharsetMatch( + payload, + detected_encoding, + 0., + False, + [], + preemptive_declaration=specified_encoding, + ) + + transformed_output = m.output() + + assert transformed_output == expected_outcome