Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix html file is not reported as UTF8 after conversion #533

Merged
merged 2 commits into from
Sep 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-09-??)

### Added
- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.

### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407)
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)

## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)

Expand Down
9 changes: 8 additions & 1 deletion charset_normalizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,13 @@ def from_bytes(
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences, encoding_iana, threshold, False, [], decoded_payload
sequences,
encoding_iana,
threshold,
False,
[],
decoded_payload,
preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
Expand Down Expand Up @@ -433,6 +439,7 @@ def from_bytes(
bom_or_sig_available,
cd_ratios_merged,
decoded_payload,
preemptive_declaration=specified_encoding,
)
)

Expand Down
36 changes: 30 additions & 6 deletions charset_normalizer/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-i",
"--no-preemptive",
action="store_true",
default=False,
dest="no_preemptive",
help="Disable looking at a charset declaration to hint the detector.",
)
parser.add_argument(
"-t",
"--threshold",
Expand All @@ -133,31 +141,47 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
args = parser.parse_args(argv)

if args.replace is True and args.normalize is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1

if args.force is True and args.replace is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1

if args.threshold < 0.0 or args.threshold > 1.0:
if args.files:
for my_file in args.files:
my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1

x_ = []

for my_file in args.files:
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose,
preemptive_behaviour=args.no_preemptive is False,
)

best_guess = matches.best()

if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
(
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else ""
),
),
file=sys.stderr,
)
Expand Down Expand Up @@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))

with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
Expand Down
25 changes: 23 additions & 2 deletions charset_normalizer/models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

from .constant import TOO_BIG_SEQUENCE
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


Expand All @@ -16,6 +17,7 @@ def __init__(
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
preemptive_declaration: Optional[str] = None,
):
self._payload: bytes = payload

Expand All @@ -33,6 +35,8 @@ def __init__(

self._string: Optional[str] = decoded_payload

self._preemptive_declaration: Optional[str] = preemptive_declaration

def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
if isinstance(other, str):
Expand Down Expand Up @@ -208,7 +212,24 @@ def output(self, encoding: str = "utf_8") -> bytes:
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
self._output_payload = str(self).encode(encoding, "replace")
decoded_string = str(self)
if (
self._preemptive_declaration is not None
and self._preemptive_declaration.lower()
not in ["utf-8", "utf8", "utf_8"]
):
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type]
),
decoded_string[:8192],
1,
)

decoded_string = patched_header + decoded_string[8192:]

self._output_payload = decoded_string.encode(encoding, "replace")

return self._output_payload # type: ignore

Expand Down
5 changes: 1 addition & 4 deletions docs/community/featured.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@ your level or opinions.
Niquests
--------

Started as a simple though..

.. image:: https://i.imgflip.com/7xet0f.jpg
:width: 200
Started as a simple thought... IE 11 has built-in HTTP/2 support while Requests 2.32 does not!

Most of our programs that interact with HTTP server are built with ``requests`` and
we aren't likely to switch without a substantial effort.
Expand Down
30 changes: 30 additions & 0 deletions tests/test_edge_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,33 @@ def test_unicode_edge_case():

assert best_guess is not None, "Payload should have given something, detection failure"
assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected"


def test_issue_gh520():
    """Regression check (GH-520): Basic Latin must remain among the detected alphabets."""
    sequence = b"/includes/webform.compon\xd2\xaants.inc/"

    match = from_bytes(sequence).best()

    assert match is not None, "Payload should have given something, detection failure"
    assert "Basic Latin" in match.alphabets


def test_issue_gh509():
    """Regression check (GH-509): a tiny ASCII punctuation payload must be reported as ascii."""
    sequence = b");"

    match = from_bytes(sequence).best()

    assert match is not None, "Payload should have given something, detection failure"
    assert match.encoding == "ascii"


def test_issue_gh498():
    """Regression check (GH-498): this payload was once mistaken for utf-16-le; it is Cyrillic."""
    sequence = b"\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx"

    match = from_bytes(sequence).best()

    assert match is not None, "Payload should have given something, detection failure"
    assert "Cyrillic" in match.alphabets
40 changes: 40 additions & 0 deletions tests/test_preemptive_detection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from charset_normalizer.utils import any_specified_encoding
from charset_normalizer import CharsetMatch


@pytest.mark.parametrize(
Expand All @@ -24,3 +25,42 @@ def test_detect_most_common_body_encoding(payload, expected_encoding):
)

assert specified_encoding == expected_encoding, "Unable to determine properly encoding from given body"


@pytest.mark.parametrize(
    "payload, expected_outcome",
    [
        (b'<?xml version="1.0" encoding="EUC-JP"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<html><head><meta charset="utf-8"></head></html>', b'<html><head><meta charset="utf-8"></head></html>'),
        (b'<html><head><meta charset="utf-57"></head></html>', b'<html><head><meta charset="utf-57"></head></html>'),
        (b'# coding: utf-8', b'# coding: utf-8'),
        (b'<?xml version="1.0" encoding="UTF-8"?>', b'<?xml version="1.0" encoding="UTF-8"?>'),
        (b'<?xml version="1.0" encoding="US-ASCII"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<?xml version="1.0" encoding="JohaB"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<html><head><meta charset=WINDOWS-1252></head></html>', b'<html><head><meta charset=utf_8></head></html>'),
        (b'<html><head><meta charset="WINDOWS-1256"></head></html>', b'<html><head><meta charset="utf_8"></head></html>'),
    ],
)
def test_preemptive_mark_replacement(payload, expected_outcome):
    """
    When emitting Unicode-converted bytes, any declarative charset found in the
    payload (and differing from utf-8) must be rewritten to utf-8. Declarations
    already naming utf-8, or unrecognized ones, are left untouched.
    """
    declared = any_specified_encoding(payload)

    # Fall back to utf-8 when no declaration could be extracted from the body.
    match = CharsetMatch(
        payload,
        declared if declared is not None else "utf-8",
        0.0,
        False,
        [],
        preemptive_declaration=declared,
    )

    assert match.output() == expected_outcome
Loading