From 7df350051b08382e91cc970290e1a15cac5cc710 Mon Sep 17 00:00:00 2001 From: Matthew Hughes Date: Thu, 20 Jun 2024 20:29:21 +0100 Subject: [PATCH] Handle req file decode failures on locale encoding For the case where: * a requirements file is encoded as UTF-8, and * some bytes in the file are incompatible with the system locale In this case, fall back to decoding as UTF-8 as a last resort (rather than crashing on the `UnicodeDecodeError`). This behaviour was added when parsing the requirements file, rather than in `auto_decode`, as it didn't seem to belong in a generic util (though that util looks to only ever be called when parsing requirements files anyway). Perhaps we should just go straight to UTF-8 without querying the system locale (unless there is a PEP-263 style comment), per the docs[1]: > Requirements files are utf-8 encoding by default But to avoid a breaking change, just warn if decoding with the locale encoding fails, then fall back to UTF-8. [1] https://pip.pypa.io/en/stable/reference/requirements-file-format/#encoding Fixes: #12771 --- ...84-3fd4-4840-916b-414c164f9c28.trivial.rst | 0 src/pip/_internal/req/req_file.py | 19 +++++++- tests/unit/test_req_file.py | 43 +++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 news/9a3e9584-3fd4-4840-916b-414c164f9c28.trivial.rst diff --git a/news/9a3e9584-3fd4-4840-916b-414c164f9c28.trivial.rst b/news/9a3e9584-3fd4-4840-916b-414c164f9c28.trivial.rst new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/pip/_internal/req/req_file.py b/src/pip/_internal/req/req_file.py index 53ad8674cd8..13cd787c413 100644 --- a/src/pip/_internal/req/req_file.py +++ b/src/pip/_internal/req/req_file.py @@ -8,6 +8,7 @@ import re import shlex import urllib.parse +import warnings from optparse import Values from typing import ( TYPE_CHECKING, @@ -545,7 +546,23 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]: # Assume this is a bare path. 
try: with open(url, "rb") as f: - content = auto_decode(f.read()) + raw_content = f.read() except OSError as exc: raise InstallationError(f"Could not open requirements file: {exc}") + + try: + content = auto_decode(raw_content) + except UnicodeDecodeError as exc: + fallback_encoding = "utf-8" + # don't try to decode again if we know it will fail + if exc.encoding == fallback_encoding: + raise + + warnings.warn( + f"unable to decode data with {exc.encoding}, falling back to {fallback_encoding}", # noqa: E501 + UnicodeWarning, + stacklevel=2, + ) + content = raw_content.decode(fallback_encoding) + return url, content diff --git a/tests/unit/test_req_file.py b/tests/unit/test_req_file.py index f4f98b1901c..6d9964fcf85 100644 --- a/tests/unit/test_req_file.py +++ b/tests/unit/test_req_file.py @@ -2,6 +2,7 @@ import logging import os import textwrap +import warnings from optparse import Values from pathlib import Path from typing import Any, Iterator, List, Optional, Protocol, Tuple, Union @@ -883,3 +884,45 @@ def test_install_requirements_with_options( ) assert req.global_options == [global_option] + + def test_warns_on_decode_fail_in_locale( + self, tmpdir: Path, session: PipSession + ) -> None: + # \xe3\x80\x82 decodes to 'IDEOGRAPHIC FULL STOP' in UTF-8 + # the lone \x82 byte is invalid in the gbk encoding + data = b"pip<=24.0 # some comment\xe3\x80\x82\n" + locale_encoding = "gbk" + req_file = tmpdir / "requirements.txt" + req_file.write_bytes(data) + + # it's hard to rely on a locale definitely existing for testing + # so patch things out for simplicity + with pytest.warns(UnicodeWarning) as records, mock.patch( + "locale.getpreferredencoding", return_value=locale_encoding + ): + reqs = tuple(parse_reqfile(req_file.resolve(), session=session)) + + assert len(records) == 1 + assert ( + str(records[0].message) + == "unable to decode data with gbk, falling back to utf-8" + ) + assert len(reqs) == 1 + assert reqs[0].name == "pip" + assert str(reqs[0].specifier) == 
"<=24.0" + + @pytest.mark.parametrize("encoding", ("utf-8", "gbk")) + def test_erorrs_on_non_decodable_data( + self, encoding: str, tmpdir: Path, session: PipSession + ) -> None: + data = b"\xff" + req_file = tmpdir / "requirements.txt" + req_file.write_bytes(data) + + with warnings.catch_warnings(), pytest.raises(UnicodeDecodeError), mock.patch( + "locale.getpreferredencoding", return_value=encoding + ): + warnings.simplefilter( + "ignore", category=UnicodeWarning + ) # suppress warning not under test here + next(parse_reqfile(req_file.resolve(), session=session))