diff --git a/news/9a3e9584-3fd4-4840-916b-414c164f9c28.trivial.rst b/news/9a3e9584-3fd4-4840-916b-414c164f9c28.trivial.rst new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/pip/_internal/req/req_file.py b/src/pip/_internal/req/req_file.py index 53ad8674cd8..13cd787c413 100644 --- a/src/pip/_internal/req/req_file.py +++ b/src/pip/_internal/req/req_file.py @@ -8,6 +8,7 @@ import re import shlex import urllib.parse +import warnings from optparse import Values from typing import ( TYPE_CHECKING, @@ -545,7 +546,23 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]: # Assume this is a bare path. try: with open(url, "rb") as f: - content = auto_decode(f.read()) + raw_content = f.read() except OSError as exc: raise InstallationError(f"Could not open requirements file: {exc}") + + try: + content = auto_decode(raw_content) + except UnicodeDecodeError as exc: + fallback_encoding = "utf-8" + # don't try to decode again if we know it will fail + if exc.encoding == fallback_encoding: + raise + + warnings.warn( + f"unable to decode data with {exc.encoding}, falling back to {fallback_encoding}", # noqa: E501 + UnicodeWarning, + stacklevel=2, + ) + content = raw_content.decode(fallback_encoding) + return url, content diff --git a/tests/unit/test_req_file.py b/tests/unit/test_req_file.py index f4f98b1901c..6d9964fcf85 100644 --- a/tests/unit/test_req_file.py +++ b/tests/unit/test_req_file.py @@ -2,6 +2,7 @@ import logging import os import textwrap +import warnings from optparse import Values from pathlib import Path from typing import Any, Iterator, List, Optional, Protocol, Tuple, Union @@ -883,3 +884,45 @@ def test_install_requirements_with_options( ) assert req.global_options == [global_option] + + def test_warns_on_decode_fail_in_locale( + self, tmpdir: Path, session: PipSession + ) -> None: + # \xe3\x80\x82 decodes to 'IDEOGRAPHIC FULL STOP' in UTF-8 + # the lone \x82 byte is invalid in the gbk encoding + data = b"pip<=24.0 # some 
comment\xe3\x80\x82\n" + locale_encoding = "gbk" + req_file = tmpdir / "requirements.txt" + req_file.write_bytes(data) + + # it's hard to rely on a locale definitely existing for testing + # so patch things out for simplicity + with pytest.warns(UnicodeWarning) as records, mock.patch( + "locale.getpreferredencoding", return_value=locale_encoding + ): + reqs = tuple(parse_reqfile(req_file.resolve(), session=session)) + + assert len(records) == 1 + assert ( + str(records[0].message) + == "unable to decode data with gbk, falling back to utf-8" + ) + assert len(reqs) == 1 + assert reqs[0].name == "pip" + assert str(reqs[0].specifier) == "<=24.0" + + @pytest.mark.parametrize("encoding", ("utf-8", "gbk")) + def test_erorrs_on_non_decodable_data( + self, encoding: str, tmpdir: Path, session: PipSession + ) -> None: + data = b"\xff" + req_file = tmpdir / "requirements.txt" + req_file.write_bytes(data) + + with warnings.catch_warnings(), pytest.raises(UnicodeDecodeError), mock.patch( + "locale.getpreferredencoding", return_value=encoding + ): + warnings.simplefilter( + "ignore", category=UnicodeWarning + ) # suppress warning not under test here + next(parse_reqfile(req_file.resolve(), session=session))