From 1b5e494675648d03fd2585244b10c1fd59b4b6a0 Mon Sep 17 00:00:00 2001 From: Dominic Davis-Foster Date: Mon, 31 Jan 2022 10:02:47 +0000 Subject: [PATCH 1/3] Ignore leading whitespace when checking for HTML doctype --- src/pip/_internal/index/collector.py | 2 +- tests/data/indexes/README.txt | 4 ++++ tests/data/indexes/indent/simple/index.html | 6 ++++++ tests/functional/test_install_index.py | 11 +++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 tests/data/indexes/indent/simple/index.html diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 4ecbb337805..89bfa539c55 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -350,7 +350,7 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin # requested to use html5lib. if not use_deprecated_html5lib: expected_doctype = "<!doctype html>".encode(encoding) - actual_start = page.content[: len(expected_doctype)] + actual_start = page.content.lstrip()[: len(expected_doctype)] if actual_start.decode(encoding).lower() != "<!doctype html>": deprecated( reason=( diff --git a/tests/data/indexes/README.txt b/tests/data/indexes/README.txt index 8e430effdba..f1a66afc5a3 100644 --- a/tests/data/indexes/README.txt +++ b/tests/data/indexes/README.txt @@ -13,3 +13,7 @@ for testing url quoting with indexes simple ------ contains index page for "simple" pkg + +indent +------ +For testing indented HTML pages diff --git a/tests/data/indexes/indent/simple/index.html b/tests/data/indexes/indent/simple/index.html new file mode 100644 index 00000000000..eca8c7e6a3f --- /dev/null +++ b/tests/data/indexes/indent/simple/index.html @@ -0,0 +1,6 @@ + + + + simple-1.0.tar.gz + + diff --git a/tests/functional/test_install_index.py b/tests/functional/test_install_index.py index 3308de504ac..62f68dd9b77 100644 --- a/tests/functional/test_install_index.py +++ b/tests/functional/test_install_index.py @@ -74,3 +74,14 @@ def 
test_file_index_url_quoting(script: PipTestEnvironment, data: TestData) -> N result = script.pip("install", "-vvv", "--index-url", index_url, "simple") result.did_create(script.site_packages / "simple") result.did_create(script.site_packages / "simple-1.0.dist-info") + + +@pytest.mark.usefixtures("with_wheel") +def test_file_index_indent(script: PipTestEnvironment, data: TestData) -> None: + """ + Test that an index page with whitespace before the doctype is accepted + """ + index_url = data.index_url(urllib.parse.quote("indent")) + result = script.pip("install", "-vvv", "--index-url", index_url, "simple") + result.did_create(script.site_packages / "simple") + result.did_create(script.site_packages / "simple-1.0.dist-info") From 3d2984ca6a9be4dbac617fbc312080d7151d83eb Mon Sep 17 00:00:00 2001 From: Dominic Davis-Foster Date: Mon, 31 Jan 2022 10:21:44 +0000 Subject: [PATCH 2/3] Don't .lstrip() the entire HTML document. --- src/pip/_internal/index/collector.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 89bfa539c55..aa4149ed79d 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -350,7 +350,16 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin # requested to use html5lib. if not use_deprecated_html5lib: expected_doctype = "<!doctype html>".encode(encoding) - actual_start = page.content.lstrip()[: len(expected_doctype)] + + char: int + offset: int = 0 + for char in page.content: + if chr(char).isspace(): + offset += 1 + else: + break + + actual_start = page.content[offset : offset + len(expected_doctype)] if actual_start.decode(encoding).lower() != "<!doctype html>": deprecated( reason=( From 0074b50248f7aa5252adb09d56e27a4d646d7b27 Mon Sep 17 00:00:00 2001 From: Dominic Davis-Foster Date: Mon, 31 Jan 2022 14:28:46 +0000 Subject: [PATCH 3/3] Add news entry. 
--- news/10855.bugfix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 news/10855.bugfix.rst diff --git a/news/10855.bugfix.rst b/news/10855.bugfix.rst new file mode 100644 index 00000000000..82687205e66 --- /dev/null +++ b/news/10855.bugfix.rst @@ -0,0 +1 @@ +Accept HTML files with whitespace before the doctype declaration.