From eaa9ce6564ff0d1025991d872f9d88a85ed8c726 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 27 Nov 2024 13:11:40 +0200 Subject: [PATCH] Preserve the status of allow_none in results. --- Doc/library/urllib.parse.rst | 20 ++--- Doc/whatsnew/3.14.rst | 5 +- Lib/test/test_urlparse.py | 81 ++++++++++++------ Lib/urllib/parse.py | 85 ++++++++++++------- ...4-11-27-13-11-16.gh-issue-67041.ym2WKK.rst | 6 ++ 5 files changed, 124 insertions(+), 73 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index ae2ff518b5ba7e..1b66531ccc768b 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -318,6 +318,8 @@ or on combining URL components into a URL string. a ``?`` for an empty query), only ``None`` components are omitted. This allows to restore the URL that was parsed with option ``allow_none=True``. + By default, *keep_empty* is true if *parts* is the result of the + :func:`urlparse` call with ``allow_none=True``. .. versionchanged:: 3.14 Added the *keep_empty* parameter. @@ -417,6 +419,8 @@ or on combining URL components into a URL string. a ``?`` for an empty query), only ``None`` components are omitted. This allows to restore the URL that was parsed with option ``allow_none=True``. + By default, *keep_empty* is true if *parts* is the result of the + :func:`urlsplit` call with ``allow_none=True``. .. versionchanged:: 3.14 Added the *keep_empty* parameter. @@ -461,10 +465,8 @@ or on combining URL components into a URL string. .. versionchanged:: 3.5 - Behavior updated to match the semantics defined in :rfc:`3986`. - .. versionchanged:: 3.14 - Added the *keep_empty* parameter. + Behavior updated to match the semantics defined in :rfc:`3986`. .. 
function:: urldefrag(url, *, allow_none=False) @@ -588,12 +590,13 @@ These subclasses add the attributes listed in the documentation for those functions, the encoding and decoding support described in the previous section, as well as an additional method: -.. method:: urllib.parse.SplitResult.geturl(*, keep_empty=False) +.. method:: urllib.parse.SplitResult.geturl() Return the re-combined version of the original URL as a string. This may differ from the original URL in that the scheme may be normalized to lower case and empty components may be dropped. Specifically, empty parameters, - queries, and fragment identifiers will be removed unless *keep_empty* is true. + queries, and fragment identifiers will be removed unless the URL was parsed + with ``allow_none=True``. For :func:`urldefrag` results, only empty fragment identifiers will be removed. For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be @@ -611,11 +614,8 @@ previous section, as well as an additional method: >>> r2.geturl() 'http://www.Python.org/doc/' >>> r3 = urlsplit(url, allow_none=True) - >>> r1.geturl(keep_empty=True) - 'http://www.Python.org/doc/' - - .. versionchanged:: 3.14 - Added the *keep_empty* parameter. + >>> r3.geturl() + 'http://www.Python.org/doc/#' The following classes provide the implementations of the structured parse diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index c4c48492d92741..8ca9f43d7ae891 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -595,8 +595,9 @@ urllib.parse * Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`, :func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` functions. Add the *keep_empty* parameter to :func:`~urllib.parse.urlunparse` and - :func:`~urllib.parse.urlunsplit` functions and - :func:`~urllib.parse.SplitResult.geturl` methods. + :func:`~urllib.parse.urlunsplit` functions. 
+ This makes it possible to distinguish between empty and undefined URI + components and to preserve empty components. (Contributed by Serhiy Storchaka in :gh:`67041`.) uuid diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index a2504365bc0c0c..f9c583710e43a7 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -3,6 +3,7 @@ import unicodedata import unittest import urllib.parse +from urllib.parse import urlparse, urlsplit, urlunparse, urlunsplit RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -119,23 +120,50 @@ def _encode(self, s): return tuple(self._encode(x) for x in s) return s - def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True): + def checkRoundtrips(self, url, parsed, split, url2=None): if url2 is None: url2 = url - result = urllib.parse.urlparse(url, allow_none=allow_none) + self.checkRoundtrips1(url, parsed, split, allow_none=True) + empty = url[:0] + parsed = tuple(x or empty for x in parsed) + split = tuple(x or empty for x in split) + self.checkRoundtrips1(url, parsed, split, url2, allow_none=False) + + result = urlparse(url, allow_none=True) + self.assertEqual(urlunparse(result, keep_empty=False), url2) + self.assertEqual(urlunparse(tuple(result), keep_empty=False), url2) + result = urlparse(url, allow_none=False) + with self.assertRaises(ValueError): + urlunparse(result, keep_empty=True) + urlunparse(tuple(result), keep_empty=True) + + result = urlsplit(url, allow_none=True) + self.assertEqual(urlunsplit(result, keep_empty=False), url2) + self.assertEqual(urlunsplit(tuple(result), keep_empty=False), url2) + result = urlsplit(url, allow_none=False) + with self.assertRaises(ValueError): + urlunsplit(result, keep_empty=True) + urlunsplit(tuple(result), keep_empty=True) + + def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none): + if url2 is None: + url2 = url + result = urlparse(url, allow_none=allow_none) self.assertSequenceEqual(result, parsed) t = 
(result.scheme, result.netloc, result.path, - result.params, result.query, result.fragment) + result.params, result.query, result.fragment) self.assertSequenceEqual(t, parsed) # put it back together and it should be the same - result2 = urllib.parse.urlunparse(result, keep_empty=allow_none) - self.assertSequenceEqual(result2, url2) - self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none)) + result2 = urlunparse(result) + self.assertEqual(result2, url2) + self.assertEqual(result2, result.geturl()) + self.assertEqual(urlunparse(result, keep_empty=allow_none), url2) + self.assertEqual(urlunparse(tuple(result), keep_empty=allow_none), result2) # the result of geturl() is a fixpoint; we can always parse it # again to get the same result: - result3 = urllib.parse.urlparse(result.geturl(keep_empty=allow_none), allow_none=allow_none) - self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none)) + result3 = urlparse(result.geturl(), allow_none=allow_none) + self.assertEqual(result3.geturl(), result.geturl()) self.assertSequenceEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) self.assertEqual(result3.netloc, result.netloc) @@ -149,18 +177,19 @@ def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True): self.assertEqual(result3.port, result.port) # check the roundtrip using urlsplit() as well - result = urllib.parse.urlsplit(url, allow_none=allow_none) + result = urlsplit(url, allow_none=allow_none) self.assertSequenceEqual(result, split) t = (result.scheme, result.netloc, result.path, - result.query, result.fragment) + result.query, result.fragment) self.assertSequenceEqual(t, split) - result2 = urllib.parse.urlunsplit(result, keep_empty=allow_none) - self.assertSequenceEqual(result2, url2) - self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none)) + result2 = urlunsplit(result) + self.assertEqual(result2, url2) + self.assertEqual(result2, result.geturl()) + 
self.assertEqual(urlunsplit(tuple(result), keep_empty=allow_none), result2) # check the fixpoint property of re-parsing the result of geturl() - result3 = urllib.parse.urlsplit(result.geturl(keep_empty=allow_none), allow_none=allow_none) - self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none)) + result3 = urlsplit(result.geturl(), allow_none=allow_none) + self.assertEqual(result3.geturl(), result.geturl()) self.assertSequenceEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) self.assertEqual(result3.netloc, result.netloc) @@ -288,32 +317,28 @@ def test_roundtrips(self): ] for url, parsed, split in str_cases + bytes_cases: with self.subTest(url): - self.checkRoundtrips(url, parsed, split, allow_none=True) - empty = url[:0] - parsed = tuple(x or empty for x in parsed) - split = tuple(x or empty for x in split) - self.checkRoundtrips(url, parsed, split, allow_none=False) + self.checkRoundtrips(url, parsed, split) def test_roundtrips_normalization(self): str_cases = [ ('///path/to/file', - '///path/to/file', + '/path/to/file', (None, '', '/path/to/file', None, None, None), (None, '', '/path/to/file', None, None)), ('scheme:///path/to/file', - 'scheme:///path/to/file', + 'scheme:/path/to/file', ('scheme', '', '/path/to/file', None, None, None), ('scheme', '', '/path/to/file', None, None)), ('file:/tmp/junk.txt', - 'file:/tmp/junk.txt', + 'file:///tmp/junk.txt', ('file', None, '/tmp/junk.txt', None, None, None), ('file', None, '/tmp/junk.txt', None, None)), ('http:/tmp/junk.txt', - 'http:/tmp/junk.txt', + 'http:///tmp/junk.txt', ('http', None, '/tmp/junk.txt', None, None, None), ('http', None, '/tmp/junk.txt', None, None)), ('https:/tmp/junk.txt', - 'https:/tmp/junk.txt', + 'https:///tmp/junk.txt', ('https', None, '/tmp/junk.txt', None, None, None), ('https', None, '/tmp/junk.txt', None, None)), ] @@ -371,9 +396,9 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True): relurlb2 = 
urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb2), expectedb) - relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True), keep_empty=True) + relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True)) self.assertEqual(urllib.parse.urljoin(base, relurl3), expected) - relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True), keep_empty=True) + relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb3), expectedb) def test_unparse_parse(self): @@ -796,7 +821,7 @@ def _encode(t): url = url.rstrip(hash) if frag is None: frag = url[:0] - self.assertEqual(result.geturl(keep_empty=allow_none), url) + self.assertEqual(result.geturl(), url) self.assertEqual(result, (defrag, frag)) self.assertEqual(result.url, defrag) self.assertEqual(result.fragment, frag) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 72c39886d6f065..8d2a05bd134135 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -267,11 +267,27 @@ def _hostinfo(self): return hostname, port -_DefragResultBase = namedtuple('_DefragResultBase', 'url fragment') -_SplitResultBase = namedtuple( - '_SplitResultBase', 'scheme netloc path query fragment') -_ParseResultBase = namedtuple( - '_ParseResultBase', 'scheme netloc path params query fragment') +_UNSPECIFIED = ['not specified'] +_ALLOW_NONE_DEFAULT = False + +class _DefragResultBase(namedtuple('_DefragResultBase', 'url fragment')): + def geturl(self): + if self.fragment or (self.fragment is not None and + getattr(self, '_keep_empty', _ALLOW_NONE_DEFAULT)): + return self.url + self._HASH + self.fragment + else: + return self.url + +class _SplitResultBase(namedtuple( + '_SplitResultBase', 'scheme netloc path query fragment')): + def geturl(self): + return urlunsplit(self) + +class _ParseResultBase(namedtuple( + '_ParseResultBase', 'scheme 
netloc path params query fragment')): + def geturl(self): + return urlunparse(self) + _DefragResultBase.__doc__ = """ DefragResult(url, fragment) @@ -339,45 +355,27 @@ def _hostinfo(self): # retained since deprecating it isn't worth the hassle ResultBase = _NetlocResultMixinStr -_ALLOW_NONE_DEFAULT = False - # Structured result objects for string data class DefragResult(_DefragResultBase, _ResultMixinStr): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - if self.fragment or (keep_empty and self.fragment is not None): - return self.url + '#' + self.fragment - else: - return self.url + _HASH = '#' class SplitResult(_SplitResultBase, _NetlocResultMixinStr): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - return urlunsplit(self, keep_empty=keep_empty) class ParseResult(_ParseResultBase, _NetlocResultMixinStr): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - return urlunparse(self, keep_empty=keep_empty) # Structured result objects for bytes data class DefragResultBytes(_DefragResultBase, _ResultMixinBytes): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - if self.fragment or (keep_empty and self.fragment is not None): - return self.url + b'#' + self.fragment - else: - return self.url + _HASH = b'#' class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - return urlunsplit(self, keep_empty=keep_empty) class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - return urlunparse(self, keep_empty=keep_empty) # Set up the encode/decode result pairs def _fix_result_transcoding(): @@ -424,7 +422,9 @@ def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D if query is None: query = '' if fragment is None: fragment = '' result = ParseResult(scheme, netloc, url, params, query, fragment) - return 
_coerce_result(result) + result = _coerce_result(result) + result._keep_empty = allow_none + return result def _urlparse(url, scheme=None, allow_fragments=True): scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) @@ -513,8 +513,10 @@ def urlsplit(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D if netloc is None: netloc = '' if query is None: query = '' if fragment is None: fragment = '' - v = SplitResult(scheme, netloc, url, query, fragment) - return _coerce_result(v) + result = SplitResult(scheme, netloc, url, query, fragment) + result = _coerce_result(result) + result._keep_empty = allow_none + return result def _urlsplit(url, scheme=None, allow_fragments=True): # Only lstrip url as some applications rely on preserving trailing space. @@ -551,13 +553,20 @@ def _urlsplit(url, scheme=None, allow_fragments=True): _checknetloc(netloc) return (scheme, netloc, url, query, fragment) -def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT): +def urlunparse(components, *, keep_empty=_UNSPECIFIED): """Put a parsed URL back together again. This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had redundant delimiters, e.g. a ? 
with an empty query - (the draft states that these are equivalent).""" + (the draft states that these are equivalent) and keep_empty is false + or components is the result of the urlparse() call with allow_none=False.""" scheme, netloc, url, params, query, fragment, _coerce_result = ( _coerce_args(*components)) + if keep_empty is _UNSPECIFIED: + keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT) + elif keep_empty and not getattr(components, '_keep_empty', True): + raise ValueError('Cannot distinguish between empty and not defined ' + 'URI components in the result of parsing URL with ' + 'allow_none=False') if not keep_empty: if not netloc: if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): @@ -572,14 +581,22 @@ def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT): url = "%s;%s" % (url, params) return _coerce_result(_urlunsplit(scheme, netloc, url, query, fragment)) -def urlunsplit(components, *, keep_empty=_ALLOW_NONE_DEFAULT): +def urlunsplit(components, *, keep_empty=_UNSPECIFIED): """Combine the elements of a tuple as returned by urlsplit() into a complete URL as a string. The data argument can be any five-item iterable. This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had unnecessary delimiters (for example, a ? 
with an - empty query; the RFC states that these are equivalent).""" + empty query; the RFC states that these are equivalent) and keep_empty + is false or components is the result of the urlsplit() call with + allow_none=False.""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) + if keep_empty is _UNSPECIFIED: + keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT) + elif keep_empty and not getattr(components, '_keep_empty', True): + raise ValueError('Cannot distinguish between empty and not defined ' + 'URI components in the result of parsing URL with ' + 'allow_none=False') if not keep_empty: if not netloc: if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): @@ -692,7 +709,9 @@ def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT): frag = None defrag = url if not allow_none and frag is None: frag = '' - return _coerce_result(DefragResult(defrag, frag)) + result = _coerce_result(DefragResult(defrag, frag)) + result._keep_empty = allow_none + return result _hexdig = '0123456789ABCDEFabcdef' _hextobyte = None diff --git a/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst b/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst new file mode 100644 index 00000000000000..86a7e754d0aae2 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst @@ -0,0 +1,6 @@ +Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`, +:func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` +functions. Add the *keep_empty* parameter to +:func:`~urllib.parse.urlunparse` and :func:`~urllib.parse.urlunsplit` +functions. This makes it possible to distinguish between empty and undefined URI +components and to preserve empty components.