Skip to content

Commit

Permalink
pythongh-69589: Fix path normalization in urllib.parse.urljoin()
Browse files Browse the repository at this point in the history
* Preserve double slashes in path.
* Fix the case when the base path is relative and the relative reference
  path starts with '..'.
  • Loading branch information
serhiy-storchaka committed Nov 11, 2024
1 parent 450db61 commit 659a910
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 23 deletions.
100 changes: 99 additions & 1 deletion Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,11 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True):
self.assertEqual(urllib.parse.urljoin(base, relurl), expected)
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)
else:
relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl))
self.assertNotEqual(urllib.parse.urljoin(base, relurl), expected)
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
self.assertNotEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)

def test_unparse_parse(self):
str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]
Expand Down Expand Up @@ -568,6 +573,9 @@ def test_urljoins(self):
# slashes
self.checkJoin('http://a/b/c/d/e/', '../../f/g/', 'http://a/b/c/f/g/')
self.checkJoin('http://a/b/c/d/e', '../../f/g/', 'http://a/b/f/g/')
self.checkJoin('http://a/b/c/d/e//', '../../f/g/', 'http://a/b/c/d/f/g/')
self.checkJoin('http://a/b/c/d/e///', '../../f/g/', 'http://a/b/c/d/e/f/g/')
self.checkJoin('http://a/b/c/d/e////', '../../f/g/', 'http://a/b/c/d/e//f/g/')
self.checkJoin('http://a/b/c/d/e/', '/../../f/g/', 'http://a/f/g/')
self.checkJoin('http://a/b/c/d/e', '/../../f/g/', 'http://a/f/g/')
self.checkJoin('http://a/b/c/d/e/', '../../f/g', 'http://a/b/c/f/g')
Expand Down Expand Up @@ -645,6 +653,16 @@ def test_urljoins_relative_base(self):
self.checkJoin('//', '/w', '///w')
self.checkJoin('//', '///w', '///w')
self.checkJoin('//', 'w', '///w')
self.checkJoin('//', '../w', '///w')
self.checkJoin('//', './w', '///w')
self.checkJoin('//', '..//w', '///w')
self.checkJoin('//', './/w', '///w')
self.checkJoin('//', '..', '//')
self.checkJoin('//', '.', '//')
self.checkJoin('//', '../', '//')
self.checkJoin('//', './', '//')
self.checkJoin('//', '..//', '///')
self.checkJoin('//', './/', '///')

self.checkJoin('//a', '', '//a')
self.checkJoin('//a', '//', '//a')
Expand All @@ -653,6 +671,16 @@ def test_urljoins_relative_base(self):
self.checkJoin('//a', '/w', '//a/w')
self.checkJoin('//a', '///w', '//a/w')
self.checkJoin('//a', 'w', '//a/w')
self.checkJoin('//a', '../w', '//a/w')
self.checkJoin('//a', './w', '//a/w')
self.checkJoin('//a', '..//w', '//a/w')
self.checkJoin('//a', './/w', '//a/w')
self.checkJoin('//a', '..', '//a')
self.checkJoin('//a', '.', '//a')
self.checkJoin('//a', '../', '//a')
self.checkJoin('//a', './', '//a')
self.checkJoin('//a', '..//', '//a/')
self.checkJoin('//a', './/', '//a/')

for scheme in '', 'http:':
self.checkJoin('http:', scheme + '', 'http:')
Expand All @@ -661,7 +689,21 @@ def test_urljoins_relative_base(self):
self.checkJoin('http:', scheme + '//v/w', 'http://v/w')
self.checkJoin('http:', scheme + '/w', 'http:/w')
self.checkJoin('http:', scheme + '///w', 'http:/w')
self.checkJoin('http:', scheme + 'w', 'http:/w')
self.checkJoin('http:', scheme + 'w', 'http:w')
self.checkJoin('http:', scheme + '../w', 'http:w')
self.checkJoin('http:', scheme + './w', 'http:w')
self.checkJoin('http:', scheme + '..//w', 'http:/w')
self.checkJoin('http:', scheme + './/w', 'http:/w')
self.checkJoin('http:', scheme + '..///w', 'http:////w')
self.checkJoin('http:', scheme + './//w', 'http:////w')
self.checkJoin('http:', scheme + '..', 'http:')
self.checkJoin('http:', scheme + '.', 'http:')
self.checkJoin('http:', scheme + '../', 'http:')
self.checkJoin('http:', scheme + './', 'http:')
self.checkJoin('http:', scheme + '..//', 'http:/')
self.checkJoin('http:', scheme + './/', 'http:/')
self.checkJoin('http:', scheme + '..///', 'http:////')
self.checkJoin('http:', scheme + './//', 'http:////')

self.checkJoin('http://', scheme + '', 'http://')
self.checkJoin('http://', scheme + '//', 'http://')
Expand All @@ -670,6 +712,20 @@ def test_urljoins_relative_base(self):
self.checkJoin('http://', scheme + '/w', 'http:///w')
self.checkJoin('http://', scheme + '///w', 'http:///w')
self.checkJoin('http://', scheme + 'w', 'http:///w')
self.checkJoin('http://', scheme + '../w', 'http:///w')
self.checkJoin('http://', scheme + './w', 'http:///w')
self.checkJoin('http://', scheme + '..//w', 'http:///w')
self.checkJoin('http://', scheme + './/w', 'http:///w')
self.checkJoin('http://', scheme + '..///w', 'http:////w')
self.checkJoin('http://', scheme + './//w', 'http:////w')
self.checkJoin('http://', scheme + '..', 'http://')
self.checkJoin('http://', scheme + '.', 'http://')
self.checkJoin('http://', scheme + '../', 'http://')
self.checkJoin('http://', scheme + './', 'http://')
self.checkJoin('http://', scheme + '..//', 'http:///')
self.checkJoin('http://', scheme + './/', 'http:///')
self.checkJoin('http://', scheme + '..///', 'http:////')
self.checkJoin('http://', scheme + './//', 'http:////')

self.checkJoin('http://a', scheme + '', 'http://a')
self.checkJoin('http://a', scheme + '//', 'http://a')
Expand All @@ -678,6 +734,38 @@ def test_urljoins_relative_base(self):
self.checkJoin('http://a', scheme + '/w', 'http://a/w')
self.checkJoin('http://a', scheme + '///w', 'http://a/w')
self.checkJoin('http://a', scheme + 'w', 'http://a/w')
self.checkJoin('http://a', scheme + '../w', 'http://a/w')
self.checkJoin('http://a', scheme + './w', 'http://a/w')
self.checkJoin('http://a', scheme + '..//w', 'http://a/w')
self.checkJoin('http://a', scheme + './/w', 'http://a/w')
self.checkJoin('http://a', scheme + '..///w', 'http://a//w')
self.checkJoin('http://a', scheme + './//w', 'http://a//w')
self.checkJoin('http://a', scheme + '..', 'http://a')
self.checkJoin('http://a', scheme + '.', 'http://a')
self.checkJoin('http://a', scheme + '../', 'http://a')
self.checkJoin('http://a', scheme + './', 'http://a')
self.checkJoin('http://a', scheme + '..//', 'http://a/')
self.checkJoin('http://a', scheme + './/', 'http://a/')
self.checkJoin('http://a', scheme + '..///', 'http://a//')
self.checkJoin('http://a', scheme + './//', 'http://a//')

self.checkJoin('b/c', '', 'b/c')
self.checkJoin('b/c', '//', 'b/c')
self.checkJoin('b/c', '//v', '//v')
self.checkJoin('b/c', '//v/w', '//v/w')
self.checkJoin('b/c', '/w', '/w')
self.checkJoin('b/c', '///w', '/w')
self.checkJoin('b/c', 'w', 'b/w')
self.checkJoin('b/c', '../w', 'w')
self.checkJoin('b/c', '../../w', 'w')
self.checkJoin('b/c', '../../../w', 'w')
self.checkJoin('b/c', 'w/.', 'b/w/')
self.checkJoin('b/c', '../w/.', 'w/')
self.checkJoin('b/c', '../../w/.', 'w/')
self.checkJoin('b/c', '../../../w/.', 'w/')
self.checkJoin('b/c', '..', '')
self.checkJoin('b/c', '../..', '')
self.checkJoin('b/c', '../../..', '')

self.checkJoin('/b/c', '', '/b/c')
self.checkJoin('/b/c', '//', '/b/c')
Expand All @@ -686,6 +774,16 @@ def test_urljoins_relative_base(self):
self.checkJoin('/b/c', '/w', '/w')
self.checkJoin('/b/c', '///w', '/w')
self.checkJoin('/b/c', 'w', '/b/w')
self.checkJoin('/b/c', '../w', '/w')
self.checkJoin('/b/c', '../../w', '/w')
self.checkJoin('/b/c', '../../../w', '/w')
self.checkJoin('/b/c', 'w/.', '/b/w/')
self.checkJoin('/b/c', '../w/.', '/w/')
self.checkJoin('/b/c', '../../w/.', '/w/')
self.checkJoin('/b/c', '../../../w/.', '/w/')
self.checkJoin('/b/c', '..', '/')
self.checkJoin('/b/c', '../..', '/')
self.checkJoin('/b/c', '../../..', '/')

self.checkJoin('///b/c', '', '///b/c')
self.checkJoin('///b/c', '//', '///b/c')
Expand Down
34 changes: 12 additions & 22 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,31 +594,23 @@ def urljoin(base, url, allow_fragments=True):
return _coerce_result(_urlunsplit(scheme, netloc, path,
query, fragment))

base_parts = bpath.split('/')
if base_parts[-1] != '':
# the last item is not a directory, so will not be taken into account
# in resolving the relative path
del base_parts[-1]

# for rfc3986, ignore all base path should the first character be root.
if path[:1] == '/':
segments = path.split('/')
else:
segments = base_parts + path.split('/')
# filter out elements that would cause redundant slashes on re-joining
# the resolved_path
segments[1:-1] = filter(None, segments[1:-1])
if path[:1] != '/' and '/' in bpath:
path = bpath.rsplit('/', 1)[0] + '/' + path

resolved_path = []
path = _remove_dot_segments(path)
return _coerce_result(_urlunsplit(scheme, netloc, path, query, fragment))

def _remove_dot_segments(path):
print(path)
segments = path.split('/')
min_len = 0 if segments[0] else 1

resolved_path = []
for seg in segments:
if seg == '..':
try:
if len(resolved_path) > min_len:
resolved_path.pop()
except IndexError:
# ignore any .. segments that would otherwise cause an IndexError
# when popped from resolved_path if resolving for rfc3986
pass
elif seg == '.':
continue
else:
Expand All @@ -629,9 +621,7 @@ def urljoin(base, url, allow_fragments=True):
# then we need to append the trailing '/'
resolved_path.append('')

return _coerce_result(_urlunsplit(scheme, netloc, '/'.join(
resolved_path) or '/', query, fragment))

return '/'.join(resolved_path)

def urldefrag(url):
"""Removes any existing fragment from URL.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Preserve double slashes in path in :func:`urllib.parse.urljoin`. Fix
:func:`!urljoin` for the case when the base path is relative and the
relative reference path starts with '..'.

0 comments on commit 659a910

Please sign in to comment.