Skip to content

Commit

Permalink
pythonGH-79634: Accept path-like objects as pathlib glob patterns. (python#114017)
Browse files Browse the repository at this point in the history

Allow `os.PathLike` objects to be passed as patterns to `pathlib.Path.glob()` and `rglob()`. (It's already possible to use them in `PurePath.match()`.)

While we're in the area:

- Allow empty glob patterns in `PathBase` (but not `Path`)
- Speed up globbing in `PathBase` by generating paths with trailing slashes only as a final step, rather than for every intermediate directory.
- Simplify and speed up handling of rare patterns involving both `**` and `..` segments.
  • Loading branch information
barneygale authored and Glyphack committed Jan 27, 2024
1 parent 8108589 commit e12cf3d
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 72 deletions.
6 changes: 6 additions & 0 deletions Doc/library/pathlib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,9 @@ call fails (for example because the path doesn't exist).
future Python release, patterns with this ending will match both files
and directories. Add a trailing slash to match only directories.

.. versionchanged:: 3.13
The *pattern* parameter accepts a :term:`path-like object`.

.. method:: Path.group(*, follow_symlinks=True)

Return the name of the group owning the file. :exc:`KeyError` is raised
Expand Down Expand Up @@ -1498,6 +1501,9 @@ call fails (for example because the path doesn't exist).
.. versionchanged:: 3.13
The *follow_symlinks* parameter was added.

.. versionchanged:: 3.13
The *pattern* parameter accepts a :term:`path-like object`.

.. method:: Path.rmdir()

Remove this directory. The directory must be empty.
Expand Down
49 changes: 31 additions & 18 deletions Lib/pathlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,29 @@ def as_uri(self):
from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path))

@property
def _pattern_stack(self):
    """Stack of path components, to be used with patterns in glob().

    The components of this pattern are returned as a new list in reverse
    order, so the first component to match is the last list item and can
    be taken with ``list.pop()``.  An empty string component stands for a
    trailing slash in the pattern.

    Raises:
        NotImplementedError: if the pattern has an anchor.
        ValueError: if the pattern has no components.
    """
    # Copy the components: the list is appended to and reversed below.
    parts = self._tail.copy()
    pattern = self._raw_path
    if self.anchor:
        raise NotImplementedError("Non-relative patterns are unsupported")
    elif not parts:
        raise ValueError(f"Unacceptable pattern: {pattern!r}")
    elif pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
        # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
        parts.append('')
    elif parts[-1] == '**':
        # GH-70303: '**' only matches directories. Add trailing slash.
        # The stack level of 4 skips this property, PathBase.glob() and
        # Path.glob()/rglob(), so the warning points at their caller.
        warnings.warn(
            "Pattern ending '**' will match files and directories in a "
            "future Python release. Add a trailing slash to match only "
            "directories and remove this warning.",
            FutureWarning, 4)
        parts.append('')
    parts.reverse()
    return parts


# Subclassing os.PathLike makes isinstance() checks slower,
# which in turn makes Path construction slower. Register instead!
Expand Down Expand Up @@ -580,7 +603,7 @@ def iterdir(self):
def _scandir(self):
    """Return the ``os.scandir()`` iterator for this path."""
    return os.scandir(self)

def _make_child_entry(self, entry, is_dir=False):
def _make_child_entry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
Expand All @@ -591,6 +614,8 @@ def _make_child_entry(self, entry, is_dir=False):
return path

def _make_child_relpath(self, name):
if not name:
return self
path_str = str(self)
tail = self._tail
if tail:
Expand All @@ -611,14 +636,8 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
kind, including directories) matching the given relative pattern.
"""
sys.audit("pathlib.Path.glob", self, pattern)
if pattern.endswith('**'):
# GH-70303: '**' only matches directories. Add trailing slash.
warnings.warn(
"Pattern ending '**' will match files and directories in a "
"future Python release. Add a trailing slash to match only "
"directories and remove this warning.",
FutureWarning, 2)
pattern = f'{pattern}/'
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)

Expand All @@ -628,15 +647,9 @@ def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
this subtree.
"""
sys.audit("pathlib.Path.rglob", self, pattern)
if pattern.endswith('**'):
# GH-70303: '**' only matches directories. Add trailing slash.
warnings.warn(
"Pattern ending '**' will match files and directories in a "
"future Python release. Add a trailing slash to match only "
"directories and remove this warning.",
FutureWarning, 2)
pattern = f'{pattern}/'
pattern = f'**/{pattern}'
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
pattern = '**' / pattern
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)

Expand Down
98 changes: 47 additions & 51 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ def _compile_pattern(pat, sep, case_sensitive):
return re.compile(regex, flags=flags).match


def _select_special(paths, part):
    """Yield special literal children of the given paths.

    For each path in *paths*, lazily yield the result of
    ``path._make_child_relpath(part)``.  glob() uses this for the
    pattern components '', '.' and '..'.
    """
    for path in paths:
        yield path._make_child_relpath(part)


def _select_children(parent_paths, dir_only, follow_symlinks, match):
"""Yield direct children of given paths, filtering by name and type."""
if follow_symlinks is None:
Expand All @@ -84,7 +90,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
except OSError:
continue
if match(entry.name):
yield parent_path._make_child_entry(entry, dir_only)
yield parent_path._make_child_entry(entry)


def _select_recursive(parent_paths, dir_only, follow_symlinks):
Expand All @@ -107,7 +113,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
for entry in entries:
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
paths.append(path._make_child_entry(entry, dir_only))
paths.append(path._make_child_entry(entry))
continue
except OSError:
pass
Expand Down Expand Up @@ -427,6 +433,14 @@ def is_absolute(self):
a drive)."""
return self.pathmod.isabs(self._raw_path)

@property
def _pattern_stack(self):
    """Stack of path components, to be used with patterns in glob().

    Returns the component list taken from ``self._stack`` once the
    pattern has been checked to be relative.

    Raises:
        NotImplementedError: if ``self._stack`` reports a non-empty anchor.
    """
    anchor, parts = self._stack
    if anchor:
        raise NotImplementedError("Non-relative patterns are unsupported")
    return parts

def match(self, path_pattern, *, case_sensitive=None):
"""
Return True if this path matches the given pattern.
Expand All @@ -436,11 +450,10 @@ def match(self, path_pattern, *, case_sensitive=None):
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.pathmod)
sep = path_pattern.pathmod.sep
pattern_str = str(path_pattern)
if path_pattern.anchor:
pass
pattern_str = str(path_pattern)
elif path_pattern.parts:
pattern_str = f'**{sep}{pattern_str}'
pattern_str = str('**' / path_pattern)
else:
raise ValueError("empty pattern")
match = _compile_pattern(pattern_str, sep, case_sensitive)
Expand Down Expand Up @@ -714,10 +727,8 @@ def _scandir(self):
from contextlib import nullcontext
return nullcontext(self.iterdir())

def _make_child_entry(self, entry, is_dir=False):
def _make_child_entry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
if is_dir:
return entry.joinpath('')
return entry

def _make_child_relpath(self, name):
Expand All @@ -727,57 +738,35 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
"""Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern.
"""
path_pattern = self.with_segments(pattern)
if path_pattern.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
elif not path_pattern.parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))

pattern_parts = list(path_pattern.parts)
if not self.pathmod.split(pattern)[1]:
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
pattern_parts.append('')

if not isinstance(pattern, PurePathBase):
pattern = self.with_segments(pattern)
if case_sensitive is None:
# TODO: evaluate case-sensitivity of each directory in _select_children().
case_sensitive = _is_case_sensitive(self.pathmod)

# If symlinks are handled consistently, and the pattern does not
# contain '..' components, then we can use a 'walk-and-match' strategy
# when expanding '**' wildcards. When a '**' wildcard is encountered,
# all following pattern parts are immediately consumed and used to
# build a `re.Pattern` object. This pattern is used to filter the
# recursive walk. As a result, pattern parts following a '**' wildcard
# do not perform any filesystem access, which can be much faster!
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
stack = pattern._pattern_stack
specials = ('', '.', '..')
filter_paths = False
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self.joinpath('')] if self.is_dir() else [])
part_idx = 0
while part_idx < len(pattern_parts):
part = pattern_parts[part_idx]
part_idx += 1
if part == '':
# Trailing slash.
pass
elif part == '..':
paths = (path._make_child_relpath('..') for path in paths)
while stack:
part = stack.pop()
if part in specials:
paths = _select_special(paths, part)
elif part == '**':
# Consume adjacent '**' components.
while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
part_idx += 1

if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
dir_only = pattern_parts[-1] == ''
paths = _select_recursive(paths, dir_only, follow_symlinks)
while stack and stack[-1] == '**':
stack.pop()

# Filter out paths that don't match pattern.
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(str(path_pattern), sep, case_sensitive)
paths = (path for path in paths if match(str(path), prefix_len))
return paths
# Consume adjacent non-special components and enable post-walk
# regex filtering, provided we're treating symlinks consistently.
if follow_symlinks is not None:
while stack and stack[-1] not in specials:
filter_paths = True
stack.pop()

dir_only = part_idx < len(pattern_parts)
dir_only = bool(stack)
paths = _select_recursive(paths, dir_only, follow_symlinks)
if deduplicate_paths:
# De-duplicate if we've already seen a '**' component.
Expand All @@ -786,18 +775,25 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
dir_only = part_idx < len(pattern_parts)
dir_only = bool(stack)
match = _compile_pattern(part, sep, case_sensitive)
paths = _select_children(paths, dir_only, follow_symlinks, match)
if filter_paths:
# Filter out paths that don't match pattern.
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(str(pattern), sep, case_sensitive)
paths = (path for path in paths if match(str(path), prefix_len))
return paths

def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
"""Recursively yield all existing files (of any kind, including
directories) matching the given relative pattern, anywhere in
this subtree.
"""
return self.glob(
f'**/{pattern}', case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
if not isinstance(pattern, PurePathBase):
pattern = self.with_segments(pattern)
pattern = '**' / pattern
return self.glob(pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)

def walk(self, top_down=True, on_error=None, follow_symlinks=False):
"""Walk the directory tree from this directory, similar to os.walk()."""
Expand Down
23 changes: 23 additions & 0 deletions Lib/test/test_pathlib/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1818,6 +1818,13 @@ def test_walk_above_recursion_limit(self):
list(base.walk())
list(base.walk(top_down=False))

def test_glob_empty_pattern(self):
    # Patterns without any components ('' and '.') are rejected.
    p = self.cls('')
    with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
        list(p.glob(''))
    with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
        list(p.glob('.'))

def test_glob_many_open_files(self):
depth = 30
P = self.cls
Expand Down Expand Up @@ -1860,6 +1867,22 @@ def test_glob_recursive_no_trailing_slash(self):
with self.assertWarns(FutureWarning):
p.rglob('*/**')

def test_glob_pathlike(self):
    # glob() accepts the pattern as a path object of the class under
    # test and as a FakePath, giving the same results for both.
    P = self.cls
    p = P(self.base)
    pattern = "dir*/file*"
    expect = {p / "dirB/fileB", p / "dirC/fileC"}
    self.assertEqual(expect, set(p.glob(P(pattern))))
    self.assertEqual(expect, set(p.glob(FakePath(pattern))))

def test_rglob_pathlike(self):
    # rglob() accepts the pattern as a path object of the class under
    # test and as a FakePath, giving the same results for both.
    P = self.cls
    p = P(self.base, "dirC")
    pattern = "**/file*"
    expect = {p / "fileC", p / "dirD/fileD"}
    self.assertEqual(expect, set(p.rglob(P(pattern))))
    self.assertEqual(expect, set(p.rglob(FakePath(pattern))))


@only_posix
class PosixPathTest(PathTest, PurePosixPathTest):
Expand Down
9 changes: 6 additions & 3 deletions Lib/test/test_pathlib/test_pathlib_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,9 +1045,12 @@ def _check(glob, expected):
_check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"])

def test_glob_empty_pattern(self):
p = self.cls('')
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
list(p.glob(''))
def _check(glob, expected):
self.assertEqual(set(glob), { P(self.base, q) for q in expected })
P = self.cls
p = P(self.base)
_check(p.glob(""), [""])
_check(p.glob("."), ["."])

def test_glob_case_sensitive(self):
P = self.cls
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Accept :term:`path-like objects <path-like object>` as patterns in
:meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`.

0 comments on commit e12cf3d

Please sign in to comment.