Skip to content

Commit

Permalink
memoize a "master regex" instead of looping through regex patterns
Browse files Browse the repository at this point in the history
Add pylint overrides for tests which access the private memoized master regex. I think these tests
are necessary if we think memoizing the regex is a useful optimization, but this may well be
premature optimization.
  • Loading branch information
cosmicexplorer committed Oct 5, 2022
1 parent 767d266 commit da8b2ba
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 7 deletions.
82 changes: 78 additions & 4 deletions tests/arguments.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import os
import re
from tempfile import mkdtemp
from shutil import rmtree

from vermin import Backports, Features, DEFAULT_PROCESSES, CONFIG_FILE_NAMES
from vermin import Backports, Config, Features, DEFAULT_PROCESSES, CONFIG_FILE_NAMES
from vermin.utility import open_wrapper
import vermin.formats

Expand Down Expand Up @@ -231,11 +230,15 @@ def test_exclude_regex(self):
self.assertContainsDict({"code": 1}, self.parse_args(["--exclude-regex"])) # Needs <rx> part.
self.assertEmpty(self.config.exclusion_regex())

# Nothing should be excluded before any regex are added.
self.assertFalse(self.config.is_excluded_by_regex("asdf.py"))

args = ["--exclude-regex", r"\.pyi$",
"--exclude-regex", "^a/b$"]
self.assertContainsDict({"code": 0}, self.parse_args(args))
expected = [re.compile(r"\.pyi$"), re.compile("^a/b$")]
self.assertEqual(expected, self.config.exclusion_regex()) # Expect it sorted.

self.assertEqual([r"\.pyi$", "^a/b$"], self.config.exclusion_regex()) # Expect it sorted.

self.assertFalse(self.config.is_excluded_by_regex("a/b.py"))
self.assertTrue(self.config.is_excluded_by_regex("asdf.pyi"))
self.assertTrue(self.config.is_excluded_by_regex("a/m.pyi"))
Expand All @@ -249,6 +252,7 @@ def test_exclude_regex(self):
self.assertFalse(self.config.is_excluded_by_regex("a/b/c.py"))

self.config.reset()
self.assertEmpty(self.config.exclusion_regex())
args = ["--exclude-regex", "^a/b/.+$",
"--exclude-regex", r"^a/.+/.+\.pyi$"]
self.assertContainsDict({"code": 0}, self.parse_args(args))
Expand All @@ -269,6 +273,76 @@ def test_exclude_regex(self):
self.assertContainsDict({"code": 0}, self.parse_args(["--no-exclude-regex"]))
self.assertEmpty(self.config.exclusion_regex())

def test_exclude_regex_master_invalidation(self):
self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access

self.config.add_exclusion_regex(r"\.pyi$")

# The memoized master regex should not be set until a match is attempted.
self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access
self.assertTrue(self.config.is_excluded_by_regex("a/b.pyi"))
# After matching, the master regex should be memoized.
self.assertIsNotNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access

merged_rx = self.config._merged_exclusion_regex # pylint: disable=protected-access

self.config.add_exclusion_regex(r"\.py$")
# After adding a new regex, the memoization should be invalidated.
self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access
self.assertTrue(self.config.is_excluded_by_regex("a/b.py"))
# The memoization should now be reconstructed after attempting a match.
self.assertIsNotNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access

# Even though compiled regex are interned, adding a new exclude regex should produce a different
# memoized master regex.
self.assertNotEqual(
merged_rx,
self.config._merged_exclusion_regex, # pylint: disable=protected-access
)

self.config.reset()
# After clearing the exclusions, the memoization should be invalidated.
self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access

def test_exclude_regex_other_config(self):
"""Test a few properties of regex exclusion with multiple config instances."""

# Create another config object with its own exclude patterns.
other_config = Config()
other_config.add_exclusion_regex(r"\.py$")

# Verify that a memoization is generated for the separate config.
self.assertIsNone(other_config._merged_exclusion_regex) # pylint: disable=protected-access
self.assertTrue(other_config.is_excluded_by_regex("a/b.py"))
self.assertIsNotNone(other_config._merged_exclusion_regex) # pylint: disable=protected-access

# Initialize the current config again, and see that a memoization is generated.
self.config.add_exclusion_regex(r"\.py$")
self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access
self.assertTrue(self.config.is_excluded_by_regex("a/b.py"))
self.assertIsNotNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access

# Since the two configs have the same pattern inputs, their memoized regex should also be
# the same.
self.assertEqual(
self.config._merged_exclusion_regex, # pylint: disable=protected-access
other_config._merged_exclusion_regex, # pylint: disable=protected-access
)

# Clobber the current config and read from the other one.
self.config.override_from(other_config)
# The memoization should be invalidated after an override_from() call.
self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access

# Generate the memoization again.
self.assertTrue(self.config.is_excluded_by_regex("a/b.py"))
self.assertIsNotNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access
# The memoized regex should still be the same as before.
self.assertEqual(
self.config._merged_exclusion_regex, # pylint: disable=protected-access
other_config._merged_exclusion_regex, # pylint: disable=protected-access
)

def test_make_paths_absolute(self):
self.assertTrue(self.config.make_paths_absolute())

Expand Down
35 changes: 32 additions & 3 deletions vermin/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def reset(self):
self.__analyze_hidden = False
self.__exclusions = set()
self.__exclusion_regex = set()
self._clear_memoized_master_regex()
self.__make_paths_absolute = True
self.__backports = set()
self.__features = set()
Expand All @@ -52,6 +53,7 @@ def override_from(self, other_config):
self.__analyze_hidden = other_config.analyze_hidden()
self.__exclusions = set(other_config.exclusions())
self.__exclusion_regex = set(other_config.exclusion_regex())
self._clear_memoized_master_regex()
self.__make_paths_absolute = other_config.make_paths_absolute()
self.__backports = other_config.backports()
self.__features = other_config.features()
Expand Down Expand Up @@ -295,8 +297,16 @@ def set_ignore_incomp(self, ignore):
def add_exclusion(self, name):
self.__exclusions.add(name)

def _clear_memoized_master_regex(self):
"""Delete the master regex which joins all exclusion patterns.
We merge all our regex patterns together as an optimization, but this memoized "master regex" is
invalidated whenever the list of exclusion regexes is updated, and must be reset."""
self._merged_exclusion_regex = None

def add_exclusion_regex(self, pattern):
self.__exclusion_regex.add(re.compile(pattern))
self._clear_memoized_master_regex()
self.__exclusion_regex.add(pattern)

def add_exclusion_file(self, filename):
try:
Expand All @@ -310,6 +320,7 @@ def clear_exclusions(self):
self.__exclusions.clear()

def clear_exclusion_regex(self):
self._clear_memoized_master_regex()
self.__exclusion_regex.clear()

def exclusions(self):
Expand All @@ -319,7 +330,7 @@ def exclusions(self):

def exclusion_regex(self):
res = list(self.__exclusion_regex)
res.sort(key=lambda rx: rx.pattern)
res.sort()
return res

def is_excluded(self, name):
Expand All @@ -335,7 +346,25 @@ def is_excluded_codecs_encoding(self, name):
return "ce={}".format(name) in self.__exclusions

def is_excluded_by_regex(self, path):
return any(regex.search(path) for regex in self.__exclusion_regex)
"""Check whether `path` matches any of the registered exclusion regex patterns."""
# If no regex patterns are provided, then this method never discards any path. This early check
# also avoids attempting to construct a merged regex from zero patterns below.
if not self.__exclusion_regex:
return False
# If any regex patterns are provided to exclude files against, we OR them together with '|' and
# search against them all at once. This memoization with a private member variable means we
# should only compile any regex at most once during vermin's execution, and we can lean on the
# regex engine to amortize the work of searching multiple patterns instead of matching regex in
# a python loop.
if self._merged_exclusion_regex is None:
# Because self.exclusion_regex() sorts its output, we should always get the same merged regex
# regardless of the order in which exclude patterns are added or the python set iteration
# order. We don't rely on that idempotency anywhere, though.
self._merged_exclusion_regex = re.compile('|'.join(self.exclusion_regex()))
# As described in --help, we match --exclude-regex with re.search(), requiring the user to
# provide '^' or '$' anchors to ensure the pattern matches against either end of the string and
# not somewhere in the middle.
return bool(self._merged_exclusion_regex.search(path))

def make_paths_absolute(self):
return self.__make_paths_absolute
Expand Down

0 comments on commit da8b2ba

Please sign in to comment.