From da8b2ba844a57e8476d81102b3bf227cd1176377 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 4 Oct 2022 19:03:53 -0400 Subject: [PATCH] memoize a "master regex" instead of looping through regex patterns Add pylint overrides for tests which access the private memoized master regex. I think these tests are necessary if we think memoizing the regex is a useful optimization, but this may well be premature optimization. --- tests/arguments.py | 82 +++++++++++++++++++++++++++++++++++++++++++--- vermin/config.py | 35 ++++++++++++++++++-- 2 files changed, 110 insertions(+), 7 deletions(-) diff --git a/tests/arguments.py b/tests/arguments.py index 53c5eeae..bb0e7d9a 100644 --- a/tests/arguments.py +++ b/tests/arguments.py @@ -1,9 +1,8 @@ import os -import re from tempfile import mkdtemp from shutil import rmtree -from vermin import Backports, Features, DEFAULT_PROCESSES, CONFIG_FILE_NAMES +from vermin import Backports, Config, Features, DEFAULT_PROCESSES, CONFIG_FILE_NAMES from vermin.utility import open_wrapper import vermin.formats @@ -231,11 +230,15 @@ def test_exclude_regex(self): self.assertContainsDict({"code": 1}, self.parse_args(["--exclude-regex"])) # Needs part. self.assertEmpty(self.config.exclusion_regex()) + # Nothing should be excluded before any regex are added. + self.assertFalse(self.config.is_excluded_by_regex("asdf.py")) + args = ["--exclude-regex", r"\.pyi$", "--exclude-regex", "^a/b$"] self.assertContainsDict({"code": 0}, self.parse_args(args)) - expected = [re.compile(r"\.pyi$"), re.compile("^a/b$")] - self.assertEqual(expected, self.config.exclusion_regex()) # Expect it sorted. + + self.assertEqual([r"\.pyi$", "^a/b$"], self.config.exclusion_regex()) # Expect it sorted. + self.assertFalse(self.config.is_excluded_by_regex("a/b.py")) self.assertTrue(self.config.is_excluded_by_regex("asdf.pyi")) self.assertTrue(self.config.is_excluded_by_regex("a/m.pyi")) @@ -249,6 +252,7 @@ def test_exclude_regex(self): self.assertFalse(self.config.is_excluded_by_regex("a/b/c.py")) self.config.reset() + self.assertEmpty(self.config.exclusion_regex()) args = ["--exclude-regex", "^a/b/.+$", "--exclude-regex", r"^a/.+/.+\.pyi$"] self.assertContainsDict({"code": 0}, self.parse_args(args)) @@ -269,6 +273,76 @@ def test_exclude_regex(self): self.assertContainsDict({"code": 0}, self.parse_args(["--no-exclude-regex"])) self.assertEmpty(self.config.exclusion_regex()) + def test_exclude_regex_master_invalidation(self): + self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + + self.config.add_exclusion_regex(r"\.pyi$") + + # The memoized master regex should not be set until a match is attempted. + self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + self.assertTrue(self.config.is_excluded_by_regex("a/b.pyi")) + # After matching, the master regex should be memoized. + self.assertIsNotNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + + merged_rx = self.config._merged_exclusion_regex # pylint: disable=protected-access + + self.config.add_exclusion_regex(r"\.py$") + # After adding a new regex, the memoization should be invalidated. + self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + self.assertTrue(self.config.is_excluded_by_regex("a/b.py")) + # The memoization should now be reconstructed after attempting a match. + self.assertIsNotNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + + # Even though compiled regex are interned, adding a new exclude regex should produce a different + # memoized master regex. + self.assertNotEqual( + merged_rx, + self.config._merged_exclusion_regex, # pylint: disable=protected-access + ) + + self.config.reset() + # After clearing the exclusions, the memoization should be invalidated. + self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + + def test_exclude_regex_other_config(self): + """Test a few properties of regex exclusion with multiple config instances.""" + + # Create another config object with its own exclude patterns. + other_config = Config() + other_config.add_exclusion_regex(r"\.py$") + + # Verify that a memoization is generated for the separate config. + self.assertIsNone(other_config._merged_exclusion_regex) # pylint: disable=protected-access + self.assertTrue(other_config.is_excluded_by_regex("a/b.py")) + self.assertIsNotNone(other_config._merged_exclusion_regex) # pylint: disable=protected-access + + # Initialize the current config again, and see that a memoization is generated. + self.config.add_exclusion_regex(r"\.py$") + self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + self.assertTrue(self.config.is_excluded_by_regex("a/b.py")) + self.assertIsNotNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + + # Since the two configs have the same pattern inputs, their memoized regex should also be + # the same. + self.assertEqual( + self.config._merged_exclusion_regex, # pylint: disable=protected-access + other_config._merged_exclusion_regex, # pylint: disable=protected-access + ) + + # Clobber the current config and read from the other one. + self.config.override_from(other_config) + # The memoization should be invalidated after an override_from() call. + self.assertIsNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + + # Generate the memoization again. + self.assertTrue(self.config.is_excluded_by_regex("a/b.py")) + self.assertIsNotNone(self.config._merged_exclusion_regex) # pylint: disable=protected-access + # The memoized regex should still be the same as before. + self.assertEqual( + self.config._merged_exclusion_regex, # pylint: disable=protected-access + other_config._merged_exclusion_regex, # pylint: disable=protected-access + ) + def test_make_paths_absolute(self): self.assertTrue(self.config.make_paths_absolute()) diff --git a/vermin/config.py b/vermin/config.py index 08dc114d..e9911da8 100644 --- a/vermin/config.py +++ b/vermin/config.py @@ -31,6 +31,7 @@ def reset(self): self.__analyze_hidden = False self.__exclusions = set() self.__exclusion_regex = set() + self._clear_memoized_master_regex() self.__make_paths_absolute = True self.__backports = set() self.__features = set() @@ -52,6 +53,7 @@ def override_from(self, other_config): self.__analyze_hidden = other_config.analyze_hidden() self.__exclusions = set(other_config.exclusions()) self.__exclusion_regex = set(other_config.exclusion_regex()) + self._clear_memoized_master_regex() self.__make_paths_absolute = other_config.make_paths_absolute() self.__backports = other_config.backports() self.__features = other_config.features() @@ -295,8 +297,16 @@ def set_ignore_incomp(self, ignore): def add_exclusion(self, name): self.__exclusions.add(name) + def _clear_memoized_master_regex(self): + """Delete the master regex which joins all exclusion patterns. + + We merge all our regex patterns together as an optimization, but this memoized "master regex" is + invalidated whenever the list of exclusion regexes is updated, and must be reset.""" + self._merged_exclusion_regex = None + def add_exclusion_regex(self, pattern): - self.__exclusion_regex.add(re.compile(pattern)) + self._clear_memoized_master_regex() + self.__exclusion_regex.add(pattern) def add_exclusion_file(self, filename): try: @@ -310,6 +320,7 @@ def clear_exclusions(self): self.__exclusions.clear() def clear_exclusion_regex(self): + self._clear_memoized_master_regex() self.__exclusion_regex.clear() def exclusions(self): @@ -319,7 +330,7 @@ def exclusions(self): def exclusion_regex(self): res = list(self.__exclusion_regex) - res.sort(key=lambda rx: rx.pattern) + res.sort() return res def is_excluded(self, name): @@ -335,7 +346,25 @@ def is_excluded_codecs_encoding(self, name): return "ce={}".format(name) in self.__exclusions def is_excluded_by_regex(self, path): - return any(regex.search(path) for regex in self.__exclusion_regex) + """Check whether `path` matches any of the registered exclusion regex patterns.""" + # If no regex patterns are provided, then this method never discards any path. This early check + # also avoids attempting to construct a merged regex from zero patterns below. + if not self.__exclusion_regex: + return False + # If any regex patterns are provided to exclude files against, we OR them together with '|' and + # search against them all at once. This memoization with a private member variable means we + # should only compile any regex at most once during vermin's execution, and we can lean on the + # regex engine to amortize the work of searching multiple patterns instead of matching regex in + # a python loop. + if self._merged_exclusion_regex is None: + # Because self.exclusion_regex() sorts its output, we should always get the same merged regex + # regardless of the order in which exclude patterns are added or the python set iteration + # order. We don't rely on that idempotency anywhere, though. + self._merged_exclusion_regex = re.compile('|'.join(self.exclusion_regex())) + # As described in --help, we match --exclude-regex with re.search(), requiring the user to + # provide '^' or '$' anchors to ensure the pattern matches against either end of the string and + # not somewhere in the middle. + return bool(self._merged_exclusion_regex.search(path)) def make_paths_absolute(self): return self.__make_paths_absolute