From 1acef9e1e24051643dbc7f133b9f764e7c618564 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 30 Oct 2022 05:20:03 -0400 Subject: [PATCH] add --exclude-regex and --no-make-paths-absolute to exclude specific file paths (#115) * add --exclude-glob argument to exclude file paths from detection * add --no-make-paths-absolute to allow relative globs NOTE: tests are broken here because [!/] doesn't work the way I thought it would. The next commit will change the glob option to use a regex. * change --exclude-glob to --exclude-regex - tests pass now - interface more familiar for windows users * match using re.search and use anchors * memoize a "master regex" instead of looping through regex patterns Add pylint overrides for tests which access the private memoized master regex. I think these tests are necessary if we think memoizing the regex is a useful optimization, but this may well be premature optimization. * Revert "memoize a "master regex" instead of looping through regex patterns" This reverts commit 27302a3c64802fa72bf9e05bb082f88dfbd9dd20. * respond to review comments! 
* add tests for config parsing - also add 'yes'/'no' test cases for other boolean config flags * compare exclusion_regex pattern strings * fix os path sep for regex on windows --- sample.vermin.ini | 24 ++++++++++++++ tests/arguments.py | 52 +++++++++++++++++++++++++++++ tests/config.py | 81 +++++++++++++++++++++++++++++++++++++++++++-- tests/general.py | 71 ++++++++++++++++++++++++++++++++++++++- tests/testutils.py | 10 ++++++ vermin/arguments.py | 42 +++++++++++++++++++++-- vermin/config.py | 37 ++++++++++++++++++++- vermin/detection.py | 9 ++--- vermin/main.py | 3 +- 9 files changed, 317 insertions(+), 12 deletions(-) diff --git a/sample.vermin.ini b/sample.vermin.ini index 6e0157b3..9b3df0af 100644 --- a/sample.vermin.ini +++ b/sample.vermin.ini @@ -77,6 +77,30 @@ #exclusions = # email.parser.FeedParser # argparse.ArgumentParser(allow_abbrev) +# +# +# Exclude specific file paths from being crawled by matching against a regular expression. Useful to +# ignore files that are not executed under the same python interpreter, such as .pyi files. +# +# Exclude any '.pyi' file: \.pyi$ +# +# (Note: the below examples require `make_paths_absolute = no`, or prefixing the patterns with the +# regex-escaped path to the current directory.) +# +# Exclude the directory 'a/b/': ^a/b$ +# Exclude '.pyi' files under 'a/b/': ^a/b/.+\.pyi$ +# Exclude '.pyi' files in exactly 'a/b/': ^a/b/[^/]+\.pyi$ +# +# Example regex exclusions: +#exclusion_regex = +# \.pyi$ + +### Absolute Path Resolution +# Convert any relative paths from the command line into absolute paths. This affects the path +# printed to the terminal if a file fails a check, and requires exclusion_regex patterns to +# match absolute paths. 
+# +#make_paths_absolute = yes ### Backports ### # Some features are sometimes backported into packages, in repositories such as PyPi, that are diff --git a/tests/arguments.py b/tests/arguments.py index a84f76fc..1b13ba75 100644 --- a/tests/arguments.py +++ b/tests/arguments.py @@ -226,6 +226,58 @@ def test_exclude_file(self): self.assertContainsDict({"code": 0}, self.parse_args(["--exclude-file", fn])) self.assertEmpty(self.config.exclusions()) + def test_exclude_regex(self): + self.assertContainsDict({"code": 1}, self.parse_args(["--exclude-regex"])) # Needs part. + self.assertEmpty(self.config.exclusion_regex()) + + args = ["--exclude-regex", r"\.pyi$", + "--exclude-regex", "^a/b$"] + self.assertContainsDict({"code": 0}, self.parse_args(args)) + expected = [r"\.pyi$", "^a/b$"] + self.assertEqual(expected, self.config.exclusion_regex()) # Expect it sorted. + self.assertFalse(self.config.is_excluded_by_regex("a/b.py")) + self.assertTrue(self.config.is_excluded_by_regex("asdf.pyi")) + self.assertTrue(self.config.is_excluded_by_regex("a/m.pyi")) + self.assertTrue(self.config.is_excluded_by_regex("a/b")) + + # Regex patterns are applied at each level of directory traversal. If 'a/' is provided on the + # command line, then the regex 'a/b' will match the recursive traversal when it encounters the + # directory 'a/b', and avoid recursing into that directory. This makes it more efficient to use + # when possible, but more difficult to test in isolation. test_exclude_regex_relative() in + # general.py tests that 'a/b' excludes e.g. 'a/b/c.py'. 
+ self.assertFalse(self.config.is_excluded_by_regex("a/b/c.py")) + + self.config.reset() + self.assertEmpty(self.config.exclusion_regex()) + args = ["--exclude-regex", "^a/b/.+$", + "--exclude-regex", r"^a/.+/.+\.pyi$"] + self.assertContainsDict({"code": 0}, self.parse_args(args)) + self.assertTrue(self.config.is_excluded_by_regex("a/b/c.py")) + self.assertTrue(self.config.is_excluded_by_regex("a/b/c/d.py")) + # '.+/.+\.pyi' does not match .pyi files in the top-level directory. + self.assertFalse(self.config.is_excluded_by_regex("a/m.pyi")) + self.assertTrue(self.config.is_excluded_by_regex("a/d/m.pyi")) + self.assertFalse(self.config.is_excluded_by_regex("m.pyi")) + + self.config.reset() + # Use '[^/]+' instead of '.+' to force only matching files in the top-level. + self.assertContainsDict({"code": 0}, self.parse_args(["--exclude-regex", r"^a/b/[^/]+\.pyi$"])) + self.assertTrue(self.config.is_excluded_by_regex("a/b/c.pyi")) + self.assertFalse(self.config.is_excluded_by_regex("a/b/c.py")) + self.assertFalse(self.config.is_excluded_by_regex("a/b/c/d.pyi")) + + self.assertContainsDict({"code": 0}, self.parse_args(["--no-exclude-regex"])) + self.assertEmpty(self.config.exclusion_regex()) + + def test_make_paths_absolute(self): + self.assertTrue(self.config.make_paths_absolute()) + + self.assertContainsDict({"code": 0}, self.parse_args(["--no-make-paths-absolute"])) + self.assertFalse(self.config.make_paths_absolute()) + + self.assertContainsDict({"code": 0}, self.parse_args(["--make-paths-absolute"])) + self.assertTrue(self.config.make_paths_absolute()) + def test_backport(self): # Needs part. 
self.assertContainsDict({"code": 1}, self.parse_args(["--backport"])) diff --git a/tests/config.py b/tests/config.py index 6d791d7e..79be487b 100644 --- a/tests/config.py +++ b/tests/config.py @@ -78,6 +78,8 @@ def test_repr(self): show_tips = {} analyze_hidden = {} exclusions = {} + exclusion_regex = {} + make_paths_absolute = {} backports = {} features = {} targets = {} @@ -89,8 +91,9 @@ def test_repr(self): )""".format(self.config.__class__.__name__, self.config.quiet(), self.config.verbose(), self.config.print_visits(), self.config.processes(), self.config.ignore_incomp(), self.config.pessimistic(), self.config.show_tips(), self.config.analyze_hidden(), - self.config.exclusions(), list(self.config.backports()), list(self.config.features()), - self.config.targets(), self.config.eval_annotations(), + self.config.exclusions(), self.config.exclusion_regex(), + self.config.make_paths_absolute(), list(self.config.backports()), + list(self.config.features()), self.config.targets(), self.config.eval_annotations(), self.config.only_show_violations(), self.config.parse_comments(), self.config.scan_symlink_folders(), self.config.format().name())) @@ -192,6 +195,12 @@ def test_parse_invalid_verbose(self): """, True], [u"""[vermin] print_visits = False +""", False], + [u"""[vermin] +print_visits = yes +""", True], + [u"""[vermin] +print_visits = no """, False], ]) def test_parse_print_visits(self, data, expected): @@ -238,6 +247,12 @@ def test_parse_invalid_processes(self): """, True], [u"""[vermin] ignore_incomp = False +""", False], + [u"""[vermin] +ignore_incomp = yes +""", True], + [u"""[vermin] +ignore_incomp = no """, False], ]) def test_parse_ignore_incomp(self, data, expected): @@ -257,6 +272,12 @@ def test_parse_ignore_incomp(self, data, expected): """, True], [u"""[vermin] pessimistic = False +""", False], + [u"""[vermin] +pessimistic = yes +""", True], + [u"""[vermin] +pessimistic = no """, False], ]) def test_parse_pessimistic(self, data, expected): @@ -276,6 
+297,12 @@ def test_parse_pessimistic(self, data, expected): """, False], [u"""[vermin] show_tips = True +""", True], + [u"""[vermin] +show_tips = no +""", False], + [u"""[vermin] +show_tips = yes """, True], ]) def test_parse_show_tips(self, data, expected): @@ -308,6 +335,56 @@ def test_parse_exclusions(self, data, expected): self.assertIsNotNone(config) self.assertEqual(config.exclusions(), expected) + @VerminTest.parameterized_args([ + [u"""[vermin] +exclusion_regex = +""", []], + [u"""[vermin] +#exclusion_regex = \\.pyi$ +""", []], + [u"""[vermin] +exclusion_regex = \\.pyi$ +""", [r"\.pyi$"]], + [u"""[vermin] +exclusion_regex = \\.pyi$ + ^a/b$ +""", [r"\.pyi$", r"^a/b$"]], + [u"""[vermin] +exclusion_regex = + ^a/b$ + \\.pyi$ +""", [r"\.pyi$", r"^a/b$"]], + ]) + def test_parse_exclusion_regex(self, data, expected): + config = Config.parse_data(data) + self.assertIsNotNone(config) + self.assertEqual(config.exclusion_regex(), expected) + + @VerminTest.parameterized_args([ + [u"""[vermin] +make_paths_absolute = +""", True], + [u"""[vermin] +#make_paths_absolute = False +""", True], + [u"""[vermin] +make_paths_absolute = yes +""", True], + [u"""[vermin] +make_paths_absolute = no +""", False], + [u"""[vermin] +make_paths_absolute = True +""", True], + [u"""[vermin] +make_paths_absolute = False +""", False], + ]) + def test_parse_make_paths_absolute(self, data, expected): + config = Config.parse_data(data) + self.assertIsNotNone(config) + self.assertEqual(config.make_paths_absolute(), expected) + def test_parse_backports(self): bps = Backports.modules() config = Config.parse_data(u"""[vermin] diff --git a/tests/general.py b/tests/general.py index 0238da12..2ffe87e3 100644 --- a/tests/general.py +++ b/tests/general.py @@ -1,5 +1,6 @@ import sys import os +import re import io from os.path import abspath, basename, join, splitext from tempfile import NamedTemporaryFile, mkdtemp @@ -13,7 +14,8 @@ from vermin.formats import ParsableFormat from vermin.utility import 
open_wrapper -from .testutils import VerminTest, current_version, ScopedTemporaryFile, detect, visit, touch +from .testutils import VerminTest, current_version, ScopedTemporaryFile, detect, visit, touch, \ + working_dir class VerminGeneralTests(VerminTest): def test_detect_without_config(self): @@ -314,6 +316,73 @@ def test_detect_vermin_paths_no_invalid_exts(self): rmtree(tmp_fld) + def test_exclude_pyi_regex(self): + tmp_fld = mkdtemp() + + # With the default of --make-paths-absolute, this will match .pyi files in any subdirectory. The + # most common use case for --exclude-regex is expected to be for file extensions, so it's great + # that will work regardless of the --make-paths-absolute setting. + self.config.add_exclusion_regex(r"\.pyi$") + + f = touch(tmp_fld, "code.pyi") + with open_wrapper(f, mode="w", encoding="utf-8") as fp: + fp.write("print('this is code')") + + paths = detect_paths([tmp_fld], config=self.config) + self.assertEmpty(paths) + + rmtree(tmp_fld) + + def test_exclude_directory_regex(self): + tmp_fld = mkdtemp() + + # Excluding the directory .../a should exclude any files recursively beneath it as well. + self.config.add_exclusion_regex('^' + re.escape(join(tmp_fld, "a")) + '$') + + # Create .../a and .../a/b directories. + os.mkdir(join(tmp_fld, "a")) + os.mkdir(join(tmp_fld, "a/b")) + + paths = ["code.py", "a/code.py", "a/b/code.py"] + for p in paths: + f = touch(tmp_fld, p) + with open_wrapper(f, mode="w", encoding="utf-8") as fp: + fp.write("print('this is code')") + + paths = detect_paths([tmp_fld], config=self.config) + self.assertEqual(paths, [join(tmp_fld, "code.py")]) + + rmtree(tmp_fld) + + def test_exclude_regex_relative(self): + tmp_fld = mkdtemp() + + # Keep paths relative, and provide patterns matching relative paths. 
+ self.config.set_make_paths_absolute(False) + self.config.add_exclusion_regex("^a{0}b$".format(re.escape(os.path.sep))) + self.config.add_exclusion_regex("^a{0}.+pyi$".format(re.escape(os.path.sep))) + + # Create .../a and .../a/b directories. + os.mkdir(join(tmp_fld, "a")) + os.mkdir(join(tmp_fld, "a", "b")) + + paths = [ + join("a", "code.py"), + join("a", "code.pyi"), + join("a", "b", "code.py"), + ] + for p in paths: + f = touch(tmp_fld, p) + with open_wrapper(f, mode="w", encoding="utf-8") as fp: + fp.write("print('this is code')") + + # Temporarily modify the working directory. + with working_dir(tmp_fld): + paths = detect_paths(["a"], config=self.config) + self.assertEqual(paths, [join("a", "code.py")]) + + rmtree(tmp_fld) + def test_detect_vermin_min_versions(self): paths = detect_paths([abspath("vermin")], config=self.config) processor = Processor() diff --git a/tests/testutils.py b/tests/testutils.py index 3e88ef63..01b7c140 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -1,6 +1,7 @@ import unittest import sys import os +from contextlib import contextmanager from os.path import join from tempfile import NamedTemporaryFile @@ -18,6 +19,15 @@ def touch(fld, name, contents=None): fp.close() return filename +@contextmanager +def working_dir(path): + prev_wd = os.getcwd() + try: + os.chdir(path) + yield + finally: + os.chdir(prev_wd) + class VerminTest(unittest.TestCase): """General test case class for all Vermin tests.""" diff --git a/vermin/arguments.py b/vermin/arguments.py index 68ddffb6..37bc9d30 100644 --- a/vermin/arguments.py +++ b/vermin/arguments.py @@ -22,12 +22,13 @@ def print_usage(full=False): print("\nFor full help and options, use `-h` or `--help`.") print("\nHeuristics are employed to determine which files to analyze:\n" - " - 'py', 'py3', 'pyw', 'pyj', 'pyi' are always scanned\n" + " - 'py', 'py3', 'pyw', 'pyj', 'pyi' are always scanned (unless otherwise excluded)\n" " - 'pyc', 'pyd', 'pxd', 'pyx', 'pyo' are ignored (including 
various other files)\n" " - Magic lines with 'python' are accepted, like: #!/usr/bin/env python\n" " - Files that cannot be opened for reading as text devices are ignored") - print("\nHowever, files directly specified are always attempted parsing, even without\n" - "accepted extensions or heuristics.") + print("\nHowever, Vermin will always attempt to parse any file paths directly specified on\n" + "the command line, even without accepted extensions or heuristics, unless otherwise\n" + "excluded.") print("\nResults interpretation:") print(" ~2 No known reason it won't work with py2.") print(" !2 It is known that it won't work with py2.") @@ -153,6 +154,26 @@ def print_usage(full=False): " line constitutes an exclusion with the same format as with --exclude.") print("\n --no-exclude (default)\n" " Use no excludes. Clears any excludes specified before this.") + print("\n [--exclude-regex ] ...\n" + " Exclude files from analysis by matching a regex pattern against their\n" + " entire path as expanded from the Vermin command line. Patterns are matched\n" + " using re.search(), so '^' or '$' anchors should be applied as needed.\n\n" + " Examples:\n" + " Exclude any '.pyi' file: --exclude-regex '\\.pyi$'\n\n" + " (Note: the below examples require --no-make-paths-absolute, or prefixing\n" + " the patterns with the regex-escaped path to the current directory.)\n\n" + " Exclude the directory 'a/b/': --exclude-regex '^a/b$'\n" + " (This will also exclude any files under 'a/b'.)\n\n" + " Exclude '.pyi' files under 'a/b/': --exclude-regex '^a/b/.+\\.pyi$'\n" + " Exclude '.pyi' files in exactly 'a/b/': --exclude-regex '^a/b/[^/]+\\.pyi$'") + print("\n --no-exclude-regex (default)\n" + " Use no exclude patterns. 
Clears any exclude patterns specified before this.") + print("\n --make-paths-absolute (default)\n" + " Convert any relative paths from the command line into absolute paths.\n" + " This affects the path printed to the terminal if a file fails a check,\n" + " and requires --exclude-regex patterns to match absolute paths.") + print("\n --no-make-paths-absolute\n" + " Do not convert relative paths from the command line into absolute paths.") print("\n [--backport ] ...\n" " Some features are sometimes backported into packages, in repositories such\n" " as PyPi, that are widely used but aren't in the standard language. If such a\n" @@ -304,6 +325,21 @@ def parse(self, config, detect_folder=None): elif arg == "--no-exclude": config.clear_exclusions() path_pos += 1 + elif arg == "--exclude-regex": + if (i + 1) >= len(self.__args): + print("Exclusion requires a regex! Example: --exclude-regex '\\.pyi$'") + return {"code": 1} + config.add_exclusion_regex(self.__args[i + 1]) + path_pos += 2 + elif arg == "--no-exclude-regex": + config.clear_exclusion_regex() + path_pos += 1 + elif arg == "--make-paths-absolute": + config.set_make_paths_absolute(True) + path_pos += 1 + elif arg == "--no-make-paths-absolute": + config.set_make_paths_absolute(False) + path_pos += 1 elif arg == "--backport": if (i + 1) >= len(self.__args): print("Requires a backport name! 
Example: --backport typing") diff --git a/vermin/config.py b/vermin/config.py index 17fa4ecc..db1531c3 100644 --- a/vermin/config.py +++ b/vermin/config.py @@ -1,6 +1,7 @@ import io import sys import os +import re # novm try: @@ -29,6 +30,8 @@ def reset(self): self.__show_tips = True self.__analyze_hidden = False self.__exclusions = set() + self.__exclusion_regex = set() + self.__make_paths_absolute = True self.__backports = set() self.__features = set() self.__targets = [] @@ -48,6 +51,8 @@ def override_from(self, other_config): self.__show_tips = other_config.show_tips() self.__analyze_hidden = other_config.analyze_hidden() self.__exclusions = set(other_config.exclusions()) + self.__exclusion_regex = set(other_config.exclusion_regex()) + self.__make_paths_absolute = other_config.make_paths_absolute() self.__backports = other_config.backports() self.__features = other_config.features() self.__targets = other_config.targets() @@ -68,6 +73,8 @@ def __repr__(self): show_tips = {} analyze_hidden = {} exclusions = {} + exclusion_regex = {} + make_paths_absolute = {} backports = {} features = {} targets = {} @@ -78,7 +85,8 @@ def __repr__(self): format = {} )""".format(self.__class__.__name__, self.quiet(), self.verbose(), self.print_visits(), self.processes(), self.ignore_incomp(), self.pessimistic(), self.show_tips(), - self.analyze_hidden(), self.exclusions(), list(self.backports()), list(self.features()), + self.analyze_hidden(), self.exclusions(), self.exclusion_regex(), + self.make_paths_absolute(), list(self.backports()), list(self.features()), self.targets(), self.eval_annotations(), self.only_show_violations(), self.parse_comments(), self.scan_symlink_folders(), self.format().name()) @@ -119,6 +127,8 @@ def encode_list(iterable): "show_tips": str(config.show_tips()), "analyze_hidden": str(config.analyze_hidden()), "exclusions": encode_list(config.exclusions()), + "exclusion_regex": encode_list(config.exclusion_regex()), + "make_paths_absolute": 
str(config.make_paths_absolute()), "backports": encode_list(config.backports()), "features": encode_list(config.features()), "targets": encode_list(config.targets()), @@ -184,6 +194,11 @@ def getstringlist(option): for exclusion in getstringlist("exclusions"): config.add_exclusion(exclusion) + for exclusion_regex in getstringlist("exclusion_regex"): + config.add_exclusion_regex(exclusion_regex) + + config.set_make_paths_absolute(getbool("make_paths_absolute")) + for backport in getstringlist("backports"): if not config.add_backport(backport): print("Unknown backport: {}".format(backport)) @@ -280,6 +295,9 @@ def set_ignore_incomp(self, ignore): def add_exclusion(self, name): self.__exclusions.add(name) + def add_exclusion_regex(self, pattern): + self.__exclusion_regex.add(re.compile(pattern)) + def add_exclusion_file(self, filename): try: with open_wrapper(filename, mode="r", encoding="utf-8") as f: @@ -291,11 +309,19 @@ def add_exclusion_file(self, filename): def clear_exclusions(self): self.__exclusions.clear() + def clear_exclusion_regex(self): + self.__exclusion_regex.clear() + def exclusions(self): res = list(self.__exclusions) res.sort() return res + def exclusion_regex(self): + res = [p.pattern for p in self.__exclusion_regex] + res.sort() + return res + def is_excluded(self, name): return name in self.__exclusions @@ -308,6 +334,15 @@ def is_excluded_codecs_error_handler(self, name): def is_excluded_codecs_encoding(self, name): return "ce={}".format(name) in self.__exclusions + def is_excluded_by_regex(self, path): + return any(regex.search(path) for regex in self.__exclusion_regex) + + def make_paths_absolute(self): + return self.__make_paths_absolute + + def set_make_paths_absolute(self, enable): + self.__make_paths_absolute = enable + def add_backport(self, name): if not Backports.is_backport(name): return False diff --git a/vermin/detection.py b/vermin/detection.py index d740cc88..a4edfad2 100644 --- a/vermin/detection.py +++ b/vermin/detection.py @@ 
-481,11 +481,12 @@ def detect_paths_incremental(args): further_args = [] for path in paths: try: - if any(ic in path for ic in ignore_chars): + if any(ic in path for ic in ignore_chars) or config.is_excluded_by_regex(path): continue # pragma: no cover if not hidden and path != "." and path[0] == ".": continue - path = abspath(path) + if config.make_paths_absolute(): + path = abspath(path) # Scan top-level folders, or input folders, in all cases. st = stat_path(path, True if depth == 0 else scan_symlink_folders) @@ -503,9 +504,9 @@ def detect_paths_incremental(args): return (accepted, further_args) # Some detected paths might not be python code since not all files use extensions like ".py" and -# ".pyw", for instance. But try directly specified files on CLI, on depth 0, in any case (non-pyhton +# ".pyw", for instance. But try directly specified files on CLI, on depth 0, in any case (non-python # files will be ignored when trying to parse them). Paths containing chars in `ignore_chars` will be -# ignored. +# ignored, as will any file excluded by regex from the config. def detect_paths(paths, hidden=False, processes=cpu_count(), ignore_chars=None, scan_symlink_folders=False, config=None): assert(config is not None) diff --git a/vermin/main.py b/vermin/main.py index 111f766d..b4a3a2dd 100644 --- a/vermin/main.py +++ b/vermin/main.py @@ -27,7 +27,8 @@ def main(): # Detect paths, remove duplicates, and sort for deterministic results. if not parsable: vprint("Detecting python files..", config) - paths = [abspath(p) for p in paths] + if config.make_paths_absolute(): + paths = [abspath(p) for p in paths] # Parsable format ignores paths with ":" in particular because it interferes with the format that # uses ":" a lot.