Skip to content

Commit

Permalink
feat: SP-1856 Refactor to pattern matching
Browse files Browse the repository at this point in the history
  • Loading branch information
matiasdaloia committed Nov 19, 2024
1 parent 3a73527 commit bb05f61
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 88 deletions.
34 changes: 13 additions & 21 deletions docs/source/_static/scanoss-settings-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,30 +29,22 @@
"type": "object",
"description": "Set of rules to skip files from the scan",
"properties": {
"folders": {
"patterns": {
"type": "array",
"description": "List of folders to skip from the scan. These should be relative to the scan root",
"description": "List of glob patterns to skip files",
"items": {
"type": "string",
"examples": ["/path/to/folder", "/path/to/another/folder"]
},
"uniqueItems": true
},
"files": {
"type": "array",
"description": "List of files to skip from the scan. These can be either relative file paths or just file names",
"items": {
"type": "string",
"examples": ["/path/to/include.h", "include.h"]
},
"uniqueItems": true
},
"extensions": {
"type": "array",
"description": "List of file extensions to skip from the scan",
"items": {
"type": "string",
"examples": [".h", ".c", ".cpp"]
"examples": [
"path/to/folder",
"path/to/folder/**",
"path/to/folder/**/*",

"path/to/file.c",
"path/to/another/file.py",

"**/*.ts",
"**/*.json"
]
},
"uniqueItems": true
},
Expand Down
2 changes: 1 addition & 1 deletion src/scanoss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ def scan(parser, args):
if args.skip_settings_file:
print_stderr('Omit settings file is set. Skipping...')
else:
scan_settings = ScanossSettings(debug=args.debug, trace=args.trace, quiet=args.quiet, scan_root=args.scan_dir)
scan_settings = ScanossSettings(debug=args.debug, trace=args.trace, quiet=args.quiet)
try:
if args.identify:
scan_settings.load_json_file(args.identify).set_file_type('legacy').set_scan_type('identify')
Expand Down
80 changes: 54 additions & 26 deletions src/scanoss/scan_filter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
import os
from typing import List, Set, Tuple

import pathspec
from pathspec import PathSpec

from scanoss.scanossbase import ScanossBase

Expand Down Expand Up @@ -209,7 +210,6 @@ def __init__(
debug: bool = False,
trace: bool = False,
quiet: bool = False,
scan_root: Path = None,
settings: dict = None,
):
"""
Expand All @@ -220,37 +220,65 @@ def __init__(
"""
super().__init__(debug, trace, quiet)

self.scan_root = scan_root

skip = settings.get('skip', {})
skip_patterns = []

skip_patterns.extend(f'**/*{ext}' for ext in DEFAULT_SKIPPED_EXT)
skip_patterns.extend(DEFAULT_SKIPPED_FILES)
skip_patterns.extend(f'**/{dir}/**' for dir in DEFAULT_SKIPPED_DIRS)
skip_patterns.extend(f'**/*{ext}/**' for ext in DEFAULT_SKIPPED_DIR_EXT)

skip_patterns_from_settings = []

# Add scan root to patterns, to support relative paths
for pattern in skip.get('patterns', []):
pattern_path = Path(scan_root, pattern)
skip_patterns_from_settings.append(str(pattern_path))
skip_patterns.extend(f'{dir}/' for dir in DEFAULT_SKIPPED_DIRS)
skip_patterns.extend(f'*{ext}' for ext in DEFAULT_SKIPPED_EXT)
skip_patterns.extend(f'*{ext}/' for ext in DEFAULT_SKIPPED_DIR_EXT)
skip_patterns.extend(skip.get('patterns', []))

self.skip_spec = pathspec.PathSpec.from_lines('gitwildmatch', skip_patterns)
self.skip_patterns = skip_patterns
self.min_size = skip.get('sizes', {}).get('min', 0)
self.max_size = skip.get('sizes', {}).get('max', float('inf'))

def should_process(self, path: Path) -> bool:
if self.skip_spec.match_file(path):
self.print_debug(f'Skipping {path} {"folder" if path.is_dir() else "file"} due to pattern match')
return False
def get_filtered_files(self, root: str) -> List[str]:
    """Return the files under ``root`` that pass the skip rules.

    This is the public entry point; the directory traversal itself is
    delegated to ``_walk_with_ignore``.

    Args:
        root (str): Root directory to scan
    Returns:
        list[str]: List of files to scan
    """
    return self._walk_with_ignore(root)

def _walk_with_ignore(self, scan_root: str) -> List[str]:
    """Walk ``scan_root`` and collect the files that survive the skip rules.

    Directories named by a directory-style skip pattern (as returned by
    ``_create_skip_path_matchers``) are pruned so ``os.walk`` never descends
    into them. Remaining files are dropped when they match the skip path
    spec or when their size is outside ``[self.min_size, self.max_size]``.

    Args:
        scan_root (str): Directory to walk; converted to an absolute path.
    Returns:
        List[str]: Paths of the files to scan, relative to ``scan_root``.
    """
    files = []
    root = os.path.abspath(scan_root)

    path_spec, dir_patterns = self._create_skip_path_matchers()

    for dirpath, dirnames, filenames in os.walk(root):
        rel_path = os.path.relpath(dirpath, root)
        if rel_path == os.curdir:
            # The scan root itself: use an empty prefix so returned paths are
            # 'file.c' rather than './file.c'.
            rel_path = ''

        if rel_path:
            # Compare whole path / whole directory name instead of a raw string
            # prefix, so a 'doc' rule does not also prune 'docker' or 'documents'.
            posix_rel_path = rel_path.replace(os.sep, '/')
            if posix_rel_path in dir_patterns or os.path.basename(rel_path) in dir_patterns:
                self.print_debug(f'Skipping directory: {rel_path}')
                # Clearing in place stops os.walk from descending any further
                dirnames.clear()
                continue

        for filename in filenames:
            file_rel_path = os.path.join(rel_path, filename)

            # Pattern check first: it needs no filesystem access
            if path_spec.match_file(file_rel_path):
                self.print_debug(f'Skipping file: {file_rel_path}')
                continue

            try:
                file_size = os.path.getsize(os.path.join(dirpath, filename))
            except OSError as e:
                # Broken symlink, permission problem or file removed mid-walk:
                # skip this entry rather than aborting the whole scan.
                self.print_debug(f'Skipping file: {file_rel_path} (unable to read size: {e})')
                continue

            if file_size < self.min_size or file_size > self.max_size:
                self.print_debug(f'Skipping file: {file_rel_path} (size: {file_size})')
                continue

            files.append(file_rel_path)

    return files

def _create_skip_path_matchers(self) -> Tuple[PathSpec, Set[str]]:
dir_patterns = {p.rstrip('/') for p in self.skip_patterns if p.endswith('/')}

if path.is_file():
filesize = path.stat().st_size
if not (self.min_size <= filesize <= self.max_size):
self.print_debug(f'Skipping {path} due to size')
return False
path_spec = PathSpec.from_lines('gitwildmatch', self.skip_patterns)

return True
return path_spec, dir_patterns
54 changes: 23 additions & 31 deletions src/scanoss/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,39 +414,31 @@ def scan_folder(self, scan_dir: str) -> bool:
file_count = 0 # count all files fingerprinted
wfp_file_count = 0 # count number of files in each queue post
scan_started = False

for root, dirs, files in os.walk(scan_dir):
self.print_trace(f'U Root: {root}, Dirs: {dirs}, Files {files}')

to_scan_files = self.scan_settings.get_filtered_files(scan_dir)

for to_scan_file in to_scan_files:
if self.threaded_scan and self.threaded_scan.stop_scanning():
self.print_stderr('Warning: Aborting fingerprinting as the scanning service is not available.')
break

dirs[:] = [d for d in dirs if self.scan_settings.should_process(Path(root, d))]

for file in files:
path = Path(root, file)
if not self.scan_settings.should_process(path):
self.print_debug(f'Skipping filtered file: {path}')
continue

self.print_trace(f'Fingerprinting {path}...')
if spinner:
spinner.next()
wfp = self.winnowing.wfp_for_file(str(path), self.__strip_dir(scan_dir, scan_dir_len, str(path)))
if wfp is None or wfp == '':
self.print_debug(f'No WFP returned for {path}. Skipping.')
continue
if save_wfps_for_print:
wfp_list.append(wfp)
file_count += 1
if self.threaded_scan:
wfp_size = len(wfp.encode('utf-8'))
# If the WFP is bigger than the max post size and we already have something stored in the scan block, add it to the queue
if scan_block != '' and (wfp_size + scan_size) >= self.max_post_size:
self.threaded_scan.queue_add(scan_block)
queue_size += 1
scan_block = ''
wfp_file_count = 0
if spinner:
spinner.next()
abs_path = Path(scan_dir, to_scan_file).resolve()
wfp = self.winnowing.wfp_for_file(str(abs_path), to_scan_file)
if wfp is None or wfp == '':
self.print_debug(f'No WFP returned for {to_scan_file}. Skipping.')
continue
if save_wfps_for_print:
wfp_list.append(wfp)
file_count += 1
if self.threaded_scan:
wfp_size = len(wfp.encode('utf-8'))
# If the WFP is bigger than the max post size and we already have something stored in the scan block, add it to the queue
if scan_block != '' and (wfp_size + scan_size) >= self.max_post_size:
self.threaded_scan.queue_add(scan_block)
queue_size += 1
scan_block = ''
wfp_file_count = 0
scan_block += wfp
scan_size = len(scan_block.encode('utf-8'))
wfp_file_count += 1
Expand All @@ -463,7 +455,7 @@ def scan_folder(self, scan_dir: str) -> bool:
'Warning: Some errors encounted while scanning. Results might be incomplete.'
)
success = False

# End for loop
if self.threaded_scan and scan_block != '':
self.threaded_scan.queue_add(scan_block) # Make sure all files have been submitted
Expand Down
11 changes: 2 additions & 9 deletions src/scanoss/scanoss_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def __init__(
debug: bool = False,
trace: bool = False,
quiet: bool = False,
scan_root: Path = None,
filepath: str = None,
):
"""
Expand All @@ -63,7 +62,6 @@ def __init__(
"""

super().__init__(debug, trace, quiet)
self.scan_root = scan_root
self.data = {}
self.settings_file_type = None
self.scan_type = None
Expand Down Expand Up @@ -98,7 +96,6 @@ def load_json_file(self, filepath: str) -> 'ScanossSettings':
quiet=self.quiet,
trace=self.trace,
settings=self.data.get('settings', {}),
scan_root=self.scan_root,
)
self.print_debug(f'Loading scan settings from: {filepath}')
return self
Expand Down Expand Up @@ -254,9 +251,5 @@ def is_legacy(self):
"""Check if the settings file is legacy"""
return self.settings_file_type == 'legacy'

def should_process(self, path: Path) -> bool:
"""Check if file should be processed based on settings"""
if not self.filter:
return True

return self.filter.should_process(path)
def get_filtered_files(self, scan_root: str) -> List[str]:
    """Return the files under ``scan_root`` that should be scanned.

    Delegates to the configured filter when one is set. When no filter is
    set, every file under ``scan_root`` is returned unfiltered instead of
    failing on a missing filter.

    Args:
        scan_root (str): Root directory to scan
    Returns:
        List[str]: File paths relative to ``scan_root``
    """
    import os  # local import: only the no-filter fallback needs it

    if self.filter:
        return self.filter.get_filtered_files(scan_root)

    # No filter configured: nothing is excluded
    root = os.path.abspath(scan_root)
    return [
        os.path.relpath(os.path.join(dirpath, filename), root)
        for dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
    ]

0 comments on commit bb05f61

Please sign in to comment.