diff --git a/CHANGELOG.md b/CHANGELOG.md index 9113f6dac..76eb630ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## master (unreleased) ### New Features - +- add lint for detecting duplicate features in capa-rules #2250 @v1bh475u - add span-of-calls scope to match features against a across a sliding window of API calls within a thread @williballenthin #2532 - add lint to catch rules that depend on other rules with impossible scope @williballenthin #2124 diff --git a/scripts/lint.py b/scripts/lint.py index 4765a3919..4b480f22f 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -33,6 +33,7 @@ import argparse import itertools import posixpath +from typing import Any, Dict, List from pathlib import Path from dataclasses import field, dataclass @@ -597,6 +598,88 @@ def rec(statement): return self.violation +class DuplicateFeatureUnderStatement(Lint): + name = "rule contains a duplicate features" + recommendation = "remove the duplicate features" + recommendation_template = '\n\tduplicate line: "{:s}"\t: line numbers: {:s}' + violation = False + + def check_rule(self, ctx: Context, rule: Rule) -> bool: + self.violation = False + self.recommendation = "" + STATEMENTS = frozenset( + { + "or", + "and", + "not", + "optional", + "some", + "basic block", + "function", + "instruction", + "call", + " or more", + } + ) + # rule.statement discards the duplicate features by default so + # need to use the rule definition to check for duplicates + data = rule._get_ruamel_yaml_parser().load(rule.definition) + + def get_line_number(line: Dict[str, Any]) -> int: + lc = getattr(line, "lc", None) + if lc and hasattr(lc, "line"): + return lc.line + 1 + return 0 + + def is_statement(key: str) -> bool: + # to generalize the check for 'n or more' statements + return any(statement in key for statement in STATEMENTS) + + def get_feature_key(feature_dict: Dict[str, Any]) -> str: + # need this for generating key for multi-lined feature + # for example, - string: /dbghelp\.dll/i + # description: WindBG + parts = [] + for key, value in list(feature_dict.items()): + parts.append(f"{key}: {value}") + return "- " + ", ".join(parts) + + def find_duplicates(features: List[Any]) -> None: + if not isinstance(features, list): + return + + seen_features: Dict[str, List[int]] = {} + for item in features: + if not isinstance(item, dict): + continue + + if any(is_statement(key) for key in item.keys()): + for key, value in item.items(): + if is_statement(key): + # recursively check nested features + find_duplicates(value) + continue + + feature_key = get_feature_key(item) + line_num = get_line_number(item) + if feature_key in seen_features: + self.violation = True + seen_features[feature_key].append(line_num) + else: + seen_features[feature_key] = [line_num] + for feature_key, line_numbers in seen_features.items(): + if len(line_numbers) > 1: + sorted_lines = sorted(line_numbers) + self.recommendation += self.recommendation_template.format( + feature_key, ", ".join(str(line) for line in sorted_lines) + ) + + features = data["rule"].get("features", []) + find_duplicates(features) + + return self.violation + + class UnusualMetaField(Lint): name = "unusual meta field" recommendation = "Remove the meta field" @@ -916,6 +999,7 @@ def rec(statement): OrStatementWithAlwaysTrueChild(), NotNotUnderAnd(), OptionalNotUnderAnd(), + DuplicateFeatureUnderStatement(), RuleDependencyScopeMismatch(), )