From 4a10a9d9773cb3766a76b71521bd8cf93fc35e7e Mon Sep 17 00:00:00 2001
From: Jindrich Luza
Date: Mon, 26 Aug 2024 11:01:53 +0200
Subject: [PATCH] Fixed deduplication of SPDX packages

Signed-off-by: Jindrich Luza
---
 cachi2/core/models/sbom.py       |  33 ++++++++-
 cachi2/core/models/validators.py |  38 +---------
 tests/unit/models/test_sbom.py   | 119 ++++++++++++++++++++++++-------
 3 files changed, 125 insertions(+), 65 deletions(-)

diff --git a/cachi2/core/models/sbom.py b/cachi2/core/models/sbom.py
index 92e752877..09a4e6071 100644
--- a/cachi2/core/models/sbom.py
+++ b/cachi2/core/models/sbom.py
@@ -1,8 +1,8 @@
-from typing import Any, Literal, Optional
+from typing import Any, Iterable, Literal, Optional
 
 import pydantic
 
-from cachi2.core.models.validators import unique_sorted, unique_sorted_multikey
+from cachi2.core.models.validators import unique_sorted
 
 PropertyName = Literal[
     "cachi2:found_by",
@@ -122,6 +122,9 @@ class SPDXPackageExternalRef(pydantic.BaseModel):
     referenceLocator: str
     referenceType: str
 
+    def __hash__(self) -> int:
+        return hash((self.referenceLocator, self.referenceType, self.referenceCategory))
+
 
 
 class SPDXPackage(pydantic.BaseModel):
     """SPDX Package.
@@ -177,6 +180,30 @@ class SPDXCreationInfo(pydantic.BaseModel):
     creators: list[str] = []
 
 
+def deduplicate_spdx_packages(items: Iterable[SPDXPackage]) -> list[SPDXPackage]:
+    """Deduplicate SPDX packages and merge external references.
+
+    If a package with the same name and version appears multiple times in the list,
+    merge the external references of all those packages into a single package.
+    """
+    unique_items = {}
+    for item in items:
+        key = (item.name, item.version)
+        if key not in unique_items:
+            unique_items[key] = SPDXPackage(name=item.name, version=item.version)
+            unique_items[key].externalRefs = item.externalRefs[:]
+        else:
+            unique_items[key].externalRefs.extend(item.externalRefs)
+
+    for item in unique_items.values():
+        item.externalRefs = sorted(
+            set(item.externalRefs),
+            key=lambda ref: (ref.referenceLocator, ref.referenceType, ref.referenceCategory),
+        )
+
+    return sorted(unique_items.values(), key=lambda item: (item.name, item.version))
+
+
 class SPDXSbom(pydantic.BaseModel):
     """Software bill of materials in the SPDX format.
 
@@ -195,4 +222,4 @@ class SPDXSbom(pydantic.BaseModel):
     @pydantic.field_validator("packages")
     def _unique_packages(cls, packages: list[SPDXPackage]) -> list[SPDXPackage]:
         """Sort and de-duplicate components."""
-        return unique_sorted_multikey(packages, by=lambda package: package.key())
+        return deduplicate_spdx_packages(packages)
diff --git a/cachi2/core/models/validators.py b/cachi2/core/models/validators.py
index baedfe74a..e63cb32c5 100644
--- a/cachi2/core/models/validators.py
+++ b/cachi2/core/models/validators.py
@@ -1,6 +1,6 @@
 import os
 from pathlib import Path
-from typing import Any, Callable, Iterable, List, TypeVar
+from typing import Any, Callable, Iterable, TypeVar
 
 T = TypeVar("T")
 
@@ -24,31 +24,6 @@ def unique(items: Iterable[T], by: Callable[[T], Any], dedupe: bool = True) -> l
     return list(by_key.values())
 
 
-def unique_multikey(items: Iterable[T], by: Callable[[T], List[Any]]) -> list[T]:
-    """Make sure input items are unique by the specified key.
-
-    The 'by' function must return a hashable key (the uniqueness key).
-
-    If item A and item B have the same key, then
-        if dedupe is true (the default) and A == B, B is discarded
-        if dedupe is false or A != B, raise an error
-    """
-    by_key: dict[tuple[str, ...], Any] = {}
-    for item in items:
-        multi_key = by(item)
-        found = False
-        for mkey in by_key:
-            for key in mkey:
-                if key in multi_key:
-                    found = True
-                    break
-            if found:
-                break
-        else:
-            by_key[tuple(multi_key)] = item
-    return list(by_key.values())
-
-
 def unique_sorted(items: Iterable[T], by: Callable[[T], Any], dedupe: bool = True) -> list[T]:
     """Make sure input items are unique and sort them.
 
@@ -59,17 +34,6 @@ def unique_sorted(items: Iterable[T], by: Callable[[T], Any], dedupe: bool = Tru
     return unique_items
 
 
-def unique_sorted_multikey(items: Iterable[T], by: Callable[[T], Any]) -> list[T]:
-    """Make sure input items are unique and sort them.
-
-    This version of unique_sorted works with items where keys is composed of list of multiple values
-    where every value is considered as key itself. One item can then have more single keys.
-    """
-    unique_items = unique_multikey(items, by)
-    unique_items.sort(key=by)
-    return unique_items
-
-
 def check_sane_relpath(path: Path) -> Path:
     """Check that the path is relative and looks sane."""
     if path.is_absolute():
diff --git a/tests/unit/models/test_sbom.py b/tests/unit/models/test_sbom.py
index a9cd5047c..80fa93166 100644
--- a/tests/unit/models/test_sbom.py
+++ b/tests/unit/models/test_sbom.py
@@ -9,6 +9,7 @@
     SPDXPackage,
     SPDXPackageExternalRef,
     SPDXSbom,
+    deduplicate_spdx_packages,
 )
 
 
@@ -311,22 +312,6 @@ def test_sort_and_dedupe_packages(self) -> None:
                         }
                     ],
                 },
-                {
-                    "name": "github.com/org/A",
-                    "version": "v1.1.0",
-                    "externalRefs": [
-                        {
-                            "referenceCategory": "PACKAGE-MANAGER",
-                            "referenceLocator": "pkg:golang/github.com/org/A@v1.1.0?repository_id=R1",
-                            "referenceType": "purl",
-                        },
-                        {
-                            "referenceCategory": "PACKAGE-MANAGER",
-                            "referenceLocator": "pkg:golang/github.com/org/A@v1.1.0?repository_id=R2",
-                            "referenceType": "purl",
-                        },
-                    ],
-                },
                 {
                     "name": "github.com/org/A",
                     "version": "v1.0.0",
@@ -385,7 +370,7 @@ def test_sort_and_dedupe_packages(self) -> None:
             ],
         )
         print(sbom.packages)
-        assert len(sbom.packages) == 6
+        assert len(sbom.packages) == 5
         assert sbom.packages == [
             SPDXPackage(
                 name="bytes",
@@ -426,18 +411,12 @@ def test_sort_and_dedupe_packages(self) -> None:
                         referenceCategory="PACKAGE-MANAGER",
                         referenceLocator="pkg:golang/github.com/org/A@v1.1.0?repository_id=R1",
                         referenceType="purl",
-                    )
-                ],
-            ),
-            SPDXPackage(
-                name="github.com/org/A",
-                version="v1.1.0",
-                externalRefs=[
+                    ),
                     SPDXPackageExternalRef(
                         referenceCategory="PACKAGE-MANAGER",
                         referenceLocator="pkg:golang/github.com/org/A@v1.1.0?repository_id=R2",
                         referenceType="purl",
-                    )
+                    ),
                 ],
             ),
             SPDXPackage(
@@ -452,3 +431,93 @@ def test_sort_and_dedupe_packages(self) -> None:
                 ],
             ),
         ]
+
+
+def test_deduplicate_spdx_packages() -> None:
+    packages = [
+        SPDXPackage(
+            name="github.com/org/A",
+            version="v1.0.0",
+            externalRefs=[
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/A@v1.0.0?repository_id=R1",
+                    referenceType="purl",
+                )
+            ],
+        ),
+        SPDXPackage(
+            name="github.com/org/A",
+            version="v1.0.0",
+            externalRefs=[
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/A@v1.0.0?repository_id=R1",
+                    referenceType="purl",
+                ),
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/A@v1.0.0?repository_id=R2",
+                    referenceType="purl",
+                ),
+            ],
+        ),
+        SPDXPackage(
+            name="github.com/org/B",
+            version="v1.0.0",
+            externalRefs=[
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/B@v1.0.0",
+                    referenceType="purl",
+                )
+            ],
+        ),
+        SPDXPackage(
+            name="github.com/org/B",
+            version="v1.0.0",
+            externalRefs=[
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/B@v1.0.0?repository_id=R1",
+                    referenceType="purl",
+                )
+            ],
+        ),
+    ]
+    deduped_packages = deduplicate_spdx_packages(packages)
+    assert len(deduped_packages) == 2
+    assert deduped_packages == [
+        SPDXPackage(
+            name="github.com/org/A",
+            version="v1.0.0",
+            externalRefs=[
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/A@v1.0.0?repository_id=R1",
+                    referenceType="purl",
+                ),
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/A@v1.0.0?repository_id=R2",
+                    referenceType="purl",
+                ),
+            ],
+        ),
+        SPDXPackage(
+            name="github.com/org/B",
+            version="v1.0.0",
+            externalRefs=[
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/B@v1.0.0",
+                    referenceType="purl",
+                ),
+                SPDXPackageExternalRef(
+                    referenceCategory="PACKAGE-MANAGER",
+                    referenceLocator="pkg:golang/github.com/org/B@v1.0.0?repository_id=R1",
+                    referenceType="purl",
+                ),
+            ],
+        ),
+    ]