Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a faster package scan with --package-only #3689

Merged
merged 8 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ v33.0.0 (next next, roadmap)
v32.1.0 (next, roadmap)
----------------------------

New CLI options:

- A new CLI option ``--package-only`` has been added which performs
a faster package scan by skipping the package assembly step and
also skipping license/copyright detection on package metadata.

Major API/other changes:

- Output Format Version updated to 3.1.0 (minor version bump)
Expand Down
4 changes: 4 additions & 0 deletions docs/source/rst_snippets/basic_options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ documenting a program's options. For example:
--system-package Scan ``<input>`` for installed system package
databases.

--package-only Scan ``<input>`` for system and application
package metadata only, without license/
copyright detection and package assembly.

-e, --email Scan ``<input>`` for emails.

Sub-Options:
Expand Down
5 changes: 3 additions & 2 deletions src/packagedcode/about.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class AboutFileHandler(models.DatafileHandler):
documentation_url = 'https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html'

@classmethod
def parse(cls, location):
def parse(cls, location, package_only=False):
"""
Yield one or more Package manifest objects given a file ``location`` pointing to a
package archive, manifest or similar.
Expand Down Expand Up @@ -90,7 +90,7 @@ def parse(cls, location):
file_references.append(models.FileReference(path=about_resource))

# FIXME: we should put the unprocessed attributes in extra data
yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=package_type,
namespace=package_ns,
Expand All @@ -103,6 +103,7 @@ def parse(cls, location):
download_url=download_url,
file_references=file_references,
)
yield models.PackageData.from_data(package_data, package_only)

@classmethod
def assemble(cls, package_data, resource, codebase, package_adder):
Expand Down
38 changes: 28 additions & 10 deletions src/packagedcode/alpine.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,12 @@ class AlpineInstalledDatabaseHandler(models.DatafileHandler):
description = 'Alpine Linux installed package database'

@classmethod
def parse(cls, location):
def parse(cls, location, package_only=False):
yield from parse_alpine_installed_db(
location=location,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
package_only=package_only,
)

@classmethod
Expand Down Expand Up @@ -134,9 +135,14 @@ class AlpineApkbuildHandler(models.DatafileHandler):
documentation_url = 'https://wiki.alpinelinux.org/wiki/APKBUILD_Reference'

@classmethod
def parse(cls, location):
package_data = parse_apkbuild(location, strict=True)
cls.populate_license_fields(package_data)
def parse(cls, location, package_only=False):
package_data = parse_apkbuild(
location=location,
strict=True,
package_only=package_only
)
if not package_only:
cls.populate_license_fields(package_data)
if package_data:
yield package_data

Expand Down Expand Up @@ -165,7 +171,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder)
)


def parse_alpine_installed_db(location, datasource_id, package_type):
def parse_alpine_installed_db(location, datasource_id, package_type, package_only=False):
"""
Yield PackageData objects from an installed database file at `location`
or None. Typically found at '/lib/apk/db/installed' in an Alpine
Expand All @@ -179,6 +185,7 @@ def parse_alpine_installed_db(location, datasource_id, package_type):
package_fields=package_fields,
datasource_id=datasource_id,
package_type=package_type,
package_only=package_only,
)


Expand Down Expand Up @@ -241,7 +248,7 @@ def get_alpine_installed_db_fields(location):
])


def parse_apkbuild(location, strict=False):
def parse_apkbuild(location, strict=False, package_only=False):
"""
Return a PackageData object from an APKBUILD file at ``location`` or None.

Expand All @@ -256,6 +263,7 @@ def parse_apkbuild(location, strict=False):
datasource_id=AlpineApkbuildHandler.datasource_id,
package_type=AlpineApkbuildHandler.default_package_type,
strict=strict,
package_only=package_only,
)


Expand Down Expand Up @@ -732,7 +740,7 @@ def fix_apkbuild(text):
return text


def parse_apkbuild_text(text, datasource_id, package_type, strict=False):
def parse_apkbuild_text(text, datasource_id, package_type, strict=False, package_only=False):
"""
Return a PackageData object from an APKBUILD text context or None. Only
consider variables with a name listed in the ``names`` set.
Expand Down Expand Up @@ -761,7 +769,8 @@ def parse_apkbuild_text(text, datasource_id, package_type, strict=False):
package = build_package_data(
variables,
datasource_id=datasource_id,
package_type=package_type
package_type=package_type,
package_only=package_only,
)

if package and unresolved:
Expand Down Expand Up @@ -800,7 +809,7 @@ def parse_pkginfo(location):
raise NotImplementedError


def build_package_data(package_fields, datasource_id, package_type):
def build_package_data(package_fields, datasource_id, package_type, package_only=False):
"""
Return a PackageData object from a ``package_fields`` iterable of (name,
value) tuples.
Expand Down Expand Up @@ -850,7 +859,16 @@ def build_package_data(package_fields, datasource_id, package_type):

converted_fields.update(converted)

return models.PackageData.from_dict(converted_fields)
fields_not_required = ["current_file", "current_dir"]
for field in fields_not_required:
value = converted_fields.get(field)
if value:
converted_fields.pop(field)

return models.PackageData.from_data(
package_data=converted_fields,
package_only=package_only,
)

#####################################
# Note: all handlers MUST accept **kwargs as they also receive the current data
Expand Down
7 changes: 4 additions & 3 deletions src/packagedcode/bower.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class BowerJsonHandler(models.DatafileHandler):
documentation_url = 'https://bower.io'

@classmethod
def parse(cls, location):
def parse(cls, location, package_only=False):
with io.open(location, encoding='utf-8') as loc:
package_data = json.load(loc)

Expand Down Expand Up @@ -87,7 +87,7 @@ def parse(cls, location):
)
)

yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=cls.default_package_type,
name=name,
Expand All @@ -98,5 +98,6 @@ def parse(cls, location):
parties=parties,
homepage_url=homepage_url,
vcs_url=vcs_url,
dependencies=dependencies
dependencies=dependencies,
)
yield models.PackageData.from_data(package_data, package_only)
32 changes: 20 additions & 12 deletions src/packagedcode/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class AutotoolsConfigureHandler(models.NonAssemblableDatafileHandler):
documentation_url = 'https://www.gnu.org/software/automake/'

@classmethod
def parse(cls, location):
def parse(cls, location, package_only=False):
# we use the parent directory as a package name
name = fileutils.file_name(fileutils.parent_directory(location))
# we could use checksums as version in the future
Expand All @@ -67,12 +67,13 @@ def parse(cls, location):
# there are dependencies we could use
# dependencies = []

yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=cls.default_package_type,
name=name,
version=version,
)
yield models.PackageData.from_data(package_data, package_only)



Expand Down Expand Up @@ -104,6 +105,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
package = models.Package.from_package_data(
package_data=package_data,
datafile_path=resource.path,
package_only=True,
)

if TRACE:
Expand Down Expand Up @@ -135,8 +137,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
yield resource

@classmethod
def parse(cls, location):

def parse(cls, location, package_only=False):
# Thanks to Starlark being a Python dialect, we can use `ast` to parse it
with open(location, 'rb') as f:
tree = ast.parse(f.read())
Expand Down Expand Up @@ -188,23 +189,28 @@ def parse(cls, location):
if TRACE:
logger_debug(f"build: parse: license_files: {license_files}")

package_data = models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=cls.default_package_type,
name=name,
extracted_license_statement=license_files,
)
# `package_only` is True as we do the license detection
# on assembly
yield models.PackageData.from_data(
package_data=package_data,
package_only=True,
)

package_data.extracted_license_statement = license_files
yield package_data

else:
# If we don't find anything in the pkgdata file, we yield a Package
# with the parent directory as the name
yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=cls.default_package_type,
name=fileutils.file_name(fileutils.parent_directory(location))
)
yield models.PackageData.from_data(package_data, package_only)

@classmethod
def assign_package_to_resources(cls, package, resource, codebase, package_adder, skip_name=None):
Expand Down Expand Up @@ -326,7 +332,7 @@ class BuckMetadataBzlHandler(BaseStarlarkManifestHandler):
documentation_url = 'https://buck.build/'

@classmethod
def parse(cls, location):
def parse(cls, location, package_only=True):

with open(location, 'rb') as f:
tree = ast.parse(f.read())
Expand Down Expand Up @@ -378,7 +384,7 @@ def parse(cls, location):
):
# TODO: Create function that determines package type from download URL,
# then create a package of that package type from the metadata info
yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=metadata_fields.get('upstream_type', cls.default_package_type),
name=metadata_fields.get('name'),
Expand All @@ -388,6 +394,7 @@ def parse(cls, location):
homepage_url=metadata_fields.get('upstream_address', ''),
# TODO: Store 'upstream_hash` somewhere
)
yield models.PackageData.from_data(package_data, package_only=True)

if (
'package_type'
Expand All @@ -401,7 +408,7 @@ def parse(cls, location):
and 'vcs_commit_hash'
in metadata_fields
):
yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=metadata_fields.get('package_type', cls.default_package_type),
name=metadata_fields.get('name'),
Expand All @@ -414,6 +421,7 @@ def parse(cls, location):
sha1=metadata_fields.get('download_archive_sha1', ''),
extra_data=dict(vcs_commit_hash=metadata_fields.get('vcs_commit_hash', ''))
)
yield models.PackageData.from_data(package_data, package_only=True)

@classmethod
def assign_package_to_resources(cls, package, resource, codebase, package_adder):
Expand Down
9 changes: 5 additions & 4 deletions src/packagedcode/build_gradle.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ class BuildGradleHandler(models.DatafileHandler):
description = 'Gradle build script'

@classmethod
def parse(cls, location):
def parse(cls, location, package_only=False):
dependencies = get_dependencies(location)
return build_package(cls, dependencies)
return build_package(cls, dependencies, package_only)

# TODO: handle complex cases of nested builds with many packages
@classmethod
Expand Down Expand Up @@ -328,7 +328,7 @@ def get_dependencies(build_gradle_location):
return list(get_dependencies_from_parse_tree(parse_tree))


def build_package(cls, dependencies):
def build_package(cls, dependencies, package_only=False):
"""
Yield PackageData from a ``dependencies`` list of mappings.
"""
Expand Down Expand Up @@ -364,10 +364,11 @@ def build_package(cls, dependencies):
)
)

yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=cls.default_package_type,
primary_language=BuildGradleHandler.default_primary_language,
dependencies=package_dependencies,
)
yield models.PackageData.from_data(package_data, package_only)

10 changes: 6 additions & 4 deletions src/packagedcode/cargo.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class CargoTomlHandler(CargoBaseHandler):
documentation_url = 'https://doc.rust-lang.org/cargo/reference/manifest.html'

@classmethod
def parse(cls, location):
def parse(cls, location, package_only=False):
package_data = toml.load(location, _dict=dict)
core_package_data = package_data.get('package', {})
workspace = package_data.get('workspace', {})
Expand Down Expand Up @@ -149,7 +149,7 @@ def parse(cls, location):
if workspace:
extra_data["workspace"] = workspace

yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=cls.default_package_type,
name=name,
Expand All @@ -166,6 +166,7 @@ def parse(cls, location):
dependencies=dependencies,
extra_data=extra_data,
)
yield models.PackageData.from_data(package_data, package_only)


CARGO_ATTRIBUTE_MAPPING = {
Expand Down Expand Up @@ -200,7 +201,7 @@ class CargoLockHandler(CargoBaseHandler):
# ]

@classmethod
def parse(cls, location):
def parse(cls, location, package_only=False):
cargo_lock = toml.load(location, _dict=dict)
dependencies = []
package = cargo_lock.get('package', [])
Expand All @@ -221,12 +222,13 @@ def parse(cls, location):
)
)

yield models.PackageData(
package_data = dict(
datasource_id=cls.datasource_id,
type=cls.default_package_type,
primary_language=cls.default_primary_language,
dependencies=dependencies,
)
yield models.PackageData.from_data(package_data, package_only)


def dependency_mapper(dependencies, scope='dependencies'):
Expand Down
Loading
Loading