Skip to content

Commit

Permalink
Merge pull request #779 from gerrod3/python-metadata-fix-script
Browse files Browse the repository at this point in the history
Fix requires_python metadata + add repair metadata command
  • Loading branch information
ggainey authored Dec 20, 2024
2 parents af35d18 + ac33f4b commit 87137d8
Show file tree
Hide file tree
Showing 10 changed files with 261 additions and 35 deletions.
4 changes: 4 additions & 0 deletions CHANGES/773.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fixed `requires_python` field not being properly set on package upload.

Run the new `pulpcore-manager repair-python-metadata` command with repositories containing affected
packages to repair their metadata.
Empty file.
Empty file.
118 changes: 118 additions & 0 deletions pulp_python/app/management/commands/repair-python-metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import re
import os
from django.core.management import BaseCommand, CommandError
from gettext import gettext as _

from django.conf import settings

from pulpcore.plugin.util import extract_pk
from pulp_python.app.models import PythonPackageContent, PythonRepository
from pulp_python.app.utils import artifact_to_python_content_data


def repair_metadata(content):
"""
Repairs the metadata for the passed in content queryset.
:param content: The PythonPackageContent queryset.
Return: number of content units that were repaired
"""
# TODO: Add on_demand content repair?
os.chdir(settings.WORKING_DIRECTORY)
content = content.select_related("pulp_domain")
immediate_content = content.filter(contentartifact__artifact__isnull=False)
batch = []
set_of_update_fields = set()
total_repaired = 0
for package in immediate_content.prefetch_related('_artifacts').iterator(chunk_size=1000):
new_data = artifact_to_python_content_data(
package.filename, package._artifacts.get(), package.pulp_domain
)
changed = False
for field, value in new_data.items():
if getattr(package, field) != value:
setattr(package, field, value)
set_of_update_fields.add(field)
changed = True
if changed:
batch.append(package)
if len(batch) == 1000:
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
batch = []
set_of_update_fields.clear()

if len(batch) > 0:
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)

return total_repaired


def href_prn_list_handler(value):
"""Common list parsing for a string of hrefs/prns."""
r = re.compile(
rf"""
(?:{settings.API_ROOT}(?:[-_a-zA-Z0-9]+/)?api/v3/repositories/python/python/[-a-f0-9]+/)
|(?:prn:python\.pythonrepository:[-a-f0-9]+)
""",
re.VERBOSE
)
values = []
for v in value.split(","):
if v:
if match := r.match(v.strip()):
values.append(match.group(0))
else:
raise CommandError(f"Invalid href/prn: {v}")
return values


class Command(BaseCommand):
"""
Management command to repair metadata of PythonPackageContent.
"""

help = _("Repair the metadata of PythonPackageContent stored in PythonRepositories")

def add_arguments(self, parser):
"""Set up arguments."""
parser.add_argument(
"--repositories",
type=href_prn_list_handler,
required=False,
help=_(
"List of PythonRepository hrefs/prns whose content's metadata will be repaired. "
"Leave blank to include all repositories in all domains. Mutually exclusive "
"with domain."
),
)
parser.add_argument(
"--domain",
default=None,
required=False,
help=_(
"The pulp domain to gather the repositories from if specified. Mutually"
" exclusive with repositories."
),
)

def handle(self, *args, **options):
"""Implement the command."""
domain = options.get("domain")
repository_hrefs = options.get("repositories")
if domain and repository_hrefs:
raise CommandError(_("--domain and --repositories are mutually exclusive"))

repositories = PythonRepository.objects.all()
if repository_hrefs:
repos_ids = [extract_pk(r) for r in repository_hrefs]
repositories = repositories.filter(pk__in=repos_ids)
elif domain:
repositories = repositories.filter(pulp_domain__name=domain)

content_set = set()
for repository in repositories:
content_set.update(repository.latest_version().content.values_list("pk", flat=True))
content = PythonPackageContent.objects.filter(pk__in=content_set)
num_repaired = repair_metadata(content)
print(f"{len(content_set)} packages processed, {num_repaired} package metadata repaired.")
12 changes: 2 additions & 10 deletions pulp_python/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@

from pathlib import PurePath
from .utils import (
artifact_to_python_content_data,
canonicalize_name,
get_project_metadata_from_artifact,
parse_project_metadata,
python_content_to_json,
PYPI_LAST_SERIAL,
PYPI_SERIAL_CONSTANT,
Expand Down Expand Up @@ -189,14 +188,7 @@ class PythonPackageContent(Content):
def init_from_artifact_and_relative_path(artifact, relative_path):
"""Used when downloading package from pull-through cache."""
path = PurePath(relative_path)
metadata = get_project_metadata_from_artifact(path.name, artifact)
data = parse_project_metadata(vars(metadata))
data["packagetype"] = metadata.packagetype
data["version"] = metadata.version
data["filename"] = path.name
data["sha256"] = artifact.sha256
data["pulp_domain_id"] = artifact.pulp_domain_id
data["_pulp_domain_id"] = artifact.pulp_domain_id
data = artifact_to_python_content_data(path.name, artifact, domain=get_domain())
return PythonPackageContent(**data)

def __str__(self):
Expand Down
12 changes: 2 additions & 10 deletions pulp_python/app/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from pulp_python.app import models as python_models
from pulp_python.app import fields
from pulp_python.app.utils import get_project_metadata_from_artifact, parse_project_metadata
from pulp_python.app.utils import artifact_to_python_content_data


class PythonRepositorySerializer(core_serializers.RepositorySerializer):
Expand Down Expand Up @@ -217,7 +217,7 @@ def deferred_validate(self, data):

artifact = data["artifact"]
try:
metadata = get_project_metadata_from_artifact(filename, artifact)
_data = artifact_to_python_content_data(filename, artifact, domain=get_domain())
except ValueError:
raise serializers.ValidationError(_(
"Extension on {} is not a valid python extension "
Expand All @@ -231,14 +231,6 @@ def deferred_validate(self, data):
)}
)

_data = parse_project_metadata(vars(metadata))
_data['packagetype'] = metadata.packagetype
_data['version'] = metadata.version
_data['filename'] = filename
_data['sha256'] = artifact.sha256
data["pulp_domain_id"] = artifact.pulp_domain_id
data["_pulp_domain_id"] = artifact.pulp_domain_id

data.update(_data)

return data
Expand Down
12 changes: 2 additions & 10 deletions pulp_python/app/tasks/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pulpcore.plugin.util import get_domain

from pulp_python.app.models import PythonPackageContent, PythonRepository
from pulp_python.app.utils import get_project_metadata_from_artifact, parse_project_metadata
from pulp_python.app.utils import artifact_to_python_content_data


def upload(artifact_sha256, filename, repository_pk=None):
Expand Down Expand Up @@ -76,15 +76,7 @@ def create_content(artifact_sha256, filename, domain):
queryset of the new created content
"""
artifact = Artifact.objects.get(sha256=artifact_sha256, pulp_domain=domain)
metadata = get_project_metadata_from_artifact(filename, artifact)

data = parse_project_metadata(vars(metadata))
data['packagetype'] = metadata.packagetype
data['version'] = metadata.version
data['filename'] = filename
data['sha256'] = artifact.sha256
data['pulp_domain'] = domain
data['_pulp_domain'] = domain
data = artifact_to_python_content_data(filename, artifact, domain)

@transaction.atomic()
def create():
Expand Down
48 changes: 43 additions & 5 deletions pulp_python/app/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pkginfo
import re
import shutil
import tempfile
import json
Expand Down Expand Up @@ -51,6 +52,20 @@
".zip": "sdist",
}

DIST_REGEXES = {
# regex from https://github.com/pypa/pip/blob/18.0/src/pip/_internal/wheel.py#L569
".whl": re.compile(
r"""^(?P<name>.+?)-(?P<version>.*?)
((-(?P<build>\d[^-]*?))?-(?P<pyver>.+?)-(?P<abi>.+?)-(?P<plat>.+?)
\.whl|\.dist-info)$""",
re.VERBOSE
),
# regex based on https://setuptools.pypa.io/en/latest/deprecated/python_eggs.html#filename-embedded-metadata # noqa: E501
".egg": re.compile(r"^(?P<name>.+?)-(?P<version>.*?)(-(?P<pyver>.+?(-(?P<plat>.+?))?))?\.egg|\.egg-info$"), # noqa: E501
# regex based on https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_wininst.py#L292 # noqa: E501
".exe": re.compile(r"^(?P<name>.+?)-(?P<version>.*?)\.(?P<plat>.+?)(-(?P<pyver>.+?))?\.exe$"),
}

DIST_TYPES = {
"bdist_wheel": pkginfo.Wheel,
"bdist_wininst": pkginfo.Distribution,
Expand All @@ -72,6 +87,8 @@ def parse_project_metadata(project):
"""
package = {}
package['name'] = project.get('name') or ""
package['version'] = project.get('version') or ""
package['packagetype'] = project.get('packagetype') or ""
package['metadata_version'] = project.get('metadata_version') or ""
package['summary'] = project.get('summary') or ""
package['description'] = project.get('description') or ""
Expand All @@ -86,13 +103,15 @@ def parse_project_metadata(project):
package['project_url'] = project.get('project_url') or ""
package['platform'] = project.get('platform') or ""
package['supported_platform'] = project.get('supported_platform') or ""
package['requires_python'] = project.get('requires_python') or ""
package['requires_dist'] = json.dumps(project.get('requires_dist', []))
package['provides_dist'] = json.dumps(project.get('provides_dist', []))
package['obsoletes_dist'] = json.dumps(project.get('obsoletes_dist', []))
package['requires_external'] = json.dumps(project.get('requires_external', []))
package['classifiers'] = json.dumps(project.get('classifiers', []))
package['project_urls'] = json.dumps(project.get('project_urls', {}))
package['description_content_type'] = project.get('description_content_type') or ""
package['python_version'] = project.get('python_version') or ""

return package

Expand All @@ -113,17 +132,15 @@ def parse_metadata(project, version, distribution):
dictionary: of useful python metadata
"""
package = {}
package = parse_project_metadata(project)

package['filename'] = distribution.get('filename') or ""
package['packagetype'] = distribution.get('packagetype') or ""
package['version'] = version
package['url'] = distribution.get('url') or ""
package['sha256'] = distribution.get('digests', {}).get('sha256') or ""
package['python_version'] = distribution.get('python_version') or ""
package['requires_python'] = distribution.get('requires_python') or ""

package.update(parse_project_metadata(project))
package['python_version'] = distribution.get('python_version') or package.get('python_version')
package['requires_python'] = distribution.get('requires_python') or package.get('requires_python') # noqa: E501

return package

Expand All @@ -147,9 +164,30 @@ def get_project_metadata_from_artifact(filename, artifact):
temp_file.flush()
metadata = DIST_TYPES[packagetype](temp_file.name)
metadata.packagetype = packagetype
if packagetype == "sdist":
metadata.python_version = "source"
else:
pyver = ""
regex = DIST_REGEXES[extensions[pkg_type_index]]
if bdist_name := regex.match(filename):
pyver = bdist_name.group("pyver") or ""
metadata.python_version = pyver
return metadata


def artifact_to_python_content_data(filename, artifact, domain=None):
"""
Takes the artifact/filename and returns the metadata needed to create a PythonPackageContent.
"""
metadata = get_project_metadata_from_artifact(filename, artifact)
data = parse_project_metadata(vars(metadata))
data['sha256'] = artifact.sha256
data['filename'] = filename
data['pulp_domain'] = domain or artifact.pulp_domain
data['_pulp_domain'] = data['pulp_domain']
return data


def python_content_to_json(base_path, content_query, version=None, domain=None):
"""
Converts a QuerySet of PythonPackageContent into the PyPi JSON format
Expand Down
12 changes: 12 additions & 0 deletions pulp_python/tests/functional/api/test_crud_content_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,15 @@ def test_upload_metadata_23_spec(python_content_factory):
content = python_content_factory(filename, url=package.url)
assert content.metadata_version == "2.3"
break


@pytest.mark.parallel
def test_upload_requires_python(python_content_factory):
filename = "pip-24.3.1-py3-none-any.whl"
with PyPISimple() as client:
page = client.get_project_page("pip")
for package in page.packages:
if package.filename == filename:
content = python_content_factory(filename, url=package.url)
assert content.requires_python == ">=3.8"
break
Loading

0 comments on commit 87137d8

Please sign in to comment.