Skip to content

Commit

Permalink
Walk directories in sorted order for reproducibility (#517)
Browse files Browse the repository at this point in the history
* fix: Walk directories in sorted order for reproducibility

* recurse directories in sorted order
* recurse top-level *.dist-info/ directories last
* list filenames in sorted order
* list top-level *.dist-info/RECORD files last

* do not add zip file entries for non-empty directories

Co-authored-by: Lisandro Dalcin <dalcinl@gmail.com>

* Add test for empty folder

* Revert "Add test for empty folder"

This reverts commit a8134df.

* Revert "do not add zip file entries for non-empty directories"

This reverts commit fbdd62e.

* Reapply "Add test for empty folder"

This reverts commit 25b3dd1.

* fix test

---------

Co-authored-by: mayeut <mayeut@users.noreply.github.com>
  • Loading branch information
dalcinl and mayeut authored Jan 3, 2025
1 parent 2e00860 commit 5fac207
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 10 deletions.
53 changes: 49 additions & 4 deletions src/auditwheel/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import subprocess
import zipfile
from collections.abc import Iterable
from collections.abc import Generator, Iterable
from datetime import datetime, timezone
from typing import Any

Expand All @@ -29,6 +29,50 @@ def unique_by_index(sequence: Iterable[Any]) -> list[Any]:
return uniques


def walk(topdir: str) -> Generator[tuple[str, list[str], list[str]]]:
    """Wrapper for `os.walk` with outputs in reproducible order

    Subdirectories and file names are sorted by name. Directly under
    `topdir`, any ``*.dist-info`` subdirectory is visited after all other
    subdirectories, and inside such a top-level ``*.dist-info`` directory a
    file named ``RECORD`` is listed after all other files.

    Parameters
    ----------
    topdir : str
        Root of the directory tree

    Yields
    ------
    dirpath : str
        Path to a directory
    dirnames : list[str]
        List of subdirectory names in `dirpath`
    filenames : list[str]
        List of non-directory file names in `dirpath`
    """
    # normalize so that the equality checks against `dirpath` below are not
    # defeated by a trailing separator or redundant path components
    topdir = os.path.normpath(topdir)
    for dirpath, dirnames, filenames in os.walk(topdir):
        # `dirnames` must be reordered in-place: `os.walk` recurses into
        # subdirectories in whatever order this very list object holds
        if dirpath == topdir:
            # False sorts before True, so top-level *.dist-info come last
            dirnames.sort(key=lambda name: (name.endswith(".dist-info"), name))
        else:
            dirnames.sort()
        # sort list of filenames for iteration in reproducible order
        if dirpath.endswith(".dist-info") and os.path.dirname(dirpath) == topdir:
            # list any top-level dist-info/RECORD file last
            filenames.sort(key=lambda name: (name == "RECORD", name))
        else:
            filenames.sort()
        yield dirpath, dirnames, filenames


def zip2dir(zip_fname: str, out_dir: str) -> None:
"""Extract `zip_fname` into output directory `out_dir`
Expand Down Expand Up @@ -69,15 +113,16 @@ def dir2zip(in_dir: str, zip_fname: str, date_time: datetime | None = None) -> N
date_time : Optional[datetime]
Time stamp to set on each file in the archive
"""
in_dir = os.path.normpath(in_dir)
if date_time is None:
st = os.stat(in_dir)
date_time = datetime.fromtimestamp(st.st_mtime, tz=timezone.utc)
date_time_args = date_time.timetuple()[:6]
compression = zipfile.ZIP_DEFLATED
with zipfile.ZipFile(zip_fname, "w", compression=compression) as z:
for root, dirs, files in os.walk(in_dir):
for dir in dirs:
dname = os.path.join(root, dir)
for root, dirs, files in walk(in_dir):
if root != in_dir:
dname = root
out_dname = os.path.relpath(dname, in_dir) + "/"
zinfo = zipfile.ZipInfo.from_file(dname, out_dname)
zinfo.date_time = date_time_args
Expand Down
12 changes: 6 additions & 6 deletions src/auditwheel/wheeltools.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from ._vendor.wheel.pkginfo import read_pkg_info, write_pkg_info
from .tmpdirs import InTemporaryDirectory
from .tools import dir2zip, unique_by_index, zip2dir
from .tools import dir2zip, unique_by_index, walk, zip2dir

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -69,18 +69,18 @@ def rewrite_record(bdist_dir: str) -> None:
if exists(sig_path):
os.unlink(sig_path)

def walk() -> Generator[str]:
for dir, dirs, files in os.walk(bdist_dir):
for f in files:
yield pjoin(dir, f)
def files() -> Generator[str]:
    """Yield the path of every file found by `walk(bdist_dir)`.

    Each path is the walked directory path joined with the file name, in
    the order `walk` reports them.
    """
    # local names avoid rebinding this function's own name (`files`) and
    # the `dir` builtin
    for dirpath, _, filenames in walk(bdist_dir):
        for filename in filenames:
            yield pjoin(dirpath, filename)

def skip(path: str) -> bool:
    """Wheel hashes every possible file.

    Return True only when `path` equals `record_relpath`; the caller
    writes an empty hash for that one entry.
    """
    return path == record_relpath

with open(record_path, "w+", newline="", encoding="utf-8") as record_file:
writer = csv.writer(record_file)
for path in walk():
for path in files():
relative_path = relpath(path, bdist_dir)
if skip(relative_path):
hash_ = ""
Expand Down
23 changes: 23 additions & 0 deletions tests/unit/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import argparse
import lzma
import zipfile
from pathlib import Path

import pytest
Expand Down Expand Up @@ -100,3 +101,25 @@ def test_dir2zip_deflate(tmp_path):
output_file = tmp_path / "ouput.zip"
dir2zip(str(input_dir), str(output_file))
assert output_file.stat().st_size < len(buffer) / 4


def test_dir2zip_folders(tmp_path):
    """Check that `dir2zip` writes one entry per sub-folder, even empty ones.

    The input tree holds a ``dummy-1.0.dist-info`` folder with one
    ``METADATA`` file and an empty ``dummy/empty`` folder. The archive must
    hold exactly four entries: the three folders and the single file.
    """
    # build the input tree
    input_dir = tmp_path / "input_dir"
    input_dir.mkdir()
    dist_info_folder = input_dir / "dummy-1.0.dist-info"
    dist_info_folder.mkdir()
    (dist_info_folder / "METADATA").write_text("")
    (input_dir / "dummy" / "empty").mkdir(parents=True)

    output_file = tmp_path / "output.zip"
    dir2zip(str(input_dir), str(output_file))

    with zipfile.ZipFile(output_file, "r") as z:
        entries = z.infolist()

    assert len(entries) == 4
    folder_names = [entry.filename for entry in entries if entry.is_dir()]
    file_names = [entry.filename for entry in entries if not entry.is_dir()]
    # each expected folder appears exactly once, in any order
    assert sorted(folder_names) == ["dummy-1.0.dist-info/", "dummy/", "dummy/empty/"]
    # the only non-folder entry is the METADATA file
    assert file_names == ["dummy-1.0.dist-info/METADATA"]

0 comments on commit 5fac207

Please sign in to comment.