Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve detection of when breeze CI image needs rebuilding #33603

Merged
merged 2 commits into from
Aug 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,10 @@ def should_we_run_the_build(build_ci_params: BuildCiParams) -> bool:
# We import those locally so that click autocomplete works
from inputimeout import TimeoutOccurred

if not md5sum_check_if_build_is_needed(md5sum_cache_dir=build_ci_params.md5sum_cache_dir):
if not md5sum_check_if_build_is_needed(
md5sum_cache_dir=build_ci_params.md5sum_cache_dir,
skip_provider_dependencies_check=build_ci_params.skip_provider_dependencies_check,
):
return False
try:
answer = user_confirm(
Expand Down Expand Up @@ -631,6 +634,7 @@ def rebuild_or_pull_ci_image_if_needed(command_params: ShellParams | BuildCiPara
image_tag=command_params.image_tag,
platform=command_params.platform,
force_build=command_params.force_build,
skip_provider_dependencies_check=command_params.skip_provider_dependencies_check,
)
if command_params.image_tag is not None and command_params.image_tag != "latest":
return_code, message = run_pull_image(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,9 @@ def static_checks(
force_build=force_build,
image_tag=image_tag,
github_repository=github_repository,
# for static checks we do not want to regenerate dependencies before pre-commits are run
# we want the pre-commit to do it for us (and detect the case the dependencies are updated)
skip_provider_dependencies_check=True,
)
if not skip_image_check:
rebuild_or_pull_ci_image_if_needed(command_params=build_params)
Expand Down
2 changes: 1 addition & 1 deletion dev/breeze/src/airflow_breeze/global_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,13 +323,13 @@ def get_airflow_extras():
"setup.cfg",
"Dockerfile.ci",
".dockerignore",
"generated/provider_dependencies.json",
"scripts/docker/common.sh",
"scripts/docker/install_additional_dependencies.sh",
"scripts/docker/install_airflow.sh",
"scripts/docker/install_airflow_dependencies_from_branch_tip.sh",
"scripts/docker/install_from_docker_context_files.sh",
"scripts/docker/install_mysql.sh",
*ALL_PROVIDER_YAML_FILES,
]

ENABLED_SYSTEMS = ""
Expand Down
1 change: 1 addition & 0 deletions dev/breeze/src/airflow_breeze/params/build_ci_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class BuildCiParams(CommonBuildParams):
airflow_pre_cached_pip_packages: bool = True
force_build: bool = False
eager_upgrade_additional_requirements: str = ""
skip_provider_dependencies_check: bool = False

@property
def airflow_version(self):
Expand Down
1 change: 1 addition & 0 deletions dev/breeze/src/airflow_breeze/params/shell_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ class ShellParams:
celery_flower: bool = False
only_min_version_update: bool = False
regenerate_missing_docs: bool = False
skip_provider_dependencies_check: bool = False

def clone_with_test(self, test_type: str) -> ShellParams:
new_params = deepcopy(self)
Expand Down
70 changes: 56 additions & 14 deletions dev/breeze/src/airflow_breeze/utils/md5_build_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@
from __future__ import annotations

import hashlib
import os
import sys
from pathlib import Path

from airflow_breeze.global_constants import FILES_FOR_REBUILD_CHECK
from airflow_breeze.global_constants import ALL_PROVIDER_YAML_FILES, FILES_FOR_REBUILD_CHECK
from airflow_breeze.utils.console import get_console
from airflow_breeze.utils.path_utils import AIRFLOW_SOURCES_ROOT
from airflow_breeze.utils.run_utils import run_command


def check_md5checksum_in_cache_modified(file_hash: str, cache_path: Path, update: bool) -> bool:
Expand Down Expand Up @@ -59,44 +62,83 @@ def generate_md5(filename, file_size: int = 65536):
return hash_md5.hexdigest()


def check_md5_sum_for_file(file_to_check: str, md5sum_cache_dir: Path, update: bool):
file_to_get_md5 = AIRFLOW_SOURCES_ROOT / file_to_check
md5_checksum = generate_md5(file_to_get_md5)
sub_dir_name = file_to_get_md5.parts[-2]
actual_file_name = file_to_get_md5.parts[-1]
cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum")
file_content = md5_checksum + " " + str(file_to_get_md5) + "\n"
is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update)
return is_modified


def calculate_md5_checksum_for_files(
md5sum_cache_dir: Path, update: bool = False
md5sum_cache_dir: Path, update: bool = False, skip_provider_dependencies_check: bool = False
) -> tuple[list[str], list[str]]:
"""
Calculates checksums for all interesting files and stores the hashes in the md5sum_cache_dir.
Optionally modifies the hashes.

:param md5sum_cache_dir: directory where to store cached information
:param update: whether to update the hashes
:param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies
:return: Tuple of two lists: modified and not-modified files
"""
not_modified_files = []
modified_files = []
for calculate_md5_file in FILES_FOR_REBUILD_CHECK:
file_to_get_md5 = AIRFLOW_SOURCES_ROOT / calculate_md5_file
md5_checksum = generate_md5(file_to_get_md5)
sub_dir_name = file_to_get_md5.parts[-2]
actual_file_name = file_to_get_md5.parts[-1]
cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum")
file_content = md5_checksum + " " + str(file_to_get_md5) + "\n"
is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update)
if not skip_provider_dependencies_check:
modified_provider_yaml_files = []
for file in ALL_PROVIDER_YAML_FILES:
# Only check provider yaml files once and save the result immediately.
# If we need to regenerate the dependencies and they are not modified then
# all is fine and we can save checksums for the new files
if check_md5_sum_for_file(file, md5sum_cache_dir, True):
modified_provider_yaml_files.append(file)
if modified_provider_yaml_files:
get_console().print(
"[info]Attempting to generate provider dependencies. "
"Provider yaml files changed since last check:[/]"
)
get_console().print(
[os.fspath(file.relative_to(AIRFLOW_SOURCES_ROOT)) for file in modified_provider_yaml_files]
)
# Regenerate provider_dependencies.json
run_command(
[
sys.executable,
os.fspath(
AIRFLOW_SOURCES_ROOT
/ "scripts"
/ "ci"
/ "pre_commit"
/ "pre_commit_update_providers_dependencies.py"
),
],
cwd=AIRFLOW_SOURCES_ROOT,
)
for file in FILES_FOR_REBUILD_CHECK:
is_modified = check_md5_sum_for_file(file, md5sum_cache_dir, update)
if is_modified:
modified_files.append(calculate_md5_file)
modified_files.append(file)
else:
not_modified_files.append(calculate_md5_file)
not_modified_files.append(file)
return modified_files, not_modified_files


def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path) -> bool:
def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path, skip_provider_dependencies_check: bool) -> bool:
"""
Checks if build is needed based on whether important files were modified.

:param md5sum_cache_dir: directory where cached md5 sums are stored
:param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies

:return: True if build is needed.
"""
build_needed = False
modified_files, not_modified_files = calculate_md5_checksum_for_files(md5sum_cache_dir, update=False)
modified_files, not_modified_files = calculate_md5_checksum_for_files(
md5sum_cache_dir, update=False, skip_provider_dependencies_check=skip_provider_dependencies_check
)
if modified_files:
get_console().print(
f"[warning]The following important files are modified in {AIRFLOW_SOURCES_ROOT} "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,12 +209,28 @@ def check_if_different_provider_used(file_path: Path) -> None:
console.print("[red]Errors found during verification. Exiting!")
console.print()
sys.exit(1)
DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n")
console.print(
f"[yellow]If you see changes to the {DEPENDENCIES_JSON_FILE_PATH} file - "
f"do not modify the file manually. Let pre-commit do the job!"
)
console.print()
console.print("[green]Verification complete! Success!\n")
console.print(f"Written {DEPENDENCIES_JSON_FILE_PATH}")
console.print()
old_dependencies = DEPENDENCIES_JSON_FILE_PATH.read_text()
new_dependencies = json.dumps(unique_sorted_dependencies, indent=2) + "\n"
if new_dependencies != old_dependencies:
DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n")
if os.environ.get("CI"):
console.print()
console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}")
console.print(
f"[yellow]You will need to run breeze locally and commit "
f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n"
)
console.print()
else:
console.print()
console.print(
f"[yellow]Regenerated new dependencies. Please commit "
f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n"
)
console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}")
console.print()
else:
console.print(
"[green]No need to regenerate dependencies!\n[/]"
f"The {DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)} is up to date!\n"
)