diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5df89e4fc7b4d..869bddd4c91f1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -357,6 +357,20 @@ repos: files: ^dev/breeze/src/airflow_breeze/utils/docker_command_utils\.py$|^scripts/ci/docker_compose/local\.yml$ pass_filenames: false additional_dependencies: ['rich>=12.4.4'] + - id: update-common-sql-api + name: Check and update common.sql API + entry: ./scripts/ci/pre_commit/pre_commit_update_common_sql_api.py + language: python + language_version: "3.7" + files: ^scripts/ci/pre_commit/pre_commit_update_common_sql_api\.py$ + pass_filenames: false + additional_dependencies: ['rich>=12.4.4', 'astor'] + - id: check-common-sql-api-usage + name: Check usages of the common.sql API in other providers + entry: ./scripts/ci/pre_commit/pre_commit_check_common_sql_api_usage.py + language: python + files: ^airflow/providers/.*\.py$ + additional_dependencies: ['rich>=12.4.4'] - id: update-providers-dependencies name: Update cross-dependencies for providers packages entry: ./scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py diff --git a/STATIC_CODE_CHECKS.rst b/STATIC_CODE_CHECKS.rst index 7495044f3dcd8..4dcdf3cdca7cf 100644 --- a/STATIC_CODE_CHECKS.rst +++ b/STATIC_CODE_CHECKS.rst @@ -155,6 +155,8 @@ require Breeze Docker image to be build locally. +--------------------------------------------------------+------------------------------------------------------------------+---------+ | check-changelog-has-no-duplicates | Check changelogs for duplicate entries | | +--------------------------------------------------------+------------------------------------------------------------------+---------+ +| check-common-sql-api-usage | Check usages of the common.sql API in other providers | | ++--------------------------------------------------------+------------------------------------------------------------------+---------+ | check-core-deprecation-classes | Verify using of dedicated Airflow deprecation classes in core | | +--------------------------------------------------------+------------------------------------------------------------------+---------+ | check-daysago-import-from-utils | Make sure days_ago is imported from airflow.utils.dates | | @@ -305,6 +307,8 @@ require Breeze Docker image to be build locally. 
+--------------------------------------------------------+------------------------------------------------------------------+---------+ | update-breeze-readme-config-hash | Update Breeze README.md with config files hash | | +--------------------------------------------------------+------------------------------------------------------------------+---------+ +| update-common-sql-api | Check and update common.sql API | | ++--------------------------------------------------------+------------------------------------------------------------------+---------+ | update-er-diagram | Update ER diagram | * | +--------------------------------------------------------+------------------------------------------------------------------+---------+ | update-extras | Update extras in documentation | |
diff --git a/airflow/providers/common/sql/README_API.md b/airflow/providers/common/sql/README_API.md new file mode 100644 index 0000000000000..a7533a8231495 --- /dev/null +++ b/airflow/providers/common/sql/README_API.md @@ -0,0 +1,100 @@
+
+# Keeping the API of the common.sql provider backwards compatible
+
+The API of the `common.sql` provider should be kept backwards compatible with previously released versions.
+The reason is that there are already released providers that use the `common.sql` provider and rely on its
+API and behaviour, so any update to that API or behaviour might break those released providers.
+
+Therefore, we should take extra care when changing the APIs.
+
+The approach we take is similar to the one applied by the Android OS team to keep their API in check,
+and it is based on storing the current version of the API and flagging changes that are potentially breaking.
+This is done by comparing the previous API (stored in [api.txt](api.txt)) with the upcoming API from the PR.
+The upcoming API is automatically extracted from the `common.sql` Python files by the `update-common-sql-api`
+pre-commit. If the comparison determines that the change is potentially breaking, the contributor is asked
+to review the changes and manually regenerate [api.txt](api.txt).
+
+The details of the workflow are as follows:
+
+1) The previous API is stored in the [api.txt](api.txt) file in this folder, committed to the repository
+   (see the excerpt after this list).
+2) Every time the common.sql Python files are modified, the `update-common-sql-api` pre-commit regenerates
+   the API file content and looks for potentially breaking changes (removals or updates of existing
+   classes/methods).
+3) If the check reveals no changes to the API, nothing happens.
+4) If there are only additions, the pre-commit automatically updates the [api.txt](api.txt) file,
+   asks the contributor to commit the resulting update to [api.txt](api.txt) and fails the pre-commit. This
+   is very similar to other static checks that automatically modify/fix source code.
+5) If the pre-commit detects potentially breaking changes, the process is a bit more involved for the
+   contributor. The pre-commit flags such changes by failing and asks the contributor to review them,
+   looking specifically for breaks of compatibility with previously released providers (and to fix any
+   backwards incompatibility). Once this is done, the contributor is asked to explicitly regenerate and
+   commit the new version of the API by running the pre-commit with a manually added environment
+   variable: `UPDATE_COMMON_SQL_API=1 pre-commit run update-common-sql-api`.
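+
+For illustration, the entries stored in [api.txt](api.txt) are fully qualified names of the public
+classes, functions and methods together with their signatures, for example (excerpt from the generated file):
+
+```text
+airflow.providers.common.sql.hooks.sql.DbApiHook
+airflow.providers.common.sql.hooks.sql.DbApiHook.get_uri() -> str
+airflow.providers.common.sql.operators.sql.BaseSQLOperator.get_db_hook() -> DbApiHook
+airflow.providers.common.sql.sensors.sql.SqlSensor.poke()
+```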
+
+# Verifying that other providers use only the public API of the `common.sql` provider
+
+The file holding the current API is used by a second pre-commit - `check-common-sql-api-usage`.
+
+This pre-commit does not update anything - it only checks that every provider uses only the public API
+stored in [api.txt](api.txt) when accessing the `common.sql` provider. The reason is that some methods
+and classes in common.sql are not supposed to be part of the API - they are internal helpers. Such
+methods and classes usually start with `_` (protected names) and they are excluded.
+
+However, historically, some protected methods have already been used by other providers, so until we
+determine it is safe to remove them, the pre-commit keeps the manually selected ones (for example
+`_parse_boolean` in the operators module) as part of the API, and they stay protected by this mechanism.
+This way we can be more certain that the community providers that use the `common.sql` provider only use
+the APIs that are intended to be public (see the example at the end of this document).
+
+Of course, this is never 100% fool-proof, due to Python's dynamic behaviour and possible internal changes
+in the behaviour of the common.sql APIs, but at the very least any "interface" change is flagged to the
+contributor making the change, and reviewers are notified whenever an API change happens.
+
+# External usage, tracking API changes
+
+The `api.txt` file can also be used by developers of external providers to make sure that only official
+APIs are used, which helps those external providers use the `common.sql` provider functionality without
+the fear of relying on methods and classes that are deemed non-public.
+
+Since this file is stored in the repository, we can automatically generate the API differences between
+future released versions of the `common.sql` provider and easily reason about API changes between versions.
+
+# Possible future work
+
+This mechanism providing the "official" API of the `common.sql` provider is just a first attempt to
+standardise the APIs provided by various Airflow components. In the future, we might decide to extend it
+after experimenting with it and testing its behaviour for `common.sql`.
+
+It keeps a formal, automatically generated and maintained list of the official classes and methods that
+providers can use. Once we get used to it and confirm that it is a good mechanism, we will be able to
+extend this approach to more cases:
+
+* for other cross-provider dependencies, we will be able to flag potentially breaking changes between them
+* we should be able to use a very similar mechanism for exposing and flagging the public API of Airflow's
+  core, and flag any misuse of its methods in the community providers
+* as a follow-up to those two, we should be able to produce a more "formal" API that externally written
+  providers could use, and write a "compatibility tool" that verifies whether such a provider is compatible
+  (on the API level) with a certain version of Airflow or, in the case of cross-provider dependencies, with
+  certain versions of the released providers
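+
+# Example: using only the public API
+
+A minimal, illustrative sketch of the rule enforced by `check-common-sql-api-usage` - the provider module
+path and the protected helper name below are hypothetical; only imports listed in [api.txt](api.txt) pass
+the check:
+
+```python
+# e.g. in airflow/providers/myvendor/operators/my_operator.py (hypothetical provider file)
+
+# OK: DbApiHook is listed in api.txt, so this import passes check-common-sql-api-usage
+from airflow.providers.common.sql.hooks.sql import DbApiHook
+
+# NOK: a name starting with "_" that is not explicitly listed in api.txt is not public API,
+# so this import would make the pre-commit fail
+from airflow.providers.common.sql.hooks.sql import _some_internal_helper
+```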
diff --git a/airflow/providers/common/sql/api.txt b/airflow/providers/common/sql/api.txt new file mode 100644 index 0000000000000..97fb8ced9cc22 --- /dev/null +++ b/airflow/providers/common/sql/api.txt @@ -0,0 +1,69 @@ +# This is a common.sql provider API dump +# +# This file is generated automatically by the update-common-sql-api pre-commit +# and it shows the current API of the common.sql provider +# +# Any, potentially breaking change in the API will require deliberate manual action from the contributor +# making a change to the common.sql provider. This file is also used in checking if only public API of +# the common.sql provider is used by all the other providers. +# +# You can read more in the README_API.md file +# +airflow.providers.common.sql.hooks.sql.ConnectorProtocol +airflow.providers.common.sql.hooks.sql.ConnectorProtocol.connect() -> Any +airflow.providers.common.sql.hooks.sql.DbApiHook +airflow.providers.common.sql.hooks.sql.DbApiHook.__init__(schema: str | None = None, log_sql: bool = True, *args, **kwargs) +airflow.providers.common.sql.hooks.sql.DbApiHook.bulk_dump() +airflow.providers.common.sql.hooks.sql.DbApiHook.bulk_load() +airflow.providers.common.sql.hooks.sql.DbApiHook.get_autocommit() -> bool +airflow.providers.common.sql.hooks.sql.DbApiHook.get_conn() +airflow.providers.common.sql.hooks.sql.DbApiHook.get_cursor() +airflow.providers.common.sql.hooks.sql.DbApiHook.get_first(self = None) -> Any +airflow.providers.common.sql.hooks.sql.DbApiHook.get_pandas_df(self = None, **kwargs) +airflow.providers.common.sql.hooks.sql.DbApiHook.get_pandas_df_by_chunks(self = None, chunksize, **kwargs) +airflow.providers.common.sql.hooks.sql.DbApiHook.get_records(self = None) -> Any +airflow.providers.common.sql.hooks.sql.DbApiHook.get_sqlalchemy_engine(self = None) +airflow.providers.common.sql.hooks.sql.DbApiHook.get_uri() -> str +airflow.providers.common.sql.hooks.sql.DbApiHook.insert_rows(self = None, table = 1000, rows = False, **kwargs) +airflow.providers.common.sql.hooks.sql.DbApiHook.last_description() -> Sequence | None +airflow.providers.common.sql.hooks.sql.DbApiHook.run(self = False, sql: str | Iterable = None, autocommit: bool = None, parameters: Iterable | Mapping | None = False, handler: Callable | None = True) -> Any | list | None +airflow.providers.common.sql.hooks.sql.DbApiHook.set_autocommit() +airflow.providers.common.sql.hooks.sql.DbApiHook.split_sql_string() -> list +airflow.providers.common.sql.hooks.sql.DbApiHook.strip_sql_string() -> str +airflow.providers.common.sql.hooks.sql.DbApiHook.test_connection() +airflow.providers.common.sql.hooks.sql.fetch_all_handler() -> list | None +airflow.providers.common.sql.hooks.sql.fetch_one_handler() -> list | None +airflow.providers.common.sql.hooks.sql.return_single_query_results() +airflow.providers.common.sql.operators.sql.BaseSQLOperator +airflow.providers.common.sql.operators.sql.BaseSQLOperator.__init__(conn_id: str | None = None, database: str | None = None, hook_params: dict | None = None, retry_on_failure: bool = True, **kwargs) +airflow.providers.common.sql.operators.sql.BaseSQLOperator.get_db_hook() -> DbApiHook +airflow.providers.common.sql.operators.sql.BranchSQLOperator +airflow.providers.common.sql.operators.sql.BranchSQLOperator.__init__(sql: str, follow_task_ids_if_true: list, follow_task_ids_if_false: list, conn_id: str = default_conn_id, database: str | None = None, parameters: Iterable | Mapping | None = None, **kwargs) -> None +airflow.providers.common.sql.operators.sql.BranchSQLOperator.execute() 
+airflow.providers.common.sql.operators.sql.SQLCheckOperator +airflow.providers.common.sql.operators.sql.SQLCheckOperator.__init__(sql: str, conn_id: str | None = None, database: str | None = None, parameters: Iterable | Mapping | None = None, **kwargs) -> None +airflow.providers.common.sql.operators.sql.SQLCheckOperator.execute() +airflow.providers.common.sql.operators.sql.SQLColumnCheckOperator +airflow.providers.common.sql.operators.sql.SQLColumnCheckOperator.__init__(table: str, column_mapping: dict, partition_clause: str | None = None, conn_id: str | None = None, database: str | None = None, accept_none: bool = True, **kwargs) +airflow.providers.common.sql.operators.sql.SQLColumnCheckOperator.execute() +airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator +airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator.__init__(sql: str | list, autocommit: bool = False, parameters: Mapping | Iterable | None = None, handler: Callable = fetch_all_handler, split_statements: bool = False, return_last: bool = True, **kwargs) -> None +airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator.execute() +airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator.prepare_template() -> None +airflow.providers.common.sql.operators.sql.SQLIntervalCheckOperator +airflow.providers.common.sql.operators.sql.SQLIntervalCheckOperator.__init__(table: str, metrics_thresholds: dict, date_filter_column: str | None = ds, days_back: SupportsAbs = -7, ratio_formula: str | None = max_over_min, ignore_zero: bool = True, conn_id: str | None = None, database: str | None = None, **kwargs) +airflow.providers.common.sql.operators.sql.SQLIntervalCheckOperator.execute() +airflow.providers.common.sql.operators.sql.SQLTableCheckOperator +airflow.providers.common.sql.operators.sql.SQLTableCheckOperator.__init__(table: str, checks: dict, partition_clause: str | None = None, conn_id: str | None = None, database: str | None = None, **kwargs) +airflow.providers.common.sql.operators.sql.SQLTableCheckOperator.execute() +airflow.providers.common.sql.operators.sql.SQLThresholdCheckOperator +airflow.providers.common.sql.operators.sql.SQLThresholdCheckOperator.__init__(sql: str, min_threshold: Any, max_threshold: Any, conn_id: str | None = None, database: str | None = None, **kwargs) +airflow.providers.common.sql.operators.sql.SQLThresholdCheckOperator.execute() +airflow.providers.common.sql.operators.sql.SQLThresholdCheckOperator.push() +airflow.providers.common.sql.operators.sql.SQLValueCheckOperator +airflow.providers.common.sql.operators.sql.SQLValueCheckOperator.__init__(sql: str, pass_value: Any, tolerance: Any = None, conn_id: str | None = None, database: str | None = None, **kwargs) +airflow.providers.common.sql.operators.sql.SQLValueCheckOperator.execute() +airflow.providers.common.sql.operators.sql._parse_boolean() -> str | bool +airflow.providers.common.sql.sensors.sql.SqlSensor +airflow.providers.common.sql.sensors.sql.SqlSensor.__init__(conn_id, sql, parameters = None, success = None, failure = None, fail_on_empty = False, hook_params = None, **kwargs) +airflow.providers.common.sql.sensors.sql.SqlSensor.poke() diff --git a/dev/breeze/src/airflow_breeze/pre_commit_ids.py b/dev/breeze/src/airflow_breeze/pre_commit_ids.py index b64d92b5c4cf7..c463df850f873 100644 --- a/dev/breeze/src/airflow_breeze/pre_commit_ids.py +++ b/dev/breeze/src/airflow_breeze/pre_commit_ids.py @@ -34,6 +34,7 @@ "check-breeze-top-dependencies-limited", "check-builtin-literals", 
"check-changelog-has-no-duplicates", + "check-common-sql-api-usage", "check-core-deprecation-classes", "check-daysago-import-from-utils", "check-decorated-operator-implements-custom-name", @@ -100,6 +101,7 @@ "ts-compile-and-lint-javascript", "update-breeze-cmd-output", "update-breeze-readme-config-hash", + "update-common-sql-api", "update-er-diagram", "update-extras", "update-in-the-wild-to-be-sorted", diff --git a/images/breeze/output-commands-hash.txt b/images/breeze/output-commands-hash.txt index a538cf7cd3dfb..868591991df8e 100644 --- a/images/breeze/output-commands-hash.txt +++ b/images/breeze/output-commands-hash.txt @@ -50,7 +50,7 @@ setup:version:123b462a421884dc2320ffc5e54b2478 setup:a3bd246c3a425f3e586d11bbdc8937cb shell:51149096ed3fb28d2a01e0a5ad0146ef start-airflow:4bec5d26386dc268ec2de2f637c85272 -static-checks:61c8d96ffe3008bff89a9df39b423be2 +static-checks:4f1fc165bae62536a2f91de96799ba11 stop:8969537ccdd799f692ccb8600a7bbed6 testing:docker-compose-tests:b86c044b24138af0659a05ed6331576c testing:helm-tests:94a442e7f3f63b34c4831a84d165690a diff --git a/images/breeze/output-commands.svg b/images/breeze/output-commands.svg index e55274eb0cf22..9c51ea978fa1a 100644 --- a/images/breeze/output-commands.svg +++ b/images/breeze/output-commands.svg @@ -35,8 +35,8 @@ .terminal-2506199366-r1 { fill: #c5c8c6;font-weight: bold } .terminal-2506199366-r2 { fill: #c5c8c6 } .terminal-2506199366-r3 { fill: #d0b344;font-weight: bold } -.terminal-2506199366-r4 { fill: #68a0b3;font-weight: bold } -.terminal-2506199366-r5 { fill: #868887 } +.terminal-2506199366-r4 { fill: #868887 } +.terminal-2506199366-r5 { fill: #68a0b3;font-weight: bold } .terminal-2506199366-r6 { fill: #98a84b;font-weight: bold } .terminal-2506199366-r7 { fill: #8d7b39 } @@ -190,50 +190,50 @@ -Usage: breeze [OPTIONSCOMMAND [ARGS]... +Usage: breeze [OPTIONS] COMMAND [ARGS]... -╭─ Basic flags ────────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---python-pPython major/minor version used in Airflow image for images.(>3.7< | 3.8 | 3.9 | 3.10) -[default: 3.7]                                               ---backend-bDatabase backend to use.(>sqlite< | mysql | postgres | mssql)[default: sqlite] ---postgres-version-PVersion of Postgres used.(>11< | 12 | 13 | 14 | 15)[default: 11] ---mysql-version-MVersion of MySQL used.(>5.7< | 8)[default: 5.7] ---mssql-version-SVersion of MsSQL used.(>2017-latest< | 2019-latest)[default: 2017-latest] ---integrationIntegration(s) to enable when running (can be more than one).                             -(cassandra | kerberos | mongo | openldap | pinot | rabbitmq | redis | statsd | trino |    -all)                                                                                      ---forward-credentials-fForward local credentials to container when running. ---db-reset-dReset DB when entering the container. ---max-timeMaximum time that the command should take - if it takes longer, the command will fail. -(INTEGER RANGE)                                                                        ---github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Common options ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---verbose-vPrint verbose information about performed steps. 
diff --git a/images/breeze/output_static-checks.svg b/images/breeze/output_static-checks.svg index b9c725f1ad8ac..335c0b75db290 100644 --- a/images/breeze/output_static-checks.svg +++ b/images/breeze/output_static-checks.svg
[Regenerated SVG screenshot of the "breeze static-checks" help output; the list of --type values now includes the new check-common-sql-api-usage and update-common-sql-api pre-commit ids.]
diff --git a/scripts/ci/pre_commit/pre_commit_check_common_sql_api_usage.py b/scripts/ci/pre_commit/pre_commit_check_common_sql_api_usage.py new file mode 100755 index 0000000000000..7c513a227514a --- /dev/null +++ b/scripts/ci/pre_commit/pre_commit_check_common_sql_api_usage.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import ast +import pathlib +import sys +from pathlib import Path + +from rich.console import Console + +if __name__ not in ("__main__", "__mp_main__"): + raise SystemExit( + "This file is intended to be executed as an executable program. You cannot use it as a module." + f"To execute this script, run ./{__file__} [FILE] ..." + ) + +AIRFLOW_SOURCES_ROOT = Path(__file__).parents[3].resolve() +COMMON_SQL_ROOT = AIRFLOW_SOURCES_ROOT / "airflow" / "providers" / "common" / "sql" + +COMMON_SQL_API_FILE = COMMON_SQL_ROOT / "api.txt" +COMMON_SQL_README_API_FILE = COMMON_SQL_ROOT / "README_API.md" + +COMMON_SQL_PACKAGE_PREFIX = "airflow.providers.common.sql."
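+# Lines in api.txt look like "airflow.providers.common.sql.hooks.sql.DbApiHook.get_uri() -> str";
+# splitting each line on the first "(" below keeps just the fully qualified name, which is what the
+# import statements found in the other providers' files are compared against.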
+ +console = Console(width=400, color_system="standard") + +ALL_PUBLIC_COMMON_SQL_API_IMPORTS = { + line.split("(")[0] + for line in COMMON_SQL_API_FILE.read_text().splitlines() + if line.strip() and not line.strip().startswith("#") +} + + +def get_imports_from_file(filepath: Path): + content = filepath.read_text() + doc_node = ast.parse(content, str(filepath)) + import_names: set[str] = set() + for current_node in ast.walk(doc_node): + if not isinstance(current_node, (ast.Import, ast.ImportFrom)): + continue + for alias in current_node.names: + name = alias.name + fullname = f"{current_node.module}.{name}" if isinstance(current_node, ast.ImportFrom) else name + import_names.add(fullname) + return import_names + + +if __name__ == "__main__": + errors = 0 + for file_name in sys.argv[1:]: + file_path = AIRFLOW_SOURCES_ROOT / pathlib.Path(file_name) + try: + file_path.relative_to(COMMON_SQL_ROOT) + console.print(f"[bright_blue]Skipping {file_path} as it is part of the common.sql provider[/]") + continue + except ValueError: + # The is_relative_to is only available in 3.9+, so we have to attempt to get relative path, and + # it will fail in case it is not (we only want to check those paths that are not in common.sql + pass + imports = get_imports_from_file(file_path) + header_printed = False + for _import in imports: + if _import.startswith(COMMON_SQL_PACKAGE_PREFIX): + if not header_printed: + console.print(f"\n[bright_blue]The common.sql imports in {file_path}:[/]") + header_printed = True + if _import not in ALL_PUBLIC_COMMON_SQL_API_IMPORTS: + console.print( + f"[red]NOK. The import {_import} is not part of the public common.sql API.[/]" + ) + errors += 1 + else: + console.print(f"[green]OK. The import {_import} is part of the public common.sql API.[/]") + if errors == 0: + console.print( + "[green]All OK. All usages of the common.sql provider in other providers use public API[/]" + ) + sys.exit(0) + console.print( + "\n[red]Some of the above providers are not using the public API of common.sql.[/]" + f"\nYou can only use public API as defined in {COMMON_SQL_API_FILE}." + f"\n[yellow]Please fix it and make sure only public API is used.[/]" + f"\n[bright_blue]You can read more about it in the {COMMON_SQL_README_API_FILE}.[/]\n" + ) + sys.exit(1) diff --git a/scripts/ci/pre_commit/pre_commit_update_common_sql_api.py b/scripts/ci/pre_commit/pre_commit_update_common_sql_api.py new file mode 100755 index 0000000000000..6166e6edc1895 --- /dev/null +++ b/scripts/ci/pre_commit/pre_commit_update_common_sql_api.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
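+# This pre-commit extracts the public classes, functions and methods of the common.sql provider
+# (together with their signatures) and compares the result with the committed api.txt file:
+# pure additions are written back to api.txt automatically, while removals or changes of existing
+# entries are treated as potentially breaking and require re-running with UPDATE_COMMON_SQL_API=1 set.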
+from __future__ import annotations + +import ast +import difflib +import os +import sys +import textwrap +from _ast import AST, BitOr, USub +from enum import Enum +from pathlib import Path +from typing import Generator + +import astor as astor +from rich.console import Console + +if __name__ not in ("__main__", "__mp_main__"): + raise SystemExit( + "This file is intended to be executed as an executable program. You cannot use it as a module." + f"To execute this script, run ./{__file__} [FILE] ..." + ) + +AIRFLOW_SOURCES_ROOT = Path(__file__).parents[3].resolve() +COMMON_SQL_ROOT = AIRFLOW_SOURCES_ROOT / "airflow" / "providers" / "common" / "sql" + +COMMON_SQL_API_FILE = COMMON_SQL_ROOT / "api.txt" + +COMMON_SQL_PACKAGE_PREFIX = "airflow.providers.common.sql." + +console = Console(width=400, color_system="standard") + + +def should_be_excluded(parent: str, name: str): + if parent == "airflow.providers.common.sql.operators.sql" and name == "_parse_boolean": + # special exception for already "public" _parse_boolean in operators.sql submodule + return False + return name.startswith("_") and name != "__init__" + + +def recurse_node(node) -> str: + if hasattr(node, "op"): + if isinstance(node.op, BitOr): + return recurse_node(node.left) + " | " + recurse_node(node.right) + elif isinstance(node.op, USub): + return f"-{recurse_node(node.operand)}" + else: + raise Exception(f"Unhandled op type {node}. Attributes: {node.__dict__}") + elif hasattr(node, "id"): + return str(node.id) if node.id is not None else "None" + elif hasattr(node, "value"): + if isinstance(node.value, AST): + return recurse_node(node.value) + return str(node.value) if node.value is not None else "None" + elif hasattr(node, "slice"): + return f"{node.value.id}[{recurse_node(node.slice)}]" + elif hasattr(node, "elts"): + return ",".join([recurse_node(el) for el in node.elts]) + elif hasattr(node, "s"): + # Python 3.7 has legacy `_ast.Str` type that is deprecated in 3.8 + # We can remove it when 3.7 is not our default version + return node.s + elif hasattr(node, "n"): + # Python 3.7 has legacy `_ast.Num` type that is deprecated in 3.8 + # We can remove it when 3.7 is not our default version + return node.n + raise Exception(f"I do not know how to handle node: {node}. 
Attributes: {node.__dict__}") + + +class ARGTYPE(Enum): + REGULAR = "" + POSITION_ONLY = "" + KEYWORD_ONLY = "" + + +def arg_generator(args: ast.arguments) -> Generator[tuple[ast.arg, ast.expr | None, ARGTYPE], None, None]: + if hasattr(args, "args"): + for arg, def_arg in zip(args.args, args.defaults): + yield arg, def_arg, ARGTYPE.REGULAR + if hasattr(args, "posonlyargs"): + # Python 3.8 introduced position-only args - until we switch to 3.8 we need to ignore typing issue + # We will be able to remove the `type: ignore` when we switch to 3.8 + for arg in args.posonlyargs: # type: ignore + yield arg, None, ARGTYPE.POSITION_ONLY + if hasattr(args, "kwonlyargs"): + for arg, def_kwarg in zip(args.kwonlyargs, args.kw_defaults): + yield arg, def_kwarg, ARGTYPE.KEYWORD_ONLY + + +class PublicUsageFinder(astor.TreeWalk): + package_path: str + functions: set[str] = set() + classes: set[str] = set() + methods: set[str] = set() + current_class: str | None = None + + def get_current_node_args(self) -> str: + result = "" + separator = "" + if hasattr(self.cur_node, "args"): + for arg, default_value, arg_type in arg_generator(self.cur_node.args): + if arg is not None: + result += f"{separator}{arg.arg}{arg_type.value}" + if hasattr(arg, "annotation") and arg.annotation is not None: + result += f": {recurse_node(arg.annotation)}" + if default_value: + result += f" = {recurse_node(default_value)}" + separator = ", " + if self.cur_node.args.vararg: + result += f"{separator}*args" + if self.cur_node.args.kwarg: + result += f"{separator}**kwargs" + return result + + def get_current_return(self) -> str: + if self.cur_node.returns: + return f" -> {recurse_node(self.cur_node.returns)}" + return "" + + def pre_ClassDef(self): + class_name = f"{self.package_path}.{self.cur_node.name}" + if not should_be_excluded(class_name, self.cur_node.name): + self.classes.add(class_name) + self.current_class = class_name + + def post_ClassDef(self): + self.current_class = None + + def pre_FunctionDef(self): + if not self.current_class: + if not should_be_excluded(self.package_path, self.cur_node.name): + self.functions.add( + f"{self.package_path}.{self.cur_node.name}(" + f"{self.get_current_node_args()}){self.get_current_return()}" + ) + return + if not should_be_excluded(self.current_class, self.cur_node.name): + self.methods.add( + f"{self.current_class}.{self.cur_node.name}(" + f"{self.get_current_node_args()}){self.get_current_return()}" + ) + + +def get_current_apis_of_common_sql_provider() -> list[str]: + """ + Extracts API from the common.sql provider. + + The file generated is a text file of all classes and methods that should be considered as "Public API" + of the common.sql provider. + + All "users" of common.sql API are verified if they are only using those API. 
+ """ + results = set() + all_common_sql_files = COMMON_SQL_ROOT.rglob("**/*.py") + for module_source_path in all_common_sql_files: + module = ast.parse(module_source_path.read_text("utf-8"), str(module_source_path)) + finder = PublicUsageFinder() + finder.package_path = str(module_source_path.relative_to(AIRFLOW_SOURCES_ROOT)).replace(os.sep, ".")[ + :-3 + ] + finder.walk(node=module) + results.update(finder.classes) + results.update(finder.methods) + results.update(finder.functions) + return list(sorted(results)) + + +class ConsoleDiff(difflib.Differ): + def _dump(self, tag, x, lo, hi): + """Generate comparison results for a same-tagged range.""" + for i in range(lo, hi): + if tag == "+": + yield f"[green]{tag} {x[i]}[/]" + elif tag == "-": + yield f"[red]{tag} {x[i]}[/]" + else: + yield f"[white]{tag} {x[i]}[/]" + + +WHAT_TO_CHECK = """Those are the changes you should check: + + * for every removed method/class/function, check that is not used and that ALL ACTIVE PAST VERSIONS of + released SQL providers do not use the method. + + * for every changed parameter of a method/function/class constructor, check that changes in the parameters + are backwards compatible. +""" + +PREAMBLE = """# This is a common.sql provider API dump +# +# This file is generated automatically by the update-common-sql-api pre-commit +# and it shows the current API of the common.sql provider +# +# Any, potentially breaking change in the API will require deliberate manual action from the contributor +# making a change to the common.sql provider. This file is also used in checking if only public API of +# the common.sql provider is used by all the other providers. +# +# You can read more in the README_API.md file +# +""" + + +def summarize_changes(results: list[str]) -> tuple[int, int]: + """ + Returns summary of the changes. + + :param results: results of comparison in the form of line of strings + :return: Tuple: [number of removed lines, number of added lines] + """ + removals, additions = 0, 0 + for line in results: + if line.startswith("+") or line.startswith("[green]+"): + additions += 1 + if line.startswith("-") or line.startswith("[red]-"): + removals += 1 + return removals, additions + + +if __name__ == "__main__": + extracted_api_list = get_current_apis_of_common_sql_provider() + # Exclude empty lines and comments + current_api_list = [ + line + for line in COMMON_SQL_API_FILE.read_text(encoding="utf-8").splitlines() + if line.strip() and not line.strip().startswith("#") + ] + perform_update = False + if current_api_list != extracted_api_list: + diff = ConsoleDiff() + comparison_results = list(diff.compare(current_api_list, extracted_api_list)) + _removals, _additions = summarize_changes(comparison_results) + console.print("[bright_blue]Summary of the changes in common.sql API that this change brings:[/]\n") + console.print(textwrap.indent("\n".join(comparison_results), " " * 4)) + if _removals: + if not os.environ.get("UPDATE_COMMON_SQL_API") == "1": + console.print( + f"\n[red]ERROR! 
As you can see above, there are changes in the common.sql API:[/]\n\n" + f"[red]* additions: {_additions}[/]\n" + f"[red]* removals: {_removals}[/]\n" + ) + console.print( + "[bright_blue]Make sure to review the removals and changes for back-compatibility.[/]\n" + "[bright_blue]If you are sure all the changes are justified, run:[/]" + ) + console.print("\n[magenta]UPDATE_COMMON_SQL_API=1 pre-commit run update-common-sql-api[/]\n") + console.print(WHAT_TO_CHECK) + console.print("\n[yellow]Make sure to commit the changes after you updating the API.[/]") + else: + console.print( + f"\n[bright_blue]As you can see above, there are changes in the common.sql API:[/]\n\n" + f"[bright_blue]* additions: {_additions}[/]\n" + f"[bright_blue]* removals: {_removals}[/]\n" + ) + console.print("[bright_blue]You've set UPDATE_COMMON_SQL_API to 1 to update the API.[/]\n\n") + perform_update = True + else: + perform_update = True + console.print( + f"\n[yellow]There are only additions in the API extracted from the common.sql code[/]\n\n" + f"[bright_blue]* additions: {_additions}[/]\n" + ) + if perform_update: + COMMON_SQL_API_FILE.write_text(PREAMBLE + "\n".join(extracted_api_list) + "\n") + console.print( + f"\n[bright_blue]The {COMMON_SQL_API_FILE} file is updated automatically.[/]\n" + "\n[yellow]Make sure to commit the changes.[/]" + ) + sys.exit(1) + else: + console.print("\n[green]All OK. The common.sql APIs did not change[/]")
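For a feel of how the comparison above drives the decision, here is a minimal sketch (illustrative only - the API entries are hypothetical, and it assumes the `ConsoleDiff` and `summarize_changes` definitions from this script are in scope):

```python
# Hypothetical "previous" and "extracted" API dumps - one entry was renamed.
old_api = ["pkg.Hook.get_conn()", "pkg.Hook.run()"]
new_api = ["pkg.Hook.get_conn()", "pkg.Hook.execute()"]

comparison = list(ConsoleDiff().compare(old_api, new_api))
removals, additions = summarize_changes(comparison)

# Here removals == 1 and additions == 1. Because something was removed, the pre-commit refuses to
# rewrite api.txt unless UPDATE_COMMON_SQL_API=1 is set; if there were only additions, api.txt would
# be updated automatically and the contributor only asked to commit the result.
```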