From ae5832d76b30ca1bf1ca349be6f0b09442b95fa4 Mon Sep 17 00:00:00 2001 From: Ricardo Portilla Date: Thu, 26 Sep 2024 09:30:59 -0400 Subject: [PATCH 01/21] make fmt run --- src/databricks/labs/ucx/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index d3ae23fdca..c6242090e2 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -19,7 +19,6 @@ from databricks.labs.ucx.install import AccountInstaller from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter -from databricks.labs.ucx.assessment.export import AssessmentExporter ucx = App(__file__) logger = get_logger(__file__) @@ -681,6 +680,7 @@ def join_collection(a: AccountClient, workspace_ids: str): w_ids = [int(_.strip()) for _ in workspace_ids.split(",") if _] account_installer.join_collection(w_ids) + @ucx.command def upload( file: Path | str, From 204ab4a203c77c6507cee11f511b5dde7180d16e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 24 Sep 2024 12:18:11 +0200 Subject: [PATCH 02/21] Minor contributor documentation changes (#2729) ## Changes This PR makes some minor changes to the the contributor documentation: - Hatch/env setup tweak to keep IntelliJ/PyCharm happy. (For some reason if the full path isn't specified IntelliJ can have problems locating the python interpreter for the venv.) - ~Add in the linting step.~ --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 48b00fc086..3baf859c77 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -259,7 +259,7 @@ make dev To use different python version, specify it in `HATCH_PYTHON` variable: ```shell -HATCH_PYTHON=python3.10 make clean dev test +HATCH_PYTHON="$(which python3.10)" make clean dev test ``` Configure your IDE to use `.venv/bin/python` from the virtual environment when developing the project: From 0a6e72f9e60fef6a0cad4009abde6b0e581c82ae Mon Sep 17 00:00:00 2001 From: Cor Date: Tue, 24 Sep 2024 12:20:42 +0200 Subject: [PATCH 03/21] Handle `PermissionDenied` when listing accessible workspaces (#2733) ## Changes Handle `PermissionDenied` when listing accessible workspaces ### Linked issues Resolves #2732 ### Functionality - [x] modified existing command: that user the accesible workspaces (most account level commands) ### Tests - [ ] manually tested - [x] added unit tests --- src/databricks/labs/ucx/account/workspaces.py | 27 +++++---- tests/unit/account/test_workspaces.py | 57 ++++++++++++++++++- 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/account/workspaces.py b/src/databricks/labs/ucx/account/workspaces.py index 6e9be51721..b15ecd78ae 100644 --- a/src/databricks/labs/ucx/account/workspaces.py +++ b/src/databricks/labs/ucx/account/workspaces.py @@ -98,20 +98,25 @@ def get_accessible_workspaces(self) -> list[Workspace]: return accessible_workspaces def can_administer(self, workspace: Workspace) -> bool: + """Evaluate if the user can administer a workspace. + + A user can administer a workspace if the user can access the workspace and is a member of the workspace "admins" + group. + + Args: + workspace (Workspace): The workspace to check if the user can administer. + + Returns: + bool: True if the user can administer the workspace, False otherwise. 
+ """ try: - # check if user has access to workspace ws = self.client_for(workspace) - except (PermissionDenied, NotFound, ValueError) as err: - logger.warning(f"{workspace.deployment_name}: Encounter error {err}. Skipping...") - return False - current_user = ws.current_user.me() - if current_user.groups is None: + current_user = ws.current_user.me() + except (PermissionDenied, NotFound, ValueError) as e: + logger.warning(f"User cannot access workspace: {workspace.deployment_name}", exc_info=e) return False - # check if user is a workspace admin - if "admins" not in [g.display for g in current_user.groups]: - logger.warning( - f"{workspace.deployment_name}: User {current_user.user_name} is not a workspace admin. Skipping..." - ) + if current_user.groups is None or "admins" not in {g.display for g in current_user.groups}: + logger.warning(f"User '{current_user.user_name}' is not a workspace admin: {workspace.deployment_name}") return False return True diff --git a/tests/unit/account/test_workspaces.py b/tests/unit/account/test_workspaces.py index d6862bf801..10023b5b92 100644 --- a/tests/unit/account/test_workspaces.py +++ b/tests/unit/account/test_workspaces.py @@ -1,3 +1,4 @@ +import logging import io import json from unittest.mock import create_autospec @@ -6,7 +7,7 @@ from databricks.labs.blueprint.installation import Installation, MockInstallation from databricks.labs.blueprint.tui import MockPrompts from databricks.sdk import AccountClient, WorkspaceClient -from databricks.sdk.errors import NotFound, ResourceConflict +from databricks.sdk.errors import NotFound, PermissionDenied, ResourceConflict from databricks.sdk.service import iam from databricks.sdk.service.iam import ComplexValue, Group, ResourceMeta, User from databricks.sdk.service.provisioning import Workspace @@ -494,3 +495,57 @@ def get_workspace_client(workspace) -> WorkspaceClient: acc.config.auth_type = "databricks-cli" account_workspaces = AccountWorkspaces(acc) assert len(account_workspaces.get_accessible_workspaces()) == 1 + + +def test_account_workspaces_can_administer_when_user_in_admins_group() -> None: + acc = create_autospec(AccountClient) + ws = create_autospec(WorkspaceClient) + acc.get_workspace_client.return_value = ws + ws.current_user.me.return_value = User(user_name="test", groups=[ComplexValue(display="admins")]) + account_workspaces = AccountWorkspaces(acc) + workspace = Workspace(deployment_name="test") + + assert account_workspaces.can_administer(workspace) + + +@pytest.mark.parametrize("groups", [[ComplexValue(display="not-admins")], None]) +def test_account_workspaces_cannot_administer_when_user_not_in_admins_group(caplog, groups) -> None: + acc = create_autospec(AccountClient) + ws = create_autospec(WorkspaceClient) + acc.get_workspace_client.return_value = ws + ws.current_user.me.return_value = User(user_name="test", groups=groups) + account_workspaces = AccountWorkspaces(acc) + workspace = Workspace(deployment_name="test") + + with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.workspaces"): + can_administer = account_workspaces.can_administer(workspace) + assert not can_administer + assert "User 'test' is not a workspace admin: test" in caplog.messages + + +def test_account_workspaces_can_administer_handles_not_found_error_for_get_workspace_client(caplog) -> None: + acc = create_autospec(AccountClient) + acc.get_workspace_client.side_effect = NotFound + account_workspaces = AccountWorkspaces(acc) + workspace = Workspace(deployment_name="test") + + with 
caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.workspaces"): + can_administer = account_workspaces.can_administer(workspace) + assert not can_administer + assert "User cannot access workspace: test" in caplog.messages + + +def test_account_workspaces_can_administer_handles_permission_denied_error_for_current_user(caplog) -> None: + acc = create_autospec(AccountClient) + ws = create_autospec(WorkspaceClient) + acc.get_workspace_client.return_value = ws + ws.current_user.me.side_effect = PermissionDenied( + "This API is disabled for users without the databricks-sql-access or workspace-access entitlements" + ) + account_workspaces = AccountWorkspaces(acc) + workspace = Workspace(deployment_name="test") + + with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.workspaces"): + can_administer = account_workspaces.can_administer(workspace) + assert not can_administer + assert "User cannot access workspace: test" in caplog.messages From 93d496e7751ca5362c0a680a33e72862ba1772d2 Mon Sep 17 00:00:00 2001 From: Amin Movahed <140028681+aminmovahed-db@users.noreply.github.com> Date: Tue, 24 Sep 2024 21:17:55 +1000 Subject: [PATCH 04/21] Adding unskip CLI command to undo a skip on schema or a table (#2727) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🔧 This is still in progress... ## Changes ### Linked issues Resolves #1938 ### Functionality - [ ] added relevant user documentation - [x] added new CLI command --> unskip ### Tests - [ ] manually tested - [ ] added unit tests - [ ] added integration tests --- src/databricks/labs/ucx/cli.py | 13 ++++++++ .../labs/ucx/hive_metastore/mapping.py | 33 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index c6242090e2..ef136d3893 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -99,6 +99,19 @@ def skip(w: WorkspaceClient, schema: str | None = None, table: str | None = None return ctx.table_mapping.skip_schema(schema) +@ucx.command +def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = None): + """Create a unskip comment on a schema or a table""" + logger.info("Running unskip command") + if not schema: + logger.error("--schema is a required parameter.") + return None + ctx = WorkspaceContext(w) + if table: + return ctx.table_mapping.unskip_table_or_view(schema, table, ctx.tables_crawler.load_one) + return ctx.table_mapping.unskip_schema(schema) + + @ucx.command(is_account=True) def sync_workspace_info(a: AccountClient): """upload workspace config to all workspaces in the account where ucx is installed""" diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 655172e388..94f33a97fe 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -135,6 +135,25 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): + # Removes skip mark from the table property + try: + table = load_table(schema_name, table_name) + if table is None: + raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") + self._sql_backend.execute( + f"ALTER {table.kind} 
{escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}' );" + ) + except NotFound as err: + if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): + logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}. Table not found.") + else: + logger.error( + f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True + ) + except BadRequest as err: + logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property try: @@ -149,6 +168,20 @@ def skip_schema(self, schema: str): except BadRequest as err: logger.error(err) + def unskip_schema(self, schema: str): + # Removes skip mark from the schema property + try: + self._sql_backend.execute( + f"ALTER SCHEMA {escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" + ) + except NotFound as err: + if "[SCHEMA_NOT_FOUND]" in str(err): + logger.error(f"Failed to remove skip marker for Schema {schema}. Schema not found.") + else: + logger.error(err) + except BadRequest as err: + logger.error(err) + def get_tables_to_migrate(self, tables_crawler: TablesCrawler) -> Collection[TableToMigrate]: rules = self.load() # Getting all the source tables from the rules From ee201129228c0d8cfeb7dc59636a3d6cfdabc068 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 17:00:16 +0200 Subject: [PATCH 05/21] Fix failing integration tests that perform a real assessment (#2736) ## Changes Ensure 'assessment' workflow only runs minimal assessment in integration tests ### Linked issues None ### Functionality None ### Tests - [x] changed integration tests Co-authored-by: Eric Vergnaud --- tests/integration/assessment/test_ext_hms.py | 14 +++------- .../integration/assessment/test_workflows.py | 14 +++++----- tests/integration/conftest.py | 27 +++++++++++++++++++ 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py index 805ab13c06..44b8e45fd4 100644 --- a/tests/integration/assessment/test_ext_hms.py +++ b/tests/integration/assessment/test_ext_hms.py @@ -1,6 +1,5 @@ import dataclasses import datetime as dt -import io from databricks.labs.lsql.backends import CommandExecutionBackend from databricks.sdk.service.iam import PermissionLevel @@ -9,12 +8,11 @@ def test_running_real_assessment_job_ext_hms( ws, installation_ctx, + product_info, env_or_skip, make_cluster_policy, make_cluster_policy_permissions, - make_notebook, - make_job, - make_dashboard, + populate_for_linting, ): cluster_id = env_or_skip('TEST_EXT_HMS_CLUSTER_ID') ext_hms_ctx = installation_ctx.replace( @@ -41,14 +39,10 @@ def test_running_real_assessment_job_ext_hms( ext_hms_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] ext_hms_ctx.workspace_installation.run() + populate_for_linting(installation_ctx.installation) + # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available # via the integration pool). Allow some margin to reduce spurious failures. 
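# ---------------------------------------------------------------------------
# Editorial sketch, not part of any patch in this series: a minimal illustration
# of the statements behind patch 04's `unskip` helpers shown above. The real
# implementation (TableMapping in databricks.labs.ucx.hive_metastore.mapping)
# also handles views and identifier escaping; the property name below is an
# assumed placeholder for TableMapping.UCX_SKIP_PROPERTY.
# ---------------------------------------------------------------------------
UCX_SKIP_PROPERTY = "databricks.labs.ucx.skip"  # assumed value, for illustration only


def unskip_statement(schema: str, table: str | None = None) -> str:
    """Build the ALTER statement that removes the UCX skip marker from a table or schema."""
    if table:
        return f"ALTER TABLE `{schema}`.`{table}` UNSET TBLPROPERTIES IF EXISTS ('{UCX_SKIP_PROPERTY}')"
    return f"ALTER SCHEMA `{schema}` UNSET DBPROPERTIES IF EXISTS ('{UCX_SKIP_PROPERTY}')"


assert "UNSET TBLPROPERTIES" in unskip_statement("sales", "orders")
assert "UNSET DBPROPERTIES" in unskip_statement("sales")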
- notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) - job = make_job(notebook_path=notebook_path) - installation_ctx.config.include_job_ids = [job.job_id] - - dashboard = make_dashboard() - installation_ctx.config.include_dashboard_ids = [dashboard.id] ext_hms_ctx.deployed_workflows.run_workflow("assessment", max_wait=dt.timedelta(minutes=25)) # assert the workflow is successful. the tasks on sql warehouse will fail so skip checking them diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index bb1e1576a7..7955e93c69 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -1,4 +1,3 @@ -import io from datetime import timedelta from databricks.sdk.errors import NotFound, InvalidParameterValue @@ -8,7 +7,11 @@ @retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) def test_running_real_assessment_job( - ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions, make_job, make_notebook, make_dashboard + ws, + installation_ctx, + make_cluster_policy, + make_cluster_policy_permissions, + populate_for_linting, ): ws_group, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() @@ -20,12 +23,7 @@ def test_running_real_assessment_job( installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] installation_ctx.workspace_installation.run() - notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) - job = make_job(notebook_path=notebook_path) - installation_ctx.config.include_job_ids = [job.job_id] - - dashboard = make_dashboard() - installation_ctx.config.include_dashboard_ids = [dashboard.id] + populate_for_linting(installation_ctx.installation) installation_ctx.deployed_workflows.run_workflow("assessment") assert installation_ctx.deployed_workflows.validate_step("assessment") diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a3c5bfe912..90204c9d26 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,3 +1,4 @@ +import io import json from collections.abc import Callable, Generator import functools @@ -9,11 +10,15 @@ from functools import cached_property import shutil import subprocess +from pathlib import Path + import pytest # pylint: disable=wrong-import-order +import yaml from databricks.labs.blueprint.commands import CommandExecutor from databricks.labs.blueprint.entrypoint import is_in_debug from databricks.labs.blueprint.installation import Installation, MockInstallation from databricks.labs.blueprint.parallel import Threads +from databricks.labs.blueprint.paths import WorkspacePath from databricks.labs.blueprint.tui import MockPrompts from databricks.labs.blueprint.wheels import ProductInfo from databricks.labs.lsql.backends import SqlBackend @@ -1175,3 +1180,25 @@ def _run(command: str) -> str: except ValueError as err: logger.debug(f"pytest_ignore_collect: error: {err}") return False + + +@pytest.fixture +def populate_for_linting(ws, make_random, make_job, make_notebook, make_query, make_dashboard, watchdog_purge_suffix): + def populate_workspace(installation): + # keep linting scope to minimum to avoid test timeouts + path = Path(installation.install_folder()) / f"dummy-{make_random(4)}-{watchdog_purge_suffix}" + notebook_path = make_notebook(path=path, content=io.BytesIO(b"spark.read.parquet('dbfs://mnt/foo/bar')")) + job = make_job(notebook_path=notebook_path) + query = 
make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo/bar`') + dashboard = make_dashboard(query=query) + # can't use installation.load(WorkspaceConfig)/installation.save() because they populate empty credentials + config_path = WorkspacePath(ws, installation.install_folder()) / "config.yml" + text = config_path.read_text() + config = yaml.safe_load(text) + config["include_job_ids"] = [job.job_id] + config["include_dashboard_ids"] = [dashboard.id] + text = yaml.dump(config) + config_path.unlink() + config_path.write_text(text) + + return populate_workspace From 2f62a0f2d4890690a3e1987d6b9e942e4217df42 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan <105197202+HariGS-DB@users.noreply.github.com> Date: Tue, 24 Sep 2024 18:09:15 +0100 Subject: [PATCH 06/21] Update documentation to explain the usage of collections and eligible commands (#2738) ## Changes Updated the doc to explain the concept of collection and how to use the join-collection cmd and the related collection-eligible commands ### Functionality - [ ] added relevant user documentation --- README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/README.md b/README.md index 60b842a9c4..2893f4f894 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,8 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`revert-cluster-remap` command](#revert-cluster-remap-command) * [`upload` command](#upload-command) * [`download` command](#download-command) + * [`join-collection` command](#join-collection command) + * [collection eligible command](#collection-eligible-command) * [Common Challenges and the Solutions](#common-challenges-and-the-solutions) * [Network Connectivity Issues](#network-connectivity-issues) * [Insufficient Privileges](#insufficient-privileges) @@ -1807,6 +1809,40 @@ $ databricks labs ucx download --file --run-as-collection True Download a csv file from a single workspace (`--run-as-collection False`) or a collection of workspaces (`--run-as-collection True`). This command is especially useful when downloading the same file from multiple workspaces. +## `join-collection` command + +```text +$ databricks labs ucx join-collection --workspace-ids --profile +``` + +`join-collection` command joins 2 or more workspaces into a collection. This helps in running supported cli commands as a collection +`join-collection` command updates config.yml file on each workspace ucx installation with installed_workspace_ids attribute. +In order to run `join-collectioon` command a user should: + - be an Account admin on the Databricks account + - be a Workspace admin on all the workspaces to be joined as a collection) or a collection of workspaces + - have installed UCX on the workspace +The `join-collection` command will fail and throw an error msg if the above conditions are not met. + +## collection eligible command + +Once `join-collection` command is run, it allows user to run multiple cli commands as a collection. The following cli commands +are eligible to be run as a collection. 
User can run the below commands as collection by passing an additional flag `--run-as-collection=True` +- `ensure-assessment-run` +- `create-table-mapping` +- `principal-prefix-access` +- `migrate-credentials` +- `create-uber-principal` +- `create-missing-principals` +- `validate-external-location` +- `migrate-locations` +- `create-catalog-schemas` +- `migrate-tables` +- `migrate-acls` +- `migrate-dbsql-dashboards` +- `validate-group-membership` +Ex: `databricks labs ucx ensure-assessment-run --run-as-collection=True` + + # Common Challenges and the Solutions Users might encounter some challenges while installing and executing UCX. Please find the listing of some common challenges and the solutions below. From 8fee9d853f1c3604b9e7f9d4f4dbdf3efc9cb495 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan <105197202+HariGS-DB@users.noreply.github.com> Date: Tue, 24 Sep 2024 19:35:15 +0100 Subject: [PATCH 07/21] Enables cli cmd `databricks labs ucx create-catalog-schemas` to apply catalog/schema acl from legacy hive_metastore (#2676) ## Changes ### Linked issues Resolves #2514 ### Functionality - [ ] modified existing command: `databricks labs ucx create-catalog-schemas` ### Tests - [ ] added unit tests - [ ] added integration tests --- .../labs/ucx/contexts/application.py | 8 +++- .../labs/ucx/hive_metastore/catalog_schema.py | 40 +++++++++++++++++-- .../hive_metastore/test_catalog_schema.py | 29 +++++++++++++- 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 1bb030c1fa..59517fdf12 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -353,7 +353,13 @@ def table_mapping(self): @cached_property def catalog_schema(self): - return CatalogSchema(self.workspace_client, self.table_mapping, self.principal_acl, self.sql_backend) + return CatalogSchema( + self.workspace_client, + self.table_mapping, + self.principal_acl, + self.sql_backend, + self.grants_crawler, + ) @cached_property def verify_timeout(self): diff --git a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py index 9da944ba2f..6802ad5dc5 100644 --- a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py +++ b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py @@ -6,7 +6,7 @@ from databricks.labs.blueprint.tui import Prompts from databricks.labs.lsql.backends import SqlBackend -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant, GrantsCrawler from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service.catalog import SchemaInfo @@ -19,13 +19,19 @@ class CatalogSchema: def __init__( - self, ws: WorkspaceClient, table_mapping: TableMapping, principal_grants: PrincipalACL, sql_backend: SqlBackend + self, + ws: WorkspaceClient, + table_mapping: TableMapping, + principal_grants: PrincipalACL, + sql_backend: SqlBackend, + grants_crawler: GrantsCrawler, ): self._ws = ws self._table_mapping = table_mapping self._external_locations = self._ws.external_locations.list() self._principal_grants = principal_grants self._backend = sql_backend + self._hive_grants_crawler = grants_crawler def create_all_catalogs_schemas(self, prompts: Prompts) -> None: candidate_catalogs, candidate_schemas = self._get_missing_catalogs_schemas() @@ -46,10 +52,22 @@ def 
create_all_catalogs_schemas(self, prompts: Prompts) -> None: f"Schema {candidate_schema} in catalog {candidate_catalog} " f"already exists. Skipping." ) continue + self._apply_from_legacy_table_acls() self._update_principal_acl() + def _apply_from_legacy_table_acls(self): + grants = self._get_catalog_schema_hive_grants() + for grant in grants: + acl_migrate_sql = grant.uc_grant_sql() + if acl_migrate_sql is None: + logger.warning(f"Cannot identify UC grant for {grant.this_type_and_key()}. Skipping.") + continue + logger.debug(f"Migrating acls on {grant.this_type_and_key()} using SQL query: {acl_migrate_sql}") + self._backend.execute(acl_migrate_sql) + def _update_principal_acl(self): - grants = self._get_catalog_schema_grants() + + grants = self._get_catalog_schema_principal_acl_grants() for grant in grants: acl_migrate_sql = grant.uc_grant_sql() if acl_migrate_sql is None: @@ -58,7 +76,21 @@ def _update_principal_acl(self): logger.debug(f"Migrating acls on {grant.this_type_and_key()} using SQL query: {acl_migrate_sql}") self._backend.execute(acl_migrate_sql) - def _get_catalog_schema_grants(self) -> list[Grant]: + def _get_catalog_schema_hive_grants(self) -> list[Grant]: + src_dst_schema_mapping = self._get_database_source_target_mapping() + hive_grants = self._hive_grants_crawler.snapshot() + new_grants: list[Grant] = [] + for grant in hive_grants: + if grant.this_type_and_key()[0] == "DATABASE" and grant.database: + for schema in src_dst_schema_mapping[grant.database]: + new_grants.append(replace(grant, catalog=schema.catalog_name, database=schema.name)) + catalog_grants: set[Grant] = set() + for grant in new_grants: + catalog_grants.add(replace(grant, database=None)) + new_grants.extend(catalog_grants) + return new_grants + + def _get_catalog_schema_principal_acl_grants(self) -> list[Grant]: src_trg_schema_mapping = self._get_database_source_target_mapping() grants = self._principal_grants.get_interactive_cluster_grants() # filter on grants to only get database level grants diff --git a/tests/unit/hive_metastore/test_catalog_schema.py b/tests/unit/hive_metastore/test_catalog_schema.py index d632875cb9..66e0784293 100644 --- a/tests/unit/hive_metastore/test_catalog_schema.py +++ b/tests/unit/hive_metastore/test_catalog_schema.py @@ -9,7 +9,7 @@ from databricks.sdk.service.catalog import CatalogInfo, ExternalLocationInfo, SchemaInfo from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant, GrantsCrawler from databricks.labs.ucx.hive_metastore.mapping import TableMapping @@ -83,6 +83,7 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: ) table_mapping = TableMapping(installation, ws, backend) principal_acl = create_autospec(PrincipalACL) + hive_acl = create_autospec(GrantsCrawler) grants = [ Grant('user1', 'SELECT', 'catalog1', 'schema3', 'table'), Grant('user1', 'MODIFY', 'catalog2', 'schema2', 'table'), @@ -90,8 +91,27 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: Grant('user1', 'USAGE', 'hive_metastore', 'schema3'), Grant('user1', 'USAGE', 'hive_metastore', 'schema2'), ] + hive_grants = [ + Grant(principal="princ1", catalog="hive_metastore", action_type="USE"), + Grant(principal="princ2", catalog="hive_metastore", database="schema3", action_type="USAGE"), + Grant( + principal="princ33", + catalog="hive_metastore", + database="database_one", + view="table_one", + 
action_type="SELECT", + ), + Grant( + principal="princ5", + catalog="hive_metastore", + database="schema2", + action_type="USAGE", + ), + ] principal_acl.get_interactive_cluster_grants.return_value = grants - return CatalogSchema(ws, table_mapping, principal_acl, backend) + hive_acl.snapshot.return_value = hive_grants + + return CatalogSchema(ws, table_mapping, principal_acl, backend, hive_acl) @pytest.mark.parametrize("location", ["s3://foo/bar", "s3://foo/bar/test", "s3://foo/bar/test/baz"]) @@ -171,6 +191,11 @@ def test_catalog_schema_acl(): 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema3` TO `user1`', 'GRANT USE CATALOG ON CATALOG `catalog1` TO `user1`', 'GRANT USE CATALOG ON CATALOG `catalog2` TO `user1`', + 'GRANT USE CATALOG ON CATALOG `catalog1` TO `princ2`', + 'GRANT USE SCHEMA ON DATABASE `catalog1`.`schema3` TO `princ2`', + 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema2` TO `princ5`', + 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema3` TO `princ5`', + 'GRANT USE CATALOG ON CATALOG `catalog2` TO `princ5`', ] assert len(backend.queries) == len(queries) for query in queries: From 21da7cc3038aef19bde99afbb164bb6ebe48ce7d Mon Sep 17 00:00:00 2001 From: Cor Date: Wed, 25 Sep 2024 11:50:04 +0200 Subject: [PATCH 08/21] Add `create-ucx-catalog` cli command (#2694) ## Changes Add `create-ucx-catalog` cli command to create the catalog (going to be) used for migration tracking (possibly) accross multiple workspaces. ### Linked issues Resolves #2571 ### Functionality - [x] added relevant user documentation - [x] added new CLI command: `create-ucx-catalog` ### Tests - [x] manually tested - [x] added unit tests - [x] added integration tests --------- Co-authored-by: Andrew Snare --- Makefile | 2 +- README.md | 21 +- labs.yml | 7 +- src/databricks/labs/ucx/account/metastores.py | 21 +- src/databricks/labs/ucx/cli.py | 20 +- src/databricks/labs/ucx/config.py | 3 +- .../labs/ucx/contexts/account_cli.py | 1 - .../labs/ucx/contexts/application.py | 5 +- .../labs/ucx/hive_metastore/catalog_schema.py | 46 +++- src/databricks/labs/ucx/install.py | 2 + .../labs/ucx/source_code/queries.py | 6 +- tests/integration/conftest.py | 79 +++++-- .../hive_metastore/test_catalog_schema.py | 19 +- tests/unit/account/test_metastores.py | 21 +- tests/unit/conftest.py | 2 +- .../hive_metastore/test_catalog_schema.py | 51 ++++- tests/unit/install/test_install.py | 214 +++++------------- tests/unit/source_code/test_queries.py | 12 + tests/unit/test_cli.py | 26 ++- tests/unit/test_config.py | 17 ++ 20 files changed, 338 insertions(+), 237 deletions(-) diff --git a/Makefile b/Makefile index 8f6ceb1db1..4b62910e46 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ integration: hatch run integration coverage: - hatch run coverage && open htmlcov/index.html + hatch run coverage; status=$$?; [ -e "htmlcov/index.html" ] && open htmlcov/index.html; exit $$status known: hatch run python src/databricks/labs/ucx/source_code/known.py diff --git a/README.md b/README.md index 2893f4f894..cc4e05706a 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. 
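An editorial aside on patch 07 above (not part of any patch in this series): the catalog/schema ACL migration re-targets each legacy Hive metastore grant at the Unity Catalog schema it maps to, then renders it as a `GRANT` statement. The sketch below reproduces the idea with a simplified stand-in `Grant` dataclass; the real implementation uses `databricks.labs.ucx.hive_metastore.grants.Grant.uc_grant_sql()` together with `dataclasses.replace()`, as the patch shows.

```python
from dataclasses import dataclass, replace


@dataclass(frozen=True)
class Grant:
    """Simplified stand-in for databricks.labs.ucx.hive_metastore.grants.Grant."""

    principal: str
    action_type: str
    catalog: str
    database: str | None = None


def uc_grant_sql(grant: Grant) -> str:
    """Render a database-level grant as a UC GRANT statement (legacy USAGE becomes USE SCHEMA)."""
    action = "USE SCHEMA" if grant.action_type == "USAGE" else grant.action_type
    return f"GRANT {action} ON DATABASE `{grant.catalog}`.`{grant.database}` TO `{grant.principal}`"


legacy = Grant(principal="princ5", action_type="USAGE", catalog="hive_metastore", database="schema2")
migrated = replace(legacy, catalog="catalog2")  # hive_metastore.schema2 maps to catalog2.schema2
print(uc_grant_sql(migrated))  # GRANT USE SCHEMA ON DATABASE `catalog2`.`schema2` TO `princ5`
```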
* [Metastore related commands](#metastore-related-commands) * [`show-all-metastores` command](#show-all-metastores-command) * [`assign-metastore` command](#assign-metastore-command) + * [`create-ucx-catalog` command](#create-ucx-catalog-command) * [Table migration commands](#table-migration-commands) * [`principal-prefix-access` command](#principal-prefix-access-command) * [Access for AWS S3 Buckets](#access-for-aws-s3-buckets) @@ -1189,9 +1190,23 @@ a region, and you want to see which ones are available for assignment. databricks labs ucx assign-metastore --workspace-id [--metastore-id ] ``` -This command assigns a metastore to a workspace with `workspace-id`. If there is only a single metastore in the workspace -region, it will be automatically assigned to the workspace. If there are multiple metastores available, you need to specify -the metastore id of the metastore you want to assign to the workspace. +This command assigns a metastore to a workspace with `--workspace-id`. If there is only a single metastore in the +workspace region, the command automatically assigns that metastore to the workspace. If there are multiple metastores +available, the command prompts for specification of the metastore (id) you want to assign to the workspace. + +[[back to top](#databricks-labs-ucx)] + +## `create-ucx-catalog` command + +```commandline +databricks labs ucx create-ucx-catalog +16:12:59 INFO [d.l.u.hive_metastore.catalog_schema] Validating UC catalog: ucx +Please provide storage location url for catalog: ucx (default: metastore): ... +16:13:01 INFO [d.l.u.hive_metastore.catalog_schema] Creating UC catalog: ucx +``` + +Create and setup UCX artifact catalog. Amongst other things, the artifacts are used for tracking the migration progress +across workspaces. # Table migration commands diff --git a/labs.yml b/labs.yml index 6a1adf5878..9e25ef0494 100644 --- a/labs.yml +++ b/labs.yml @@ -261,15 +261,18 @@ commands: - name: assign-metastore is_account_level: true - description: Enable Unity Catalog features on a workspace by assign a metastore to it + description: Enable Unity Catalog features on a workspace by assigning a metastore to it. flags: - name: workspace-id - description: (Optional) Workspace ID to assign a metastore to + description: Workspace ID to assign a metastore to - name: metastore-id description: (Optional) If there are multiple metastores in the region, specify the metastore ID to assign - name: default-catalog description: (Optional) Default catalog to assign to the workspace. 
If not provided, it will be hive_metastore + - name: create-ucx-catalog + description: Create UCX artifact catalog + - name: migrate-tables description: | Trigger the `migrate-tables` workflow and, optionally, `migrate-external-hiveserde-tables-in-place-experimental` diff --git a/src/databricks/labs/ucx/account/metastores.py b/src/databricks/labs/ucx/account/metastores.py index 7089dd20c6..94c5b9e2fe 100644 --- a/src/databricks/labs/ucx/account/metastores.py +++ b/src/databricks/labs/ucx/account/metastores.py @@ -27,20 +27,16 @@ def show_all_metastores(self, workspace_id: str | None = None): def assign_metastore( self, prompts: Prompts, - str_workspace_id: str | None = None, + workspace_id: int, + *, metastore_id: str | None = None, default_catalog: str | None = None, ): - if not str_workspace_id: - workspace_choices = self._get_all_workspaces() - workspace_id = prompts.choice_from_dict("Please select a workspace:", workspace_choices) - else: - workspace_id = int(str_workspace_id) - if not metastore_id: + if metastore_id is None: # search for all matching metastores metastore_choices = self._get_all_metastores(self._get_region(workspace_id)) if len(metastore_choices) == 0: - raise ValueError(f"No matching metastore found for workspace {workspace_id}") + raise ValueError(f"No matching metastore found for workspace: {workspace_id}") # if there are multiple matches, prompt users to select one if len(metastore_choices) > 1: metastore_id = prompts.choice_from_dict( @@ -48,17 +44,16 @@ def assign_metastore( ) else: metastore_id = list(metastore_choices.values())[0] - if metastore_id is not None: - self._ac.metastore_assignments.create(workspace_id, metastore_id) + self._ac.metastore_assignments.create(workspace_id, metastore_id) # set the default catalog using the default_namespace setting API if default_catalog is not None: self._set_default_catalog(workspace_id, default_catalog) - def _get_region(self, workspace_id: int) -> str: + def _get_region(self, workspace_id: int) -> str | None: workspace = self._ac.workspaces.get(workspace_id) if self._ac.config.is_aws: - return str(workspace.aws_region) - return str(workspace.location) + return workspace.aws_region + return workspace.location def _get_all_workspaces(self) -> dict[str, int]: output = dict[str, int]() diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index ef136d3893..e3f5d76756 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -591,11 +591,27 @@ def assign_metastore( workspace_id: str | None = None, metastore_id: str | None = None, default_catalog: str | None = None, + ctx: AccountContext | None = None, ): """Assign metastore to a workspace""" logger.info(f"Account ID: {a.config.account_id}") - ctx = AccountContext(a) - ctx.account_metastores.assign_metastore(ctx.prompts, workspace_id, metastore_id, default_catalog) + ctx = ctx or AccountContext(a) + ctx.account_metastores.assign_metastore( + ctx.prompts, + workspace_id, + metastore_id=metastore_id, + default_catalog=default_catalog, + ) + + +@ucx.command +def create_ucx_catalog(w: WorkspaceClient, prompts: Prompts, ctx: WorkspaceContext | None = None) -> None: + """Create and setup UCX artifact catalog + + Amongst other things, the artifacts are used for tracking the migration progress across workspaces. 
+ """ + workspace_context = ctx or WorkspaceContext(w) + workspace_context.catalog_schema.create_ucx_catalog(prompts) @ucx.command diff --git a/src/databricks/labs/ucx/config.py b/src/databricks/labs/ucx/config.py index fb5e1790bf..17e6f10ead 100644 --- a/src/databricks/labs/ucx/config.py +++ b/src/databricks/labs/ucx/config.py @@ -11,6 +11,7 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes __version__ = 2 inventory_database: str + ucx_catalog: str = "ucx" # Catalog to store UCX artifact tables (shared across workspaces) # Group name conversion parameters. workspace_group_regex: str | None = None workspace_group_replace: str | None = None @@ -25,7 +26,7 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes connect: Config | None = None num_threads: int | None = 10 database_to_catalog_mapping: dict[str, str] | None = None - default_catalog: str | None = "ucx_default" + default_catalog: str | None = "ucx_default" # DEPRECATED: Keeping to avoid errors when loading old configurations log_level: str | None = "INFO" # Starting path for notebooks and directories crawler diff --git a/src/databricks/labs/ucx/contexts/account_cli.py b/src/databricks/labs/ucx/contexts/account_cli.py index 20617edab3..9f88049f65 100644 --- a/src/databricks/labs/ucx/contexts/account_cli.py +++ b/src/databricks/labs/ucx/contexts/account_cli.py @@ -3,7 +3,6 @@ from databricks.sdk import AccountClient - from databricks.labs.ucx.account.aggregate import AccountAggregate from databricks.labs.ucx.account.metastores import AccountMetastores from databricks.labs.ucx.account.workspaces import AccountWorkspaces diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 59517fdf12..d5f506ac1e 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -18,7 +18,7 @@ from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core -from databricks.sdk.errors import ResourceDoesNotExist +from databricks.sdk.errors import NotFound from databricks.sdk.service import sql from databricks.labs.ucx.account.workspaces import WorkspaceInfo @@ -319,7 +319,7 @@ def principal_locations(self): eligible_locations = self.aws_acl.get_eligible_locations_principals() if self.is_gcp: raise NotImplementedError("Not implemented for GCP.") - except ResourceDoesNotExist: + except NotFound: pass return eligible_locations @@ -359,6 +359,7 @@ def catalog_schema(self): self.principal_acl, self.sql_backend, self.grants_crawler, + self.config.ucx_catalog, ) @cached_property diff --git a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py index 6802ad5dc5..280a345460 100644 --- a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py +++ b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py @@ -18,6 +18,7 @@ class CatalogSchema: + def __init__( self, ws: WorkspaceClient, @@ -25,6 +26,7 @@ def __init__( principal_grants: PrincipalACL, sql_backend: SqlBackend, grants_crawler: GrantsCrawler, + ucx_catalog: str, ): self._ws = ws self._table_mapping = table_mapping @@ -32,15 +34,33 @@ def __init__( self._principal_grants = principal_grants self._backend = sql_backend self._hive_grants_crawler = grants_crawler + self._ucx_catalog = ucx_catalog + + def create_ucx_catalog(self, 
prompts: Prompts, *, properties: dict[str, str] | None = None) -> None: + """Create the UCX catalog. + + Args: + prompts : Prompts + The prompts object to use for interactive input. + properties : (dict[str, str] | None), default None + The properties to pass to the catalog. If None, no properties are passed. + """ + try: + self._create_catalog_validate(self._ucx_catalog, prompts, properties=properties) + except BadRequest as e: + if "already exists" in str(e): + logger.warning(f"Catalog '{self._ucx_catalog}' already exists. Skipping.") + return + raise def create_all_catalogs_schemas(self, prompts: Prompts) -> None: candidate_catalogs, candidate_schemas = self._get_missing_catalogs_schemas() for candidate_catalog in candidate_catalogs: try: - self._create_catalog_validate(candidate_catalog, prompts) + self._create_catalog_validate(candidate_catalog, prompts, properties=None) except BadRequest as e: if "already exists" in str(e): - logger.warning(f"Catalog {candidate_catalog} already exists. Skipping.") + logger.warning(f"Catalog '{candidate_catalog}' already exists. Skipping.") continue for candidate_catalog, schemas in candidate_schemas.items(): for candidate_schema in schemas: @@ -49,7 +69,7 @@ def create_all_catalogs_schemas(self, prompts: Prompts) -> None: except BadRequest as e: if "already exists" in str(e): logger.warning( - f"Schema {candidate_schema} in catalog {candidate_catalog} " f"already exists. Skipping." + f"Schema '{candidate_schema}' in catalog '{candidate_catalog}' already exists. Skipping." ) continue self._apply_from_legacy_table_acls() @@ -121,20 +141,19 @@ def _get_database_source_target_mapping(self) -> dict[str, list[SchemaInfo]]: src_trg_schema_mapping[table_mapping.src_schema].append(schema) return src_trg_schema_mapping - def _create_catalog_validate(self, catalog, prompts: Prompts): - logger.info(f"Creating UC catalog: {catalog}") - # create catalogs + def _create_catalog_validate(self, catalog: str, prompts: Prompts, *, properties: dict[str, str] | None) -> None: + logger.info(f"Validating UC catalog: {catalog}") attempts = 3 while True: catalog_storage = prompts.question( - f"Please provide storage location url for catalog:{catalog}.", default="metastore" + f"Please provide storage location url for catalog: {catalog}", default="metastore" ) if self._validate_location(catalog_storage): break attempts -= 1 if attempts == 0: raise NotFound(f"Failed to validate location for {catalog} catalog") - self._create_catalog(catalog, catalog_storage) + self._create_catalog(catalog, catalog_storage, properties=properties) def _list_existing(self) -> tuple[set[str], dict[str, set[str]]]: """generate a list of existing UC catalogs and schema.""" @@ -199,12 +218,17 @@ def _validate_location(self, location: str): return True return False - def _create_catalog(self, catalog, catalog_storage): + def _create_catalog(self, catalog: str, catalog_storage: str, *, properties: dict[str, str] | None) -> None: logger.info(f"Creating UC catalog: {catalog}") if catalog_storage == "metastore": - self._ws.catalogs.create(catalog, comment="Created by UCX") + self._ws.catalogs.create(catalog, comment="Created by UCX", properties=properties) else: - self._ws.catalogs.create(catalog, storage_root=catalog_storage, comment="Created by UCX") + self._ws.catalogs.create( + catalog, + storage_root=catalog_storage, + comment="Created by UCX", + properties=properties, + ) def _create_schema(self, catalog, schema): logger.info(f"Creating UC schema: {schema} in catalog: {catalog}") diff --git 
a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index c837d93ced..49c6020449 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -234,6 +234,7 @@ def _prompt_for_new_installation(self) -> WorkspaceConfig: inventory_database = self.prompts.question( "Inventory Database stored in hive_metastore", default=default_database, valid_regex=r"^\w+$" ) + ucx_catalog = self.prompts.question("Catalog to store UCX artifacts in", default="ucx", valid_regex=r"^\w+$") log_level = self.prompts.question("Log level", default="INFO").upper() num_threads = int(self.prompts.question("Number of threads", default="8", valid_number=True)) configure_groups = ConfigureGroups(self.prompts) @@ -248,6 +249,7 @@ def _prompt_for_new_installation(self) -> WorkspaceConfig: ) return WorkspaceConfig( inventory_database=inventory_database, + ucx_catalog=ucx_catalog, workspace_group_regex=configure_groups.workspace_group_regex, workspace_group_replace=configure_groups.workspace_group_replace, account_group_regex=configure_groups.account_group_regex, diff --git a/src/databricks/labs/ucx/source_code/queries.py b/src/databricks/labs/ucx/source_code/queries.py index 79b40caa0a..5bd11b1bc7 100644 --- a/src/databricks/labs/ucx/source_code/queries.py +++ b/src/databricks/labs/ucx/source_code/queries.py @@ -49,13 +49,13 @@ def __init__( def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): assessment_start = datetime.now(timezone.utc) + dashboard_ids = self._dashboard_ids_in_scope() + logger.info(f"Running {len(dashboard_ids)} linting tasks...") linted_queries: set[str] = set() - all_dashboards = list(self._ws.dashboards.list()) - logger.info(f"Running {len(all_dashboards)} linting tasks...") all_problems: list[QueryProblem] = [] all_dfsas: list[DirectFsAccess] = [] # first lint and collect queries from dashboards - for dashboard_id in self._dashboard_ids_in_scope(): + for dashboard_id in dashboard_ids: dashboard = self._ws.dashboards.get(dashboard_id=dashboard_id) problems, dfsas = self._lint_and_collect_from_dashboard(dashboard, linted_queries) all_problems.extend(problems) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 90204c9d26..d2f48cdfa7 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -374,10 +374,19 @@ def snapshot(self, *, force_refresh: bool = False) -> list[Mount]: class CommonUtils: - def __init__(self, make_schema_fixture, env_or_skip_fixture, ws_fixture): + def __init__( + self, + make_catalog_fixture, + make_schema_fixture, + env_or_skip_fixture, + ws_fixture, + make_random_fixture, + ): + self._make_catalog = make_catalog_fixture self._make_schema = make_schema_fixture self._env_or_skip = env_or_skip_fixture self._ws = ws_fixture + self._make_random = make_random_fixture def with_dummy_resource_permission(self): # TODO: in most cases (except prepared_principal_acl) it's just a sign of a bad logic, fix it @@ -432,6 +441,10 @@ def installation(self): def inventory_database(self) -> str: return self._make_schema(catalog_name="hive_metastore").name + @cached_property + def ucx_catalog(self) -> str: + return self._make_catalog(name=f"ucx-{self._make_random()}").name + @cached_property def workspace_client(self) -> WorkspaceClient: return self._ws @@ -440,14 +453,22 @@ def workspace_client(self) -> WorkspaceClient: class MockRuntimeContext(CommonUtils, RuntimeContext): def __init__( self, - make_table_fixture, + make_catalog_fixture, make_schema_fixture, + 
make_table_fixture, make_udf_fixture, make_group_fixture, env_or_skip_fixture, ws_fixture, + make_random_fixture, ) -> None: - super().__init__(make_schema_fixture, env_or_skip_fixture, ws_fixture) + super().__init__( + make_catalog_fixture, + make_schema_fixture, + env_or_skip_fixture, + ws_fixture, + make_random_fixture, + ) RuntimeContext.__init__(self) self._make_table = make_table_fixture self._make_schema = make_schema_fixture @@ -533,6 +554,7 @@ def config(self) -> WorkspaceConfig: return WorkspaceConfig( warehouse_id=self._env_or_skip("TEST_DEFAULT_WAREHOUSE_ID"), inventory_database=self.inventory_database, + ucx_catalog=self.ucx_catalog, connect=self.workspace_client.config, renamed_group_prefix=f'tmp-{self.inventory_database}-', include_group_names=self.created_groups, @@ -668,19 +690,46 @@ def group_manager(self): @pytest.fixture -def runtime_ctx(ws, sql_backend, make_table, make_schema, make_udf, make_group, env_or_skip) -> MockRuntimeContext: - ctx = MockRuntimeContext(make_table, make_schema, make_udf, make_group, env_or_skip, ws) +def runtime_ctx( + ws, + sql_backend, + make_catalog, + make_schema, + make_table, + make_udf, + make_group, + env_or_skip, + make_random, +) -> MockRuntimeContext: + ctx = MockRuntimeContext( + make_catalog, + make_schema, + make_table, + make_udf, + make_group, + env_or_skip, + ws, + make_random, + ) return ctx.replace(workspace_client=ws, sql_backend=sql_backend) class MockWorkspaceContext(CommonUtils, WorkspaceContext): def __init__( self, + make_catalog_fixture, make_schema_fixture, env_or_skip_fixture, ws_fixture, + make_random_fixture, ): - super().__init__(make_schema_fixture, env_or_skip_fixture, ws_fixture) + super().__init__( + make_catalog_fixture, + make_schema_fixture, + env_or_skip_fixture, + ws_fixture, + make_random_fixture, + ) WorkspaceContext.__init__(self, ws_fixture) @cached_property @@ -720,8 +769,8 @@ def azure_subscription_id(self) -> str: @pytest.fixture -def az_cli_ctx(ws, env_or_skip, make_schema, sql_backend): - ctx = MockLocalAzureCli(make_schema, env_or_skip, ws) +def az_cli_ctx(ws, env_or_skip, make_catalog, make_schema, make_random, sql_backend): + ctx = MockLocalAzureCli(make_catalog, make_schema, env_or_skip, ws, make_random) return ctx.replace(sql_backend=sql_backend) @@ -778,8 +827,9 @@ class MockInstallationContext(MockRuntimeContext): def __init__( # pylint: disable=too-many-arguments self, - make_table_fixture, + make_catalog_fixture, make_schema_fixture, + make_table_fixture, make_udf_fixture, make_group_fixture, env_or_skip_fixture, @@ -790,14 +840,15 @@ def __init__( # pylint: disable=too-many-arguments watchdog_purge_suffix, ): super().__init__( - make_table_fixture, + make_catalog_fixture, make_schema_fixture, + make_table_fixture, make_udf_fixture, make_group_fixture, env_or_skip_fixture, ws_fixture, + make_random_fixture, ) - self._make_random = make_random_fixture self._make_acc_group = make_acc_group_fixture self._make_user = make_user_fixture self._watchdog_purge_suffix = watchdog_purge_suffix @@ -953,8 +1004,9 @@ def prompts(self): def installation_ctx( # pylint: disable=too-many-arguments ws, sql_backend, - make_table, + make_catalog, make_schema, + make_table, make_udf, make_group, env_or_skip, @@ -964,8 +1016,9 @@ def installation_ctx( # pylint: disable=too-many-arguments watchdog_purge_suffix, ) -> Generator[MockInstallationContext, None, None]: ctx = MockInstallationContext( - make_table, + make_catalog, make_schema, + make_table, make_udf, make_group, env_or_skip, diff --git 
a/tests/integration/hive_metastore/test_catalog_schema.py b/tests/integration/hive_metastore/test_catalog_schema.py index 713bfba1f6..3377a53034 100644 --- a/tests/integration/hive_metastore/test_catalog_schema.py +++ b/tests/integration/hive_metastore/test_catalog_schema.py @@ -1,11 +1,11 @@ import logging from datetime import timedelta -import pytest +import pytest from databricks.labs.blueprint.tui import MockPrompts - from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried +from databricks.sdk.service.catalog import CatalogInfo from databricks.sdk.service.compute import DataSecurityMode, AwsAttributes from databricks.sdk.service.catalog import Privilege, SecurableType, PrivilegeAssignment from databricks.sdk.service.iam import PermissionLevel @@ -16,6 +16,21 @@ _SPARK_CONF = get_azure_spark_conf() +@retried(on=[NotFound], timeout=timedelta(minutes=2)) +def test_create_ucx_catalog_creates_catalog(ws, runtime_ctx, watchdog_remove_after) -> None: + # Delete catalog created for testing to test the creation of a new catalog + runtime_ctx.workspace_client.catalogs.delete(runtime_ctx.ucx_catalog, force=True) + prompts = MockPrompts({f"Please provide storage location url for catalog: {runtime_ctx.ucx_catalog}": "metastore"}) + + runtime_ctx.catalog_schema.create_ucx_catalog(prompts, properties={"RemoveAfter": watchdog_remove_after}) + + @retried(on=[NotFound], timeout=timedelta(seconds=20)) + def get_catalog(name: str) -> CatalogInfo: + return ws.catalogs.get(name) + + assert get_catalog(runtime_ctx.ucx_catalog) + + @retried(on=[NotFound], timeout=timedelta(minutes=2)) def test_create_catalog_schema_with_principal_acl_azure( ws, diff --git a/tests/unit/account/test_metastores.py b/tests/unit/account/test_metastores.py index 88fe9226fb..59ba89d36b 100644 --- a/tests/unit/account/test_metastores.py +++ b/tests/unit/account/test_metastores.py @@ -37,7 +37,7 @@ def test_show_all_metastores(acc_client, caplog): assert "metastore_use - 124" in caplog.messages -def test_assign_metastore(acc_client): +def test_assign_metastore(acc_client) -> None: acc_client.metastores.list.return_value = [ MetastoreInfo(name="metastore_usw_1", metastore_id="123", region="us-west-2"), MetastoreInfo(name="metastore_usw_2", metastore_id="124", region="us-west-2"), @@ -55,14 +55,19 @@ def test_assign_metastore(acc_client): etag="123", namespace=StringMessage("hive_metastore") ) account_metastores = AccountMetastores(acc_client) - prompts = MockPrompts({"Multiple metastores found, please select one*": "0", "Please select a workspace:*": "0"}) + prompts = MockPrompts({"Multiple metastores found, please select one*": "0"}) - # need to select a workspace - since it is alphabetically sorted, needs to pick workspace bar - account_metastores.assign_metastore(prompts, "", "", "") + account_metastores.assign_metastore(prompts, 123457) acc_client.metastore_assignments.create.assert_called_with(123457, "123") + ws.settings.default_namespaces.update.assert_not_called() + + # Empty default catalog should not call default name spaces + account_metastores.assign_metastore(prompts, 123457, default_catalog="") + acc_client.metastore_assignments.create.assert_called_with(123457, "123") + ws.settings.default_namespaces.update.assert_not_called() # multiple metastores & default catalog name, need to choose one - account_metastores.assign_metastore(prompts, "123456", "", "main") + account_metastores.assign_metastore(prompts, 123456, default_catalog="main") 
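# ---------------------------------------------------------------------------
# Editorial sketch, not part of any patch in this series: the reworked
# AccountMetastores.assign_metastore() (patch 08 above) now takes the workspace
# id as an int, with metastore_id and default_catalog as keyword-only
# arguments. The AccountClient below is an autospec stand-in so the snippet
# stays self-contained; a real client needs account-level credentials.
# ---------------------------------------------------------------------------
from unittest.mock import create_autospec

from databricks.labs.blueprint.tui import MockPrompts
from databricks.sdk import AccountClient

from databricks.labs.ucx.account.metastores import AccountMetastores

acc = create_autospec(AccountClient)  # stand-in account client
metastores = AccountMetastores(acc)
prompts = MockPrompts({"Multiple metastores found, please select one*": "0"})

# Workspace id is positional; the remaining parameters are keyword-only:
metastores.assign_metastore(prompts, 123456, metastore_id="1234")
# default_catalog is also keyword-only, e.g.:
# metastores.assign_metastore(prompts, 123456, metastore_id="1234", default_catalog="main")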
acc_client.metastore_assignments.create.assert_called_with(123456, "123") ws.settings.default_namespace.update.assert_called_with( allow_missing=True, @@ -72,7 +77,7 @@ def test_assign_metastore(acc_client): # default catalog not found, still get etag ws.settings.default_namespace.get.side_effect = NotFound(details=[{"metadata": {"etag": "not_found"}}]) - account_metastores.assign_metastore(prompts, "123456", "", "main") + account_metastores.assign_metastore(prompts, 123456, default_catalog="main") acc_client.metastore_assignments.create.assert_called_with(123456, "123") ws.settings.default_namespace.update.assert_called_with( allow_missing=True, @@ -82,10 +87,10 @@ def test_assign_metastore(acc_client): # only one metastore, should assign directly acc_client.workspaces.get.return_value = Workspace(workspace_id=123456, aws_region="us-east-2") - account_metastores.assign_metastore(MockPrompts({}), "123456") + account_metastores.assign_metastore(MockPrompts({}), 123456) acc_client.metastore_assignments.create.assert_called_with(123456, "126") # no metastore found, error acc_client.workspaces.get.return_value = Workspace(workspace_id=123456, aws_region="us-central-2") with pytest.raises(ValueError): - account_metastores.assign_metastore(MockPrompts({}), "123456") + account_metastores.assign_metastore(MockPrompts({}), 123456) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index d8cf58f221..7c99db89ed 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -111,5 +111,5 @@ def mock_notebook_resolver(): @pytest.fixture -def mock_backend(): +def mock_backend() -> MockBackend: return MockBackend() diff --git a/tests/unit/hive_metastore/test_catalog_schema.py b/tests/unit/hive_metastore/test_catalog_schema.py index 66e0784293..100c130519 100644 --- a/tests/unit/hive_metastore/test_catalog_schema.py +++ b/tests/unit/hive_metastore/test_catalog_schema.py @@ -1,3 +1,4 @@ +import logging from unittest.mock import call, create_autospec import pytest @@ -5,7 +6,7 @@ from databricks.labs.blueprint.tui import MockPrompts from databricks.labs.lsql.backends import MockBackend from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound +from databricks.sdk.errors import BadRequest, NotFound from databricks.sdk.service.catalog import CatalogInfo, ExternalLocationInfo, SchemaInfo from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema @@ -15,6 +16,12 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: ws.catalogs.list.return_value = [CatalogInfo(name="catalog1")] + + def raise_catalog_exists(catalog: str, *_, **__) -> None: + if catalog == "catalog1": + raise BadRequest("Catalog 'catalog1' already exists") + + ws.catalogs.create.side_effect = raise_catalog_exists ws.schemas.list.return_value = [SchemaInfo(name="schema1")] ws.external_locations.list.return_value = [ExternalLocationInfo(url="s3://foo/bar")] if backend is None: @@ -111,7 +118,33 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: principal_acl.get_interactive_cluster_grants.return_value = grants hive_acl.snapshot.return_value = hive_grants - return CatalogSchema(ws, table_mapping, principal_acl, backend, hive_acl) + return CatalogSchema(ws, table_mapping, principal_acl, backend, hive_acl, "ucx") + + +def test_create_ucx_catalog_creates_ucx_catalog() -> None: + ws = create_autospec(WorkspaceClient) + mock_prompts = MockPrompts({"Please provide storage location url for catalog: ucx": "metastore"}) + + catalog_schema = 
prepare_test(ws) + catalog_schema.create_ucx_catalog(mock_prompts) + + ws.catalogs.create.assert_called_with("ucx", comment="Created by UCX", properties=None) + + +def test_create_ucx_catalog_skips_when_ucx_catalogs_exists(caplog) -> None: + ws = create_autospec(WorkspaceClient) + mock_prompts = MockPrompts({"Please provide storage location url for catalog: ucx": "metastore"}) + catalog_schema = prepare_test(ws) + + def raise_catalog_exists(catalog: str, *_, **__) -> None: + if catalog == "ucx": + raise BadRequest("Catalog 'ucx' already exists") + + ws.catalogs.create.side_effect = raise_catalog_exists + + with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.hive_metastore.catalog_schema"): + catalog_schema.create_ucx_catalog(mock_prompts) + assert "Catalog 'ucx' already exists. Skipping." in caplog.text @pytest.mark.parametrize("location", ["s3://foo/bar", "s3://foo/bar/test", "s3://foo/bar/test/baz"]) @@ -124,9 +157,9 @@ def test_create_all_catalogs_schemas_creates_catalogs(location: str): catalog_schema.create_all_catalogs_schemas(mock_prompts) calls = [ - call("catalog2", storage_root=location, comment="Created by UCX"), - call("catalog3", storage_root=location, comment="Created by UCX"), - call("catalog4", storage_root=location, comment="Created by UCX"), + call("catalog2", storage_root=location, comment="Created by UCX", properties=None), + call("catalog3", storage_root=location, comment="Created by UCX", properties=None), + call("catalog4", storage_root=location, comment="Created by UCX", properties=None), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) @@ -165,8 +198,8 @@ def test_no_catalog_storage(): catalog_schema.create_all_catalogs_schemas(mock_prompts) calls = [ - call("catalog2", comment="Created by UCX"), - call("catalog3", comment="Created by UCX"), + call("catalog2", comment="Created by UCX", properties=None), + call("catalog3", comment="Created by UCX", properties=None), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) @@ -180,8 +213,8 @@ def test_catalog_schema_acl(): catalog_schema.create_all_catalogs_schemas(mock_prompts) calls = [ - call("catalog2", comment="Created by UCX"), - call("catalog3", comment="Created by UCX"), + call("catalog2", comment="Created by UCX", properties=None), + call("catalog3", comment="Created by UCX", properties=None), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) ws.schemas.create.assert_any_call("schema2", "catalog2", comment="Created by UCX") diff --git a/tests/unit/install/test_install.py b/tests/unit/install/test_install.py index d48970beaf..404d085400 100644 --- a/tests/unit/install/test_install.py +++ b/tests/unit/install/test_install.py @@ -26,10 +26,9 @@ ) from databricks.sdk.errors.platform import BadRequest from databricks.sdk.service import iam, jobs, sql -from databricks.sdk.service.compute import Policy, State +from databricks.sdk.service.compute import Policy from databricks.sdk.service.jobs import BaseRun, RunLifeCycleState, RunResultState, RunState from databricks.sdk.service.provisioning import Workspace -from databricks.sdk.service.sql import EndpointInfo, EndpointInfoWarehouseType import databricks.labs.ucx.installer.mixins import databricks.labs.ucx.uninstall # noqa @@ -333,47 +332,64 @@ def result(): ) -def test_save_config(ws, mock_installation): - ws.workspace.get_status = not_found - ws.warehouses.list = lambda **_: [ - EndpointInfo(name="abc", id="abc", warehouse_type=EndpointInfoWarehouseType.PRO, state=State.RUNNING) - ] - ws.workspace.download = not_found - 
+@pytest.mark.parametrize( + "prompt_question,prompt_answer,workspace_config_overwrite", + [ + ("Inventory Database stored in hive_metastore", "non_default", {"inventory_database": "non_default"}), + ("Catalog to store UCX artifacts in", "ucx-test", {"ucx_catalog": "ucx-test"}), + ("Log level", "DEBUG", {"log_level": "DEBUG"}), + (r".*workspace group names.*", "g1, g2, g99", {"include_group_names": ["g1", "g2", "g99"]}), + ("Number of threads", "16", {"num_threads": 16}), + (r"Comma-separated list of databases to migrate.*", "db1,db2", {"include_databases": ["db1", "db2"]}), + (r"Does given workspace .* block Internet access?", "yes", {"upload_dependencies": True}), + ("Do you want to trigger assessment job after installation?", "yes", {"trigger_job": True}), + ("Reconciliation threshold, in percentage", "10", {"recon_tolerance_percent": 10}), + ( + r"Parallelism for migrating.*", + "1000", + {"spark_conf": {"spark.sql.sources.parallelPartitionDiscovery.parallelism": "1000"}}, + ), + (r"Min workers for auto-scale.*", "2", {"min_workers": 2}), + (r"Max workers for auto-scale.*", "20", {"max_workers": 20}), + ], +) +def test_configure_sets_expected_workspace_configuration_values( + ws, + mock_installation, + prompt_question, + prompt_answer, + workspace_config_overwrite, +) -> None: + workspace_config_default = { + "version": 2, + "default_catalog": "ucx_default", + "ucx_catalog": "ucx", + "inventory_database": "ucx", + "log_level": "INFO", + "num_threads": 8, + "min_workers": 1, + "max_workers": 10, + "policy_id": "foo", + "renamed_group_prefix": "db-temp-", + "warehouse_id": "abc", + "workspace_start_path": "/", + "num_days_submit_runs_history": 30, + "recon_tolerance_percent": 5, + } prompts = MockPrompts( { r".*PRO or SERVERLESS SQL warehouse.*": "1", - r"Choose how to map the workspace groups.*": "2", + r"Choose how to map the workspace groups.*": "2", # specify names r".*": "", - r".*days to analyze submitted runs.*": "1", - r"Reconciliation threshold, in percentage.*": "5", + prompt_question: prompt_answer, } ) - install = WorkspaceInstaller(ws).replace( - prompts=prompts, - installation=mock_installation, - product_info=PRODUCT_INFO, - ) + install = WorkspaceInstaller(ws).replace(prompts=prompts, installation=mock_installation, product_info=PRODUCT_INFO) + install.configure() - mock_installation.assert_file_written( - 'config.yml', - { - 'version': 2, - 'default_catalog': 'ucx_default', - 'inventory_database': 'ucx', - 'log_level': 'INFO', - 'num_days_submit_runs_history': 30, - 'num_threads': 8, - 'min_workers': 1, - 'max_workers': 10, - 'policy_id': 'foo', - 'renamed_group_prefix': 'db-temp-', - 'warehouse_id': 'abc', - 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, - }, - ) + workspace_config_expected = {**workspace_config_default, **workspace_config_overwrite} + mock_installation.assert_file_written("config.yml", workspace_config_expected) def test_corrupted_config(ws, mock_installation, caplog): @@ -398,47 +414,7 @@ def test_corrupted_config(ws, mock_installation, caplog): assert 'Existing installation at ~/mock is corrupted' in caplog.text -def test_save_config_strip_group_names(ws, mock_installation): - prompts = MockPrompts( - { - r".*PRO or SERVERLESS SQL warehouse.*": "1", - r"Choose how to map the workspace groups.*": "2", # specify names - r".*workspace group names.*": "g1, g2, g99", - r".*": "", - r"Reconciliation threshold, in percentage.*": "5", - } - ) - ws.workspace.get_status = not_found - - install = WorkspaceInstaller(ws).replace( - prompts=prompts, 
- installation=mock_installation, - product_info=PRODUCT_INFO, - ) - install.configure() - - mock_installation.assert_file_written( - 'config.yml', - { - 'version': 2, - 'default_catalog': 'ucx_default', - 'include_group_names': ['g1', 'g2', 'g99'], - 'inventory_database': 'ucx', - 'log_level': 'INFO', - 'num_days_submit_runs_history': 30, - 'num_threads': 8, - 'min_workers': 1, - 'max_workers': 10, - 'policy_id': 'foo', - 'renamed_group_prefix': 'db-temp-', - 'warehouse_id': 'abc', - 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, - }, - ) - - -def test_create_cluster_policy(ws, mock_installation): +def test_create_cluster_policy(ws, mock_installation) -> None: ws.cluster_policies.list.return_value = [ Policy( policy_id="foo1", @@ -470,6 +446,7 @@ def test_create_cluster_policy(ws, mock_installation): { 'version': 2, 'default_catalog': 'ucx_default', + 'ucx_catalog': 'ucx', 'include_group_names': ['g1', 'g2', 'g99'], 'inventory_database': 'ucx', 'log_level': 'INFO', @@ -1083,45 +1060,6 @@ def test_open_config(ws, mocker, mock_installation): webbrowser_open.assert_called_with('https://localhost/#workspace~/mock/config.yml') -def test_save_config_should_include_databases(ws, mock_installation): - prompts = MockPrompts( - { - r".*PRO or SERVERLESS SQL warehouse.*": "1", - r"Choose how to map the workspace groups.*": "2", # specify names - r"Comma-separated list of databases to migrate.*": "db1,db2", - r"Reconciliation threshold, in percentage.*": "5", - r".*": "", - } - ) - ws.workspace.get_status = not_found - install = WorkspaceInstaller(ws).replace( - prompts=prompts, - installation=mock_installation, - product_info=PRODUCT_INFO, - ) - install.configure() - - mock_installation.assert_file_written( - 'config.yml', - { - 'version': 2, - 'default_catalog': 'ucx_default', - 'include_databases': ['db1', 'db2'], - 'inventory_database': 'ucx', - 'log_level': 'INFO', - 'num_threads': 8, - 'min_workers': 1, - 'max_workers': 10, - 'policy_id': 'foo', - 'renamed_group_prefix': 'db-temp-', - 'warehouse_id': 'abc', - 'workspace_start_path': '/', - 'num_days_submit_runs_history': 30, - 'recon_tolerance_percent': 5, - }, - ) - - def test_triggering_assessment_wf(ws, mocker, mock_installation): ws.jobs.run_now = mocker.Mock() mocker.patch("webbrowser.open") @@ -1237,49 +1175,6 @@ def test_runs_upgrades_on_more_recent_version(ws, any_prompt): wheels.upload_to_wsfs.assert_called() -def test_fresh_install(ws, mock_installation): - prompts = MockPrompts( - { - r".*PRO or SERVERLESS SQL warehouse.*": "1", - r"Choose how to map the workspace groups.*": "2", - r"Open config file in.*": "no", - r"Parallelism for migrating.*": "1000", - r"Min workers for auto-scale.*": "2", - r"Max workers for auto-scale.*": "20", - r"Reconciliation threshold, in percentage.*": "5", - r".*": "", - } - ) - ws.workspace.get_status = not_found - - install = WorkspaceInstaller(ws).replace( - prompts=prompts, - installation=mock_installation, - product_info=PRODUCT_INFO, - ) - install.configure() - - mock_installation.assert_file_written( - 'config.yml', - { - 'version': 2, - 'default_catalog': 'ucx_default', - 'inventory_database': 'ucx', - 'log_level': 'INFO', - 'num_days_submit_runs_history': 30, - 'num_threads': 8, - 'policy_id': 'foo', - 'spark_conf': {'spark.sql.sources.parallelPartitionDiscovery.parallelism': '1000'}, - 'min_workers': 2, - 'max_workers': 20, - 'renamed_group_prefix': 'db-temp-', - 'warehouse_id': 'abc', - 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, - }, - ) - - def 
test_remove_jobs(ws, caplog, mock_installation_extra_jobs, any_prompt): sql_backend = MockBackend() install_state = InstallState.from_installation(mock_installation_extra_jobs) @@ -1746,7 +1641,7 @@ def test_user_workspace_installer(mock_ws): assert workspace_installer.install_state.install_folder().startswith("/Users/") -def test_save_config_ext_hms(ws, mock_installation): +def test_save_config_ext_hms(ws, mock_installation) -> None: ws.get_workspace_id.return_value = 12345678 cluster_policy = { "spark_conf.spark.hadoop.javax.jdo.option.ConnectionURL": {"value": "url"}, @@ -1786,6 +1681,7 @@ def test_save_config_ext_hms(ws, mock_installation): { 'version': 2, 'default_catalog': 'ucx_default', + 'ucx_catalog': 'ucx', 'include_databases': ['db1', 'db2'], 'inventory_database': 'ucx_12345678', 'log_level': 'INFO', diff --git a/tests/unit/source_code/test_queries.py b/tests/unit/source_code/test_queries.py index c9b3a98a88..216a4647a9 100644 --- a/tests/unit/source_code/test_queries.py +++ b/tests/unit/source_code/test_queries.py @@ -33,3 +33,15 @@ def test_query_linter_collects_dfsas_from_queries(name, query, dfsa_paths, is_re assert set(dfsa.path for dfsa in dfsas) == set(dfsa_paths) assert all(dfsa.is_read == is_read for dfsa in dfsas) assert all(dfsa.is_write == is_write for dfsa in dfsas) + + +def test_query_liner_refresh_report_writes_query_problems(migration_index, mock_backend) -> None: + ws = create_autospec(WorkspaceClient) + crawlers = create_autospec(DirectFsAccessCrawler) + linter = QueryLinter(ws, migration_index, crawlers, None) + + linter.refresh_report(mock_backend, inventory_database="test") + + assert mock_backend.has_rows_written_for("`test`.query_problems") + ws.dashboards.list.assert_called_once() + crawlers.assert_not_called() diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index d53b8d1bfb..9c2e1ae646 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -13,7 +13,7 @@ from databricks.sdk.errors import NotFound from databricks.sdk.errors.platform import BadRequest from databricks.sdk.service import jobs, sql -from databricks.sdk.service.catalog import ExternalLocationInfo +from databricks.sdk.service.catalog import ExternalLocationInfo, MetastoreInfo from databricks.sdk.service.compute import ClusterDetails, ClusterSource from databricks.sdk.service.iam import ComplexValue, User from databricks.sdk.service.jobs import Run, RunResultState, RunState @@ -803,8 +803,8 @@ def test_create_catalogs_schemas_handles_existing(ws, caplog) -> None: create_catalogs_schemas(ws, prompts, ctx=WorkspaceContext(ws)) ws.catalogs.list.assert_called_once() - assert "Catalog test already exists. Skipping." in caplog.messages - assert "Schema test in catalog test already exists. Skipping." in caplog.messages + assert "Catalog 'test' already exists. Skipping." in caplog.messages + assert "Schema 'test' in catalog 'test' already exists. Skipping." 
in caplog.messages def test_cluster_remap(ws, caplog): @@ -875,9 +875,23 @@ def test_show_all_metastores(acc_client, caplog): assert 'Matching metastores are:' in caplog.messages -def test_assign_metastore(acc_client, caplog): - with pytest.raises(ValueError): - assign_metastore(acc_client, "123") +def test_assign_metastore_logs_account_id_and_assigns_metastore(caplog, acc_client) -> None: + ctx = AccountContext(acc_client) + acc_client.metastores.list.return_value = [MetastoreInfo(name="test", metastore_id="123")] + + with caplog.at_level(logging.INFO, logger="databricks.labs.ucx.cli"): + assign_metastore(acc_client, "456", ctx=ctx) + + assert "Account ID: 123" in caplog.messages + acc_client.metastore_assignments.create.assert_called_once() + + +def test_create_ucx_catalog_calls_create_catalog(ws) -> None: + prompts = MockPrompts({"Please provide storage location url for catalog: .*": "metastore"}) + + create_catalogs_schemas(ws, prompts, ctx=WorkspaceContext(ws)) + + ws.catalogs.create.assert_called_once() @pytest.mark.parametrize("run_as_collection", [False, True]) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index b91be9fa46..83f1e39e07 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -1,8 +1,25 @@ +import dataclasses +import pytest + from databricks.labs.blueprint.installation import MockInstallation from databricks.labs.ucx.config import WorkspaceConfig +@pytest.mark.parametrize( + "attribute,value", + [ + ("ucx_catalog", "ucx"), + ("default_catalog", "default"), + ], +) +def test_config_attributes(attribute, value) -> None: + """Increase code coverage with this test""" + config = dataclasses.replace(WorkspaceConfig("test"), **{attribute: value}) + + assert getattr(config, attribute) == value + + def test_v1_migrate_zeroconf(): installation = MockInstallation( {'config.yml': {'inventory_database': 'x', 'groups': {}, 'connect': {'host': 'a', 'token': 'b'}}} From 2a09a8f4ae81fb44a14d7613ef68b58c9df2cc7e Mon Sep 17 00:00:00 2001 From: Hari Selvarajan <105197202+HariGS-DB@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:59:03 +0100 Subject: [PATCH 09/21] Fixes issue of circular dependency of migrate-location ACL (#2741) ## Changes Resolves #2529 ### Functionality - [ ] modified existing command: `databricks labs ucx migrate-locations` ### Tests - [ ] added unit tests - [ ] added integration tests --------- Co-authored-by: Amin Movahed Co-authored-by: Amin Movahed <140028681+aminmovahed-db@users.noreply.github.com> --- .../labs/ucx/contexts/application.py | 20 ++++++++----------- .../labs/ucx/hive_metastore/grants.py | 6 +++--- tests/unit/azure/test_locations.py | 2 +- .../hive_metastore/test_principal_grants.py | 14 ++++++++----- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index d5f506ac1e..d595eb137f 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -18,7 +18,6 @@ from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core -from databricks.sdk.errors import NotFound from databricks.sdk.service import sql from databricks.labs.ucx.account.workspaces import WorkspaceInfo @@ -310,18 +309,15 @@ def aws_acl(self): ) @cached_property - def principal_locations(self): - eligible_locations = {} - 
try: + def principal_locations_retriever(self): + def inner(): if self.is_azure: - eligible_locations = self.azure_acl.get_eligible_locations_principals() + return self.azure_acl.get_eligible_locations_principals() if self.is_aws: - eligible_locations = self.aws_acl.get_eligible_locations_principals() - if self.is_gcp: - raise NotImplementedError("Not implemented for GCP.") - except NotFound: - pass - return eligible_locations + return self.aws_acl.get_eligible_locations_principals() + raise NotImplementedError("Not implemented for GCP.") + + return inner @cached_property def principal_acl(self): @@ -331,7 +327,7 @@ def principal_acl(self): self.installation, self.tables_crawler, self.mounts_crawler, - self.principal_locations, + self.principal_locations_retriever, ) @cached_property diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index a2afa473f3..8673779697 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -579,7 +579,7 @@ def __init__( installation: Installation, tables_crawler: TablesCrawler, mounts_crawler: Mounts, - cluster_locations: list[ComputeLocations], + cluster_locations: Callable[[], list[ComputeLocations]], ): self._backend = backend self._ws = ws @@ -593,7 +593,7 @@ def get_interactive_cluster_grants(self) -> list[Grant]: mounts = list(self._mounts_crawler.snapshot()) grants: set[Grant] = set() - for compute_location in self._compute_locations: + for compute_location in self._compute_locations(): principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type) if len(principals) == 0: continue @@ -697,7 +697,7 @@ def apply_location_acl(self): "CREATE EXTERNAL VOLUME and READ_FILES for existing eligible interactive cluster users" ) # get the eligible location mapped for each interactive cluster - for compute_location in self._compute_locations: + for compute_location in self._compute_locations(): # get interactive cluster users principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type) if len(principals) == 0: diff --git a/tests/unit/azure/test_locations.py b/tests/unit/azure/test_locations.py index 2b6e8a177a..f1b901638b 100644 --- a/tests/unit/azure/test_locations.py +++ b/tests/unit/azure/test_locations.py @@ -30,7 +30,7 @@ def location_migration_for_test(ws, mock_backend, mock_installation, azurerm=Non azure_resource_permissions = AzureResourcePermissions(mock_installation, ws, azurerm, location_crawler) tables_crawler = TablesCrawler(mock_backend, 'ucx') mounts_crawler = Mounts(mock_backend, ws, 'ucx') - principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, {}) + principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, lambda: []) external_locations_migration = ExternalLocationsMigration( ws, location_crawler, azure_resource_permissions, azurerm, principal_acl ) diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py index e62a998905..bef584fdab 100644 --- a/tests/unit/hive_metastore/test_principal_grants.py +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -157,11 +157,15 @@ def principal_acl(w, install, cluster_spn: list, warehouse_spn: list): spn_crawler = create_autospec(AzureServicePrincipalCrawler) spn_crawler.get_cluster_to_storage_mapping.return_value = cluster_spn - locations = [] - 
if w.config.is_azure: - locations = azure_acl(w, install, cluster_spn, warehouse_spn).get_eligible_locations_principals() - if w.config.is_aws: - locations = aws_acl(w, install).get_eligible_locations_principals() + + def inner(): + if w.config.is_azure: + return azure_acl(w, install, cluster_spn, warehouse_spn).get_eligible_locations_principals() + if w.config.is_aws: + return aws_acl(w, install).get_eligible_locations_principals() + return None + + locations = inner return PrincipalACL(w, sql_backend, install, table_crawler, mount_crawler, locations) From d1dd0c5f684b0096acd8b447997893cd05c4211a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 25 Sep 2024 14:09:42 +0200 Subject: [PATCH 10/21] Added static code analysis results to assessment dashboard (#2696) ## Changes Add job/query problem widgets to the dashboard Add directfs access widget to the dashboard ### Linked issues Resolves #2595 ### Functionality None ### Tests - [x] added integration tests using mock data - [x] manually tested widgets, see below: https://github.com/user-attachments/assets/3684c30f-761a-4de6-bc67-de650c5d5353 --------- Co-authored-by: Eric Vergnaud Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> --- README.md | 16 ++- .../main/35_0_code_compatibility_problems.md | 8 ++ ...de_compatibility_problems_in_workflows.sql | 35 ++++++ ...e_compatibility_problems_in_dashboards.sql | 29 +++++ .../36_0_direct_filesystem_access_problems.md | 14 +++ .../main/36_1_direct_filesystem_accesses.sql | 62 ++++++++++ src/databricks/labs/ucx/source_code/graph.py | 3 +- src/databricks/labs/ucx/source_code/jobs.py | 8 +- .../labs/ucx/source_code/queries.py | 8 +- .../integration/assessment/test_dashboards.py | 111 ++++++++++++++++++ .../integration/assessment/test_workflows.py | 4 +- tests/integration/conftest.py | 62 ++++++++-- tests/integration/source_code/test_jobs.py | 4 +- tests/integration/source_code/test_queries.py | 5 +- tests/unit/source_code/test_queries.py | 2 +- 15 files changed, 344 insertions(+), 27 deletions(-) create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql create mode 100644 src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md create mode 100644 src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql create mode 100644 tests/integration/assessment/test_dashboards.py diff --git a/README.md b/README.md index cc4e05706a..7c5bf044c0 100644 --- a/README.md +++ b/README.md @@ -399,6 +399,9 @@ which can be used for further analysis and decision-making through the [assessme 9. `assess_pipelines`: This task scans through all the Pipelines and identifies those pipelines that have Azure Service Principals embedded in their configurations. A list of all the pipelines with matching configurations is stored in the `$inventory.pipelines` table. 10. `assess_azure_service_principals`: This task scans through all the clusters configurations, cluster policies, job cluster configurations, Pipeline configurations, and Warehouse configuration and identifies all the Azure Service Principals who have been given access to the Azure storage accounts via spark configurations referred in those entities. 
The list of all the Azure Service Principals referred in those configurations is saved in the `$inventory.azure_service_principals` table. 11. `assess_global_init_scripts`: This task scans through all the global init scripts and identifies if there is an Azure Service Principal who has been given access to the Azure storage accounts via spark configurations referred in those scripts. +12. `assess_dashboards`: This task scans through all the dashboards and analyzes embedded queries for migration problems. It also collects direct filesystem access patterns that require attention. +13. `assess_workflows`: This task scans through all the jobs and tasks and analyzes notebooks and files for migration problems. It also collects direct filesystem access patterns that require attention. + ![report](docs/assessment-report.png) @@ -714,11 +717,16 @@ in the Migration dashboard. > Please note that this is an experimental workflow. -The `experimental-workflow-linter` workflow lints accessible code belonging to all workflows/jobs present in the -workspace. The linting emits problems indicating what to resolve for making the code Unity Catalog compatible. +The `experimental-workflow-linter` workflow lints accessible code from 2 sources: + - all workflows/jobs present in the workspace + - all dashboards/queries present in the workspace +The linting emits problems indicating what to resolve for making the code Unity Catalog compatible. +The linting also locates direct filesystem access that need to be migrated. -Once the workflow completes, the output will be stored in `$inventory_database.workflow_problems` table, and displayed -in the Migration dashboard. +Once the workflow completes: + - problems are stored in the `$inventory_database.workflow_problems`/`$inventory_database.query_problems` table + - direct filesystem access are stored in the `$inventory_database.directfs_in_paths`/`$inventory_database.directfs_in_queries` table + - all the above are displayed in the Migration dashboard. ![code compatibility problems](docs/code_compatibility_problems.png) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md b/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md new file mode 100644 index 0000000000..851d95a766 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md @@ -0,0 +1,8 @@ +## Code compatibility problems + +The tables below assist with verifying if workflows and dashboards are Unity Catalog compatible. It can be filtered on the path, +problem code and workflow name. +Each row: +- Points to a problem detected in the code using the code path, query or workflow & task reference and start/end line & column; +- Explains the problem with a human-readable message and a code. 
+ diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql new file mode 100644 index 0000000000..64a7e956a5 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql @@ -0,0 +1,35 @@ +/* +--title 'Workflow migration problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "/#workspace/{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"}, + {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, + {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, + {"fieldName": "workflow_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_name"}, + {"fieldName": "task_key", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/jobs/{{ workflow_id }}/tasks/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "task_key"}, + {"fieldName": "start_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_line"}, + {"fieldName": "start_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_col"}, + {"fieldName": "end_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_line"}, + {"fieldName": "end_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_col"} + ]}, + "invisibleColumns": [ + {"name": "link", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "link"}, + {"name": "workflow_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_id"} + ] + }}' +*/ +SELECT + substring_index(path, '@databricks.com/', -1) as path, + path as link, + code, + message, + job_id AS workflow_id, + job_name AS workflow_name, + task_key, + start_line, + start_col, + end_line, + end_col +FROM inventory.workflow_problems diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql new file mode 100644 index 0000000000..a918e9682c --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql @@ -0,0 +1,29 @@ +/* +--title 'Dashboard compatibility problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, + {"fieldName": "message", "booleanValues": ["false", "true"], 
"type": "string", "displayAs": "string", "title": "message"}, + {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"}, + {"fieldName": "query_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/editor/{{ query_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"} + ]}, + "invisibleColumns": [ + {"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"}, + {"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"}, + {"name": "query_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_parent"}, + {"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"} + ] + }}' +*/ +SELECT + dashboard_id, + dashboard_parent, + dashboard_name, + query_id, + query_parent, + query_name, + code, + message +FROM inventory.query_problems diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md b/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md new file mode 100644 index 0000000000..3f5b255681 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md @@ -0,0 +1,14 @@ +--- +height: 4 +--- + +# Direct filesystem access problems + +The table below assists with verifying if workflows and dashboards require direct filesystem access. +As a reminder, `dbfs:/` is not supported in Unity Catalog, and more generally direct filesystem access is discouraged. +Rather, data should be accessed via Unity tables. + +Each row: +- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference and start/end line & column; +- Provides the _lineage_ i.e. which `workflow -> task -> notebook...` execution sequence leads to that access. 
+ diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql new file mode 100644 index 0000000000..b038dbfbc8 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql @@ -0,0 +1,62 @@ +/* +--title 'Direct filesystem access problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "location", "title": "location", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "is_read", "title": "is_read", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]}, + {"fieldName": "is_write", "title": "is_write", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]}, + {"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ source_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, + {"fieldName": "timestamp", "title": "last_modified", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ lineage_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]}, + {"fieldName": "assessment_start", "title": "assessment_start", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}, + {"fieldName": "assessment_end", "title": "assessment_end", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]} + ]}, + "invisibleColumns": [ + {"fieldName": "source_link", "title": "source_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_type", "title": "lineage_type", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]} + ] + }}' +*/ +SELECT + path as location, + is_read, + is_write, + if( startswith(source_id, '/'), substring_index(source_id, '@databricks.com/', -1), split_part(source_id, '/', 2)) as source, + if( startswith(source_id, '/'), concat('/#workspace/', source_id), concat('/sql/editor/', split_part(source_id, '/', 2))) as source_link, + source_timestamp as `timestamp`, + case + when lineage.object_type = 'WORKFLOW' then concat('Workflow: ', lineage.other.name) + when lineage.object_type = 'TASK' then concat('Task: ', split_part(lineage.object_id, '/', 2)) + when lineage.object_type = 'NOTEBOOK' then concat('Notebook: ', substring_index(lineage.object_id, '@databricks.com/', -1)) + when lineage.object_type = 'FILE' then concat('File: ', substring_index(lineage.object_id, '@databricks.com/', -1)) + when lineage.object_type = 'DASHBOARD' then concat('Dashboard: ', lineage.other.name) + when lineage.object_type = 'QUERY' then concat('Query: ', lineage.other.name) + end as 
lineage, + lineage.object_type as lineage_type, + lineage.object_id as lineage_id, + case + when lineage.object_type = 'WORKFLOW' then concat('/jobs/', lineage.object_id) + when lineage.object_type = 'TASK' then concat('/jobs/', split_part(lineage.object_id, '/', 1), '/tasks/', split_part(lineage.object_id, '/', 2)) + when lineage.object_type = 'NOTEBOOK' then concat('/#workspace/', lineage.object_id) + when lineage.object_type = 'FILE' then concat('/#workspace/', lineage.object_id) + when lineage.object_type = 'DASHBOARD' then concat('/sql/dashboards/', lineage.object_id) + when lineage.object_type = 'QUERY' then concat('/sql/editor/', split_part(lineage.object_id, '/', 2)) + end as lineage_link, + lineage.other as lineage_data, + assessment_start, + assessment_end +from (SELECT + path, + is_read, + is_write, + source_id, + source_timestamp, + explode(source_lineage) as lineage, + assessment_start_timestamp as assessment_start, + assessment_end_timestamp as assessment_end +FROM inventory.directfs) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 347690f067..a722e9ef3c 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -337,7 +337,8 @@ def __repr__(self): @property def lineage(self) -> list[LineageAtom]: - return [LineageAtom(object_type="PATH", object_id=str(self.path))] + object_type = "NOTEBOOK" if is_a_notebook(self.path) else "FILE" + return [LineageAtom(object_type=object_type, object_id=str(self.path))] class SourceContainer(abc.ABC): diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b5a4e47991..eb50b7ef31 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -86,8 +86,8 @@ def __repr__(self): @property def lineage(self) -> list[LineageAtom]: job_name = (None if self._job.settings is None else self._job.settings.name) or "unknown job" - job_lineage = LineageAtom("JOB", str(self._job.job_id), {"name": job_name}) - task_lineage = LineageAtom("TASK", self._task.task_key) + job_lineage = LineageAtom("WORKFLOW", str(self._job.job_id), {"name": job_name}) + task_lineage = LineageAtom("TASK", f"{self._job.job_id}/{self._task.task_key}") return [job_lineage, task_lineage] @@ -469,8 +469,8 @@ def _collect_task_dfsas( job_name = job.settings.name if job.settings and job.settings.name else "" for dfsa in DfsaCollectorWalker(graph, set(), self._path_lookup, session_state): atoms = [ - LineageAtom(object_type="JOB", object_id=job_id, other={"name": job_name}), - LineageAtom(object_type="TASK", object_id=task.task_key), + LineageAtom(object_type="WORKFLOW", object_id=job_id, other={"name": job_name}), + LineageAtom(object_type="TASK", object_id=f"{job_id}/{task.task_key}"), ] yield dataclasses.replace(dfsa, source_lineage=atoms + dfsa.source_lineage) diff --git a/src/databricks/labs/ucx/source_code/queries.py b/src/databricks/labs/ucx/source_code/queries.py index 5bd11b1bc7..26a2c14a26 100644 --- a/src/databricks/labs/ucx/source_code/queries.py +++ b/src/databricks/labs/ucx/source_code/queries.py @@ -66,7 +66,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): linted_queries.add(query.id) problems = self.lint_query(query) all_problems.extend(problems) - dfsas = self.collect_dfsas_from_query(query) + dfsas = self.collect_dfsas_from_query("no-dashboard-id", query) all_dfsas.extend(dfsas) # dump problems logger.info(f"Saving 
{len(all_problems)} linting problems...") @@ -123,7 +123,7 @@ def _lint_and_collect_from_dashboard( dashboard_name=dashboard_name, ) ) - dfsas = self.collect_dfsas_from_query(query) + dfsas = self.collect_dfsas_from_query(dashboard_id, query) for dfsa in dfsas: atom = LineageAtom( object_type="DASHBOARD", @@ -155,11 +155,11 @@ def lint_query(self, query: LegacyQuery) -> Iterable[QueryProblem]: ) @classmethod - def collect_dfsas_from_query(cls, query: LegacyQuery) -> Iterable[DirectFsAccess]: + def collect_dfsas_from_query(cls, dashboard_id: str, query: LegacyQuery) -> Iterable[DirectFsAccess]: if query.query is None: return linter = DirectFsAccessSqlLinter() - source_id = query.id or "no id" + source_id = f"{dashboard_id}/{query.id}" source_name = query.name or "" source_timestamp = cls._read_timestamp(query.updated_at) source_lineage = [LineageAtom(object_type="QUERY", object_id=source_id, other={"name": source_name})] diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py new file mode 100644 index 0000000000..e95193faf5 --- /dev/null +++ b/tests/integration/assessment/test_dashboards.py @@ -0,0 +1,111 @@ +from datetime import datetime, timezone, timedelta + +import pytest + +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, LineageAtom +from databricks.labs.ucx.source_code.jobs import JobProblem +from databricks.sdk.service.iam import PermissionLevel + +from databricks.labs.ucx.source_code.queries import QueryProblem + + +def _populate_workflow_problems(installation_ctx): + job_problems = [ + JobProblem( + job_id=12345, + job_name="Peter the Job", + task_key="23456", + path="parent/child.py", + code="sql-parse-error", + message="Could not parse SQL", + start_line=1234, + start_col=22, + end_line=1234, + end_col=32, + ) + ] + installation_ctx.sql_backend.save_table( + f'{installation_ctx.inventory_database}.workflow_problems', + job_problems, + JobProblem, + mode='overwrite', + ) + + +def _populate_dashboard_problems(installation_ctx): + query_problems = [ + QueryProblem( + dashboard_id="12345", + dashboard_parent="dashbards/parent", + dashboard_name="my_dashboard", + query_id="23456", + query_parent="queries/parent", + query_name="my_query", + code="sql-parse-error", + message="Could not parse SQL", + ) + ] + installation_ctx.sql_backend.save_table( + f'{installation_ctx.inventory_database}.query_problems', + query_problems, + QueryProblem, + mode='overwrite', + ) + + +def _populate_directfs_problems(installation_ctx): + dfsas = [ + DirectFsAccess( + path="some_path", + is_read=False, + is_write=True, + source_id="xyz.py", + source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow"), + LineageAtom(object_type="TASK", object_id="my_workflow/my_task"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook"), + LineageAtom(object_type="FILE", object_id="my file"), + ], + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), + assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), + ) + ] + installation_ctx.directfs_access_crawler_for_paths.dump_all(dfsas) + dfsas = [ + DirectFsAccess( + path="some_path", + is_read=False, + is_write=True, + source_id="xyz.py", + source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="DASHBOARD", object_id="my_dashboard"), + LineageAtom(object_type="QUERY", 
object_id="my_dashboard/my_query"), + ], + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), + assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), + ) + ] + installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas) + + +@pytest.mark.skip("Development tool") +def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy, make_cluster_policy_permissions): + """the purpose of this test is to prepopulate data used by the dashboard without running an actual -lengthy- assessment""" + ucx_group, _ = installation_ctx.make_ucx_group() + cluster_policy = make_cluster_policy() + make_cluster_policy_permissions( + object_id=cluster_policy.policy_id, + permission_level=PermissionLevel.CAN_USE, + group_name=ucx_group.display_name, + ) + installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] + installation_ctx.workspace_installation.run() + # populate data + _populate_workflow_problems(installation_ctx) + _populate_dashboard_problems(installation_ctx) + _populate_directfs_problems(installation_ctx) + print(f"\nInventory database is {installation_ctx.inventory_database}\n") + # put a breakpoint here + print("Put a breakpoint here! Then go check the dashboard in your workspace ;-)\n") diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index 7955e93c69..1110528bcf 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -5,7 +5,7 @@ from databricks.sdk.service.iam import PermissionLevel -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) +@retried(on=[NotFound, InvalidParameterValue]) def test_running_real_assessment_job( ws, installation_ctx, @@ -25,7 +25,7 @@ def test_running_real_assessment_job( populate_for_linting(installation_ctx.installation) - installation_ctx.deployed_workflows.run_workflow("assessment") + installation_ctx.deployed_workflows.run_workflow("assessment", max_wait=timedelta(minutes=25)) assert installation_ctx.deployed_workflows.validate_step("assessment") after = installation_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index d2f48cdfa7..9e18b42284 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -28,8 +28,10 @@ from databricks.sdk.retries import retried from databricks.sdk.service import iam from databricks.sdk.service.catalog import FunctionInfo, SchemaInfo, TableInfo -from databricks.sdk.service.iam import Group +from databricks.sdk.service.compute import ClusterSpec from databricks.sdk.service.dashboards import Dashboard as SDKDashboard +from databricks.sdk.service.iam import Group +from databricks.sdk.service.jobs import Task, SparkPythonTask from databricks.sdk.service.sql import Dashboard, WidgetPosition, WidgetOptions, LegacyQuery from databricks.labs.ucx.__about__ import __version__ @@ -1236,19 +1238,65 @@ def _run(command: str) -> str: @pytest.fixture -def populate_for_linting(ws, make_random, make_job, make_notebook, make_query, make_dashboard, watchdog_purge_suffix): +def create_file_job(ws, make_random, watchdog_remove_after, watchdog_purge_suffix, log_workspace_link): + + def create(installation, **_kwargs): + # create args + data = {"name": f"dummy-{make_random(4)}"} + # create file to run + file_name = 
f"dummy_{make_random(4)}_{watchdog_purge_suffix}" + file_path = WorkspacePath(ws, installation.install_folder()) / file_name + file_path.write_text("spark.read.parquet('dbfs://mnt/foo/bar')") + task = Task( + task_key=make_random(4), + description=make_random(4), + new_cluster=ClusterSpec( + num_workers=1, + node_type_id=ws.clusters.select_node_type(local_disk=True, min_memory_gb=16), + spark_version=ws.clusters.select_spark_version(latest=True), + ), + spark_python_task=SparkPythonTask(python_file=str(file_path)), + timeout_seconds=0, + ) + data["tasks"] = [task] + # add RemoveAfter tag for job cleanup + data["tags"] = [{"key": "RemoveAfter", "value": watchdog_remove_after}] + job = ws.jobs.create(**data) + log_workspace_link(data["name"], f'job/{job.job_id}', anchor=False) + return job + + yield from factory("job", create, lambda item: ws.jobs.delete(item.job_id)) + + +@pytest.fixture +def populate_for_linting( + ws, + make_random, + make_job, + make_notebook, + make_query, + make_dashboard, + create_file_job, + watchdog_purge_suffix, +): + + def create_notebook_job(installation): + path = Path(installation.install_folder()) / f"dummy_{make_random(4)}_{watchdog_purge_suffix}" + notebook_text = "spark.read.parquet('dbfs://mnt/foo1/bar1')" + notebook_path = make_notebook(path=path, content=io.BytesIO(notebook_text.encode("utf-8"))) + return make_job(notebook_path=notebook_path) + def populate_workspace(installation): # keep linting scope to minimum to avoid test timeouts - path = Path(installation.install_folder()) / f"dummy-{make_random(4)}-{watchdog_purge_suffix}" - notebook_path = make_notebook(path=path, content=io.BytesIO(b"spark.read.parquet('dbfs://mnt/foo/bar')")) - job = make_job(notebook_path=notebook_path) - query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo/bar`') + file_job = create_file_job(installation=installation) + notebook_job = create_notebook_job(installation=installation) + query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo2/bar2`') dashboard = make_dashboard(query=query) # can't use installation.load(WorkspaceConfig)/installation.save() because they populate empty credentials config_path = WorkspacePath(ws, installation.install_folder()) / "config.yml" text = config_path.read_text() config = yaml.safe_load(text) - config["include_job_ids"] = [job.job_id] + config["include_job_ids"] = [file_job.job_id, notebook_job.job_id] config["include_dashboard_ids"] = [dashboard.id] text = yaml.dump(config) config_path.unlink() diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 4c5ae4f758..451b5a3815 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -163,7 +163,7 @@ def test_job_linter_some_notebook_graph_with_problems( assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) assert len(dfsas) == 2 - task_keys = set(task.task_key for task in j.settings.tasks) + task_keys = set(f"{j.job_id}/{task.task_key}" for task in j.settings.tasks) yesterday = datetime.now(timezone.utc) - timedelta(days=1) for dfsa in dfsas: assert dfsa.source_id != DirectFsAccess.UNKNOWN @@ -172,7 +172,7 @@ def test_job_linter_some_notebook_graph_with_problems( assert dfsa.assessment_start_timestamp > yesterday assert dfsa.assessment_end_timestamp > yesterday assert dfsa.source_lineage[0] == LineageAtom( - object_type="JOB", object_id=str(j.job_id), other={"name": j.settings.name} + object_type="WORKFLOW", 
object_id=str(j.job_id), other={"name": j.settings.name} ) assert dfsa.source_lineage[1].object_type == "TASK" assert dfsa.source_lineage[1].object_id in task_keys diff --git a/tests/integration/source_code/test_queries.py b/tests/integration/source_code/test_queries.py index 5ecfbf45bd..10d4ded773 100644 --- a/tests/integration/source_code/test_queries.py +++ b/tests/integration/source_code/test_queries.py @@ -13,7 +13,8 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend assert len(problems) == 1 crawler = DirectFsAccessCrawler.for_queries(sql_backend, simple_ctx.inventory_database) all_dfsas = crawler.snapshot() - dfsas = [dfsa for dfsa in all_dfsas if dfsa.source_id == query.id] + source_id = f"{_dashboard.id}/{query.id}" + dfsas = [dfsa for dfsa in all_dfsas if dfsa.source_id == source_id] assert len(dfsas) == 1 dfsa = dfsas[0] assert len(dfsa.source_lineage) == 2 @@ -25,6 +26,6 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend assert lineage.other.get("name", None) == _dashboard.name lineage = dfsa.source_lineage[1] assert lineage.object_type == "QUERY" - assert lineage.object_id == query.id + assert lineage.object_id == source_id assert lineage.other assert lineage.other.get("name", None) == query.name diff --git a/tests/unit/source_code/test_queries.py b/tests/unit/source_code/test_queries.py index 216a4647a9..0ed1e187ba 100644 --- a/tests/unit/source_code/test_queries.py +++ b/tests/unit/source_code/test_queries.py @@ -27,7 +27,7 @@ def test_query_linter_collects_dfsas_from_queries(name, query, dfsa_paths, is_re crawlers = create_autospec(DirectFsAccessCrawler) query = LegacyQuery.from_dict({"parent": "workspace", "name": name, "query": query}) linter = QueryLinter(ws, migration_index, crawlers, None) - dfsas = linter.collect_dfsas_from_query(query) + dfsas = linter.collect_dfsas_from_query("no-dashboard-id", query) ws.assert_not_called() crawlers.assert_not_called() assert set(dfsa.path for dfsa in dfsas) == set(dfsa_paths) From 7cba9b0ed9bccacc1c23b6fa774be857e3463fce Mon Sep 17 00:00:00 2001 From: Pritish Pai <136742693+pritishpai@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:10:37 -0400 Subject: [PATCH 11/21] Increases test coverage (#2739) ## Changes Increases coverage ### Functionality - [x] fixes and modifies testing warnings ### Tests - [x] modified unit tests --- tests/unit/hive_metastore/test_table_size.py | 43 +++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/tests/unit/hive_metastore/test_table_size.py b/tests/unit/hive_metastore/test_table_size.py index 8464a5dd3b..8216a0a780 100644 --- a/tests/unit/hive_metastore/test_table_size.py +++ b/tests/unit/hive_metastore/test_table_size.py @@ -1,3 +1,4 @@ +import logging import sys from databricks.labs.lsql.backends import MockBackend @@ -41,11 +42,11 @@ def test_table_size_crawler(mocker): assert TableSize("hive_metastore", "db1", "table2", 200) in results -def test_table_size_unknown_error(mocker): +def test_table_size_unknown_error(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -56,16 +57,17 @@ def test_table_size_unknown_error(mocker): tsc = TableSizeCrawler(backend, "inventory_database") tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = 
Exception(...) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 -def test_table_size_table_or_view_not_found(mocker): +def test_table_size_table_or_view_not_found(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -80,16 +82,18 @@ def test_table_size_table_or_view_not_found(mocker): "[TABLE_OR_VIEW_NOT_FOUND]" ) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 + assert "Failed to evaluate hive_metastore.db1.table1 table size. Table not found" in caplog.text -def test_table_size_delta_table_not_found(mocker): +def test_table_size_delta_table_not_found(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -104,16 +108,18 @@ def test_table_size_delta_table_not_found(mocker): "[DELTA_TABLE_NOT_FOUND]" ) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 + assert "Failed to evaluate hive_metastore.db1.table1 table size. Table not found" in caplog.text -def test_table_size_when_table_corrupted(mocker): +def test_table_size_when_table_corrupted(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -127,16 +133,18 @@ def test_table_size_when_table_corrupted(mocker): "[DELTA_MISSING_TRANSACTION_LOG]" ) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 + assert "Delta table hive_metastore.db1.table1 is corrupted: missing transaction log" in caplog.text -def test_table_size_when_delta_invalid_format_error(mocker): +def test_table_size_when_delta_invalid_format_error(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -150,6 +158,11 @@ def test_table_size_when_delta_invalid_format_error(mocker): "[DELTA_INVALID_FORMAT]" ) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 + assert ( + "Unable to read Delta table hive_metastore.db1.table1, please check table structure and try again." + in caplog.text + ) From 04a4956bda94cec58ca8b70f59455ca2d42decbb Mon Sep 17 00:00:00 2001 From: Pritish Pai <136742693+pritishpai@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:14:45 -0400 Subject: [PATCH 12/21] Fixes source table alias dissapearance during migrate_views (#2726) ## Changes The alias for the source table is disappearing while converting the create view sql from legacy hive metastore to uc catalog. 
### Linked issues Resolves #2661 ### Functionality - [x] fixes sql conversion ### Tests - [x] manually tested - [x] added unit tests - [ ] added integration tests - [ ] verified on staging environment (screenshot attached) --------- Co-authored-by: Liran Bareket --- .../ucx/source_code/linters/from_table.py | 2 +- .../hive_metastore/test_migrate.py | 40 +++++++++++++++++++ .../unit/hive_metastore/test_view_migrate.py | 25 ++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/linters/from_table.py b/src/databricks/labs/ucx/source_code/linters/from_table.py index 09b2235a74..9da4dd7a2c 100644 --- a/src/databricks/labs/ucx/source_code/linters/from_table.py +++ b/src/databricks/labs/ucx/source_code/linters/from_table.py @@ -99,7 +99,7 @@ def apply(self, code: str) -> str: dst = self._index.get(src_schema, old_table.name) if not dst: continue - new_table = Table(catalog=dst.dst_catalog, db=dst.dst_schema, this=dst.dst_table) + new_table = Table(catalog=dst.dst_catalog, db=dst.dst_schema, this=dst.dst_table, alias=old_table.alias) old_table.replace(new_table) new_sql = statement.sql('databricks') new_statements.append(new_sql) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 6779f0c84e..14afdfc527 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -341,6 +341,46 @@ def test_migrate_view(ws, sql_backend, runtime_ctx, make_catalog): assert "(col1,col2)" in view3_view_text.replace("\n", "").replace(" ", "").lower() +@retried(on=[NotFound], timeout=timedelta(minutes=3)) +def test_migrate_view_alias_test(ws, sql_backend, runtime_ctx, make_catalog): + src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore") + src_managed_table = runtime_ctx.make_table(catalog_name=src_schema.catalog_name, schema_name=src_schema.name) + src_view1 = runtime_ctx.make_table( + catalog_name=src_schema.catalog_name, + schema_name=src_schema.name, + ctas=f"SELECT A.* FROM {src_managed_table.full_name} AS A", + view=True, + ) + + dst_catalog = make_catalog() + dst_schema = runtime_ctx.make_schema(catalog_name=dst_catalog.name, name=src_schema.name) + + logger.info(f"dst_catalog={dst_catalog.name}, managed_table={src_managed_table.full_name}") + + rules = [ + Rule.from_src_dst(src_managed_table, dst_schema), + Rule.from_src_dst(src_view1, dst_schema), + ] + + runtime_ctx.with_table_mapping_rules(rules) + runtime_ctx.with_dummy_resource_permission() + runtime_ctx.tables_migrator.index() + runtime_ctx.tables_migrator.migrate_tables(what=What.DBFS_ROOT_DELTA) + runtime_ctx.migration_status_refresher.snapshot() + runtime_ctx.tables_migrator.migrate_tables(what=What.VIEW) + target_tables = list(sql_backend.fetch(f"SHOW TABLES IN {dst_schema.full_name}")) + assert len(target_tables) == 2 + + target_table_properties = ws.tables.get(f"{dst_schema.full_name}.{src_managed_table.name}").properties + assert target_table_properties["upgraded_from"] == src_managed_table.full_name + assert target_table_properties[Table.UPGRADED_FROM_WS_PARAM] == str(ws.get_workspace_id()) + view1_view_text = ws.tables.get(f"{dst_schema.full_name}.{src_view1.name}").view_definition + assert ( + view1_view_text + == f"SELECT `A`.* FROM `{dst_schema.catalog_name}`.`{dst_schema.name}`.`{src_managed_table.name}` AS `A`" + ) + + @retried(on=[NotFound], timeout=timedelta(minutes=2)) def test_revert_migrated_table(sql_backend, runtime_ctx, make_catalog): 
src_schema1 = runtime_ctx.make_schema(catalog_name="hive_metastore") diff --git a/tests/unit/hive_metastore/test_view_migrate.py b/tests/unit/hive_metastore/test_view_migrate.py index 4d6b0cab73..0715b00dc6 100644 --- a/tests/unit/hive_metastore/test_view_migrate.py +++ b/tests/unit/hive_metastore/test_view_migrate.py @@ -37,6 +37,31 @@ def test_view_to_migrate_sql_migrate_view_sql(): assert sql == expected_query +def test_view_with_alias_to_migrate_sql_migrate_view_sql(): + expected_query = "CREATE OR REPLACE VIEW IF NOT EXISTS `cat1`.`schema1`.`dest_view1` AS SELECT `A`.* FROM `cat1`.`schema1`.`dest_table1` AS `A`" + view = Table( + object_type="VIEW", + table_format="VIEW", + catalog="hive_metastore", + database="test_schema1", + name="test_view1", + # The view text is overwritten with the create view statement before running the sql migrate view + view_text="CREATE OR REPLACE VIEW hive_metastore.test_schema1.test_view1 AS SELECT A.* FROM test_schema1.test_table1 AS A", + ) + rule = Rule("workspace", "cat1", "test_schema1", "schema1", "test_view1", "dest_view1") + view_to_migrate = ViewToMigrate(view, rule) + migration_index = TableMigrationIndex( + [ + TableMigrationStatus("test_schema1", "test_table1", "cat1", "schema1", "dest_table1"), + TableMigrationStatus("test_schema1", "test_view1"), + ] + ) + + sql = view_to_migrate.sql_migrate_view(migration_index) + + assert sql == expected_query + + @pytest.fixture(scope="session") def samples() -> dict[str, dict[str, str]]: path = Path(Path(__file__).parent, "tables", "tables_and_views.json") From 15c553681a685ac3f968adf5ff5c3ac19aaf1fbd Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 25 Sep 2024 15:12:39 +0200 Subject: [PATCH 13/21] Bump astroid version, pylint version and drop our f-string workaround (#2746) ## Changes Our code works around a limitation of astroid < 3.3 where f-strings are not inferred This PR: - updates pylint and astroid - drops workarounds - fixes corresponding tests ### Linked issues None ### Functionality None ### Tests - [x] updated unit tests --------- Co-authored-by: Eric Vergnaud --- pyproject.toml | 11 +++++++---- .../ucx/source_code/python/python_infer.py | 18 ------------------ .../source_code/python/test_python_infer.py | 16 ---------------- .../functional/file-access/direct-fs2.py | 4 +--- ...string.py => sys-path-with-uninferrable.py} | 2 +- tests/unit/source_code/test_dependencies.py | 15 +++------------ tests/unit/source_code/test_notebook.py | 7 +++---- 7 files changed, 15 insertions(+), 58 deletions(-) rename tests/unit/source_code/samples/{sys-path-with-fstring.py => sys-path-with-uninferrable.py} (83%) diff --git a/pyproject.toml b/pyproject.toml index 66112a81f5..6ac648bdb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dependencies = ["databricks-sdk~=0.30", "databricks-labs-blueprint>=0.8,<0.9", "PyYAML>=6.0.0,<7.0.0", "sqlglot>=25.5.0,<25.23", - "astroid>=3.2.2"] + "astroid>=3.3.1"] [project.optional-dependencies] pylsp = [ @@ -74,7 +74,7 @@ dependencies = [ "black~=24.3.0", "coverage[toml]~=7.4.4", "mypy~=1.9.0", - "pylint~=3.2.2", + "pylint~=3.3.1", "pylint-pytest==2.0.0a0", "databricks-labs-pylint~=0.4.0", "databricks-labs-pytester>=0.2.1", @@ -209,7 +209,7 @@ fail-under = 10.0 # ignore-list. The regex matches against paths and can be in Posix or Windows # format. Because '\\' represents the directory delimiter on Windows systems, it # can't be used as an escape character. 
- ignore-paths='^tests/unit/source_code/samples/.*$' +ignore-paths='^tests/unit/source_code/samples/.*$' # Files or directories matching the regular expression patterns are skipped. The # regex matches against base names, not paths. The default value ignores Emacs @@ -587,7 +587,10 @@ disable = [ "fixme", "consider-using-assignment-expr", "logging-fstring-interpolation", - "consider-using-any-or-all" + "consider-using-any-or-all", + "too-many-positional-arguments", + "unnecessary-default-type-args", + "logging-not-lazy" ] # Enable the message, report, category or checker with the given id(s). You can diff --git a/src/databricks/labs/ucx/source_code/python/python_infer.py b/src/databricks/labs/ucx/source_code/python/python_infer.py index b1d79da641..f74c287ba9 100644 --- a/src/databricks/labs/ucx/source_code/python/python_infer.py +++ b/src/databricks/labs/ucx/source_code/python/python_infer.py @@ -64,10 +64,6 @@ def _infer_values(cls, node: NodeNG) -> Iterator[Iterable[NodeNG]]: # deal with node types that don't implement 'inferred()' if node is Uninferable or isinstance(node, Const): yield [node] - elif isinstance(node, JoinedStr): - yield from cls._infer_values_from_joined_string(node) - elif isinstance(node, FormattedValue): - yield from _LocalInferredValue.do_infer_values(node.value) else: yield from cls._safe_infer_internal(node) @@ -91,20 +87,6 @@ def _unsafe_infer_internal(cls, node: NodeNG): continue yield from _LocalInferredValue.do_infer_values(inferred) - @classmethod - def _infer_values_from_joined_string(cls, node: NodeNG) -> Iterator[Iterable[NodeNG]]: - assert isinstance(node, JoinedStr) - yield from cls._infer_values_from_joined_values(node.values) - - @classmethod - def _infer_values_from_joined_values(cls, nodes: list[NodeNG]) -> Iterator[Iterable[NodeNG]]: - if len(nodes) == 1: - yield from _LocalInferredValue.do_infer_values(nodes[0]) - return - for firsts in _LocalInferredValue.do_infer_values(nodes[0]): - for remains in cls._infer_values_from_joined_values(nodes[1:]): - yield list(firsts) + list(remains) - def __init__(self, atoms: Iterable[NodeNG]): self._atoms = list(atoms) diff --git a/tests/unit/source_code/python/test_python_infer.py b/tests/unit/source_code/python/test_python_infer.py index 38f3b4cb6f..9afc690a7d 100644 --- a/tests/unit/source_code/python/test_python_infer.py +++ b/tests/unit/source_code/python/test_python_infer.py @@ -88,22 +88,6 @@ def test_infers_fstring_values(): assert strings == ["Hello abc, ghi!", "Hello abc, jkl!", "Hello def, ghi!", "Hello def, jkl!"] -def test_fails_to_infer_cascading_fstring_values(): - # The purpose of this test is to detect a change in astroid support for f-strings - source = """ -value1 = "John" -value2 = f"Hello {value1}" -value3 = f"{value2}, how are you today?" -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[2].value) # value of value3 = ... - values = list(InferredValue.infer_from_node(tree.node)) - # for now, we simply check failure to infer! 
- assert any(not value.is_inferred() for value in values) - # the expected value would be ["Hello John, how are you today?"] - - def test_infers_externally_defined_value(): state = CurrentSessionState() state.named_parameters = {"my-widget": "my-value"} diff --git a/tests/unit/source_code/samples/functional/file-access/direct-fs2.py b/tests/unit/source_code/samples/functional/file-access/direct-fs2.py index c48a44c840..0be4ec4d7f 100644 --- a/tests/unit/source_code/samples/functional/file-access/direct-fs2.py +++ b/tests/unit/source_code/samples/functional/file-access/direct-fs2.py @@ -6,11 +6,9 @@ print(len(result)) index = 10 spark.sql(f"SELECT * FROM table_{index}").collect() -# ucx[cannot-autofix-table-reference:+2:0:+2:40] Can't migrate table_name argument in 'f'SELECT * FROM {table_name}'' because its value cannot be computed table_name = f"table_{index}" spark.sql(f"SELECT * FROM {table_name}").collect() -# ucx[table-migrated-to-uc:+4:4:+4:20] Table old.things is migrated to brand.new.stuff in Unity Catalog -# ucx[cannot-autofix-table-reference:+3:4:+3:20] Can't migrate table_name argument in 'query' because its value cannot be computed +# ucx[table-migrated-to-uc:+3:4:+3:20] Table old.things is migrated to brand.new.stuff in Unity Catalog table_name = f"table_{index}" for query in ["SELECT * FROM old.things", f"SELECT * FROM {table_name}"]: spark.sql(query).collect() diff --git a/tests/unit/source_code/samples/sys-path-with-fstring.py b/tests/unit/source_code/samples/sys-path-with-uninferrable.py similarity index 83% rename from tests/unit/source_code/samples/sys-path-with-fstring.py rename to tests/unit/source_code/samples/sys-path-with-uninferrable.py index 5aac77763b..ddbe6b17e0 100644 --- a/tests/unit/source_code/samples/sys-path-with-fstring.py +++ b/tests/unit/source_code/samples/sys-path-with-uninferrable.py @@ -1,7 +1,7 @@ import sys name_1 = "whatever" -name_2 = f"{name_1}" +name_2 = some_call(name_1) sys.path.append(f"{name_2}") names = [f"{name_2}", name_2] for name in names: diff --git a/tests/unit/source_code/test_dependencies.py b/tests/unit/source_code/test_dependencies.py index 449d071b74..e051aad349 100644 --- a/tests/unit/source_code/test_dependencies.py +++ b/tests/unit/source_code/test_dependencies.py @@ -199,24 +199,15 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So ] -def test_dependency_resolver_raises_problem_for_non_inferable_sys_path(simple_dependency_resolver): +def test_dependency_resolver_raises_problem_for_uninferrable_sys_path(simple_dependency_resolver): maybe = simple_dependency_resolver.build_local_file_dependency_graph( - Path("sys-path-with-fstring.py"), CurrentSessionState() + Path("sys-path-with-uninferrable.py"), CurrentSessionState() ) assert list(maybe.problems) == [ - DependencyProblem( - code='sys-path-cannot-compute-value', - message="Can't update sys.path from f'{name_2}' because the expression cannot be computed", - source_path=Path('sys-path-with-fstring.py'), - start_line=4, - start_col=16, - end_line=4, - end_col=27, - ), DependencyProblem( code='sys-path-cannot-compute-value', message="Can't update sys.path from name because the expression cannot be computed", - source_path=Path('sys-path-with-fstring.py'), + source_path=Path('sys-path-with-uninferrable.py'), start_line=7, start_col=20, end_line=7, diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index c542569f15..bda25a0106 100644 --- a/tests/unit/source_code/test_notebook.py +++ 
b/tests/unit/source_code/test_notebook.py @@ -249,7 +249,7 @@ def test_notebook_builds_python_dependency_graph_with_fstring_loop(mock_path_loo container = maybe.dependency.load(mock_path_lookup) assert container is not None container.build_dependency_graph(graph) - expected_paths = [path, "leaf1.py", "leaf3.py"] + expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"] all_paths = set(d.path for d in graph.all_dependencies) assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths} @@ -279,7 +279,7 @@ def test_does_not_detect_partial_call_to_dbutils_notebook_run_in_python_code_() assert len(nodes) == 0 -def test_raises_advice_when_dbutils_notebook_run_is_too_complex() -> None: +def test_lints_complex_dbutils_notebook_run() -> None: source = """ name1 = "John" name2 = f"{name1}" @@ -287,5 +287,4 @@ def test_raises_advice_when_dbutils_notebook_run_is_too_complex() -> None: """ linter = DbutilsPyLinter(CurrentSessionState()) advices = list(linter.lint(source)) - assert len(advices) == 1 - assert advices[0].code == "notebook-run-cannot-compute-value" + assert not advices From 2611b119f4b84c54c40c8043654a11491fd469fc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 15:13:11 +0200 Subject: [PATCH 14/21] Update databricks-labs-blueprint requirement from <0.9,>=0.8 to >=0.8,<0.10 (#2747) Updates the requirements on [databricks-labs-blueprint](https://github.com/databrickslabs/blueprint) to permit the latest version.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ac648bdb2..7afc837663 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ classifiers = [ dependencies = ["databricks-sdk~=0.30", "databricks-labs-lsql>=0.5,<0.13", - "databricks-labs-blueprint>=0.8,<0.9", + "databricks-labs-blueprint>=0.8,<0.10", "PyYAML>=6.0.0,<7.0.0", "sqlglot>=25.5.0,<25.23", "astroid>=3.3.1"] From 555e83aaf196cb5d0c51357f588e52ebe30d1cba Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 25 Sep 2024 18:05:13 +0200 Subject: [PATCH 15/21] Delete temporary files when running solacc (#2750) ## Changes solacc.py currently lints the entire solacc repo, thus accumulating temporary files to a point that exceeds CI storage capacity This PR fixes the issue by: - lint the repo on a per top-level solacc 'solution' (within the repo, top folders are independent of each other) - delete temp files and dirs registered in PathLookup after linting a solution This PR also prepares for improving false positive detection ### Linked issues None ### Functionality None ### Tests - [x] manually tested --------- Co-authored-by: Eric Vergnaud --- .../ucx/source_code/python/python_infer.py | 2 - tests/integration/source_code/solacc.py | 171 ++++++++++++------ tests/unit/source_code/test_notebook.py | 2 +- 3 files changed, 113 insertions(+), 62 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_infer.py b/src/databricks/labs/ucx/source_code/python/python_infer.py index f74c287ba9..6727745280 100644 --- a/src/databricks/labs/ucx/source_code/python/python_infer.py +++ b/src/databricks/labs/ucx/source_code/python/python_infer.py @@ -10,9 +10,7 @@ Const, decorators, Dict, - FormattedValue, Instance, - JoinedStr, Name, NodeNG, Uninferable, diff --git a/tests/integration/source_code/solacc.py b/tests/integration/source_code/solacc.py index 707c798ef5..e209b5a363 100644 --- a/tests/integration/source_code/solacc.py +++ b/tests/integration/source_code/solacc.py @@ -1,6 +1,8 @@ import logging import os +import shutil import sys +from dataclasses import dataclass, field from pathlib import Path import requests @@ -12,6 +14,7 @@ from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex from databricks.labs.ucx.source_code.base import LocatedAdvice from databricks.labs.ucx.source_code.linters.context import LinterContext +from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger("verify-accelerators") @@ -54,7 +57,7 @@ def collect_missing_imports(advices: list[LocatedAdvice]): return missing_imports -def collect_not_computed(advices: list[LocatedAdvice]): +def collect_uninferrable_count(advices: list[LocatedAdvice]): not_computed = 0 for located_advice in advices: if "computed" in located_advice.advice.message: @@ -68,93 +71,143 @@ def print_advices(advices: list[LocatedAdvice], file: Path): sys.stdout.write(f"{message}\n") -def lint_one(file: Path, ctx: LocalCheckoutContext, unparsed: Path | None) -> tuple[set[str], int, int]: +@dataclass +class SolaccContext: + unparsed_path: Path | None = None + files_to_skip: set[str] | None = None + total_count = 0 + parseable_count = 0 + uninferrable_count = 0 + missing_imports: dict[str, dict[str, int]] = field(default_factory=dict) + + @classmethod + def create(cls, for_all_dirs: bool): + unparsed_path: Path | None = None 
+ # if lint_all, recreate "solacc-unparsed.txt" + if for_all_dirs: + unparsed_path = Path(Path(__file__).parent, "solacc-unparsed.txt") + if unparsed_path.exists(): + os.remove(unparsed_path) + files_to_skip: set[str] | None = None + malformed = Path(__file__).parent / "solacc-malformed.txt" + if for_all_dirs and malformed.exists(): + lines = malformed.read_text(encoding="utf-8").split("\n") + files_to_skip = set(line for line in lines if len(line) > 0 and not line.startswith("#")) + return SolaccContext(unparsed_path=unparsed_path, files_to_skip=files_to_skip) + + def register_missing_import(self, missing_import: str): + prefix = missing_import.split(".")[0] + details = self.missing_imports.get(prefix, None) + if details is None: + details = {} + self.missing_imports[prefix] = details + count = details.get(missing_import, 0) + details[missing_import] = count + 1 + + def log_missing_imports(self): + missing_imports = dict( + sorted(self.missing_imports.items(), key=lambda item: sum(item[1].values()), reverse=True) + ) + for prefix, details in missing_imports.items(): + logger.info(f"Missing import '{prefix}'") + for item, count in details.items(): + logger.info(f" {item}: {count} occurrences") + + +def lint_one(solacc: SolaccContext, file: Path, ctx: LocalCheckoutContext) -> None: try: advices = list(ctx.local_code_linter.lint_path(file, set())) - missing_imports = collect_missing_imports(advices) - not_computed = collect_not_computed(advices) + solacc.parseable_count += 1 + for missing_import in collect_missing_imports(advices): + solacc.register_missing_import(missing_import) + solacc.uninferrable_count += collect_uninferrable_count(advices) print_advices(advices, file) - return missing_imports, 1, not_computed except Exception as e: # pylint: disable=broad-except # here we're most likely catching astroid & sqlglot errors - if unparsed is None: # linting single file, log exception details - logger.error(f"Error during parsing of {file}: {e}".replace("\n", " "), exc_info=e) - else: + # when linting single file, log exception details + logger.error( + f"Error during parsing of {file}: {e}".replace("\n", " "), + exc_info=e if solacc.unparsed_path is None else None, + ) + if solacc.unparsed_path: logger.error(f"Error during parsing of {file}: {e}".replace("\n", " ")) # populate solacc-unparsed.txt - with unparsed.open(mode="a", encoding="utf-8") as f: + with solacc.unparsed_path.open(mode="a", encoding="utf-8") as f: f.write(file.relative_to(dist).as_posix()) f.write("\n") - return set(), 0, 0 -def lint_all(file_to_lint: str | None): +class _CleanablePathLookup(PathLookup): + + def __init__(self): + super().__init__(Path.cwd(), [Path(path) for path in sys.path]) + self._original_sys_paths = set(self._sys_paths) + + def clean_tmp_sys_paths(self): + for path in self._sys_paths: + if path in self._original_sys_paths: + continue + if path.is_file(): + path.unlink() + if path.is_dir(): + shutil.rmtree(path) + + +def lint_dir(solacc: SolaccContext, soldir: Path, file_to_lint: str | None = None): + path_lookup = _CleanablePathLookup() ws = WorkspaceClient(host='...', token='...') ctx = LocalCheckoutContext(ws).replace( - linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state) + linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state), + path_lookup=path_lookup, ) - parseable = 0 - not_computed = 0 - missing_imports: dict[str, dict[str, int]] = {} - all_files = list(dist.glob('**/*.py')) if file_to_lint is None else 
[Path(dist, file_to_lint)] - unparsed: Path | None = None - if file_to_lint is None: - unparsed = Path(Path(__file__).parent, "solacc-unparsed.txt") - if unparsed.exists(): - os.remove(unparsed) - skipped: set[str] | None = None - malformed = Path(__file__).parent / "solacc-malformed.txt" - if file_to_lint is None and malformed.exists(): - lines = malformed.read_text(encoding="utf-8").split("\n") - skipped = set(line for line in lines if len(line) > 0 and not line.startswith("#")) + all_files = list(soldir.glob('**/*.py')) if file_to_lint is None else [Path(soldir, file_to_lint)] + solacc.total_count += len(all_files) for file in all_files: - if skipped and file.relative_to(dist).as_posix() in skipped: + if solacc.files_to_skip and file.relative_to(dist).as_posix() in solacc.files_to_skip: continue - _missing_imports, _parseable, _not_computed = lint_one(file, ctx, unparsed) - for _import in _missing_imports: - register_missing_import(missing_imports, _import) - parseable += _parseable - not_computed += _not_computed - all_files_len = len(all_files) - (len(skipped) if skipped else 0) - parseable_pct = int(parseable / all_files_len * 100) - missing_imports_count = sum(sum(details.values()) for details in missing_imports.values()) + lint_one(solacc, file, ctx) + # cleanup tmp dirs + path_lookup.clean_tmp_sys_paths() + + +def lint_file(file_to_lint: str): + solacc = SolaccContext.create(False) + file_path = Path(file_to_lint) + lint_dir(solacc, file_path.parent, file_path.name) + + +def lint_all(): + solacc = SolaccContext.create(True) + for soldir in os.listdir(dist): + lint_dir(solacc, dist / soldir) + all_files_len = solacc.total_count - (len(solacc.files_to_skip) if solacc.files_to_skip else 0) + parseable_pct = int(solacc.parseable_count / all_files_len * 100) + missing_imports_count = sum(sum(details.values()) for details in solacc.missing_imports.values()) logger.info( - f"Skipped: {len(skipped or [])}, parseable: {parseable_pct}% ({parseable}/{all_files_len}), missing imports: {missing_imports_count}, not computed: {not_computed}" + f"Skipped: {len(solacc.files_to_skip or [])}, " + f"parseable: {parseable_pct}% ({solacc.parseable_count}/{all_files_len}), " + f"missing imports: {missing_imports_count}, " + f"not computed: {solacc.uninferrable_count}" ) - log_missing_imports(missing_imports) + solacc.log_missing_imports() # fail the job if files are unparseable if parseable_pct < 100: sys.exit(1) -def register_missing_import(missing_imports: dict[str, dict[str, int]], missing_import: str): - prefix = missing_import.split(".")[0] - details = missing_imports.get(prefix, None) - if details is None: - details = {} - missing_imports[prefix] = details - count = details.get(missing_import, 0) - details[missing_import] = count + 1 - - -def log_missing_imports(missing_imports: dict[str, dict[str, int]]): - missing_imports = dict(sorted(missing_imports.items(), key=lambda item: sum(item[1].values()), reverse=True)) - for prefix, details in missing_imports.items(): - logger.info(f"Missing import '{prefix}'") - for item, count in details.items(): - logger.info(f" {item}: {count} occurrences") - - def main(args: list[str]): install_logger() logging.root.setLevel(logging.INFO) file_to_lint = args[1] if len(args) > 1 else None - if not file_to_lint: + if file_to_lint: # don't clone if linting just one file, assumption is we're troubleshooting - logger.info("Cloning...") - clone_all() + logger.info("Linting...") + lint_file(file_to_lint) + return + logger.info("Cloning...") + clone_all() 
logger.info("Linting...") - lint_all(file_to_lint) + lint_all() if __name__ == "__main__": diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index bda25a0106..b255a0e398 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -249,7 +249,7 @@ def test_notebook_builds_python_dependency_graph_with_fstring_loop(mock_path_loo container = maybe.dependency.load(mock_path_lookup) assert container is not None container.build_dependency_graph(graph) - expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"] + expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"] all_paths = set(d.path for d in graph.all_dependencies) assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths} From d7444652ad5e3be60c56a7cda9dc2fb346e17844 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 25 Sep 2024 18:05:58 +0200 Subject: [PATCH 16/21] Code format: `make fmt` (#2749) ## Changes This PR just includes changes from `make fmt`, which doesn't currently pass on `main`. ### Linked issues Updates #2746. From 04918be5772ac603812dfa79afa293deea1ce8e4 Mon Sep 17 00:00:00 2001 From: Serge Smertin <259697+nfx@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:06:28 +0200 Subject: [PATCH 17/21] Speedup assessment workflow by making DBFS root table size calculation parallel (#2745) We were not doing that before and now we do. --- .../labs/ucx/contexts/workflow_task.py | 2 +- .../labs/ucx/hive_metastore/table_size.py | 39 ++++++++++--------- .../labs/ucx/hive_metastore/tables.py | 2 +- .../labs/ucx/source_code/known.json | 3 +- tests/unit/hive_metastore/test_table_size.py | 2 +- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index f52bcbadb8..9e3cbabbad 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -71,7 +71,7 @@ def pipelines_crawler(self): @cached_property def table_size_crawler(self): - return TableSizeCrawler(self.sql_backend, self.inventory_database) + return TableSizeCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) @cached_property def policies_crawler(self): diff --git a/src/databricks/labs/ucx/hive_metastore/table_size.py b/src/databricks/labs/ucx/hive_metastore/table_size.py index 0106fbab48..aadb7576e5 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_size.py +++ b/src/databricks/labs/ucx/hive_metastore/table_size.py @@ -1,12 +1,15 @@ import logging from collections.abc import Iterable from dataclasses import dataclass +from functools import partial +from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.tables import Table logger = logging.getLogger(__name__) @@ -40,43 +43,43 @@ def _crawl(self) -> Iterable[TableSize]: """Crawls and lists tables using table crawler Identifies DBFS root tables and calculates the size for these. 
""" + tasks = [] for table in self._tables_crawler.snapshot(): if not table.kind == "TABLE": continue if not table.is_dbfs_root: continue - size_in_bytes = self._safe_get_table_size(table.key) - if size_in_bytes is None: - continue # table does not exist anymore or is corrupted - - yield TableSize( - catalog=table.catalog, database=table.database, name=table.name, size_in_bytes=size_in_bytes - ) + tasks.append(partial(self._safe_get_table_size, table)) + return Threads.strict('DBFS root table sizes', tasks) def _try_fetch(self) -> Iterable[TableSize]: """Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error""" for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"): yield TableSize(*row) - def _safe_get_table_size(self, table_full_name: str) -> int | None: - logger.debug(f"Evaluating {table_full_name} table size.") + def _safe_get_table_size(self, table: Table) -> TableSize | None: + logger.debug(f"Evaluating {table.key} table size.") try: # refresh table statistics to avoid stale stats in HMS - self._backend.execute(f"ANALYZE table {escape_sql_identifier(table_full_name)} compute STATISTICS NOSCAN") - # pylint: disable-next=protected-access - return self._spark._jsparkSession.table(table_full_name).queryExecution().analyzed().stats().sizeInBytes() + self._backend.execute(f"ANALYZE table {table.safe_sql_key} compute STATISTICS NOSCAN") + jvm_df = self._spark._jsparkSession.table(table.safe_sql_key) # pylint: disable=protected-access + size_in_bytes = jvm_df.queryExecution().analyzed().stats().sizeInBytes() + return TableSize( + catalog=table.catalog, + database=table.database, + name=table.name, + size_in_bytes=size_in_bytes, + ) except Exception as e: # pylint: disable=broad-exception-caught if "[TABLE_OR_VIEW_NOT_FOUND]" in str(e) or "[DELTA_TABLE_NOT_FOUND]" in str(e): - logger.warning(f"Failed to evaluate {table_full_name} table size. Table not found.") + logger.warning(f"Failed to evaluate {table.key} table size. Table not found.") return None if "[DELTA_INVALID_FORMAT]" in str(e): - logger.warning( - f"Unable to read Delta table {table_full_name}, please check table structure and try again." - ) + logger.warning(f"Unable to read Delta table {table.key}, please check table structure and try again.") return None if "[DELTA_MISSING_TRANSACTION_LOG]" in str(e): - logger.warning(f"Delta table {table_full_name} is corrupted: missing transaction log.") + logger.warning(f"Delta table {table.key} is corrupt: missing transaction log.") return None - logger.error(f"Failed to evaluate {table_full_name} table size: ", exc_info=True) + logger.error(f"Failed to evaluate {table.key} table size: ", exc_info=True) return None diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 5eff7a1815..0545b584e9 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -340,7 +340,7 @@ class MigrationCount: what_count: dict[What, int] -class TablesCrawler(CrawlerBase): +class TablesCrawler(CrawlerBase[Table]): def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None): """ Initializes a TablesCrawler instance. 
diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 93b7eafe36..b7ad275abc 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -1830,6 +1830,7 @@ "databricks-labs-ucx": { "databricks.labs.ucx": [] }, + "databricks-pydabs": {}, "databricks-sdk": { "databricks.sdk": [] }, @@ -29921,4 +29922,4 @@ "zipp.compat.py310": [], "zipp.glob": [] } -} +} \ No newline at end of file diff --git a/tests/unit/hive_metastore/test_table_size.py b/tests/unit/hive_metastore/test_table_size.py index 8216a0a780..e23d3a7a35 100644 --- a/tests/unit/hive_metastore/test_table_size.py +++ b/tests/unit/hive_metastore/test_table_size.py @@ -137,7 +137,7 @@ def test_table_size_when_table_corrupted(mocker, caplog): results = tsc.snapshot() assert len(results) == 0 - assert "Delta table hive_metastore.db1.table1 is corrupted: missing transaction log" in caplog.text + assert "Delta table hive_metastore.db1.table1 is corrupt: missing transaction log" in caplog.text def test_table_size_when_delta_invalid_format_error(mocker, caplog): From 0a03defb538396c601879fb96686b5e3e92823bb Mon Sep 17 00:00:00 2001 From: Cor Date: Thu, 26 Sep 2024 10:43:26 +0200 Subject: [PATCH 18/21] Harden configuration reading (#2701) ## Changes Harden configuration reading by verifying the type before reading the "value" using `.get` ### Linked issues Resolves #2581 (hopefully the second get is the issue, type hinting should cover that, but who knows) ### Functionality - [x] modified existing workflow: `assessment` ### Tests - [x] added unit tests --- src/databricks/labs/ucx/assessment/secrets.py | 11 +++++++-- tests/unit/assessment/test_secrets.py | 23 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 tests/unit/assessment/test_secrets.py diff --git a/src/databricks/labs/ucx/assessment/secrets.py b/src/databricks/labs/ucx/assessment/secrets.py index 715d6d82aa..2eb3c0143f 100644 --- a/src/databricks/labs/ucx/assessment/secrets.py +++ b/src/databricks/labs/ucx/assessment/secrets.py @@ -30,13 +30,20 @@ def _get_secret_if_exists(self, secret_scope, secret_key) -> str | None: ) return None - def _get_value_from_config_key(self, config: dict, key: str, get_secret: bool = True) -> str | None: + def _get_value_from_config_key( + self, + config: dict, + key: str, + get_secret: bool = True, + ) -> str | None: """Get a config value based on its key, with some special handling: If the key is prefixed with spark_conf, i.e. 
this is in a cluster policy, the actual value is nested If the value is of format {{secret_scope/secret}}, we extract that as well """ if re.search("spark_conf", key): - value = config.get(key, {}).get("value", "") + value = config.get(key, {}) + if isinstance(value, dict): + value = value.get("value", "") else: value = config.get(key, "") # retrieve from secret scope if used diff --git a/tests/unit/assessment/test_secrets.py b/tests/unit/assessment/test_secrets.py new file mode 100644 index 0000000000..555a3cd8bf --- /dev/null +++ b/tests/unit/assessment/test_secrets.py @@ -0,0 +1,23 @@ +import pytest + +from databricks.labs.ucx.assessment.secrets import SecretsMixin + + +@pytest.mark.parametrize( + "key,expected", + [ + ("spark_conf.spark.hadoop.javax.jdo.option.ConnectionURL", "url"), + ("NonExistentKey", ""), + ("spark_conf.invalid", "{'should_not': 'be_string'}"), + ], +) +def test_secrets_mixin_gets_value_from_config_key(key, expected) -> None: + config = { + "spark_conf.invalid": "{'should_not': 'be_string'}", + "spark_conf.spark.hadoop.javax.jdo.option.ConnectionURL": {"value": "url"}, + } + secrets_mixin = SecretsMixin() + + value = secrets_mixin._get_value_from_config_key(config, key) # pylint: disable=protected-access + + assert value == expected From be955ea7633f92ef16816afc14c1da432ce326bb Mon Sep 17 00:00:00 2001 From: Amin Movahed <140028681+aminmovahed-db@users.noreply.github.com> Date: Thu, 26 Sep 2024 18:51:03 +1000 Subject: [PATCH 19/21] Add unskip CLI command to undo a skip on schema or a table (#2734) ## Changes Add unskip CLI command to undo a skip on schema or a table ### Linked issues Resolves #1938 ### Functionality - [x] added relevant user documentation - [x] added new CLI command --> unskip ### Tests - [x] Unit test added --- README.md | 10 +++ src/databricks/labs/ucx/cli.py | 2 +- .../labs/ucx/hive_metastore/mapping.py | 53 ++++++++------- tests/unit/hive_metastore/test_mapping.py | 68 ++++++++++++++++++- 4 files changed, 106 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 7c5bf044c0..f15bd6c7ca 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`migrate-locations` command](#migrate-locations-command) * [`create-table-mapping` command](#create-table-mapping-command) * [`skip` command](#skip-command) + * [`unskip` command](#unskip-command) * [`create-catalogs-schemas` command](#create-catalogs-schemas-command) * [`migrate-tables` command](#migrate-tables-command) * [`revert-migrated-tables` command](#revert-migrated-tables-command) @@ -1466,6 +1467,15 @@ Once you're done with table migration, proceed to the [code migration](#code-mig [[back to top](#databricks-labs-ucx)] +## `unskip` command + +```commandline +databricks labs ucx unskip --schema X [--table Y] +``` +This command removes the mark set by the [`skip` command](#skip-command) on the given schema or table. 
+ +[[back to top](#databricks-labs-ucx)] + ## `create-catalogs-schemas` command ```text diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index e3f5d76756..19929794a7 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -101,7 +101,7 @@ def skip(w: WorkspaceClient, schema: str | None = None, table: str | None = None @ucx.command def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = None): - """Create a unskip comment on a schema or a table""" + """Unset the skip mark from a schema or a table""" logger.info("Running unskip command") if not schema: logger.error("--schema is a required parameter.") diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 94f33a97fe..3aef32ba20 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -135,24 +135,28 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) - def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): - # Removes skip mark from the table property + def unskip_table_or_view( + self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None] + ) -> None: + """Removes skip mark from the table property. + + Args: + schema_name (str): The schema name of the table to be unskipped. + table_name (str): The table name of the table to be unskipped. + load_table (Callable[[str, str], Table | None]): A function that loads a table from the metastore. + """ + table = load_table(schema_name, table_name) + if table is None: + logger.error( + f"Failed to remove skip marker from table: {schema_name}.{table_name}. Table not found.", + ) + return try: - table = load_table(schema_name, table_name) - if table is None: - raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") self._sql_backend.execute( - f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}' );" + f"ALTER {table.kind} {escape_sql_identifier(table.full_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" ) - except NotFound as err: - if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): - logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}. Table not found.") - else: - logger.error( - f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True - ) - except BadRequest as err: - logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + except (NotFound, BadRequest) as e: + logger.error(f"Failed to remove skip marker from table: {table.full_name}", exc_info=e) def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property @@ -168,19 +172,18 @@ def skip_schema(self, schema: str): except BadRequest as err: logger.error(err) - def unskip_schema(self, schema: str): - # Removes skip mark from the schema property + def unskip_schema(self, schema: str) -> None: + """Removes skip mark from the schema property. + + Args: + schema (str): The schema name of the table to be unskipped. 
+ """ try: self._sql_backend.execute( - f"ALTER SCHEMA {escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" + f"ALTER SCHEMA hive_metastore.{escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" ) - except NotFound as err: - if "[SCHEMA_NOT_FOUND]" in str(err): - logger.error(f"Failed to remove skip marker for Schema {schema}. Schema not found.") - else: - logger.error(err) - except BadRequest as err: - logger.error(err) + except (NotFound, BadRequest) as e: + logger.error(f"Failed to remove skip marker from schema: {schema}.", exc_info=e) def get_tables_to_migrate(self, tables_crawler: TablesCrawler) -> Collection[TableToMigrate]: rules = self.load() diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index 0ea2951131..e0ac9f56ad 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -6,7 +6,7 @@ from databricks.labs.blueprint.parallel import ManyError from databricks.labs.lsql.backends import MockBackend, SqlBackend from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound +from databricks.sdk.errors import NotFound, BadRequest from databricks.sdk.errors.platform import ResourceConflict from databricks.sdk.service.catalog import TableInfo @@ -211,6 +211,72 @@ def test_skip_happy_path(caplog): assert len(caplog.records) == 0 +def test_unskip_on_table() -> None: + ws = create_autospec(WorkspaceClient) + mock_backend = MockBackend() + installation = MockInstallation() + mapping = TableMapping(installation, ws, mock_backend) + table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv") + mapping.unskip_table_or_view(schema_name="schema", table_name="table", load_table=lambda _schema, _table: table) + ws.tables.get.assert_not_called() + assert ( + f"ALTER TABLE `catalog`.`schema`.`table` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) + + +def test_unskip_on_view() -> None: + ws = create_autospec(WorkspaceClient) + mock_backend = MockBackend() + installation = MockInstallation() + mapping = TableMapping(installation, ws, mock_backend) + view = Table( + catalog="catalog", database="schema", name="view", object_type="table", table_format="csv", view_text="stuff" + ) + mapping.unskip_table_or_view(schema_name="schema", table_name="view", load_table=lambda _schema, _table: view) + ws.tables.get.assert_not_called() + assert ( + f"ALTER VIEW `catalog`.`schema`.`view` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) + + +def test_unskip_on_schema() -> None: + ws = create_autospec(WorkspaceClient) + mock_backend = MockBackend() + installation = MockInstallation() + mapping = TableMapping(installation, ws, mock_backend) + mapping.unskip_schema(schema="schema") + ws.tables.get.assert_not_called() + assert ( + f"ALTER SCHEMA hive_metastore.`schema` UNSET DBPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) + + +def test_unskip_missing_table(caplog) -> None: + ws = create_autospec(WorkspaceClient) + sbe = create_autospec(SqlBackend) + sbe.execute.side_effect = NotFound("[TABLE_OR_VIEW_NOT_FOUND]") + installation = MockInstallation() + mapping = TableMapping(installation, ws, sbe) + mapping.unskip_table_or_view(schema_name='foo', table_name="table", load_table=lambda schema, table: None) + assert [rec.message for rec in caplog.records if 
"table not found" in rec.message.lower()] + ws.tables.get.assert_not_called() + + +def test_unskip_badrequest(caplog) -> None: + ws = create_autospec(WorkspaceClient) + sbe = create_autospec(SqlBackend) + sbe.execute.side_effect = BadRequest("[Bad command]") + installation = MockInstallation() + mapping = TableMapping(installation, ws, sbe) + table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv") + mapping.unskip_table_or_view(schema_name="schema", table_name="table", load_table=lambda _schema, _table: table) + assert [rec.message for rec in caplog.records if "failed to remove skip marker " in rec.message.lower()] + ws.tables.get.assert_not_called() + + def test_skip_missing_schema(caplog): ws = create_autospec(WorkspaceClient) sbe = create_autospec(SqlBackend) From 788c2737f86c2a5606c60e37f0e32c3713731224 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 26 Sep 2024 14:35:42 +0200 Subject: [PATCH 20/21] Improve solacc linting (#2752) ## Changes `solacc` currently lints on a per-file basis, which is incorrect this PR implements linting a per solution basis, thus improving dependency resolution ### Linked issues None ### Functionality None ### Tests - [x] manually tested --------- Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/base.py | 3 +- tests/integration/source_code/solacc.py | 101 +++++++++----------- 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 9052ed9545..4e22e74a1a 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -97,7 +97,8 @@ def message_relative_to(self, base: Path, *, default: Path | None = None) -> str if default is not None: path = default path = path.relative_to(base) - return f"./{path.as_posix()}:{advice.start_line}:{advice.start_col}: [{advice.code}] {advice.message}" + # increment start_line because it is 0-based whereas IDEs are usually 1-based + return f"./{path.as_posix()}:{advice.start_line+1}:{advice.start_col}: [{advice.code}] {advice.message}" class Advisory(Advice): diff --git a/tests/integration/source_code/solacc.py b/tests/integration/source_code/solacc.py index e209b5a363..9ae1fdb748 100644 --- a/tests/integration/source_code/solacc.py +++ b/tests/integration/source_code/solacc.py @@ -22,7 +22,7 @@ dist = (this_file / '../../../../dist').resolve().absolute() -def clone_all(): +def _clone_all(): params = {'per_page': 100, 'page': 1} to_clone = [] while True: @@ -49,7 +49,7 @@ def clone_all(): run_command(f'git clone {url} {dst}') -def collect_missing_imports(advices: list[LocatedAdvice]): +def _collect_missing_imports(advices: list[LocatedAdvice]): missing_imports: set[str] = set() for located_advice in advices: if located_advice.advice.code == 'import-not-found': @@ -57,7 +57,7 @@ def collect_missing_imports(advices: list[LocatedAdvice]): return missing_imports -def collect_uninferrable_count(advices: list[LocatedAdvice]): +def _collect_uninferrable_count(advices: list[LocatedAdvice]): not_computed = 0 for located_advice in advices: if "computed" in located_advice.advice.message: @@ -65,15 +65,19 @@ def collect_uninferrable_count(advices: list[LocatedAdvice]): return not_computed -def print_advices(advices: list[LocatedAdvice], file: Path): +def _collect_unparseable(advices: list[LocatedAdvice]): + return set(located_advice for located_advice in advices if located_advice.advice.code == 'parse-error') + + 
+def _print_advices(advices: list[LocatedAdvice]): for located_advice in advices: - message = located_advice.message_relative_to(dist.parent, default=file) + message = located_advice.message_relative_to(dist.parent) sys.stdout.write(f"{message}\n") @dataclass -class SolaccContext: - unparsed_path: Path | None = None +class _SolaccContext: + unparsed_files_path: Path | None = None files_to_skip: set[str] | None = None total_count = 0 parseable_count = 0 @@ -93,7 +97,7 @@ def create(cls, for_all_dirs: bool): if for_all_dirs and malformed.exists(): lines = malformed.read_text(encoding="utf-8").split("\n") files_to_skip = set(line for line in lines if len(line) > 0 and not line.startswith("#")) - return SolaccContext(unparsed_path=unparsed_path, files_to_skip=files_to_skip) + return _SolaccContext(unparsed_files_path=unparsed_path, files_to_skip=files_to_skip) def register_missing_import(self, missing_import: str): prefix = missing_import.split(".")[0] @@ -114,29 +118,6 @@ def log_missing_imports(self): logger.info(f" {item}: {count} occurrences") -def lint_one(solacc: SolaccContext, file: Path, ctx: LocalCheckoutContext) -> None: - try: - advices = list(ctx.local_code_linter.lint_path(file, set())) - solacc.parseable_count += 1 - for missing_import in collect_missing_imports(advices): - solacc.register_missing_import(missing_import) - solacc.uninferrable_count += collect_uninferrable_count(advices) - print_advices(advices, file) - except Exception as e: # pylint: disable=broad-except - # here we're most likely catching astroid & sqlglot errors - # when linting single file, log exception details - logger.error( - f"Error during parsing of {file}: {e}".replace("\n", " "), - exc_info=e if solacc.unparsed_path is None else None, - ) - if solacc.unparsed_path: - logger.error(f"Error during parsing of {file}: {e}".replace("\n", " ")) - # populate solacc-unparsed.txt - with solacc.unparsed_path.open(mode="a", encoding="utf-8") as f: - f.write(file.relative_to(dist).as_posix()) - f.write("\n") - - class _CleanablePathLookup(PathLookup): def __init__(self): @@ -153,33 +134,46 @@ def clean_tmp_sys_paths(self): shutil.rmtree(path) -def lint_dir(solacc: SolaccContext, soldir: Path, file_to_lint: str | None = None): +def _lint_dir(solacc: _SolaccContext, soldir: Path): path_lookup = _CleanablePathLookup() ws = WorkspaceClient(host='...', token='...') ctx = LocalCheckoutContext(ws).replace( linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state), path_lookup=path_lookup, ) - all_files = list(soldir.glob('**/*.py')) if file_to_lint is None else [Path(soldir, file_to_lint)] + all_files = list(soldir.glob('**/*.py')) + list(soldir.glob('**/*.sql')) solacc.total_count += len(all_files) - for file in all_files: - if solacc.files_to_skip and file.relative_to(dist).as_posix() in solacc.files_to_skip: - continue - lint_one(solacc, file, ctx) + # pre-populate linted_files such that files to skip are not linted + files_to_skip = set(solacc.files_to_skip) if solacc.files_to_skip else set() + linted_files = set(files_to_skip) + # lint solution + advices = list(ctx.local_code_linter.lint_path(soldir, linted_files)) + # collect unparseable files + unparseables = _collect_unparseable(advices) + solacc.parseable_count += len(linted_files) - len(files_to_skip) - len(set(advice.path for advice in unparseables)) + if solacc.unparsed_files_path: + for unparseable in unparseables: + logger.error(f"Error during parsing of {unparseable.path}: {unparseable.advice.message}".replace("\n", 
" ")) + # populate solacc-unparsed.txt + with solacc.unparsed_files_path.open(mode="a", encoding="utf-8") as f: + f.write(unparseable.path.relative_to(dist).as_posix()) + f.write("\n") + # collect missing imports + for missing_import in _collect_missing_imports(advices): + solacc.register_missing_import(missing_import) + # collect uninferrable + solacc.uninferrable_count += _collect_uninferrable_count(advices) + # display advices + _print_advices(advices) # cleanup tmp dirs path_lookup.clean_tmp_sys_paths() -def lint_file(file_to_lint: str): - solacc = SolaccContext.create(False) - file_path = Path(file_to_lint) - lint_dir(solacc, file_path.parent, file_path.name) - - -def lint_all(): - solacc = SolaccContext.create(True) - for soldir in os.listdir(dist): - lint_dir(solacc, dist / soldir) +def _lint_dirs(dir_to_lint: str | None): + solacc = _SolaccContext.create(dir_to_lint is not None) + all_dirs = os.listdir(dist) if dir_to_lint is None else [dir_to_lint] + for soldir in all_dirs: + _lint_dir(solacc, dist / soldir) all_files_len = solacc.total_count - (len(solacc.files_to_skip) if solacc.files_to_skip else 0) parseable_pct = int(solacc.parseable_count / all_files_len * 100) missing_imports_count = sum(sum(details.values()) for details in solacc.missing_imports.values()) @@ -198,16 +192,13 @@ def lint_all(): def main(args: list[str]): install_logger() logging.root.setLevel(logging.INFO) - file_to_lint = args[1] if len(args) > 1 else None - if file_to_lint: + dir_to_lint = args[1] if len(args) > 1 else None + if not dir_to_lint: # don't clone if linting just one file, assumption is we're troubleshooting - logger.info("Linting...") - lint_file(file_to_lint) - return - logger.info("Cloning...") - clone_all() + logger.info("Cloning...") + _clone_all() logger.info("Linting...") - lint_all() + _lint_dirs(dir_to_lint) if __name__ == "__main__": From 1d12391723f451e76b6bb735f28a063d98143a7f Mon Sep 17 00:00:00 2001 From: Andres Garcia Date: Thu, 26 Sep 2024 09:04:54 -0600 Subject: [PATCH 21/21] Sync Fork, make fmt test --- src/databricks/labs/ucx/assessment/export.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/export.py b/src/databricks/labs/ucx/assessment/export.py index 113dd3bd4d..987132e02e 100644 --- a/src/databricks/labs/ucx/assessment/export.py +++ b/src/databricks/labs/ucx/assessment/export.py @@ -11,8 +11,6 @@ class AssessmentExporter: - # File and Path Constants - _EXPORT_FILE_NAME = "ucx_assessment_results.zip" def __init__(self, sql_backend: SqlBackend, config: WorkspaceConfig): self._sql_backend = sql_backend