From ae5832d76b30ca1bf1ca349be6f0b09442b95fa4 Mon Sep 17 00:00:00 2001 From: Ricardo Portilla Date: Thu, 26 Sep 2024 09:30:59 -0400 Subject: [PATCH 01/21] make fmt run --- src/databricks/labs/ucx/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index d3ae23fdca..c6242090e2 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -19,7 +19,6 @@ from databricks.labs.ucx.install import AccountInstaller from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter -from databricks.labs.ucx.assessment.export import AssessmentExporter ucx = App(__file__) logger = get_logger(__file__) @@ -681,6 +680,7 @@ def join_collection(a: AccountClient, workspace_ids: str): w_ids = [int(_.strip()) for _ in workspace_ids.split(",") if _] account_installer.join_collection(w_ids) + @ucx.command def upload( file: Path | str, From 204ab4a203c77c6507cee11f511b5dde7180d16e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 24 Sep 2024 12:18:11 +0200 Subject: [PATCH 02/21] Minor contributor documentation changes (#2729) ## Changes This PR makes some minor changes to the the contributor documentation: - Hatch/env setup tweak to keep IntelliJ/PyCharm happy. (For some reason if the full path isn't specified IntelliJ can have problems locating the python interpreter for the venv.) - ~Add in the linting step.~ --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 48b00fc086..3baf859c77 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -259,7 +259,7 @@ make dev To use different python version, specify it in `HATCH_PYTHON` variable: ```shell -HATCH_PYTHON=python3.10 make clean dev test +HATCH_PYTHON="$(which python3.10)" make clean dev test ``` Configure your IDE to use `.venv/bin/python` from the virtual environment when developing the project: From 0a6e72f9e60fef6a0cad4009abde6b0e581c82ae Mon Sep 17 00:00:00 2001 From: Cor Date: Tue, 24 Sep 2024 12:20:42 +0200 Subject: [PATCH 03/21] Handle `PermissionDenied` when listing accessible workspaces (#2733) ## Changes Handle `PermissionDenied` when listing accessible workspaces ### Linked issues Resolves #2732 ### Functionality - [x] modified existing command: that user the accesible workspaces (most account level commands) ### Tests - [ ] manually tested - [x] added unit tests --- src/databricks/labs/ucx/account/workspaces.py | 27 +++++---- tests/unit/account/test_workspaces.py | 57 ++++++++++++++++++- 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/account/workspaces.py b/src/databricks/labs/ucx/account/workspaces.py index 6e9be51721..b15ecd78ae 100644 --- a/src/databricks/labs/ucx/account/workspaces.py +++ b/src/databricks/labs/ucx/account/workspaces.py @@ -98,20 +98,25 @@ def get_accessible_workspaces(self) -> list[Workspace]: return accessible_workspaces def can_administer(self, workspace: Workspace) -> bool: + """Evaluate if the user can administer a workspace. + + A user can administer a workspace if the user can access the workspace and is a member of the workspace "admins" + group. + + Args: + workspace (Workspace): The workspace to check if the user can administer. + + Returns: + bool: True if the user can administer the workspace, False otherwise. 
+ """ try: - # check if user has access to workspace ws = self.client_for(workspace) - except (PermissionDenied, NotFound, ValueError) as err: - logger.warning(f"{workspace.deployment_name}: Encounter error {err}. Skipping...") - return False - current_user = ws.current_user.me() - if current_user.groups is None: + current_user = ws.current_user.me() + except (PermissionDenied, NotFound, ValueError) as e: + logger.warning(f"User cannot access workspace: {workspace.deployment_name}", exc_info=e) return False - # check if user is a workspace admin - if "admins" not in [g.display for g in current_user.groups]: - logger.warning( - f"{workspace.deployment_name}: User {current_user.user_name} is not a workspace admin. Skipping..." - ) + if current_user.groups is None or "admins" not in {g.display for g in current_user.groups}: + logger.warning(f"User '{current_user.user_name}' is not a workspace admin: {workspace.deployment_name}") return False return True diff --git a/tests/unit/account/test_workspaces.py b/tests/unit/account/test_workspaces.py index d6862bf801..10023b5b92 100644 --- a/tests/unit/account/test_workspaces.py +++ b/tests/unit/account/test_workspaces.py @@ -1,3 +1,4 @@ +import logging import io import json from unittest.mock import create_autospec @@ -6,7 +7,7 @@ from databricks.labs.blueprint.installation import Installation, MockInstallation from databricks.labs.blueprint.tui import MockPrompts from databricks.sdk import AccountClient, WorkspaceClient -from databricks.sdk.errors import NotFound, ResourceConflict +from databricks.sdk.errors import NotFound, PermissionDenied, ResourceConflict from databricks.sdk.service import iam from databricks.sdk.service.iam import ComplexValue, Group, ResourceMeta, User from databricks.sdk.service.provisioning import Workspace @@ -494,3 +495,57 @@ def get_workspace_client(workspace) -> WorkspaceClient: acc.config.auth_type = "databricks-cli" account_workspaces = AccountWorkspaces(acc) assert len(account_workspaces.get_accessible_workspaces()) == 1 + + +def test_account_workspaces_can_administer_when_user_in_admins_group() -> None: + acc = create_autospec(AccountClient) + ws = create_autospec(WorkspaceClient) + acc.get_workspace_client.return_value = ws + ws.current_user.me.return_value = User(user_name="test", groups=[ComplexValue(display="admins")]) + account_workspaces = AccountWorkspaces(acc) + workspace = Workspace(deployment_name="test") + + assert account_workspaces.can_administer(workspace) + + +@pytest.mark.parametrize("groups", [[ComplexValue(display="not-admins")], None]) +def test_account_workspaces_cannot_administer_when_user_not_in_admins_group(caplog, groups) -> None: + acc = create_autospec(AccountClient) + ws = create_autospec(WorkspaceClient) + acc.get_workspace_client.return_value = ws + ws.current_user.me.return_value = User(user_name="test", groups=groups) + account_workspaces = AccountWorkspaces(acc) + workspace = Workspace(deployment_name="test") + + with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.workspaces"): + can_administer = account_workspaces.can_administer(workspace) + assert not can_administer + assert "User 'test' is not a workspace admin: test" in caplog.messages + + +def test_account_workspaces_can_administer_handles_not_found_error_for_get_workspace_client(caplog) -> None: + acc = create_autospec(AccountClient) + acc.get_workspace_client.side_effect = NotFound + account_workspaces = AccountWorkspaces(acc) + workspace = Workspace(deployment_name="test") + + with 
caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.workspaces"): + can_administer = account_workspaces.can_administer(workspace) + assert not can_administer + assert "User cannot access workspace: test" in caplog.messages + + +def test_account_workspaces_can_administer_handles_permission_denied_error_for_current_user(caplog) -> None: + acc = create_autospec(AccountClient) + ws = create_autospec(WorkspaceClient) + acc.get_workspace_client.return_value = ws + ws.current_user.me.side_effect = PermissionDenied( + "This API is disabled for users without the databricks-sql-access or workspace-access entitlements" + ) + account_workspaces = AccountWorkspaces(acc) + workspace = Workspace(deployment_name="test") + + with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.workspaces"): + can_administer = account_workspaces.can_administer(workspace) + assert not can_administer + assert "User cannot access workspace: test" in caplog.messages From 93d496e7751ca5362c0a680a33e72862ba1772d2 Mon Sep 17 00:00:00 2001 From: Amin Movahed <140028681+aminmovahed-db@users.noreply.github.com> Date: Tue, 24 Sep 2024 21:17:55 +1000 Subject: [PATCH 04/21] Adding unskip CLI command to undo a skip on schema or a table (#2727) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🔧 This is still in progress... ## Changes ### Linked issues Resolves #1938 ### Functionality - [ ] added relevant user documentation - [x] added new CLI command --> unskip ### Tests - [ ] manually tested - [ ] added unit tests - [ ] added integration tests --- src/databricks/labs/ucx/cli.py | 13 ++++++++ .../labs/ucx/hive_metastore/mapping.py | 33 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index c6242090e2..ef136d3893 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -99,6 +99,19 @@ def skip(w: WorkspaceClient, schema: str | None = None, table: str | None = None return ctx.table_mapping.skip_schema(schema) +@ucx.command +def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = None): + """Create a unskip comment on a schema or a table""" + logger.info("Running unskip command") + if not schema: + logger.error("--schema is a required parameter.") + return None + ctx = WorkspaceContext(w) + if table: + return ctx.table_mapping.unskip_table_or_view(schema, table, ctx.tables_crawler.load_one) + return ctx.table_mapping.unskip_schema(schema) + + @ucx.command(is_account=True) def sync_workspace_info(a: AccountClient): """upload workspace config to all workspaces in the account where ucx is installed""" diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 655172e388..94f33a97fe 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -135,6 +135,25 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): + # Removes skip mark from the table property + try: + table = load_table(schema_name, table_name) + if table is None: + raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") + self._sql_backend.execute( + f"ALTER {table.kind} 
{escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}' );" + ) + except NotFound as err: + if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): + logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}. Table not found.") + else: + logger.error( + f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True + ) + except BadRequest as err: + logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property try: @@ -149,6 +168,20 @@ def skip_schema(self, schema: str): except BadRequest as err: logger.error(err) + def unskip_schema(self, schema: str): + # Removes skip mark from the schema property + try: + self._sql_backend.execute( + f"ALTER SCHEMA {escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" + ) + except NotFound as err: + if "[SCHEMA_NOT_FOUND]" in str(err): + logger.error(f"Failed to remove skip marker for Schema {schema}. Schema not found.") + else: + logger.error(err) + except BadRequest as err: + logger.error(err) + def get_tables_to_migrate(self, tables_crawler: TablesCrawler) -> Collection[TableToMigrate]: rules = self.load() # Getting all the source tables from the rules From ee201129228c0d8cfeb7dc59636a3d6cfdabc068 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 17:00:16 +0200 Subject: [PATCH 05/21] Fix failing integration tests that perform a real assessment (#2736) ## Changes Ensure 'assessment' workflow only runs minimal assessment in integration tests ### Linked issues None ### Functionality None ### Tests - [x] changed integration tests Co-authored-by: Eric Vergnaud --- tests/integration/assessment/test_ext_hms.py | 14 +++------- .../integration/assessment/test_workflows.py | 14 +++++----- tests/integration/conftest.py | 27 +++++++++++++++++++ 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py index 805ab13c06..44b8e45fd4 100644 --- a/tests/integration/assessment/test_ext_hms.py +++ b/tests/integration/assessment/test_ext_hms.py @@ -1,6 +1,5 @@ import dataclasses import datetime as dt -import io from databricks.labs.lsql.backends import CommandExecutionBackend from databricks.sdk.service.iam import PermissionLevel @@ -9,12 +8,11 @@ def test_running_real_assessment_job_ext_hms( ws, installation_ctx, + product_info, env_or_skip, make_cluster_policy, make_cluster_policy_permissions, - make_notebook, - make_job, - make_dashboard, + populate_for_linting, ): cluster_id = env_or_skip('TEST_EXT_HMS_CLUSTER_ID') ext_hms_ctx = installation_ctx.replace( @@ -41,14 +39,10 @@ def test_running_real_assessment_job_ext_hms( ext_hms_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] ext_hms_ctx.workspace_installation.run() + populate_for_linting(installation_ctx.installation) + # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available # via the integration pool). Allow some margin to reduce spurious failures. 
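# ---------------------------------------------------------------------------
# Editorial sketch, not part of any patch in this series: a minimal illustration
# of the statements behind patch 04's `unskip` helpers shown above. The real
# implementation (TableMapping in databricks.labs.ucx.hive_metastore.mapping)
# also handles views and identifier escaping; the property name below is an
# assumed placeholder for TableMapping.UCX_SKIP_PROPERTY.
# ---------------------------------------------------------------------------
UCX_SKIP_PROPERTY = "databricks.labs.ucx.skip"  # assumed value, for illustration only


def unskip_statement(schema: str, table: str | None = None) -> str:
    """Build the ALTER statement that removes the UCX skip marker from a table or schema."""
    if table:
        return f"ALTER TABLE `{schema}`.`{table}` UNSET TBLPROPERTIES IF EXISTS ('{UCX_SKIP_PROPERTY}')"
    return f"ALTER SCHEMA `{schema}` UNSET DBPROPERTIES IF EXISTS ('{UCX_SKIP_PROPERTY}')"


assert "UNSET TBLPROPERTIES" in unskip_statement("sales", "orders")
assert "UNSET DBPROPERTIES" in unskip_statement("sales")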
- notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) - job = make_job(notebook_path=notebook_path) - installation_ctx.config.include_job_ids = [job.job_id] - - dashboard = make_dashboard() - installation_ctx.config.include_dashboard_ids = [dashboard.id] ext_hms_ctx.deployed_workflows.run_workflow("assessment", max_wait=dt.timedelta(minutes=25)) # assert the workflow is successful. the tasks on sql warehouse will fail so skip checking them diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index bb1e1576a7..7955e93c69 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -1,4 +1,3 @@ -import io from datetime import timedelta from databricks.sdk.errors import NotFound, InvalidParameterValue @@ -8,7 +7,11 @@ @retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) def test_running_real_assessment_job( - ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions, make_job, make_notebook, make_dashboard + ws, + installation_ctx, + make_cluster_policy, + make_cluster_policy_permissions, + populate_for_linting, ): ws_group, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() @@ -20,12 +23,7 @@ def test_running_real_assessment_job( installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] installation_ctx.workspace_installation.run() - notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) - job = make_job(notebook_path=notebook_path) - installation_ctx.config.include_job_ids = [job.job_id] - - dashboard = make_dashboard() - installation_ctx.config.include_dashboard_ids = [dashboard.id] + populate_for_linting(installation_ctx.installation) installation_ctx.deployed_workflows.run_workflow("assessment") assert installation_ctx.deployed_workflows.validate_step("assessment") diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a3c5bfe912..90204c9d26 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,3 +1,4 @@ +import io import json from collections.abc import Callable, Generator import functools @@ -9,11 +10,15 @@ from functools import cached_property import shutil import subprocess +from pathlib import Path + import pytest # pylint: disable=wrong-import-order +import yaml from databricks.labs.blueprint.commands import CommandExecutor from databricks.labs.blueprint.entrypoint import is_in_debug from databricks.labs.blueprint.installation import Installation, MockInstallation from databricks.labs.blueprint.parallel import Threads +from databricks.labs.blueprint.paths import WorkspacePath from databricks.labs.blueprint.tui import MockPrompts from databricks.labs.blueprint.wheels import ProductInfo from databricks.labs.lsql.backends import SqlBackend @@ -1175,3 +1180,25 @@ def _run(command: str) -> str: except ValueError as err: logger.debug(f"pytest_ignore_collect: error: {err}") return False + + +@pytest.fixture +def populate_for_linting(ws, make_random, make_job, make_notebook, make_query, make_dashboard, watchdog_purge_suffix): + def populate_workspace(installation): + # keep linting scope to minimum to avoid test timeouts + path = Path(installation.install_folder()) / f"dummy-{make_random(4)}-{watchdog_purge_suffix}" + notebook_path = make_notebook(path=path, content=io.BytesIO(b"spark.read.parquet('dbfs://mnt/foo/bar')")) + job = make_job(notebook_path=notebook_path) + query = 
make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo/bar`') + dashboard = make_dashboard(query=query) + # can't use installation.load(WorkspaceConfig)/installation.save() because they populate empty credentials + config_path = WorkspacePath(ws, installation.install_folder()) / "config.yml" + text = config_path.read_text() + config = yaml.safe_load(text) + config["include_job_ids"] = [job.job_id] + config["include_dashboard_ids"] = [dashboard.id] + text = yaml.dump(config) + config_path.unlink() + config_path.write_text(text) + + return populate_workspace From 2f62a0f2d4890690a3e1987d6b9e942e4217df42 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan <105197202+HariGS-DB@users.noreply.github.com> Date: Tue, 24 Sep 2024 18:09:15 +0100 Subject: [PATCH 06/21] Update documentation to explain the usage of collections and eligible commands (#2738) ## Changes Updated the doc to explain the concept of collection and how to use the join-collection cmd and the related collection-eligible commands ### Functionality - [ ] added relevant user documentation --- README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/README.md b/README.md index 60b842a9c4..2893f4f894 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,8 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`revert-cluster-remap` command](#revert-cluster-remap-command) * [`upload` command](#upload-command) * [`download` command](#download-command) + * [`join-collection` command](#join-collection command) + * [collection eligible command](#collection-eligible-command) * [Common Challenges and the Solutions](#common-challenges-and-the-solutions) * [Network Connectivity Issues](#network-connectivity-issues) * [Insufficient Privileges](#insufficient-privileges) @@ -1807,6 +1809,40 @@ $ databricks labs ucx download --file --run-as-collection True Download a csv file from a single workspace (`--run-as-collection False`) or a collection of workspaces (`--run-as-collection True`). This command is especially useful when downloading the same file from multiple workspaces. +## `join-collection` command + +```text +$ databricks labs ucx join-collection --workspace-ids --profile +``` + +`join-collection` command joins 2 or more workspaces into a collection. This helps in running supported cli commands as a collection +`join-collection` command updates config.yml file on each workspace ucx installation with installed_workspace_ids attribute. +In order to run `join-collectioon` command a user should: + - be an Account admin on the Databricks account + - be a Workspace admin on all the workspaces to be joined as a collection) or a collection of workspaces + - have installed UCX on the workspace +The `join-collection` command will fail and throw an error msg if the above conditions are not met. + +## collection eligible command + +Once `join-collection` command is run, it allows user to run multiple cli commands as a collection. The following cli commands +are eligible to be run as a collection. 
User can run the below commands as collection by passing an additional flag `--run-as-collection=True` +- `ensure-assessment-run` +- `create-table-mapping` +- `principal-prefix-access` +- `migrate-credentials` +- `create-uber-principal` +- `create-missing-principals` +- `validate-external-location` +- `migrate-locations` +- `create-catalog-schemas` +- `migrate-tables` +- `migrate-acls` +- `migrate-dbsql-dashboards` +- `validate-group-membership` +Ex: `databricks labs ucx ensure-assessment-run --run-as-collection=True` + + # Common Challenges and the Solutions Users might encounter some challenges while installing and executing UCX. Please find the listing of some common challenges and the solutions below. From 8fee9d853f1c3604b9e7f9d4f4dbdf3efc9cb495 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan <105197202+HariGS-DB@users.noreply.github.com> Date: Tue, 24 Sep 2024 19:35:15 +0100 Subject: [PATCH 07/21] Enables cli cmd `databricks labs ucx create-catalog-schemas` to apply catalog/schema acl from legacy hive_metastore (#2676) ## Changes ### Linked issues Resolves #2514 ### Functionality - [ ] modified existing command: `databricks labs ucx create-catalog-schemas` ### Tests - [ ] added unit tests - [ ] added integration tests --- .../labs/ucx/contexts/application.py | 8 +++- .../labs/ucx/hive_metastore/catalog_schema.py | 40 +++++++++++++++++-- .../hive_metastore/test_catalog_schema.py | 29 +++++++++++++- 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 1bb030c1fa..59517fdf12 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -353,7 +353,13 @@ def table_mapping(self): @cached_property def catalog_schema(self): - return CatalogSchema(self.workspace_client, self.table_mapping, self.principal_acl, self.sql_backend) + return CatalogSchema( + self.workspace_client, + self.table_mapping, + self.principal_acl, + self.sql_backend, + self.grants_crawler, + ) @cached_property def verify_timeout(self): diff --git a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py index 9da944ba2f..6802ad5dc5 100644 --- a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py +++ b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py @@ -6,7 +6,7 @@ from databricks.labs.blueprint.tui import Prompts from databricks.labs.lsql.backends import SqlBackend -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant, GrantsCrawler from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service.catalog import SchemaInfo @@ -19,13 +19,19 @@ class CatalogSchema: def __init__( - self, ws: WorkspaceClient, table_mapping: TableMapping, principal_grants: PrincipalACL, sql_backend: SqlBackend + self, + ws: WorkspaceClient, + table_mapping: TableMapping, + principal_grants: PrincipalACL, + sql_backend: SqlBackend, + grants_crawler: GrantsCrawler, ): self._ws = ws self._table_mapping = table_mapping self._external_locations = self._ws.external_locations.list() self._principal_grants = principal_grants self._backend = sql_backend + self._hive_grants_crawler = grants_crawler def create_all_catalogs_schemas(self, prompts: Prompts) -> None: candidate_catalogs, candidate_schemas = self._get_missing_catalogs_schemas() @@ -46,10 +52,22 @@ def 
create_all_catalogs_schemas(self, prompts: Prompts) -> None: f"Schema {candidate_schema} in catalog {candidate_catalog} " f"already exists. Skipping." ) continue + self._apply_from_legacy_table_acls() self._update_principal_acl() + def _apply_from_legacy_table_acls(self): + grants = self._get_catalog_schema_hive_grants() + for grant in grants: + acl_migrate_sql = grant.uc_grant_sql() + if acl_migrate_sql is None: + logger.warning(f"Cannot identify UC grant for {grant.this_type_and_key()}. Skipping.") + continue + logger.debug(f"Migrating acls on {grant.this_type_and_key()} using SQL query: {acl_migrate_sql}") + self._backend.execute(acl_migrate_sql) + def _update_principal_acl(self): - grants = self._get_catalog_schema_grants() + + grants = self._get_catalog_schema_principal_acl_grants() for grant in grants: acl_migrate_sql = grant.uc_grant_sql() if acl_migrate_sql is None: @@ -58,7 +76,21 @@ def _update_principal_acl(self): logger.debug(f"Migrating acls on {grant.this_type_and_key()} using SQL query: {acl_migrate_sql}") self._backend.execute(acl_migrate_sql) - def _get_catalog_schema_grants(self) -> list[Grant]: + def _get_catalog_schema_hive_grants(self) -> list[Grant]: + src_dst_schema_mapping = self._get_database_source_target_mapping() + hive_grants = self._hive_grants_crawler.snapshot() + new_grants: list[Grant] = [] + for grant in hive_grants: + if grant.this_type_and_key()[0] == "DATABASE" and grant.database: + for schema in src_dst_schema_mapping[grant.database]: + new_grants.append(replace(grant, catalog=schema.catalog_name, database=schema.name)) + catalog_grants: set[Grant] = set() + for grant in new_grants: + catalog_grants.add(replace(grant, database=None)) + new_grants.extend(catalog_grants) + return new_grants + + def _get_catalog_schema_principal_acl_grants(self) -> list[Grant]: src_trg_schema_mapping = self._get_database_source_target_mapping() grants = self._principal_grants.get_interactive_cluster_grants() # filter on grants to only get database level grants diff --git a/tests/unit/hive_metastore/test_catalog_schema.py b/tests/unit/hive_metastore/test_catalog_schema.py index d632875cb9..66e0784293 100644 --- a/tests/unit/hive_metastore/test_catalog_schema.py +++ b/tests/unit/hive_metastore/test_catalog_schema.py @@ -9,7 +9,7 @@ from databricks.sdk.service.catalog import CatalogInfo, ExternalLocationInfo, SchemaInfo from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant, GrantsCrawler from databricks.labs.ucx.hive_metastore.mapping import TableMapping @@ -83,6 +83,7 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: ) table_mapping = TableMapping(installation, ws, backend) principal_acl = create_autospec(PrincipalACL) + hive_acl = create_autospec(GrantsCrawler) grants = [ Grant('user1', 'SELECT', 'catalog1', 'schema3', 'table'), Grant('user1', 'MODIFY', 'catalog2', 'schema2', 'table'), @@ -90,8 +91,27 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: Grant('user1', 'USAGE', 'hive_metastore', 'schema3'), Grant('user1', 'USAGE', 'hive_metastore', 'schema2'), ] + hive_grants = [ + Grant(principal="princ1", catalog="hive_metastore", action_type="USE"), + Grant(principal="princ2", catalog="hive_metastore", database="schema3", action_type="USAGE"), + Grant( + principal="princ33", + catalog="hive_metastore", + database="database_one", + view="table_one", + 
action_type="SELECT", + ), + Grant( + principal="princ5", + catalog="hive_metastore", + database="schema2", + action_type="USAGE", + ), + ] principal_acl.get_interactive_cluster_grants.return_value = grants - return CatalogSchema(ws, table_mapping, principal_acl, backend) + hive_acl.snapshot.return_value = hive_grants + + return CatalogSchema(ws, table_mapping, principal_acl, backend, hive_acl) @pytest.mark.parametrize("location", ["s3://foo/bar", "s3://foo/bar/test", "s3://foo/bar/test/baz"]) @@ -171,6 +191,11 @@ def test_catalog_schema_acl(): 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema3` TO `user1`', 'GRANT USE CATALOG ON CATALOG `catalog1` TO `user1`', 'GRANT USE CATALOG ON CATALOG `catalog2` TO `user1`', + 'GRANT USE CATALOG ON CATALOG `catalog1` TO `princ2`', + 'GRANT USE SCHEMA ON DATABASE `catalog1`.`schema3` TO `princ2`', + 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema2` TO `princ5`', + 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema3` TO `princ5`', + 'GRANT USE CATALOG ON CATALOG `catalog2` TO `princ5`', ] assert len(backend.queries) == len(queries) for query in queries: From 21da7cc3038aef19bde99afbb164bb6ebe48ce7d Mon Sep 17 00:00:00 2001 From: Cor Date: Wed, 25 Sep 2024 11:50:04 +0200 Subject: [PATCH 08/21] Add `create-ucx-catalog` cli command (#2694) ## Changes Add `create-ucx-catalog` cli command to create the catalog (going to be) used for migration tracking (possibly) accross multiple workspaces. ### Linked issues Resolves #2571 ### Functionality - [x] added relevant user documentation - [x] added new CLI command: `create-ucx-catalog` ### Tests - [x] manually tested - [x] added unit tests - [x] added integration tests --------- Co-authored-by: Andrew Snare --- Makefile | 2 +- README.md | 21 +- labs.yml | 7 +- src/databricks/labs/ucx/account/metastores.py | 21 +- src/databricks/labs/ucx/cli.py | 20 +- src/databricks/labs/ucx/config.py | 3 +- .../labs/ucx/contexts/account_cli.py | 1 - .../labs/ucx/contexts/application.py | 5 +- .../labs/ucx/hive_metastore/catalog_schema.py | 46 +++- src/databricks/labs/ucx/install.py | 2 + .../labs/ucx/source_code/queries.py | 6 +- tests/integration/conftest.py | 79 +++++-- .../hive_metastore/test_catalog_schema.py | 19 +- tests/unit/account/test_metastores.py | 21 +- tests/unit/conftest.py | 2 +- .../hive_metastore/test_catalog_schema.py | 51 ++++- tests/unit/install/test_install.py | 214 +++++------------- tests/unit/source_code/test_queries.py | 12 + tests/unit/test_cli.py | 26 ++- tests/unit/test_config.py | 17 ++ 20 files changed, 338 insertions(+), 237 deletions(-) diff --git a/Makefile b/Makefile index 8f6ceb1db1..4b62910e46 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ integration: hatch run integration coverage: - hatch run coverage && open htmlcov/index.html + hatch run coverage; status=$$?; [ -e "htmlcov/index.html" ] && open htmlcov/index.html; exit $$status known: hatch run python src/databricks/labs/ucx/source_code/known.py diff --git a/README.md b/README.md index 2893f4f894..cc4e05706a 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. 
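An editorial aside on patch 07 above (not part of any patch in this series): the catalog/schema ACL migration re-targets each legacy Hive metastore grant at the Unity Catalog schema it maps to, then renders it as a `GRANT` statement. The sketch below reproduces the idea with a simplified stand-in `Grant` dataclass; the real implementation uses `databricks.labs.ucx.hive_metastore.grants.Grant.uc_grant_sql()` together with `dataclasses.replace()`, as the patch shows.

```python
from dataclasses import dataclass, replace


@dataclass(frozen=True)
class Grant:
    """Simplified stand-in for databricks.labs.ucx.hive_metastore.grants.Grant."""

    principal: str
    action_type: str
    catalog: str
    database: str | None = None


def uc_grant_sql(grant: Grant) -> str:
    """Render a database-level grant as a UC GRANT statement (legacy USAGE becomes USE SCHEMA)."""
    action = "USE SCHEMA" if grant.action_type == "USAGE" else grant.action_type
    return f"GRANT {action} ON DATABASE `{grant.catalog}`.`{grant.database}` TO `{grant.principal}`"


legacy = Grant(principal="princ5", action_type="USAGE", catalog="hive_metastore", database="schema2")
migrated = replace(legacy, catalog="catalog2")  # hive_metastore.schema2 maps to catalog2.schema2
print(uc_grant_sql(migrated))  # GRANT USE SCHEMA ON DATABASE `catalog2`.`schema2` TO `princ5`
```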
* [Metastore related commands](#metastore-related-commands) * [`show-all-metastores` command](#show-all-metastores-command) * [`assign-metastore` command](#assign-metastore-command) + * [`create-ucx-catalog` command](#create-ucx-catalog-command) * [Table migration commands](#table-migration-commands) * [`principal-prefix-access` command](#principal-prefix-access-command) * [Access for AWS S3 Buckets](#access-for-aws-s3-buckets) @@ -1189,9 +1190,23 @@ a region, and you want to see which ones are available for assignment. databricks labs ucx assign-metastore --workspace-id [--metastore-id ] ``` -This command assigns a metastore to a workspace with `workspace-id`. If there is only a single metastore in the workspace -region, it will be automatically assigned to the workspace. If there are multiple metastores available, you need to specify -the metastore id of the metastore you want to assign to the workspace. +This command assigns a metastore to a workspace with `--workspace-id`. If there is only a single metastore in the +workspace region, the command automatically assigns that metastore to the workspace. If there are multiple metastores +available, the command prompts for specification of the metastore (id) you want to assign to the workspace. + +[[back to top](#databricks-labs-ucx)] + +## `create-ucx-catalog` command + +```commandline +databricks labs ucx create-ucx-catalog +16:12:59 INFO [d.l.u.hive_metastore.catalog_schema] Validating UC catalog: ucx +Please provide storage location url for catalog: ucx (default: metastore): ... +16:13:01 INFO [d.l.u.hive_metastore.catalog_schema] Creating UC catalog: ucx +``` + +Create and setup UCX artifact catalog. Amongst other things, the artifacts are used for tracking the migration progress +across workspaces. # Table migration commands diff --git a/labs.yml b/labs.yml index 6a1adf5878..9e25ef0494 100644 --- a/labs.yml +++ b/labs.yml @@ -261,15 +261,18 @@ commands: - name: assign-metastore is_account_level: true - description: Enable Unity Catalog features on a workspace by assign a metastore to it + description: Enable Unity Catalog features on a workspace by assigning a metastore to it. flags: - name: workspace-id - description: (Optional) Workspace ID to assign a metastore to + description: Workspace ID to assign a metastore to - name: metastore-id description: (Optional) If there are multiple metastores in the region, specify the metastore ID to assign - name: default-catalog description: (Optional) Default catalog to assign to the workspace. 
If not provided, it will be hive_metastore + - name: create-ucx-catalog + description: Create UCX artifact catalog + - name: migrate-tables description: | Trigger the `migrate-tables` workflow and, optionally, `migrate-external-hiveserde-tables-in-place-experimental` diff --git a/src/databricks/labs/ucx/account/metastores.py b/src/databricks/labs/ucx/account/metastores.py index 7089dd20c6..94c5b9e2fe 100644 --- a/src/databricks/labs/ucx/account/metastores.py +++ b/src/databricks/labs/ucx/account/metastores.py @@ -27,20 +27,16 @@ def show_all_metastores(self, workspace_id: str | None = None): def assign_metastore( self, prompts: Prompts, - str_workspace_id: str | None = None, + workspace_id: int, + *, metastore_id: str | None = None, default_catalog: str | None = None, ): - if not str_workspace_id: - workspace_choices = self._get_all_workspaces() - workspace_id = prompts.choice_from_dict("Please select a workspace:", workspace_choices) - else: - workspace_id = int(str_workspace_id) - if not metastore_id: + if metastore_id is None: # search for all matching metastores metastore_choices = self._get_all_metastores(self._get_region(workspace_id)) if len(metastore_choices) == 0: - raise ValueError(f"No matching metastore found for workspace {workspace_id}") + raise ValueError(f"No matching metastore found for workspace: {workspace_id}") # if there are multiple matches, prompt users to select one if len(metastore_choices) > 1: metastore_id = prompts.choice_from_dict( @@ -48,17 +44,16 @@ def assign_metastore( ) else: metastore_id = list(metastore_choices.values())[0] - if metastore_id is not None: - self._ac.metastore_assignments.create(workspace_id, metastore_id) + self._ac.metastore_assignments.create(workspace_id, metastore_id) # set the default catalog using the default_namespace setting API if default_catalog is not None: self._set_default_catalog(workspace_id, default_catalog) - def _get_region(self, workspace_id: int) -> str: + def _get_region(self, workspace_id: int) -> str | None: workspace = self._ac.workspaces.get(workspace_id) if self._ac.config.is_aws: - return str(workspace.aws_region) - return str(workspace.location) + return workspace.aws_region + return workspace.location def _get_all_workspaces(self) -> dict[str, int]: output = dict[str, int]() diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index ef136d3893..e3f5d76756 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -591,11 +591,27 @@ def assign_metastore( workspace_id: str | None = None, metastore_id: str | None = None, default_catalog: str | None = None, + ctx: AccountContext | None = None, ): """Assign metastore to a workspace""" logger.info(f"Account ID: {a.config.account_id}") - ctx = AccountContext(a) - ctx.account_metastores.assign_metastore(ctx.prompts, workspace_id, metastore_id, default_catalog) + ctx = ctx or AccountContext(a) + ctx.account_metastores.assign_metastore( + ctx.prompts, + workspace_id, + metastore_id=metastore_id, + default_catalog=default_catalog, + ) + + +@ucx.command +def create_ucx_catalog(w: WorkspaceClient, prompts: Prompts, ctx: WorkspaceContext | None = None) -> None: + """Create and setup UCX artifact catalog + + Amongst other things, the artifacts are used for tracking the migration progress across workspaces. 
+ """ + workspace_context = ctx or WorkspaceContext(w) + workspace_context.catalog_schema.create_ucx_catalog(prompts) @ucx.command diff --git a/src/databricks/labs/ucx/config.py b/src/databricks/labs/ucx/config.py index fb5e1790bf..17e6f10ead 100644 --- a/src/databricks/labs/ucx/config.py +++ b/src/databricks/labs/ucx/config.py @@ -11,6 +11,7 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes __version__ = 2 inventory_database: str + ucx_catalog: str = "ucx" # Catalog to store UCX artifact tables (shared across workspaces) # Group name conversion parameters. workspace_group_regex: str | None = None workspace_group_replace: str | None = None @@ -25,7 +26,7 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes connect: Config | None = None num_threads: int | None = 10 database_to_catalog_mapping: dict[str, str] | None = None - default_catalog: str | None = "ucx_default" + default_catalog: str | None = "ucx_default" # DEPRECATED: Keeping to avoid errors when loading old configurations log_level: str | None = "INFO" # Starting path for notebooks and directories crawler diff --git a/src/databricks/labs/ucx/contexts/account_cli.py b/src/databricks/labs/ucx/contexts/account_cli.py index 20617edab3..9f88049f65 100644 --- a/src/databricks/labs/ucx/contexts/account_cli.py +++ b/src/databricks/labs/ucx/contexts/account_cli.py @@ -3,7 +3,6 @@ from databricks.sdk import AccountClient - from databricks.labs.ucx.account.aggregate import AccountAggregate from databricks.labs.ucx.account.metastores import AccountMetastores from databricks.labs.ucx.account.workspaces import AccountWorkspaces diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 59517fdf12..d5f506ac1e 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -18,7 +18,7 @@ from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core -from databricks.sdk.errors import ResourceDoesNotExist +from databricks.sdk.errors import NotFound from databricks.sdk.service import sql from databricks.labs.ucx.account.workspaces import WorkspaceInfo @@ -319,7 +319,7 @@ def principal_locations(self): eligible_locations = self.aws_acl.get_eligible_locations_principals() if self.is_gcp: raise NotImplementedError("Not implemented for GCP.") - except ResourceDoesNotExist: + except NotFound: pass return eligible_locations @@ -359,6 +359,7 @@ def catalog_schema(self): self.principal_acl, self.sql_backend, self.grants_crawler, + self.config.ucx_catalog, ) @cached_property diff --git a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py index 6802ad5dc5..280a345460 100644 --- a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py +++ b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py @@ -18,6 +18,7 @@ class CatalogSchema: + def __init__( self, ws: WorkspaceClient, @@ -25,6 +26,7 @@ def __init__( principal_grants: PrincipalACL, sql_backend: SqlBackend, grants_crawler: GrantsCrawler, + ucx_catalog: str, ): self._ws = ws self._table_mapping = table_mapping @@ -32,15 +34,33 @@ def __init__( self._principal_grants = principal_grants self._backend = sql_backend self._hive_grants_crawler = grants_crawler + self._ucx_catalog = ucx_catalog + + def create_ucx_catalog(self, 
prompts: Prompts, *, properties: dict[str, str] | None = None) -> None: + """Create the UCX catalog. + + Args: + prompts : Prompts + The prompts object to use for interactive input. + properties : (dict[str, str] | None), default None + The properties to pass to the catalog. If None, no properties are passed. + """ + try: + self._create_catalog_validate(self._ucx_catalog, prompts, properties=properties) + except BadRequest as e: + if "already exists" in str(e): + logger.warning(f"Catalog '{self._ucx_catalog}' already exists. Skipping.") + return + raise def create_all_catalogs_schemas(self, prompts: Prompts) -> None: candidate_catalogs, candidate_schemas = self._get_missing_catalogs_schemas() for candidate_catalog in candidate_catalogs: try: - self._create_catalog_validate(candidate_catalog, prompts) + self._create_catalog_validate(candidate_catalog, prompts, properties=None) except BadRequest as e: if "already exists" in str(e): - logger.warning(f"Catalog {candidate_catalog} already exists. Skipping.") + logger.warning(f"Catalog '{candidate_catalog}' already exists. Skipping.") continue for candidate_catalog, schemas in candidate_schemas.items(): for candidate_schema in schemas: @@ -49,7 +69,7 @@ def create_all_catalogs_schemas(self, prompts: Prompts) -> None: except BadRequest as e: if "already exists" in str(e): logger.warning( - f"Schema {candidate_schema} in catalog {candidate_catalog} " f"already exists. Skipping." + f"Schema '{candidate_schema}' in catalog '{candidate_catalog}' already exists. Skipping." ) continue self._apply_from_legacy_table_acls() @@ -121,20 +141,19 @@ def _get_database_source_target_mapping(self) -> dict[str, list[SchemaInfo]]: src_trg_schema_mapping[table_mapping.src_schema].append(schema) return src_trg_schema_mapping - def _create_catalog_validate(self, catalog, prompts: Prompts): - logger.info(f"Creating UC catalog: {catalog}") - # create catalogs + def _create_catalog_validate(self, catalog: str, prompts: Prompts, *, properties: dict[str, str] | None) -> None: + logger.info(f"Validating UC catalog: {catalog}") attempts = 3 while True: catalog_storage = prompts.question( - f"Please provide storage location url for catalog:{catalog}.", default="metastore" + f"Please provide storage location url for catalog: {catalog}", default="metastore" ) if self._validate_location(catalog_storage): break attempts -= 1 if attempts == 0: raise NotFound(f"Failed to validate location for {catalog} catalog") - self._create_catalog(catalog, catalog_storage) + self._create_catalog(catalog, catalog_storage, properties=properties) def _list_existing(self) -> tuple[set[str], dict[str, set[str]]]: """generate a list of existing UC catalogs and schema.""" @@ -199,12 +218,17 @@ def _validate_location(self, location: str): return True return False - def _create_catalog(self, catalog, catalog_storage): + def _create_catalog(self, catalog: str, catalog_storage: str, *, properties: dict[str, str] | None) -> None: logger.info(f"Creating UC catalog: {catalog}") if catalog_storage == "metastore": - self._ws.catalogs.create(catalog, comment="Created by UCX") + self._ws.catalogs.create(catalog, comment="Created by UCX", properties=properties) else: - self._ws.catalogs.create(catalog, storage_root=catalog_storage, comment="Created by UCX") + self._ws.catalogs.create( + catalog, + storage_root=catalog_storage, + comment="Created by UCX", + properties=properties, + ) def _create_schema(self, catalog, schema): logger.info(f"Creating UC schema: {schema} in catalog: {catalog}") diff --git 
a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index c837d93ced..49c6020449 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -234,6 +234,7 @@ def _prompt_for_new_installation(self) -> WorkspaceConfig: inventory_database = self.prompts.question( "Inventory Database stored in hive_metastore", default=default_database, valid_regex=r"^\w+$" ) + ucx_catalog = self.prompts.question("Catalog to store UCX artifacts in", default="ucx", valid_regex=r"^\w+$") log_level = self.prompts.question("Log level", default="INFO").upper() num_threads = int(self.prompts.question("Number of threads", default="8", valid_number=True)) configure_groups = ConfigureGroups(self.prompts) @@ -248,6 +249,7 @@ def _prompt_for_new_installation(self) -> WorkspaceConfig: ) return WorkspaceConfig( inventory_database=inventory_database, + ucx_catalog=ucx_catalog, workspace_group_regex=configure_groups.workspace_group_regex, workspace_group_replace=configure_groups.workspace_group_replace, account_group_regex=configure_groups.account_group_regex, diff --git a/src/databricks/labs/ucx/source_code/queries.py b/src/databricks/labs/ucx/source_code/queries.py index 79b40caa0a..5bd11b1bc7 100644 --- a/src/databricks/labs/ucx/source_code/queries.py +++ b/src/databricks/labs/ucx/source_code/queries.py @@ -49,13 +49,13 @@ def __init__( def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): assessment_start = datetime.now(timezone.utc) + dashboard_ids = self._dashboard_ids_in_scope() + logger.info(f"Running {len(dashboard_ids)} linting tasks...") linted_queries: set[str] = set() - all_dashboards = list(self._ws.dashboards.list()) - logger.info(f"Running {len(all_dashboards)} linting tasks...") all_problems: list[QueryProblem] = [] all_dfsas: list[DirectFsAccess] = [] # first lint and collect queries from dashboards - for dashboard_id in self._dashboard_ids_in_scope(): + for dashboard_id in dashboard_ids: dashboard = self._ws.dashboards.get(dashboard_id=dashboard_id) problems, dfsas = self._lint_and_collect_from_dashboard(dashboard, linted_queries) all_problems.extend(problems) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 90204c9d26..d2f48cdfa7 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -374,10 +374,19 @@ def snapshot(self, *, force_refresh: bool = False) -> list[Mount]: class CommonUtils: - def __init__(self, make_schema_fixture, env_or_skip_fixture, ws_fixture): + def __init__( + self, + make_catalog_fixture, + make_schema_fixture, + env_or_skip_fixture, + ws_fixture, + make_random_fixture, + ): + self._make_catalog = make_catalog_fixture self._make_schema = make_schema_fixture self._env_or_skip = env_or_skip_fixture self._ws = ws_fixture + self._make_random = make_random_fixture def with_dummy_resource_permission(self): # TODO: in most cases (except prepared_principal_acl) it's just a sign of a bad logic, fix it @@ -432,6 +441,10 @@ def installation(self): def inventory_database(self) -> str: return self._make_schema(catalog_name="hive_metastore").name + @cached_property + def ucx_catalog(self) -> str: + return self._make_catalog(name=f"ucx-{self._make_random()}").name + @cached_property def workspace_client(self) -> WorkspaceClient: return self._ws @@ -440,14 +453,22 @@ def workspace_client(self) -> WorkspaceClient: class MockRuntimeContext(CommonUtils, RuntimeContext): def __init__( self, - make_table_fixture, + make_catalog_fixture, make_schema_fixture, + 
make_table_fixture, make_udf_fixture, make_group_fixture, env_or_skip_fixture, ws_fixture, + make_random_fixture, ) -> None: - super().__init__(make_schema_fixture, env_or_skip_fixture, ws_fixture) + super().__init__( + make_catalog_fixture, + make_schema_fixture, + env_or_skip_fixture, + ws_fixture, + make_random_fixture, + ) RuntimeContext.__init__(self) self._make_table = make_table_fixture self._make_schema = make_schema_fixture @@ -533,6 +554,7 @@ def config(self) -> WorkspaceConfig: return WorkspaceConfig( warehouse_id=self._env_or_skip("TEST_DEFAULT_WAREHOUSE_ID"), inventory_database=self.inventory_database, + ucx_catalog=self.ucx_catalog, connect=self.workspace_client.config, renamed_group_prefix=f'tmp-{self.inventory_database}-', include_group_names=self.created_groups, @@ -668,19 +690,46 @@ def group_manager(self): @pytest.fixture -def runtime_ctx(ws, sql_backend, make_table, make_schema, make_udf, make_group, env_or_skip) -> MockRuntimeContext: - ctx = MockRuntimeContext(make_table, make_schema, make_udf, make_group, env_or_skip, ws) +def runtime_ctx( + ws, + sql_backend, + make_catalog, + make_schema, + make_table, + make_udf, + make_group, + env_or_skip, + make_random, +) -> MockRuntimeContext: + ctx = MockRuntimeContext( + make_catalog, + make_schema, + make_table, + make_udf, + make_group, + env_or_skip, + ws, + make_random, + ) return ctx.replace(workspace_client=ws, sql_backend=sql_backend) class MockWorkspaceContext(CommonUtils, WorkspaceContext): def __init__( self, + make_catalog_fixture, make_schema_fixture, env_or_skip_fixture, ws_fixture, + make_random_fixture, ): - super().__init__(make_schema_fixture, env_or_skip_fixture, ws_fixture) + super().__init__( + make_catalog_fixture, + make_schema_fixture, + env_or_skip_fixture, + ws_fixture, + make_random_fixture, + ) WorkspaceContext.__init__(self, ws_fixture) @cached_property @@ -720,8 +769,8 @@ def azure_subscription_id(self) -> str: @pytest.fixture -def az_cli_ctx(ws, env_or_skip, make_schema, sql_backend): - ctx = MockLocalAzureCli(make_schema, env_or_skip, ws) +def az_cli_ctx(ws, env_or_skip, make_catalog, make_schema, make_random, sql_backend): + ctx = MockLocalAzureCli(make_catalog, make_schema, env_or_skip, ws, make_random) return ctx.replace(sql_backend=sql_backend) @@ -778,8 +827,9 @@ class MockInstallationContext(MockRuntimeContext): def __init__( # pylint: disable=too-many-arguments self, - make_table_fixture, + make_catalog_fixture, make_schema_fixture, + make_table_fixture, make_udf_fixture, make_group_fixture, env_or_skip_fixture, @@ -790,14 +840,15 @@ def __init__( # pylint: disable=too-many-arguments watchdog_purge_suffix, ): super().__init__( - make_table_fixture, + make_catalog_fixture, make_schema_fixture, + make_table_fixture, make_udf_fixture, make_group_fixture, env_or_skip_fixture, ws_fixture, + make_random_fixture, ) - self._make_random = make_random_fixture self._make_acc_group = make_acc_group_fixture self._make_user = make_user_fixture self._watchdog_purge_suffix = watchdog_purge_suffix @@ -953,8 +1004,9 @@ def prompts(self): def installation_ctx( # pylint: disable=too-many-arguments ws, sql_backend, - make_table, + make_catalog, make_schema, + make_table, make_udf, make_group, env_or_skip, @@ -964,8 +1016,9 @@ def installation_ctx( # pylint: disable=too-many-arguments watchdog_purge_suffix, ) -> Generator[MockInstallationContext, None, None]: ctx = MockInstallationContext( - make_table, + make_catalog, make_schema, + make_table, make_udf, make_group, env_or_skip, diff --git 
a/tests/integration/hive_metastore/test_catalog_schema.py b/tests/integration/hive_metastore/test_catalog_schema.py index 713bfba1f6..3377a53034 100644 --- a/tests/integration/hive_metastore/test_catalog_schema.py +++ b/tests/integration/hive_metastore/test_catalog_schema.py @@ -1,11 +1,11 @@ import logging from datetime import timedelta -import pytest +import pytest from databricks.labs.blueprint.tui import MockPrompts - from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried +from databricks.sdk.service.catalog import CatalogInfo from databricks.sdk.service.compute import DataSecurityMode, AwsAttributes from databricks.sdk.service.catalog import Privilege, SecurableType, PrivilegeAssignment from databricks.sdk.service.iam import PermissionLevel @@ -16,6 +16,21 @@ _SPARK_CONF = get_azure_spark_conf() +@retried(on=[NotFound], timeout=timedelta(minutes=2)) +def test_create_ucx_catalog_creates_catalog(ws, runtime_ctx, watchdog_remove_after) -> None: + # Delete catalog created for testing to test the creation of a new catalog + runtime_ctx.workspace_client.catalogs.delete(runtime_ctx.ucx_catalog, force=True) + prompts = MockPrompts({f"Please provide storage location url for catalog: {runtime_ctx.ucx_catalog}": "metastore"}) + + runtime_ctx.catalog_schema.create_ucx_catalog(prompts, properties={"RemoveAfter": watchdog_remove_after}) + + @retried(on=[NotFound], timeout=timedelta(seconds=20)) + def get_catalog(name: str) -> CatalogInfo: + return ws.catalogs.get(name) + + assert get_catalog(runtime_ctx.ucx_catalog) + + @retried(on=[NotFound], timeout=timedelta(minutes=2)) def test_create_catalog_schema_with_principal_acl_azure( ws, diff --git a/tests/unit/account/test_metastores.py b/tests/unit/account/test_metastores.py index 88fe9226fb..59ba89d36b 100644 --- a/tests/unit/account/test_metastores.py +++ b/tests/unit/account/test_metastores.py @@ -37,7 +37,7 @@ def test_show_all_metastores(acc_client, caplog): assert "metastore_use - 124" in caplog.messages -def test_assign_metastore(acc_client): +def test_assign_metastore(acc_client) -> None: acc_client.metastores.list.return_value = [ MetastoreInfo(name="metastore_usw_1", metastore_id="123", region="us-west-2"), MetastoreInfo(name="metastore_usw_2", metastore_id="124", region="us-west-2"), @@ -55,14 +55,19 @@ def test_assign_metastore(acc_client): etag="123", namespace=StringMessage("hive_metastore") ) account_metastores = AccountMetastores(acc_client) - prompts = MockPrompts({"Multiple metastores found, please select one*": "0", "Please select a workspace:*": "0"}) + prompts = MockPrompts({"Multiple metastores found, please select one*": "0"}) - # need to select a workspace - since it is alphabetically sorted, needs to pick workspace bar - account_metastores.assign_metastore(prompts, "", "", "") + account_metastores.assign_metastore(prompts, 123457) acc_client.metastore_assignments.create.assert_called_with(123457, "123") + ws.settings.default_namespaces.update.assert_not_called() + + # Empty default catalog should not call default name spaces + account_metastores.assign_metastore(prompts, 123457, default_catalog="") + acc_client.metastore_assignments.create.assert_called_with(123457, "123") + ws.settings.default_namespaces.update.assert_not_called() # multiple metastores & default catalog name, need to choose one - account_metastores.assign_metastore(prompts, "123456", "", "main") + account_metastores.assign_metastore(prompts, 123456, default_catalog="main") 
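# ---------------------------------------------------------------------------
# Editorial sketch, not part of any patch in this series: the reworked
# AccountMetastores.assign_metastore() (patch 08 above) now takes the workspace
# id as an int, with metastore_id and default_catalog as keyword-only
# arguments. The AccountClient below is an autospec stand-in so the snippet
# stays self-contained; a real client needs account-level credentials.
# ---------------------------------------------------------------------------
from unittest.mock import create_autospec

from databricks.labs.blueprint.tui import MockPrompts
from databricks.sdk import AccountClient

from databricks.labs.ucx.account.metastores import AccountMetastores

acc = create_autospec(AccountClient)  # stand-in account client
metastores = AccountMetastores(acc)
prompts = MockPrompts({"Multiple metastores found, please select one*": "0"})

# Workspace id is positional; the remaining parameters are keyword-only:
metastores.assign_metastore(prompts, 123456, metastore_id="1234")
# default_catalog is also keyword-only, e.g.:
# metastores.assign_metastore(prompts, 123456, metastore_id="1234", default_catalog="main")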
acc_client.metastore_assignments.create.assert_called_with(123456, "123") ws.settings.default_namespace.update.assert_called_with( allow_missing=True, @@ -72,7 +77,7 @@ def test_assign_metastore(acc_client): # default catalog not found, still get etag ws.settings.default_namespace.get.side_effect = NotFound(details=[{"metadata": {"etag": "not_found"}}]) - account_metastores.assign_metastore(prompts, "123456", "", "main") + account_metastores.assign_metastore(prompts, 123456, default_catalog="main") acc_client.metastore_assignments.create.assert_called_with(123456, "123") ws.settings.default_namespace.update.assert_called_with( allow_missing=True, @@ -82,10 +87,10 @@ def test_assign_metastore(acc_client): # only one metastore, should assign directly acc_client.workspaces.get.return_value = Workspace(workspace_id=123456, aws_region="us-east-2") - account_metastores.assign_metastore(MockPrompts({}), "123456") + account_metastores.assign_metastore(MockPrompts({}), 123456) acc_client.metastore_assignments.create.assert_called_with(123456, "126") # no metastore found, error acc_client.workspaces.get.return_value = Workspace(workspace_id=123456, aws_region="us-central-2") with pytest.raises(ValueError): - account_metastores.assign_metastore(MockPrompts({}), "123456") + account_metastores.assign_metastore(MockPrompts({}), 123456) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index d8cf58f221..7c99db89ed 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -111,5 +111,5 @@ def mock_notebook_resolver(): @pytest.fixture -def mock_backend(): +def mock_backend() -> MockBackend: return MockBackend() diff --git a/tests/unit/hive_metastore/test_catalog_schema.py b/tests/unit/hive_metastore/test_catalog_schema.py index 66e0784293..100c130519 100644 --- a/tests/unit/hive_metastore/test_catalog_schema.py +++ b/tests/unit/hive_metastore/test_catalog_schema.py @@ -1,3 +1,4 @@ +import logging from unittest.mock import call, create_autospec import pytest @@ -5,7 +6,7 @@ from databricks.labs.blueprint.tui import MockPrompts from databricks.labs.lsql.backends import MockBackend from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound +from databricks.sdk.errors import BadRequest, NotFound from databricks.sdk.service.catalog import CatalogInfo, ExternalLocationInfo, SchemaInfo from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema @@ -15,6 +16,12 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: ws.catalogs.list.return_value = [CatalogInfo(name="catalog1")] + + def raise_catalog_exists(catalog: str, *_, **__) -> None: + if catalog == "catalog1": + raise BadRequest("Catalog 'catalog1' already exists") + + ws.catalogs.create.side_effect = raise_catalog_exists ws.schemas.list.return_value = [SchemaInfo(name="schema1")] ws.external_locations.list.return_value = [ExternalLocationInfo(url="s3://foo/bar")] if backend is None: @@ -111,7 +118,33 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: principal_acl.get_interactive_cluster_grants.return_value = grants hive_acl.snapshot.return_value = hive_grants - return CatalogSchema(ws, table_mapping, principal_acl, backend, hive_acl) + return CatalogSchema(ws, table_mapping, principal_acl, backend, hive_acl, "ucx") + + +def test_create_ucx_catalog_creates_ucx_catalog() -> None: + ws = create_autospec(WorkspaceClient) + mock_prompts = MockPrompts({"Please provide storage location url for catalog: ucx": "metastore"}) + + catalog_schema = 
prepare_test(ws) + catalog_schema.create_ucx_catalog(mock_prompts) + + ws.catalogs.create.assert_called_with("ucx", comment="Created by UCX", properties=None) + + +def test_create_ucx_catalog_skips_when_ucx_catalogs_exists(caplog) -> None: + ws = create_autospec(WorkspaceClient) + mock_prompts = MockPrompts({"Please provide storage location url for catalog: ucx": "metastore"}) + catalog_schema = prepare_test(ws) + + def raise_catalog_exists(catalog: str, *_, **__) -> None: + if catalog == "ucx": + raise BadRequest("Catalog 'ucx' already exists") + + ws.catalogs.create.side_effect = raise_catalog_exists + + with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.hive_metastore.catalog_schema"): + catalog_schema.create_ucx_catalog(mock_prompts) + assert "Catalog 'ucx' already exists. Skipping." in caplog.text @pytest.mark.parametrize("location", ["s3://foo/bar", "s3://foo/bar/test", "s3://foo/bar/test/baz"]) @@ -124,9 +157,9 @@ def test_create_all_catalogs_schemas_creates_catalogs(location: str): catalog_schema.create_all_catalogs_schemas(mock_prompts) calls = [ - call("catalog2", storage_root=location, comment="Created by UCX"), - call("catalog3", storage_root=location, comment="Created by UCX"), - call("catalog4", storage_root=location, comment="Created by UCX"), + call("catalog2", storage_root=location, comment="Created by UCX", properties=None), + call("catalog3", storage_root=location, comment="Created by UCX", properties=None), + call("catalog4", storage_root=location, comment="Created by UCX", properties=None), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) @@ -165,8 +198,8 @@ def test_no_catalog_storage(): catalog_schema.create_all_catalogs_schemas(mock_prompts) calls = [ - call("catalog2", comment="Created by UCX"), - call("catalog3", comment="Created by UCX"), + call("catalog2", comment="Created by UCX", properties=None), + call("catalog3", comment="Created by UCX", properties=None), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) @@ -180,8 +213,8 @@ def test_catalog_schema_acl(): catalog_schema.create_all_catalogs_schemas(mock_prompts) calls = [ - call("catalog2", comment="Created by UCX"), - call("catalog3", comment="Created by UCX"), + call("catalog2", comment="Created by UCX", properties=None), + call("catalog3", comment="Created by UCX", properties=None), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) ws.schemas.create.assert_any_call("schema2", "catalog2", comment="Created by UCX") diff --git a/tests/unit/install/test_install.py b/tests/unit/install/test_install.py index d48970beaf..404d085400 100644 --- a/tests/unit/install/test_install.py +++ b/tests/unit/install/test_install.py @@ -26,10 +26,9 @@ ) from databricks.sdk.errors.platform import BadRequest from databricks.sdk.service import iam, jobs, sql -from databricks.sdk.service.compute import Policy, State +from databricks.sdk.service.compute import Policy from databricks.sdk.service.jobs import BaseRun, RunLifeCycleState, RunResultState, RunState from databricks.sdk.service.provisioning import Workspace -from databricks.sdk.service.sql import EndpointInfo, EndpointInfoWarehouseType import databricks.labs.ucx.installer.mixins import databricks.labs.ucx.uninstall # noqa @@ -333,47 +332,64 @@ def result(): ) -def test_save_config(ws, mock_installation): - ws.workspace.get_status = not_found - ws.warehouses.list = lambda **_: [ - EndpointInfo(name="abc", id="abc", warehouse_type=EndpointInfoWarehouseType.PRO, state=State.RUNNING) - ] - ws.workspace.download = not_found - 
+@pytest.mark.parametrize( + "prompt_question,prompt_answer,workspace_config_overwrite", + [ + ("Inventory Database stored in hive_metastore", "non_default", {"inventory_database": "non_default"}), + ("Catalog to store UCX artifacts in", "ucx-test", {"ucx_catalog": "ucx-test"}), + ("Log level", "DEBUG", {"log_level": "DEBUG"}), + (r".*workspace group names.*", "g1, g2, g99", {"include_group_names": ["g1", "g2", "g99"]}), + ("Number of threads", "16", {"num_threads": 16}), + (r"Comma-separated list of databases to migrate.*", "db1,db2", {"include_databases": ["db1", "db2"]}), + (r"Does given workspace .* block Internet access?", "yes", {"upload_dependencies": True}), + ("Do you want to trigger assessment job after installation?", "yes", {"trigger_job": True}), + ("Reconciliation threshold, in percentage", "10", {"recon_tolerance_percent": 10}), + ( + r"Parallelism for migrating.*", + "1000", + {"spark_conf": {"spark.sql.sources.parallelPartitionDiscovery.parallelism": "1000"}}, + ), + (r"Min workers for auto-scale.*", "2", {"min_workers": 2}), + (r"Max workers for auto-scale.*", "20", {"max_workers": 20}), + ], +) +def test_configure_sets_expected_workspace_configuration_values( + ws, + mock_installation, + prompt_question, + prompt_answer, + workspace_config_overwrite, +) -> None: + workspace_config_default = { + "version": 2, + "default_catalog": "ucx_default", + "ucx_catalog": "ucx", + "inventory_database": "ucx", + "log_level": "INFO", + "num_threads": 8, + "min_workers": 1, + "max_workers": 10, + "policy_id": "foo", + "renamed_group_prefix": "db-temp-", + "warehouse_id": "abc", + "workspace_start_path": "/", + "num_days_submit_runs_history": 30, + "recon_tolerance_percent": 5, + } prompts = MockPrompts( { r".*PRO or SERVERLESS SQL warehouse.*": "1", - r"Choose how to map the workspace groups.*": "2", + r"Choose how to map the workspace groups.*": "2", # specify names r".*": "", - r".*days to analyze submitted runs.*": "1", - r"Reconciliation threshold, in percentage.*": "5", + prompt_question: prompt_answer, } ) - install = WorkspaceInstaller(ws).replace( - prompts=prompts, - installation=mock_installation, - product_info=PRODUCT_INFO, - ) + install = WorkspaceInstaller(ws).replace(prompts=prompts, installation=mock_installation, product_info=PRODUCT_INFO) + install.configure() - mock_installation.assert_file_written( - 'config.yml', - { - 'version': 2, - 'default_catalog': 'ucx_default', - 'inventory_database': 'ucx', - 'log_level': 'INFO', - 'num_days_submit_runs_history': 30, - 'num_threads': 8, - 'min_workers': 1, - 'max_workers': 10, - 'policy_id': 'foo', - 'renamed_group_prefix': 'db-temp-', - 'warehouse_id': 'abc', - 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, - }, - ) + workspace_config_expected = {**workspace_config_default, **workspace_config_overwrite} + mock_installation.assert_file_written("config.yml", workspace_config_expected) def test_corrupted_config(ws, mock_installation, caplog): @@ -398,47 +414,7 @@ def test_corrupted_config(ws, mock_installation, caplog): assert 'Existing installation at ~/mock is corrupted' in caplog.text -def test_save_config_strip_group_names(ws, mock_installation): - prompts = MockPrompts( - { - r".*PRO or SERVERLESS SQL warehouse.*": "1", - r"Choose how to map the workspace groups.*": "2", # specify names - r".*workspace group names.*": "g1, g2, g99", - r".*": "", - r"Reconciliation threshold, in percentage.*": "5", - } - ) - ws.workspace.get_status = not_found - - install = WorkspaceInstaller(ws).replace( - prompts=prompts, 
- installation=mock_installation, - product_info=PRODUCT_INFO, - ) - install.configure() - - mock_installation.assert_file_written( - 'config.yml', - { - 'version': 2, - 'default_catalog': 'ucx_default', - 'include_group_names': ['g1', 'g2', 'g99'], - 'inventory_database': 'ucx', - 'log_level': 'INFO', - 'num_days_submit_runs_history': 30, - 'num_threads': 8, - 'min_workers': 1, - 'max_workers': 10, - 'policy_id': 'foo', - 'renamed_group_prefix': 'db-temp-', - 'warehouse_id': 'abc', - 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, - }, - ) - - -def test_create_cluster_policy(ws, mock_installation): +def test_create_cluster_policy(ws, mock_installation) -> None: ws.cluster_policies.list.return_value = [ Policy( policy_id="foo1", @@ -470,6 +446,7 @@ def test_create_cluster_policy(ws, mock_installation): { 'version': 2, 'default_catalog': 'ucx_default', + 'ucx_catalog': 'ucx', 'include_group_names': ['g1', 'g2', 'g99'], 'inventory_database': 'ucx', 'log_level': 'INFO', @@ -1083,45 +1060,6 @@ def test_open_config(ws, mocker, mock_installation): webbrowser_open.assert_called_with('https://localhost/#workspace~/mock/config.yml') -def test_save_config_should_include_databases(ws, mock_installation): - prompts = MockPrompts( - { - r".*PRO or SERVERLESS SQL warehouse.*": "1", - r"Choose how to map the workspace groups.*": "2", # specify names - r"Comma-separated list of databases to migrate.*": "db1,db2", - r"Reconciliation threshold, in percentage.*": "5", - r".*": "", - } - ) - ws.workspace.get_status = not_found - install = WorkspaceInstaller(ws).replace( - prompts=prompts, - installation=mock_installation, - product_info=PRODUCT_INFO, - ) - install.configure() - - mock_installation.assert_file_written( - 'config.yml', - { - 'version': 2, - 'default_catalog': 'ucx_default', - 'include_databases': ['db1', 'db2'], - 'inventory_database': 'ucx', - 'log_level': 'INFO', - 'num_threads': 8, - 'min_workers': 1, - 'max_workers': 10, - 'policy_id': 'foo', - 'renamed_group_prefix': 'db-temp-', - 'warehouse_id': 'abc', - 'workspace_start_path': '/', - 'num_days_submit_runs_history': 30, - 'recon_tolerance_percent': 5, - }, - ) - - def test_triggering_assessment_wf(ws, mocker, mock_installation): ws.jobs.run_now = mocker.Mock() mocker.patch("webbrowser.open") @@ -1237,49 +1175,6 @@ def test_runs_upgrades_on_more_recent_version(ws, any_prompt): wheels.upload_to_wsfs.assert_called() -def test_fresh_install(ws, mock_installation): - prompts = MockPrompts( - { - r".*PRO or SERVERLESS SQL warehouse.*": "1", - r"Choose how to map the workspace groups.*": "2", - r"Open config file in.*": "no", - r"Parallelism for migrating.*": "1000", - r"Min workers for auto-scale.*": "2", - r"Max workers for auto-scale.*": "20", - r"Reconciliation threshold, in percentage.*": "5", - r".*": "", - } - ) - ws.workspace.get_status = not_found - - install = WorkspaceInstaller(ws).replace( - prompts=prompts, - installation=mock_installation, - product_info=PRODUCT_INFO, - ) - install.configure() - - mock_installation.assert_file_written( - 'config.yml', - { - 'version': 2, - 'default_catalog': 'ucx_default', - 'inventory_database': 'ucx', - 'log_level': 'INFO', - 'num_days_submit_runs_history': 30, - 'num_threads': 8, - 'policy_id': 'foo', - 'spark_conf': {'spark.sql.sources.parallelPartitionDiscovery.parallelism': '1000'}, - 'min_workers': 2, - 'max_workers': 20, - 'renamed_group_prefix': 'db-temp-', - 'warehouse_id': 'abc', - 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, - }, - ) - - def 
test_remove_jobs(ws, caplog, mock_installation_extra_jobs, any_prompt): sql_backend = MockBackend() install_state = InstallState.from_installation(mock_installation_extra_jobs) @@ -1746,7 +1641,7 @@ def test_user_workspace_installer(mock_ws): assert workspace_installer.install_state.install_folder().startswith("/Users/") -def test_save_config_ext_hms(ws, mock_installation): +def test_save_config_ext_hms(ws, mock_installation) -> None: ws.get_workspace_id.return_value = 12345678 cluster_policy = { "spark_conf.spark.hadoop.javax.jdo.option.ConnectionURL": {"value": "url"}, @@ -1786,6 +1681,7 @@ def test_save_config_ext_hms(ws, mock_installation): { 'version': 2, 'default_catalog': 'ucx_default', + 'ucx_catalog': 'ucx', 'include_databases': ['db1', 'db2'], 'inventory_database': 'ucx_12345678', 'log_level': 'INFO', diff --git a/tests/unit/source_code/test_queries.py b/tests/unit/source_code/test_queries.py index c9b3a98a88..216a4647a9 100644 --- a/tests/unit/source_code/test_queries.py +++ b/tests/unit/source_code/test_queries.py @@ -33,3 +33,15 @@ def test_query_linter_collects_dfsas_from_queries(name, query, dfsa_paths, is_re assert set(dfsa.path for dfsa in dfsas) == set(dfsa_paths) assert all(dfsa.is_read == is_read for dfsa in dfsas) assert all(dfsa.is_write == is_write for dfsa in dfsas) + + +def test_query_liner_refresh_report_writes_query_problems(migration_index, mock_backend) -> None: + ws = create_autospec(WorkspaceClient) + crawlers = create_autospec(DirectFsAccessCrawler) + linter = QueryLinter(ws, migration_index, crawlers, None) + + linter.refresh_report(mock_backend, inventory_database="test") + + assert mock_backend.has_rows_written_for("`test`.query_problems") + ws.dashboards.list.assert_called_once() + crawlers.assert_not_called() diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index d53b8d1bfb..9c2e1ae646 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -13,7 +13,7 @@ from databricks.sdk.errors import NotFound from databricks.sdk.errors.platform import BadRequest from databricks.sdk.service import jobs, sql -from databricks.sdk.service.catalog import ExternalLocationInfo +from databricks.sdk.service.catalog import ExternalLocationInfo, MetastoreInfo from databricks.sdk.service.compute import ClusterDetails, ClusterSource from databricks.sdk.service.iam import ComplexValue, User from databricks.sdk.service.jobs import Run, RunResultState, RunState @@ -803,8 +803,8 @@ def test_create_catalogs_schemas_handles_existing(ws, caplog) -> None: create_catalogs_schemas(ws, prompts, ctx=WorkspaceContext(ws)) ws.catalogs.list.assert_called_once() - assert "Catalog test already exists. Skipping." in caplog.messages - assert "Schema test in catalog test already exists. Skipping." in caplog.messages + assert "Catalog 'test' already exists. Skipping." in caplog.messages + assert "Schema 'test' in catalog 'test' already exists. Skipping." 
in caplog.messages def test_cluster_remap(ws, caplog): @@ -875,9 +875,23 @@ def test_show_all_metastores(acc_client, caplog): assert 'Matching metastores are:' in caplog.messages -def test_assign_metastore(acc_client, caplog): - with pytest.raises(ValueError): - assign_metastore(acc_client, "123") +def test_assign_metastore_logs_account_id_and_assigns_metastore(caplog, acc_client) -> None: + ctx = AccountContext(acc_client) + acc_client.metastores.list.return_value = [MetastoreInfo(name="test", metastore_id="123")] + + with caplog.at_level(logging.INFO, logger="databricks.labs.ucx.cli"): + assign_metastore(acc_client, "456", ctx=ctx) + + assert "Account ID: 123" in caplog.messages + acc_client.metastore_assignments.create.assert_called_once() + + +def test_create_ucx_catalog_calls_create_catalog(ws) -> None: + prompts = MockPrompts({"Please provide storage location url for catalog: .*": "metastore"}) + + create_catalogs_schemas(ws, prompts, ctx=WorkspaceContext(ws)) + + ws.catalogs.create.assert_called_once() @pytest.mark.parametrize("run_as_collection", [False, True]) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index b91be9fa46..83f1e39e07 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -1,8 +1,25 @@ +import dataclasses +import pytest + from databricks.labs.blueprint.installation import MockInstallation from databricks.labs.ucx.config import WorkspaceConfig +@pytest.mark.parametrize( + "attribute,value", + [ + ("ucx_catalog", "ucx"), + ("default_catalog", "default"), + ], +) +def test_config_attributes(attribute, value) -> None: + """Increase code coverage with this test""" + config = dataclasses.replace(WorkspaceConfig("test"), **{attribute: value}) + + assert getattr(config, attribute) == value + + def test_v1_migrate_zeroconf(): installation = MockInstallation( {'config.yml': {'inventory_database': 'x', 'groups': {}, 'connect': {'host': 'a', 'token': 'b'}}} From 2a09a8f4ae81fb44a14d7613ef68b58c9df2cc7e Mon Sep 17 00:00:00 2001 From: Hari Selvarajan <105197202+HariGS-DB@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:59:03 +0100 Subject: [PATCH 09/21] Fixes issue of circular dependency of migrate-location ACL (#2741) ## Changes Resolves #2529 ### Functionality - [ ] modified existing command: `databricks labs ucx migrate-locations` ### Tests - [ ] added unit tests - [ ] added integration tests --------- Co-authored-by: Amin Movahed Co-authored-by: Amin Movahed <140028681+aminmovahed-db@users.noreply.github.com> --- .../labs/ucx/contexts/application.py | 20 ++++++++----------- .../labs/ucx/hive_metastore/grants.py | 6 +++--- tests/unit/azure/test_locations.py | 2 +- .../hive_metastore/test_principal_grants.py | 14 ++++++++----- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index d5f506ac1e..d595eb137f 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -18,7 +18,6 @@ from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core -from databricks.sdk.errors import NotFound from databricks.sdk.service import sql from databricks.labs.ucx.account.workspaces import WorkspaceInfo @@ -310,18 +309,15 @@ def aws_acl(self): ) @cached_property - def principal_locations(self): - eligible_locations = {} - 
try: + def principal_locations_retriever(self): + def inner(): if self.is_azure: - eligible_locations = self.azure_acl.get_eligible_locations_principals() + return self.azure_acl.get_eligible_locations_principals() if self.is_aws: - eligible_locations = self.aws_acl.get_eligible_locations_principals() - if self.is_gcp: - raise NotImplementedError("Not implemented for GCP.") - except NotFound: - pass - return eligible_locations + return self.aws_acl.get_eligible_locations_principals() + raise NotImplementedError("Not implemented for GCP.") + + return inner @cached_property def principal_acl(self): @@ -331,7 +327,7 @@ def principal_acl(self): self.installation, self.tables_crawler, self.mounts_crawler, - self.principal_locations, + self.principal_locations_retriever, ) @cached_property diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index a2afa473f3..8673779697 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -579,7 +579,7 @@ def __init__( installation: Installation, tables_crawler: TablesCrawler, mounts_crawler: Mounts, - cluster_locations: list[ComputeLocations], + cluster_locations: Callable[[], list[ComputeLocations]], ): self._backend = backend self._ws = ws @@ -593,7 +593,7 @@ def get_interactive_cluster_grants(self) -> list[Grant]: mounts = list(self._mounts_crawler.snapshot()) grants: set[Grant] = set() - for compute_location in self._compute_locations: + for compute_location in self._compute_locations(): principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type) if len(principals) == 0: continue @@ -697,7 +697,7 @@ def apply_location_acl(self): "CREATE EXTERNAL VOLUME and READ_FILES for existing eligible interactive cluster users" ) # get the eligible location mapped for each interactive cluster - for compute_location in self._compute_locations: + for compute_location in self._compute_locations(): # get interactive cluster users principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type) if len(principals) == 0: diff --git a/tests/unit/azure/test_locations.py b/tests/unit/azure/test_locations.py index 2b6e8a177a..f1b901638b 100644 --- a/tests/unit/azure/test_locations.py +++ b/tests/unit/azure/test_locations.py @@ -30,7 +30,7 @@ def location_migration_for_test(ws, mock_backend, mock_installation, azurerm=Non azure_resource_permissions = AzureResourcePermissions(mock_installation, ws, azurerm, location_crawler) tables_crawler = TablesCrawler(mock_backend, 'ucx') mounts_crawler = Mounts(mock_backend, ws, 'ucx') - principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, {}) + principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, lambda: []) external_locations_migration = ExternalLocationsMigration( ws, location_crawler, azure_resource_permissions, azurerm, principal_acl ) diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py index e62a998905..bef584fdab 100644 --- a/tests/unit/hive_metastore/test_principal_grants.py +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -157,11 +157,15 @@ def principal_acl(w, install, cluster_spn: list, warehouse_spn: list): spn_crawler = create_autospec(AzureServicePrincipalCrawler) spn_crawler.get_cluster_to_storage_mapping.return_value = cluster_spn - locations = [] - 
if w.config.is_azure: - locations = azure_acl(w, install, cluster_spn, warehouse_spn).get_eligible_locations_principals() - if w.config.is_aws: - locations = aws_acl(w, install).get_eligible_locations_principals() + + def inner(): + if w.config.is_azure: + return azure_acl(w, install, cluster_spn, warehouse_spn).get_eligible_locations_principals() + if w.config.is_aws: + return aws_acl(w, install).get_eligible_locations_principals() + return None + + locations = inner return PrincipalACL(w, sql_backend, install, table_crawler, mount_crawler, locations) From d1dd0c5f684b0096acd8b447997893cd05c4211a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 25 Sep 2024 14:09:42 +0200 Subject: [PATCH 10/21] Added static code analysis results to assessment dashboard (#2696) ## Changes Add job/query problem widgets to the dashboard Add directfs access widget to the dashboard ### Linked issues Resolves #2595 ### Functionality None ### Tests - [x] added integration tests using mock data - [x] manually tested widgets, see below: https://github.com/user-attachments/assets/3684c30f-761a-4de6-bc67-de650c5d5353 --------- Co-authored-by: Eric Vergnaud Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> --- README.md | 16 ++- .../main/35_0_code_compatibility_problems.md | 8 ++ ...de_compatibility_problems_in_workflows.sql | 35 ++++++ ...e_compatibility_problems_in_dashboards.sql | 29 +++++ .../36_0_direct_filesystem_access_problems.md | 14 +++ .../main/36_1_direct_filesystem_accesses.sql | 62 ++++++++++ src/databricks/labs/ucx/source_code/graph.py | 3 +- src/databricks/labs/ucx/source_code/jobs.py | 8 +- .../labs/ucx/source_code/queries.py | 8 +- .../integration/assessment/test_dashboards.py | 111 ++++++++++++++++++ .../integration/assessment/test_workflows.py | 4 +- tests/integration/conftest.py | 62 ++++++++-- tests/integration/source_code/test_jobs.py | 4 +- tests/integration/source_code/test_queries.py | 5 +- tests/unit/source_code/test_queries.py | 2 +- 15 files changed, 344 insertions(+), 27 deletions(-) create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql create mode 100644 src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md create mode 100644 src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql create mode 100644 tests/integration/assessment/test_dashboards.py diff --git a/README.md b/README.md index cc4e05706a..7c5bf044c0 100644 --- a/README.md +++ b/README.md @@ -399,6 +399,9 @@ which can be used for further analysis and decision-making through the [assessme 9. `assess_pipelines`: This task scans through all the Pipelines and identifies those pipelines that have Azure Service Principals embedded in their configurations. A list of all the pipelines with matching configurations is stored in the `$inventory.pipelines` table. 10. `assess_azure_service_principals`: This task scans through all the clusters configurations, cluster policies, job cluster configurations, Pipeline configurations, and Warehouse configuration and identifies all the Azure Service Principals who have been given access to the Azure storage accounts via spark configurations referred in those entities. 
The list of all the Azure Service Principals referred in those configurations is saved in the `$inventory.azure_service_principals` table. 11. `assess_global_init_scripts`: This task scans through all the global init scripts and identifies if there is an Azure Service Principal who has been given access to the Azure storage accounts via spark configurations referred in those scripts. +12. `assess_dashboards`: This task scans through all the dashboards and analyzes embedded queries for migration problems. It also collects direct filesystem access patterns that require attention. +13. `assess_workflows`: This task scans through all the jobs and tasks and analyzes notebooks and files for migration problems. It also collects direct filesystem access patterns that require attention. + ![report](docs/assessment-report.png) @@ -714,11 +717,16 @@ in the Migration dashboard. > Please note that this is an experimental workflow. -The `experimental-workflow-linter` workflow lints accessible code belonging to all workflows/jobs present in the -workspace. The linting emits problems indicating what to resolve for making the code Unity Catalog compatible. +The `experimental-workflow-linter` workflow lints accessible code from 2 sources: + - all workflows/jobs present in the workspace + - all dashboards/queries present in the workspace +The linting emits problems indicating what to resolve for making the code Unity Catalog compatible. +The linting also locates direct filesystem access that need to be migrated. -Once the workflow completes, the output will be stored in `$inventory_database.workflow_problems` table, and displayed -in the Migration dashboard. +Once the workflow completes: + - problems are stored in the `$inventory_database.workflow_problems`/`$inventory_database.query_problems` table + - direct filesystem access are stored in the `$inventory_database.directfs_in_paths`/`$inventory_database.directfs_in_queries` table + - all the above are displayed in the Migration dashboard. ![code compatibility problems](docs/code_compatibility_problems.png) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md b/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md new file mode 100644 index 0000000000..851d95a766 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md @@ -0,0 +1,8 @@ +## Code compatibility problems + +The tables below assist with verifying if workflows and dashboards are Unity Catalog compatible. It can be filtered on the path, +problem code and workflow name. +Each row: +- Points to a problem detected in the code using the code path, query or workflow & task reference and start/end line & column; +- Explains the problem with a human-readable message and a code. 
+ diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql new file mode 100644 index 0000000000..64a7e956a5 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql @@ -0,0 +1,35 @@ +/* +--title 'Workflow migration problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "/#workspace/{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"}, + {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, + {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, + {"fieldName": "workflow_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_name"}, + {"fieldName": "task_key", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/jobs/{{ workflow_id }}/tasks/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "task_key"}, + {"fieldName": "start_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_line"}, + {"fieldName": "start_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_col"}, + {"fieldName": "end_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_line"}, + {"fieldName": "end_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_col"} + ]}, + "invisibleColumns": [ + {"name": "link", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "link"}, + {"name": "workflow_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_id"} + ] + }}' +*/ +SELECT + substring_index(path, '@databricks.com/', -1) as path, + path as link, + code, + message, + job_id AS workflow_id, + job_name AS workflow_name, + task_key, + start_line, + start_col, + end_line, + end_col +FROM inventory.workflow_problems diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql new file mode 100644 index 0000000000..a918e9682c --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql @@ -0,0 +1,29 @@ +/* +--title 'Dashboard compatibility problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, + {"fieldName": "message", "booleanValues": ["false", "true"], 
"type": "string", "displayAs": "string", "title": "message"}, + {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"}, + {"fieldName": "query_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/editor/{{ query_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"} + ]}, + "invisibleColumns": [ + {"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"}, + {"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"}, + {"name": "query_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_parent"}, + {"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"} + ] + }}' +*/ +SELECT + dashboard_id, + dashboard_parent, + dashboard_name, + query_id, + query_parent, + query_name, + code, + message +FROM inventory.query_problems diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md b/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md new file mode 100644 index 0000000000..3f5b255681 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md @@ -0,0 +1,14 @@ +--- +height: 4 +--- + +# Direct filesystem access problems + +The table below assists with verifying if workflows and dashboards require direct filesystem access. +As a reminder, `dbfs:/` is not supported in Unity Catalog, and more generally direct filesystem access is discouraged. +Rather, data should be accessed via Unity tables. + +Each row: +- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference and start/end line & column; +- Provides the _lineage_ i.e. which `workflow -> task -> notebook...` execution sequence leads to that access. 
+ diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql new file mode 100644 index 0000000000..b038dbfbc8 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql @@ -0,0 +1,62 @@ +/* +--title 'Direct filesystem access problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "location", "title": "location", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "is_read", "title": "is_read", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]}, + {"fieldName": "is_write", "title": "is_write", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]}, + {"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ source_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, + {"fieldName": "timestamp", "title": "last_modified", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ lineage_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]}, + {"fieldName": "assessment_start", "title": "assessment_start", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}, + {"fieldName": "assessment_end", "title": "assessment_end", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]} + ]}, + "invisibleColumns": [ + {"fieldName": "source_link", "title": "source_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_type", "title": "lineage_type", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]} + ] + }}' +*/ +SELECT + path as location, + is_read, + is_write, + if( startswith(source_id, '/'), substring_index(source_id, '@databricks.com/', -1), split_part(source_id, '/', 2)) as source, + if( startswith(source_id, '/'), concat('/#workspace/', source_id), concat('/sql/editor/', split_part(source_id, '/', 2))) as source_link, + source_timestamp as `timestamp`, + case + when lineage.object_type = 'WORKFLOW' then concat('Workflow: ', lineage.other.name) + when lineage.object_type = 'TASK' then concat('Task: ', split_part(lineage.object_id, '/', 2)) + when lineage.object_type = 'NOTEBOOK' then concat('Notebook: ', substring_index(lineage.object_id, '@databricks.com/', -1)) + when lineage.object_type = 'FILE' then concat('File: ', substring_index(lineage.object_id, '@databricks.com/', -1)) + when lineage.object_type = 'DASHBOARD' then concat('Dashboard: ', lineage.other.name) + when lineage.object_type = 'QUERY' then concat('Query: ', lineage.other.name) + end as 
lineage, + lineage.object_type as lineage_type, + lineage.object_id as lineage_id, + case + when lineage.object_type = 'WORKFLOW' then concat('/jobs/', lineage.object_id) + when lineage.object_type = 'TASK' then concat('/jobs/', split_part(lineage.object_id, '/', 1), '/tasks/', split_part(lineage.object_id, '/', 2)) + when lineage.object_type = 'NOTEBOOK' then concat('/#workspace/', lineage.object_id) + when lineage.object_type = 'FILE' then concat('/#workspace/', lineage.object_id) + when lineage.object_type = 'DASHBOARD' then concat('/sql/dashboards/', lineage.object_id) + when lineage.object_type = 'QUERY' then concat('/sql/editor/', split_part(lineage.object_id, '/', 2)) + end as lineage_link, + lineage.other as lineage_data, + assessment_start, + assessment_end +from (SELECT + path, + is_read, + is_write, + source_id, + source_timestamp, + explode(source_lineage) as lineage, + assessment_start_timestamp as assessment_start, + assessment_end_timestamp as assessment_end +FROM inventory.directfs) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 347690f067..a722e9ef3c 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -337,7 +337,8 @@ def __repr__(self): @property def lineage(self) -> list[LineageAtom]: - return [LineageAtom(object_type="PATH", object_id=str(self.path))] + object_type = "NOTEBOOK" if is_a_notebook(self.path) else "FILE" + return [LineageAtom(object_type=object_type, object_id=str(self.path))] class SourceContainer(abc.ABC): diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b5a4e47991..eb50b7ef31 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -86,8 +86,8 @@ def __repr__(self): @property def lineage(self) -> list[LineageAtom]: job_name = (None if self._job.settings is None else self._job.settings.name) or "unknown job" - job_lineage = LineageAtom("JOB", str(self._job.job_id), {"name": job_name}) - task_lineage = LineageAtom("TASK", self._task.task_key) + job_lineage = LineageAtom("WORKFLOW", str(self._job.job_id), {"name": job_name}) + task_lineage = LineageAtom("TASK", f"{self._job.job_id}/{self._task.task_key}") return [job_lineage, task_lineage] @@ -469,8 +469,8 @@ def _collect_task_dfsas( job_name = job.settings.name if job.settings and job.settings.name else "" for dfsa in DfsaCollectorWalker(graph, set(), self._path_lookup, session_state): atoms = [ - LineageAtom(object_type="JOB", object_id=job_id, other={"name": job_name}), - LineageAtom(object_type="TASK", object_id=task.task_key), + LineageAtom(object_type="WORKFLOW", object_id=job_id, other={"name": job_name}), + LineageAtom(object_type="TASK", object_id=f"{job_id}/{task.task_key}"), ] yield dataclasses.replace(dfsa, source_lineage=atoms + dfsa.source_lineage) diff --git a/src/databricks/labs/ucx/source_code/queries.py b/src/databricks/labs/ucx/source_code/queries.py index 5bd11b1bc7..26a2c14a26 100644 --- a/src/databricks/labs/ucx/source_code/queries.py +++ b/src/databricks/labs/ucx/source_code/queries.py @@ -66,7 +66,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): linted_queries.add(query.id) problems = self.lint_query(query) all_problems.extend(problems) - dfsas = self.collect_dfsas_from_query(query) + dfsas = self.collect_dfsas_from_query("no-dashboard-id", query) all_dfsas.extend(dfsas) # dump problems logger.info(f"Saving 
{len(all_problems)} linting problems...") @@ -123,7 +123,7 @@ def _lint_and_collect_from_dashboard( dashboard_name=dashboard_name, ) ) - dfsas = self.collect_dfsas_from_query(query) + dfsas = self.collect_dfsas_from_query(dashboard_id, query) for dfsa in dfsas: atom = LineageAtom( object_type="DASHBOARD", @@ -155,11 +155,11 @@ def lint_query(self, query: LegacyQuery) -> Iterable[QueryProblem]: ) @classmethod - def collect_dfsas_from_query(cls, query: LegacyQuery) -> Iterable[DirectFsAccess]: + def collect_dfsas_from_query(cls, dashboard_id: str, query: LegacyQuery) -> Iterable[DirectFsAccess]: if query.query is None: return linter = DirectFsAccessSqlLinter() - source_id = query.id or "no id" + source_id = f"{dashboard_id}/{query.id}" source_name = query.name or "" source_timestamp = cls._read_timestamp(query.updated_at) source_lineage = [LineageAtom(object_type="QUERY", object_id=source_id, other={"name": source_name})] diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py new file mode 100644 index 0000000000..e95193faf5 --- /dev/null +++ b/tests/integration/assessment/test_dashboards.py @@ -0,0 +1,111 @@ +from datetime import datetime, timezone, timedelta + +import pytest + +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, LineageAtom +from databricks.labs.ucx.source_code.jobs import JobProblem +from databricks.sdk.service.iam import PermissionLevel + +from databricks.labs.ucx.source_code.queries import QueryProblem + + +def _populate_workflow_problems(installation_ctx): + job_problems = [ + JobProblem( + job_id=12345, + job_name="Peter the Job", + task_key="23456", + path="parent/child.py", + code="sql-parse-error", + message="Could not parse SQL", + start_line=1234, + start_col=22, + end_line=1234, + end_col=32, + ) + ] + installation_ctx.sql_backend.save_table( + f'{installation_ctx.inventory_database}.workflow_problems', + job_problems, + JobProblem, + mode='overwrite', + ) + + +def _populate_dashboard_problems(installation_ctx): + query_problems = [ + QueryProblem( + dashboard_id="12345", + dashboard_parent="dashbards/parent", + dashboard_name="my_dashboard", + query_id="23456", + query_parent="queries/parent", + query_name="my_query", + code="sql-parse-error", + message="Could not parse SQL", + ) + ] + installation_ctx.sql_backend.save_table( + f'{installation_ctx.inventory_database}.query_problems', + query_problems, + QueryProblem, + mode='overwrite', + ) + + +def _populate_directfs_problems(installation_ctx): + dfsas = [ + DirectFsAccess( + path="some_path", + is_read=False, + is_write=True, + source_id="xyz.py", + source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow"), + LineageAtom(object_type="TASK", object_id="my_workflow/my_task"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook"), + LineageAtom(object_type="FILE", object_id="my file"), + ], + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), + assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), + ) + ] + installation_ctx.directfs_access_crawler_for_paths.dump_all(dfsas) + dfsas = [ + DirectFsAccess( + path="some_path", + is_read=False, + is_write=True, + source_id="xyz.py", + source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="DASHBOARD", object_id="my_dashboard"), + LineageAtom(object_type="QUERY", 
object_id="my_dashboard/my_query"), + ], + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), + assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), + ) + ] + installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas) + + +@pytest.mark.skip("Development tool") +def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy, make_cluster_policy_permissions): + """the purpose of this test is to prepopulate data used by the dashboard without running an actual -lengthy- assessment""" + ucx_group, _ = installation_ctx.make_ucx_group() + cluster_policy = make_cluster_policy() + make_cluster_policy_permissions( + object_id=cluster_policy.policy_id, + permission_level=PermissionLevel.CAN_USE, + group_name=ucx_group.display_name, + ) + installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] + installation_ctx.workspace_installation.run() + # populate data + _populate_workflow_problems(installation_ctx) + _populate_dashboard_problems(installation_ctx) + _populate_directfs_problems(installation_ctx) + print(f"\nInventory database is {installation_ctx.inventory_database}\n") + # put a breakpoint here + print("Put a breakpoint here! Then go check the dashboard in your workspace ;-)\n") diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index 7955e93c69..1110528bcf 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -5,7 +5,7 @@ from databricks.sdk.service.iam import PermissionLevel -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) +@retried(on=[NotFound, InvalidParameterValue]) def test_running_real_assessment_job( ws, installation_ctx, @@ -25,7 +25,7 @@ def test_running_real_assessment_job( populate_for_linting(installation_ctx.installation) - installation_ctx.deployed_workflows.run_workflow("assessment") + installation_ctx.deployed_workflows.run_workflow("assessment", max_wait=timedelta(minutes=25)) assert installation_ctx.deployed_workflows.validate_step("assessment") after = installation_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index d2f48cdfa7..9e18b42284 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -28,8 +28,10 @@ from databricks.sdk.retries import retried from databricks.sdk.service import iam from databricks.sdk.service.catalog import FunctionInfo, SchemaInfo, TableInfo -from databricks.sdk.service.iam import Group +from databricks.sdk.service.compute import ClusterSpec from databricks.sdk.service.dashboards import Dashboard as SDKDashboard +from databricks.sdk.service.iam import Group +from databricks.sdk.service.jobs import Task, SparkPythonTask from databricks.sdk.service.sql import Dashboard, WidgetPosition, WidgetOptions, LegacyQuery from databricks.labs.ucx.__about__ import __version__ @@ -1236,19 +1238,65 @@ def _run(command: str) -> str: @pytest.fixture -def populate_for_linting(ws, make_random, make_job, make_notebook, make_query, make_dashboard, watchdog_purge_suffix): +def create_file_job(ws, make_random, watchdog_remove_after, watchdog_purge_suffix, log_workspace_link): + + def create(installation, **_kwargs): + # create args + data = {"name": f"dummy-{make_random(4)}"} + # create file to run + file_name = 
f"dummy_{make_random(4)}_{watchdog_purge_suffix}" + file_path = WorkspacePath(ws, installation.install_folder()) / file_name + file_path.write_text("spark.read.parquet('dbfs://mnt/foo/bar')") + task = Task( + task_key=make_random(4), + description=make_random(4), + new_cluster=ClusterSpec( + num_workers=1, + node_type_id=ws.clusters.select_node_type(local_disk=True, min_memory_gb=16), + spark_version=ws.clusters.select_spark_version(latest=True), + ), + spark_python_task=SparkPythonTask(python_file=str(file_path)), + timeout_seconds=0, + ) + data["tasks"] = [task] + # add RemoveAfter tag for job cleanup + data["tags"] = [{"key": "RemoveAfter", "value": watchdog_remove_after}] + job = ws.jobs.create(**data) + log_workspace_link(data["name"], f'job/{job.job_id}', anchor=False) + return job + + yield from factory("job", create, lambda item: ws.jobs.delete(item.job_id)) + + +@pytest.fixture +def populate_for_linting( + ws, + make_random, + make_job, + make_notebook, + make_query, + make_dashboard, + create_file_job, + watchdog_purge_suffix, +): + + def create_notebook_job(installation): + path = Path(installation.install_folder()) / f"dummy_{make_random(4)}_{watchdog_purge_suffix}" + notebook_text = "spark.read.parquet('dbfs://mnt/foo1/bar1')" + notebook_path = make_notebook(path=path, content=io.BytesIO(notebook_text.encode("utf-8"))) + return make_job(notebook_path=notebook_path) + def populate_workspace(installation): # keep linting scope to minimum to avoid test timeouts - path = Path(installation.install_folder()) / f"dummy-{make_random(4)}-{watchdog_purge_suffix}" - notebook_path = make_notebook(path=path, content=io.BytesIO(b"spark.read.parquet('dbfs://mnt/foo/bar')")) - job = make_job(notebook_path=notebook_path) - query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo/bar`') + file_job = create_file_job(installation=installation) + notebook_job = create_notebook_job(installation=installation) + query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo2/bar2`') dashboard = make_dashboard(query=query) # can't use installation.load(WorkspaceConfig)/installation.save() because they populate empty credentials config_path = WorkspacePath(ws, installation.install_folder()) / "config.yml" text = config_path.read_text() config = yaml.safe_load(text) - config["include_job_ids"] = [job.job_id] + config["include_job_ids"] = [file_job.job_id, notebook_job.job_id] config["include_dashboard_ids"] = [dashboard.id] text = yaml.dump(config) config_path.unlink() diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 4c5ae4f758..451b5a3815 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -163,7 +163,7 @@ def test_job_linter_some_notebook_graph_with_problems( assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) assert len(dfsas) == 2 - task_keys = set(task.task_key for task in j.settings.tasks) + task_keys = set(f"{j.job_id}/{task.task_key}" for task in j.settings.tasks) yesterday = datetime.now(timezone.utc) - timedelta(days=1) for dfsa in dfsas: assert dfsa.source_id != DirectFsAccess.UNKNOWN @@ -172,7 +172,7 @@ def test_job_linter_some_notebook_graph_with_problems( assert dfsa.assessment_start_timestamp > yesterday assert dfsa.assessment_end_timestamp > yesterday assert dfsa.source_lineage[0] == LineageAtom( - object_type="JOB", object_id=str(j.job_id), other={"name": j.settings.name} + object_type="WORKFLOW", 
object_id=str(j.job_id), other={"name": j.settings.name} ) assert dfsa.source_lineage[1].object_type == "TASK" assert dfsa.source_lineage[1].object_id in task_keys diff --git a/tests/integration/source_code/test_queries.py b/tests/integration/source_code/test_queries.py index 5ecfbf45bd..10d4ded773 100644 --- a/tests/integration/source_code/test_queries.py +++ b/tests/integration/source_code/test_queries.py @@ -13,7 +13,8 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend assert len(problems) == 1 crawler = DirectFsAccessCrawler.for_queries(sql_backend, simple_ctx.inventory_database) all_dfsas = crawler.snapshot() - dfsas = [dfsa for dfsa in all_dfsas if dfsa.source_id == query.id] + source_id = f"{_dashboard.id}/{query.id}" + dfsas = [dfsa for dfsa in all_dfsas if dfsa.source_id == source_id] assert len(dfsas) == 1 dfsa = dfsas[0] assert len(dfsa.source_lineage) == 2 @@ -25,6 +26,6 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend assert lineage.other.get("name", None) == _dashboard.name lineage = dfsa.source_lineage[1] assert lineage.object_type == "QUERY" - assert lineage.object_id == query.id + assert lineage.object_id == source_id assert lineage.other assert lineage.other.get("name", None) == query.name diff --git a/tests/unit/source_code/test_queries.py b/tests/unit/source_code/test_queries.py index 216a4647a9..0ed1e187ba 100644 --- a/tests/unit/source_code/test_queries.py +++ b/tests/unit/source_code/test_queries.py @@ -27,7 +27,7 @@ def test_query_linter_collects_dfsas_from_queries(name, query, dfsa_paths, is_re crawlers = create_autospec(DirectFsAccessCrawler) query = LegacyQuery.from_dict({"parent": "workspace", "name": name, "query": query}) linter = QueryLinter(ws, migration_index, crawlers, None) - dfsas = linter.collect_dfsas_from_query(query) + dfsas = linter.collect_dfsas_from_query("no-dashboard-id", query) ws.assert_not_called() crawlers.assert_not_called() assert set(dfsa.path for dfsa in dfsas) == set(dfsa_paths) From 7cba9b0ed9bccacc1c23b6fa774be857e3463fce Mon Sep 17 00:00:00 2001 From: Pritish Pai <136742693+pritishpai@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:10:37 -0400 Subject: [PATCH 11/21] Increases test coverage (#2739) ## Changes Increases coverage ### Functionality - [x] fixes and modifies testing warnings ### Tests - [x] modified unit tests --- tests/unit/hive_metastore/test_table_size.py | 43 +++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/tests/unit/hive_metastore/test_table_size.py b/tests/unit/hive_metastore/test_table_size.py index 8464a5dd3b..8216a0a780 100644 --- a/tests/unit/hive_metastore/test_table_size.py +++ b/tests/unit/hive_metastore/test_table_size.py @@ -1,3 +1,4 @@ +import logging import sys from databricks.labs.lsql.backends import MockBackend @@ -41,11 +42,11 @@ def test_table_size_crawler(mocker): assert TableSize("hive_metastore", "db1", "table2", 200) in results -def test_table_size_unknown_error(mocker): +def test_table_size_unknown_error(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -56,16 +57,17 @@ def test_table_size_unknown_error(mocker): tsc = TableSizeCrawler(backend, "inventory_database") tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = 
Exception(...) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 -def test_table_size_table_or_view_not_found(mocker): +def test_table_size_table_or_view_not_found(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -80,16 +82,18 @@ def test_table_size_table_or_view_not_found(mocker): "[TABLE_OR_VIEW_NOT_FOUND]" ) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 + assert "Failed to evaluate hive_metastore.db1.table1 table size. Table not found" in caplog.text -def test_table_size_delta_table_not_found(mocker): +def test_table_size_delta_table_not_found(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -104,16 +108,18 @@ def test_table_size_delta_table_not_found(mocker): "[DELTA_TABLE_NOT_FOUND]" ) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 + assert "Failed to evaluate hive_metastore.db1.table1 table size. Table not found" in caplog.text -def test_table_size_when_table_corrupted(mocker): +def test_table_size_when_table_corrupted(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -127,16 +133,18 @@ def test_table_size_when_table_corrupted(mocker): "[DELTA_MISSING_TRANSACTION_LOG]" ) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 + assert "Delta table hive_metastore.db1.table1 is corrupted: missing transaction log" in caplog.text -def test_table_size_when_delta_invalid_format_error(mocker): +def test_table_size_when_delta_invalid_format_error(mocker, caplog): errors = {} rows = { "table_size": [], - "hive_metastore.inventory_database.tables": [ + "`hive_metastore`.`inventory_database`.`tables`": [ ("hive_metastore", "db1", "table1", "MANAGED", "DELTA", "dbfs:/location/table", None), ], "SHOW DATABASES": [("db1",)], @@ -150,6 +158,11 @@ def test_table_size_when_delta_invalid_format_error(mocker): "[DELTA_INVALID_FORMAT]" ) - results = tsc.snapshot() + with caplog.at_level(logging.WARNING): + results = tsc.snapshot() assert len(results) == 0 + assert ( + "Unable to read Delta table hive_metastore.db1.table1, please check table structure and try again." + in caplog.text + ) From 04a4956bda94cec58ca8b70f59455ca2d42decbb Mon Sep 17 00:00:00 2001 From: Pritish Pai <136742693+pritishpai@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:14:45 -0400 Subject: [PATCH 12/21] Fixes source table alias dissapearance during migrate_views (#2726) ## Changes The alias for the source table is disappearing while converting the create view sql from legacy hive metastore to uc catalog. 
### Linked issues Resolves #2661 ### Functionality - [x] fixes sql conversion ### Tests - [x] manually tested - [x] added unit tests - [ ] added integration tests - [ ] verified on staging environment (screenshot attached) --------- Co-authored-by: Liran Bareket --- .../ucx/source_code/linters/from_table.py | 2 +- .../hive_metastore/test_migrate.py | 40 +++++++++++++++++++ .../unit/hive_metastore/test_view_migrate.py | 25 ++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/linters/from_table.py b/src/databricks/labs/ucx/source_code/linters/from_table.py index 09b2235a74..9da4dd7a2c 100644 --- a/src/databricks/labs/ucx/source_code/linters/from_table.py +++ b/src/databricks/labs/ucx/source_code/linters/from_table.py @@ -99,7 +99,7 @@ def apply(self, code: str) -> str: dst = self._index.get(src_schema, old_table.name) if not dst: continue - new_table = Table(catalog=dst.dst_catalog, db=dst.dst_schema, this=dst.dst_table) + new_table = Table(catalog=dst.dst_catalog, db=dst.dst_schema, this=dst.dst_table, alias=old_table.alias) old_table.replace(new_table) new_sql = statement.sql('databricks') new_statements.append(new_sql) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 6779f0c84e..14afdfc527 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -341,6 +341,46 @@ def test_migrate_view(ws, sql_backend, runtime_ctx, make_catalog): assert "(col1,col2)" in view3_view_text.replace("\n", "").replace(" ", "").lower() +@retried(on=[NotFound], timeout=timedelta(minutes=3)) +def test_migrate_view_alias_test(ws, sql_backend, runtime_ctx, make_catalog): + src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore") + src_managed_table = runtime_ctx.make_table(catalog_name=src_schema.catalog_name, schema_name=src_schema.name) + src_view1 = runtime_ctx.make_table( + catalog_name=src_schema.catalog_name, + schema_name=src_schema.name, + ctas=f"SELECT A.* FROM {src_managed_table.full_name} AS A", + view=True, + ) + + dst_catalog = make_catalog() + dst_schema = runtime_ctx.make_schema(catalog_name=dst_catalog.name, name=src_schema.name) + + logger.info(f"dst_catalog={dst_catalog.name}, managed_table={src_managed_table.full_name}") + + rules = [ + Rule.from_src_dst(src_managed_table, dst_schema), + Rule.from_src_dst(src_view1, dst_schema), + ] + + runtime_ctx.with_table_mapping_rules(rules) + runtime_ctx.with_dummy_resource_permission() + runtime_ctx.tables_migrator.index() + runtime_ctx.tables_migrator.migrate_tables(what=What.DBFS_ROOT_DELTA) + runtime_ctx.migration_status_refresher.snapshot() + runtime_ctx.tables_migrator.migrate_tables(what=What.VIEW) + target_tables = list(sql_backend.fetch(f"SHOW TABLES IN {dst_schema.full_name}")) + assert len(target_tables) == 2 + + target_table_properties = ws.tables.get(f"{dst_schema.full_name}.{src_managed_table.name}").properties + assert target_table_properties["upgraded_from"] == src_managed_table.full_name + assert target_table_properties[Table.UPGRADED_FROM_WS_PARAM] == str(ws.get_workspace_id()) + view1_view_text = ws.tables.get(f"{dst_schema.full_name}.{src_view1.name}").view_definition + assert ( + view1_view_text + == f"SELECT `A`.* FROM `{dst_schema.catalog_name}`.`{dst_schema.name}`.`{src_managed_table.name}` AS `A`" + ) + + @retried(on=[NotFound], timeout=timedelta(minutes=2)) def test_revert_migrated_table(sql_backend, runtime_ctx, make_catalog): 
src_schema1 = runtime_ctx.make_schema(catalog_name="hive_metastore") diff --git a/tests/unit/hive_metastore/test_view_migrate.py b/tests/unit/hive_metastore/test_view_migrate.py index 4d6b0cab73..0715b00dc6 100644 --- a/tests/unit/hive_metastore/test_view_migrate.py +++ b/tests/unit/hive_metastore/test_view_migrate.py @@ -37,6 +37,31 @@ def test_view_to_migrate_sql_migrate_view_sql(): assert sql == expected_query +def test_view_with_alias_to_migrate_sql_migrate_view_sql(): + expected_query = "CREATE OR REPLACE VIEW IF NOT EXISTS `cat1`.`schema1`.`dest_view1` AS SELECT `A`.* FROM `cat1`.`schema1`.`dest_table1` AS `A`" + view = Table( + object_type="VIEW", + table_format="VIEW", + catalog="hive_metastore", + database="test_schema1", + name="test_view1", + # The view text is overwritten with the create view statement before running the sql migrate view + view_text="CREATE OR REPLACE VIEW hive_metastore.test_schema1.test_view1 AS SELECT A.* FROM test_schema1.test_table1 AS A", + ) + rule = Rule("workspace", "cat1", "test_schema1", "schema1", "test_view1", "dest_view1") + view_to_migrate = ViewToMigrate(view, rule) + migration_index = TableMigrationIndex( + [ + TableMigrationStatus("test_schema1", "test_table1", "cat1", "schema1", "dest_table1"), + TableMigrationStatus("test_schema1", "test_view1"), + ] + ) + + sql = view_to_migrate.sql_migrate_view(migration_index) + + assert sql == expected_query + + @pytest.fixture(scope="session") def samples() -> dict[str, dict[str, str]]: path = Path(Path(__file__).parent, "tables", "tables_and_views.json") From 15c553681a685ac3f968adf5ff5c3ac19aaf1fbd Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 25 Sep 2024 15:12:39 +0200 Subject: [PATCH 13/21] Bump astroid version, pylint version and drop our f-string workaround (#2746) ## Changes Our code works around a limitation of astroid < 3.3 where f-strings are not inferred This PR: - updates pylint and astroid - drops workarounds - fixes corresponding tests ### Linked issues None ### Functionality None ### Tests - [x] updated unit tests --------- Co-authored-by: Eric Vergnaud --- pyproject.toml | 11 +++++++---- .../ucx/source_code/python/python_infer.py | 18 ------------------ .../source_code/python/test_python_infer.py | 16 ---------------- .../functional/file-access/direct-fs2.py | 4 +--- ...string.py => sys-path-with-uninferrable.py} | 2 +- tests/unit/source_code/test_dependencies.py | 15 +++------------ tests/unit/source_code/test_notebook.py | 7 +++---- 7 files changed, 15 insertions(+), 58 deletions(-) rename tests/unit/source_code/samples/{sys-path-with-fstring.py => sys-path-with-uninferrable.py} (83%) diff --git a/pyproject.toml b/pyproject.toml index 66112a81f5..6ac648bdb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dependencies = ["databricks-sdk~=0.30", "databricks-labs-blueprint>=0.8,<0.9", "PyYAML>=6.0.0,<7.0.0", "sqlglot>=25.5.0,<25.23", - "astroid>=3.2.2"] + "astroid>=3.3.1"] [project.optional-dependencies] pylsp = [ @@ -74,7 +74,7 @@ dependencies = [ "black~=24.3.0", "coverage[toml]~=7.4.4", "mypy~=1.9.0", - "pylint~=3.2.2", + "pylint~=3.3.1", "pylint-pytest==2.0.0a0", "databricks-labs-pylint~=0.4.0", "databricks-labs-pytester>=0.2.1", @@ -209,7 +209,7 @@ fail-under = 10.0 # ignore-list. The regex matches against paths and can be in Posix or Windows # format. Because '\\' represents the directory delimiter on Windows systems, it # can't be used as an escape character. 
- ignore-paths='^tests/unit/source_code/samples/.*$' +ignore-paths='^tests/unit/source_code/samples/.*$' # Files or directories matching the regular expression patterns are skipped. The # regex matches against base names, not paths. The default value ignores Emacs @@ -587,7 +587,10 @@ disable = [ "fixme", "consider-using-assignment-expr", "logging-fstring-interpolation", - "consider-using-any-or-all" + "consider-using-any-or-all", + "too-many-positional-arguments", + "unnecessary-default-type-args", + "logging-not-lazy" ] # Enable the message, report, category or checker with the given id(s). You can diff --git a/src/databricks/labs/ucx/source_code/python/python_infer.py b/src/databricks/labs/ucx/source_code/python/python_infer.py index b1d79da641..f74c287ba9 100644 --- a/src/databricks/labs/ucx/source_code/python/python_infer.py +++ b/src/databricks/labs/ucx/source_code/python/python_infer.py @@ -64,10 +64,6 @@ def _infer_values(cls, node: NodeNG) -> Iterator[Iterable[NodeNG]]: # deal with node types that don't implement 'inferred()' if node is Uninferable or isinstance(node, Const): yield [node] - elif isinstance(node, JoinedStr): - yield from cls._infer_values_from_joined_string(node) - elif isinstance(node, FormattedValue): - yield from _LocalInferredValue.do_infer_values(node.value) else: yield from cls._safe_infer_internal(node) @@ -91,20 +87,6 @@ def _unsafe_infer_internal(cls, node: NodeNG): continue yield from _LocalInferredValue.do_infer_values(inferred) - @classmethod - def _infer_values_from_joined_string(cls, node: NodeNG) -> Iterator[Iterable[NodeNG]]: - assert isinstance(node, JoinedStr) - yield from cls._infer_values_from_joined_values(node.values) - - @classmethod - def _infer_values_from_joined_values(cls, nodes: list[NodeNG]) -> Iterator[Iterable[NodeNG]]: - if len(nodes) == 1: - yield from _LocalInferredValue.do_infer_values(nodes[0]) - return - for firsts in _LocalInferredValue.do_infer_values(nodes[0]): - for remains in cls._infer_values_from_joined_values(nodes[1:]): - yield list(firsts) + list(remains) - def __init__(self, atoms: Iterable[NodeNG]): self._atoms = list(atoms) diff --git a/tests/unit/source_code/python/test_python_infer.py b/tests/unit/source_code/python/test_python_infer.py index 38f3b4cb6f..9afc690a7d 100644 --- a/tests/unit/source_code/python/test_python_infer.py +++ b/tests/unit/source_code/python/test_python_infer.py @@ -88,22 +88,6 @@ def test_infers_fstring_values(): assert strings == ["Hello abc, ghi!", "Hello abc, jkl!", "Hello def, ghi!", "Hello def, jkl!"] -def test_fails_to_infer_cascading_fstring_values(): - # The purpose of this test is to detect a change in astroid support for f-strings - source = """ -value1 = "John" -value2 = f"Hello {value1}" -value3 = f"{value2}, how are you today?" -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[2].value) # value of value3 = ... - values = list(InferredValue.infer_from_node(tree.node)) - # for now, we simply check failure to infer! 
- assert any(not value.is_inferred() for value in values) - # the expected value would be ["Hello John, how are you today?"] - - def test_infers_externally_defined_value(): state = CurrentSessionState() state.named_parameters = {"my-widget": "my-value"} diff --git a/tests/unit/source_code/samples/functional/file-access/direct-fs2.py b/tests/unit/source_code/samples/functional/file-access/direct-fs2.py index c48a44c840..0be4ec4d7f 100644 --- a/tests/unit/source_code/samples/functional/file-access/direct-fs2.py +++ b/tests/unit/source_code/samples/functional/file-access/direct-fs2.py @@ -6,11 +6,9 @@ print(len(result)) index = 10 spark.sql(f"SELECT * FROM table_{index}").collect() -# ucx[cannot-autofix-table-reference:+2:0:+2:40] Can't migrate table_name argument in 'f'SELECT * FROM {table_name}'' because its value cannot be computed table_name = f"table_{index}" spark.sql(f"SELECT * FROM {table_name}").collect() -# ucx[table-migrated-to-uc:+4:4:+4:20] Table old.things is migrated to brand.new.stuff in Unity Catalog -# ucx[cannot-autofix-table-reference:+3:4:+3:20] Can't migrate table_name argument in 'query' because its value cannot be computed +# ucx[table-migrated-to-uc:+3:4:+3:20] Table old.things is migrated to brand.new.stuff in Unity Catalog table_name = f"table_{index}" for query in ["SELECT * FROM old.things", f"SELECT * FROM {table_name}"]: spark.sql(query).collect() diff --git a/tests/unit/source_code/samples/sys-path-with-fstring.py b/tests/unit/source_code/samples/sys-path-with-uninferrable.py similarity index 83% rename from tests/unit/source_code/samples/sys-path-with-fstring.py rename to tests/unit/source_code/samples/sys-path-with-uninferrable.py index 5aac77763b..ddbe6b17e0 100644 --- a/tests/unit/source_code/samples/sys-path-with-fstring.py +++ b/tests/unit/source_code/samples/sys-path-with-uninferrable.py @@ -1,7 +1,7 @@ import sys name_1 = "whatever" -name_2 = f"{name_1}" +name_2 = some_call(name_1) sys.path.append(f"{name_2}") names = [f"{name_2}", name_2] for name in names: diff --git a/tests/unit/source_code/test_dependencies.py b/tests/unit/source_code/test_dependencies.py index 449d071b74..e051aad349 100644 --- a/tests/unit/source_code/test_dependencies.py +++ b/tests/unit/source_code/test_dependencies.py @@ -199,24 +199,15 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So ] -def test_dependency_resolver_raises_problem_for_non_inferable_sys_path(simple_dependency_resolver): +def test_dependency_resolver_raises_problem_for_uninferrable_sys_path(simple_dependency_resolver): maybe = simple_dependency_resolver.build_local_file_dependency_graph( - Path("sys-path-with-fstring.py"), CurrentSessionState() + Path("sys-path-with-uninferrable.py"), CurrentSessionState() ) assert list(maybe.problems) == [ - DependencyProblem( - code='sys-path-cannot-compute-value', - message="Can't update sys.path from f'{name_2}' because the expression cannot be computed", - source_path=Path('sys-path-with-fstring.py'), - start_line=4, - start_col=16, - end_line=4, - end_col=27, - ), DependencyProblem( code='sys-path-cannot-compute-value', message="Can't update sys.path from name because the expression cannot be computed", - source_path=Path('sys-path-with-fstring.py'), + source_path=Path('sys-path-with-uninferrable.py'), start_line=7, start_col=20, end_line=7, diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index c542569f15..bda25a0106 100644 --- a/tests/unit/source_code/test_notebook.py +++ 
b/tests/unit/source_code/test_notebook.py @@ -249,7 +249,7 @@ def test_notebook_builds_python_dependency_graph_with_fstring_loop(mock_path_loo container = maybe.dependency.load(mock_path_lookup) assert container is not None container.build_dependency_graph(graph) - expected_paths = [path, "leaf1.py", "leaf3.py"] + expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"] all_paths = set(d.path for d in graph.all_dependencies) assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths} @@ -279,7 +279,7 @@ def test_does_not_detect_partial_call_to_dbutils_notebook_run_in_python_code_() assert len(nodes) == 0 -def test_raises_advice_when_dbutils_notebook_run_is_too_complex() -> None: +def test_lints_complex_dbutils_notebook_run() -> None: source = """ name1 = "John" name2 = f"{name1}" @@ -287,5 +287,4 @@ def test_raises_advice_when_dbutils_notebook_run_is_too_complex() -> None: """ linter = DbutilsPyLinter(CurrentSessionState()) advices = list(linter.lint(source)) - assert len(advices) == 1 - assert advices[0].code == "notebook-run-cannot-compute-value" + assert not advices From 2611b119f4b84c54c40c8043654a11491fd469fc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 15:13:11 +0200 Subject: [PATCH 14/21] Update databricks-labs-blueprint requirement from <0.9,>=0.8 to >=0.8,<0.10 (#2747) Updates the requirements on [databricks-labs-blueprint](https://github.com/databrickslabs/blueprint) to permit the latest version.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ac648bdb2..7afc837663 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ classifiers = [ dependencies = ["databricks-sdk~=0.30", "databricks-labs-lsql>=0.5,<0.13", - "databricks-labs-blueprint>=0.8,<0.9", + "databricks-labs-blueprint>=0.8,<0.10", "PyYAML>=6.0.0,<7.0.0", "sqlglot>=25.5.0,<25.23", "astroid>=3.3.1"] From 555e83aaf196cb5d0c51357f588e52ebe30d1cba Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 25 Sep 2024 18:05:13 +0200 Subject: [PATCH 15/21] Delete temporary files when running solacc (#2750) ## Changes solacc.py currently lints the entire solacc repo, thus accumulating temporary files to a point that exceeds CI storage capacity This PR fixes the issue by: - lint the repo on a per top-level solacc 'solution' (within the repo, top folders are independent of each other) - delete temp files and dirs registered in PathLookup after linting a solution This PR also prepares for improving false positive detection ### Linked issues None ### Functionality None ### Tests - [x] manually tested --------- Co-authored-by: Eric Vergnaud --- .../ucx/source_code/python/python_infer.py | 2 - tests/integration/source_code/solacc.py | 171 ++++++++++++------ tests/unit/source_code/test_notebook.py | 2 +- 3 files changed, 113 insertions(+), 62 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_infer.py b/src/databricks/labs/ucx/source_code/python/python_infer.py index f74c287ba9..6727745280 100644 --- a/src/databricks/labs/ucx/source_code/python/python_infer.py +++ b/src/databricks/labs/ucx/source_code/python/python_infer.py @@ -10,9 +10,7 @@ Const, decorators, Dict, - FormattedValue, Instance, - JoinedStr, Name, NodeNG, Uninferable, diff --git a/tests/integration/source_code/solacc.py b/tests/integration/source_code/solacc.py index 707c798ef5..e209b5a363 100644 --- a/tests/integration/source_code/solacc.py +++ b/tests/integration/source_code/solacc.py @@ -1,6 +1,8 @@ import logging import os +import shutil import sys +from dataclasses import dataclass, field from pathlib import Path import requests @@ -12,6 +14,7 @@ from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex from databricks.labs.ucx.source_code.base import LocatedAdvice from databricks.labs.ucx.source_code.linters.context import LinterContext +from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger("verify-accelerators") @@ -54,7 +57,7 @@ def collect_missing_imports(advices: list[LocatedAdvice]): return missing_imports -def collect_not_computed(advices: list[LocatedAdvice]): +def collect_uninferrable_count(advices: list[LocatedAdvice]): not_computed = 0 for located_advice in advices: if "computed" in located_advice.advice.message: @@ -68,93 +71,143 @@ def print_advices(advices: list[LocatedAdvice], file: Path): sys.stdout.write(f"{message}\n") -def lint_one(file: Path, ctx: LocalCheckoutContext, unparsed: Path | None) -> tuple[set[str], int, int]: +@dataclass +class SolaccContext: + unparsed_path: Path | None = None + files_to_skip: set[str] | None = None + total_count = 0 + parseable_count = 0 + uninferrable_count = 0 + missing_imports: dict[str, dict[str, int]] = field(default_factory=dict) + + @classmethod + def create(cls, for_all_dirs: bool): + unparsed_path: Path | None = None 
+ # if lint_all, recreate "solacc-unparsed.txt" + if for_all_dirs: + unparsed_path = Path(Path(__file__).parent, "solacc-unparsed.txt") + if unparsed_path.exists(): + os.remove(unparsed_path) + files_to_skip: set[str] | None = None + malformed = Path(__file__).parent / "solacc-malformed.txt" + if for_all_dirs and malformed.exists(): + lines = malformed.read_text(encoding="utf-8").split("\n") + files_to_skip = set(line for line in lines if len(line) > 0 and not line.startswith("#")) + return SolaccContext(unparsed_path=unparsed_path, files_to_skip=files_to_skip) + + def register_missing_import(self, missing_import: str): + prefix = missing_import.split(".")[0] + details = self.missing_imports.get(prefix, None) + if details is None: + details = {} + self.missing_imports[prefix] = details + count = details.get(missing_import, 0) + details[missing_import] = count + 1 + + def log_missing_imports(self): + missing_imports = dict( + sorted(self.missing_imports.items(), key=lambda item: sum(item[1].values()), reverse=True) + ) + for prefix, details in missing_imports.items(): + logger.info(f"Missing import '{prefix}'") + for item, count in details.items(): + logger.info(f" {item}: {count} occurrences") + + +def lint_one(solacc: SolaccContext, file: Path, ctx: LocalCheckoutContext) -> None: try: advices = list(ctx.local_code_linter.lint_path(file, set())) - missing_imports = collect_missing_imports(advices) - not_computed = collect_not_computed(advices) + solacc.parseable_count += 1 + for missing_import in collect_missing_imports(advices): + solacc.register_missing_import(missing_import) + solacc.uninferrable_count += collect_uninferrable_count(advices) print_advices(advices, file) - return missing_imports, 1, not_computed except Exception as e: # pylint: disable=broad-except # here we're most likely catching astroid & sqlglot errors - if unparsed is None: # linting single file, log exception details - logger.error(f"Error during parsing of {file}: {e}".replace("\n", " "), exc_info=e) - else: + # when linting single file, log exception details + logger.error( + f"Error during parsing of {file}: {e}".replace("\n", " "), + exc_info=e if solacc.unparsed_path is None else None, + ) + if solacc.unparsed_path: logger.error(f"Error during parsing of {file}: {e}".replace("\n", " ")) # populate solacc-unparsed.txt - with unparsed.open(mode="a", encoding="utf-8") as f: + with solacc.unparsed_path.open(mode="a", encoding="utf-8") as f: f.write(file.relative_to(dist).as_posix()) f.write("\n") - return set(), 0, 0 -def lint_all(file_to_lint: str | None): +class _CleanablePathLookup(PathLookup): + + def __init__(self): + super().__init__(Path.cwd(), [Path(path) for path in sys.path]) + self._original_sys_paths = set(self._sys_paths) + + def clean_tmp_sys_paths(self): + for path in self._sys_paths: + if path in self._original_sys_paths: + continue + if path.is_file(): + path.unlink() + if path.is_dir(): + shutil.rmtree(path) + + +def lint_dir(solacc: SolaccContext, soldir: Path, file_to_lint: str | None = None): + path_lookup = _CleanablePathLookup() ws = WorkspaceClient(host='...', token='...') ctx = LocalCheckoutContext(ws).replace( - linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state) + linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state), + path_lookup=path_lookup, ) - parseable = 0 - not_computed = 0 - missing_imports: dict[str, dict[str, int]] = {} - all_files = list(dist.glob('**/*.py')) if file_to_lint is None else 
[Path(dist, file_to_lint)] - unparsed: Path | None = None - if file_to_lint is None: - unparsed = Path(Path(__file__).parent, "solacc-unparsed.txt") - if unparsed.exists(): - os.remove(unparsed) - skipped: set[str] | None = None - malformed = Path(__file__).parent / "solacc-malformed.txt" - if file_to_lint is None and malformed.exists(): - lines = malformed.read_text(encoding="utf-8").split("\n") - skipped = set(line for line in lines if len(line) > 0 and not line.startswith("#")) + all_files = list(soldir.glob('**/*.py')) if file_to_lint is None else [Path(soldir, file_to_lint)] + solacc.total_count += len(all_files) for file in all_files: - if skipped and file.relative_to(dist).as_posix() in skipped: + if solacc.files_to_skip and file.relative_to(dist).as_posix() in solacc.files_to_skip: continue - _missing_imports, _parseable, _not_computed = lint_one(file, ctx, unparsed) - for _import in _missing_imports: - register_missing_import(missing_imports, _import) - parseable += _parseable - not_computed += _not_computed - all_files_len = len(all_files) - (len(skipped) if skipped else 0) - parseable_pct = int(parseable / all_files_len * 100) - missing_imports_count = sum(sum(details.values()) for details in missing_imports.values()) + lint_one(solacc, file, ctx) + # cleanup tmp dirs + path_lookup.clean_tmp_sys_paths() + + +def lint_file(file_to_lint: str): + solacc = SolaccContext.create(False) + file_path = Path(file_to_lint) + lint_dir(solacc, file_path.parent, file_path.name) + + +def lint_all(): + solacc = SolaccContext.create(True) + for soldir in os.listdir(dist): + lint_dir(solacc, dist / soldir) + all_files_len = solacc.total_count - (len(solacc.files_to_skip) if solacc.files_to_skip else 0) + parseable_pct = int(solacc.parseable_count / all_files_len * 100) + missing_imports_count = sum(sum(details.values()) for details in solacc.missing_imports.values()) logger.info( - f"Skipped: {len(skipped or [])}, parseable: {parseable_pct}% ({parseable}/{all_files_len}), missing imports: {missing_imports_count}, not computed: {not_computed}" + f"Skipped: {len(solacc.files_to_skip or [])}, " + f"parseable: {parseable_pct}% ({solacc.parseable_count}/{all_files_len}), " + f"missing imports: {missing_imports_count}, " + f"not computed: {solacc.uninferrable_count}" ) - log_missing_imports(missing_imports) + solacc.log_missing_imports() # fail the job if files are unparseable if parseable_pct < 100: sys.exit(1) -def register_missing_import(missing_imports: dict[str, dict[str, int]], missing_import: str): - prefix = missing_import.split(".")[0] - details = missing_imports.get(prefix, None) - if details is None: - details = {} - missing_imports[prefix] = details - count = details.get(missing_import, 0) - details[missing_import] = count + 1 - - -def log_missing_imports(missing_imports: dict[str, dict[str, int]]): - missing_imports = dict(sorted(missing_imports.items(), key=lambda item: sum(item[1].values()), reverse=True)) - for prefix, details in missing_imports.items(): - logger.info(f"Missing import '{prefix}'") - for item, count in details.items(): - logger.info(f" {item}: {count} occurrences") - - def main(args: list[str]): install_logger() logging.root.setLevel(logging.INFO) file_to_lint = args[1] if len(args) > 1 else None - if not file_to_lint: + if file_to_lint: # don't clone if linting just one file, assumption is we're troubleshooting - logger.info("Cloning...") - clone_all() + logger.info("Linting...") + lint_file(file_to_lint) + return + logger.info("Cloning...") + clone_all() 
logger.info("Linting...") - lint_all(file_to_lint) + lint_all() if __name__ == "__main__": diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index bda25a0106..b255a0e398 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -249,7 +249,7 @@ def test_notebook_builds_python_dependency_graph_with_fstring_loop(mock_path_loo container = maybe.dependency.load(mock_path_lookup) assert container is not None container.build_dependency_graph(graph) - expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"] + expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"] all_paths = set(d.path for d in graph.all_dependencies) assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths} From d7444652ad5e3be60c56a7cda9dc2fb346e17844 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 25 Sep 2024 18:05:58 +0200 Subject: [PATCH 16/21] Code format: `make fmt` (#2749) ## Changes This PR just includes changes from `make fmt`, which doesn't currently pass on `main`. ### Linked issues Updates #2746. From 04918be5772ac603812dfa79afa293deea1ce8e4 Mon Sep 17 00:00:00 2001 From: Serge Smertin <259697+nfx@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:06:28 +0200 Subject: [PATCH 17/21] Speedup assessment workflow by making DBFS root table size calculation parallel (#2745) We were not doing that before and now we do. --- .../labs/ucx/contexts/workflow_task.py | 2 +- .../labs/ucx/hive_metastore/table_size.py | 39 ++++++++++--------- .../labs/ucx/hive_metastore/tables.py | 2 +- .../labs/ucx/source_code/known.json | 3 +- tests/unit/hive_metastore/test_table_size.py | 2 +- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index f52bcbadb8..9e3cbabbad 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -71,7 +71,7 @@ def pipelines_crawler(self): @cached_property def table_size_crawler(self): - return TableSizeCrawler(self.sql_backend, self.inventory_database) + return TableSizeCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) @cached_property def policies_crawler(self): diff --git a/src/databricks/labs/ucx/hive_metastore/table_size.py b/src/databricks/labs/ucx/hive_metastore/table_size.py index 0106fbab48..aadb7576e5 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_size.py +++ b/src/databricks/labs/ucx/hive_metastore/table_size.py @@ -1,12 +1,15 @@ import logging from collections.abc import Iterable from dataclasses import dataclass +from functools import partial +from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.tables import Table logger = logging.getLogger(__name__) @@ -40,43 +43,43 @@ def _crawl(self) -> Iterable[TableSize]: """Crawls and lists tables using table crawler Identifies DBFS root tables and calculates the size for these. 
""" + tasks = [] for table in self._tables_crawler.snapshot(): if not table.kind == "TABLE": continue if not table.is_dbfs_root: continue - size_in_bytes = self._safe_get_table_size(table.key) - if size_in_bytes is None: - continue # table does not exist anymore or is corrupted - - yield TableSize( - catalog=table.catalog, database=table.database, name=table.name, size_in_bytes=size_in_bytes - ) + tasks.append(partial(self._safe_get_table_size, table)) + return Threads.strict('DBFS root table sizes', tasks) def _try_fetch(self) -> Iterable[TableSize]: """Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error""" for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"): yield TableSize(*row) - def _safe_get_table_size(self, table_full_name: str) -> int | None: - logger.debug(f"Evaluating {table_full_name} table size.") + def _safe_get_table_size(self, table: Table) -> TableSize | None: + logger.debug(f"Evaluating {table.key} table size.") try: # refresh table statistics to avoid stale stats in HMS - self._backend.execute(f"ANALYZE table {escape_sql_identifier(table_full_name)} compute STATISTICS NOSCAN") - # pylint: disable-next=protected-access - return self._spark._jsparkSession.table(table_full_name).queryExecution().analyzed().stats().sizeInBytes() + self._backend.execute(f"ANALYZE table {table.safe_sql_key} compute STATISTICS NOSCAN") + jvm_df = self._spark._jsparkSession.table(table.safe_sql_key) # pylint: disable=protected-access + size_in_bytes = jvm_df.queryExecution().analyzed().stats().sizeInBytes() + return TableSize( + catalog=table.catalog, + database=table.database, + name=table.name, + size_in_bytes=size_in_bytes, + ) except Exception as e: # pylint: disable=broad-exception-caught if "[TABLE_OR_VIEW_NOT_FOUND]" in str(e) or "[DELTA_TABLE_NOT_FOUND]" in str(e): - logger.warning(f"Failed to evaluate {table_full_name} table size. Table not found.") + logger.warning(f"Failed to evaluate {table.key} table size. Table not found.") return None if "[DELTA_INVALID_FORMAT]" in str(e): - logger.warning( - f"Unable to read Delta table {table_full_name}, please check table structure and try again." - ) + logger.warning(f"Unable to read Delta table {table.key}, please check table structure and try again.") return None if "[DELTA_MISSING_TRANSACTION_LOG]" in str(e): - logger.warning(f"Delta table {table_full_name} is corrupted: missing transaction log.") + logger.warning(f"Delta table {table.key} is corrupt: missing transaction log.") return None - logger.error(f"Failed to evaluate {table_full_name} table size: ", exc_info=True) + logger.error(f"Failed to evaluate {table.key} table size: ", exc_info=True) return None diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 5eff7a1815..0545b584e9 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -340,7 +340,7 @@ class MigrationCount: what_count: dict[What, int] -class TablesCrawler(CrawlerBase): +class TablesCrawler(CrawlerBase[Table]): def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None): """ Initializes a TablesCrawler instance. 
diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 93b7eafe36..b7ad275abc 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -1830,6 +1830,7 @@ "databricks-labs-ucx": { "databricks.labs.ucx": [] }, + "databricks-pydabs": {}, "databricks-sdk": { "databricks.sdk": [] }, @@ -29921,4 +29922,4 @@ "zipp.compat.py310": [], "zipp.glob": [] } -} +} \ No newline at end of file diff --git a/tests/unit/hive_metastore/test_table_size.py b/tests/unit/hive_metastore/test_table_size.py index 8216a0a780..e23d3a7a35 100644 --- a/tests/unit/hive_metastore/test_table_size.py +++ b/tests/unit/hive_metastore/test_table_size.py @@ -137,7 +137,7 @@ def test_table_size_when_table_corrupted(mocker, caplog): results = tsc.snapshot() assert len(results) == 0 - assert "Delta table hive_metastore.db1.table1 is corrupted: missing transaction log" in caplog.text + assert "Delta table hive_metastore.db1.table1 is corrupt: missing transaction log" in caplog.text def test_table_size_when_delta_invalid_format_error(mocker, caplog): From 0a03defb538396c601879fb96686b5e3e92823bb Mon Sep 17 00:00:00 2001 From: Cor Date: Thu, 26 Sep 2024 10:43:26 +0200 Subject: [PATCH 18/21] Harden configuration reading (#2701) ## Changes Harden configuration reading by verifying the type before reading the "value" using `.get` ### Linked issues Resolves #2581 (hopefully the second get is the issue, type hinting should cover that, but who knows) ### Functionality - [x] modified existing workflow: `assessment` ### Tests - [x] added unit tests --- src/databricks/labs/ucx/assessment/secrets.py | 11 +++++++-- tests/unit/assessment/test_secrets.py | 23 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 tests/unit/assessment/test_secrets.py diff --git a/src/databricks/labs/ucx/assessment/secrets.py b/src/databricks/labs/ucx/assessment/secrets.py index 715d6d82aa..2eb3c0143f 100644 --- a/src/databricks/labs/ucx/assessment/secrets.py +++ b/src/databricks/labs/ucx/assessment/secrets.py @@ -30,13 +30,20 @@ def _get_secret_if_exists(self, secret_scope, secret_key) -> str | None: ) return None - def _get_value_from_config_key(self, config: dict, key: str, get_secret: bool = True) -> str | None: + def _get_value_from_config_key( + self, + config: dict, + key: str, + get_secret: bool = True, + ) -> str | None: """Get a config value based on its key, with some special handling: If the key is prefixed with spark_conf, i.e. 
this is in a cluster policy, the actual value is nested If the value is of format {{secret_scope/secret}}, we extract that as well """ if re.search("spark_conf", key): - value = config.get(key, {}).get("value", "") + value = config.get(key, {}) + if isinstance(value, dict): + value = value.get("value", "") else: value = config.get(key, "") # retrieve from secret scope if used diff --git a/tests/unit/assessment/test_secrets.py b/tests/unit/assessment/test_secrets.py new file mode 100644 index 0000000000..555a3cd8bf --- /dev/null +++ b/tests/unit/assessment/test_secrets.py @@ -0,0 +1,23 @@ +import pytest + +from databricks.labs.ucx.assessment.secrets import SecretsMixin + + +@pytest.mark.parametrize( + "key,expected", + [ + ("spark_conf.spark.hadoop.javax.jdo.option.ConnectionURL", "url"), + ("NonExistentKey", ""), + ("spark_conf.invalid", "{'should_not': 'be_string'}"), + ], +) +def test_secrets_mixin_gets_value_from_config_key(key, expected) -> None: + config = { + "spark_conf.invalid": "{'should_not': 'be_string'}", + "spark_conf.spark.hadoop.javax.jdo.option.ConnectionURL": {"value": "url"}, + } + secrets_mixin = SecretsMixin() + + value = secrets_mixin._get_value_from_config_key(config, key) # pylint: disable=protected-access + + assert value == expected From be955ea7633f92ef16816afc14c1da432ce326bb Mon Sep 17 00:00:00 2001 From: Amin Movahed <140028681+aminmovahed-db@users.noreply.github.com> Date: Thu, 26 Sep 2024 18:51:03 +1000 Subject: [PATCH 19/21] Add unskip CLI command to undo a skip on schema or a table (#2734) ## Changes Add unskip CLI command to undo a skip on schema or a table ### Linked issues Resolves #1938 ### Functionality - [x] added relevant user documentation - [x] added new CLI command --> unskip ### Tests - [x] Unit test added --- README.md | 10 +++ src/databricks/labs/ucx/cli.py | 2 +- .../labs/ucx/hive_metastore/mapping.py | 53 ++++++++------- tests/unit/hive_metastore/test_mapping.py | 68 ++++++++++++++++++- 4 files changed, 106 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 7c5bf044c0..f15bd6c7ca 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`migrate-locations` command](#migrate-locations-command) * [`create-table-mapping` command](#create-table-mapping-command) * [`skip` command](#skip-command) + * [`unskip` command](#unskip-command) * [`create-catalogs-schemas` command](#create-catalogs-schemas-command) * [`migrate-tables` command](#migrate-tables-command) * [`revert-migrated-tables` command](#revert-migrated-tables-command) @@ -1466,6 +1467,15 @@ Once you're done with table migration, proceed to the [code migration](#code-mig [[back to top](#databricks-labs-ucx)] +## `unskip` command + +```commandline +databricks labs ucx unskip --schema X [--table Y] +``` +This command removes the mark set by the [`skip` command](#skip-command) on the given schema or table. 
+ +[[back to top](#databricks-labs-ucx)] + ## `create-catalogs-schemas` command ```text diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index e3f5d76756..19929794a7 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -101,7 +101,7 @@ def skip(w: WorkspaceClient, schema: str | None = None, table: str | None = None @ucx.command def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = None): - """Create a unskip comment on a schema or a table""" + """Unset the skip mark from a schema or a table""" logger.info("Running unskip command") if not schema: logger.error("--schema is a required parameter.") diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 94f33a97fe..3aef32ba20 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -135,24 +135,28 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) - def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): - # Removes skip mark from the table property + def unskip_table_or_view( + self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None] + ) -> None: + """Removes skip mark from the table property. + + Args: + schema_name (str): The schema name of the table to be unskipped. + table_name (str): The table name of the table to be unskipped. + load_table (Callable[[str, str], Table | None]): A function that loads a table from the metastore. + """ + table = load_table(schema_name, table_name) + if table is None: + logger.error( + f"Failed to remove skip marker from table: {schema_name}.{table_name}. Table not found.", + ) + return try: - table = load_table(schema_name, table_name) - if table is None: - raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") self._sql_backend.execute( - f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}' );" + f"ALTER {table.kind} {escape_sql_identifier(table.full_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" ) - except NotFound as err: - if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): - logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}. Table not found.") - else: - logger.error( - f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True - ) - except BadRequest as err: - logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + except (NotFound, BadRequest) as e: + logger.error(f"Failed to remove skip marker from table: {table.full_name}", exc_info=e) def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property @@ -168,19 +172,18 @@ def skip_schema(self, schema: str): except BadRequest as err: logger.error(err) - def unskip_schema(self, schema: str): - # Removes skip mark from the schema property + def unskip_schema(self, schema: str) -> None: + """Removes skip mark from the schema property. + + Args: + schema (str): The schema name of the table to be unskipped. 
+ """ try: self._sql_backend.execute( - f"ALTER SCHEMA {escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" + f"ALTER SCHEMA hive_metastore.{escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" ) - except NotFound as err: - if "[SCHEMA_NOT_FOUND]" in str(err): - logger.error(f"Failed to remove skip marker for Schema {schema}. Schema not found.") - else: - logger.error(err) - except BadRequest as err: - logger.error(err) + except (NotFound, BadRequest) as e: + logger.error(f"Failed to remove skip marker from schema: {schema}.", exc_info=e) def get_tables_to_migrate(self, tables_crawler: TablesCrawler) -> Collection[TableToMigrate]: rules = self.load() diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index 0ea2951131..e0ac9f56ad 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -6,7 +6,7 @@ from databricks.labs.blueprint.parallel import ManyError from databricks.labs.lsql.backends import MockBackend, SqlBackend from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound +from databricks.sdk.errors import NotFound, BadRequest from databricks.sdk.errors.platform import ResourceConflict from databricks.sdk.service.catalog import TableInfo @@ -211,6 +211,72 @@ def test_skip_happy_path(caplog): assert len(caplog.records) == 0 +def test_unskip_on_table() -> None: + ws = create_autospec(WorkspaceClient) + mock_backend = MockBackend() + installation = MockInstallation() + mapping = TableMapping(installation, ws, mock_backend) + table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv") + mapping.unskip_table_or_view(schema_name="schema", table_name="table", load_table=lambda _schema, _table: table) + ws.tables.get.assert_not_called() + assert ( + f"ALTER TABLE `catalog`.`schema`.`table` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) + + +def test_unskip_on_view() -> None: + ws = create_autospec(WorkspaceClient) + mock_backend = MockBackend() + installation = MockInstallation() + mapping = TableMapping(installation, ws, mock_backend) + view = Table( + catalog="catalog", database="schema", name="view", object_type="table", table_format="csv", view_text="stuff" + ) + mapping.unskip_table_or_view(schema_name="schema", table_name="view", load_table=lambda _schema, _table: view) + ws.tables.get.assert_not_called() + assert ( + f"ALTER VIEW `catalog`.`schema`.`view` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) + + +def test_unskip_on_schema() -> None: + ws = create_autospec(WorkspaceClient) + mock_backend = MockBackend() + installation = MockInstallation() + mapping = TableMapping(installation, ws, mock_backend) + mapping.unskip_schema(schema="schema") + ws.tables.get.assert_not_called() + assert ( + f"ALTER SCHEMA hive_metastore.`schema` UNSET DBPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) + + +def test_unskip_missing_table(caplog) -> None: + ws = create_autospec(WorkspaceClient) + sbe = create_autospec(SqlBackend) + sbe.execute.side_effect = NotFound("[TABLE_OR_VIEW_NOT_FOUND]") + installation = MockInstallation() + mapping = TableMapping(installation, ws, sbe) + mapping.unskip_table_or_view(schema_name='foo', table_name="table", load_table=lambda schema, table: None) + assert [rec.message for rec in caplog.records if 
"table not found" in rec.message.lower()] + ws.tables.get.assert_not_called() + + +def test_unskip_badrequest(caplog) -> None: + ws = create_autospec(WorkspaceClient) + sbe = create_autospec(SqlBackend) + sbe.execute.side_effect = BadRequest("[Bad command]") + installation = MockInstallation() + mapping = TableMapping(installation, ws, sbe) + table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv") + mapping.unskip_table_or_view(schema_name="schema", table_name="table", load_table=lambda _schema, _table: table) + assert [rec.message for rec in caplog.records if "failed to remove skip marker " in rec.message.lower()] + ws.tables.get.assert_not_called() + + def test_skip_missing_schema(caplog): ws = create_autospec(WorkspaceClient) sbe = create_autospec(SqlBackend) From 788c2737f86c2a5606c60e37f0e32c3713731224 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 26 Sep 2024 14:35:42 +0200 Subject: [PATCH 20/21] Improve solacc linting (#2752) ## Changes `solacc` currently lints on a per-file basis, which is incorrect this PR implements linting a per solution basis, thus improving dependency resolution ### Linked issues None ### Functionality None ### Tests - [x] manually tested --------- Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/base.py | 3 +- tests/integration/source_code/solacc.py | 101 +++++++++----------- 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 9052ed9545..4e22e74a1a 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -97,7 +97,8 @@ def message_relative_to(self, base: Path, *, default: Path | None = None) -> str if default is not None: path = default path = path.relative_to(base) - return f"./{path.as_posix()}:{advice.start_line}:{advice.start_col}: [{advice.code}] {advice.message}" + # increment start_line because it is 0-based whereas IDEs are usually 1-based + return f"./{path.as_posix()}:{advice.start_line+1}:{advice.start_col}: [{advice.code}] {advice.message}" class Advisory(Advice): diff --git a/tests/integration/source_code/solacc.py b/tests/integration/source_code/solacc.py index e209b5a363..9ae1fdb748 100644 --- a/tests/integration/source_code/solacc.py +++ b/tests/integration/source_code/solacc.py @@ -22,7 +22,7 @@ dist = (this_file / '../../../../dist').resolve().absolute() -def clone_all(): +def _clone_all(): params = {'per_page': 100, 'page': 1} to_clone = [] while True: @@ -49,7 +49,7 @@ def clone_all(): run_command(f'git clone {url} {dst}') -def collect_missing_imports(advices: list[LocatedAdvice]): +def _collect_missing_imports(advices: list[LocatedAdvice]): missing_imports: set[str] = set() for located_advice in advices: if located_advice.advice.code == 'import-not-found': @@ -57,7 +57,7 @@ def collect_missing_imports(advices: list[LocatedAdvice]): return missing_imports -def collect_uninferrable_count(advices: list[LocatedAdvice]): +def _collect_uninferrable_count(advices: list[LocatedAdvice]): not_computed = 0 for located_advice in advices: if "computed" in located_advice.advice.message: @@ -65,15 +65,19 @@ def collect_uninferrable_count(advices: list[LocatedAdvice]): return not_computed -def print_advices(advices: list[LocatedAdvice], file: Path): +def _collect_unparseable(advices: list[LocatedAdvice]): + return set(located_advice for located_advice in advices if located_advice.advice.code == 'parse-error') + + 
+def _print_advices(advices: list[LocatedAdvice]): for located_advice in advices: - message = located_advice.message_relative_to(dist.parent, default=file) + message = located_advice.message_relative_to(dist.parent) sys.stdout.write(f"{message}\n") @dataclass -class SolaccContext: - unparsed_path: Path | None = None +class _SolaccContext: + unparsed_files_path: Path | None = None files_to_skip: set[str] | None = None total_count = 0 parseable_count = 0 @@ -93,7 +97,7 @@ def create(cls, for_all_dirs: bool): if for_all_dirs and malformed.exists(): lines = malformed.read_text(encoding="utf-8").split("\n") files_to_skip = set(line for line in lines if len(line) > 0 and not line.startswith("#")) - return SolaccContext(unparsed_path=unparsed_path, files_to_skip=files_to_skip) + return _SolaccContext(unparsed_files_path=unparsed_path, files_to_skip=files_to_skip) def register_missing_import(self, missing_import: str): prefix = missing_import.split(".")[0] @@ -114,29 +118,6 @@ def log_missing_imports(self): logger.info(f" {item}: {count} occurrences") -def lint_one(solacc: SolaccContext, file: Path, ctx: LocalCheckoutContext) -> None: - try: - advices = list(ctx.local_code_linter.lint_path(file, set())) - solacc.parseable_count += 1 - for missing_import in collect_missing_imports(advices): - solacc.register_missing_import(missing_import) - solacc.uninferrable_count += collect_uninferrable_count(advices) - print_advices(advices, file) - except Exception as e: # pylint: disable=broad-except - # here we're most likely catching astroid & sqlglot errors - # when linting single file, log exception details - logger.error( - f"Error during parsing of {file}: {e}".replace("\n", " "), - exc_info=e if solacc.unparsed_path is None else None, - ) - if solacc.unparsed_path: - logger.error(f"Error during parsing of {file}: {e}".replace("\n", " ")) - # populate solacc-unparsed.txt - with solacc.unparsed_path.open(mode="a", encoding="utf-8") as f: - f.write(file.relative_to(dist).as_posix()) - f.write("\n") - - class _CleanablePathLookup(PathLookup): def __init__(self): @@ -153,33 +134,46 @@ def clean_tmp_sys_paths(self): shutil.rmtree(path) -def lint_dir(solacc: SolaccContext, soldir: Path, file_to_lint: str | None = None): +def _lint_dir(solacc: _SolaccContext, soldir: Path): path_lookup = _CleanablePathLookup() ws = WorkspaceClient(host='...', token='...') ctx = LocalCheckoutContext(ws).replace( linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state), path_lookup=path_lookup, ) - all_files = list(soldir.glob('**/*.py')) if file_to_lint is None else [Path(soldir, file_to_lint)] + all_files = list(soldir.glob('**/*.py')) + list(soldir.glob('**/*.sql')) solacc.total_count += len(all_files) - for file in all_files: - if solacc.files_to_skip and file.relative_to(dist).as_posix() in solacc.files_to_skip: - continue - lint_one(solacc, file, ctx) + # pre-populate linted_files such that files to skip are not linted + files_to_skip = set(solacc.files_to_skip) if solacc.files_to_skip else set() + linted_files = set(files_to_skip) + # lint solution + advices = list(ctx.local_code_linter.lint_path(soldir, linted_files)) + # collect unparseable files + unparseables = _collect_unparseable(advices) + solacc.parseable_count += len(linted_files) - len(files_to_skip) - len(set(advice.path for advice in unparseables)) + if solacc.unparsed_files_path: + for unparseable in unparseables: + logger.error(f"Error during parsing of {unparseable.path}: {unparseable.advice.message}".replace("\n", 
" ")) + # populate solacc-unparsed.txt + with solacc.unparsed_files_path.open(mode="a", encoding="utf-8") as f: + f.write(unparseable.path.relative_to(dist).as_posix()) + f.write("\n") + # collect missing imports + for missing_import in _collect_missing_imports(advices): + solacc.register_missing_import(missing_import) + # collect uninferrable + solacc.uninferrable_count += _collect_uninferrable_count(advices) + # display advices + _print_advices(advices) # cleanup tmp dirs path_lookup.clean_tmp_sys_paths() -def lint_file(file_to_lint: str): - solacc = SolaccContext.create(False) - file_path = Path(file_to_lint) - lint_dir(solacc, file_path.parent, file_path.name) - - -def lint_all(): - solacc = SolaccContext.create(True) - for soldir in os.listdir(dist): - lint_dir(solacc, dist / soldir) +def _lint_dirs(dir_to_lint: str | None): + solacc = _SolaccContext.create(dir_to_lint is not None) + all_dirs = os.listdir(dist) if dir_to_lint is None else [dir_to_lint] + for soldir in all_dirs: + _lint_dir(solacc, dist / soldir) all_files_len = solacc.total_count - (len(solacc.files_to_skip) if solacc.files_to_skip else 0) parseable_pct = int(solacc.parseable_count / all_files_len * 100) missing_imports_count = sum(sum(details.values()) for details in solacc.missing_imports.values()) @@ -198,16 +192,13 @@ def lint_all(): def main(args: list[str]): install_logger() logging.root.setLevel(logging.INFO) - file_to_lint = args[1] if len(args) > 1 else None - if file_to_lint: + dir_to_lint = args[1] if len(args) > 1 else None + if not dir_to_lint: # don't clone if linting just one file, assumption is we're troubleshooting - logger.info("Linting...") - lint_file(file_to_lint) - return - logger.info("Cloning...") - clone_all() + logger.info("Cloning...") + _clone_all() logger.info("Linting...") - lint_all() + _lint_dirs(dir_to_lint) if __name__ == "__main__": From 1d12391723f451e76b6bb735f28a063d98143a7f Mon Sep 17 00:00:00 2001 From: Andres Garcia Date: Thu, 26 Sep 2024 09:04:54 -0600 Subject: [PATCH 21/21] Sync Fork, make fmt test --- src/databricks/labs/ucx/assessment/export.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/export.py b/src/databricks/labs/ucx/assessment/export.py index 113dd3bd4d..987132e02e 100644 --- a/src/databricks/labs/ucx/assessment/export.py +++ b/src/databricks/labs/ucx/assessment/export.py @@ -11,8 +11,6 @@ class AssessmentExporter: - # File and Path Constants - _EXPORT_FILE_NAME = "ucx_assessment_results.zip" def __init__(self, sql_backend: SqlBackend, config: WorkspaceConfig): self._sql_backend = sql_backend