Patch export v2 #16

Merged · 21 commits · Sep 27, 2024

Commits
ae5832d
make fmt run
rportilla-databricks Sep 26, 2024
204ab4a
Minor contributor documentation changes (#2729)
asnare Sep 24, 2024
0a6e72f
Handle `PermissionDenied` when listing accessible workspaces (#2733)
JCZuurmond Sep 24, 2024
93d496e
Adding unskip CLI command to undo a skip on schema or a table (#2727)
aminmovahed-db Sep 24, 2024
ee20112
Fix failing integration tests that perform a real assessment (#2736)
ericvergnaud Sep 24, 2024
2f62a0f
Update documentation to explain the usage of collections and eligible…
HariGS-DB Sep 24, 2024
8fee9d8
Enables cli cmd `databricks labs ucx create-catalog-schemas` to apply…
HariGS-DB Sep 24, 2024
21da7cc
Add `create-ucx-catalog` cli command (#2694)
JCZuurmond Sep 25, 2024
2a09a8f
Fixes issue of circular dependency of migrate-location ACL (#2741)
HariGS-DB Sep 25, 2024
d1dd0c5
Added static code analysis results to assessment dashboard (#2696)
ericvergnaud Sep 25, 2024
7cba9b0
Increases test coverage (#2739)
pritishpai Sep 25, 2024
04a4956
Fixes source table alias dissapearance during migrate_views (#2726)
pritishpai Sep 25, 2024
15c5536
Bump astroid version, pylint version and drop our f-string workaround…
ericvergnaud Sep 25, 2024
2611b11
Update databricks-labs-blueprint requirement from <0.9,>=0.8 to >=0.8…
dependabot[bot] Sep 25, 2024
555e83a
Delete temporary files when running solacc (#2750)
ericvergnaud Sep 25, 2024
d744465
Code format: `make fmt` (#2749)
asnare Sep 25, 2024
04918be
Speedup assessment workflow by making DBFS root table size calculatio…
nfx Sep 25, 2024
0a03def
Harden configuration reading (#2701)
JCZuurmond Sep 26, 2024
be955ea
Add unskip CLI command to undo a skip on schema or a table (#2734)
aminmovahed-db Sep 26, 2024
788c273
Improve solacc linting (#2752)
ericvergnaud Sep 26, 2024
1d12391
Sync Fork, make fmt test
jgarciaf106 Sep 26, 2024
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -259,7 +259,7 @@ make dev

To use a different Python version, specify it in the `HATCH_PYTHON` variable:
```shell
HATCH_PYTHON=python3.10 make clean dev test
HATCH_PYTHON="$(which python3.10)" make clean dev test
```

Configure your IDE to use `.venv/bin/python` from the virtual environment when developing the project:
2 changes: 1 addition & 1 deletion Makefile
@@ -23,7 +23,7 @@ integration:
hatch run integration

coverage:
hatch run coverage && open htmlcov/index.html
hatch run coverage; status=$$?; [ -e "htmlcov/index.html" ] && open htmlcov/index.html; exit $$status

known:
hatch run python src/databricks/labs/ucx/source_code/known.py
83 changes: 76 additions & 7 deletions README.md
@@ -95,6 +95,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project.
* [Metastore related commands](#metastore-related-commands)
* [`show-all-metastores` command](#show-all-metastores-command)
* [`assign-metastore` command](#assign-metastore-command)
* [`create-ucx-catalog` command](#create-ucx-catalog-command)
* [Table migration commands](#table-migration-commands)
* [`principal-prefix-access` command](#principal-prefix-access-command)
* [Access for AWS S3 Buckets](#access-for-aws-s3-buckets)
@@ -107,6 +108,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project.
* [`migrate-locations` command](#migrate-locations-command)
* [`create-table-mapping` command](#create-table-mapping-command)
* [`skip` command](#skip-command)
* [`unskip` command](#unskip-command)
* [`create-catalogs-schemas` command](#create-catalogs-schemas-command)
* [`migrate-tables` command](#migrate-tables-command)
* [`revert-migrated-tables` command](#revert-migrated-tables-command)
@@ -127,6 +129,8 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project.
* [`revert-cluster-remap` command](#revert-cluster-remap-command)
* [`upload` command](#upload-command)
* [`download` command](#download-command)
  * [`join-collection` command](#join-collection-command)
* [collection eligible command](#collection-eligible-command)
* [Common Challenges and the Solutions](#common-challenges-and-the-solutions)
* [Network Connectivity Issues](#network-connectivity-issues)
* [Insufficient Privileges](#insufficient-privileges)
@@ -396,6 +400,9 @@ which can be used for further analysis and decision-making through the [assessme
9. `assess_pipelines`: This task scans through all the Pipelines and identifies those pipelines that have Azure Service Principals embedded in their configurations. A list of all the pipelines with matching configurations is stored in the `$inventory.pipelines` table.
10. `assess_azure_service_principals`: This task scans through all the clusters configurations, cluster policies, job cluster configurations, Pipeline configurations, and Warehouse configuration and identifies all the Azure Service Principals who have been given access to the Azure storage accounts via spark configurations referred in those entities. The list of all the Azure Service Principals referred in those configurations is saved in the `$inventory.azure_service_principals` table.
11. `assess_global_init_scripts`: This task scans through all the global init scripts and identifies if there is an Azure Service Principal who has been given access to the Azure storage accounts via spark configurations referred in those scripts.
12. `assess_dashboards`: This task scans through all the dashboards and analyzes embedded queries for migration problems. It also collects direct filesystem access patterns that require attention.
13. `assess_workflows`: This task scans through all the jobs and tasks and analyzes notebooks and files for migration problems. It also collects direct filesystem access patterns that require attention.


![report](docs/assessment-report.png)

@@ -711,11 +718,16 @@ in the Migration dashboard.

> Please note that this is an experimental workflow.

The `experimental-workflow-linter` workflow lints accessible code belonging to all workflows/jobs present in the
workspace. The linting emits problems indicating what to resolve for making the code Unity Catalog compatible.
The `experimental-workflow-linter` workflow lints accessible code from two sources:
- all workflows/jobs present in the workspace
- all dashboards/queries present in the workspace

The linting emits problems indicating what needs to be resolved to make the code Unity Catalog compatible.
It also locates direct filesystem accesses that need to be migrated.

Once the workflow completes, the output will be stored in `$inventory_database.workflow_problems` table, and displayed
in the Migration dashboard.
Once the workflow completes:
- problems are stored in the `$inventory_database.workflow_problems` and `$inventory_database.query_problems` tables
- direct filesystem accesses are stored in the `$inventory_database.directfs_in_paths` and `$inventory_database.directfs_in_queries` tables
- all of the above are displayed in the Migration dashboard (a notebook sketch for querying these tables directly follows below)
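
For a quick look at the linter output outside the dashboard, a minimal notebook sketch such as the following can be used. It assumes it runs in a Databricks notebook (where `spark` is predefined) and that the inventory database is `ucx` in the `hive_metastore` catalog; adjust the names to your installation.

```python
# Minimal sketch: inspect the workflow-linter output tables from a Databricks notebook.
# Assumes the UCX inventory database is `hive_metastore.ucx`; adjust to your installation.
inventory = "hive_metastore.ucx"

# Code compatibility problems found in workflows/jobs and in dashboards/queries.
workflow_problems = spark.table(f"{inventory}.workflow_problems")
query_problems = spark.table(f"{inventory}.query_problems")

# Direct filesystem accesses found in code paths and in queries.
directfs_in_paths = spark.table(f"{inventory}.directfs_in_paths")
directfs_in_queries = spark.table(f"{inventory}.directfs_in_queries")

print(f"workflow problems: {workflow_problems.count()}")
print(f"query problems: {query_problems.count()}")
directfs_in_paths.show(truncate=False)
directfs_in_queries.show(truncate=False)
```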

![code compatibility problems](docs/code_compatibility_problems.png)

@@ -1187,9 +1199,23 @@ a region, and you want to see which ones are available for assignment.
databricks labs ucx assign-metastore --workspace-id <workspace-id> [--metastore-id <metastore-id>]
```

This command assigns a metastore to a workspace with `workspace-id`. If there is only a single metastore in the workspace
region, it will be automatically assigned to the workspace. If there are multiple metastores available, you need to specify
the metastore id of the metastore you want to assign to the workspace.
This command assigns a metastore to the workspace with the given `--workspace-id`. If there is only a single metastore
in the workspace region, the command automatically assigns that metastore to the workspace. If there are multiple
metastores available, the command prompts you to select the metastore (by its ID) to assign to the workspace.

[[back to top](#databricks-labs-ucx)]

## `create-ucx-catalog` command

```commandline
databricks labs ucx create-ucx-catalog
16:12:59 INFO [d.l.u.hive_metastore.catalog_schema] Validating UC catalog: ucx
Please provide storage location url for catalog: ucx (default: metastore): ...
16:13:01 INFO [d.l.u.hive_metastore.catalog_schema] Creating UC catalog: ucx
```

Create and set up the UCX artifact catalog. Amongst other things, the artifacts are used for tracking the migration
progress across workspaces.
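
As a rough sanity check (not part of UCX itself), the catalog can be looked up afterwards with the Databricks Python SDK; the catalog name `ucx` below is the default suggested by the prompt and may differ in your setup.

```python
# Sanity-check sketch: confirm the UCX artifact catalog exists after running the command.
# Assumes the default catalog name "ucx"; adjust if you chose a different name.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()  # uses your configured Databricks authentication
catalog = w.catalogs.get("ucx")
print(catalog.name, catalog.storage_location, catalog.comment)
```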

# Table migration commands

Expand Down Expand Up @@ -1441,6 +1467,15 @@ Once you're done with table migration, proceed to the [code migration](#code-mig

[[back to top](#databricks-labs-ucx)]

## `unskip` command

```commandline
databricks labs ucx unskip --schema X [--table Y]
```
This command removes the mark set by the [`skip` command](#skip-command) on the given schema or table.

[[back to top](#databricks-labs-ucx)]

## `create-catalogs-schemas` command

@@ -1807,6 +1842,40 @@ $ databricks labs ucx download --file <file_path> --run-as-collection True
Download a csv file from a single workspace (`--run-as-collection False`) or a collection of workspaces
(`--run-as-collection True`). This command is especially useful when downloading the same file from multiple workspaces.

## `join-collection` command

```text
$ databricks labs ucx join-collection --workspace-ids <comma-separated list of workspace ids> --profile <account-profile>
```

The `join-collection` command joins two or more workspaces into a collection, which allows the supported CLI commands
to be run against all of them at once. It updates the `config.yml` file of each workspace's UCX installation with the
`installed_workspace_ids` attribute (a sketch for verifying this follows below). To run the `join-collection` command, a user must:
- be an account admin on the Databricks account
- be a workspace admin on all the workspaces that are joined into the collection
- have UCX installed on those workspaces

The `join-collection` command fails with an error message if these conditions are not met.
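
As an illustrative way to verify the result (a sketch, not a UCX API), each workspace's UCX `config.yml` can be downloaded and checked for the new attribute; the config path below is an assumption and depends on where UCX was installed.

```python
# Illustrative sketch: check that `installed_workspace_ids` was written to a workspace's
# UCX config.yml after `join-collection`. The config path is an assumption; adjust it to
# the actual UCX installation folder in your workspace.
import yaml
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()
me = w.current_user.me()
config_path = f"/Users/{me.user_name}/.ucx/config.yml"  # assumed install location

with w.workspace.download(config_path) as f:
    config = yaml.safe_load(f)

print("collection members:", config.get("installed_workspace_ids"))
```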

## collection eligible command

Once the `join-collection` command has been run, several CLI commands can be run against the whole collection instead
of a single workspace. The following commands are eligible; run them as a collection by passing the additional flag
`--run-as-collection=True`:
- `ensure-assessment-run`
- `create-table-mapping`
- `principal-prefix-access`
- `migrate-credentials`
- `create-uber-principal`
- `create-missing-principals`
- `validate-external-location`
- `migrate-locations`
- `create-catalogs-schemas`
- `migrate-tables`
- `migrate-acls`
- `migrate-dbsql-dashboards`
- `validate-group-membership`
For example: `databricks labs ucx ensure-assessment-run --run-as-collection=True`


# Common Challenges and the Solutions
Users might encounter some challenges while installing and executing UCX. Please find the listing of some common challenges and the solutions below.

7 changes: 5 additions & 2 deletions labs.yml
@@ -261,15 +261,18 @@ commands:

- name: assign-metastore
is_account_level: true
description: Enable Unity Catalog features on a workspace by assign a metastore to it
description: Enable Unity Catalog features on a workspace by assigning a metastore to it.
flags:
- name: workspace-id
description: (Optional) Workspace ID to assign a metastore to
description: Workspace ID to assign a metastore to
- name: metastore-id
description: (Optional) If there are multiple metastores in the region, specify the metastore ID to assign
- name: default-catalog
description: (Optional) Default catalog to assign to the workspace. If not provided, it will be hive_metastore

- name: create-ucx-catalog
description: Create UCX artifact catalog

- name: migrate-tables
description: |
Trigger the `migrate-tables` workflow and, optionally, `migrate-external-hiveserde-tables-in-place-experimental`
13 changes: 8 additions & 5 deletions pyproject.toml
@@ -46,10 +46,10 @@ classifiers = [

dependencies = ["databricks-sdk~=0.30",
"databricks-labs-lsql>=0.5,<0.13",
"databricks-labs-blueprint>=0.8,<0.9",
"databricks-labs-blueprint>=0.8,<0.10",
"PyYAML>=6.0.0,<7.0.0",
"sqlglot>=25.5.0,<25.23",
"astroid>=3.2.2"]
"astroid>=3.3.1"]

[project.optional-dependencies]
pylsp = [
@@ -74,7 +74,7 @@ dependencies = [
"black~=24.3.0",
"coverage[toml]~=7.4.4",
"mypy~=1.9.0",
"pylint~=3.2.2",
"pylint~=3.3.1",
"pylint-pytest==2.0.0a0",
"databricks-labs-pylint~=0.4.0",
"databricks-labs-pytester>=0.2.1",
@@ -209,7 +209,7 @@ fail-under = 10.0
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\\' represents the directory delimiter on Windows systems, it
# can't be used as an escape character.
ignore-paths='^tests/unit/source_code/samples/.*$'
ignore-paths='^tests/unit/source_code/samples/.*$'

# Files or directories matching the regular expression patterns are skipped. The
# regex matches against base names, not paths. The default value ignores Emacs
@@ -587,7 +587,10 @@ disable = [
"fixme",
"consider-using-assignment-expr",
"logging-fstring-interpolation",
"consider-using-any-or-all"
"consider-using-any-or-all",
"too-many-positional-arguments",
"unnecessary-default-type-args",
"logging-not-lazy"
]

# Enable the message, report, category or checker with the given id(s). You can
21 changes: 8 additions & 13 deletions src/databricks/labs/ucx/account/metastores.py
@@ -27,38 +27,33 @@ def show_all_metastores(self, workspace_id: str | None = None):
def assign_metastore(
self,
prompts: Prompts,
str_workspace_id: str | None = None,
workspace_id: int,
*,
metastore_id: str | None = None,
default_catalog: str | None = None,
):
if not str_workspace_id:
workspace_choices = self._get_all_workspaces()
workspace_id = prompts.choice_from_dict("Please select a workspace:", workspace_choices)
else:
workspace_id = int(str_workspace_id)
if not metastore_id:
if metastore_id is None:
# search for all matching metastores
metastore_choices = self._get_all_metastores(self._get_region(workspace_id))
if len(metastore_choices) == 0:
raise ValueError(f"No matching metastore found for workspace {workspace_id}")
raise ValueError(f"No matching metastore found for workspace: {workspace_id}")
# if there are multiple matches, prompt users to select one
if len(metastore_choices) > 1:
metastore_id = prompts.choice_from_dict(
"Multiple metastores found, please select one:", metastore_choices
)
else:
metastore_id = list(metastore_choices.values())[0]
if metastore_id is not None:
self._ac.metastore_assignments.create(workspace_id, metastore_id)
self._ac.metastore_assignments.create(workspace_id, metastore_id)
# set the default catalog using the default_namespace setting API
if default_catalog is not None:
self._set_default_catalog(workspace_id, default_catalog)

def _get_region(self, workspace_id: int) -> str:
def _get_region(self, workspace_id: int) -> str | None:
workspace = self._ac.workspaces.get(workspace_id)
if self._ac.config.is_aws:
return str(workspace.aws_region)
return str(workspace.location)
return workspace.aws_region
return workspace.location

def _get_all_workspaces(self) -> dict[str, int]:
output = dict[str, int]()
27 changes: 16 additions & 11 deletions src/databricks/labs/ucx/account/workspaces.py
@@ -98,20 +98,25 @@ def get_accessible_workspaces(self) -> list[Workspace]:
return accessible_workspaces

def can_administer(self, workspace: Workspace) -> bool:
"""Evaluate if the user can administer a workspace.

A user can administer a workspace if the user can access the workspace and is a member of the workspace "admins"
group.

Args:
workspace (Workspace): The workspace to check if the user can administer.

Returns:
bool: True if the user can administer the workspace, False otherwise.
"""
try:
# check if user has access to workspace
ws = self.client_for(workspace)
except (PermissionDenied, NotFound, ValueError) as err:
logger.warning(f"{workspace.deployment_name}: Encounter error {err}. Skipping...")
return False
current_user = ws.current_user.me()
if current_user.groups is None:
current_user = ws.current_user.me()
except (PermissionDenied, NotFound, ValueError) as e:
logger.warning(f"User cannot access workspace: {workspace.deployment_name}", exc_info=e)
return False
# check if user is a workspace admin
if "admins" not in [g.display for g in current_user.groups]:
logger.warning(
f"{workspace.deployment_name}: User {current_user.user_name} is not a workspace admin. Skipping..."
)
if current_user.groups is None or "admins" not in {g.display for g in current_user.groups}:
logger.warning(f"User '{current_user.user_name}' is not a workspace admin: {workspace.deployment_name}")
return False
return True
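
The same check can be reproduced standalone for the current workspace; this is a hedged sketch that uses only the SDK calls the method above relies on, not the UCX class itself.

```python
# Standalone sketch of the admin check above: a user "can administer" a workspace
# if the workspace is reachable and the user is a member of its "admins" group.
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import NotFound, PermissionDenied

def can_administer_current_workspace() -> bool:
    try:
        ws = WorkspaceClient()  # uses your configured Databricks authentication
        current_user = ws.current_user.me()
    except (PermissionDenied, NotFound, ValueError):
        return False
    groups = current_user.groups or []
    return "admins" in {g.display for g in groups}

print(can_administer_current_workspace())
```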

2 changes: 0 additions & 2 deletions src/databricks/labs/ucx/assessment/export.py
@@ -11,8 +11,6 @@


class AssessmentExporter:
# File and Path Constants
_EXPORT_FILE_NAME = "ucx_assessment_results.zip"

def __init__(self, sql_backend: SqlBackend, config: WorkspaceConfig):
self._sql_backend = sql_backend
11 changes: 9 additions & 2 deletions src/databricks/labs/ucx/assessment/secrets.py
@@ -30,13 +30,20 @@ def _get_secret_if_exists(self, secret_scope, secret_key) -> str | None:
)
return None

def _get_value_from_config_key(self, config: dict, key: str, get_secret: bool = True) -> str | None:
def _get_value_from_config_key(
self,
config: dict,
key: str,
get_secret: bool = True,
) -> str | None:
"""Get a config value based on its key, with some special handling:
If the key is prefixed with spark_conf, i.e. this is in a cluster policy, the actual value is nested
If the value is of format {{secret_scope/secret}}, we extract that as well
"""
if re.search("spark_conf", key):
value = config.get(key, {}).get("value", "")
value = config.get(key, {})
if isinstance(value, dict):
value = value.get("value", "")
else:
value = config.get(key, "")
# retrieve from secret scope if used