Added export CLI functionality for assessment results #2553

Merged: 93 commits, Oct 8, 2024

Commits:
4f0c6dc
initial draft of remote dashboards widgets collection
rportilla-databricks Aug 26, 2024
d288e0a
Creating new branch without committing any files
rportilla-databricks Aug 27, 2024
5aee6b6
Cherry-pick commit 60f957e
rportilla-databricks Aug 27, 2024
afc714c
test
lmallepaddi Aug 27, 2024
ad01abb
test commit
lmallepaddi Aug 27, 2024
f26c522
Added UCX Export Result notebook as Utility Hack
jgarciaf106 Aug 29, 2024
522d43e
add test files
rportilla-databricks Aug 30, 2024
0c7083d
updated unit tests
rportilla-databricks Sep 3, 2024
66dbab1
Merge pull request #1 from rportilla-databricks/feat/ucx-export-notebook
rportilla-databricks Sep 3, 2024
32adee0
Update workflows.py
jgarciaf106 Sep 3, 2024
77a88cd
Merge pull request #3 from rportilla-databricks/feat/add_exporter
rportilla-databricks Sep 3, 2024
9db2d70
Merge pull request #2 from rportilla-databricks/export-notebook-patch-1
rportilla-databricks Sep 4, 2024
33f1f11
Adding unit tests coverage
lmallepaddi Sep 4, 2024
898eeef
fullset unit test cases and slight changes to export.py file
lmallepaddi Sep 5, 2024
2c8e92b
Removed unnecessary imports
lmallepaddi Sep 5, 2024
d6243ad
Merge pull request #4 from rportilla-databricks/feat/add_exporter
rportilla-databricks Sep 6, 2024
99bf9a1
updated tests
rportilla-databricks Sep 6, 2024
ee446f1
update cli for export
rportilla-databricks Sep 7, 2024
e57cbfe
update test format
rportilla-databricks Sep 7, 2024
1240c63
Merge branch 'main' into main
rportilla-databricks Sep 7, 2024
5cdeb3c
Update src/databricks/labs/ucx/assessment/export.py
rportilla-databricks Sep 9, 2024
c63904e
add structural changes
rportilla-databricks Sep 10, 2024
8b71454
Integrated lsql functionality to export the Assessment Results
jgarciaf106 Sep 11, 2024
a8a325e
Minor Changes
jgarciaf106 Sep 11, 2024
6ebc90c
Merge branch 'main' into feat/cli-assessment-export
rportilla-databricks Sep 11, 2024
f47f125
Merge pull request #7 from jgarciaf106/feat/cli-assessment-export
rportilla-databricks Sep 11, 2024
cd67ee4
naming change missed in cli.py
rportilla-databricks Sep 11, 2024
3e8d342
correcting use of the cached property
rportilla-databricks Sep 11, 2024
92fd1c0
fix cached property
rportilla-databricks Sep 11, 2024
e5e2e10
fix cli
rportilla-databricks Sep 11, 2024
a71b086
Removed flags as the code provides prompts
lmallepaddi Sep 11, 2024
21785e4
added command line usage in readme file
lmallepaddi Sep 12, 2024
077a03f
Merge pull request #8 from rportilla-databricks/feat/add_exporter
rportilla-databricks Sep 12, 2024
14fa408
minor change to export.py to resolve unit test case issue
lmallepaddi Sep 12, 2024
8d607c8
fix errors and address feedback
rportilla-databricks Sep 12, 2024
20c19f2
updated syntax for export
rportilla-databricks Sep 12, 2024
cf6fa3c
Merge branch 'main' into feat/add_exporter
lmallepaddi Sep 12, 2024
6a33aca
Update src/databricks/labs/ucx/cli.py
rportilla-databricks Sep 12, 2024
7b798aa
Update src/databricks/labs/ucx/installer/workflows.py
rportilla-databricks Sep 12, 2024
66c281f
Update workflows.py
jgarciaf106 Sep 12, 2024
a00ecf8
Merge pull request #10 from rportilla-databricks/Workflows-patch-3
rportilla-databricks Sep 12, 2024
92937ce
revert tmp path
rportilla-databricks Sep 12, 2024
b4d58d5
changes to README.md and test_export.py
lmallepaddi Sep 12, 2024
4599dfa
Merge pull request #9 from rportilla-databricks/feat/add_exporter
rportilla-databricks Sep 15, 2024
dce6f86
Merge branch 'databrickslabs:main' into feat/add_exporter
rportilla-databricks Sep 15, 2024
d61a5ce
Merge branch 'databrickslabs:main' into main
rportilla-databricks Sep 15, 2024
8c7dd67
Merge pull request #11 from rportilla-databricks/feat/add_exporter
rportilla-databricks Sep 15, 2024
d94d581
update the logical name of the collection of queries
rportilla-databricks Sep 16, 2024
007891d
update reference
rportilla-databricks Sep 16, 2024
598c596
Updated notebook content to Use triple quotes
jgarciaf106 Sep 16, 2024
82146db
Merge pull request #12 from rportilla-databricks/export_notebook/patch
rportilla-databricks Sep 16, 2024
f4b7170
zip file rename functionality
lmallepaddi Sep 17, 2024
90a8189
changed readme file to reflect changes to zipfilename
lmallepaddi Sep 17, 2024
5eb417c
Changes to export.py using lsql version 0.11
lmallepaddi Sep 19, 2024
9069f3d
updated the test_cli.py for filename change
lmallepaddi Sep 19, 2024
44b4ac4
remove print statement
rportilla-databricks Sep 23, 2024
bcfd6e1
Merge pull request #13 from rportilla-databricks/feat/add_exporter
rportilla-databricks Sep 23, 2024
f90d2c1
Merge branch 'main' into main
nfx Sep 23, 2024
6e3ddf9
Export Results simplification
jgarciaf106 Sep 24, 2024
26799fc
Merge pull request #14 from rportilla-databricks/patch-export
rportilla-databricks Sep 25, 2024
b2e3c85
Fixed Unit Tests for Export Assessment
jgarciaf106 Sep 25, 2024
9fb3f49
Merge pull request #15 from rportilla-databricks/patch-export
rportilla-databricks Sep 25, 2024
3d567b4
Merge branch 'databrickslabs:main' into main
rportilla-databricks Sep 26, 2024
ae5832d
make fmt run
rportilla-databricks Sep 26, 2024
204ab4a
Minor contributor documentation changes (#2729)
asnare Sep 24, 2024
0a6e72f
Handle `PermissionDenied` when listing accessible workspaces (#2733)
JCZuurmond Sep 24, 2024
93d496e
Adding unskip CLI command to undo a skip on schema or a table (#2727)
aminmovahed-db Sep 24, 2024
ee20112
Fix failing integration tests that perform a real assessment (#2736)
ericvergnaud Sep 24, 2024
2f62a0f
Update documentation to explain the usage of collections and eligible…
HariGS-DB Sep 24, 2024
8fee9d8
Enables cli cmd `databricks labs ucx create-catalog-schemas` to apply…
HariGS-DB Sep 24, 2024
21da7cc
Add `create-ucx-catalog` cli command (#2694)
JCZuurmond Sep 25, 2024
2a09a8f
Fixes issue of circular dependency of migrate-location ACL (#2741)
HariGS-DB Sep 25, 2024
d1dd0c5
Added static code analysis results to assessment dashboard (#2696)
ericvergnaud Sep 25, 2024
7cba9b0
Increases test coverage (#2739)
pritishpai Sep 25, 2024
04a4956
Fixes source table alias disappearance during migrate_views (#2726)
pritishpai Sep 25, 2024
15c5536
Bump astroid version, pylint version and drop our f-string workaround…
ericvergnaud Sep 25, 2024
2611b11
Update databricks-labs-blueprint requirement from <0.9,>=0.8 to >=0.8…
dependabot[bot] Sep 25, 2024
555e83a
Delete temporary files when running solacc (#2750)
ericvergnaud Sep 25, 2024
d744465
Code format: `make fmt` (#2749)
asnare Sep 25, 2024
04918be
Speedup assessment workflow by making DBFS root table size calculatio…
nfx Sep 25, 2024
0a03def
Harden configuration reading (#2701)
JCZuurmond Sep 26, 2024
be955ea
Add unskip CLI command to undo a skip on schema or a table (#2734)
aminmovahed-db Sep 26, 2024
788c273
Improve solacc linting (#2752)
ericvergnaud Sep 26, 2024
1d12391
Sync Fork, make fmt test
jgarciaf106 Sep 26, 2024
d3a0e39
Merge pull request #16 from rportilla-databricks/patch-export-v2
rportilla-databricks Sep 27, 2024
4d2e9a6
Merge branch 'databrickslabs:main' into main
rportilla-databricks Sep 27, 2024
b163801
Added CLI Functionality to export UCX Assessment
jgarciaf106 Sep 30, 2024
188bc24
Added CLI Functionality to export UCX Assessment
jgarciaf106 Sep 30, 2024
cb89541
Added CLI Functionality to export UCX Assessment
jgarciaf106 Sep 30, 2024
571fc8b
Added CLI Functionality to export UCX Assessment
jgarciaf106 Sep 30, 2024
77786a2
Merge branch 'main' into feat/add-cli-export-assessment-reviewed
rportilla-databricks Oct 2, 2024
84830fd
Merge pull request #21 from jgarciaf106/feat/add-cli-export-assessmen…
rportilla-databricks Oct 2, 2024
2975950
Merge branch 'main' into main
jgarciaf106 Oct 8, 2024
Files changed:
2 changes: 2 additions & 0 deletions README.md
@@ -10,8 +10,10 @@ so that you'll be able to [scope the migration](docs/assessment.md) and execute
The [README notebook](#readme-notebook), which can be found in the installation folder, contains further instructions and explanations of the different ucx workflows & dashboards.
Once the migration is scoped, you can start with the [table migration process](#Table-Migration).


Review comment (Member): Remove redundant newlines

More workflows, like notebook code migration, are coming in future releases.


UCX also provides a number of command line utilities accessible via `databricks labs ucx`.

For questions, troubleshooting or bug fixes, please see our [troubleshooting guide](docs/troubleshooting.md) or submit [an issue](https://github.com/databrickslabs/ucx/issues).
2 changes: 2 additions & 0 deletions labs.yml
@@ -275,3 +275,5 @@ commands:
      - name: target-workspace-id
        description: (Optional) id of a workspace in the target collection. If not specified, ucx will prompt to select from a list

  - name: export
    description: export widget data from the assessment
Review comment (Member): Can we update the command to accept a second argument to indicate what is exported?

    databricks labs ucx export --what assessment

Suggested change:
-    description: export widget data from the assessment
+    description: export ucx data

@nfx: please provide your input on this API.
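For illustration, a hypothetical sketch (not the merged code) of what such a --what dispatch could look like in cli.py; AssessmentExporter and WorkspaceContext are names from this PR, while the `what` parameter and the dispatch are only the reviewer's proposal:

    @ucx.command()
    def export(w: WorkspaceClient, prompts: Prompts, what: str = "assessment"):
        """Exports ucx data, e.g. `databricks labs ucx export --what assessment`."""
        ctx = WorkspaceContext(w)
        if what == "assessment":
            AssessmentExporter(ctx).export_results(prompts, None)  # None -> prompt for a path
        else:
            raise ValueError(f"unknown export target: {what}")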

116 changes: 116 additions & 0 deletions src/databricks/labs/ucx/assessment/export.py
@@ -0,0 +1,116 @@
import os
import re
import csv
import logging
from pathlib import Path
from zipfile import ZipFile
from concurrent.futures import ThreadPoolExecutor
from databricks.labs.blueprint.tui import Prompts
from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext
Review comment (Member): isort


logger = logging.getLogger(__name__)


class AssessmentExporter:
    # File and Path Constants
    _ZIP_FILE_NAME = "ucx_assessment_results.zip"
Review comment (Member), suggested change:
-    _ZIP_FILE_NAME = "ucx_assessment_results.zip"
+    _EXPORT_FILE_NAME = "ucx_assessment_results.zip"


    def __init__(self, ctx: WorkspaceContext):
Review comment (Collaborator), suggested change:
-    def __init__(self, ctx: WorkspaceContext):
+    def __init__(self, sql_backend: SqlBackend, config: WorkspaceConfig):

It's an anti-pattern to depend on the entire WorkspaceContext; depend only on what is used.
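A minimal sketch of that narrower constructor, assuming SqlBackend from databricks.labs.lsql.backends and WorkspaceConfig from databricks.labs.ucx.config (both existing modules in this codebase):

    from databricks.labs.lsql.backends import SqlBackend
    from databricks.labs.ucx.config import WorkspaceConfig

    class AssessmentExporter:
        def __init__(self, sql_backend: SqlBackend, config: WorkspaceConfig):
            # Depend only on the two collaborators this class actually uses,
            # not on the whole WorkspaceContext.
            self._sql_backend = sql_backend
            self._config = config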

        self._ctx = ctx

    def _get_ucx_main_queries(self) -> list[dict[str, str]]:
Review comment (Collaborator): Use the existing QueryTile abstraction.

"""Retrieve and construct the main UCX queries."""
pattern = r"\b.inventory\b"
schema = self._ctx.inventory_database
project_root = Path(__file__).parent.parent.parent.parent
ucx_main_queries_path = project_root / "labs/ucx/queries/assessment/main"
Review comment (Collaborator): This is a far more concise version of this class that does the same thing, but with the portability to become part of https://github.com/databrickslabs/lsql. You can PR it over there first and call it from UCX after, if you'd like. This way we can export any dashboards-as-code into CSV for any project.

    from databricks.labs.lsql.dashboards import DashboardMetadata

    dashboard = DashboardMetadata.from_path(ucx_main_queries_path)
    dashboard = dashboard.replace_database(catalog='hive_metastore', database=self._config.inventory_database)
    for tile in dashboard.tiles:
        if not tile.is_query():
            continue
        file_name = f"{tile.id}.csv"
        for row in self._sql_backend.fetch(tile.content):
            _ = row.as_dict()

Review comment (Contributor): WIP


        # List all SQL files in the directory
        sql_files = [file for file in ucx_main_queries_path.iterdir() if file.suffix == ".sql"]

        ucx_main_queries = []

        for sql_file in sql_files:
            content = sql_file.read_text()
            modified_content = re.sub(pattern, f" {schema}", content, flags=re.IGNORECASE)
Review comment (Collaborator), suggested change:
-            modified_content = re.sub(pattern, f" {schema}", content, flags=re.IGNORECASE)
+            modified_content = self._config.replace_inventory_variable(content)

Use databricks.labs.ucx.config.WorkspaceConfig.replace_inventory_variable, which already exists for this purpose.
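For context, that helper presumably amounts to a simple substitution of the $inventory placeholder used by the packaged queries. An approximate sketch, from memory rather than the verbatim implementation:

    # Approximate sketch of WorkspaceConfig.replace_inventory_variable:
    def replace_inventory_variable(self, text: str) -> str:
        return text.replace("$inventory", f"hive_metastore.{self.inventory_database}")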

Review comment (Collaborator): See the other comment; this method can be written in a more maintainable way.

            query_name = sql_file.stem
            ucx_main_queries.append({"name": query_name, "query": modified_content})

        return ucx_main_queries

    @staticmethod
    def _extract_target_name(name: str, pattern: str) -> str:
        """Extract target name from the file name using the provided pattern."""
        match = re.search(pattern, name)
        return match.group(1) if match else ""
Review comment (Collaborator): This method is not used in production code.


    @staticmethod
    def _cleanup(path: Path, target_name: str) -> None:
        """Remove a specific CSV file in the given path that matches the target name."""
        target_file = path.joinpath(target_name)

        if target_file.exists():
            target_file.unlink()

    def _execute_query(self, path: Path, result: dict[str, str]) -> None:
        """Execute a SQL query and write the result to a CSV file."""
        pattern = r"^\d+_\d+_(.*)"
        match = re.search(pattern, result["name"])
        if match:
            file_name = f"{match.group(1)}.csv"
            csv_path = os.path.join(path, file_name)

            query_results = list(self._ctx.sql_backend.fetch(result["query"]))

            if query_results:
                headers = query_results[0].asDict().keys()
                with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
Review comment (Collaborator), suggested change:
-            csv_path = os.path.join(path, file_name)
-
-            query_results = list(self._ctx.sql_backend.fetch(result["query"]))
-
-            if query_results:
-                headers = query_results[0].asDict().keys()
-                with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
+            query_results = list(self._ctx.sql_backend.fetch(result["query"]))
+
+            if query_results:
+                headers = query_results[0].asDict().keys()
+                with (path / file_name).open(mode='w', newline='', encoding='utf-8') as file:

Consistently use pathlib.Path everywhere: https://docs.python.org/3/library/pathlib.html#pathlib.Path.open

os.path.join turns it back into a mere string.
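To illustrate the reviewer's point, a trivial sketch of the difference:

    from pathlib import Path

    path = Path("/tmp/export")
    csv_path = path / "a.csv"  # stays a Path, keeping .open(), .exists(), .unlink()
    # os.path.join(path, "a.csv") would return a plain str instead
    with csv_path.open(mode="w", newline="", encoding="utf-8") as f:
        f.write("id,name\n")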

                    writer = csv.DictWriter(file, fieldnames=headers)
                    writer.writeheader()
                    for row in query_results:
                        writer.writerow(row.asDict())
                # Add the CSV file to the ZIP archive
                self._add_to_zip(path, file_name)

    def _add_to_zip(self, path: Path, file_name) -> None:
Review comment (Collaborator), suggested change:
-    def _add_to_zip(self, path: Path, file_name) -> None:
+    def _add_to_zip(self, path: Path, file_name: str) -> None:

Use types everywhere.

"""Create a ZIP file containing all the CSV files."""
zip_path = path / self._ZIP_FILE_NAME
file_path = path / file_name

try:
with ZipFile(zip_path, 'a') as zipf:
Review comment (Collaborator): Why do you need to create temporary CSV files if you can write them directly to the open zip? https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.open

    with ZipFile(target_folder / 'ucx-export.zip', mode='w') as z:
        ...
        with z.open(f'{tile.id}.csv') as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            for row in query_results:
                ...

This way you don't have to clean up a file.
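A runnable sketch of that idea: ZipFile.open(..., mode="w") yields a binary stream, so it needs an io.TextIOWrapper before csv can write to it. Rows are assumed here to already be plain dicts:

    import csv
    import io
    from pathlib import Path
    from zipfile import ZipFile

    def write_rows_to_zip(target_folder: Path, file_name: str, rows: list[dict]) -> None:
        """Stream query results straight into a CSV entry inside the archive."""
        if not rows:
            return
        with ZipFile(target_folder / "ucx-export.zip", mode="a") as z:
            # mode="w" opens the archive member for writing; wrap the binary
            # stream in text mode so csv.DictWriter can use it.
            with z.open(file_name, mode="w") as raw:
                with io.TextIOWrapper(raw, encoding="utf-8", newline="") as f:
                    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
                    writer.writeheader()
                    writer.writerows(rows)

With this approach no intermediate CSV files touch the disk, so there is nothing to clean up afterwards.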

                zipf.write(file_path, arcname=file_name)

        except FileNotFoundError:
            print(f"File {file_path} not found.")
Review comment (Collaborator), suggested change:
-            print(f"File {file_path} not found.")
+            logger.warning(f"File {file_path} not found.")

Don't use print, use a logger. I've realised you didn't run make fmt, as our static code analysis checker would have caught this on your local machine.

        except PermissionError:
            print(f"Permission denied for {file_path} or {zip_path}.")

        # Clean up the file if it was successfully added
        if file_path.exists():
            self._cleanup(path, file_name)

    def export_results(self, prompts: Prompts, path: Path | None) -> None:
        """Main method to export results to CSV files inside a ZIP archive."""
        results = self._get_ucx_main_queries()
        if path is None:
            response = prompts.question(
                "Choose a path to save the UCX Assessment results",
                default=Path.cwd().as_posix(),
                validate=lambda p_: Path(p_).exists(),
            )
            path = Path(response)
        else:
            logger.info(f"Using the provided path: {path}")
Review comment (Collaborator), suggested change (delete these lines):
-        else:
-            logger.info(f"Using the provided path: {path}")

It's redundant, as you already have logger.info(f"Exporting UCX Assessment (Main) results to {path}").

        try:
            logger.info(f"Exporting UCX Assessment (Main) results to {path}")
            with ThreadPoolExecutor(max_workers=4) as executor:
Review comment (Collaborator): We don't use ThreadPoolExecutor directly, as it "swallows" errors by default and needs more work for robust error handling. We use Threads.strict(...) across the codebase. See the docs at https://github.com/databrickslabs/blueprint?tab=readme-ov-file#parallel-task-execution

You can add a threading.Lock() on this instance and something like:

    with ZipFile(path / 'ucx-export.zip', mode='w') as zip:
        tasks = [partial(self._append_to_zip, zip, tile) for tile in dashboard.tiles]
        Threads.strict("exporting", tasks)
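Filling in that sketch under the reviewer's assumptions (Threads.strict comes from databricks.labs.blueprint.parallel, and the lock is needed because ZipFile is not safe for concurrent writes); the _append_to_zip helper below is hypothetical:

    import threading
    from functools import partial
    from pathlib import Path
    from zipfile import ZipFile

    from databricks.labs.blueprint.parallel import Threads

    class _ExportSketch:
        def __init__(self):
            self._lock = threading.Lock()

        def _append_to_zip(self, zip_file: ZipFile, name: str, payload: str) -> None:
            with self._lock:  # serialize writes: ZipFile is not thread-safe
                zip_file.writestr(name, payload)

        def export(self, path: Path, csv_payloads: dict[str, str]) -> None:
            with ZipFile(path / "ucx-export.zip", mode="w") as zip_file:
                tasks = [
                    partial(self._append_to_zip, zip_file, name, body)
                    for name, body in csv_payloads.items()
                ]
                Threads.strict("exporting", tasks)  # raises if any task failed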

                futures = [executor.submit(self._execute_query, path, result) for result in results]
                for future in futures:
                    future.result()

        except TimeoutError as e:
            print("A thread execution timed out. Check the query execution logic.")
            print(f"Error exporting results: {e}")
        finally:
            logger.info(f"UCX Assessment (Main) results exported to {path}")
Review comment (Collaborator): Are they? What if the path you're trying to write to is not writable, like /dev/null?

9 changes: 9 additions & 0 deletions src/databricks/labs/ucx/cli.py
@@ -16,6 +16,7 @@
from databricks.labs.ucx.hive_metastore.tables import What
from databricks.labs.ucx.install import AccountInstaller
from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter
from databricks.labs.ucx.assessment.export import Exporter

ucx = App(__file__)
logger = get_logger(__file__)
@@ -564,6 +565,14 @@ def join_collection(a: AccountClient, workspace_ids: str):
    account_installer.join_collection(w_ids)


@ucx.command()
def export(w: WorkspaceClient, prompts: Prompts, path: Path | None = None):
    """exports the assessment dashboard"""
    ctx = WorkspaceContext(w)
    exporter = Exporter(ctx)
    exporter.export_results(prompts, path)


@ucx.command
def lint_local_code(
    w: WorkspaceClient, prompts: Prompts, path: str | None = None, ctx: LocalCheckoutContext | None = None
132 changes: 132 additions & 0 deletions src/databricks/labs/ucx/installer/workflows.py
@@ -51,6 +51,7 @@
from databricks.labs.ucx.installer.logs import PartialLogRecord, parse_logs
from databricks.labs.ucx.installer.mixins import InstallationMixin


logger = logging.getLogger(__name__)

TEST_RESOURCE_PURGE_TIMEOUT = timedelta(hours=1)
@@ -112,6 +113,126 @@
f'--parent_run_id=' + dbutils.widgets.get('parent_run_id'))
"""

EXPORT_UCX_NOTEBOOK = """
Review comment (Collaborator), suggested change:
-EXPORT_UCX_NOTEBOOK = """
+EXPORT_TO_EXCEL_NOTEBOOK = """

# Databricks notebook source
# MAGIC %md
# MAGIC ##### Exporter of UCX assessment results
# MAGIC ##### Instructions:
# MAGIC 1. Execute using an all-purpose cluster with Databricks Runtime 14 or higher.
# MAGIC 1. Hit **Run all** button and wait for completion.
# MAGIC 1. Go to the bottom of the notebook and click the Download UCX Results button.
# MAGIC
# MAGIC ##### Important:
# MAGIC Please note that this is only meant to serve as example code.
# MAGIC This is not official **Databricks** or **Databricks Labs UCX** code.
# MAGIC
# MAGIC Example code developed by **Databricks Shared Technical Services team**.
# COMMAND ----------
# DBTITLE 1,Installing Packages
# MAGIC %pip install {remote_wheel} -q -q -q
# MAGIC %pip install xlsxwriter -q -q -q
Review comment (Member): Why do we suppress the pip install?

Suggested change:
-# MAGIC %pip install {remote_wheel} -q -q -q
-# MAGIC %pip install xlsxwriter -q -q -q
+# MAGIC %pip install {remote_wheel} -qqq
+# MAGIC %pip install xlsxwriter -qqq

Note that this will fail for workspaces that have restrictive internet access. To handle this the same way we install ucx, use upload_dependencies; see install.py.

# MAGIC dbutils.library.restartPython()
# COMMAND ----------
# DBTITLE 1,Import Libraries
# Standard library imports
Review comment (Member): Remove redundant comments; this is convention, comments do not help.

import os
import re
import shutil
import json
from typing import List, Dict
from ast import literal_eval
from concurrent.futures import ThreadPoolExecutor
# Third-party library imports
import pandas as pd
import xlsxwriter
# Databricks imports
from databricks.labs.ucx.contexts.workflow_task import RuntimeContext
import databricks.labs.ucx.queries.assessment.main as queries
# Resource management
import importlib.resources as resources
# COMMAND ----------
# DBTITLE 1,UCX Assessment Export
class Exporter:
    # File and Path Constants
    _FILE_NAME = "ucx_assessment_results.xlsx"
    _TMP_PATH = "/Workspace/Applications/ucx/ucx_results/"
Review comment (Collaborator): This has to be replaced with the self._installation.install_folder() value, as there are modes where it's installed into /Users/foo@bar.com/.ucx. For the folder, let's name it excel-export-results or something like it.

    _DOWNLOAD_PATH = "/dbfs/FileStore/ucx_results"
    # Named Parameters
    _NAMED_PARAMS = dict(config="/Workspace{config_file}")

    def __init__(self) -> None:
        self._ctx = RuntimeContext(self._NAMED_PARAMS)

    def _get_ucx_main_queries(self) -> List[Dict[str, str]]:
        '''Retrieve and construct the main UCX queries.'''
        pattern = r"\\b.inventory\\b"
        schema = self._ctx.inventory_database
        sql_files = [
            file.name
            for file in resources.files(queries).iterdir()
            if file.suffix == ".sql" and "count" not in file.name
        ]
        ucx_main_queries = [
            dict(name="01_1_permissions", query=f"SELECT * FROM {schema}.permissions"),
            dict(name="02_2_ucx_grants", query=f"SELECT * FROM {schema}.grants;"),
            dict(name="03_3_groups", query=f"SELECT * FROM {schema}.groups;"),
        ]
        for sql_file in sql_files:
            with resources.as_file(resources.files(queries) / sql_file) as file_path:
                content = file_path.read_text()
                modified_content = re.sub(pattern, f" {schema}", content, flags=re.IGNORECASE)
                query_name = sql_file[:-4]
                ucx_main_queries.append(dict(name=query_name, query=modified_content))
        return ucx_main_queries
    def _cleanup(self) -> None:
        '''Move the temporary results file to the download path and clean up the temp directory.'''
        shutil.move(
            os.path.join(self._TMP_PATH, self._FILE_NAME),
            os.path.join(self._DOWNLOAD_PATH, self._FILE_NAME),
        )
        shutil.rmtree(self._TMP_PATH)

    def _prepare_directories(self) -> None:
        '''Ensure that the necessary directories exist.'''
        os.makedirs(self._TMP_PATH, exist_ok=True)
        os.makedirs(self._DOWNLOAD_PATH, exist_ok=True)

    def _execute_query(self, result: Dict[str, str], writer: pd.ExcelWriter) -> None:
        '''Execute a SQL query and write the result to an Excel sheet.'''
        pattern = r'^\\d+_\\d+_(.*)'
        match = re.search(pattern, result["name"])
        if match:
            sheet_name = match.group(1)
            sdf = spark.sql(result["query"])
            if sdf.count() > 0:
                df = sdf.toPandas()
                df.to_excel(writer, sheet_name=sheet_name, index=False)

    def _render_export(self) -> None:
        '''Render an HTML link for downloading the results.'''
        html_content = f'''
        <style>@font-face{{font-family:'DM Sans';src:url(https://cdn.bfldr.com/9AYANS2F/at/p9qfs3vgsvnp5c7txz583vgs/dm-sans-regular.ttf?auto=webp&format=ttf) format('truetype');font-weight:400;font-style:normal}}body{{font-family:'DM Sans',Arial,sans-serif}}.export-container{{text-align:center;margin-top:20px}}.export-container h2{{color:#1B3139;font-size:24px;margin-bottom:20px}}.export-container a{{display:inline-block;padding:12px 25px;background-color:#1B3139;color:#fff;text-decoration:none;border-radius:4px;font-size:18px;font-weight:500;transition:background-color 0.3s ease,transform 0.3s ease}}.export-container a:hover{{background-color:#FF3621;transform:translateY(-2px)}}</style><div class="export-container"><h2>Export Results</h2><a href='{workspace_host}files/ucx_results/ucx_assessment_results.xlsx?o={workspace_id}' target='_blank' download>Download UCX Results</a></div>
        '''
        displayHTML(html_content)

    def export_results(self) -> None:
        '''Main method to export results to an Excel file.'''
        self._prepare_directories()
        results = self._get_ucx_main_queries()
        try:
            with pd.ExcelWriter(
                os.path.join(self._TMP_PATH, self._FILE_NAME), engine="xlsxwriter"
            ) as writer:
                with ThreadPoolExecutor(max_workers=4) as executor:
                    futures = [
                        executor.submit(self._execute_query, result, writer)
                        for result in results
                    ]
                    for future in futures:
                        future.result()
            self._cleanup()
            self._render_export()
        except Exception as e:
            print(f"Error exporting results: {e}")

# COMMAND ----------
# DBTITLE 1,Automate UCX Data Export
Exporter().export_results()
"""


class DeployedWorkflows:
    def __init__(self, ws: WorkspaceClient, install_state: InstallState, verify_timeout: timedelta):
@@ -486,6 +607,7 @@ def create_jobs(self) -> None:

        self._install_state.save()
        self._create_debug(remote_wheels)
        self._create_export(remote_wheels)
        self._create_readme()

    @property
@@ -788,6 +910,16 @@ def _create_debug(self, remote_wheels: list[str]):
        ).encode("utf8")
        self._installation.upload('DEBUG.py', content)

    def _create_export(self, remote_wheels: list[str]):
        content = EXPORT_UCX_NOTEBOOK.format(
            remote_wheel=remote_wheels,
            config_file=self._config_file,
            workspace_host=self._ws.config.host,
            workspace_id=self._ws.get_workspace_id(),
            schema=self._config.inventory_database,
        ).encode("utf8")
        self._installation.upload('EXPORT_UCX_RESULTS.py', content)
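A note on the template mechanics: EXPORT_UCX_NOTEBOOK is a str.format template, so single-brace tokens such as {config_file} and {workspace_host} are substituted here, while the doubled braces in the embedded CSS escape literal braces. A tiny illustration with hypothetical values:

    template = "host = {workspace_host}; literal braces survive: {{not-a-field}}"
    rendered = template.format(workspace_host="https://adb-123.azuredatabricks.net")
    # rendered == "host = https://adb-123.azuredatabricks.net; literal braces survive: {not-a-field}"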


class MaxedStreamHandler(logging.StreamHandler):
