Skip to content

Commit

Permalink
Merge pull request #21 from jgarciaf106/feat/add-cli-export-assessment-reviewed
Browse files Browse the repository at this point in the history

Feat/add cli export assessment reviewed
  • Loading branch information
rportilla-databricks authored Oct 2, 2024
2 parents 4d2e9a6 + 77786a2 commit 84830fd
Show file tree
Hide file tree
Showing 10 changed files with 102 additions and 21 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/solacc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,10 @@ jobs:

- name: Verify linters on solution accelerators
run: make solacc

- name: Upload reports
uses: actions/upload-artifact@v4
with:
name: report
path: build/
if-no-files-found: error
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1174,6 +1174,24 @@ The export-assessment command is used to export UCX assessment results to a spec

[[back to top](#databricks-labs-ucx)]

## `export-assessment` command

```commandline
databricks labs ucx export-assessment
```
The export-assessment command is used to export UCX assessment results to a specified location. When you run this command, you will be prompted to provide details on the destination path and the type of report you wish to generate. If you do not specify these details, the command will default to exporting the main results to the current directory. The exported file is named after the report type you select, in the format `export_{query_choice}_results.zip` (e.g. `export_main_results.zip`).
- **Choose a path to save the UCX Assessment results:**
- **Description:** Specify the path where the results should be saved. If not provided, results will be saved in the current directory.

- **Choose which assessment results to export:**
- **Description:** Select the type of results to export. Options include:
- `azure`
- `estimates`
- `interactive`
- `main`

[[back to top](#databricks-labs-ucx)]

# Metastore related commands

These commands are used to assign a Unity Catalog metastore to a workspace. The metastore assignment is a pre-requisite
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ dependencies = ["databricks-sdk~=0.30",
"databricks-labs-lsql>=0.5,<0.13",
"databricks-labs-blueprint>=0.8,<0.10",
"PyYAML>=6.0.0,<7.0.0",
"sqlglot>=25.5.0,<25.23",
"sqlglot>=25.5.0,<25.25",
"astroid>=3.3.1"]

[project.optional-dependencies]
Expand Down
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/hive_metastore/table_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class TableSize:
size_in_bytes: int


class TableSizeCrawler(CrawlerBase):
class TableSizeCrawler(CrawlerBase[TableSize]):
def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None):
"""
Initializes a TablesSizeCrawler instance.
Expand Down
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/hive_metastore/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ def _describe(self, catalog: str, database: str, table: str) -> Table | None:
return None


class FasterTableScanCrawler(CrawlerBase):
class FasterTableScanCrawler(CrawlerBase[Table]):
"""
FasterTableScanCrawler is a specialized version of TablesCrawler that uses spark._jsparkSession to utilize
faster scanning with Scala APIs.
Expand Down
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/hive_metastore/udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def key(self) -> str:
return f"{self.catalog}.{self.database}.{self.name}".lower()


class UdfsCrawler(CrawlerBase):
class UdfsCrawler(CrawlerBase[Udf]):
def __init__(self, backend: SqlBackend, schema: str, include_databases: list[str] | None = None):
"""
Initializes a UdfsCrawler instance.
Expand Down
12 changes: 7 additions & 5 deletions src/databricks/labs/ucx/installer/workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,7 @@
f'--parent_run_id=' + dbutils.widgets.get('parent_run_id'))
"""

EXPORT_TO_EXCEL_NOTEBOOK = """
# Databricks notebook source
EXPORT_TO_EXCEL_NOTEBOOK = """# Databricks notebook source
# MAGIC %md
# MAGIC ##### Exporter of UCX assessment results
# MAGIC ##### Instructions:
Expand Down Expand Up @@ -165,7 +164,7 @@
# DBTITLE 1,Assessment Export
FILE_NAME = "ucx_assessment_main.xlsx"
TMP_PATH = f"/Workspace{ctx.installation.install_folder()}/tmp/"
TMP_PATH = f"/Workspace{{ctx.installation.install_folder()}}/tmp/"
DOWNLOAD_PATH = "/dbfs/FileStore/excel-export"
Expand Down Expand Up @@ -194,8 +193,10 @@ def _to_excel(dataset: Dataset, writer: ...) -> None:
def _render_export() -> None:
'''Render an HTML link for downloading the results.'''
html_content = f'''
<style>@font-face{{font-family:'DM Sans';src:url(https://cdn.bfldr.com/9AYANS2F/at/p9qfs3vgsvnp5c7txz583vgs/dm-sans-regular.ttf?auto=webp&format=ttf) format('truetype');font-weight:400;font-style:normal}}body{{font-family:'DM Sans',Arial,sans-serif}}.export-container{{text-align:center;margin-top:20px}}.export-container h2{{color:#1B3139;font-size:24px;margin-bottom:20px}}.export-container a{{display:inline-block;padding:12px 25px;background-color:#1B3139;color:#fff;text-decoration:none;border-radius:4px;font-size:18px;font-weight:500;transition:background-color 0.3s ease,transform 0.3s ease}}.export-container a:hover{{background-color:#FF3621;transform:translateY(-2px)}}</style><div class="export-container"><h2>Export Results</h2><a href='{workspace_host}files/excel-export/ucx_assessment_main.xlsx?o={workspace_id}' target='_blank' download>Download Results</a></div>
html_content = '''
<style>@font-face{{font-family:'DM Sans';src:url(https://cdn.bfldr.com/9AYANS2F/at/p9qfs3vgsvnp5c7txz583vgs/dm-sans-regular.ttf?auto=webp&format=ttf) format('truetype');font-weight:400;font-style:normal}}body{{font-family:'DM Sans',Arial,sans-serif}}.export-container{{text-align:center;margin-top:20px}}.export-container h2{{color:#1B3139;font-size:24px;margin-bottom:20px}}.export-container a{{display:inline-block;padding:12px 25px;background-color:#1B3139;color:#fff;text-decoration:none;border-radius:4px;font-size:18px;font-weight:500;transition:background-color 0.3s ease,transform:translateY(-2px) ease}}.export-container a:hover{{background-color:#FF3621;transform:translateY(-2px)}}</style>
<div class="export-container"><h2>Export Results</h2><a href='{workspace_host}/files/excel-export/ucx_assessment_main.xlsx?o={workspace_id}' target='_blank' download>Download Results</a></div>
'''
displayHTML(html_content)
Expand Down Expand Up @@ -597,6 +598,7 @@ def create_jobs(self) -> None:
self.remove_jobs(keep=desired_workflows)
self._install_state.save()
self._create_debug(remote_wheels)
self._create_export(remote_wheels)
self._create_readme()

def remove_jobs(self, *, keep: set[str] | None = None) -> None:
Expand Down
5 changes: 4 additions & 1 deletion src/databricks/labs/ucx/source_code/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,10 @@ def message_relative_to(self, base: Path, *, default: Path | None = None) -> str
logger.debug(f'THIS IS A BUG! {advice.code}:{advice.message} has unknown path')
if default is not None:
path = default
path = path.relative_to(base)
try:
path = path.relative_to(base)
except ValueError:
logger.debug(f'Not a relative path: {path} to base: {base}')
# increment start_line because it is 0-based whereas IDEs are usually 1-based
return f"./{path.as_posix()}:{advice.start_line+1}:{advice.start_col}: [{advice.code}] {advice.message}"

Expand Down
8 changes: 7 additions & 1 deletion src/databricks/labs/ucx/source_code/notebooks/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,13 @@ def __init__(
self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted

def lint(self) -> Iterable[Advice]:
yield from self._load_tree_from_notebook(self._notebook, True)
has_failure = False
for advice in self._load_tree_from_notebook(self._notebook, True):
if isinstance(advice, Failure): # happens when a cell is unparseable
has_failure = True
yield advice
if has_failure:
return
for cell in self._notebook.cells:
if not self._context.is_supported(cell.language.language):
continue
Expand Down
65 changes: 55 additions & 10 deletions tests/integration/source_code/solacc.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import dataclasses
import json
import logging
import os
import shutil
import sys
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path

import requests
Expand All @@ -20,6 +23,8 @@

this_file = Path(__file__)
dist = (this_file / '../../../../dist').resolve().absolute()
build = dist.parent / "build"
build.mkdir(exist_ok=True)


def _get_repos_to_clone() -> dict[str, str]:
Expand Down Expand Up @@ -72,23 +77,41 @@ def _collect_uninferrable_count(advices: list[LocatedAdvice]):


def _collect_unparseable(advices: list[LocatedAdvice]):
return set(located_advice for located_advice in advices if located_advice.advice.code == 'parse-error')
return list(located_advice for located_advice in advices if located_advice.advice.code == 'parse-error')


def _print_advices(advices: list[LocatedAdvice]):
for located_advice in advices:
message = located_advice.message_relative_to(dist.parent)
sys.stdout.write(f"{message}\n")
messages = list(
located_advice.message_relative_to(dist.parent).replace('\n', ' ') + '\n' for located_advice in advices
)
if os.getenv("CI"):
advices_path = build / "advices.txt"
with advices_path.open("a") as advices_file:
advices_file.writelines(messages)
else:
for message in messages:
sys.stdout.write(message)


@dataclass
class _SolaccStats:
run_id: str
name: str
start_timestamp: datetime
end_timestamp: datetime
files_count: int
files_size: int


@dataclass
class _SolaccContext:
unparsed_files_path: Path | None = None
files_to_skip: set[str] | None = None
files_to_skip: set[Path] | None = None
total_count = 0
parseable_count = 0
uninferrable_count = 0
missing_imports: dict[str, dict[str, int]] = field(default_factory=dict)
stats: list[_SolaccStats] = field(default_factory=list)

@classmethod
def create(cls, for_all_dirs: bool):
Expand All @@ -98,11 +121,11 @@ def create(cls, for_all_dirs: bool):
unparsed_path = Path(Path(__file__).parent, "solacc-unparsed.txt")
if unparsed_path.exists():
os.remove(unparsed_path)
files_to_skip: set[str] | None = None
files_to_skip: set[Path] | None = None
malformed = Path(__file__).parent / "solacc-malformed.txt"
if for_all_dirs and malformed.exists():
lines = malformed.read_text(encoding="utf-8").split("\n")
files_to_skip = set(line for line in lines if len(line) > 0 and not line.startswith("#"))
files_to_skip = set(dist / line for line in lines if len(line) > 0 and not line.startswith("#"))
return _SolaccContext(unparsed_files_path=unparsed_path, files_to_skip=files_to_skip)

def register_missing_import(self, missing_import: str):
Expand Down Expand Up @@ -153,7 +176,19 @@ def _lint_dir(solacc: _SolaccContext, soldir: Path):
files_to_skip = set(solacc.files_to_skip) if solacc.files_to_skip else set()
linted_files = set(files_to_skip)
# lint solution
start_timestamp = datetime.now(timezone.utc)
advices = list(ctx.local_code_linter.lint_path(soldir, linted_files))
end_timestamp = datetime.now(timezone.utc)
# record stats
stats = _SolaccStats(
run_id=os.getenv("GITHUB_RUN_ATTEMPT") or "local",
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
name=soldir.name,
files_count=len(all_files),
files_size=sum(path.stat().st_size for path in [soldir / filename for filename in all_files]),
)
solacc.stats.append(stats)
# collect unparseable files
unparseables = _collect_unparseable(advices)
solacc.parseable_count += len(linted_files) - len(files_to_skip) - len(set(advice.path for advice in unparseables))
Expand All @@ -162,7 +197,11 @@ def _lint_dir(solacc: _SolaccContext, soldir: Path):
logger.error(f"Error during parsing of {unparseable.path}: {unparseable.advice.message}".replace("\n", " "))
# populate solacc-unparsed.txt
with solacc.unparsed_files_path.open(mode="a", encoding="utf-8") as f:
f.write(unparseable.path.relative_to(dist).as_posix())
try:
path = unparseable.path.relative_to(dist)
except ValueError:
path = unparseable.path
f.write(path.as_posix())
f.write("\n")
# collect missing imports
for missing_import in _collect_missing_imports(advices):
Expand All @@ -178,8 +217,8 @@ def _lint_dir(solacc: _SolaccContext, soldir: Path):
def _lint_repos(clone_urls, sol_to_lint: str | None):
solacc = _SolaccContext.create(sol_to_lint is not None)
if sol_to_lint:
# don't clone if linting just one file, assumption is we're troubleshooting
_lint_dir(solacc, dist / sol_to_lint)
sol_dir = _clone_repo(clone_urls[sol_to_lint], sol_to_lint)
_lint_dir(solacc, sol_dir)
else:
names: list[str] = list(clone_urls.keys())
for name in sorted(names, key=str.casefold):
Expand All @@ -199,6 +238,12 @@ def _lint_repos(clone_urls, sol_to_lint: str | None):
f"not computed: {solacc.uninferrable_count}"
)
solacc.log_missing_imports()
# log stats
stats_path = build / "stats.json"
with stats_path.open("a") as stats_file:
for stats in solacc.stats:
message = json.dumps(dataclasses.asdict(stats), default=str)
stats_file.writelines([message])
# fail the job if files are unparseable
if parseable_pct < 100:
sys.exit(1)
Expand Down

0 comments on commit 84830fd

Please sign in to comment.