From 729e9ae795d000133839b84ce58ad17f95f9ecea Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 30 Sep 2024 14:08:02 +0200 Subject: [PATCH 1/7] Generate solacc reports (#2757) ## Changes Currently, `solacc` outputs advices to the console, and does not keep track of linting time. This PR: - dumps advices to an 'advices.txt' file - collects stats and dumps them to a 'stats.json' file (in the form of a json-like file, with 1 json object per line) - uploads the above ### Linked issues None ### Functionality None ### Tests - [x] manually tested: Sample solacc run: ![Screenshot 2024-09-27 at 16 44 22](https://github.com/user-attachments/assets/a0f508e4-4095-45b8-be38-51d08824b15e) Sample stats.json (expanded): ``` { "run_id": "1", "name": "ab-testing", "start_timestamp": "2024-09-27 10:16:02.512363+00:00", "end_timestamp": "2024-09-27 10:17:07.622161+00:00", "files_count": 6, "files_size": 34934 } { "run_id": "1", "name": "adverse-drug-events", "start_timestamp": "2024-09-27 10:17:08.669225+00:00", "end_timestamp": "2024-09-27 10:17:09.399495+00:00", "files_count": 5, "files_size": 48743 } { "run_id": "1", "name": "als-recommender", "start_timestamp": "2024-09-27 10:17:09.565942+00:00", "end_timestamp": "2024-09-27 10:17:12.039422+00:00", "files_count": 6, "files_size": 62750 } ``` Sample advices.txt: ``` ./dist/ab-testing/4. Real time inference.py:76:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/4. Real time inference.py:139:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/ab-testing/4. Real time inference.py:139:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/RUNME.py:1:1: [library-install-failed] Unsupported 'pip' command: DBTITLE ./dist/ab-testing/5. AB testing metrics.py:50:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/ab-testing/5. AB testing metrics.py:50:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/5. AB testing metrics.py:51:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/ab-testing/5. AB testing metrics.py:51:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/5. AB testing metrics.py:54:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/ab-testing/5. AB testing metrics.py:54:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/5. AB testing metrics.py:58:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/ab-testing/5. AB testing metrics.py:58:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/2. Model training.py:34:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/ab-testing/2. Model training.py:34:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/2. Model training.py:36:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/2. Model training.py:37:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. 
Rewrite it using spark ./dist/ab-testing/2. Model training.py:37:0: [rdd-in-shared-clusters] RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API ./dist/ab-testing/2. Model training.py:38:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/ab-testing/2. Model training.py:41:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/ab-testing/2. Model training.py:41:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/ab-testing/5. AB testing metrics.py:22:0: [default-format-changed-in-dbr8] The default format changed in Databricks Runtime 8.0, from Parquet to Delta ./dist/ab-testing/5. AB testing metrics.py:36:27: [default-format-changed-in-dbr8] The default format changed in Databricks Runtime 8.0, from Parquet to Delta ./dist/ab-testing/5. AB testing metrics.py:90:2: [default-format-changed-in-dbr8] The default format changed in Databricks Runtime 8.0, from Parquet to Delta ./dist/ab-testing/5. AB testing metrics.py:97:2: [default-format-changed-in-dbr8] The default format changed in Databricks Runtime 8.0, from Parquet to Delta ./dist/ab-testing/risk_demo.dbdash:1:0: [unknown-language] Cannot detect language for /home/runner/work/ucx/ucx/dist/ab-testing/risk_demo.dbdash ./dist/ab-testing/1. Introduction.py:72:2: [direct-filesystem-access] The use of direct filesystem references is deprecated: /tmp/german_credit_data.csv ./dist/ab-testing/4. Real time inference.py:10:0: [direct-filesystem-access] The use of direct filesystem references is deprecated: /FileStore/tmp/streaming_ckpnt_risk_demo ./dist/ab-testing/4. Real time inference.py:10:14: [direct-filesystem-access] The use of direct filesystem references is deprecated: /FileStore/tmp/streaming_ckpnt_risk_demo ./dist/ab-testing/4. Real time inference.py:29:5: [default-format-changed-in-dbr8] The default format changed in Databricks Runtime 8.0, from Parquet to Delta ./dist/ab-testing/4. Real time inference.py:214:32: [direct-filesystem-access] The use of direct filesystem references is deprecated: /FileStore/tmp/streaming_ckpnt_risk_demo ./dist/ab-testing/4. Real time inference.py:236:2: [default-format-changed-in-dbr8] The default format changed in Databricks Runtime 8.0, from Parquet to Delta ./dist/ab-testing/4. Real time inference.py:254:27: [default-format-changed-in-dbr8] The default format changed in Databricks Runtime 8.0, from Parquet to Delta ./dist/ab-testing/2. Model training.py:53:22: [direct-filesystem-access] The use of direct filesystem references is deprecated: /Users/Uninferable/german_credit_experiment ./dist/ab-testing/2. Model training.py:66:5: [default-format-changed-in-dbr8] The default format changed in Databricks Runtime 8.0, from Parquet to Delta ./dist/ab-testing/2. Model training.py:197:67: [direct-filesystem-access] The use of direct filesystem references is deprecated: /tmp/pr-curve-model-a.png ./dist/ab-testing/2. Model training.py:229:67: [direct-filesystem-access] The use of direct filesystem references is deprecated: /tmp/pr-curve-model-b.png ./dist/adverse-drug-events/RUNME.py:1:1: [library-install-failed] Unsupported 'pip' command: DBTITLE ./dist/adverse-drug-events/01-ade-extraction.py:23:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/adverse-drug-events/01-ade-extraction.py:23:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. 
Rewrite it using spark ./dist/adverse-drug-events/02-ade-analysis.py:13:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ./dist/adverse-drug-events/02-ade-analysis.py:13:0: [legacy-context-in-shared-clusters] sc is not supported on UC Shared Clusters. Rewrite it using spark ./dist/adverse-drug-events/02-ade-analysis.py:14:0: [jvm-access-in-shared-clusters] Cannot access Spark Driver JVM on UC Shared Clusters ``` --------- Co-authored-by: Eric Vergnaud --- .github/workflows/solacc.yml | 7 ++ src/databricks/labs/ucx/source_code/base.py | 5 +- .../labs/ucx/source_code/notebooks/sources.py | 8 ++- tests/integration/source_code/solacc.py | 65 ++++++++++++++++--- 4 files changed, 73 insertions(+), 12 deletions(-) diff --git a/.github/workflows/solacc.yml b/.github/workflows/solacc.yml index 82c10bb2f2..69995672b0 100644 --- a/.github/workflows/solacc.yml +++ b/.github/workflows/solacc.yml @@ -26,3 +26,10 @@ jobs: - name: Verify linters on solution accelerators run: make solacc + + - name: Upload reports + uses: actions/upload-artifact@v4 + with: + name: report + path: build/ + if-no-files-found: error diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 4e22e74a1a..3457751688 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -96,7 +96,10 @@ def message_relative_to(self, base: Path, *, default: Path | None = None) -> str logger.debug(f'THIS IS A BUG! {advice.code}:{advice.message} has unknown path') if default is not None: path = default - path = path.relative_to(base) + try: + path = path.relative_to(base) + except ValueError: + logger.debug(f'Not a relative path: {path} to base: {base}') # increment start_line because it is 0-based whereas IDEs are usually 1-based return f"./{path.as_posix()}:{advice.start_line+1}:{advice.start_col}: [{advice.code}] {advice.message}" diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index febe224ed3..afc45a04cc 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -163,7 +163,13 @@ def __init__( self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted def lint(self) -> Iterable[Advice]: - yield from self._load_tree_from_notebook(self._notebook, True) + has_failure = False + for advice in self._load_tree_from_notebook(self._notebook, True): + if isinstance(advice, Failure): # happens when a cell is unparseable + has_failure = True + yield advice + if has_failure: + return for cell in self._notebook.cells: if not self._context.is_supported(cell.language.language): continue diff --git a/tests/integration/source_code/solacc.py b/tests/integration/source_code/solacc.py index 5f61b30b40..1fe921f3fb 100644 --- a/tests/integration/source_code/solacc.py +++ b/tests/integration/source_code/solacc.py @@ -1,8 +1,11 @@ +import dataclasses +import json import logging import os import shutil import sys from dataclasses import dataclass, field +from datetime import datetime, timezone from pathlib import Path import requests @@ -20,6 +23,8 @@ this_file = Path(__file__) dist = (this_file / '../../../../dist').resolve().absolute() +build = dist.parent / "build" +build.mkdir(exist_ok=True) def _get_repos_to_clone() -> dict[str, str]: @@ -72,23 +77,41 @@ def _collect_uninferrable_count(advices: list[LocatedAdvice]): def 
_collect_unparseable(advices: list[LocatedAdvice]): - return set(located_advice for located_advice in advices if located_advice.advice.code == 'parse-error') + return list(located_advice for located_advice in advices if located_advice.advice.code == 'parse-error') def _print_advices(advices: list[LocatedAdvice]): - for located_advice in advices: - message = located_advice.message_relative_to(dist.parent) - sys.stdout.write(f"{message}\n") + messages = list( + located_advice.message_relative_to(dist.parent).replace('\n', ' ') + '\n' for located_advice in advices + ) + if os.getenv("CI"): + advices_path = build / "advices.txt" + with advices_path.open("a") as advices_file: + advices_file.writelines(messages) + else: + for message in messages: + sys.stdout.write(message) + + +@dataclass +class _SolaccStats: + run_id: str + name: str + start_timestamp: datetime + end_timestamp: datetime + files_count: int + files_size: int @dataclass class _SolaccContext: unparsed_files_path: Path | None = None - files_to_skip: set[str] | None = None + files_to_skip: set[Path] | None = None total_count = 0 parseable_count = 0 uninferrable_count = 0 missing_imports: dict[str, dict[str, int]] = field(default_factory=dict) + stats: list[_SolaccStats] = field(default_factory=list) @classmethod def create(cls, for_all_dirs: bool): @@ -98,11 +121,11 @@ def create(cls, for_all_dirs: bool): unparsed_path = Path(Path(__file__).parent, "solacc-unparsed.txt") if unparsed_path.exists(): os.remove(unparsed_path) - files_to_skip: set[str] | None = None + files_to_skip: set[Path] | None = None malformed = Path(__file__).parent / "solacc-malformed.txt" if for_all_dirs and malformed.exists(): lines = malformed.read_text(encoding="utf-8").split("\n") - files_to_skip = set(line for line in lines if len(line) > 0 and not line.startswith("#")) + files_to_skip = set(dist / line for line in lines if len(line) > 0 and not line.startswith("#")) return _SolaccContext(unparsed_files_path=unparsed_path, files_to_skip=files_to_skip) def register_missing_import(self, missing_import: str): @@ -153,7 +176,19 @@ def _lint_dir(solacc: _SolaccContext, soldir: Path): files_to_skip = set(solacc.files_to_skip) if solacc.files_to_skip else set() linted_files = set(files_to_skip) # lint solution + start_timestamp = datetime.now(timezone.utc) advices = list(ctx.local_code_linter.lint_path(soldir, linted_files)) + end_timestamp = datetime.now(timezone.utc) + # record stats + stats = _SolaccStats( + run_id=os.getenv("GITHUB_RUN_ATTEMPT") or "local", + start_timestamp=start_timestamp, + end_timestamp=end_timestamp, + name=soldir.name, + files_count=len(all_files), + files_size=sum(path.stat().st_size for path in [soldir / filename for filename in all_files]), + ) + solacc.stats.append(stats) # collect unparseable files unparseables = _collect_unparseable(advices) solacc.parseable_count += len(linted_files) - len(files_to_skip) - len(set(advice.path for advice in unparseables)) @@ -162,7 +197,11 @@ def _lint_dir(solacc: _SolaccContext, soldir: Path): logger.error(f"Error during parsing of {unparseable.path}: {unparseable.advice.message}".replace("\n", " ")) # populate solacc-unparsed.txt with solacc.unparsed_files_path.open(mode="a", encoding="utf-8") as f: - f.write(unparseable.path.relative_to(dist).as_posix()) + try: + path = unparseable.path.relative_to(dist) + except ValueError: + path = unparseable.path + f.write(path.as_posix()) f.write("\n") # collect missing imports for missing_import in _collect_missing_imports(advices): @@ -178,8 +217,8 @@ def 
_lint_dir(solacc: _SolaccContext, soldir: Path): def _lint_repos(clone_urls, sol_to_lint: str | None): solacc = _SolaccContext.create(sol_to_lint is not None) if sol_to_lint: - # don't clone if linting just one file, assumption is we're troubleshooting - _lint_dir(solacc, dist / sol_to_lint) + sol_dir = _clone_repo(clone_urls[sol_to_lint], sol_to_lint) + _lint_dir(solacc, sol_dir) else: names: list[str] = list(clone_urls.keys()) for name in sorted(names, key=str.casefold): @@ -199,6 +238,12 @@ def _lint_repos(clone_urls, sol_to_lint: str | None): f"not computed: {solacc.uninferrable_count}" ) solacc.log_missing_imports() + # log stats + stats_path = build / "stats.json" + with stats_path.open("a") as stats_file: + for stats in solacc.stats: + message = json.dumps(dataclasses.asdict(stats), default=str) + stats_file.writelines([message]) # fail the job if files are unparseable if parseable_pct < 100: sys.exit(1) From 05ca45a7088b2e7fa903fe65ceef69c513dd455c Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 30 Sep 2024 14:08:22 +0200 Subject: [PATCH 2/7] Crawler: fix missing generic types on subclasses. (#2760) ## Changes This PR simply adds in some missing generic annotations for crawler implementations. --- src/databricks/labs/ucx/hive_metastore/table_size.py | 2 +- src/databricks/labs/ucx/hive_metastore/tables.py | 2 +- src/databricks/labs/ucx/hive_metastore/udfs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/table_size.py b/src/databricks/labs/ucx/hive_metastore/table_size.py index aadb7576e5..3e5c61f81c 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_size.py +++ b/src/databricks/labs/ucx/hive_metastore/table_size.py @@ -22,7 +22,7 @@ class TableSize: size_in_bytes: int -class TableSizeCrawler(CrawlerBase): +class TableSizeCrawler(CrawlerBase[TableSize]): def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None): """ Initializes a TablesSizeCrawler instance. diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 7f8ecdfca3..f935aada95 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -476,7 +476,7 @@ def _describe(self, catalog: str, database: str, table: str) -> Table | None: return None -class FasterTableScanCrawler(CrawlerBase): +class FasterTableScanCrawler(CrawlerBase[Table]): """ FasterTableScanCrawler is a specialized version of TablesCrawler that uses spark._jsparkSession to utilize faster scanning with Scala APIs. diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index 6bfd173449..6ee1eefd38 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -33,7 +33,7 @@ def key(self) -> str: return f"{self.catalog}.{self.database}.{self.name}".lower() -class UdfsCrawler(CrawlerBase): +class UdfsCrawler(CrawlerBase[Udf]): def __init__(self, backend: SqlBackend, schema: str, include_databases: list[str] | None = None): """ Initializes a UdfsCrawler instance. 
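The generic annotations added in PATCH 2 above matter because they let a type checker infer the row type each crawler produces. The following is a minimal illustrative sketch of that idea, using hypothetical stand-in classes rather than the real ucx `CrawlerBase` API:

```python
# Illustrative sketch only: CrawlerBase, Udf and snapshot() here are hypothetical
# stand-ins, not the actual ucx implementations.
from dataclasses import dataclass
from typing import Generic, TypeVar

Result = TypeVar("Result")


@dataclass
class Udf:
    catalog: str
    database: str
    name: str


class CrawlerBase(Generic[Result]):
    def snapshot(self) -> list[Result]:
        raise NotImplementedError


class UdfsCrawler(CrawlerBase[Udf]):
    # Declaring CrawlerBase[Udf] (instead of a bare CrawlerBase) binds Result to Udf,
    # so callers of snapshot() see list[Udf] rather than list[Any].
    def snapshot(self) -> list[Udf]:
        return [Udf("hive_metastore", "default", "my_udf")]


udfs = UdfsCrawler().snapshot()  # a type checker now infers list[Udf]
```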
From 6ef9b8dbc14c1feb6697b684348019b34446ac47 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:08:40 +0200 Subject: [PATCH 3/7] Update sqlglot requirement from <25.23,>=25.5.0 to >=25.5.0,<25.25 (#2765) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the requirements on [sqlglot](https://github.com/tobymao/sqlglot) to permit the latest version.
Changelog

Sourced from sqlglot's changelog:

- [v25.24.0] - 2024-09-26: breaking changes, new features, bug fixes, refactors, and chores
- [v25.23.2] - 2024-09-25: chores
- [v25.23.1] - 2024-09-25: chores
- [v25.23.0] - 2024-09-25: breaking changes
- ... (truncated)

Commits

- 73cd334 Cleanup
- 0a5444d Feat: expose a flag to automatically exclude Keep diff nodes (#4168)
- 93cef30 feat(postgres): Support OVERLAY function (#4165)
- 3ab6dfb fix(clickhouse)!: Generalize COLUMNS(...) APPLY (#4161)
- f6d3bdd Chore: update supported dialect count (21 -> 23)
- 2540e50 Refactor: simplify check_deploy job
- 9c17264 fix(hive): Enclose exp.Split with \E (#4163)
- abafa60 docs: update API docs, CHANGELOG.md for v25.23.2 [skip ci]
- eca05d3 Chore: tweak should_deploy_rs script to avoid marking CI as failed
- 0778c29 docs: update API docs, CHANGELOG.md for v25.23.1 [skip ci]
- Additional commits viewable in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

---
Dependabot commands and options

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7afc837663..4812d5bff9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ dependencies = ["databricks-sdk~=0.30", "databricks-labs-lsql>=0.5,<0.13", "databricks-labs-blueprint>=0.8,<0.10", "PyYAML>=6.0.0,<7.0.0", - "sqlglot>=25.5.0,<25.23", + "sqlglot>=25.5.0,<25.25", "astroid>=3.3.1"] [project.optional-dependencies] From b163801d4ff2d9e7d47ff6eabacb808a404a230f Mon Sep 17 00:00:00 2001 From: Andres Garcia Date: Mon, 30 Sep 2024 10:09:18 -0600 Subject: [PATCH 4/7] Added CLI Functionality to export UCX Assessment --- src/databricks/labs/ucx/assessment/export.py | 0 tests/unit/assessment/test_export.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/databricks/labs/ucx/assessment/export.py create mode 100644 tests/unit/assessment/test_export.py diff --git a/src/databricks/labs/ucx/assessment/export.py b/src/databricks/labs/ucx/assessment/export.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/assessment/test_export.py b/tests/unit/assessment/test_export.py new file mode 100644 index 0000000000..e69de29bb2 From 188bc244ee34d7b2444627d88d8d4f8cfc896d59 Mon Sep 17 00:00:00 2001 From: Andres Garcia Date: Mon, 30 Sep 2024 10:10:52 -0600 Subject: [PATCH 5/7] Added CLI Functionality to export UCX Assessment --- labs.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/labs.yml b/labs.yml index c27fd791c7..9e25ef0494 100644 --- a/labs.yml +++ b/labs.yml @@ -330,3 +330,6 @@ commands: description: The file to download - name: run-as-collection description: Run the command for the collection of workspaces with ucx installed. Default is False. + + - name: export-assessment + description: Export UCX results to a specified location From cb89541d1fa97c99264e2f0fa3421328b36f6de8 Mon Sep 17 00:00:00 2001 From: Andres Garcia Date: Mon, 30 Sep 2024 10:22:37 -0600 Subject: [PATCH 6/7] Added CLI Functionality to export UCX Assessment --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.md b/README.md index 787fe09cd0..f1ad7b6525 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`open-remote-config` command](#open-remote-config-command) * [`installations` command](#installations-command) * [`report-account-compatibility` command](#report-account-compatibility-command) + * [`export-assessment` command](#export-assessment-command) * [Metastore related commands](#metastore-related-commands) * [`show-all-metastores` command](#show-all-metastores-command) * [`assign-metastore` command](#assign-metastore-command) @@ -1153,6 +1154,24 @@ databricks labs ucx report-account-compatibility --profile labs-azure-account [[back to top](#databricks-labs-ucx)] +## `export-assessment` command + +```commandline +databricks labs ucx export-assessment +``` +The export-assessment command is used to export UCX assessment results to a specified location. When you run this command, you will be prompted to provide details on the destination path and the type of report you wish to generate. If you do not specify these details, the command will default to exporting the main results to the current directory. The exported file will be named based on the selection made in the format. 
Eg: export_{query_choice}_results.zip +- **Choose a path to save the UCX Assessment results:** + - **Description:** Specify the path where the results should be saved. If not provided, results will be saved in the current directory. + +- **Choose which assessment results to export:** + - **Description:** Select the type of results to export. Options include: + - `azure` + - `estimates` + - `interactive` + - `main` + +[[back to top](#databricks-labs-ucx)] + # Metastore related commands These commands are used to assign a Unity Catalog metastore to a workspace. The metastore assignment is a pre-requisite From 571fc8b456cd772b1bac2d0f178b929fef757910 Mon Sep 17 00:00:00 2001 From: Andres Garcia Date: Mon, 30 Sep 2024 17:13:14 -0600 Subject: [PATCH 7/7] Added CLI Functionality to export UCX Assessment --- src/databricks/labs/ucx/assessment/export.py | 48 +++++++ src/databricks/labs/ucx/cli.py | 8 ++ .../labs/ucx/contexts/application.py | 5 + .../labs/ucx/installer/workflows.py | 127 ++++++++++++++++++ tests/unit/assessment/test_export.py | 41 ++++++ 5 files changed, 229 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/export.py b/src/databricks/labs/ucx/assessment/export.py index e69de29bb2..987132e02e 100644 --- a/src/databricks/labs/ucx/assessment/export.py +++ b/src/databricks/labs/ucx/assessment/export.py @@ -0,0 +1,48 @@ +import logging +from pathlib import Path + +from databricks.labs.blueprint.tui import Prompts + +from databricks.labs.ucx.config import WorkspaceConfig +from databricks.labs.lsql.backends import SqlBackend +from databricks.labs.lsql.dashboards import DashboardMetadata + +logger = logging.getLogger(__name__) + + +class AssessmentExporter: + + def __init__(self, sql_backend: SqlBackend, config: WorkspaceConfig): + self._sql_backend = sql_backend + self._config = config + + def export_results(self, prompts: Prompts): + """Main method to export results to CSV files inside a ZIP archive.""" + project_root = Path(__file__).resolve().parents[3] + queries_path_root = project_root / "labs/ucx/queries/assessment" + + results_directory = Path( + prompts.question( + "Choose a path to save the UCX Assessment results", + default=Path.cwd().as_posix(), + validate=lambda p_: Path(p_).exists(), + ) + ) + + query_choice = prompts.choice( + "Choose which assessment results to export", + [subdir.name for subdir in queries_path_root.iterdir() if subdir.is_dir()], + ) + + export_path = results_directory / f"export_{query_choice}_results.zip" + queries_path = queries_path_root / query_choice + + assessment_results = DashboardMetadata.from_path(queries_path).replace_database( + database=self._config.inventory_database, database_to_replace="inventory" + ) + + logger.info("Exporting assessment results....") + results_path = assessment_results.export_to_zipped_csv(self._sql_backend, export_path) + logger.info(f"Results exported to {results_path}") + + return results_path diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 04f0b22d0b..27b6368bb4 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -779,5 +779,13 @@ def lint_local_code( linter.lint(prompts, None if path is None else Path(path)) +@ucx.command +def export_assessment(w: WorkspaceClient, prompts: Prompts): + """Export the UCX assessment queries to a zip file.""" + ctx = WorkspaceContext(w) + exporter = ctx.assessment_exporter + exporter.export_results(prompts) + + if __name__ == "__main__": ucx() diff --git a/src/databricks/labs/ucx/contexts/application.py 
b/src/databricks/labs/ucx/contexts/application.py index 95944a3d2a..d595eb137f 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -22,6 +22,7 @@ from databricks.labs.ucx.account.workspaces import WorkspaceInfo from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler +from databricks.labs.ucx.assessment.export import AssessmentExporter from databricks.labs.ucx.aws.credentials import CredentialManager from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.hive_metastore import ExternalLocations, Mounts, TablesCrawler @@ -249,6 +250,10 @@ def tables_migrator(self): self.migrate_grants, ) + @cached_property + def assessment_exporter(self): + return AssessmentExporter(self.sql_backend, self.config) + @cached_property def acl_migrator(self): return ACLMigrator( diff --git a/src/databricks/labs/ucx/installer/workflows.py b/src/databricks/labs/ucx/installer/workflows.py index e6148a0da5..b65410c5a2 100644 --- a/src/databricks/labs/ucx/installer/workflows.py +++ b/src/databricks/labs/ucx/installer/workflows.py @@ -112,6 +112,122 @@ f'--parent_run_id=' + dbutils.widgets.get('parent_run_id')) """ +EXPORT_TO_EXCEL_NOTEBOOK = """# Databricks notebook source +# MAGIC %md +# MAGIC ##### Exporter of UCX assessment results +# MAGIC ##### Instructions: +# MAGIC 1. Execute using an all-purpose cluster with Databricks Runtime 14 or higher. +# MAGIC 1. Hit **Run all** button and wait for completion. +# MAGIC 1. Go to the bottom of the notebook and click the Download UCX Results button. +# MAGIC +# MAGIC ##### Important: +# MAGIC Please note that this is only meant to serve as example code. +# MAGIC +# MAGIC Example code developed by **Databricks Shared Technical Services team**. + +# COMMAND ---------- + +# DBTITLE 1,Installing Packages +# MAGIC %pip install {remote_wheel} -qqq +# MAGIC %pip install xlsxwriter -qqq +# MAGIC dbutils.library.restartPython() + +# COMMAND ---------- + +# DBTITLE 1,Libraries Import and Setting UCX +import os +import logging +import threading +import shutil +from pathlib import Path +from threading import Lock +from functools import partial + +import pandas as pd +import xlsxwriter + +from databricks.sdk.config import with_user_agent_extra +from databricks.labs.blueprint.logger import install_logger +from databricks.labs.blueprint.parallel import Threads +from databricks.labs.lsql.dashboards import Dashboards +from databricks.labs.lsql.lakeview.model import Dataset +from databricks.labs.ucx.contexts.workflow_task import RuntimeContext + +# ctx +install_logger() +with_user_agent_extra("cmd", "export-assessment") +named_parameters = dict(config="/Workspace{config_file}") +ctx = RuntimeContext(named_parameters) +lock = Lock() + +# COMMAND ---------- + +# DBTITLE 1,Assessment Export +FILE_NAME = "ucx_assessment_main.xlsx" +TMP_PATH = f"/Workspace{{ctx.installation.install_folder()}}/tmp/" +DOWNLOAD_PATH = "/dbfs/FileStore/excel-export" + + +def _cleanup() -> None: + '''Move the temporary results file to the download path and clean up the temp directory.''' + shutil.move( + os.path.join(TMP_PATH, FILE_NAME), + os.path.join(DOWNLOAD_PATH, FILE_NAME), + ) + shutil.rmtree(TMP_PATH) + + +def _prepare_directories() -> None: + '''Ensure that the necessary directories exist.''' + os.makedirs(TMP_PATH, exist_ok=True) + os.makedirs(DOWNLOAD_PATH, exist_ok=True) + + +def _to_excel(dataset: Dataset, writer: ...) 
-> None: + '''Execute a SQL query and write the result to an Excel sheet.''' + worksheet_name = dataset.display_name[:31] + df = spark.sql(dataset.query).toPandas() + with lock: + df.to_excel(writer, sheet_name=worksheet_name, index=False) + + +def _render_export() -> None: + '''Render an HTML link for downloading the results.''' + html_content = ''' + +

Export Results

Download Results
+ ''' + displayHTML(html_content) + + +def export_results() -> None: + '''Main method to export results to an Excel file.''' + _prepare_directories() + + dashboard_path = ( + Path(ctx.installation.install_folder()) + / "dashboards/[UCX] UCX Assessment (Main).lvdash.json" + ) + dashboard = Dashboards(ctx.workspace_client) + dashboard_datasets = dashboard.get_dashboard(dashboard_path).datasets + try: + target = TMP_PATH + "/ucx_assessment_main.xlsx" + with pd.ExcelWriter(target, engine="xlsxwriter") as writer: + tasks = [] + for dataset in dashboard_datasets: + tasks.append(partial(_to_excel, dataset, writer)) + Threads.strict("exporting", tasks) + _cleanup() + _render_export() + except Exception as e: + print(f"Error exporting results ", e) + +# COMMAND ---------- + +# DBTITLE 1,Data Export +export_results() +""" + class DeployedWorkflows: def __init__(self, ws: WorkspaceClient, install_state: InstallState): @@ -481,6 +597,7 @@ def create_jobs(self) -> None: self.remove_jobs(keep=desired_workflows) self._install_state.save() self._create_debug(remote_wheels) + self._create_export(remote_wheels) self._create_readme() def remove_jobs(self, *, keep: set[str] | None = None) -> None: @@ -819,6 +936,16 @@ def _create_debug(self, remote_wheels: list[str]): ).encode("utf8") self._installation.upload('DEBUG.py', content) + def _create_export(self, remote_wheels: list[str]): + remote_wheels_str = " ".join(remote_wheels) + content = EXPORT_TO_EXCEL_NOTEBOOK.format( + remote_wheel=remote_wheels_str, + config_file=self._config_file, + workspace_host=self._ws.config.host, + workspace_id=self._ws.get_workspace_id(), + ).encode("utf8") + self._installation.upload('EXPORT_ASSESSMENT_TO_EXCEL.py', content) + class MaxedStreamHandler(logging.StreamHandler): diff --git a/tests/unit/assessment/test_export.py b/tests/unit/assessment/test_export.py index e69de29bb2..c67355f367 100644 --- a/tests/unit/assessment/test_export.py +++ b/tests/unit/assessment/test_export.py @@ -0,0 +1,41 @@ +from databricks.labs.ucx.config import WorkspaceConfig +from databricks.labs.ucx.assessment.export import AssessmentExporter +from databricks.labs.lsql.backends import MockBackend +from databricks.labs.blueprint.tui import MockPrompts +from databricks.labs.lsql.core import Row + + +def test_export(tmp_path): + """Test the export_results method of the AssessmentExporter class.""" + query = { + "SELECT\n one\nFROM ucx.external_locations": [ + Row(location="s3://bucket1/folder1", table_count=1), + Row(location="abfss://container1@storage1.dfs.core.windows.net/folder1", table_count=1), + Row(location="gcp://folder1", table_count=2), + ] + } + + # Setup workspace configuration + config = WorkspaceConfig(inventory_database="ucx") + + # Prepare temporary paths and files + export_path = tmp_path / "export" + export_path.mkdir(parents=True, exist_ok=True) + + # Mock backend and prompts + mock_backend = MockBackend(rows=query) + query_choice = {"assessment_name": "main", "option": 3} + mock_prompts = MockPrompts( + { + "Choose a path to save the UCX Assessment results": export_path.as_posix(), + "Choose which assessment results to export": query_choice["option"], + } + ) + + # Execute export process + export = AssessmentExporter(mock_backend, config) + exported = export.export_results(mock_prompts) + + # Assertion based on the query_choice + expected_file_name = f"export_{query_choice['assessment_name']}_results.zip" # Adjusted filename + assert exported == export_path / expected_file_name