From 0f838954aaffb1e3f5975c2f6f76f3234bf53993 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 19 Sep 2024 11:00:01 +0200 Subject: [PATCH 01/41] add tasks for assessing source code --- src/databricks/labs/ucx/assessment/workflows.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/workflows.py b/src/databricks/labs/ucx/assessment/workflows.py index 5dfbcd1f15..1c37f53c14 100644 --- a/src/databricks/labs/ucx/assessment/workflows.py +++ b/src/databricks/labs/ucx/assessment/workflows.py @@ -182,6 +182,14 @@ def crawl_groups(self, ctx: RuntimeContext): """Scans all groups for the local group migration scope""" ctx.group_manager.snapshot() + @job_task + def assess_dashboards(self, ctx: RuntimeContext): + ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) + + @job_task + def assess_workflows(self, ctx: RuntimeContext): + ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) + class Failing(Workflow): def __init__(self): From d2bbabe5e98058d4ac3c6e06d56358bd9b8d2b01 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 19 Sep 2024 11:46:33 +0200 Subject: [PATCH 02/41] drop experimental workflow --- .../labs/ucx/assessment/workflows.py | 4 ++++ src/databricks/labs/ucx/runtime.py | 2 -- .../labs/ucx/source_code/workflows.py | 19 ------------------- 3 files changed, 4 insertions(+), 21 deletions(-) delete mode 100644 src/databricks/labs/ucx/source_code/workflows.py diff --git a/src/databricks/labs/ucx/assessment/workflows.py b/src/databricks/labs/ucx/assessment/workflows.py index 1c37f53c14..801ddc9322 100644 --- a/src/databricks/labs/ucx/assessment/workflows.py +++ b/src/databricks/labs/ucx/assessment/workflows.py @@ -184,10 +184,14 @@ def crawl_groups(self, ctx: RuntimeContext): @job_task def assess_dashboards(self, ctx: RuntimeContext): + """Scans all dashboards for migration issues in SQL code of embedded widgets. + Also stores direct filesystem accesses for display in the migration dashboard.""" ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) @job_task def assess_workflows(self, ctx: RuntimeContext): + """Scans all jobs for migration issues in notebooks. 
+ Also stores direct filesystem accesses for display in the migration dashboard.""" ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index f0c534ce60..88e12e1928 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -19,7 +19,6 @@ MigrateExternalTablesCTAS, ) from databricks.labs.ucx.recon.workflows import MigrationRecon -from databricks.labs.ucx.source_code.workflows import ExperimentalWorkflowLinter from databricks.labs.ucx.workspace_access.workflows import ( GroupMigration, PermissionsMigrationAPI, @@ -56,7 +55,6 @@ def all(cls): ScanTablesInMounts(), MigrateTablesInMounts(), PermissionsMigrationAPI(), - ExperimentalWorkflowLinter(), MigrationRecon(), Failing(), ] diff --git a/src/databricks/labs/ucx/source_code/workflows.py b/src/databricks/labs/ucx/source_code/workflows.py deleted file mode 100644 index 9fd3046070..0000000000 --- a/src/databricks/labs/ucx/source_code/workflows.py +++ /dev/null @@ -1,19 +0,0 @@ -from databricks.labs.ucx.contexts.workflow_task import RuntimeContext -from databricks.labs.ucx.framework.tasks import Workflow, job_task - - -class ExperimentalWorkflowLinter(Workflow): - def __init__(self): - super().__init__('experimental-workflow-linter') - - @job_task(job_cluster="table_migration") - def lint_all_workflows(self, ctx: RuntimeContext): - """[EXPERIMENTAL] Analyses all jobs for source code compatibility problems. This is an experimental feature, - that is not yet fully supported.""" - ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) - - @job_task(job_cluster="table_migration") - def lint_all_queries(self, ctx: RuntimeContext): - """[EXPERIMENTAL] Analyses all jobs for source code compatibility problems. This is an experimental feature, - that is not yet fully supported.""" - ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) From 4c237477695823f6dab7d1487db99134a9cc3970 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 19 Sep 2024 16:01:29 +0200 Subject: [PATCH 03/41] Revert "drop experimental workflow" This reverts commit d2bbabe5e98058d4ac3c6e06d56358bd9b8d2b01. --- .../labs/ucx/assessment/workflows.py | 4 ---- src/databricks/labs/ucx/runtime.py | 2 ++ .../labs/ucx/source_code/workflows.py | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/workflows.py diff --git a/src/databricks/labs/ucx/assessment/workflows.py b/src/databricks/labs/ucx/assessment/workflows.py index 801ddc9322..1c37f53c14 100644 --- a/src/databricks/labs/ucx/assessment/workflows.py +++ b/src/databricks/labs/ucx/assessment/workflows.py @@ -184,14 +184,10 @@ def crawl_groups(self, ctx: RuntimeContext): @job_task def assess_dashboards(self, ctx: RuntimeContext): - """Scans all dashboards for migration issues in SQL code of embedded widgets. - Also stores direct filesystem accesses for display in the migration dashboard.""" ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) @job_task def assess_workflows(self, ctx: RuntimeContext): - """Scans all jobs for migration issues in notebooks. 
- Also stores direct filesystem accesses for display in the migration dashboard.""" ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 88e12e1928..f0c534ce60 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -19,6 +19,7 @@ MigrateExternalTablesCTAS, ) from databricks.labs.ucx.recon.workflows import MigrationRecon +from databricks.labs.ucx.source_code.workflows import ExperimentalWorkflowLinter from databricks.labs.ucx.workspace_access.workflows import ( GroupMigration, PermissionsMigrationAPI, @@ -55,6 +56,7 @@ def all(cls): ScanTablesInMounts(), MigrateTablesInMounts(), PermissionsMigrationAPI(), + ExperimentalWorkflowLinter(), MigrationRecon(), Failing(), ] diff --git a/src/databricks/labs/ucx/source_code/workflows.py b/src/databricks/labs/ucx/source_code/workflows.py new file mode 100644 index 0000000000..9fd3046070 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/workflows.py @@ -0,0 +1,19 @@ +from databricks.labs.ucx.contexts.workflow_task import RuntimeContext +from databricks.labs.ucx.framework.tasks import Workflow, job_task + + +class ExperimentalWorkflowLinter(Workflow): + def __init__(self): + super().__init__('experimental-workflow-linter') + + @job_task(job_cluster="table_migration") + def lint_all_workflows(self, ctx: RuntimeContext): + """[EXPERIMENTAL] Analyses all jobs for source code compatibility problems. This is an experimental feature, + that is not yet fully supported.""" + ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) + + @job_task(job_cluster="table_migration") + def lint_all_queries(self, ctx: RuntimeContext): + """[EXPERIMENTAL] Analyses all jobs for source code compatibility problems. This is an experimental feature, + that is not yet fully supported.""" + ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) From 2977a36089418c4e873337b28435ce6ad0bfe1c9 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 19 Sep 2024 16:05:15 +0200 Subject: [PATCH 04/41] restore comments --- src/databricks/labs/ucx/assessment/workflows.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/workflows.py b/src/databricks/labs/ucx/assessment/workflows.py index 1c37f53c14..801ddc9322 100644 --- a/src/databricks/labs/ucx/assessment/workflows.py +++ b/src/databricks/labs/ucx/assessment/workflows.py @@ -184,10 +184,14 @@ def crawl_groups(self, ctx: RuntimeContext): @job_task def assess_dashboards(self, ctx: RuntimeContext): + """Scans all dashboards for migration issues in SQL code of embedded widgets. + Also stores direct filesystem accesses for display in the migration dashboard.""" ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) @job_task def assess_workflows(self, ctx: RuntimeContext): + """Scans all jobs for migration issues in notebooks. 
+        Also stores direct filesystem accesses for display in the migration dashboard."""
         ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
 
 
From fd07db93d14bf62d7f89e5b4e226ec2a54553af9 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Thu, 19 Sep 2024 19:17:47 +0200
Subject: [PATCH 05/41] fix tests failing due to long-running tasks

---
 tests/integration/assessment/test_ext_hms.py   | 2 +-
 tests/integration/assessment/test_workflows.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py
index 7dc79f4621..b6c04213ee 100644
--- a/tests/integration/assessment/test_ext_hms.py
+++ b/tests/integration/assessment/test_ext_hms.py
@@ -39,7 +39,7 @@ def test_running_real_assessment_job_ext_hms(
 
     # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available
     # via the integration pool). Allow some margin to reduce spurious failures.
-    ext_hms_ctx.deployed_workflows.run_workflow("assessment", max_wait=dt.timedelta(minutes=25))
+    ext_hms_ctx.deployed_workflows.run_workflow("assessment", max_wait=dt.timedelta(minutes=30))
 
     # assert the workflow is successful. the tasks on sql warehouse will fail so skip checking them
     assert ext_hms_ctx.deployed_workflows.validate_step("assessment")
diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py
index 1f433bd52d..f43bdc88b2 100644
--- a/tests/integration/assessment/test_workflows.py
+++ b/tests/integration/assessment/test_workflows.py
@@ -5,7 +5,7 @@
 from databricks.sdk.service.iam import PermissionLevel
 
 
-@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8))
+@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=25))
 def test_running_real_assessment_job(ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions):

From 883c37ab9cd09b6701d5651b2b700e057cd6fd40 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Fri, 20 Sep 2024 13:46:17 +0200
Subject: [PATCH 06/41] add workflow linting widget to assessment dashboard

---
 .../main/35_0_code_compatibility_problems.md  |  8 ++++++++
 ...de_compatibility_problems_in_workflows.sql | 33 +++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md
 create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql

diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md b/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md
new file mode 100644
index 0000000000..851d95a766
--- /dev/null
+++ b/src/databricks/labs/ucx/queries/assessment/main/35_0_code_compatibility_problems.md
@@ -0,0 +1,8 @@
+## Code compatibility problems
+
+The tables below assist with verifying whether workflows and dashboards are Unity Catalog compatible. They can be filtered on the path,
+problem code and workflow name.
+Each row:
+- Points to a problem detected in the code using the code path, query or workflow & task reference and start/end line & column;
+- Explains the problem with a human-readable message and a code.
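For context on the rows this widget renders: the `workflow_problems` table is populated from `JobProblem` records. The sketch below mirrors the test utility introduced later in this series ([PATCH 09/41]); the sample values are placeholders, not real assessment output.

```python
from databricks.labs.ucx.source_code.jobs import JobProblem

def populate_sample_workflow_problem(installation_ctx) -> None:
    # One sample row; each field maps onto a column the widget above selects.
    problem = JobProblem(
        job_id=12345,                   # hidden `workflow_id` column, used to build job links
        job_name="Peter the Job",       # shown as `workflow_name`
        task_key="23456",               # linked to /jobs/<workflow_id>/tasks/<task_key>
        path="parent/child.py",         # source file in which the problem was found
        code="sql-parse-error",         # machine-readable problem code
        message="Could not parse SQL",  # human-readable explanation
        start_line=1234,
        start_col=22,
        end_line=1234,
        end_col=32,
    )
    installation_ctx.sql_backend.save_table(
        f"{installation_ctx.inventory_database}.workflow_problems",
        [problem],
        JobProblem,
        mode="overwrite",  # replaces any previously stored problems
    )
```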
+ diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql new file mode 100644 index 0000000000..54f161e677 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql @@ -0,0 +1,33 @@ +/* +--title 'Workflow migration problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "/#workspace/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"}, + {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, + {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, + {"fieldName": "workflow_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_name"}, + {"fieldName": "task_key", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/jobs/{{ workflow_id }}/tasks/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "task_key"}, + {"fieldName": "start_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_line"}, + {"fieldName": "start_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_col"}, + {"fieldName": "end_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_line"}, + {"fieldName": "end_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_col"} + ]}, + "invisibleColumns": [ + {"name": "workflow_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_id"} + ] + }}' +*/ +SELECT + path, + code, + message, + job_id AS workflow_id, + job_name AS workflow_name, + task_key, + start_line, + start_col, + end_line, + end_col +FROM inventory.workflow_problems \ No newline at end of file From 15fc47cfc66290a167ae1458d39b9da9171debf0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 20 Sep 2024 13:56:29 +0200 Subject: [PATCH 07/41] add dashboards linting widget to assessment dashboard --- ...e_compatibility_problems_in_dashboards.sql | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql new file mode 100644 index 0000000000..1e1374b45b --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql @@ -0,0 +1,29 @@ +/* +--title 'Dashboard migration problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": 
"code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, + {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, + {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_name"}, + {"fieldName": "query_name", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/queries/{{ query_id }}/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_name"} + ]}, + "invisibleColumns": [ + {"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"}, + {"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"}, + {"name": "query_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_parent"}, + {"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"} + ] + }}' +*/ +SELECT + dashboard_id, + dashboard_parent, + dashboard_name, + query_id, + query_parent, + query_name, + code, + message +FROM inventory.query_problems From 19fe54c42987ce18ab795962a748208c4f3da5a5 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 20 Sep 2024 13:56:41 +0200 Subject: [PATCH 08/41] update readme --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 26f9e31c08..598af8c5db 100644 --- a/README.md +++ b/README.md @@ -377,6 +377,9 @@ which can be used for further analysis and decision-making through the [assessme 9. `assess_pipelines`: This task scans through all the Pipelines and identifies those pipelines that have Azure Service Principals embedded in their configurations. A list of all the pipelines with matching configurations is stored in the `$inventory.pipelines` table. 10. `assess_azure_service_principals`: This task scans through all the clusters configurations, cluster policies, job cluster configurations, Pipeline configurations, and Warehouse configuration and identifies all the Azure Service Principals who have been given access to the Azure storage accounts via spark configurations referred in those entities. The list of all the Azure Service Principals referred in those configurations is saved in the `$inventory.azure_service_principals` table. 11. `assess_global_init_scripts`: This task scans through all the global init scripts and identifies if there is an Azure Service Principal who has been given access to the Azure storage accounts via spark configurations referred in those scripts. +12. 
`assess_dashboards`: This task scans through all the dashboards and analyzes embedded queries for migration problems. It also collects direct filesystem access patterns that require attention.
+13. `assess_workflows`: This task scans through all the jobs and tasks and analyzes notebooks and files for migration problems. It also collects direct filesystem access patterns that require attention.
+
 
 ![report](docs/assessment-report.png)
 
@@ -692,11 +695,16 @@ in the Migration dashboard.
 
 > Please note that this is an experimental workflow.
 
-The `experimental-workflow-linter` workflow lints accessible code belonging to all workflows/jobs present in the
-workspace. The linting emits problems indicating what to resolve for making the code Unity Catalog compatible.
+The `experimental-workflow-linter` workflow lints accessible code from two sources:
+ - all workflows/jobs present in the workspace
+ - all dashboards/queries present in the workspace
+The linting emits problems indicating what to resolve to make the code Unity Catalog compatible.
+The linting also locates direct filesystem accesses that need to be migrated.
 
-Once the workflow completes, the output will be stored in `$inventory_database.workflow_problems` table, and displayed
-in the Migration dashboard.
+Once the workflow completes:
+ - problems are stored in the `$inventory_database.workflow_problems`/`$inventory_database.query_problems` tables
+ - direct filesystem accesses are stored in the `$inventory_database.directfs_in_paths`/`$inventory_database.directfs_in_queries` tables
+ - all of the above are displayed in the Migration dashboard.
 
 ![code compatibility problems](docs/code_compatibility_problems.png)
 
From 69497b176f566eb6225592fe5b8636dbff199747 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Fri, 20 Sep 2024 13:57:05 +0200
Subject: [PATCH 09/41] add test utility for developing dashboards

---
 .../integration/assessment/test_dashboards.py | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 tests/integration/assessment/test_dashboards.py

diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py
new file mode 100644
index 0000000000..e92979a6e2
--- /dev/null
+++ b/tests/integration/assessment/test_dashboards.py
@@ -0,0 +1,69 @@
+import pytest
+
+from databricks.labs.ucx.source_code.jobs import JobProblem
+from databricks.sdk.service.iam import PermissionLevel
+
+from databricks.labs.ucx.source_code.queries import QueryProblem
+
+
+def _populate_workflow_problems(installation_ctx):
+    job_problems = [
+        JobProblem(
+            job_id=12345,
+            job_name="Peter the Job",
+            task_key="23456",
+            path="parent/child.py",
+            code="sql-parse-error",
+            message="Could not parse SQL",
+            start_line=1234,
+            start_col=22,
+            end_line=1234,
+            end_col=32,
+        )
+    ]
+    installation_ctx.sql_backend.save_table(
+        f'{installation_ctx.inventory_database}.workflow_problems',
+        job_problems,
+        JobProblem,
+        mode='overwrite',
+    )
+
+
+def _populate_dashboard_problems(installation_ctx):
+    query_problems = [
+        QueryProblem(
+            dashboard_id = "12345",
+            dashboard_parent = "dashbards/parent",
+            dashboard_name = "my_dashboard",
+            query_id="23456",
+            query_parent= "queries/parent",
+            query_name="my_query",
+            code="sql-parse-error",
+            message="Could not parse SQL"
+        )
+    ]
+    installation_ctx.sql_backend.save_table(
+        f'{installation_ctx.inventory_database}.query_problems',
+        query_problems,
+        QueryProblem,
+        mode='overwrite',
+    )
+
+
+@pytest.mark.skip
+def
test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy, make_cluster_policy_permissions): + """the purpose of this test is to prepopulate data used by the dashboard without running an assessment""" + ucx_group, _ = installation_ctx.make_ucx_group() + cluster_policy = make_cluster_policy() + make_cluster_policy_permissions( + object_id=cluster_policy.policy_id, + permission_level=PermissionLevel.CAN_USE, + group_name=ucx_group.display_name, + ) + installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] + installation_ctx.workspace_installation.run() + # populate data + _populate_workflow_problems(installation_ctx) + _populate_dashboard_problems(installation_ctx) + # put a breakpoint here + print("Put a breakpoint here! Then go check the dashboard in your workspace ;-)") From 496688c11ad7fc39ab388c7943f77a185a313a73 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 20 Sep 2024 16:42:27 +0200 Subject: [PATCH 10/41] add directfs access widget to assessment dashboard - no link --- .../36_0_direct_filesystem_access_problems.md | 10 +++++ .../main/36_1_direct_filesystem_accesses.sql | 44 +++++++++++++++++++ .../integration/assessment/test_dashboards.py | 39 +++++++++++++++- 3 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md create mode 100644 src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md b/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md new file mode 100644 index 0000000000..fef9e8b7d6 --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md @@ -0,0 +1,10 @@ +## Direct filesystem access problems + +The table below assists with verifying if workflows and dashboards require direct filesystem access. +As a reminder, `dbfs:/` is not supported in Unity Catalog, and more generally direct filesystem access is discouraged. +Rather, data should be accessed via Unity tables. + +Each row: +- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference and start/end line & column; +- Provides the _lineage_ i.e. which `workflow -> task -> notebook...` execution sequence leads to that access. 
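For context on how lineage is stored: each row behind this widget carries a `source_lineage` list of `LineageAtom`s, ordered from the outermost container down to the code performing the access. The sketch below mirrors the test utility that follows in this series; the sample values are placeholders, not real assessment output.

```python
from datetime import datetime, timedelta, timezone

from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, LineageAtom

def sample_workflow_dfsa() -> DirectFsAccess:
    # A write access to a direct filesystem path, reached via workflow -> task -> notebook.
    return DirectFsAccess(
        path="dbfs://some_folder/some_file.csv",  # the directly accessed path
        is_read=False,
        is_write=True,
        source_id="xyz.py",  # the source file containing the access
        source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2),
        source_lineage=[
            LineageAtom(object_type="WORKFLOW", object_id="my_workflow"),
            LineageAtom(object_type="TASK", object_id="my_workflow/my_task"),
            LineageAtom(object_type="NOTEBOOK", object_id="my_notebook"),
        ],
        assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5),
        assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2),
    )
```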
+ diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql new file mode 100644 index 0000000000..50960990ee --- /dev/null +++ b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql @@ -0,0 +1,44 @@ +/* +--title 'Direct filesystem access problems' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "path", "title": "path", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "is_read", "title": "is_read", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]}, + {"fieldName": "is_write", "title": "is_write", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]}, + {"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, + {"fieldName": "timestamp", "title": "last_modified", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]}, + {"fieldName": "assessment_start", "title": "assessment_start", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}, + {"fieldName": "assessment_end", "title": "assessment_end", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]} + ]}, + "invisibleColumns": [ + {"fieldName": "lineage_type", "title": "lineage_type", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]} + ] + }}' +*/ +SELECT + path, + is_read, + is_write, + source_id as source, + source_timestamp as `timestamp`, + concat(lineage.object_type, ': ', lineage.object_id) as lineage, + lineage.object_type as lineage_type, + lineage.object_id as lineage_id, + lineage.other as lineage_data, + assessment_start, + assessment_end +from (SELECT + path, + is_read, + is_write, + source_id, + source_timestamp, + explode(source_lineage) as lineage, + assessment_start_timestamp as assessment_start, + assessment_end_timestamp as assessment_end +FROM inventory.directfs) diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index e92979a6e2..9d4110dd81 100644 --- a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -1,5 +1,8 @@ import pytest +from datetime import datetime, timezone, timedelta + +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, LineageAtom from databricks.labs.ucx.source_code.jobs import JobProblem from databricks.sdk.service.iam import PermissionLevel @@ -50,7 +53,37 @@ def _populate_dashboard_problems(installation_ctx): ) -@pytest.mark.skip +def _populate_directfs_problems(installation_ctx): + dfsas = [ + DirectFsAccess(path="some_path", + is_read=False, + is_write=True, + source_id="xyz.py", + 
source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), + source_lineage=[ LineageAtom(object_type="WORKFLOW", object_id="my workflow"), + LineageAtom(object_type="TASK", object_id="my task"), + LineageAtom(object_type="PATH", object_id="my notebook")], + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), + assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0) + ) + ] + installation_ctx.directfs_access_crawler_for_paths.dump_all(dfsas) + dfsas = [ + DirectFsAccess(path="some_path", + is_read=False, + is_write=True, + source_id="xyz.py", + source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), + source_lineage=[LineageAtom(object_type="DASHBOARD", object_id="my dashboard"), + LineageAtom(object_type="QUERY", object_id="my query")], + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), + assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0) + ) + ] + installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas) + + +# @pytest.mark.skip def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy, make_cluster_policy_permissions): """the purpose of this test is to prepopulate data used by the dashboard without running an assessment""" ucx_group, _ = installation_ctx.make_ucx_group() @@ -65,5 +98,7 @@ def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy, # populate data _populate_workflow_problems(installation_ctx) _populate_dashboard_problems(installation_ctx) + _populate_directfs_problems(installation_ctx) + print(f"\nInventory database is {installation_ctx.inventory_database}\n") # put a breakpoint here - print("Put a breakpoint here! Then go check the dashboard in your workspace ;-)") + print("Put a breakpoint here! 
Then go check the dashboard in your workspace ;-)\n") From dc97a4ab904a357df23bd2cdb10c4fa22c427515 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 20 Sep 2024 16:46:06 +0200 Subject: [PATCH 11/41] reduce test timeout --- tests/integration/assessment/test_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index f43bdc88b2..0c10acf92d 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -5,7 +5,7 @@ from databricks.sdk.service.iam import PermissionLevel -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=25)) +@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=15)) def test_running_real_assessment_job(ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions): ws_group_a, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() From 11e88668597466bbd3e01c7fa5985de9d9e56b90 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 20 Sep 2024 17:55:20 +0200 Subject: [PATCH 12/41] increase timeout based on observed durations in CI --- tests/integration/assessment/test_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index 0c10acf92d..d29dd77b47 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -5,7 +5,7 @@ from databricks.sdk.service.iam import PermissionLevel -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=15)) +@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=20)) def test_running_real_assessment_job(ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions): ws_group_a, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() From 4e9cd43c643e1ae7a1d33dc1dc896c805bc598f5 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 20 Sep 2024 18:05:20 +0200 Subject: [PATCH 13/41] enrich and specialize dfsa source_id such that dashboard links can be generated --- src/databricks/labs/ucx/source_code/graph.py | 3 ++- src/databricks/labs/ucx/source_code/jobs.py | 2 +- src/databricks/labs/ucx/source_code/queries.py | 8 ++++---- tests/integration/source_code/test_queries.py | 5 +++-- tests/unit/source_code/test_queries.py | 2 +- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 347690f067..a722e9ef3c 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -337,7 +337,8 @@ def __repr__(self): @property def lineage(self) -> list[LineageAtom]: - return [LineageAtom(object_type="PATH", object_id=str(self.path))] + object_type = "NOTEBOOK" if is_a_notebook(self.path) else "FILE" + return [LineageAtom(object_type=object_type, object_id=str(self.path))] class SourceContainer(abc.ABC): diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b5a4e47991..37933bb9ac 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -87,7 +87,7 @@ def __repr__(self): def lineage(self) -> list[LineageAtom]: job_name = (None if self._job.settings is None else self._job.settings.name) or "unknown job" job_lineage = 
LineageAtom("JOB", str(self._job.job_id), {"name": job_name}) - task_lineage = LineageAtom("TASK", self._task.task_key) + task_lineage = LineageAtom("TASK", f"{self._job.job_id}/{self._task.task_key}") return [job_lineage, task_lineage] diff --git a/src/databricks/labs/ucx/source_code/queries.py b/src/databricks/labs/ucx/source_code/queries.py index cbcac03072..670ce65145 100644 --- a/src/databricks/labs/ucx/source_code/queries.py +++ b/src/databricks/labs/ucx/source_code/queries.py @@ -68,7 +68,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): linted_queries.add(query.id) problems = self.lint_query(query) all_problems.extend(problems) - dfsas = self.collect_dfsas_from_query(query) + dfsas = self.collect_dfsas_from_query("no-dashboard-id", query) all_dfsas.extend(dfsas) # dump problems logger.info(f"Saving {len(all_problems)} linting problems...") @@ -113,7 +113,7 @@ def _lint_and_collect_from_dashboard( dashboard_name=dashboard_name, ) ) - dfsas = self.collect_dfsas_from_query(query) + dfsas = self.collect_dfsas_from_query(dashboard_id, query) for dfsa in dfsas: atom = LineageAtom( object_type="DASHBOARD", @@ -145,11 +145,11 @@ def lint_query(self, query: LegacyQuery) -> Iterable[QueryProblem]: ) @classmethod - def collect_dfsas_from_query(cls, query: LegacyQuery) -> Iterable[DirectFsAccess]: + def collect_dfsas_from_query(cls, dashboard_id: str, query: LegacyQuery) -> Iterable[DirectFsAccess]: if query.query is None: return linter = DirectFsAccessSqlLinter() - source_id = query.id or "no id" + source_id = f"{dashboard_id}/{query.id}" source_name = query.name or "" source_timestamp = cls._read_timestamp(query.updated_at) source_lineage = [LineageAtom(object_type="QUERY", object_id=source_id, other={"name": source_name})] diff --git a/tests/integration/source_code/test_queries.py b/tests/integration/source_code/test_queries.py index 5adcfca0f3..24fe3262c1 100644 --- a/tests/integration/source_code/test_queries.py +++ b/tests/integration/source_code/test_queries.py @@ -13,7 +13,8 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend assert len(problems) == 1 crawler = DirectFsAccessCrawler.for_queries(sql_backend, simple_ctx.inventory_database) all_dfsas = crawler.snapshot() - dfsas = [dfsa for dfsa in all_dfsas if dfsa.source_id == query.id] + source_id = f"{_dashboard.id}/{query.id}" + dfsas = [dfsa for dfsa in all_dfsas if dfsa.source_id == source_id] assert len(dfsas) == 1 dfsa = dfsas[0] assert len(dfsa.source_lineage) == 2 @@ -25,6 +26,6 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend assert lineage.other.get("name", None) == _dashboard.name lineage = dfsa.source_lineage[1] assert lineage.object_type == "QUERY" - assert lineage.object_id == query.id + assert lineage.object_id == source_id assert lineage.other assert lineage.other.get("name", None) == query.name diff --git a/tests/unit/source_code/test_queries.py b/tests/unit/source_code/test_queries.py index 8910b3ac09..0add56b0e0 100644 --- a/tests/unit/source_code/test_queries.py +++ b/tests/unit/source_code/test_queries.py @@ -27,7 +27,7 @@ def test_query_linter_collects_dfsas_from_queries(name, query, dfsa_paths, is_re crawlers = create_autospec(DirectFsAccessCrawler) query = LegacyQuery.from_dict({"parent": "workspace", "name": name, "query": query}) linter = QueryLinter(ws, migration_index, crawlers) - dfsas = linter.collect_dfsas_from_query(query) + dfsas = linter.collect_dfsas_from_query("no-dashboard-id", query) 
ws.assert_not_called()
     crawlers.assert_not_called()
     assert set(dfsa.path for dfsa in dfsas) == set(dfsa_paths)

From 5565487055456887d9b8d1b1e812869158723d2b Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Fri, 20 Sep 2024 18:25:15 +0200
Subject: [PATCH 14/41] try out links

---
 .../main/36_1_direct_filesystem_accesses.sql  | 13 +++-
 .../integration/assessment/test_dashboards.py | 64 ++++++++++---------
 2 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql
index 50960990ee..fd58641ba9 100644
--- a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql
+++ b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql
@@ -9,14 +9,15 @@
   {"fieldName": "is_write", "title": "is_write", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]},
   {"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]},
   {"fieldName": "timestamp", "title": "last_modified", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]},
-  {"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+  {"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ lineage_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]},
   {"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]},
   {"fieldName": "assessment_start", "title": "assessment_start", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]},
   {"fieldName": "assessment_end", "title": "assessment_end", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}
   ]},
   "invisibleColumns": [
   {"fieldName": "lineage_type", "title": "lineage_type", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
-  {"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}
+  {"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
+  {"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}
   ]
   }}'
 */
 SELECT
   path,
   is_read,
   is_write,
   source_id as source,
   source_timestamp as `timestamp`,
   concat(lineage.object_type, ': ', lineage.object_id) as lineage,
   lineage.object_type as lineage_type,
   lineage.object_id as lineage_id,
+  case
+    when lineage.object_type = 'WORKFLOW' then concat('/jobs/', lineage.object_id)
+    when lineage.object_type = 'TASK' then concat('/jobs/', split_part(lineage.object_id, '/', 1), '/tasks/', split_part(lineage.object_id, '/', 2))
+    when lineage.object_type = 'NOTEBOOK' then concat('/#notebook/', lineage.object_id)
+    when lineage.object_type = 'FILE' then concat('/#files/', lineage.object_id)
+    when lineage.object_type = 'DASHBOARD' then concat('/sql/dashboardsv3/', lineage.object_id)
+    when lineage.object_type = 'QUERY' then
concat('/sql/dashboardsv3/', split_part(lineage.object_id, '/', 1), '/datasets/', split_part(lineage.object_id, '/', 2)) + end as lineage_link, lineage.other as lineage_data, assessment_start, assessment_end diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index 9d4110dd81..1f4dfa3cd3 100644 --- a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -1,5 +1,3 @@ -import pytest - from datetime import datetime, timezone, timedelta from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, LineageAtom @@ -35,14 +33,14 @@ def _populate_workflow_problems(installation_ctx): def _populate_dashboard_problems(installation_ctx): query_problems = [ QueryProblem( - dashboard_id = "12345", - dashboard_parent = "dashbards/parent", - dashboard_name = "my_dashboard", + dashboard_id="12345", + dashboard_parent="dashbards/parent", + dashboard_name="my_dashboard", query_id="23456", - query_parent= "queries/parent", + query_parent="queries/parent", query_name="my_query", code="sql-parse-error", - message="Could not parse SQL" + message="Could not parse SQL", ) ] installation_ctx.sql_backend.save_table( @@ -55,30 +53,38 @@ def _populate_dashboard_problems(installation_ctx): def _populate_directfs_problems(installation_ctx): dfsas = [ - DirectFsAccess(path="some_path", - is_read=False, - is_write=True, - source_id="xyz.py", - source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), - source_lineage=[ LineageAtom(object_type="WORKFLOW", object_id="my workflow"), - LineageAtom(object_type="TASK", object_id="my task"), - LineageAtom(object_type="PATH", object_id="my notebook")], - assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), - assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0) - ) - ] + DirectFsAccess( + path="some_path", + is_read=False, + is_write=True, + source_id="xyz.py", + source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow"), + LineageAtom(object_type="TASK", object_id="my_workflow/my_task"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook"), + LineageAtom(object_type="FILE", object_id="my file"), + + ], + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), + assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), + ) + ] installation_ctx.directfs_access_crawler_for_paths.dump_all(dfsas) dfsas = [ - DirectFsAccess(path="some_path", - is_read=False, - is_write=True, - source_id="xyz.py", - source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), - source_lineage=[LineageAtom(object_type="DASHBOARD", object_id="my dashboard"), - LineageAtom(object_type="QUERY", object_id="my query")], - assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), - assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0) - ) + DirectFsAccess( + path="some_path", + is_read=False, + is_write=True, + source_id="xyz.py", + source_timestamp=datetime.now(timezone.utc) - timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="DASHBOARD", object_id="my_dashboard"), + LineageAtom(object_type="QUERY", object_id="my_dashboard/my_query"), + ], + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), + assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), + ) ] 
installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas) From 2b9f9645798f64e3656812d21b8fcac368c226c9 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 12:35:36 +0200 Subject: [PATCH 15/41] reduce assessment scope to limit duration of integration tests --- src/databricks/labs/ucx/config.py | 3 +++ .../labs/ucx/contexts/application.py | 1 + .../labs/ucx/source_code/queries.py | 24 +++++++++++++------ tests/integration/assessment/test_ext_hms.py | 12 +++++++++- .../integration/assessment/test_workflows.py | 20 ++++++++++++---- tests/integration/source_code/test_queries.py | 2 +- tests/unit/source_code/test_queries.py | 2 +- 7 files changed, 49 insertions(+), 15 deletions(-) diff --git a/src/databricks/labs/ucx/config.py b/src/databricks/labs/ucx/config.py index 480f5fd45d..fb5e1790bf 100644 --- a/src/databricks/labs/ucx/config.py +++ b/src/databricks/labs/ucx/config.py @@ -70,6 +70,9 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes # [INTERNAL ONLY] Whether the assessment should capture only specific object permissions. include_object_permissions: list[str] | None = None + # [INTERNAL ONLY] Whether the assessment should lint only specific dashboards. + include_dashboard_ids: list[str] | None = None + def replace_inventory_variable(self, text: str) -> str: return text.replace("$inventory", f"hive_metastore.{self.inventory_database}") diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index dab1ea9351..f61fd4afe9 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -435,6 +435,7 @@ def query_linter(self): self.workspace_client, TableMigrationIndex([]), # TODO: bring back self.tables_migrator.index() self.directfs_access_crawler_for_queries, + self.config.include_dashboard_ids, ) @cached_property diff --git a/src/databricks/labs/ucx/source_code/queries.py b/src/databricks/labs/ucx/source_code/queries.py index cbcac03072..79b40caa0a 100644 --- a/src/databricks/labs/ucx/source_code/queries.py +++ b/src/databricks/labs/ucx/source_code/queries.py @@ -40,10 +40,12 @@ def __init__( ws: WorkspaceClient, migration_index: TableMigrationIndex, directfs_crawler: DirectFsAccessCrawler, + include_dashboard_ids: list[str] | None, ): self._ws = ws self._migration_index = migration_index self._directfs_crawler = directfs_crawler + self._include_dashboard_ids = include_dashboard_ids def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): assessment_start = datetime.now(timezone.utc) @@ -53,16 +55,12 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): all_problems: list[QueryProblem] = [] all_dfsas: list[DirectFsAccess] = [] # first lint and collect queries from dashboards - for dashboard in all_dashboards: - if not dashboard.id: - continue - dashboard = self._ws.dashboards.get(dashboard_id=dashboard.id) + for dashboard_id in self._dashboard_ids_in_scope(): + dashboard = self._ws.dashboards.get(dashboard_id=dashboard_id) problems, dfsas = self._lint_and_collect_from_dashboard(dashboard, linted_queries) all_problems.extend(problems) all_dfsas.extend(dfsas) - for query in self._ws.queries_legacy.list(): - if query.id is None: - continue + for query in self._queries_in_scope(): if query.id in linted_queries: continue linted_queries.add(query.id) @@ -88,6 +86,18 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): ] self._directfs_crawler.dump_all(all_dfsas) + def 
_dashboard_ids_in_scope(self) -> list[str]: + if self._include_dashboard_ids is not None: # an empty list is accepted + return self._include_dashboard_ids + all_dashboards = self._ws.dashboards.list() + return [dashboard.id for dashboard in all_dashboards if dashboard.id] + + def _queries_in_scope(self): + if self._include_dashboard_ids is not None: # an empty list is accepted + return [] + all_queries = self._ws.queries_legacy.list() + return [query for query in all_queries if query.id] + def _lint_and_collect_from_dashboard( self, dashboard: Dashboard, linted_queries: set[str] ) -> tuple[Iterable[QueryProblem], Iterable[DirectFsAccess]]: diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py index b6c04213ee..805ab13c06 100644 --- a/tests/integration/assessment/test_ext_hms.py +++ b/tests/integration/assessment/test_ext_hms.py @@ -1,5 +1,6 @@ import dataclasses import datetime as dt +import io from databricks.labs.lsql.backends import CommandExecutionBackend from databricks.sdk.service.iam import PermissionLevel @@ -11,6 +12,9 @@ def test_running_real_assessment_job_ext_hms( env_or_skip, make_cluster_policy, make_cluster_policy_permissions, + make_notebook, + make_job, + make_dashboard, ): cluster_id = env_or_skip('TEST_EXT_HMS_CLUSTER_ID') ext_hms_ctx = installation_ctx.replace( @@ -39,7 +43,13 @@ def test_running_real_assessment_job_ext_hms( # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available # via the integration pool). Allow some margin to reduce spurious failures. - ext_hms_ctx.deployed_workflows.run_workflow("assessment", max_wait=dt.timedelta(minutes=30)) + notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) + job = make_job(notebook_path=notebook_path) + installation_ctx.config.include_job_ids = [job.job_id] + + dashboard = make_dashboard() + installation_ctx.config.include_dashboard_ids = [dashboard.id] + ext_hms_ctx.deployed_workflows.run_workflow("assessment", max_wait=dt.timedelta(minutes=25)) # assert the workflow is successful. 
the tasks on sql warehouse will fail so skip checking them assert ext_hms_ctx.deployed_workflows.validate_step("assessment") diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index d29dd77b47..bb1e1576a7 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -1,3 +1,4 @@ +import io from datetime import timedelta from databricks.sdk.errors import NotFound, InvalidParameterValue @@ -5,20 +6,29 @@ from databricks.sdk.service.iam import PermissionLevel -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=20)) -def test_running_real_assessment_job(ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions): - ws_group_a, _ = installation_ctx.make_ucx_group() +@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) +def test_running_real_assessment_job( + ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions, make_job, make_notebook, make_dashboard +): + ws_group, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() make_cluster_policy_permissions( object_id=cluster_policy.policy_id, permission_level=PermissionLevel.CAN_USE, - group_name=ws_group_a.display_name, + group_name=ws_group.display_name, ) installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] installation_ctx.workspace_installation.run() + notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) + job = make_job(notebook_path=notebook_path) + installation_ctx.config.include_job_ids = [job.job_id] + + dashboard = make_dashboard() + installation_ctx.config.include_dashboard_ids = [dashboard.id] + installation_ctx.deployed_workflows.run_workflow("assessment") assert installation_ctx.deployed_workflows.validate_step("assessment") after = installation_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id) - assert after[ws_group_a.display_name] == PermissionLevel.CAN_USE + assert after[ws_group.display_name] == PermissionLevel.CAN_USE diff --git a/tests/integration/source_code/test_queries.py b/tests/integration/source_code/test_queries.py index 5adcfca0f3..5ecfbf45bd 100644 --- a/tests/integration/source_code/test_queries.py +++ b/tests/integration/source_code/test_queries.py @@ -6,7 +6,7 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend, make_query, make_dashboard): query = make_query(sql_query="SELECT * from csv.`dbfs://some_folder/some_file.csv`") _dashboard = make_dashboard(query=query) - linter = QueryLinter(ws, TableMigrationIndex([]), simple_ctx.directfs_access_crawler_for_queries) + linter = QueryLinter(ws, TableMigrationIndex([]), simple_ctx.directfs_access_crawler_for_queries, None) linter.refresh_report(sql_backend, simple_ctx.inventory_database) all_problems = sql_backend.fetch("SELECT * FROM query_problems", schema=simple_ctx.inventory_database) problems = [row for row in all_problems if row["query_name"] == query.name] diff --git a/tests/unit/source_code/test_queries.py b/tests/unit/source_code/test_queries.py index 8910b3ac09..c9b3a98a88 100644 --- a/tests/unit/source_code/test_queries.py +++ b/tests/unit/source_code/test_queries.py @@ -26,7 +26,7 @@ def test_query_linter_collects_dfsas_from_queries(name, query, dfsa_paths, is_re ws = create_autospec(WorkspaceClient) crawlers = create_autospec(DirectFsAccessCrawler) query = LegacyQuery.from_dict({"parent": "workspace", "name": 
name, "query": query}) - linter = QueryLinter(ws, migration_index, crawlers) + linter = QueryLinter(ws, migration_index, crawlers, None) dfsas = linter.collect_dfsas_from_query(query) ws.assert_not_called() crawlers.assert_not_called() From 012d2643ca26177c7a6e2446f5b19e1edb455c23 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 13:29:35 +0200 Subject: [PATCH 16/41] inhibit and comment test --- tests/integration/assessment/test_dashboards.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index 5b5742660c..e95193faf5 100644 --- a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -1,5 +1,7 @@ from datetime import datetime, timezone, timedelta +import pytest + from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, LineageAtom from databricks.labs.ucx.source_code.jobs import JobProblem from databricks.sdk.service.iam import PermissionLevel @@ -88,9 +90,9 @@ def _populate_directfs_problems(installation_ctx): installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas) -# @pytest.mark.skip +@pytest.mark.skip("Development tool") def test_dashboard_with_prepopulated_data(installation_ctx, make_cluster_policy, make_cluster_policy_permissions): - """the purpose of this test is to prepopulate data used by the dashboard without running an assessment""" + """the purpose of this test is to prepopulate data used by the dashboard without running an actual -lengthy- assessment""" ucx_group, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() make_cluster_policy_permissions( From 3efd331c9f14c20343e7688c393bdec72c8d4518 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 14:04:41 +0200 Subject: [PATCH 17/41] Update src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> --- .../main/35_2_code_compatibility_problems_in_dashboards.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql index 1e1374b45b..00070859da 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql @@ -1,5 +1,5 @@ /* ---title 'Dashboard migration problems' +--title 'Dashboard compatibility problems' --width 6 --overrides '{"spec":{ "encodings":{ From d5f1f742503fe87642040e1188b1e986248006f5 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 14:04:48 +0200 Subject: [PATCH 18/41] Update src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> --- .../main/35_2_code_compatibility_problems_in_dashboards.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql index 00070859da..86c38ef0ef 100644 --- 
a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql @@ -6,7 +6,7 @@ "columns": [ {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, - {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_name"}, + {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"}, {"fieldName": "query_name", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/queries/{{ query_id }}/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_name"} ]}, "invisibleColumns": [ From 81d15519ffa18320ea2ac1d95abb89b273ce9066 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 14:04:56 +0200 Subject: [PATCH 19/41] Update src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> --- .../main/35_2_code_compatibility_problems_in_dashboards.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql index 86c38ef0ef..886841d706 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql @@ -7,7 +7,7 @@ {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"}, - {"fieldName": "query_name", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/queries/{{ query_id }}/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_name"} + {"fieldName": "query_name", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/queries/{{ query_id }}/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"} ]}, "invisibleColumns": [ {"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", 
"linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"}, From 7522eb57095e5548286a3f2da78b93f690d15fc7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 15:47:13 +0200 Subject: [PATCH 20/41] reduce linting scope in integration tests --- tests/integration/assessment/test_ext_hms.py | 13 ++++++++++--- tests/integration/assessment/test_workflows.py | 17 +++++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py index 805ab13c06..ac4e841402 100644 --- a/tests/integration/assessment/test_ext_hms.py +++ b/tests/integration/assessment/test_ext_hms.py @@ -5,10 +5,13 @@ from databricks.labs.lsql.backends import CommandExecutionBackend from databricks.sdk.service.iam import PermissionLevel +from databricks.labs.ucx.install import WorkspaceInstaller + def test_running_real_assessment_job_ext_hms( ws, installation_ctx, + product_info, env_or_skip, make_cluster_policy, make_cluster_policy_permissions, @@ -41,14 +44,18 @@ def test_running_real_assessment_job_ext_hms( ext_hms_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] ext_hms_ctx.workspace_installation.run() - # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available - # via the integration pool). Allow some margin to reduce spurious failures. + # keep linting scope to minimum to avoid test timeouts + installer = WorkspaceInstaller(ws).replace(product_info=product_info) + notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) job = make_job(notebook_path=notebook_path) - installation_ctx.config.include_job_ids = [job.job_id] + installer.replace_config(include_job_ids=[job.job_id]) dashboard = make_dashboard() installation_ctx.config.include_dashboard_ids = [dashboard.id] + + # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available + # via the integration pool). Allow some margin to reduce spurious failures. ext_hms_ctx.deployed_workflows.run_workflow("assessment", max_wait=dt.timedelta(minutes=25)) # assert the workflow is successful. 
the tasks on sql warehouse will fail so skip checking them diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index bb1e1576a7..96087686a2 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -5,10 +5,19 @@ from databricks.sdk.retries import retried from databricks.sdk.service.iam import PermissionLevel +from databricks.labs.ucx.install import WorkspaceInstaller + @retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) def test_running_real_assessment_job( - ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions, make_job, make_notebook, make_dashboard + ws, + installation_ctx, + product_info, + make_cluster_policy, + make_cluster_policy_permissions, + make_job, + make_notebook, + make_dashboard, ): ws_group, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() @@ -20,11 +29,15 @@ def test_running_real_assessment_job( installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] installation_ctx.workspace_installation.run() + # keep linting scope to minimum to avoid test timeouts + installer = WorkspaceInstaller(ws).replace(product_info=product_info) + notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) job = make_job(notebook_path=notebook_path) - installation_ctx.config.include_job_ids = [job.job_id] + installer.replace_config(include_job_ids=[job.job_id]) dashboard = make_dashboard() + installer.replace_config(include_dashboard_ids=[dashboard.id]) installation_ctx.config.include_dashboard_ids = [dashboard.id] installation_ctx.deployed_workflows.run_workflow("assessment") From 144e7b1a4124cda501cb7bf567b3c35fc114b06d Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 16:44:47 +0200 Subject: [PATCH 21/41] update config on workspace side --- tests/integration/assessment/test_ext_hms.py | 4 ++-- tests/integration/assessment/test_workflows.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py index ac4e841402..34fe9c69af 100644 --- a/tests/integration/assessment/test_ext_hms.py +++ b/tests/integration/assessment/test_ext_hms.py @@ -45,14 +45,14 @@ def test_running_real_assessment_job_ext_hms( ext_hms_ctx.workspace_installation.run() # keep linting scope to minimum to avoid test timeouts - installer = WorkspaceInstaller(ws).replace(product_info=product_info) + installer = WorkspaceInstaller(ws) notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) job = make_job(notebook_path=notebook_path) installer.replace_config(include_job_ids=[job.job_id]) dashboard = make_dashboard() - installation_ctx.config.include_dashboard_ids = [dashboard.id] + installer.replace_config(include_dashboard_ids=[dashboard.id]) # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available # via the integration pool). Allow some margin to reduce spurious failures. 
diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index 96087686a2..6576479ea5 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -12,7 +12,6 @@ def test_running_real_assessment_job( ws, installation_ctx, - product_info, make_cluster_policy, make_cluster_policy_permissions, make_job, @@ -30,7 +29,7 @@ def test_running_real_assessment_job( installation_ctx.workspace_installation.run() # keep linting scope to minimum to avoid test timeouts - installer = WorkspaceInstaller(ws).replace(product_info=product_info) + installer = WorkspaceInstaller(ws) notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) job = make_job(notebook_path=notebook_path) @@ -38,7 +37,6 @@ def test_running_real_assessment_job( dashboard = make_dashboard() installer.replace_config(include_dashboard_ids=[dashboard.id]) - installation_ctx.config.include_dashboard_ids = [dashboard.id] installation_ctx.deployed_workflows.run_workflow("assessment") assert installation_ctx.deployed_workflows.validate_step("assessment") From 6f3283646a78a74d3ab3c7942474ae240faf5c70 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 18:11:34 +0200 Subject: [PATCH 22/41] fix workspace config update --- tests/integration/assessment/test_ext_hms.py | 12 ++++++------ tests/integration/assessment/test_workflows.py | 12 +++++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py index 34fe9c69af..0c9d11b8c2 100644 --- a/tests/integration/assessment/test_ext_hms.py +++ b/tests/integration/assessment/test_ext_hms.py @@ -5,7 +5,7 @@ from databricks.labs.lsql.backends import CommandExecutionBackend from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.install import WorkspaceInstaller +from databricks.labs.ucx.config import WorkspaceConfig def test_running_real_assessment_job_ext_hms( @@ -45,14 +45,14 @@ def test_running_real_assessment_job_ext_hms( ext_hms_ctx.workspace_installation.run() # keep linting scope to minimum to avoid test timeouts - installer = WorkspaceInstaller(ws) - notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) job = make_job(notebook_path=notebook_path) - installer.replace_config(include_job_ids=[job.job_id]) - dashboard = make_dashboard() - installer.replace_config(include_dashboard_ids=[dashboard.id]) + config = installation_ctx.installation.load(WorkspaceConfig) + new_config = dataclasses.replace( + config, include_job_idsinclude_job_ids=[job.job_id], include_dashboard_ids=[dashboard.id] + ) + installation_ctx.installation.save(new_config) # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available # via the integration pool). Allow some margin to reduce spurious failures. 
diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index 6576479ea5..4b5a22cb69 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -1,3 +1,4 @@ +import dataclasses import io from datetime import timedelta @@ -5,7 +6,7 @@ from databricks.sdk.retries import retried from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.install import WorkspaceInstaller +from databricks.labs.ucx.config import WorkspaceConfig @retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) @@ -29,14 +30,15 @@ def test_running_real_assessment_job( installation_ctx.workspace_installation.run() # keep linting scope to minimum to avoid test timeouts - installer = WorkspaceInstaller(ws) notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) job = make_job(notebook_path=notebook_path) - installer.replace_config(include_job_ids=[job.job_id]) - dashboard = make_dashboard() - installer.replace_config(include_dashboard_ids=[dashboard.id]) + config = installation_ctx.installation.load(WorkspaceConfig) + new_config = dataclasses.replace( + config, include_job_idsinclude_job_ids=[job.job_id], include_dashboard_ids=[dashboard.id] + ) + installation_ctx.installation.save(new_config) installation_ctx.deployed_workflows.run_workflow("assessment") assert installation_ctx.deployed_workflows.validate_step("assessment") From 4a9e2e617318cf08cf18eb9863744d103059c25e Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 18:30:00 +0200 Subject: [PATCH 23/41] deduplicate code --- tests/integration/assessment/test_ext_hms.py | 17 ++--------------- .../integration/assessment/test_workflows.py | 19 ++----------------- tests/integration/conftest.py | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 32 deletions(-) diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py index 0c9d11b8c2..0a5d3288ba 100644 --- a/tests/integration/assessment/test_ext_hms.py +++ b/tests/integration/assessment/test_ext_hms.py @@ -1,12 +1,9 @@ import dataclasses import datetime as dt -import io from databricks.labs.lsql.backends import CommandExecutionBackend from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.config import WorkspaceConfig - def test_running_real_assessment_job_ext_hms( ws, @@ -15,9 +12,7 @@ def test_running_real_assessment_job_ext_hms( env_or_skip, make_cluster_policy, make_cluster_policy_permissions, - make_notebook, - make_job, - make_dashboard, + populator_for_linting, ): cluster_id = env_or_skip('TEST_EXT_HMS_CLUSTER_ID') ext_hms_ctx = installation_ctx.replace( @@ -44,15 +39,7 @@ def test_running_real_assessment_job_ext_hms( ext_hms_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] ext_hms_ctx.workspace_installation.run() - # keep linting scope to minimum to avoid test timeouts - notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) - job = make_job(notebook_path=notebook_path) - dashboard = make_dashboard() - config = installation_ctx.installation.load(WorkspaceConfig) - new_config = dataclasses.replace( - config, include_job_idsinclude_job_ids=[job.job_id], include_dashboard_ids=[dashboard.id] - ) - installation_ctx.installation.save(new_config) + populator_for_linting(installation_ctx.installation) # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available # 
via the integration pool). Allow some margin to reduce spurious failures. diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index 4b5a22cb69..ba6aa04c0b 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -1,13 +1,9 @@ -import dataclasses -import io from datetime import timedelta from databricks.sdk.errors import NotFound, InvalidParameterValue from databricks.sdk.retries import retried from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.config import WorkspaceConfig - @retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) def test_running_real_assessment_job( @@ -15,9 +11,7 @@ def test_running_real_assessment_job( installation_ctx, make_cluster_policy, make_cluster_policy_permissions, - make_job, - make_notebook, - make_dashboard, + populator_for_linting, ): ws_group, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() @@ -29,16 +23,7 @@ def test_running_real_assessment_job( installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] installation_ctx.workspace_installation.run() - # keep linting scope to minimum to avoid test timeouts - - notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) - job = make_job(notebook_path=notebook_path) - dashboard = make_dashboard() - config = installation_ctx.installation.load(WorkspaceConfig) - new_config = dataclasses.replace( - config, include_job_idsinclude_job_ids=[job.job_id], include_dashboard_ids=[dashboard.id] - ) - installation_ctx.installation.save(new_config) + populator_for_linting(installation_ctx.installation) installation_ctx.deployed_workflows.run_workflow("assessment") assert installation_ctx.deployed_workflows.validate_step("assessment") diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a3c5bfe912..9a7b93c893 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,3 +1,5 @@ +import dataclasses +import io import json from collections.abc import Callable, Generator import functools @@ -1175,3 +1177,19 @@ def _run(command: str) -> str: except ValueError as err: logger.debug(f"pytest_ignore_collect: error: {err}") return False + + +@pytest.fixture +def populator_for_linting(make_job, make_notebook, make_dashboard): + def populate_workspace(installation): + # keep linting scope to minimum to avoid test timeouts + notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) + job = make_job(notebook_path=notebook_path) + dashboard = make_dashboard() + config = installation.load(WorkspaceConfig) + new_config = dataclasses.replace( + config, include_job_idsinclude_job_ids=[job.job_id], include_dashboard_ids=[dashboard.id] + ) + installation.save(new_config) + + return populate_workspace From ef6bd0276e928ec7415d4fe234c1b57c56a71a4c Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 23 Sep 2024 18:35:46 +0200 Subject: [PATCH 24/41] fix typo --- tests/integration/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 9a7b93c893..b949495a72 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1188,7 +1188,7 @@ def populate_workspace(installation): dashboard = make_dashboard() config = installation.load(WorkspaceConfig) new_config = dataclasses.replace( - config, include_job_idsinclude_job_ids=[job.job_id], 
include_dashboard_ids=[dashboard.id] + config, include_job_ids=[job.job_id], include_dashboard_ids=[dashboard.id] ) installation.save(new_config) From 41f1eee65d1fd35ddf19ee322790465f6c8336b6 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 13:22:19 +0200 Subject: [PATCH 25/41] fix config update --- tests/integration/assessment/test_ext_hms.py | 4 ++-- .../integration/assessment/test_workflows.py | 4 ++-- tests/integration/conftest.py | 24 ++++++++++++------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py index 0a5d3288ba..44b8e45fd4 100644 --- a/tests/integration/assessment/test_ext_hms.py +++ b/tests/integration/assessment/test_ext_hms.py @@ -12,7 +12,7 @@ def test_running_real_assessment_job_ext_hms( env_or_skip, make_cluster_policy, make_cluster_policy_permissions, - populator_for_linting, + populate_for_linting, ): cluster_id = env_or_skip('TEST_EXT_HMS_CLUSTER_ID') ext_hms_ctx = installation_ctx.replace( @@ -39,7 +39,7 @@ def test_running_real_assessment_job_ext_hms( ext_hms_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] ext_hms_ctx.workspace_installation.run() - populator_for_linting(installation_ctx.installation) + populate_for_linting(installation_ctx.installation) # Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available # via the integration pool). Allow some margin to reduce spurious failures. diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py index ba6aa04c0b..7955e93c69 100644 --- a/tests/integration/assessment/test_workflows.py +++ b/tests/integration/assessment/test_workflows.py @@ -11,7 +11,7 @@ def test_running_real_assessment_job( installation_ctx, make_cluster_policy, make_cluster_policy_permissions, - populator_for_linting, + populate_for_linting, ): ws_group, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() @@ -23,7 +23,7 @@ def test_running_real_assessment_job( installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] installation_ctx.workspace_installation.run() - populator_for_linting(installation_ctx.installation) + populate_for_linting(installation_ctx.installation) installation_ctx.deployed_workflows.run_workflow("assessment") assert installation_ctx.deployed_workflows.validate_step("assessment") diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index b949495a72..ebf5fb5c2f 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,4 +1,3 @@ -import dataclasses import io import json from collections.abc import Callable, Generator @@ -11,11 +10,15 @@ from functools import cached_property import shutil import subprocess +from pathlib import Path + import pytest # pylint: disable=wrong-import-order +import yaml from databricks.labs.blueprint.commands import CommandExecutor from databricks.labs.blueprint.entrypoint import is_in_debug from databricks.labs.blueprint.installation import Installation, MockInstallation from databricks.labs.blueprint.parallel import Threads +from databricks.labs.blueprint.paths import WorkspacePath from databricks.labs.blueprint.tui import MockPrompts from databricks.labs.blueprint.wheels import ProductInfo from databricks.labs.lsql.backends import SqlBackend @@ -1180,16 +1183,21 @@ def _run(command: str) -> str: 
@pytest.fixture -def populator_for_linting(make_job, make_notebook, make_dashboard): +def populate_for_linting(ws, make_random, make_job, make_notebook, make_dashboard, watchdog_purge_suffix): def populate_workspace(installation): # keep linting scope to minimum to avoid test timeouts - notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) + path = Path(installation.install_folder()) / f"dummy-{make_random(4)}-{watchdog_purge_suffix}" + notebook_path = make_notebook(path=path, content=io.BytesIO(b"import xyz")) job = make_job(notebook_path=notebook_path) dashboard = make_dashboard() - config = installation.load(WorkspaceConfig) - new_config = dataclasses.replace( - config, include_job_ids=[job.job_id], include_dashboard_ids=[dashboard.id] - ) - installation.save(new_config) + # can't use installation.load(WorkspaceConfig)/installation.save() because they populate empty credentials + config_path = WorkspacePath(ws, installation.install_folder()) / "config.yml" + text = config_path.read_text() + config = yaml.safe_load(text) + config["include_job_ids"] = [job.job_id] + config["include_dashboard_ids"] = [dashboard.id] + text = yaml.dump(config) + config_path.unlink() + config_path.write_text(text) return populate_workspace From fd9ea4fbcb655271c44e8c8c892c5dfaf89b8acb Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 14:24:56 +0200 Subject: [PATCH 26/41] ensure problems are generated --- tests/integration/conftest.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index ebf5fb5c2f..90204c9d26 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1183,13 +1183,14 @@ def _run(command: str) -> str: @pytest.fixture -def populate_for_linting(ws, make_random, make_job, make_notebook, make_dashboard, watchdog_purge_suffix): +def populate_for_linting(ws, make_random, make_job, make_notebook, make_query, make_dashboard, watchdog_purge_suffix): def populate_workspace(installation): # keep linting scope to minimum to avoid test timeouts path = Path(installation.install_folder()) / f"dummy-{make_random(4)}-{watchdog_purge_suffix}" - notebook_path = make_notebook(path=path, content=io.BytesIO(b"import xyz")) + notebook_path = make_notebook(path=path, content=io.BytesIO(b"spark.read.parquet('dbfs://mnt/foo/bar')")) job = make_job(notebook_path=notebook_path) - dashboard = make_dashboard() + query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo/bar`') + dashboard = make_dashboard(query=query) # can't use installation.load(WorkspaceConfig)/installation.save() because they populate empty credentials config_path = WorkspacePath(ws, installation.install_folder()) / "config.yml" text = config_path.read_text() From 938210a3b606933588466950d67ea22af466ce5a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 14:25:29 +0200 Subject: [PATCH 27/41] Fix links in 'Dashboard compatibility problems' widget --- ...5_2_code_compatibility_problems_in_dashboards.sql | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql index 886841d706..bf8b2b4f1f 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql +++ 
b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql @@ -6,14 +6,14 @@ "columns": [ {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, - {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"}, - {"fieldName": "query_name", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/queries/{{ query_id }}/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"} + {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"}, + {"fieldName": "query_name", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/sql/editor/{{ query_id }}/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"} ]}, "invisibleColumns": [ - {"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"}, - {"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"}, - {"name": "query_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_parent"}, - {"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "/dashboards/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"} + {"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"}, + {"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"}, + {"name": "query_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_parent"}, + {"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", 
"title": "query_id"} ] }}' */ From 534d4a77b35e248aa8eec242d343b9f965593634 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 14:41:23 +0200 Subject: [PATCH 28/41] fix query link --- .../main/35_2_code_compatibility_problems_in_dashboards.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql index bf8b2b4f1f..a918e9682c 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/35_2_code_compatibility_problems_in_dashboards.sql @@ -7,7 +7,7 @@ {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"}, - {"fieldName": "query_name", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/sql/editor/{{ query_id }}/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"} + {"fieldName": "query_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/editor/{{ query_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"} ]}, "invisibleColumns": [ {"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"}, From 2241c1a72f3a06831607e0f48297896ccdbf6db4 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 16:15:20 +0200 Subject: [PATCH 29/41] progress --- .../35_1_code_compatibility_problems_in_workflows.sql | 8 +++++--- .../main/36_0_direct_filesystem_access_problems.md | 6 +++++- src/databricks/labs/ucx/source_code/jobs.py | 8 ++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql index 54f161e677..8b1f73fe24 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql @@ -4,7 +4,7 @@ --overrides '{"spec":{ "encodings":{ "columns": [ - {"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "/#workspace/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"}, + {"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"}, 
{"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, {"fieldName": "workflow_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_name"}, @@ -15,12 +15,14 @@ {"fieldName": "end_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_col"} ]}, "invisibleColumns": [ - {"name": "workflow_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_id"} + {"name": "link", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "link"}, + {"name": "workflow_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_id"} ] }}' */ SELECT path, + if ( contains(path, '.'), concat('/#files/', path), concat('/#notebook/', path) ) as link, code, message, job_id AS workflow_id, @@ -30,4 +32,4 @@ SELECT start_col, end_line, end_col -FROM inventory.workflow_problems \ No newline at end of file +FROM inventory.workflow_problems diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md b/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md index fef9e8b7d6..3f5b255681 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md +++ b/src/databricks/labs/ucx/queries/assessment/main/36_0_direct_filesystem_access_problems.md @@ -1,4 +1,8 @@ -## Direct filesystem access problems +--- +height: 4 +--- + +# Direct filesystem access problems The table below assists with verifying if workflows and dashboards require direct filesystem access. As a reminder, `dbfs:/` is not supported in Unity Catalog, and more generally direct filesystem access is discouraged. 
diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 37933bb9ac..0fac88129c 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -406,12 +406,16 @@ def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAcces if not advices: advices = self._lint_task(task, graph, session_state, linted_paths) for advice in advices: - absolute_path = advice.path.absolute().as_posix() if advice.path != self._UNKNOWN else 'UNKNOWN' + relative_path = advice.path.as_posix() if advice.path != self._UNKNOWN else 'UNKNOWN' + domain = "@databricks.com/" + idx = relative_path.index(domain) + if idx >= 0: + relative_path = relative_path[idx + len(domain):] job_problem = JobProblem( job_id=job.job_id, job_name=job.settings.name, task_key=task.task_key, - path=absolute_path, + path=relative_path, code=advice.advice.code, message=advice.advice.message, start_line=advice.advice.start_line, From b4511d52d9b152cc588319706cccc7832cd61ce8 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 18:27:00 +0200 Subject: [PATCH 30/41] fix links in workflow problems widget --- ...35_1_code_compatibility_problems_in_workflows.sql | 6 +++--- src/databricks/labs/ucx/source_code/jobs.py | 12 ++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql index 8b1f73fe24..64a7e956a5 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/35_1_code_compatibility_problems_in_workflows.sql @@ -4,7 +4,7 @@ --overrides '{"spec":{ "encodings":{ "columns": [ - {"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"}, + {"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "/#workspace/{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"}, {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, {"fieldName": "workflow_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_name"}, @@ -21,8 +21,8 @@ }}' */ SELECT - path, - if ( contains(path, '.'), concat('/#files/', path), concat('/#notebook/', path) ) as link, + substring_index(path, '@databricks.com/', -1) as path, + path as link, code, message, job_id AS workflow_id, diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 0fac88129c..11955014b2 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -12,7 +12,7 @@ from urllib import parse from databricks.labs.blueprint.parallel import ManyError, Threads -from databricks.labs.blueprint.paths import DBFSPath +from 
databricks.labs.blueprint.paths import DBFSPath, WorkspacePath from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound @@ -86,7 +86,7 @@ def __repr__(self): @property def lineage(self) -> list[LineageAtom]: job_name = (None if self._job.settings is None else self._job.settings.name) or "unknown job" - job_lineage = LineageAtom("JOB", str(self._job.job_id), {"name": job_name}) + job_lineage = LineageAtom("WORKFLOW", str(self._job.job_id), {"name": job_name}) task_lineage = LineageAtom("TASK", f"{self._job.job_id}/{self._task.task_key}") return [job_lineage, task_lineage] @@ -406,16 +406,12 @@ def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAcces if not advices: advices = self._lint_task(task, graph, session_state, linted_paths) for advice in advices: - relative_path = advice.path.as_posix() if advice.path != self._UNKNOWN else 'UNKNOWN' - domain = "@databricks.com/" - idx = relative_path.index(domain) - if idx >= 0: - relative_path = relative_path[idx + len(domain):] + absolute_path = advice.path.absolute().as_posix() if advice.path != self._UNKNOWN else 'UNKNOWN' job_problem = JobProblem( job_id=job.job_id, job_name=job.settings.name, task_key=task.task_key, - path=relative_path, + path=absolute_path, code=advice.advice.code, message=advice.advice.message, start_line=advice.advice.start_line, From 872a430868d33f4454551068de3aeaaf1178edb8 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 20:03:47 +0200 Subject: [PATCH 31/41] also lint a file --- tests/integration/conftest.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 90204c9d26..f6b7096547 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1186,10 +1186,14 @@ def _run(command: str) -> str: def populate_for_linting(ws, make_random, make_job, make_notebook, make_query, make_dashboard, watchdog_purge_suffix): def populate_workspace(installation): # keep linting scope to minimum to avoid test timeouts - path = Path(installation.install_folder()) / f"dummy-{make_random(4)}-{watchdog_purge_suffix}" - notebook_path = make_notebook(path=path, content=io.BytesIO(b"spark.read.parquet('dbfs://mnt/foo/bar')")) + file_name = f"dummy_{make_random(4)}_{watchdog_purge_suffix}" + file_path = WorkspacePath(ws, installation.install_folder()) / file_name + file_path.write_text("spark.read.parquet('dbfs://mnt/foo/bar')") + path = Path(installation.install_folder()) / f"dummy_{make_random(4)}_{watchdog_purge_suffix}" + notebook_text = f"import {file_name}\nspark.read.parquet('dbfs://mnt/foo1/bar1')" + notebook_path = make_notebook(path=path, content=io.BytesIO(notebook_text.encode("utf-8"))) job = make_job(notebook_path=notebook_path) - query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo/bar`') + query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo2/bar2`') dashboard = make_dashboard(query=query) # can't use installation.load(WorkspaceConfig)/installation.save() because they populate empty credentials config_path = WorkspacePath(ws, installation.install_folder()) / "config.yml" From ee0a7bab399f1f5721cf1b918813cd896f6f2c25 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 20:04:09 +0200 Subject: [PATCH 32/41] fix lineage records --- src/databricks/labs/ucx/source_code/jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 11955014b2..0ca688779a 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -469,8 +469,8 @@ def _collect_task_dfsas( job_name = job.settings.name if job.settings and job.settings.name else "" for dfsa in DfsaCollectorWalker(graph, set(), self._path_lookup, session_state): atoms = [ - LineageAtom(object_type="JOB", object_id=job_id, other={"name": job_name}), - LineageAtom(object_type="TASK", object_id=task.task_key), + LineageAtom(object_type="WORKFLOW", object_id=job_id, other={"name": job_name}), + LineageAtom(object_type="TASK", object_id=f"{job_id}/{task.task_key}"), ] yield dataclasses.replace(dfsa, source_lineage=atoms + dfsa.source_lineage) From cb28aa100b9921b4dcd85134b5efb0fa5570b0ef Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 20:04:43 +0200 Subject: [PATCH 33/41] fix links and display of DFSA widget --- .../main/36_1_direct_filesystem_accesses.sql | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql index fd58641ba9..98a798077a 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql @@ -4,10 +4,10 @@ --overrides '{"spec":{ "encodings":{ "columns": [ - {"fieldName": "path", "title": "path", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "location", "title": "location", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, {"fieldName": "is_read", "title": "is_read", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]}, {"fieldName": "is_write", "title": "is_write", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]}, - {"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, + {"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ source_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, {"fieldName": "timestamp", "title": "last_modified", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}, {"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ lineage_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, {"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]}, @@ -15,28 +15,38 @@ {"fieldName": "assessment_end", "title": "assessment_end", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]} ]}, "invisibleColumns": [ + {"fieldName": "source_link", "title": "source_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, {"fieldName": "lineage_type", "title": "lineage_type", 
"type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, {"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, - {"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]} + {"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}, + {"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]} ] }}' */ SELECT - path, + path as location, is_read, is_write, - source_id as source, + if( startswith(source_id, '/'), substring_index(source_id, '@databricks.com/', -1), split_part(source_id, '/', 2)) as source, + if( startswith(source_id, '/'), concat('/#workspace/', source_id), concat('/sql/editor/', split_part(source_id, '/', 2))) as source_link, source_timestamp as `timestamp`, - concat(lineage.object_type, ': ', lineage.object_id) as lineage, + case + when lineage.object_type = 'WORKFLOW' then concat('Workflow: ', lineage.other.name) + when lineage.object_type = 'TASK' then concat('Task: ', split_part(lineage.object_id, '/', 2)) + when lineage.object_type = 'NOTEBOOK' then concat('Notebook: ', substring_index(lineage.object_id, '@databricks.com/', -1)) + when lineage.object_type = 'FILE' then concat('File: ', substring_index(lineage.object_id, '@databricks.com/', -1)) + when lineage.object_type = 'DASHBOARD' then concat('Dashboard: ', lineage.other.name) + when lineage.object_type = 'QUERY' then concat('Query: ', lineage.other.name) + end as lineage, lineage.object_type as lineage_type, lineage.object_id as lineage_id, case when lineage.object_type = 'WORKFLOW' then concat('/jobs/', lineage.object_id) when lineage.object_type = 'TASK' then concat('/jobs/', split_part(lineage.object_id, '/', 1), '/tasks/', split_part(lineage.object_id, '/', 2)) - when lineage.object_type = 'NOTEBOOK' then concat('/#notebook/', lineage.object_id) - when lineage.object_type = 'FILE' then concat('/#files/', lineage.object_id) - when lineage.object_type = 'DASHBOARD' then concat('/sql/dashboardsv3/', lineage.object_id) - when lineage.object_type = 'QUERY' then concat('/sql/dashboardsv3/', split_part(lineage.object_id, '/', 1), '/datasets/', split_part(lineage.object_id, '/', 2)) + when lineage.object_type = 'NOTEBOOK' then concat('/#workspace/', lineage.object_id) + when lineage.object_type = 'FILE' then concat('/#workspace/', lineage.object_id) + when lineage.object_type = 'DASHBOARD' then concat('/sql/dashboards/', lineage.object_id) + when lineage.object_type = 'QUERY' then concat('/sql/editor/', split_part(lineage.object_id, '/', 2)) end as lineage_link, lineage.other as lineage_data, assessment_start, From 9b73c8c55e9502a1a52cbfe0074bce1e4badb594 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 24 Sep 2024 20:09:09 +0200 Subject: [PATCH 34/41] formatting --- src/databricks/labs/ucx/source_code/jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 0ca688779a..eb50b7ef31 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -12,7 +12,7 @@ from urllib import parse from databricks.labs.blueprint.parallel import ManyError, Threads -from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath +from 
databricks.labs.blueprint.paths import DBFSPath
 from databricks.labs.lsql.backends import SqlBackend
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.errors import NotFound

From 4a8106db303c4fe8ce8bd40938422c708055b75e Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Tue, 24 Sep 2024 20:15:57 +0200
Subject: [PATCH 35/41] use relative path

---
 tests/integration/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index f6b7096547..993ef04a30 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -1190,7 +1190,7 @@ def populate_workspace(installation):
         file_path = WorkspacePath(ws, installation.install_folder()) / file_name
         file_path.write_text("spark.read.parquet('dbfs://mnt/foo/bar')")
         path = Path(installation.install_folder()) / f"dummy_{make_random(4)}_{watchdog_purge_suffix}"
-        notebook_text = f"import {file_name}\nspark.read.parquet('dbfs://mnt/foo1/bar1')"
+        notebook_text = f"import ./{file_name}\nspark.read.parquet('dbfs://mnt/foo1/bar1')"
         notebook_path = make_notebook(path=path, content=io.BytesIO(notebook_text.encode("utf-8")))
         job = make_job(notebook_path=notebook_path)
         query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo2/bar2`')
         dashboard = make_dashboard(query=query)

From 81a7bfe4ee4ab1cd950c34dacaacd427d1f18a3f Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 25 Sep 2024 10:55:56 +0200
Subject: [PATCH 36/41] populate file task for linting

---
 tests/integration/conftest.py | 58 ++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 993ef04a30..3a900425dc 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -28,8 +28,10 @@
 from databricks.sdk.retries import retried
 from databricks.sdk.service import iam
 from databricks.sdk.service.catalog import FunctionInfo, SchemaInfo, TableInfo
-from databricks.sdk.service.iam import Group
+from databricks.sdk.service.compute import ClusterSpec
 from databricks.sdk.service.dashboards import Dashboard as SDKDashboard
+from databricks.sdk.service.iam import Group
+from databricks.sdk.service.jobs import Task, SparkPythonTask
 from databricks.sdk.service.sql import Dashboard, WidgetPosition, WidgetOptions, LegacyQuery

 from databricks.labs.ucx.__about__ import __version__
@@ -1183,23 +1185,65 @@ def _run(command: str) -> str:


 @pytest.fixture
-def populate_for_linting(ws, make_random, make_job, make_notebook, make_query, make_dashboard, watchdog_purge_suffix):
-    def populate_workspace(installation):
-        # keep linting scope to minimum to avoid test timeouts
+def create_file_job(ws, make_random, watchdog_remove_after, watchdog_purge_suffix, log_workspace_link):
+
+    def create(installation, **kwargs):
+        # create args
+        data = {"name": f"dummy-{make_random(4)}"}
+        # create file to run
         file_name = f"dummy_{make_random(4)}_{watchdog_purge_suffix}"
         file_path = WorkspacePath(ws, installation.install_folder()) / file_name
         file_path.write_text("spark.read.parquet('dbfs://mnt/foo/bar')")
+        task = Task(
+            task_key=make_random(4),
+            description=make_random(4),
+            new_cluster=ClusterSpec(
+                num_workers=1,
+                node_type_id=ws.clusters.select_node_type(local_disk=True, min_memory_gb=16),
+                spark_version=ws.clusters.select_spark_version(latest=True)
+            ),
+            spark_python_task=SparkPythonTask(python_file=str(file_path)),
+            timeout_seconds=0,
+        )
+        data["tasks"] = [task]
+        # add RemoveAfter tag for job cleanup
+        data["tags"] = {"RemoveAfter": watchdog_remove_after}
+        job = ws.jobs.create(**data)
+        log_workspace_link(data["name"], f'job/{job.job_id}', anchor=False)
+        return job
+
+    yield from factory("job", create, lambda item: ws.jobs.delete(item.job_id))
+
+
+@pytest.fixture
+def populate_for_linting(
+    ws,
+    make_random,
+    make_job,
+    make_notebook,
+    make_query,
+    make_dashboard,
+    create_file_job,
+    watchdog_purge_suffix,
+):
+
+    def create_notebook_job(installation):
         path = Path(installation.install_folder()) / f"dummy_{make_random(4)}_{watchdog_purge_suffix}"
-        notebook_text = f"import ./{file_name}\nspark.read.parquet('dbfs://mnt/foo1/bar1')"
+        notebook_text = "spark.read.parquet('dbfs://mnt/foo1/bar1')"
         notebook_path = make_notebook(path=path, content=io.BytesIO(notebook_text.encode("utf-8")))
-        job = make_job(notebook_path=notebook_path)
+        return make_job(notebook_path=notebook_path)
+
+    def populate_workspace(installation):
+        # keep linting scope to minimum to avoid test timeouts
+        file_job = create_file_job(installation=installation)
+        notebook_job = create_notebook_job(installation=installation)
         query = make_query(sql_query='SELECT * from parquet.`dbfs://mnt/foo2/bar2`')
         dashboard = make_dashboard(query=query)
         # can't use installation.load(WorkspaceConfig)/installation.save() because they populate empty credentials
         config_path = WorkspacePath(ws, installation.install_folder()) / "config.yml"
         text = config_path.read_text()
         config = yaml.safe_load(text)
-        config["include_job_ids"] = [job.job_id]
+        config["include_job_ids"] = [file_job.job_id, notebook_job.job_id]
         config["include_dashboard_ids"] = [dashboard.id]
         text = yaml.dump(config)
         config_path.unlink()

From 0cf260e8918a8d1719bb005eda4fea1a5ef95b5c Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 25 Sep 2024 11:39:24 +0200
Subject: [PATCH 37/41] formatting

---
 tests/integration/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 3a900425dc..61b90cbf5f 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -1187,7 +1187,7 @@ def _run(command: str) -> str:

 @pytest.fixture
 def create_file_job(ws, make_random, watchdog_remove_after, watchdog_purge_suffix, log_workspace_link):

-    def create(installation, **kwargs):
+    def create(installation, **_kwargs):
         # create args
         data = {"name": f"dummy-{make_random(4)}"}
@@ -1200,7 +1200,7 @@ def create(installation, **kwargs):
             new_cluster=ClusterSpec(
                 num_workers=1,
                 node_type_id=ws.clusters.select_node_type(local_disk=True, min_memory_gb=16),
-                spark_version=ws.clusters.select_spark_version(latest=True)
+                spark_version=ws.clusters.select_spark_version(latest=True),
             ),
             spark_python_task=SparkPythonTask(python_file=str(file_path)),
             timeout_seconds=0,

From f81a2c33f77367e6a35b7af55b3c5e3d254efd83 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 25 Sep 2024 12:02:16 +0200
Subject: [PATCH 38/41] fix failing test

---
 tests/integration/source_code/test_jobs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py
index 4c5ae4f758..451b5a3815 100644
--- a/tests/integration/source_code/test_jobs.py
+++ b/tests/integration/source_code/test_jobs.py
@@ -163,7 +163,7 @@ def test_job_linter_some_notebook_graph_with_problems(
     assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages)

     assert len(dfsas) == 2
-    task_keys = set(task.task_key for task in j.settings.tasks)
+    task_keys = set(f"{j.job_id}/{task.task_key}" for task in j.settings.tasks)
     yesterday = datetime.now(timezone.utc) - timedelta(days=1)
     for dfsa in dfsas:
         assert dfsa.source_id != DirectFsAccess.UNKNOWN
@@ -172,7 +172,7 @@
         assert dfsa.assessment_start_timestamp > yesterday
         assert dfsa.assessment_end_timestamp > yesterday
         assert dfsa.source_lineage[0] == LineageAtom(
-            object_type="JOB", object_id=str(j.job_id), other={"name": j.settings.name}
+            object_type="WORKFLOW", object_id=str(j.job_id), other={"name": j.settings.name}
         )
         assert dfsa.source_lineage[1].object_type == "TASK"
         assert dfsa.source_lineage[1].object_id in task_keys

From 9de75a819e4b211d010fefe7f75dfd1131b352e1 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 25 Sep 2024 12:36:50 +0200
Subject: [PATCH 39/41] tentatively fix test flakiness

---
 tests/integration/assessment/test_workflows.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py
index 7955e93c69..b3493cf494 100644
--- a/tests/integration/assessment/test_workflows.py
+++ b/tests/integration/assessment/test_workflows.py
@@ -5,7 +5,7 @@
 from databricks.sdk.service.iam import PermissionLevel


-@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8))
+@retried(on=[NotFound, InvalidParameterValue])
 def test_running_real_assessment_job(
     ws,
     installation_ctx,
@@ -26,7 +26,7 @@ def test_running_real_assessment_job(
     populate_for_linting(installation_ctx.installation)

     installation_ctx.deployed_workflows.run_workflow("assessment")
-    assert installation_ctx.deployed_workflows.validate_step("assessment")
+    assert installation_ctx.deployed_workflows.validate_step("assessment", max_wait=timedelta(minutes=25))

     after = installation_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id)
     assert after[ws_group.display_name] == PermissionLevel.CAN_USE

From 6c4025170238dc906b1a2257d954990a7b63307e Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 25 Sep 2024 12:51:13 +0200
Subject: [PATCH 40/41] fix typo

---
 tests/integration/assessment/test_workflows.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py
index b3493cf494..1110528bcf 100644
--- a/tests/integration/assessment/test_workflows.py
+++ b/tests/integration/assessment/test_workflows.py
@@ -25,8 +25,8 @@ def test_running_real_assessment_job(
     populate_for_linting(installation_ctx.installation)

-    installation_ctx.deployed_workflows.run_workflow("assessment")
-    assert installation_ctx.deployed_workflows.validate_step("assessment", max_wait=timedelta(minutes=25))
+    installation_ctx.deployed_workflows.run_workflow("assessment", max_wait=timedelta(minutes=25))
+    assert installation_ctx.deployed_workflows.validate_step("assessment")

     after = installation_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id)
     assert after[ws_group.display_name] == PermissionLevel.CAN_USE

From f792f6b3d36f396614f2003b4b5e0201801da13a Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 25 Sep 2024 13:09:37 +0200
Subject: [PATCH 41/41] remove hidden column

---
 .../assessment/main/36_1_direct_filesystem_accesses.sql | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql
index 98a798077a..b038dbfbc8 100644
--- a/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql
+++ b/src/databricks/labs/ucx/queries/assessment/main/36_1_direct_filesystem_accesses.sql
@@ -18,8 +18,7 @@
   {"fieldName": "source_link", "title": "source_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
   {"fieldName": "lineage_type", "title": "lineage_type", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
   {"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
-  {"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
-  {"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]}
+  {"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}
   ]
 }}'
*/
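
For readers who want to reproduce the scope-narrowing approach that populate_for_linting converges on, outside of the pytest fixtures, here is a minimal sketch. The narrow_linting_scope helper name is hypothetical; it assumes an authenticated WorkspaceClient and an existing UCX installation folder, and it mirrors the yaml rewrite above (used because installation.load(WorkspaceConfig)/installation.save() populate empty credentials):

import yaml

from databricks.labs.blueprint.paths import WorkspacePath
from databricks.sdk import WorkspaceClient


def narrow_linting_scope(ws: WorkspaceClient, install_folder: str, job_ids: list[int], dashboard_ids: list[str]) -> None:
    # Rewrite config.yml in the UCX installation folder so that the assessment
    # workflow only lints the given jobs and dashboards.
    config_path = WorkspacePath(ws, install_folder) / "config.yml"
    config = yaml.safe_load(config_path.read_text())
    config["include_job_ids"] = job_ids
    config["include_dashboard_ids"] = dashboard_ids
    config_path.unlink()  # replace the previous config.yml, as populate_for_linting does
    config_path.write_text(yaml.dump(config))

Running the "assessment" workflow after such a rewrite lints only the narrowed scope, which is what keeps these integration tests within their timeouts.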