Skip to content

Commit

Permalink
Added more views to assessment dashboard (#474)
Browse files Browse the repository at this point in the history
This PR has the following improvements for views and tables:

- `assessment/queries` are renamed and refactored to fit the
dashboard grid more efficiently
- introduced `assessment/views` to hold common views that can be
reused by multiple commands and widgets.
- introduced `crawlers.SchemaDeployer`, which (re-)creates views and
tables **upon installation** (and not during the assessment step).
- added `search_by` field for table widgets
- improved parallelism for fetching grants
- removed (now redundant) `setup_schema` and `setup_view` tasks from
`assessment` workflow
<img width="1981" alt="_UCX__serge_smertin__UCX_Assessment"
src="https://github.com/databrickslabs/ucx/assets/259697/7addb0bb-b301-47ea-b351-9b75cd0a5d9d">

<img width="1985" alt="_UCX__serge_smertin__UCX_Assessment"
src="https://github.com/databrickslabs/ucx/assets/259697/88effab7-59bb-46aa-af9b-bd9185d4f817">
  • Loading branch information
nfx authored and pritishpai committed Nov 21, 2023
1 parent 7ff8d5d commit ed9775a
Show file tree
Hide file tree
Showing 34 changed files with 336 additions and 274 deletions.
28 changes: 28 additions & 0 deletions src/databricks/labs/ucx/framework/crawlers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import dataclasses
import logging
import os
import pkgutil
from abc import ABC, abstractmethod
from collections.abc import Iterator
from typing import ClassVar
Expand Down Expand Up @@ -244,3 +245,30 @@ def _snapshot(self, fetcher, loader) -> list[any]:
def _append_records(self, items):
    """Append freshly crawled *items* to this crawler's inventory table."""
    # Log the batch size first so crawl volume stays traceable in debug output.
    logger.debug(f"[{self._full_name}] found {len(items)} new records for {self._table}")
    self._backend.save_table(self._full_name, items, self._klass, mode="append")


class SchemaDeployer:
    """(Re-)creates the inventory schema, tables, and views at install time.

    View definitions are loaded as package resources relative to *mod*; the
    ``$inventory`` placeholder inside them is substituted with the
    fully-qualified inventory schema name.
    """

    def __init__(self, sql_backend: SqlBackend, inventory_schema: str, mod: any):
        self._sql_backend = sql_backend
        self._inventory_schema = inventory_schema
        # Package anchor used by pkgutil to resolve relative SQL resource files.
        self._module = mod

    def deploy_schema(self):
        """Create the inventory database in the Hive metastore if it does not exist."""
        logger.info(f"Ensuring {self._inventory_schema} database exists")
        self._sql_backend.execute(f"CREATE SCHEMA IF NOT EXISTS hive_metastore.{self._inventory_schema}")

    def deploy_table(self, name: str, klass: type):
        """Create table *name* from the *klass* dataclass schema if it does not exist."""
        logger.info(f"Ensuring {self._inventory_schema}.{name} table exists")
        self._sql_backend.create_table(f"hive_metastore.{self._inventory_schema}.{name}", klass)

    def deploy_view(self, name: str, relative_filename: str):
        """(Re-)create view *name* from a packaged SQL file, replacing any stale definition."""
        query = self._load(relative_filename)
        logger.info(f"Ensuring {self._inventory_schema}.{name} view matches {relative_filename} contents")
        ddl = f"CREATE OR REPLACE VIEW hive_metastore.{self._inventory_schema}.{name} AS {query}"
        self._sql_backend.execute(ddl)

    def _load(self, relative_filename: str) -> str:
        """Read a packaged SQL file and substitute the ``$inventory`` placeholder."""
        data = pkgutil.get_data(self._module.__name__, relative_filename)
        if data is None:
            # pkgutil.get_data() returns None for a missing resource; fail loudly
            # here instead of crashing later with an opaque AttributeError on .decode().
            raise FileNotFoundError(f"Cannot load {relative_filename} from package {self._module.__name__}")
        sql = data.decode("utf-8")
        sql = sql.replace("$inventory", f"hive_metastore.{self._inventory_schema}")
        return sql
5 changes: 4 additions & 1 deletion src/databricks/labs/ucx/framework/dashboards.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def _table_viz_args(
condensed=True,
with_row_number=False,
description: str | None = None,
search_by: str | None = None,
) -> dict:
return {
"type": "TABLE",
Expand All @@ -326,7 +327,9 @@ def _table_viz_args(
"condensed": condensed,
"withRowNumber": with_row_number,
"version": 2,
"columns": [VizColumn(name=x, title=x).as_dict() for x in columns.split(",")],
"columns": [
VizColumn(name=x, title=x, allowSearch=x == search_by).as_dict() for x in columns.split(",")
],
},
}

Expand Down
27 changes: 16 additions & 11 deletions src/databricks/labs/ucx/hive_metastore/grants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
from collections import defaultdict
from collections.abc import Iterator
from dataclasses import dataclass
from functools import partial

Expand Down Expand Up @@ -198,7 +197,7 @@ def _grants(
view: str | None = None,
any_file: bool = False,
anonymous_function: bool = False,
) -> Iterator[Grant]:
) -> list[Grant]:
"""
Fetches and yields grant information for the specified database objects.
Expand Down Expand Up @@ -239,23 +238,29 @@ def _grants(
anonymous_function=anonymous_function,
)
try:
grants = []
object_type_normalization = {"SCHEMA": "DATABASE", "CATALOG$": "CATALOG"}
for row in self._fetch(f"SHOW GRANTS ON {on_type} {key}"):
(principal, action_type, object_type, _) = row
if object_type in object_type_normalization:
object_type = object_type_normalization[object_type]
if on_type != object_type:
continue
yield Grant(
principal=principal,
action_type=action_type,
table=table,
view=view,
database=database,
catalog=catalog,
any_file=any_file,
anonymous_function=anonymous_function,
# we have to return concrete list, as with yield we're executing
# everything on the main thread.
grants.append(
Grant(
principal=principal,
action_type=action_type,
table=table,
view=view,
database=database,
catalog=catalog,
any_file=any_file,
anonymous_function=anonymous_function,
)
)
return grants
except Exception as e:
# TODO: https://github.com/databrickslabs/ucx/issues/406
logger.error(f"Couldn't fetch grants for object {on_type} {key}: {e}")
Expand Down
8 changes: 8 additions & 0 deletions src/databricks/labs/ucx/hive_metastore/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ def sql_alter_from(self, catalog):
)


@dataclass
class TableError:
catalog: str
database: str
name: str = None
error: str = None


class TablesCrawler(CrawlerBase):
def __init__(self, backend: SqlBackend, schema):
"""
Expand Down
59 changes: 56 additions & 3 deletions src/databricks/labs/ucx/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,30 @@
from databricks.sdk.service.workspace import ImportFormat

from databricks.labs.ucx.__about__ import __version__
from databricks.labs.ucx.assessment.crawlers import (
AzureServicePrincipalInfo,
ClusterInfo,
GlobalInitScriptInfo,
JobInfo,
PipelineInfo,
)
from databricks.labs.ucx.config import GroupsConfig, WorkspaceConfig
from databricks.labs.ucx.framework.crawlers import (
SchemaDeployer,
SqlBackend,
StatementExecutionBackend,
)
from databricks.labs.ucx.framework.dashboards import DashboardFromFiles
from databricks.labs.ucx.framework.install_state import InstallState
from databricks.labs.ucx.framework.tasks import _TASKS, Task
from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocation
from databricks.labs.ucx.hive_metastore.grants import Grant
from databricks.labs.ucx.hive_metastore.hms_lineage import HiveMetastoreLineageEnabler
from databricks.labs.ucx.hive_metastore.mounts import Mount
from databricks.labs.ucx.hive_metastore.tables import Table, TableError
from databricks.labs.ucx.runtime import main
from databricks.labs.ucx.workspace_access.base import Permissions
from databricks.labs.ucx.workspace_access.generic import WorkspaceObjectInfo

TAG_STEP = "step"
TAG_APP = "App"
Expand Down Expand Up @@ -89,12 +107,36 @@
logger = logging.getLogger(__name__)


def deploy_schema(sql_backend: SqlBackend, inventory_schema: str):
    """Create the inventory database together with every table and view UCX needs."""
    # Imported lazily so the ucx package anchor is only needed at install time.
    from databricks.labs import ucx

    deployer = SchemaDeployer(sql_backend, inventory_schema, ucx)
    deployer.deploy_schema()
    # Table name -> dataclass defining its schema; deployed in this exact order.
    inventory_tables = [
        ("azure_service_principals", AzureServicePrincipalInfo),
        ("clusters", ClusterInfo),
        ("global_init_scripts", GlobalInitScriptInfo),
        ("jobs", JobInfo),
        ("pipelines", PipelineInfo),
        ("external_locations", ExternalLocation),
        ("mounts", Mount),
        ("grants", Grant),
        ("tables", Table),
        ("table_failures", TableError),
        ("workspace_objects", WorkspaceObjectInfo),
        ("permissions", Permissions),
    ]
    for table_name, klass in inventory_tables:
        deployer.deploy_table(table_name, klass)
    deployer.deploy_view("objects", "queries/views/objects.sql")
    deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql")


class WorkspaceInstaller:
def __init__(self, ws: WorkspaceClient, *, prefix: str = "ucx", promtps: bool = True):
def __init__(
self, ws: WorkspaceClient, *, prefix: str = "ucx", promtps: bool = True, sql_backend: SqlBackend = None
):
if "DATABRICKS_RUNTIME_VERSION" in os.environ:
msg = "WorkspaceInstaller is not supposed to be executed in Databricks Runtime"
raise SystemExit(msg)
self._ws = ws
self._sql_backend = sql_backend
self._prefix = prefix
self._prompts = promtps
self._this_file = Path(__file__)
Expand All @@ -111,10 +153,16 @@ def _run_configured(self):
self._install_spark_config_for_hms_lineage()
self._create_dashboards()
self._create_jobs()
self._create_database()
readme = f'{self.notebook_link(f"{self._install_folder}/README.py")}'
msg = f"Installation completed successfully! Please refer to the {readme} notebook for next steps."
logger.info(msg)

def _create_database(self):
    """Deploy the inventory schema, lazily creating a SQL backend if none was injected."""
    backend = self._sql_backend
    if backend is None:
        # No backend supplied at construction time (the non-test path):
        # talk to the configured warehouse via the statement execution API.
        backend = StatementExecutionBackend(self._ws, self._current_config.warehouse_id)
        self._sql_backend = backend
    deploy_schema(backend, self._current_config.inventory_database)

def _install_spark_config_for_hms_lineage(self):
hms_lineage = HiveMetastoreLineageEnabler(ws=self._ws)
logger.info(
Expand Down Expand Up @@ -157,10 +205,15 @@ def _install_spark_config_for_hms_lineage(self):

@staticmethod
def run_for_config(
ws: WorkspaceClient, config: WorkspaceConfig, *, prefix="ucx", override_clusters: dict[str, str] | None = None
ws: WorkspaceClient,
config: WorkspaceConfig,
*,
prefix="ucx",
override_clusters: dict[str, str] | None = None,
sql_backend: SqlBackend = None,
) -> "WorkspaceInstaller":
logger.info(f"Installing UCX v{__version__} on {ws.config.host}")
workspace_installer = WorkspaceInstaller(ws, prefix=prefix, promtps=False)
workspace_installer = WorkspaceInstaller(ws, prefix=prefix, promtps=False, sql_backend=sql_backend)
logger.info(f"Installing UCX v{workspace_installer._version} on {ws.config.host}")
workspace_installer._config = config
workspace_installer._write_config()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- viz type=table, name=Azure Service Principals, columns=application_id,secret_scope,secret_key,tenant_id,storage_account
-- widget title=Azure Service Principals, col=0, row=49, size_x=6, size_y=8
-- widget title=Azure Service Principals, row=0, col=0, size_x=3, size_y=8
select
application_id,
if(secret_scope = '', "NA", secret_scope) secret_scope,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- viz type=counter, name=Workspace UC readiness, counter_label=UC readiness, value_column=readiness
-- widget row=0, col=0, size_x=1, size_y=3
-- Overall readiness: percentage of inventoried objects whose `failures` JSON array is empty.
WITH raw AS (
-- failures is a JSON-encoded array of strings; the literal '[]' means the object is UC-ready.
SELECT object_type, object_id, IF(failures == '[]', 1, 0) AS ready FROM $inventory.objects
)
SELECT CONCAT(ROUND(SUM(ready) / COUNT(*) * 100, 1), '%') AS readiness FROM raw
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=counter, name=Total Database Count, counter_label=Total Databases, value_column=count_total_databases
-- widget col=0, row=0, size_x=1, size_y=3
-- widget row=0, col=1, size_x=1, size_y=3
SELECT COUNT(DISTINCT `database`) AS count_total_databases
FROM $inventory.tables
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- viz type=counter, name=Storage Locations, counter_label=Storage Locations, value_column=count_total
-- widget row=0, col=4, size_x=1, size_y=3
-- Total number of external storage location rows recorded by the assessment crawler.
SELECT count(*) AS count_total
FROM $inventory.external_locations
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=counter, name=Total Table Count, counter_label=Total Tables, value_column=count_total_tables
-- widget col=1, row=0, size_x=1, size_y=3
-- widget row=0, col=3, size_x=1, size_y=3
SELECT count(*) AS count_total_tables
FROM $inventory.tables
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- viz type=table, name=Object Type Readiness, columns=object_type,readiness
-- widget title=Readiness, row=1, col=0, size_x=1, size_y=8
-- Per-object-type readiness: share of objects of each type with an empty `failures` array.
WITH raw AS (
SELECT object_type, object_id, IF(failures == '[]', 1, 0) AS ready FROM $inventory.objects
)
SELECT object_type, CONCAT(ROUND(SUM(ready) / COUNT(*) * 100, 1), '%') AS readiness
FROM raw
GROUP BY object_type
-- NOTE(review): readiness is a string like '87.5%', so this ORDER BY sorts
-- lexically rather than numerically — confirm this is the intended ordering.
ORDER BY readiness DESC
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- viz type=table, name=Failure Summary, search_by=failure, columns=failure,count
-- widget title=Failure Summary, row=1, col=1, size_x=2, size_y=8
-- Most frequent assessment failure messages, one row per distinct failure string.
WITH raw AS (
-- Explode each object's JSON failure list into one row per failure; objects without failures are skipped.
SELECT EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure FROM $inventory.objects WHERE failures <> '[]'
)
SELECT failure, COUNT(*) AS count FROM raw GROUP BY failure
ORDER BY count DESC, failure DESC
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
-- viz type=table, name=Table Types, columns=database,name,type,format,table_view,storage,is_delta,location
-- widget title=Table Types, col=0, row=3, size_x=6, size_y=6
SELECT `database`,
name,
-- viz type=table, name=Table Types, search_by=name, columns=name,type,format,storage,is_delta,location
-- widget title=Table Types, row=1, col=3, size_x=3, size_y=8
SELECT CONCAT(`database`, '.', name) AS name,
object_type AS type,
UPPER(table_format) AS format,
IF(object_type IN ("MANAGED", "EXTERNAL"), "TABLE", "VIEW") AS table_view,
CASE
WHEN STARTSWITH(location, "dbfs:/mnt") THEN "DBFS MOUNT"
WHEN STARTSWITH(location, "/dbfs/mnt") THEN "DBFS MOUNT"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
-- viz type=table, name=Database Summary, search_by=database, columns=database,upgrade,tables,views,dbfs_root,delta_tables,total_grants,granted_principals,database_grants,table_grants,service_principal_grants,user_grants,group_grants
-- widget title=Database Summary, col=0, row=2, size_x=6, size_y=8
-- One row per Hive database: a migration-strategy verdict plus table and grant statistics.
WITH table_stats AS (
-- Classify every table/view: object kind, storage family, and Delta-ness.
SELECT
`database`,
object_type,
UPPER(table_format) AS `format`,
`location`,
IF(object_type IN ("MANAGED", "EXTERNAL"), 1, 0) AS is_table,
IF(object_type = "VIEW", 1, 0) AS is_view,
-- DBFS-root data (outside /mnt) must be physically copied during migration.
CASE
WHEN STARTSWITH(location, "dbfs:/") AND NOT STARTSWITH(location, "dbfs:/mnt") THEN 1
WHEN STARTSWITH(location, "/dbfs/") AND NOT STARTSWITH(location, "/dbfs/mnt") THEN 1
ELSE 0
END AS is_dbfs_root,
-- wasb/adl(s) URIs are legacy Azure storage schemes unsupported in UC.
CASE
WHEN STARTSWITH(location, "wasb") THEN 1
WHEN STARTSWITH(location, "adl") THEN 1
ELSE 0
END AS is_unsupported,
-- NOTE(review): `format` refers to the UPPER(table_format) alias above, which
-- relies on lateral column alias resolution — confirm the target runtime
-- supports it (or that $inventory.tables has no conflicting `format` column).
IF(format = "DELTA", 1, 0) AS is_delta
FROM $inventory.tables
), database_stats AS (
-- Roll up per database and derive an upgrade strategy from the asset mix.
SELECT `database`,
CASE
WHEN SUM(is_table) = 0 AND SUM(is_view) > 0 THEN "View Migration Required"
WHEN SUM(is_dbfs_root)/SUM(is_table) > .3 THEN "Asset Replication Required"
WHEN SUM(is_delta)/SUM(is_table) < .7 THEN "Some Non Delta Assets"
WHEN SUM(is_unsupported)/SUM(is_table) > .7 THEN "Storage Migration Required"
ELSE "In Place Sync"
END AS upgrade,
SUM(is_table) AS tables,
SUM(is_view) AS views,
SUM(is_unsupported) AS unsupported,
SUM(is_dbfs_root) AS dbfs_root,
SUM(is_delta) AS delta_tables
FROM table_stats
GROUP BY `database`
), grant_stats AS (
-- Aggregate grants per database, split by object type and principal type.
SELECT
`database`,
COUNT(*) AS total_grants,
COUNT(DISTINCT principal) AS granted_principals,
SUM(IF(object_type == 'DATABASE', 1, 0)) AS database_grants,
SUM(IF(object_type == 'TABLE', 1, 0)) AS table_grants,
SUM(IF(principal_type == 'service-principal', 1, 0)) AS service_principal_grants,
SUM(IF(principal_type == 'user', 1, 0)) AS user_grants,
SUM(IF(principal_type == 'group', 1, 0)) AS group_grants
FROM $inventory.grant_detail
GROUP BY `database`
)
-- FULL JOIN keeps databases that have only tables or only grants.
SELECT * FROM database_stats FULL JOIN grant_stats USING (`database`)
ORDER BY tables DESC
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=table, name=External Locations, columns=location
-- widget title=External Locations, col=0, row=17, size_x=3, size_y=8
-- widget title=External Locations, row=3, col=0, size_x=3, size_y=8
SELECT location
FROM $inventory.external_locations
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- viz type=table, name=Mount Points, columns=name,source
-- widget title=Mount Points, col=3, row=17, size_x=3, size_y=8
-- widget title=Mount Points, row=3, col=3, size_x=3, size_y=8
SELECT name,
source
FROM $inventory.mounts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- viz type=table, name=Clusters, columns=failure,cluster_name,cluster_id,creator
-- widget title=Incompatible clusters, row=4, col=0, size_x=3, size_y=8
-- One row per (cluster, failure): UC-compatibility issues for interactive clusters.
SELECT EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure,
cluster_id,
cluster_name,
creator
FROM $inventory.clusters
-- Job clusters are transient and assessed via their jobs, so exclude them here.
WHERE NOT STARTSWITH(cluster_name, "job-")
ORDER BY cluster_id DESC
10 changes: 10 additions & 0 deletions src/databricks/labs/ucx/queries/assessment/main/04_3_jobs.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- viz type=table, name=Jobs, columns=failure,job_id,job_name,creator
-- widget title=Incompatible jobs, row=4, col=3, size_x=3, size_y=8
-- One row per (job, failure): UC-compatibility issues for workspace jobs.
SELECT
EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure,
job_id,
job_name,
creator
FROM $inventory.jobs
-- Exclude UCX's own workflows, which are named with a '[UCX]' prefix.
WHERE job_name not like '[UCX]%'
ORDER BY job_id DESC
Loading

0 comments on commit ed9775a

Please sign in to comment.