From 143c637ad9609ab388dee3d8e9dd8263811d8087 Mon Sep 17 00:00:00 2001
From: Pritish Pai <136742693+pritishpai@users.noreply.github.com>
Date: Thu, 26 Sep 2024 12:32:22 -0400
Subject: [PATCH 1/7] Added Py4j implementation of tables crawler to retrieve a
list of HMS tables in the assessment workflow (#2579)
## Changes
Tests, fixes, and uses the new `FasterTableScanCrawler` in the assessment
job. A feature flag determines whether the assessment uses the old Scala
code or the new Python implementation (via py4j), which provides better
logging during table scans.
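For reference, a minimal sketch (not the actual task wiring) of how the new
crawler is exercised by the `crawl_tables` task, based on the diff below;
`sql_backend` and `inventory_database` stand in for values that normally come
from the runtime context:

```python
from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler

def crawl_tables_sketch(sql_backend, inventory_database: str) -> list:
    # The crawler scans the Hive Metastore through spark._jsparkSession (py4j)
    # and persists the results into `$inventory_database.tables`.
    crawler = FasterTableScanCrawler(sql_backend, inventory_database)
    # snapshot() returns previously crawled rows, or performs a fresh scan.
    return list(crawler.snapshot())
```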
### Linked issues
Fix #2190
### Functionality
- [x] modified existing workflow: `assessment.crawl_tables` now uses the
new py4j crawler instead of the Scala one
### Tests
- [x] added integration tests
---------
Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com>
---
docs/table_persistence.md | 14 --
.../labs/ucx/assessment/workflows.py | 10 +-
.../labs/ucx/contexts/workflow_task.py | 5 +
.../labs/ucx/hive_metastore/tables.py | 51 +++++--
.../labs/ucx/hive_metastore/tables.scala | 123 ----------------
.../main/01_4_table_crawl_failures.sql | 59 ++++++++
.../integration/assessment/test_workflows.py | 24 +++
tests/integration/conftest.py | 9 ++
tests/unit/conftest.py | 88 ++++++++++-
tests/unit/hive_metastore/test_tables.py | 138 ++++++++++--------
10 files changed, 303 insertions(+), 218 deletions(-)
delete mode 100644 src/databricks/labs/ucx/hive_metastore/tables.scala
create mode 100644 src/databricks/labs/ucx/queries/assessment/main/01_4_table_crawl_failures.sql
diff --git a/docs/table_persistence.md b/docs/table_persistence.md
index 4448269624..948b276609 100644
--- a/docs/table_persistence.md
+++ b/docs/table_persistence.md
@@ -22,7 +22,6 @@ Table Utilization:
| pipelines | RW | RW | | | | | | |
| groups | RW | | RO | | | | | |
| table_size | RW | | | | | | | |
-| table_failures | RW | | | | | | | |
| submit_runs | RW | | | | | | | |
| policies | RW | RW | | | | | | |
| migration_status | | RW | | RW | | RW | | |
@@ -53,19 +52,6 @@ Holds Inventory of all tables in all databases and their relevant metadata.
-#### _$inventory_.table_failures
-
-Holds failures that occurred during crawling HMS tables
-
-| Column | Datatype | Description | Comments |
-|----------|----------|----------------------------------------------------------|----------|
-| catalog | string | Original catalog of the table. hive_metastore by default |
-| database | string | Original schema of the table |
-| name | string | Name of the table |
-| failures | string | Exception message context |
-
-
-
#### _$inventory_.grants
Inventory of all Table ACLs for tables indexed in tables table.
diff --git a/src/databricks/labs/ucx/assessment/workflows.py b/src/databricks/labs/ucx/assessment/workflows.py
index 6ba8d029db..8d5dd16a32 100644
--- a/src/databricks/labs/ucx/assessment/workflows.py
+++ b/src/databricks/labs/ucx/assessment/workflows.py
@@ -11,12 +11,14 @@ class Assessment(Workflow):
def __init__(self):
super().__init__('assessment')
- @job_task(notebook="hive_metastore/tables.scala")
+ @job_task
def crawl_tables(self, ctx: RuntimeContext):
"""Iterates over all tables in the Hive Metastore of the current workspace and persists their metadata, such
- as _database name_, _table name_, _table type_, _table location_, etc., in the table named
- `$inventory_database.tables`. The metadata stored is then used in the subsequent tasks and workflows to, for
- example, find all Hive Metastore tables that cannot easily be migrated to Unity Catalog."""
+ as _database name_, _table name_, _table type_, _table location_, etc., in the Delta table named
+ `$inventory_database.tables`. Note that the `inventory_database` is set in the configuration file. The metadata
+ stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
+ cannot easily be migrated to Unity Catalog."""
+ ctx.tables_crawler.snapshot()
@job_task
def crawl_udfs(self, ctx: RuntimeContext):
diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py
index 9e3cbabbad..488c224243 100644
--- a/src/databricks/labs/ucx/contexts/workflow_task.py
+++ b/src/databricks/labs/ucx/contexts/workflow_task.py
@@ -14,6 +14,7 @@
from databricks.labs.ucx.contexts.application import GlobalContext
from databricks.labs.ucx.hive_metastore import TablesInMounts
from databricks.labs.ucx.hive_metastore.table_size import TableSizeCrawler
+from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler
from databricks.labs.ucx.installer.logs import TaskRunWarningRecorder
@@ -81,6 +82,10 @@ def policies_crawler(self):
def global_init_scripts_crawler(self):
return GlobalInitScriptCrawler(self.workspace_client, self.sql_backend, self.inventory_database)
+ @cached_property
+ def tables_crawler(self):
+ return FasterTableScanCrawler(self.sql_backend, self.inventory_database)
+
@cached_property
def tables_in_mounts(self):
return TablesInMounts(
diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py
index 0545b584e9..7f8ecdfca3 100644
--- a/src/databricks/labs/ucx/hive_metastore/tables.py
+++ b/src/databricks/labs/ucx/hive_metastore/tables.py
@@ -411,12 +411,14 @@ def _crawl(self) -> Iterable[Table]:
After performing initial scan of all tables, starts making parallel
DESCRIBE TABLE EXTENDED queries for every table.
- Production tasks would most likely be executed through `tables.scala`
+ Production tasks would most likely be executed through FasterTableScanCrawler
within `crawl_tables` task due to `spark.sharedState.externalCatalog`
lower-level APIs not requiring a roundtrip to storage, which is not
possible for Azure storage with credentials supplied through Spark
conf (see https://github.com/databrickslabs/ucx/issues/249).
+ FasterTableScanCrawler uses the _jsparkSession to utilize faster scanning with Scala APIs.
+
See also https://github.com/databrickslabs/ucx/issues/247
"""
tasks = []
@@ -475,10 +477,14 @@ def _describe(self, catalog: str, database: str, table: str) -> Table | None:
class FasterTableScanCrawler(CrawlerBase):
- def _try_fetch(self) -> Iterable[Table]:
- """Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error"""
- for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"):
- yield Table(*row)
+ """
+ FasterTableScanCrawler is a specialized version of TablesCrawler that uses spark._jsparkSession to utilize
+ faster scanning with Scala APIs.
+
+ For integration testing, FasterTableScanCrawler is tested using the larger assessment test rather than
+ just the class. Testing the class individually would require utilizing a remote spark connection with the
+ Databricks workspace.
+ """
def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None):
self._backend = backend
@@ -504,17 +510,26 @@ def _option_as_python(scala_option: typing.Any):
return scala_option.get() if scala_option.isDefined() else None
def _all_databases(self) -> list[str]:
- if not self._include_database:
- return list(self._iterator(self._external_catalog.listDatabases()))
- return self._include_database
+ try:
+ if not self._include_database:
+ return list(self._iterator(self._external_catalog.listDatabases()))
+ return self._include_database
+ except Exception as err: # pylint: disable=broad-exception-caught
+ logger.error(f"failed-table-crawl: listing databases -> catalog : {err}", exc_info=True)
+ return []
def _list_tables(self, database: str) -> list[str]:
try:
return list(self._iterator(self._external_catalog.listTables(database)))
except Exception as err: # pylint: disable=broad-exception-caught
- logger.warning(f"Failed to list tables in {database}: {err}")
+ logger.warning(f"failed-table-crawl: listing tables from database -> {database} : {err}", exc_info=True)
return []
+ def _try_fetch(self) -> Iterable[Table]:
+ """Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error"""
+ for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"):
+ yield Table(*row)
+
@staticmethod
def _format_properties_list(properties_list: list) -> str:
if len(properties_list) == 0:
@@ -555,8 +570,8 @@ def _describe(self, catalog, database, table) -> Table | None:
view_text = self._option_as_python(raw_table.viewText())
table_properties = list(self._iterator(raw_table.properties()))
formatted_table_properties = self._format_properties_list(table_properties)
- except Exception: # pylint: disable=broad-exception-caught
- logger.warning(f"Couldn't fetch information for table: {full_name}", exc_info=True)
+ except Exception as e: # pylint: disable=broad-exception-caught
+ logger.warning(f"failed-table-crawl: describing table -> {full_name}: {e}", exc_info=True)
return None
return Table(
catalog=catalog,
@@ -583,9 +598,16 @@ def _crawl(self) -> Iterable[Table]:
catalog_tables, errors = Threads.gather("describing tables in ", tasks)
if len(errors) > 0:
logger.warning(f"Detected {len(errors)} errors while scanning tables in ")
+
+ logger.info(f"Finished scanning {len(catalog_tables)} tables")
return catalog_tables
def _get_table_names(self, database: str) -> list[str]:
+ """
+ Lists table names in the specified database.
+ :param database:
+ :return: list of table names
+ """
table_names = []
table_names_batches = Threads.strict('listing tables', [partial(self._list_tables, database)])
for table_batch in table_names_batches:
@@ -593,6 +615,13 @@ def _get_table_names(self, database: str) -> list[str]:
return table_names
def _create_describe_tasks(self, catalog: str, database: str, table_names: list[str]) -> list[partial]:
+ """
+ Creates a list of partial functions for describing tables.
+ :param catalog:
+ :param database:
+ :param table_names:
+ :return: list of partial functions
+ """
tasks = []
for table in table_names:
tasks.append(partial(self._describe, catalog, database, table))
diff --git a/src/databricks/labs/ucx/hive_metastore/tables.scala b/src/databricks/labs/ucx/hive_metastore/tables.scala
deleted file mode 100644
index 7f4e077e1f..0000000000
--- a/src/databricks/labs/ucx/hive_metastore/tables.scala
+++ /dev/null
@@ -1,123 +0,0 @@
-// Databricks notebook source
-import java.util.concurrent.ConcurrentLinkedQueue
-import scala.collection.JavaConverters
-import org.apache.hadoop.fs._
-import org.yaml.snakeyaml.Yaml
-import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException
-import org.apache.hadoop.hive.metastore.api.NoSuchObjectException
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
-import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions.{col,lower,upper}
-import org.apache.spark.sql.catalyst.TableIdentifier
-
-// must follow the same structure as databricks.labs.ucx.hive_metastore.tables.Table
-case class TableDetails(catalog: String, database: String, name: String, object_type: String,
- table_format: String, location: String, view_text: String, upgraded_to: String, storage_properties: String, is_partitioned: Boolean)
-// recording error log in the database
-case class TableError(catalog: String, database: String, name: String, error: String)
-
-val failures = new ConcurrentLinkedQueue[TableError]()
-
-def metadataForAllTables(databases: Seq[String], queue: ConcurrentLinkedQueue[TableError]): DataFrame = {
- import spark.implicits._
-
- val externalCatalog = spark.sharedState.externalCatalog
- databases.par.flatMap(databaseName => {
- val tables = try {
- externalCatalog.listTables(databaseName)
- } catch {
- case err: NoSuchDatabaseException =>
- failures.add(TableError("hive_metastore", databaseName, null, s"ignoring database because of ${err}"))
- null
- case err: AnalysisException =>
- failures.add(TableError("hive_metastore", databaseName, null, s"ignoring object because of ${err}"))
- null
- }
- if (tables == null) {
- failures.add(TableError("hive_metastore", databaseName, null, s"listTables returned null"))
- Seq()
- } else {
- tables.par.map(tableName => try {
- val table = externalCatalog.getTable(databaseName, tableName)
- if (table == null) {
- failures.add(TableError("hive_metastore", databaseName, tableName, s"result is null"))
- None
- } else {
- val upgraded_to = table.properties.get("upgraded_to")
- val redactedKey = "*********"
-
- val formattedString = table.storage.properties.map {
- case (key, value) =>
- if (key == "personalAccessToken")
- s"$key=$redactedKey(redacted)"
- else if (key.equalsIgnoreCase("password"))
- s"$key=$redactedKey(redacted)"
- else
- s"$key=$value"
- }.mkString("[", ", ", "]")
-
- val partitionColumnNames = try {
- spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName, Some(databaseName))).partitionColumnNames
- } catch {
- case e: Exception => null
- }
- val isPartitioned = if (partitionColumnNames != null && !partitionColumnNames.isEmpty) true else false
-
- Some(TableDetails("hive_metastore", databaseName, tableName, table.tableType.name, table.provider.getOrElse("UNKNOWN"),
- table.storage.locationUri.map(_.toString).orNull, table.viewText.orNull,
- upgraded_to match { case Some(target) => target case None => null }, formattedString, isPartitioned))
- }
- } catch {
- case err: Throwable =>
- failures.add(TableError("hive_metastore", databaseName, tableName, s"ignoring table because of ${err}"))
- None
- }).toList.collect {
- case Some(x) => x
- }
- }
- }).toList.toDF
-}
-
-def getConfig(): java.util.Map[String, Any] = {
- dbutils.widgets.text("config", "./config.yml")
- val configFile = dbutils.widgets.get("config")
- val fs = FileSystem.get(new java.net.URI("file:/Workspace"), sc.hadoopConfiguration)
- val file = fs.open(new Path(configFile))
- val configContents = org.apache.commons.io.IOUtils.toString(file, java.nio.charset.StandardCharsets.UTF_8)
- return new Yaml().load(configContents).asInstanceOf[java.util.Map[String, Any]]
-}
-
-def getInventoryDatabase(configObj:java.util.Map[String, Any]): String ={
- return configObj.get("inventory_database").toString()
-}
-
-def getIncludeDatabases(configObj:java.util.Map[String, Any], inventoryDatabase:String): Seq[String] ={
- val includeDatabases = JavaConverters.asScalaBuffer(configObj.getOrDefault("include_databases",new java.util.ArrayList[String]()).asInstanceOf[java.util.ArrayList[String]]).toList
-
- if (includeDatabases.isEmpty) {
- return spark.sharedState.externalCatalog.listDatabases().filter(_ != s"$inventoryDatabase")
- }
- return spark.sharedState.externalCatalog.listDatabases().filter(includeDatabases.contains(_))
-}
-
-val config = getConfig()
-val inventoryDatabase = getInventoryDatabase(config)
-val includeDatabases = getIncludeDatabases(config, inventoryDatabase)
-var df = metadataForAllTables(includeDatabases, failures)
-var columnsToMapLower = Array("catalog","database","name","upgraded_to","storage_properties")
-columnsToMapLower.map(column => {
- df = df.withColumn(column, lower(col(column)))
-})
-var columnsToMapUpper = Array("object_type","table_format")
-columnsToMapUpper.map(column => {
- df = df.withColumn(column, upper(col(column)))
-})
-df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(s"hive_metastore.$inventoryDatabase.tables")
-var dfTableFailures = JavaConverters.asScalaIteratorConverter(failures.iterator).asScala.toList.toDF
-columnsToMapLower = Array("catalog","database","name")
-columnsToMapLower.map(column => {
- dfTableFailures = dfTableFailures.withColumn(column, lower(col(column)))
-})
-
-dfTableFailures.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(s"hive_metastore.$inventoryDatabase.table_failures")
diff --git a/src/databricks/labs/ucx/queries/assessment/main/01_4_table_crawl_failures.sql b/src/databricks/labs/ucx/queries/assessment/main/01_4_table_crawl_failures.sql
new file mode 100644
index 0000000000..6af691a198
--- /dev/null
+++ b/src/databricks/labs/ucx/queries/assessment/main/01_4_table_crawl_failures.sql
@@ -0,0 +1,59 @@
+/*
+--title 'Table Crawl Failures'
+--height 4
+--width 4
+*/
+WITH latest_job_runs AS (
+ SELECT
+ timestamp,
+ job_id,
+ job_run_id
+ FROM (
+ SELECT
+ CAST(timestamp AS TIMESTAMP) AS timestamp,
+ job_id,
+ job_run_id,
+ ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY CAST(timestamp AS TIMESTAMP) DESC) = 1 AS latest_run_of_job
+ FROM inventory.logs
+ )
+ WHERE
+ latest_run_of_job
+), logs_latest_job_runs AS (
+ SELECT
+ CAST(logs.timestamp AS TIMESTAMP) AS timestamp,
+ message,
+ job_run_id,
+ job_id,
+ workflow_name,
+ task_name
+ FROM inventory.logs
+ JOIN latest_job_runs
+ USING (job_id, job_run_id)
+ WHERE
+ workflow_name IN ('assessment')
+), table_crawl_failures AS (
+SELECT
+ timestamp,
+ REGEXP_EXTRACT(message, '^failed-table-crawl: (.+?) -> (.+?): (.+)$', 1) AS error_reason,
+ REGEXP_EXTRACT(message, '^failed-table-crawl: (.+?) -> (.+?): (.+)$', 2) AS error_entity,
+ REGEXP_EXTRACT(message, '^failed-table-crawl: (.+?) -> (.+?): (.+)$', 3) AS error_message,
+ job_run_id,
+ job_id,
+ workflow_name,
+ task_name
+FROM logs_latest_job_runs
+ WHERE
+ STARTSWITH(message, 'failed-table-crawl: ')
+)
+SELECT
+ timestamp,
+ error_reason,
+ error_entity,
+ error_message,
+ job_run_id,
+ job_id,
+ workflow_name,
+ task_name
+FROM table_crawl_failures
+ORDER BY
+ 1
diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py
index 1110528bcf..c044a52b81 100644
--- a/tests/integration/assessment/test_workflows.py
+++ b/tests/integration/assessment/test_workflows.py
@@ -4,13 +4,19 @@
from databricks.sdk.retries import retried
from databricks.sdk.service.iam import PermissionLevel
+from databricks.labs.ucx.hive_metastore import TablesCrawler
+
+# pylint: disable=too-many-locals
@retried(on=[NotFound, InvalidParameterValue])
def test_running_real_assessment_job(
ws,
installation_ctx,
make_cluster_policy,
make_cluster_policy_permissions,
+ make_dashboard,
+ sql_backend,
+ inventory_schema,
populate_for_linting,
):
ws_group, _ = installation_ctx.make_ucx_group()
@@ -20,6 +26,14 @@ def test_running_real_assessment_job(
permission_level=PermissionLevel.CAN_USE,
group_name=ws_group.display_name,
)
+
+ source_schema = installation_ctx.make_schema(catalog_name="hive_metastore")
+ managed_table = installation_ctx.make_table(schema_name=source_schema.name)
+ external_table = installation_ctx.make_table(schema_name=source_schema.name, external=True)
+ tmp_table = installation_ctx.make_table(schema_name=source_schema.name, ctas="SELECT 2+2 AS four")
+ view = installation_ctx.make_table(schema_name=source_schema.name, ctas="SELECT 2+2 AS four", view=True)
+ non_delta = installation_ctx.make_table(schema_name=source_schema.name, non_delta=True)
+
installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"]
installation_ctx.workspace_installation.run()
@@ -29,4 +43,14 @@ def test_running_real_assessment_job(
assert installation_ctx.deployed_workflows.validate_step("assessment")
after = installation_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id)
+
assert after[ws_group.display_name] == PermissionLevel.CAN_USE
+
+ tables = set[str]()
+ local_crawler = TablesCrawler(sql_backend, inventory_schema, [source_schema.name])
+ for _ in local_crawler.snapshot():
+ tables.add(_.name)
+
+ expected_tables = {managed_table.name, external_table.name, tmp_table.name, view.name, non_delta.name}
+ assert len(tables) == len(expected_tables)
+ assert set(tables) == expected_tables
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 9e18b42284..c4dc8f4c33 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -563,6 +563,15 @@ def config(self) -> WorkspaceConfig:
include_databases=self.created_databases,
)
+ @cached_property
+ def tables_crawler(self) -> TablesCrawler:
+ """
+ Returns a TablesCrawler instance with the tables that were created in the context.
+ Overrides FasterTableScanCrawler with TablesCrawler, since DBR is not available when running integration tests.
+ :return: TablesCrawler
+ """
+ return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases)
+
def save_tables(self, is_hiveserde: bool = False):
# populate the tables crawled, as it is used by get_tables_to_migrate in the migrate-tables workflow
default_table_format = "HIVE" if is_hiveserde else ""
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 7c99db89ed..8f828a417b 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -8,6 +8,8 @@
from databricks.labs.blueprint.installation import MockInstallation
from databricks.labs.lsql.backends import MockBackend
+from databricks.labs.ucx.hive_metastore import TablesCrawler
+from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler
from databricks.labs.ucx.source_code.graph import BaseNotebookResolver
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.sdk import WorkspaceClient, AccountClient
@@ -49,8 +51,76 @@ def mock_installation() -> MockInstallation:
)
+class CustomIterator:
+ def __init__(self, values):
+ self._values = iter(values)
+ self._has_next = True
+
+ def hasNext(self): # pylint: disable=invalid-name
+ try:
+ self._next_value = next(self._values)
+ self._has_next = True
+ except StopIteration:
+ self._has_next = False
+ return self._has_next
+
+ def next(self):
+ if self._has_next:
+ return self._next_value
+ raise StopIteration
+
+
@pytest.fixture
-def run_workflow(mocker, mock_installation):
+def spark_table_crawl_mocker(mocker):
+ def create_product_element_mock(key, value):
+ def product_element_side_effect(index):
+ if index == 0:
+ return key
+ if index == 1:
+ return value
+ raise IndexError(f"Invalid index: {index}")
+
+ mock = mocker.Mock()
+ mock.productElement.side_effect = product_element_side_effect
+ return mock
+
+ mock_list_databases_iterator = mocker.Mock()
+ mock_list_databases_iterator.iterator.return_value = CustomIterator(["default", "test_database"])
+ mock_list_tables_iterator = mocker.Mock()
+ mock_list_tables_iterator.iterator.return_value = CustomIterator(["table1"])
+
+ mock_property_1 = create_product_element_mock("delta.appendOnly", "true")
+ mock_property_2 = create_product_element_mock("delta.autoOptimize", "false")
+ mock_property_pat = create_product_element_mock("personalAccessToken", "e32kfkasdas")
+ mock_property_password = create_product_element_mock("password", "very_secret")
+
+ mock_storage_properties_list = [
+ mock_property_1,
+ mock_property_2,
+ mock_property_pat,
+ mock_property_password,
+ ]
+ mock_properties_iterator = mocker.Mock()
+ mock_properties_iterator.iterator.return_value = CustomIterator(mock_storage_properties_list)
+
+ mock_partition_col_iterator = mocker.Mock()
+ mock_partition_col_iterator.iterator.return_value = CustomIterator(["age", "name"])
+
+ get_table_mock = mocker.Mock()
+ get_table_mock.provider().isDefined.return_value = True
+ get_table_mock.provider().get.return_value = "delta"
+ get_table_mock.storage().locationUri().isDefined.return_value = False
+
+ get_table_mock.viewText().isDefined.return_value = True
+ get_table_mock.viewText().get.return_value = "mock table text"
+ get_table_mock.properties.return_value = mock_properties_iterator
+ get_table_mock.partitionColumnNames.return_value = mock_partition_col_iterator
+
+ return mock_list_databases_iterator, mock_list_tables_iterator, get_table_mock
+
+
+@pytest.fixture
+def run_workflow(mocker, mock_installation, spark_table_crawl_mocker):
def inner(cb, **replace) -> RuntimeContext:
with _lock, patch.dict(os.environ, {"DATABRICKS_RUNTIME_VERSION": "14.0"}):
pyspark_sql_session = mocker.Mock()
@@ -66,6 +136,8 @@ def inner(cb, **replace) -> RuntimeContext:
replace['sql_backend'] = MockBackend()
if 'config' not in replace:
replace['config'] = mock_installation.load(WorkspaceConfig)
+ if 'tables_crawler' not in replace:
+ replace['tables_crawler'] = TablesCrawler(replace['sql_backend'], replace['config'].inventory_database)
module = __import__(cb.__module__, fromlist=[cb.__name__])
klass, method = cb.__qualname__.split('.', 1)
@@ -73,8 +145,20 @@ def inner(cb, **replace) -> RuntimeContext:
current_task = getattr(workflow, method)
ctx = RuntimeContext().replace(**replace)
+ if isinstance(ctx.tables_crawler, FasterTableScanCrawler):
+ mock_list_databases_iterator, mock_list_tables_iterator, get_table_mock = spark_table_crawl_mocker
+ # pylint: disable=protected-access
+ ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().listDatabases.return_value = (
+ mock_list_databases_iterator
+ )
+ ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().listTables.return_value = (
+ mock_list_tables_iterator
+ )
+ ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().getTable.return_value = (
+ get_table_mock
+ )
+ # pylint: enable=protected-access
current_task(ctx)
-
return ctx
yield inner
diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py
index b22f678a5a..c48698f53c 100644
--- a/tests/unit/hive_metastore/test_tables.py
+++ b/tests/unit/hive_metastore/test_tables.py
@@ -1,3 +1,4 @@
+import logging
import sys
import pytest
@@ -7,25 +8,6 @@
from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler, What, HiveSerdeType, FasterTableScanCrawler
-class CustomIterator:
- def __init__(self, values):
- self._values = iter(values)
- self._has_next = True
-
- def hasNext(self): # pylint: disable=invalid-name
- try:
- self._next_value = next(self._values)
- self._has_next = True
- except StopIteration:
- self._has_next = False
- return self._has_next
-
- def next(self):
- if self._has_next:
- return self._next_value
- raise StopIteration
-
-
def test_is_delta_true():
delta_table = Table(catalog="catalog", database="db", name="table", object_type="type", table_format="DELTA")
assert delta_table.is_delta
@@ -570,60 +552,17 @@ def test_fast_table_scan_crawler_already_crawled(mocker):
assert len(results) == 3
-def test_fast_table_scan_crawler_crawl_new(caplog, mocker):
+def test_fast_table_scan_crawler_crawl_new(caplog, mocker, spark_table_crawl_mocker):
pyspark_sql_session = mocker.Mock()
sys.modules["pyspark.sql.session"] = pyspark_sql_session
- def create_product_element_mock(key, value):
- def product_element_side_effect(index):
- if index == 0:
- return key
- if index == 1:
- return value
- raise IndexError(f"Invalid index: {index}")
-
- mock = mocker.Mock()
- mock.productElement.side_effect = product_element_side_effect
- return mock
-
errors = {}
rows = {
"hive_metastore.inventory_database.tables": [],
}
sql_backend = MockBackend(fails_on_first=errors, rows=rows)
ftsc = FasterTableScanCrawler(sql_backend, "inventory_database")
-
- mock_list_databases_iterator = mocker.Mock()
- mock_list_databases_iterator.iterator.return_value = CustomIterator(["default", "test_database"])
- mock_list_tables_iterator = mocker.Mock()
- mock_list_tables_iterator.iterator.return_value = CustomIterator(["table1"])
-
- mock_property_1 = create_product_element_mock("delta.appendOnly", "true")
- mock_property_2 = create_product_element_mock("delta.autoOptimize", "false")
- mock_property_pat = create_product_element_mock("personalAccessToken", "e32kfkasdas")
- mock_property_password = create_product_element_mock("password", "very_secret")
-
- mock_storage_properties_list = [
- mock_property_1,
- mock_property_2,
- mock_property_pat,
- mock_property_password,
- ]
- mock_properties_iterator = mocker.Mock()
- mock_properties_iterator.iterator.return_value = CustomIterator(mock_storage_properties_list)
-
- mock_partition_col_iterator = mocker.Mock()
- mock_partition_col_iterator.iterator.return_value = CustomIterator(["age", "name"])
-
- get_table_mock = mocker.Mock()
- get_table_mock.provider().isDefined.return_value = True
- get_table_mock.provider().get.return_value = "delta"
- get_table_mock.storage().locationUri().isDefined.return_value = False
-
- get_table_mock.viewText().isDefined.return_value = True
- get_table_mock.viewText().get.return_value = "mock table text"
- get_table_mock.properties.return_value = mock_properties_iterator
- get_table_mock.partitionColumnNames.return_value = mock_partition_col_iterator
+ mock_list_databases_iterator, mock_list_tables_iterator, get_table_mock = spark_table_crawl_mocker
# pylint: disable=protected-access
ftsc._spark._jsparkSession.sharedState().externalCatalog().listDatabases.return_value = mock_list_databases_iterator
@@ -642,3 +581,74 @@ def product_element_side_effect(index):
assert results[0].storage_properties == (
"[delta.appendOnly=true, " "delta.autoOptimize=false, " "personalAccessToken=*******, " "password=*******]"
)
+
+
+def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(caplog, mocker, spark_table_crawl_mocker):
+
+ pyspark_sql_session = mocker.Mock()
+ sys.modules["pyspark.sql.session"] = pyspark_sql_session
+
+ errors = {}
+ rows = {
+ "hive_metastore.inventory_database.tables": [],
+ }
+ sql_backend = MockBackend(fails_on_first=errors, rows=rows)
+ ftsc = FasterTableScanCrawler(sql_backend, "inventory_database")
+
+ # pylint: disable=protected-access
+ ftsc._spark._jsparkSession.sharedState().externalCatalog().listDatabases.side_effect = Exception(
+ "Test listDatabases warning"
+ )
+
+ with caplog.at_level(logging.WARNING):
+ ftsc.snapshot()
+ assert "Test listDatabases warning" in caplog.text
+
+
+def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(caplog, mocker, spark_table_crawl_mocker):
+
+ pyspark_sql_session = mocker.Mock()
+ sys.modules["pyspark.sql.session"] = pyspark_sql_session
+
+ errors = {}
+ rows = {
+ "hive_metastore.inventory_database.tables": [],
+ }
+ sql_backend = MockBackend(fails_on_first=errors, rows=rows)
+ ftsc = FasterTableScanCrawler(sql_backend, "inventory_database")
+
+ mock_list_databases_iterator, _, _ = spark_table_crawl_mocker
+
+ # pylint: disable=protected-access
+ ftsc._spark._jsparkSession.sharedState().externalCatalog().listDatabases.return_value = mock_list_databases_iterator
+ ftsc._spark._jsparkSession.sharedState().externalCatalog().listTables.side_effect = Exception(
+ "Test listTables warning"
+ )
+
+ with caplog.at_level(logging.WARNING):
+ ftsc.snapshot()
+ assert "Test listTables warning" in caplog.text
+
+
+def test_fast_table_scan_crawler_crawl_test_warnings_get_table(caplog, mocker, spark_table_crawl_mocker):
+
+ pyspark_sql_session = mocker.Mock()
+ sys.modules["pyspark.sql.session"] = pyspark_sql_session
+
+ errors = {}
+ rows = {
+ "hive_metastore.inventory_database.tables": [],
+ }
+ sql_backend = MockBackend(fails_on_first=errors, rows=rows)
+ ftsc = FasterTableScanCrawler(sql_backend, "inventory_database")
+
+ mock_list_databases_iterator, mock_list_tables_iterator, _ = spark_table_crawl_mocker
+
+ # pylint: disable=protected-access
+ ftsc._spark._jsparkSession.sharedState().externalCatalog().listDatabases.return_value = mock_list_databases_iterator
+ ftsc._spark._jsparkSession.sharedState().externalCatalog().listTables.return_value = mock_list_tables_iterator
+ ftsc._spark._jsparkSession.sharedState().externalCatalog().getTable.side_effect = Exception("Test getTable warning")
+
+ with caplog.at_level(logging.WARNING):
+ ftsc.snapshot()
+ assert "Test getTable warning" in caplog.text
From 615dafa7cc0afb79cf3ed7c1ff62ffe8cb36f1ca Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Thu, 26 Sep 2024 18:33:02 +0200
Subject: [PATCH 2/7] Clone solacc repos 1 by 1 and delete them once linting is
done (#2753)
## Changes
The current implementation clones all `solacc` repos at once, occupying 1.5 GB,
which brings us close to the storage limits in CI.
This PR fixes that by cloning the repos one by one and deleting each once
linting is done.
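Roughly, the new flow looks like the sketch below (function and variable names
follow `solacc.py`; the actual `git clone` and linting calls are elided):

```python
import os
import shutil
from pathlib import Path

def lint_repos_sketch(clone_urls: dict[str, str], dist: Path, lint_dir) -> None:
    for name in sorted(clone_urls, key=str.casefold):
        sol_dir = dist / name                  # clone a single repo ...
        # run_command(f'git clone {clone_urls[name]} {sol_dir}')
        lint_dir(sol_dir)                      # ... lint just that solution ...
        if os.getenv("CI"):
            shutil.rmtree(sol_dir)             # ... and free the space in CI
```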
### Linked issues
None
### Functionality
None
### Tests
- [x] manually tested
---------
Co-authored-by: Eric Vergnaud
---
tests/integration/source_code/solacc.py | 58 +++++++++++++++----------
1 file changed, 35 insertions(+), 23 deletions(-)
diff --git a/tests/integration/source_code/solacc.py b/tests/integration/source_code/solacc.py
index 9ae1fdb748..5f61b30b40 100644
--- a/tests/integration/source_code/solacc.py
+++ b/tests/integration/source_code/solacc.py
@@ -22,9 +22,9 @@
dist = (this_file / '../../../../dist').resolve().absolute()
-def _clone_all():
+def _get_repos_to_clone() -> dict[str, str]:
params = {'per_page': 100, 'page': 1}
- to_clone = []
+ to_clone: dict[str, str] = {}
while True:
result = requests.get(
'https://api.github.com/orgs/databricks-industry-solutions/repos',
@@ -35,18 +35,24 @@ def _clone_all():
break
if 'message' in result:
logger.error(result['message'])
- return
+ return to_clone
params['page'] += 1
for repo in result:
- to_clone.append(repo['clone_url'])
+ name = repo['name']
+ if name == '.github':
+ continue
+ to_clone[name] = repo['clone_url']
+ return to_clone
+
+
+def _clone_repo(repo_url, repo_name):
dist.mkdir(exist_ok=True)
- to_clone = sorted(to_clone) # [:10]
- for url in to_clone:
- dst = dist / url.split("/")[-1].split(".")[0]
- if dst.exists():
- continue
- logger.info(f'Cloning {url} into {dst}')
- run_command(f'git clone {url} {dst}')
+ dst = dist / repo_name
+ if dst.exists():
+ return dst
+ logger.info(f'Cloning {repo_url} into {dst}')
+ run_command(f'git clone {repo_url} {dst}')
+ return dst
def _collect_missing_imports(advices: list[LocatedAdvice]):
@@ -169,11 +175,20 @@ def _lint_dir(solacc: _SolaccContext, soldir: Path):
path_lookup.clean_tmp_sys_paths()
-def _lint_dirs(dir_to_lint: str | None):
- solacc = _SolaccContext.create(dir_to_lint is not None)
- all_dirs = os.listdir(dist) if dir_to_lint is None else [dir_to_lint]
- for soldir in all_dirs:
- _lint_dir(solacc, dist / soldir)
+def _lint_repos(clone_urls, sol_to_lint: str | None):
+ solacc = _SolaccContext.create(sol_to_lint is not None)
+ if sol_to_lint:
+ # don't clone if linting just one file, assumption is we're troubleshooting
+ _lint_dir(solacc, dist / sol_to_lint)
+ else:
+ names: list[str] = list(clone_urls.keys())
+ for name in sorted(names, key=str.casefold):
+ logger.info(f"Cloning {name}...")
+ sol_dir = _clone_repo(clone_urls[name], name)
+ logger.info(f"Linting {name}...")
+ _lint_dir(solacc, sol_dir)
+ if os.getenv("CI"):
+ shutil.rmtree(sol_dir)
all_files_len = solacc.total_count - (len(solacc.files_to_skip) if solacc.files_to_skip else 0)
parseable_pct = int(solacc.parseable_count / all_files_len * 100)
missing_imports_count = sum(sum(details.values()) for details in solacc.missing_imports.values())
@@ -192,13 +207,10 @@ def _lint_dirs(dir_to_lint: str | None):
def main(args: list[str]):
install_logger()
logging.root.setLevel(logging.INFO)
- dir_to_lint = args[1] if len(args) > 1 else None
- if not dir_to_lint:
- # don't clone if linting just one file, assumption is we're troubleshooting
- logger.info("Cloning...")
- _clone_all()
- logger.info("Linting...")
- _lint_dirs(dir_to_lint)
+ sol_to_lint = args[1] if len(args) > 1 else None
+ logger.info("Fetching repos to clone...")
+ repo_urls = _get_repos_to_clone()
+ _lint_repos(repo_urls, sol_to_lint)
if __name__ == "__main__":
From e3d34d1599432fec8ad785f885f9eb78f57bc98f Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Thu, 26 Sep 2024 18:33:31 +0200
Subject: [PATCH 3/7] Documentation: table utilization updates (#2755)
## Changes
This PR updates the table utilization table:
- Adds the missing linter tables.
- Since #2678 the assessment workflow also writes to
`workflow_problems`.
---
docs/table_persistence.md | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/docs/table_persistence.md b/docs/table_persistence.md
index 948b276609..4aecd9469d 100644
--- a/docs/table_persistence.md
+++ b/docs/table_persistence.md
@@ -14,6 +14,8 @@ Table Utilization:
| permissions | RW | | RW | RO | | RO | | |
| jobs | RW | RW | | | RO | | | |
| clusters | RW | RW | | | | | | |
+| directfs_in_paths | RW | | | | | | | RW |
+| directfs_in_queries | RW | | | | | | | RW |
| external_locations | RW | | | RO | | | | |
| workspace | RW | | RO | | RO | | | |
| workspace_objects | RW | | | | | | | |
@@ -25,7 +27,8 @@ Table Utilization:
| submit_runs | RW | | | | | | | |
| policies | RW | RW | | | | | | |
| migration_status | | RW | | RW | | RW | | |
-| workflow_problems | | | | | | | | RW |
+| query_problems | RW | | | | | | | RW |
+| workflow_problems | RW | | | | | | | RW |
| udfs | RW | RW | RO | | | | | |
| logs | RW | | RW | RW | | RW | RW | |
| recon_results | | | | | | | RW | |
From fc4ec374ed98910131f0faf9283da1640b31745e Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Thu, 26 Sep 2024 18:34:40 +0200
Subject: [PATCH 4/7] Fix sqlglot crasher with 'drop schema ...' statement
(#2758)
## Changes
Fixes a crash when linting a `DROP SCHEMA ...` statement parsed by sqlglot.
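As an illustration, a sketch mirroring the guard added below (not the full
linter): sqlglot surfaces the schema of a `DROP SCHEMA` statement as a `Table`
node, which previously tripped the table-migration lookup.

```python
from sqlglot import parse as parse_sql
from sqlglot.expressions import Drop, Table

for expression in parse_sql("DROP SCHEMA xyz"):
    for table in expression.find_all(Table):
        if isinstance(expression, Drop) and getattr(expression, "kind", None) == "SCHEMA":
            # this "table" is really the schema being dropped; skip it
            continue
        # ... normal table-migration checks would run here ...
```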
### Linked issues
None
### Functionality
None
### Tests
- [x] added unit tests
---------
Co-authored-by: Eric Vergnaud
---
.../ucx/source_code/linters/from_table.py | 82 +++++++++++--------
.../source_code/linters/test_from_table.py | 8 ++
2 files changed, 58 insertions(+), 32 deletions(-)
diff --git a/src/databricks/labs/ucx/source_code/linters/from_table.py b/src/databricks/labs/ucx/source_code/linters/from_table.py
index 9da4dd7a2c..45ffc01697 100644
--- a/src/databricks/labs/ucx/source_code/linters/from_table.py
+++ b/src/databricks/labs/ucx/source_code/linters/from_table.py
@@ -1,8 +1,8 @@
import logging
from sqlglot import parse as parse_sql
-from sqlglot.expressions import Table, Expression, Use, Create
+from sqlglot.expressions import Table, Expression, Use, Create, Drop
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
-from databricks.labs.ucx.source_code.base import Deprecation, CurrentSessionState, SqlLinter, Fixer
+from databricks.labs.ucx.source_code.base import Deprecation, CurrentSessionState, SqlLinter, Fixer, Failure
logger = logging.getLogger(__name__)
@@ -43,37 +43,55 @@ def schema(self):
def lint_expression(self, expression: Expression):
for table in expression.find_all(Table):
- if isinstance(expression, Use):
- # Sqlglot captures the database name in the Use statement as a Table, with
- # the schema as the table name.
- self._session_state.schema = table.name
- continue
- if isinstance(expression, Create) and getattr(expression, "kind", None) == "SCHEMA":
- # Sqlglot captures the schema name in the Create statement as a Table, with
- # the schema as the db name.
- self._session_state.schema = table.db
- continue
+ try:
+ yield from self._unsafe_lint_expression(expression, table)
+ except Exception as _: # pylint: disable=broad-exception-caught
+ yield Failure(
+ code='sql-parse-error',
+ message=f"Could not parse SQL expression: {expression} ",
+ # SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159
+ start_line=0,
+ start_col=0,
+ end_line=0,
+ end_col=1024,
+ )
- # we only migrate tables in the hive_metastore catalog
- if self._catalog(table) != 'hive_metastore':
- continue
- # Sqlglot uses db instead of schema, watch out for that
- src_schema = table.db if table.db else self._session_state.schema
- if not src_schema:
- logger.error(f"Could not determine schema for table {table.name}")
- continue
- dst = self._index.get(src_schema, table.name)
- if not dst:
- continue
- yield Deprecation(
- code='table-migrated-to-uc',
- message=f"Table {src_schema}.{table.name} is migrated to {dst.destination()} in Unity Catalog",
- # SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159
- start_line=0,
- start_col=0,
- end_line=0,
- end_col=1024,
- )
+ def _unsafe_lint_expression(self, expression: Expression, table: Table):
+ if isinstance(expression, Use):
+ # Sqlglot captures the database name in the Use statement as a Table, with
+ # the schema as the table name.
+ self._session_state.schema = table.name
+ return
+ if isinstance(expression, Drop) and getattr(expression, "kind", None) == "SCHEMA":
+ # Sqlglot captures the schema name in the Drop statement as a Table, with
+ # the schema as the db name.
+ return
+ if isinstance(expression, Create) and getattr(expression, "kind", None) == "SCHEMA":
+ # Sqlglot captures the schema name in the Create statement as a Table, with
+ # the schema as the db name.
+ self._session_state.schema = table.db
+ return
+
+ # we only migrate tables in the hive_metastore catalog
+ if self._catalog(table) != 'hive_metastore':
+ return
+ # Sqlglot uses db instead of schema, watch out for that
+ src_schema = table.db if table.db else self._session_state.schema
+ if not src_schema:
+ logger.error(f"Could not determine schema for table {table.name}")
+ return
+ dst = self._index.get(src_schema, table.name)
+ if not dst:
+ return
+ yield Deprecation(
+ code='table-migrated-to-uc',
+ message=f"Table {src_schema}.{table.name} is migrated to {dst.destination()} in Unity Catalog",
+ # SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159
+ start_line=0,
+ start_col=0,
+ end_line=0,
+ end_col=1024,
+ )
@staticmethod
def _catalog(table):
diff --git a/tests/unit/source_code/linters/test_from_table.py b/tests/unit/source_code/linters/test_from_table.py
index 6469b1c622..921486ecc5 100644
--- a/tests/unit/source_code/linters/test_from_table.py
+++ b/tests/unit/source_code/linters/test_from_table.py
@@ -87,6 +87,14 @@ def test_parses_create_schema(migration_index):
assert not list(advices)
+def test_parses_drop_schema(migration_index):
+ query = "DROP SCHEMA xyz"
+ session_state = CurrentSessionState(schema="old")
+ ftf = FromTableSqlLinter(migration_index, session_state=session_state)
+ advices = ftf.lint(query)
+ assert not list(advices)
+
+
def test_raises_advice_when_parsing_unsupported_sql(migration_index):
query = "XDESCRIBE DETAILS xyz" # not a valid query
session_state = CurrentSessionState(schema="old")
From cad665cb5425010bd9cca1e9e4c889540d1aee58 Mon Sep 17 00:00:00 2001
From: Pritish Pai <136742693+pritishpai@users.noreply.github.com>
Date: Thu, 26 Sep 2024 12:35:07 -0400
Subject: [PATCH 5/7] Fixes error message (#2759)
## Changes
Fixes an incorrect error message discovered while working on something else.
---
src/databricks/labs/ucx/hive_metastore/mapping.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py
index 3aef32ba20..04ee92ec94 100644
--- a/src/databricks/labs/ucx/hive_metastore/mapping.py
+++ b/src/databricks/labs/ucx/hive_metastore/mapping.py
@@ -113,7 +113,7 @@ def load(self) -> list[Rule]:
try:
return self._installation.load(list[Rule], filename=self.FILENAME)
except NotFound:
- msg = "Please run: databricks labs ucx table-mapping"
+ msg = "Please run: databricks labs ucx create-table-mapping"
raise ValueError(msg) from None
def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]):
From d14a2cf98c745224aee01be65a4e12162db4fd85 Mon Sep 17 00:00:00 2001
From: Serge Smertin <259697+nfx@users.noreply.github.com>
Date: Thu, 26 Sep 2024 18:45:20 +0200
Subject: [PATCH 6/7] Release v0.38.0 (#2762)
* Added Py4j implementation of tables crawler to retrieve a list of HMS
tables in the assessment workflow
([#2579](https://github.com/databrickslabs/ucx/issues/2579)). In this
release, we have added a Py4j implementation of a tables crawler to
retrieve a list of Hive Metastore tables in the assessment workflow. A
new `FasterTableScanCrawler` class has been introduced, which can be
used in the Assessment Job based on a feature flag to replace the old
Scala code, allowing for better logging during table scans. The existing
`assessment.crawl_tables` workflow now utilizes the new py4j crawler
instead of the scala one. Integration tests have been added to ensure
the functionality works correctly. The commit also includes a new method
for listing table names in the specified database and improvements to
error handling and logging mechanisms. The new Py4j tables crawler
enhances the functionality of the assessment workflow by improving error
handling, resulting in better logging and faster table scanning during
the assessment process. This change is part of addressing issue
[#2190](https://github.com/databrickslabs/ucx/issues/2190) and was
co-authored by Serge Smertin.
* Added `create-ucx-catalog` cli command
([#2694](https://github.com/databrickslabs/ucx/issues/2694)). A new CLI
command, `create-ucx-catalog`, has been added to create a catalog for
migration tracking that can be used across multiple workspaces. The
command creates a UCX catalog for tracking migration status and
artifacts, and is created by running `databricks labs ucx
create-ucx-catalog` and specifying the storage location for the catalog.
Relevant user documentation, unit tests, and integration tests have been
added for this command. The `assign-metastore` command has also been
updated to allow for the selection of a metastore when multiple
metastores are available in the workspace region. This change improves
the migration tracking feature and enhances the user experience.
* Added experimental `migration-progress-experimental` workflow
([#2658](https://github.com/databrickslabs/ucx/issues/2658)). This
commit introduces an experimental workflow,
`migration-progress-experimental`, which refreshes the inventory for
various resources such as clusters, grants, jobs, pipelines, policies,
tables, TableMigrationStatus, and UDFs. The workflow can be triggered
using the `databricks labs ucx migration-progress` CLI command and uses
a new implementation of a Scala-based crawler, `TablesCrawler`, which
will eventually replace the current implementation. The new workflow is
a duplicate of most of the `assessment` pipeline's functionality but
with some differences, such as the use of `TablesCrawler`. Relevant user
documentation has been added, along with unit tests, integration tests,
and a screenshot of a successful staging environment run. The new
workflow is expected to run on a schedule in the future. This change
resolves [#2574](https://github.com/databrickslabs/ucx/issues/2574) and
progresses [#2074](https://github.com/databrickslabs/ucx/issues/2074).
* Added handling for `InternalError` in `Listing.__iter__`
([#2697](https://github.com/databrickslabs/ucx/issues/2697)). This
release introduces improved error handling in the `Listing.__iter__`
method of the `Generic` class, located in the
`workspace_access/generic.py` file. Previously, only `NotFound`
exceptions were handled, but now both `InternalError` and `NotFound`
exceptions are caught and logged appropriately. This change enhances the
robustness of the method, which is responsible for listing objects of a
specific type and returning them as `GenericPermissionsInfo` objects. To
ensure the correct functionality, we have added new unit tests and
manual testing. The logging of the `InternalError` exception is properly
handled in the `GenericPermissionsSupport` class when listing serving
endpoints. This behavior is verified by the newly added test function
`test_internal_error_in_serving_endpoints_raises_warning` and the
updated `test_serving_endpoints_not_enabled_raises_warning`.
* Added handling for `PermissionDenied` when listing accessible
workspaces ([#2733](https://github.com/databrickslabs/ucx/issues/2733)).
A new `can_administer` method has been added to the `Workspaces` class
in the `workspaces.py` file, which allows for more fine-grained control
over which users can administer workspaces. This method checks if the
user has access to a given workspace and is a member of the workspace's
`admins` group, indicating that the user has administrative privileges
for that workspace. If the user does not have access to the workspace or
is not a member of the `admins` group, the method returns `False`.
Additionally, error handling in the `get_accessible_workspaces` method
has been improved by adding a `PermissionDenied` exception to the list
of exceptions that are caught and logged. New unit tests have been added
for the `AccountWorkspaces` class of the
`databricks.labs.blueprint.account` module to ensure that the new method
is functioning as intended, specifically checking if a user is a
workspace administrator based on whether they belong to the `admins`
group. The linked issue
[#2732](https://github.com/databrickslabs/ucx/issues/2732) is resolved
by this change. All changes have been manually and unit tested.
* Added static code analysis results to assessment dashboard
([#2696](https://github.com/databrickslabs/ucx/issues/2696)). This
commit introduces two new tasks, `assess_dashboards` and
`assess_workflows`, to the existing assessment dashboard for identifying
migration problems in dashboards and workflows. These tasks analyze
embedded queries and notebooks for migration issues and collect direct
filesystem access patterns requiring attention. Upon completion, the
results are stored in the inventory database and displayed on the
Migration dashboard. Additionally, two new widgets, job/query problem
widgets and directfs access widgets, have been added to enhance the
dashboard's functionality by providing additional information related to
code compatibility and access control. Integration tests using mock data
have been added and manually tested to ensure the proper functionality
of these new features. This update improves the overall assessment and
compatibility checking capabilities of the dashboard, making it easier
for users to identify and address issues related to Unity Catalog
compatibility in their workflows and dashboards.
* Added unskip CLI command to undo a skip on schema or a table
([#2727](https://github.com/databrickslabs/ucx/issues/2727)). This pull
request introduces a new CLI command, "unskip", which allows users to
reverse a previously applied `skip` on a schema or table. The `unskip`
command accepts a required `--schema` parameter and an optional
`--table` parameter. A new function, also named "unskip", has been
added, which takes the same parameters as the `skip` command. The
function checks for the required `--schema` parameter and creates a new
WorkspaceContext object to call the appropriate method on the
table_mapping object. Two new methods, `unskip_schema` and
"unskip_table_or_view", have been added to the HiveMapping class. These
methods remove the skip mark from a schema or table, respectively, and
handle exceptions such as NotFound and BadRequest. The
get_tables_to_migrate method has been updated to consider the unskipped
tables or schemas. Currently, the feature is tested manually and has not
been added to the user documentation.
* Added unskip CLI command to undo a skip on schema or a table
([#2734](https://github.com/databrickslabs/ucx/issues/2734)). A new
`unskip` CLI command has been added to the project, which allows users
to remove the `skip` mark set by the existing `skip` command on a
specified schema or table. This command takes an optional `--table`
flag, and if not provided, it will unskip the entire schema. The new
functionality is accompanied by a unit test and relevant user
documentation, and addresses issue
[#1938](https://github.com/databrickslabs/ucx/issues/1938). The
implementation includes the addition of the `unskip_table_or_view`
method, which generates the appropriate `ALTER TABLE/VIEW` statement to
remove the skip marker, and updates to the `unskip_schema` method to
include the schema name in the `ALTER SCHEMA` statement. Additionally,
exception handling has been updated to include `NotFound` and
`BadRequest` exceptions. This feature simplifies the process of undoing
a skip on a schema, table, or view in the Hive metastore, which
previously required manual editing of the Hive metastore properties.
* Assess source code as part of the assessment
([#2678](https://github.com/databrickslabs/ucx/issues/2678)). This
commit introduces enhancements to the assessment workflow, including the
addition of two new tasks for evaluating source code from SQL queries in
dashboards and from notebooks/files in jobs and tasks. The existing
`databricks labs install ucx` command has been modified to incorporate
linting during the assessment. The `QueryLinter` class has been updated
to accept an additional argument for linting source code. These changes
have been thoroughly tested through integration tests to ensure proper
functionality. Co-authored by Eric Vergnaud.
* Bump astroid version, pylint version and drop our f-string workaround
([#2746](https://github.com/databrickslabs/ucx/issues/2746)). In this
update, we have bumped the versions of astroid and pylint to 3.3.1 and
removed workarounds related to f-string inference limitations in
previous versions of astroid (< 3.3). These workarounds were necessary
for handling issues such as uninferrable sys.path values and the lack of
f-string inference in loops. We have also updated corresponding tests to
reflect these changes and improve the overall code quality and
maintainability of the project. These changes are part of a larger
effort to update dependencies and simplify the codebase by leveraging
the latest features of updated tools and removing obsolete workarounds.
* Delete temporary files when running solacc
([#2750](https://github.com/databrickslabs/ucx/issues/2750)). This
commit includes changes to the `solacc.py` script to improve the linting
process for the `solacc` repository, specifically targeting the issue of
excessive temporary files that were exceeding CI storage capacity. The
modifications include linting the repository on a per-top-level
`solution` basis, where each solution resides within the top folders and
is independent of others. Post-linting, temporary files and directories
registered in `PathLookup` are deleted to enhance storage efficiency.
Additionally, this commit prepares for improving false positive
detection and introduces a new `SolaccContext` class that tracks various
aspects of the linting process, providing more detailed feedback on the
linting results. This change does not introduce new functionality or
modify existing functionality, but rather optimizes the linting process
for the `solacc` repository, maintaining CI storage capacity levels
within acceptable limits.
* Don't report direct filesystem access for API calls
([#2689](https://github.com/databrickslabs/ucx/issues/2689)). This
release introduces enhancements to the Direct File System Access (DFSA)
linter, resolving false positives in API call reporting. The
`ws.api_client.do` call previously triggered inaccurate direct
filesystem access alerts, which have been addressed by adding new
methods to identify HTTP call parameters and specific API calls. The
linter now disregards DFSA patterns within known API calls, eliminating
false positives with relative URLs and duplicate advice from
SparkSqlPyLinter. Additionally, improvements in the `python_ast.py` and
`python_infer.py` files include the addition of `is_instance_of` and
`is_from_module` methods, along with safer inference methods to prevent
infinite recursion and enhance value inference. These changes
significantly improve the DFSA linter's accuracy and effectiveness when
analyzing code containing API calls.
* Enables cli cmd `databricks labs ucx create-catalog-schemas` to apply
catalog/schema acl from legacy hive_metastore
([#2676](https://github.com/databrickslabs/ucx/issues/2676)). The new
release introduces a `databricks labs ucx create-catalog-schemas`
command, which applies catalog/schema Access Control List (ACL) from a
legacy hive_metastore. This command modifies the existing
`table_mapping` method to include a new `grants_crawler` parameter in
the `CatalogSchema` constructor, enabling the application of ACLs from
the legacy hive_metastore. A corresponding unit test is included to
ensure proper functionality. The `CatalogSchema` class in the
`databricks.labs.ucx.hive_metastore.catalog_schema` module has been
updated with a new argument `hive_acl` and the integration of the
`GrantsCrawler` class. The `GrantsCrawler` class is responsible for
crawling the Hive metastore and retrieving grants for catalogs, schemas,
and tables. The `prepare_test` function has been updated to include the
`hive_acl` argument and the `test_catalog_schema_acl` function has been
updated to test the new functionality, ensuring that the correct grant
statements are generated for a wider range of principals and
catalogs/schemas. These changes improve the functionality and usability
of the `databricks labs ucx create-catalog-schemas` command, allowing
for a more seamless transition from a legacy hive metastore.
* Fail `make test` on coverage below 90%
([#2682](https://github.com/databrickslabs/ucx/issues/2682)). A new
change has been introduced to the pyproject.toml file to enhance the
codebase's quality and robustness by ensuring that the test coverage
remains above 90%. This has been accomplished by adding the
`--cov-fail-under=90` flag to the `test` and `coverage` scripts in the
`[tool.hatch.envs.default.scripts]` section. This flag will cause the
`make test` command to fail if the coverage percentage falls below the
specified value of 90%, ensuring that all new changes are thoroughly
tested and that the codebase maintains a minimum coverage threshold.
This is a best practice for maintaining code coverage and improving the
overall quality and reliability of the codebase.
* Fixed DFSA false positives from f-string fragments
([#2679](https://github.com/databrickslabs/ucx/issues/2679)). This
commit addresses false positive Direct File System Access (DFSA)
reports in Python code, specifically in f-string fragments
containing forward slashes and curly braces. The linter has been updated
to accurately detect DFSA paths while avoiding false positives, and it
now checks for `JoinedStr` fragments in string constants. Additionally,
the commit rectifies issues with duplicate advice reported by
`SparkSqlPyLinter`. No new features or major functionality changes have
been introduced; instead, the focus has been on improving the
reliability and accuracy of DFSA detection. Co-authored by Eric
Vergnaud, this commit includes new unit tests and refinements to the
DFSA linter, specifically addressing false positive patterns like
`f"/Repos/{thing1}/sdk-{thing2}-{thing3}"`. To review these changes,
consult the updated tests in the
`tests/unit/source_code/linters/test_directfs.py` file, such as the new
test case for the f-string pattern that previously caused false
positives (see the sketch below).
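The f-string handling can be pictured with the short astroid sketch below
(a minimal illustration, not the ucx linter itself): only the constant
fragments of a `JoinedStr` node are known at lint time, so they must not be
glued together into a fake path.

```python
import astroid

SOURCE = 'path = f"/Repos/{thing1}/sdk-{thing2}-{thing3}"'

module = astroid.parse(SOURCE)
fstring = next(module.nodes_of_class(astroid.nodes.JoinedStr))

# Only Const fragments carry literal text; FormattedValue fragments are
# placeholders whose values are unknown until runtime.
constant_fragments = [
    value.value
    for value in fstring.values
    if isinstance(value, astroid.nodes.Const)
]
print(constant_fragments)  # ['/Repos/', '/sdk-', '-']
```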
* Fixed failing integration tests that perform a real assessment
([#2736](https://github.com/databrickslabs/ucx/issues/2736)). In this
release, we have made significant improvements to the integration tests
in the `assessment` workflow, by reducing the scope of the assessment
and improving efficiency and reliability. We have removed several object
creation functions and added a new function `populate_for_linting` for
linting purposes. The `populate_for_linting` function adds necessary
information to the installation context, and is used to ensure that the
integration tests still have the required data for linting. We have also
added a pytest fixture `populate_for_linting` to set up a minimal amount
of data in the workspace for linting purposes. These changes have been
implemented in the `test_workflows.py` file in the
integration/assessment directory. This will help to ensure that the
tests are not unnecessarily extensive, and that they are able to
accurately assess the functionality of the library.
* Fixed sqlglot crasher with 'drop schema ...' statement
([#2758](https://github.com/databrickslabs/ucx/issues/2758)). In this
release, we have addressed a crash issue in the `sqlglot` library caused
by the `drop schema` statement. A new method, `_unsafe_lint_expression`,
has been introduced to prevent the crash by checking if the current
expression is a `Use`, `Create`, or `Drop` statement and updating the
`schema` attribute accordingly. The library now correctly handles the
`drop schema` statement and returns a `Deprecation` warning if the table
being processed is in the `hive_metastore` catalog and has been migrated
to the Unity Catalog. Unit tests have been added to ensure the correct
behavior of this code, and the linter for `from table` SQL has been
updated to parse and handle the `drop schema` statement without raising
any errors. These changes improve the library's overall reliability and
stability, allowing it to operate smoothly with the `drop schema`
statement.
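A small sqlglot sketch of the statement check described above (illustrative
only; the real linter also tracks the resulting session schema and the
migration status of the affected tables):

```python
import sqlglot
from sqlglot import expressions as exp

# Statements that switch, create or drop a schema carry no table reference to
# lint; the linter only needs to update (or forget) the current schema.
# A real linter would additionally inspect statement.args.get("kind") to tell
# CREATE/DROP SCHEMA apart from CREATE/DROP TABLE.
for sql in ("USE db1", "CREATE SCHEMA db2", "DROP SCHEMA db1"):
    statement = sqlglot.parse_one(sql, read="databricks")
    is_schema_statement = isinstance(statement, (exp.Use, exp.Create, exp.Drop))
    print(f"{sql!r} -> schema statement: {is_schema_statement}")
```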
* Fixed test failure:
`test_table_migration_job_refreshes_migration_status[regular-migrate-tables]`
([#2625](https://github.com/databrickslabs/ucx/issues/2625)). In this
release, we have addressed two issues
([#2621](https://github.com/databrickslabs/ucx/issues/2621) and
[#2537](https://github.com/databrickslabs/ucx/issues/2537)) and fixed a
test failure in
`test_table_migration_job_refreshes_migration_status[regular-migrate-tables]`.
The `index` and `index_full_refresh` methods in `table_migrate.py` have
been updated to accept a new `force_refresh` flag. When set to `True`,
these methods will ensure that the migration status is up-to-date. This
change also affects the `ViewsMigrationSequencer` class, which now
passes `force_refresh=True` to the `index` method. Additionally, we have
fixed a test failure by reusing the `force_refresh` flag to ensure the
migration status is up-to-date. The `TableMigrationStatus` class in
`table_migration_status.py` has been modified to accept an optional
`force_refresh` parameter in the `index` method, and a unit test has
been updated to assert the correct behavior when updating the migration
status.
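The caching behaviour behind the new flag can be sketched as follows (the
class and method bodies are illustrative stand-ins; only the `force_refresh`
keyword itself comes from the change described above):

```python
from dataclasses import dataclass, field


@dataclass
class MigrationIndex:  # illustrative stand-in, not the ucx class
    migrated: dict[str, str] = field(default_factory=dict)


class MigrationStatusRefresher:  # illustrative stand-in
    def __init__(self):
        self._cached: MigrationIndex | None = None

    def _crawl(self) -> MigrationIndex:
        # In ucx this would re-scan the migration status inventory; stubbed here.
        return MigrationIndex({"hive_metastore.db.tbl": "main.db.tbl"})

    def index(self, *, force_refresh: bool = False) -> MigrationIndex:
        """Return the cached index, rebuilding it when force_refresh=True."""
        if self._cached is None or force_refresh:
            self._cached = self._crawl()
        return self._cached


refresher = MigrationStatusRefresher()
refresher.index()                    # cheap: reuses the cached snapshot
refresher.index(force_refresh=True)  # sequencers that need fresh data opt in
```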
* Fixes error message
([#2759](https://github.com/databrickslabs/ucx/issues/2759)). The `load`
method of the `mapping.py` file in the
`databricks/labs/ucx/hive_metastore` package has been updated to correct
an error message displayed when a `NotFound` exception is raised. The
previous message suggested running an incorrect command, which has been
updated to the correct one: "Please run: databricks labs ucx
create-table-mapping". This change does not add any new methods or alter
existing functionality, but instead focuses on improving the user
experience by providing accurate information when an error occurs. The
scope of this change is limited to updating the error message, and no
other modifications have been made.
* Fixes issue of circular dependency of migrate-location ACL
([#2741](https://github.com/databrickslabs/ucx/issues/2741)). In this
release, we have resolved two issues
([#274](https://github.com/databrickslabs/ucx/issues/274)
* Fixes source table alias disappearance during migrate_views
([#2726](https://github.com/databrickslabs/ucx/issues/2726)). This
release introduces a fix to preserve the alias for the source table
during the conversion of CREATE VIEW SQL from the legacy Hive metastore
to the Unity Catalog. The issue was addressed by adding a new test case,
`test_migrate_view_alias_test`, to verify the correct handling of table
aliases during migration. The changes also include a fix for the SQL
conversion and additional test cases verifying that table aliases are
preserved in the converted SQL. A new parameter, `alias`,
has been added to the Table class, and the `apply` method in the
`from_table.py` file has been updated. The migration process has been
updated to retain the original alias of the table. Unit tests have been
added and thoroughly tested to confirm the correctness of the changes,
including handling potential intermittent failures caused by external
dependencies.
* Py4j table crawler: suggestions/fixes for describing tables
([#2684](https://github.com/databrickslabs/ucx/issues/2684)). This
release introduces significant improvements and fixes to the Py4J-based
table crawler, enhancing its capability to describe tables effectively.
The code for fetching table properties over the bridge has been updated,
and error tracing has been improved by fetching each table property
individually and providing a Python backtrace for JVM-side errors. Scala
`Option` values unboxing issues have been resolved, and a small
optimization has been implemented to detect partitioned tables without
materializing the collection. The table's `.viewText()` property is now
properly handled as a Scala `Option`. The `catalog` argument is now
explicitly verified to be `hive_metastore`, and a new static method
`_option_as_python` has been introduced for safely extracting values
from Scala `Option`. The `_describe` method has been refactored to
handle exceptions more gracefully and to improve code readability. These
changes result in better functionality, error handling, logging, and
performance when describing tables within a specified catalog and
database. The linked issues
[#2658](https://github.com/databrickslabs/ucx/issues/2658) and
[#2579](https://github.com/databrickslabs/ucx/issues/2579) are
progressed through these updates, and appropriate testing has been
conducted to ensure the improvements' effectiveness.
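A minimal sketch of the Scala `Option` unboxing idea (the helper name mirrors
`_option_as_python`; its body, the fake Option class, and the `jvm_table`
name are assumptions for illustration, since the real values arrive as py4j
proxies of `scala.Option`):

```python
def _option_as_python(scala_option):
    """Return the wrapped value of a scala.Option, or None for Option.empty."""
    return scala_option.get() if scala_option.isDefined() else None


class _FakeScalaOption:
    """Stand-in for a py4j proxy of scala.Option, used only for this demo."""

    def __init__(self, value=None):
        self._value = value

    def isDefined(self) -> bool:
        return self._value is not None

    def get(self):
        return self._value


print(_option_as_python(_FakeScalaOption("SELECT 1")))  # 'SELECT 1'
print(_option_as_python(_FakeScalaOption()))            # None
# Typical use when describing a view over the JVM bridge:
# view_text = _option_as_python(jvm_table.viewText())
```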
* Speedup assessment workflow by making DBFS root table size calculation
parallel ([#2745](https://github.com/databrickslabs/ucx/issues/2745)).
In this release, the assessment workflow for calculating DBFS root table
size has been optimized through the parallelization of the calculation
process, resulting in improved performance. This has been achieved by
updating the `pipelines_crawler` function in
`src/databricks/labs/ucx/contexts/workflow_task.py`, specifically the
`cached_property table_size_crawler`, to include an additional argument
`self.config.include_databases`. The `TablesCrawler` class has also been
modified to include a generic type parameter `Table`, enabling type
hinting and more robust type checking. Furthermore, the unit test file
`test_table_size.py` in the `hive_metastore` directory has been updated
to handle corrupt tables and invalid delta format errors more
effectively. Additionally, a new entry for `databricks-pydabs` has been
added to the "known.json" file of packages known to the linter. Overall,
these
changes improve the efficiency and scalability of the codebase and
optimize the assessment workflow for calculating DBFS root table size.
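The parallelization idea can be illustrated with a plain thread pool (a
sketch only; the function and table names are made up and this is not the
actual implementation used by the assessment workflow):

```python
from concurrent.futures import ThreadPoolExecutor


def _table_size_in_bytes(table_name: str) -> int:
    # Stub: in ucx this would query Spark/Delta metadata for the table.
    return len(table_name) * 1024


dbfs_root_tables = ["hive_metastore.db.sales", "hive_metastore.db.events"]

# Sizing tables is I/O bound, so fanning the per-table calls out over a thread
# pool shortens the assessment wall-clock time roughly linearly with workers.
with ThreadPoolExecutor(max_workers=8) as pool:
    sizes = dict(zip(dbfs_root_tables, pool.map(_table_size_in_bytes, dbfs_root_tables)))
print(sizes)
```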
* Updated databricks-labs-blueprint requirement from <0.9,>=0.8 to
>=0.8,<0.10
([#2747](https://github.com/databrickslabs/ucx/issues/2747)). In this
update, the requirement for `databricks-labs-blueprint` has been updated
to version `>=0.8,<0.10` in the `pyproject.toml` file. This change
allows the project to utilize the latest features and bug fixes included
in version 0.9.0 of the `databricks-labs-blueprint` library. Notable
updates in version 0.9.0 consist of the addition of Databricks CLI
version as part of routed command telemetry and support for Unicode Byte
Order Mark (BOM) in file upload and download operations. Additionally,
various bug fixes and improvements have been implemented for the
`WorkspacePath` class, including the addition of `stat()` methods and
improved compatibility with different versions of Python.
* Updated databricks-labs-lsql requirement from <0.12,>=0.5 to
>=0.5,<0.13
([#2688](https://github.com/databrickslabs/ucx/issues/2688)). In this
update, the version requirement of the `databricks-labs-lsql` library
has been changed from a version greater than or equal to 0.5 and less
than 0.12 to a version greater than or equal to 0.5 and less than 0.13.
This allows the project to utilize the latest version of
'databricks-labs-lsql', which includes new methods for differentiating
between a table that has never been written to and one with zero rows in
the MockBackend class. Additionally, the update adds support for various
filter types and improves testing coverage and reliability. The release
notes and changelog for the updated library are provided in the commit
message for reference.
* Updated documentation to explain the usage of collections and eligible
commands ([#2738](https://github.com/databrickslabs/ucx/issues/2738)).
The latest update to the Databricks Labs UCX tool
introduces the `join-collection` command, which enables users to join
two or more workspaces into a collection, allowing for streamlined and
consolidated command execution across multiple workspaces. This feature
requires Account admin rights on the Databricks account, Workspace
admin rights on the workspaces to be joined, and a UCX installation on
the workspace. To run collection-eligible commands, users can simply
pass the `--run-as-collection=True` flag. This enhancement improves the
UCX tool's functionality, making it easier to manage and execute
commands on multiple workspaces.
* Updated sqlglot requirement from <25.22,>=25.5.0 to >=25.5.0,<25.23
([#2687](https://github.com/databrickslabs/ucx/issues/2687)). In this
pull request, we have updated the version requirement for the `sqlglot`
library in the pyproject.toml file. The previous requirement specified a
version greater than or equal to 25.5.0 and less than 25.22, but we have
updated it to allow for versions greater than or equal to 25.5.0 and
less than 25.23. This change allows us to use the latest version of
'sqlglot', while still ensuring compatibility with other dependencies.
Additionally, this pull request includes a detailed changelog from the
`sqlglot` repository, which provides information on the features, bug
fixes, and changes included in each version. This can help us understand
the scope of the update and how it may impact our project.
* [DOCUMENTATION] Improve documentation on using account profile for
`sync-workspace-info` cli command
([#2683](https://github.com/databrickslabs/ucx/issues/2683)). The
`sync-workspace-info` CLI command has been added to the Databricks Labs
UCX package, which uploads the workspace configuration to all workspaces
in the Databricks account where the `ucx` tool is installed. This
feature requires Databricks Account Administrator privileges and is
necessary to create an immutable default catalog mapping for the table
migration process. It also serves as a prerequisite for the
`create-table-mapping` command. To utilize this command, users must
configure the Databricks CLI profile with access to the Databricks
account console, available at "accounts.cloud.databricks.com" or
"accounts.azuredatabricks.net". Additionally, the documentation for
using the account profile with the `sync-workspace-info` command has
been enhanced, addressing issue
[#1762](https://github.com/databrickslabs/ucx/issues/1762).
* [DOCUMENTATION] Improve documentation when installing UCX from a
machine with restricted internet access
([#2690](https://github.com/databrickslabs/ucx/issues/2690)). A new
section has been added to the `ADVANCED` installation section of the UCX
library documentation, providing detailed instructions for installing
UCX with a company-hosted PyPI mirror. This feature is intended for
environments with restricted internet access, allowing users to bypass
the public PyPI index and use a company-controlled mirror instead. Users
will need to add all UCX dependencies to the company-hosted PyPI mirror
and set the `PIP_INDEX_URL` environment variable to the mirror URL
during installation. The solution also includes a prompt asking the user
if their workspace blocks internet access. Additionally, the
documentation has been updated to clarify that UCX requires internet
access to connect to GitHub for downloading the tool, specifying the
necessary URLs that need to be accessible. This update aims to improve
the installation process for users with restricted internet access and
provide clear instructions and prompts for installing UCX on machines
with limited internet connectivity.
Dependency updates:
* Updated sqlglot requirement from <25.22,>=25.5.0 to >=25.5.0,<25.23
([#2687](https://github.com/databrickslabs/ucx/pull/2687)).
* Updated databricks-labs-lsql requirement from <0.12,>=0.5 to
>=0.5,<0.13 ([#2688](https://github.com/databrickslabs/ucx/pull/2688)).
* Updated databricks-labs-blueprint requirement from <0.9,>=0.8 to
>=0.8,<0.10 ([#2747](https://github.com/databrickslabs/ucx/pull/2747)).
---
CHANGELOG.md | 38 ++++++++++++++++++++++++++++
src/databricks/labs/ucx/__about__.py | 2 +-
2 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa5441b8a9..585c723e3a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,43 @@
# Version changelog
+## 0.38.0
+
+* Added Py4j implementation of tables crawler to retrieve a list of HMS tables in the assessment workflow ([#2579](https://github.com/databrickslabs/ucx/issues/2579)). In this release, we have added a Py4j implementation of a tables crawler to retrieve a list of Hive Metastore tables in the assessment workflow. A new `FasterTableScanCrawler` class has been introduced, which can be used in the Assessment Job based on a feature flag to replace the old Scala code, allowing for better logging during table scans. The existing `assessment.crawl_tables` workflow now utilizes the new py4j crawler instead of the scala one. Integration tests have been added to ensure the functionality works correctly. The commit also includes a new method for listing table names in the specified database and improvements to error handling and logging mechanisms. The new Py4j tables crawler enhances the functionality of the assessment workflow by improving error handling, resulting in better logging and faster table scanning during the assessment process. This change is part of addressing issue [#2190](https://github.com/databrickslabs/ucx/issues/2190) and was co-authored by Serge Smertin.
+* Added `create-ucx-catalog` cli command ([#2694](https://github.com/databrickslabs/ucx/issues/2694)). A new CLI command, `create-ucx-catalog`, has been added to create a catalog for migration tracking that can be used across multiple workspaces. The command creates a UCX catalog for tracking migration status and artifacts, and is created by running `databricks labs ucx create-ucx-catalog` and specifying the storage location for the catalog. Relevant user documentation, unit tests, and integration tests have been added for this command. The `assign-metastore` command has also been updated to allow for the selection of a metastore when multiple metastores are available in the workspace region. This change improves the migration tracking feature and enhances the user experience.
+* Added experimental `migration-progress-experimental` workflow ([#2658](https://github.com/databrickslabs/ucx/issues/2658)). This commit introduces an experimental workflow, `migration-progress-experimental`, which refreshes the inventory for various resources such as clusters, grants, jobs, pipelines, policies, tables, TableMigrationStatus, and UDFs. The workflow can be triggered using the `databricks labs ucx migration-progress` CLI command and uses a new implementation of a Scala-based crawler, `TablesCrawler`, which will eventually replace the current implementation. The new workflow is a duplicate of most of the `assessment` pipeline's functionality but with some differences, such as the use of `TablesCrawler`. Relevant user documentation has been added, along with unit tests, integration tests, and a screenshot of a successful staging environment run. The new workflow is expected to run on a schedule in the future. This change resolves [#2574](https://github.com/databrickslabs/ucx/issues/2574) and progresses [#2074](https://github.com/databrickslabs/ucx/issues/2074).
+* Added handling for `InternalError` in `Listing.__iter__` ([#2697](https://github.com/databrickslabs/ucx/issues/2697)). This release introduces improved error handling in the `Listing.__iter__` method of the `Generic` class, located in the `workspace_access/generic.py` file. Previously, only `NotFound` exceptions were handled, but now both `InternalError` and `NotFound` exceptions are caught and logged appropriately. This change enhances the robustness of the method, which is responsible for listing objects of a specific type and returning them as `GenericPermissionsInfo` objects. To ensure the correct functionality, we have added new unit tests and manual testing. The logging of the `InternalError` exception is properly handled in the `GenericPermissionsSupport` class when listing serving endpoints. This behavior is verified by the newly added test function `test_internal_error_in_serving_endpoints_raises_warning` and the updated `test_serving_endpoints_not_enabled_raises_warning`.
+* Added handling for `PermissionDenied` when listing accessible workspaces ([#2733](https://github.com/databrickslabs/ucx/issues/2733)). A new `can_administer` method has been added to the `Workspaces` class in the `workspaces.py` file, which allows for more fine-grained control over which users can administer workspaces. This method checks if the user has access to a given workspace and is a member of the workspace's `admins` group, indicating that the user has administrative privileges for that workspace. If the user does not have access to the workspace or is not a member of the `admins` group, the method returns `False`. Additionally, error handling in the `get_accessible_workspaces` method has been improved by adding a `PermissionDenied` exception to the list of exceptions that are caught and logged. New unit tests have been added for the `AccountWorkspaces` class of the `databricks.labs.blueprint.account` module to ensure that the new method is functioning as intended, specifically checking if a user is a workspace administrator based on whether they belong to the `admins` group. The linked issue [#2732](https://github.com/databrickslabs/ucx/issues/2732) is resolved by this change. All changes have been manually and unit tested.
+* Added static code analysis results to assessment dashboard ([#2696](https://github.com/databrickslabs/ucx/issues/2696)). This commit introduces two new tasks, `assess_dashboards` and `assess_workflows`, to the existing assessment dashboard for identifying migration problems in dashboards and workflows. These tasks analyze embedded queries and notebooks for migration issues and collect direct filesystem access patterns requiring attention. Upon completion, the results are stored in the inventory database and displayed on the Migration dashboard. Additionally, two new widgets, job/query problem widgets and directfs access widgets, have been added to enhance the dashboard's functionality by providing additional information related to code compatibility and access control. Integration tests using mock data have been added and manually tested to ensure the proper functionality of these new features. This update improves the overall assessment and compatibility checking capabilities of the dashboard, making it easier for users to identify and address issues related to Unity Catalog compatibility in their workflows and dashboards.
+* Added unskip CLI command to undo a skip on schema or a table ([#2727](https://github.com/databrickslabs/ucx/issues/2727)). This pull request introduces a new CLI command, "unskip", which allows users to reverse a previously applied `skip` on a schema or table. The `unskip` command accepts a required `--schema` parameter and an optional `--table` parameter. A new function, also named "unskip", has been added, which takes the same parameters as the `skip` command. The function checks for the required `--schema` parameter and creates a new WorkspaceContext object to call the appropriate method on the table_mapping object. Two new methods, `unskip_schema` and "unskip_table_or_view", have been added to the HiveMapping class. These methods remove the skip mark from a schema or table, respectively, and handle exceptions such as NotFound and BadRequest. The get_tables_to_migrate method has been updated to consider the unskipped tables or schemas. Currently, the feature is tested manually and has not been added to the user documentation.
+* Added unskip CLI command to undo a skip on schema or a table ([#2734](https://github.com/databrickslabs/ucx/issues/2734)). A new `unskip` CLI command has been added to the project, which allows users to remove the `skip` mark set by the existing `skip` command on a specified schema or table. This command takes an optional `--table` flag, and if not provided, it will unskip the entire schema. The new functionality is accompanied by a unit test and relevant user documentation, and addresses issue [#1938](https://github.com/databrickslabs/ucx/issues/1938). The implementation includes the addition of the `unskip_table_or_view` method, which generates the appropriate `ALTER TABLE/VIEW` statement to remove the skip marker, and updates to the `unskip_schema` method to include the schema name in the `ALTER SCHEMA` statement. Additionally, exception handling has been updated to include `NotFound` and `BadRequest` exceptions. This feature simplifies the process of undoing a skip on a schema, table, or view in the Hive metastore, which previously required manual editing of the Hive metastore properties.
+* Assess source code as part of the assessment ([#2678](https://github.com/databrickslabs/ucx/issues/2678)). This commit introduces enhancements to the assessment workflow, including the addition of two new tasks for evaluating source code from SQL queries in dashboards and from notebooks/files in jobs and tasks. The existing `databricks labs install ucx` command has been modified to incorporate linting during the assessment. The `QueryLinter` class has been updated to accept an additional argument for linting source code. These changes have been thoroughly tested through integration tests to ensure proper functionality. Co-authored by Eric Vergnaud.
+* Bump astroid version, pylint version and drop our f-string workaround ([#2746](https://github.com/databrickslabs/ucx/issues/2746)). In this update, we have bumped the versions of astroid and pylint to 3.3.1 and removed workarounds related to f-string inference limitations in previous versions of astroid (< 3.3). These workarounds were necessary for handling issues such as uninferrable sys.path values and the lack of f-string inference in loops. We have also updated corresponding tests to reflect these changes and improve the overall code quality and maintainability of the project. These changes are part of a larger effort to update dependencies and simplify the codebase by leveraging the latest features of updated tools and removing obsolete workarounds.
+* Delete temporary files when running solacc ([#2750](https://github.com/databrickslabs/ucx/issues/2750)). This commit includes changes to the `solacc.py` script to improve the linting process for the `solacc` repository, specifically targeting the issue of excessive temporary files that were exceeding CI storage capacity. The modifications include linting the repository on a per-top-level `solution` basis, where each solution resides within the top folders and is independent of others. Post-linting, temporary files and directories registered in `PathLookup` are deleted to enhance storage efficiency. Additionally, this commit prepares for improving false positive detection and introduces a new `SolaccContext` class that tracks various aspects of the linting process, providing more detailed feedback on the linting results. This change does not introduce new functionality or modify existing functionality, but rather optimizes the linting process for the `solacc` repository, maintaining CI storage capacity levels within acceptable limits.
+* Don't report direct filesystem access for API calls ([#2689](https://github.com/databrickslabs/ucx/issues/2689)). This release introduces enhancements to the Direct File System Access (DFSA) linter, resolving false positives in API call reporting. The `ws.api_client.do` call previously triggered inaccurate direct filesystem access alerts, which have been addressed by adding new methods to identify HTTP call parameters and specific API calls. The linter now disregards DFSA patterns within known API calls, eliminating false positives with relative URLs and duplicate advice from SparkSqlPyLinter. Additionally, improvements in the `python_ast.py` and `python_infer.py` files include the addition of `is_instance_of` and `is_from_module` methods, along with safer inference methods to prevent infinite recursion and enhance value inference. These changes significantly improve the DFSA linter's accuracy and effectiveness when analyzing code containing API calls.
+* Enables cli cmd `databricks labs ucx create-catalog-schemas` to apply catalog/schema acl from legacy hive_metastore ([#2676](https://github.com/databrickslabs/ucx/issues/2676)). The new release introduces a `databricks labs ucx create-catalog-schemas` command, which applies catalog/schema Access Control List (ACL) from a legacy hive_metastore. This command modifies the existing `table_mapping` method to include a new `grants_crawler` parameter in the `CatalogSchema` constructor, enabling the application of ACLs from the legacy hive_metastore. A corresponding unit test is included to ensure proper functionality. The `CatalogSchema` class in the `databricks.labs.ucx.hive_metastore.catalog_schema` module has been updated with a new argument `hive_acl` and the integration of the `GrantsCrawler` class. The `GrantsCrawler` class is responsible for crawling the Hive metastore and retrieving grants for catalogs, schemas, and tables. The `prepare_test` function has been updated to include the `hive_acl` argument and the `test_catalog_schema_acl` function has been updated to test the new functionality, ensuring that the correct grant statements are generated for a wider range of principals and catalogs/schemas. These changes improve the functionality and usability of the `databricks labs ucx create-catalog-schemas` command, allowing for a more seamless transition from a legacy hive metastore.
+* Fail `make test` on coverage below 90% ([#2682](https://github.com/databrickslabs/ucx/issues/2682)). A new change has been introduced to the pyproject.toml file to enhance the codebase's quality and robustness by ensuring that the test coverage remains above 90%. This has been accomplished by adding the `--cov-fail-under=90` flag to the `test` and `coverage` scripts in the `[tool.hatch.envs.default.scripts]` section. This flag will cause the `make test` command to fail if the coverage percentage falls below the specified value of 90%, ensuring that all new changes are thoroughly tested and that the codebase maintains a minimum coverage threshold. This is a best practice for maintaining code coverage and improving the overall quality and reliability of the codebase.
+* Fixed DFSA false positives from f-string fragments ([#2679](https://github.com/databrickslabs/ucx/issues/2679)). This commit addresses false positive DataFrame API Scanning Antipattern (DFSA) reports in Python code, specifically in f-string fragments containing forward slashes and curly braces. The linter has been updated to accurately detect DFSA paths while avoiding false positives, and it now checks for `JoinedStr` fragments in string constants. Additionally, the commit rectifies issues with duplicate advices reported by `SparkSqlPyLinter`. No new features or major functionality changes have been introduced; instead, the focus has been on improving the reliability and accuracy of DFSA detection. Co-authored by Eric Vergnaud, this commit includes new unit tests and refinements to the DFSA linter, specifically addressing false positive patterns like `f"/Repos/{thing1}/sdk-{thing2}-{thing3}"`. To review these changes, consult the updated tests in the `tests/unit/source_code/linters/test_directfs.py` file, such as the new test case for the f-string pattern causing false positives. By understanding these improvements, you'll ensure your project adheres to the latest updates, maintaining quality and accurate DFSA detection.
+* Fixed failing integration tests that perform a real assessment ([#2736](https://github.com/databrickslabs/ucx/issues/2736)). In this release, we have made significant improvements to the integration tests in the `assessment` workflow, by reducing the scope of the assessment and improving efficiency and reliability. We have removed several object creation functions and added a new function `populate_for_linting` for linting purposes. The `populate_for_linting` function adds necessary information to the installation context, and is used to ensure that the integration tests still have the required data for linting. We have also added a pytest fixture `populate_for_linting` to set up a minimal amount of data in the workspace for linting purposes. These changes have been implemented in the `test_workflows.py` file in the integration/assessment directory. This will help to ensure that the tests are not unnecessarily extensive, and that they are able to accurately assess the functionality of the library.
+* Fixed sqlglot crasher with 'drop schema ...' statement ([#2758](https://github.com/databrickslabs/ucx/issues/2758)). In this release, we have addressed a crash issue in the `sqlglot` library caused by the `drop schema` statement. A new method, `_unsafe_lint_expression`, has been introduced to prevent the crash by checking if the current expression is a `Use`, `Create`, or `Drop` statement and updating the `schema` attribute accordingly. The library now correctly handles the `drop schema` statement and returns a `Deprecation` warning if the table being processed is in the `hive_metastore` catalog and has been migrated to the Unity Catalog. Unit tests have been added to ensure the correct behavior of this code, and the linter for `from table` SQL has been updated to parse and handle the `drop schema` statement without raising any errors. These changes improve the library's overall reliability and stability, allowing it to operate smoothly with the `drop schema` statement.
+* Fixed test failure: `test_table_migration_job_refreshes_migration_status[regular-migrate-tables]` ([#2625](https://github.com/databrickslabs/ucx/issues/2625)). In this release, we have addressed two issues ([#2621](https://github.com/databrickslabs/ucx/issues/2621) and [#2537](https://github.com/databrickslabs/ucx/issues/2537)) and fixed a test failure in `test_table_migration_job_refreshes_migration_status[regular-migrate-tables]`. The `index` and `index_full_refresh` methods in `table_migrate.py` have been updated to accept a new `force_refresh` flag. When set to `True`, these methods will ensure that the migration status is up-to-date. This change also affects the `ViewsMigrationSequencer` class, which now passes `force_refresh=True` to the `index` method. Additionally, we have fixed a test failure by reusing the `force_refresh` flag to ensure the migration status is up-to-date. The `TableMigrationStatus` class in `table_migration_status.py` has been modified to accept an optional `force_refresh` parameter in the `index` method, and a unit test has been updated to assert the correct behavior when updating the migration status.
+* Fixes error message ([#2759](https://github.com/databrickslabs/ucx/issues/2759)). The `load` method of the `mapping.py` file in the `databricks/labs/ucx/hive_metastore` package has been updated to correct an error message displayed when a `NotFound` exception is raised. The previous message suggested running an incorrect command, which has been updated to the correct one: "Please run: databricks labs ucx create-table-mapping". This change does not add any new methods or alter existing functionality, but instead focuses on improving the user experience by providing accurate information when an error occurs. The scope of this change is limited to updating the error message, and no other modifications have been made.
+* Fixes issue of circular dependency of migrate-location ACL ([#2741](https://github.com/databrickslabs/ucx/issues/2741)). In this release, we have resolved two issues ([#274](https://github.com/databrickslabs/ucx/issues/274)
+* Fixes source table alias dissapearance during migrate_views ([#2726](https://github.com/databrickslabs/ucx/issues/2726)). This release introduces a fix to preserve the alias for the source table during the conversion of CREATE VIEW SQL from the legacy Hive metastore to the Unity Catalog. The issue was addressed by adding a new test case, `test_migrate_view_alias_test`, to verify the correct handling of table aliases during migration. The changes also include a fix for the SQL conversion and new test cases to ensure the correct handling of table aliases, reflected in accurate SQL conversion. A new parameter, `alias`, has been added to the Table class, and the `apply` method in the `from_table.py` file has been updated. The migration process has been updated to retain the original alias of the table. Unit tests have been added and thoroughly tested to confirm the correctness of the changes, including handling potential intermittent failures caused by external dependencies.
+* Py4j table crawler: suggestions/fixes for describing tables ([#2684](https://github.com/databrickslabs/ucx/issues/2684)). This release introduces significant improvements and fixes to the Py4J-based table crawler, enhancing its capability to describe tables effectively. The code for fetching table properties over the bridge has been updated, and error tracing has been improved through individual fetching of each table property and providing python backtrace on JVM side errors. Scala `Option` values unboxing issues have been resolved, and a small optimization has been implemented to detect partitioned tables without materializing the collection. The table's `.viewText()` property is now properly handled as a Scala `Option`. The `catalog` argument is now explicitly verified to be `hive_metastore`, and a new static method `_option_as_python` has been introduced for safely extracting values from Scala `Option`. The `_describe` method has been refactored to handle exceptions more gracefully and improved code readability. These changes result in better functionality, error handling, logging, and performance when describing tables within a specified catalog and database. The linked issues [#2658](https://github.com/databrickslabs/ucx/issues/2658) and [#2579](https://github.com/databrickslabs/ucx/issues/2579) are progressed through these updates, and appropriate testing has been conducted to ensure the improvements' effectiveness.
+* Speedup assessment workflow by making DBFS root table size calculation parallel ([#2745](https://github.com/databrickslabs/ucx/issues/2745)). In this release, the assessment workflow for calculating DBFS root table size has been optimized through the parallelization of the calculation process, resulting in improved performance. This has been achieved by updating the `pipelines_crawler` function in `src/databricks/labs/ucx/contexts/workflow_task.py`, specifically the `cached_property table_size_crawler`, to include an additional argument `self.config.include_databases`. The `TablesCrawler` class has also been modified to include a generic type parameter `Table`, enabling type hinting and more robust type checking. Furthermore, the unit test file `test_table_size.py` in the `hive_metastore` directory has been updated to handle corrupt tables and invalid delta format errors more effectively. Additionally, a new entry `databricks-pydabs` has been added to the "known.json" file, potentially enabling better integration with the `databricks-pydabs` library or providing necessary configuration information for parallel processing. Overall, these changes improve the efficiency and scalability of the codebase and optimize the assessment workflow for calculating DBFS root table size.
+* Updated databricks-labs-blueprint requirement from <0.9,>=0.8 to >=0.8,<0.10 ([#2747](https://github.com/databrickslabs/ucx/issues/2747)). In this update, the requirement for `databricks-labs-blueprint` has been updated to version `>=0.8,<0.10` in the `pyproject.toml` file. This change allows the project to utilize the latest features and bug fixes included in version 0.9.0 of the `databricks-labs-blueprint` library. Notable updates in version 0.9.0 consist of the addition of Databricks CLI version as part of routed command telemetry and support for Unicode Byte Order Mark (BOM) in file upload and download operations. Additionally, various bug fixes and improvements have been implemented for the `WorkspacePath` class, including the addition of `stat()` methods and improved compatibility with different versions of Python.
+* Updated databricks-labs-lsql requirement from <0.12,>=0.5 to >=0.5,<0.13 ([#2688](https://github.com/databrickslabs/ucx/issues/2688)). In this update, the version requirement of the `databricks-labs-lsql` library has been changed from a version greater than or equal to 0.5 and less than 0.12 to a version greater than or equal to 0.5 and less than 0.13. This allows the project to utilize the latest version of 'databricks-labs-lsql', which includes new methods for differentiating between a table that has never been written to and one with zero rows in the MockBackend class. Additionally, the update adds support for various filter types and improves testing coverage and reliability. The release notes and changelog for the updated library are provided in the commit message for reference.
+* Updated documentation to explain the usage of collections and eligible commands ([#2738](https://github.com/databrickslabs/ucx/issues/2738)). The latest update to the Databricks Labs Unified CLI (UCX) tool introduces the `join-collection` command, which enables users to join two or more workspaces into a collection, allowing for streamlined and consolidated command execution across multiple workspaces. This feature is available to Account admins on the Databricks account, Workspace admins on the workspaces to be joined, and requires UCX installation on the workspace. To run collection-eligible commands, users can simply pass the `--run-as-collection=True` flag. This enhancement enhances the UCX tool's functionality, making it easier to manage and execute commands on multiple workspaces.
+* Updated sqlglot requirement from <25.22,>=25.5.0 to >=25.5.0,<25.23 ([#2687](https://github.com/databrickslabs/ucx/issues/2687)). In this pull request, we have updated the version requirement for the `sqlglot` library in the pyproject.toml file. The previous requirement specified a version greater than or equal to 25.5.0 and less than 25.22, but we have updated it to allow for versions greater than or equal to 25.5.0 and less than 25.23. This change allows us to use the latest version of 'sqlglot', while still ensuring compatibility with other dependencies. Additionally, this pull request includes a detailed changelog from the `sqlglot` repository, which provides information on the features, bug fixes, and changes included in each version. This can help us understand the scope of the update and how it may impact our project.
+* [DOCUMENTATION] Improve documentation on using account profile for `sync-workspace-info` cli command ([#2683](https://github.com/databrickslabs/ucx/issues/2683)). The `sync-workspace-info` CLI command has been added to the Databricks Labs UCX package, which uploads the workspace configuration to all workspaces in the Databricks account where the `ucx` tool is installed. This feature requires Databricks Account Administrator privileges and is necessary to create an immutable default catalog mapping for the table migration process. It also serves as a prerequisite for the `create-table-mapping` command. To utilize this command, users must configure the Databricks CLI profile with access to the Databricks account console, available at "accounts.cloud.databricks.com" or "accounts.azuredatabricks.net". Additionally, the documentation for using the account profile with the `sync-workspace-info` command has been enhanced, addressing issue [#1762](https://github.com/databrickslabs/ucx/issues/1762).
+* [DOCUMENTATION] Improve documentation when installing UCX from a machine with restricted internet access ([#2690](https://github.com/databrickslabs/ucx/issues/2690)). "A new section has been added to the `ADVANCED` installation section of the UCX library documentation, providing detailed instructions for installing UCX with a company-hosted PyPI mirror. This feature is intended for environments with restricted internet access, allowing users to bypass the public PyPI index and use a company-controlled mirror instead. Users will need to add all UCX dependencies to the company-hosted PyPI mirror and set the `PIP_INDEX_URL` environment variable to the mirror URL during installation. The solution also includes a prompt asking the user if their workspace blocks internet access. Additionally, the documentation has been updated to clarify that UCX requires internet access to connect to GitHub for downloading the tool, specifying the necessary URLs that need to be accessible. This update aims to improve the installation process for users with restricted internet access and provide clear instructions and prompts for installing UCX on machines with limited internet connectivity."
+
+Dependency updates:
+
+ * Updated sqlglot requirement from <25.22,>=25.5.0 to >=25.5.0,<25.23 ([#2687](https://github.com/databrickslabs/ucx/pull/2687)).
+ * Updated databricks-labs-lsql requirement from <0.12,>=0.5 to >=0.5,<0.13 ([#2688](https://github.com/databrickslabs/ucx/pull/2688)).
+ * Updated databricks-labs-blueprint requirement from <0.9,>=0.8 to >=0.8,<0.10 ([#2747](https://github.com/databrickslabs/ucx/pull/2747)).
+
## 0.37.0
* Added ability to run create-missing-principals command as collection ([#2675](https://github.com/databrickslabs/ucx/issues/2675)). This release introduces the capability to run the `create-missing-principals` command as a collection in the UCX (Unified Cloud Experience) tool with the new optional flag `run-as-collection`. This allows for more control and flexibility when managing cloud resources, particularly in handling multiple workspaces. The existing `create-missing-principals` command has been modified to accept a new `run_as_collection` parameter, enabling the command to run on multiple workspaces when set to True. The function has been updated to handle a list of `WorkspaceContext` objects, allowing it to iterate over each object and execute necessary actions for each workspace. Additionally, a new `AccountClient` parameter has been added to facilitate the retrieval of all workspaces associated with a specific account. New test functions have been added to `test_cli.py` to test this new functionality on AWS and Azure cloud providers. The `acc_client` argument has been added to the test functions to enable running the tests with an authenticated AWS or Azure client, and the `MockPrompts` object is used to simulate user responses to the prompts displayed during the execution of the command.
diff --git a/src/databricks/labs/ucx/__about__.py b/src/databricks/labs/ucx/__about__.py
index 532d9fbdee..f7f434acf3 100644
--- a/src/databricks/labs/ucx/__about__.py
+++ b/src/databricks/labs/ucx/__about__.py
@@ -1,2 +1,2 @@
# DO NOT MODIFY THIS FILE
-__version__ = "0.37.0"
+__version__ = "0.38.0"
From 97b999690596625ea73e2aeeb373d8626a0e6bbd Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Fri, 27 Sep 2024 14:54:01 +0200
Subject: [PATCH 7/7] Strip preliminary comments in pip cells (#2763)
## Changes
The current implementation fails when the pip command is preceded by
non-MAGIC comments. This PR fixes the issue.
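A standalone sketch of the fixed splitting behaviour (the `_SPLITTER` regex
here is a stand-in for the class's `_splitter`; see the diff below for the
actual change):

```python
import re
import shlex

_SPLITTER = re.compile(r"(?<!\\)\n")  # a newline that is not backslash-escaped


def split_pip_cell(code: str) -> list[str]:
    """Standalone sketch of the fixed pip-cell splitting logic."""
    # Strip anything before the leading %pip / !pip, e.g. preceding comments.
    pip_idx = code.find("pip")
    if pip_idx > 0 and code[pip_idx - 1] in {"%", "!"}:
        pip_idx -= 1
    code = code[pip_idx:]
    # Keep only the first non-escaped line, then fold escaped newlines away.
    match = _SPLITTER.search(code)
    if match:
        code = code[: match.start()]
    code = code.replace("\\\n", " ")
    return shlex.split(code, posix=True)


cell = "# DBTITLE 0,Install util packages\n%pip install databricks-labs-ucx --quiet\n"
print(split_pip_cell(cell))  # ['%pip', 'install', 'databricks-labs-ucx', '--quiet']
```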
### Linked issues
None
### Functionality
None
### Tests
- [x] added unit tests
Co-authored-by: Eric Vergnaud
---
.../labs/ucx/source_code/notebooks/cells.py | 10 +++++--
tests/integration/source_code/test_cells.py | 27 +++++++++++++++++++
2 files changed, 35 insertions(+), 2 deletions(-)
create mode 100644 tests/integration/source_code/test_cells.py
diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py
index 020cd279bc..712acee54a 100644
--- a/src/databricks/labs/ucx/source_code/notebooks/cells.py
+++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py
@@ -484,9 +484,15 @@ def _split(cls, code: str) -> list[str]:
Sources:
https://docs.databricks.com/en/libraries/notebooks-python-libraries.html#manage-libraries-with-pip-commands
"""
+ # strip preliminary comments
+ pip_idx = code.find("pip")
+ if pip_idx > 0 and code[pip_idx - 1] in {'%', '!'}:
+ pip_idx -= 1
+ code = code[pip_idx:]
+ # look for standalone '\n'
match = cls._splitter.search(code)
if match:
code = code[: match.start()] # Remove code after non-escaped newline
+ # make single line
code = code.replace("\\\n", " ")
- lexer = shlex.split(code, posix=True)
- return list(lexer)
+ return shlex.split(code, posix=True)
diff --git a/tests/integration/source_code/test_cells.py b/tests/integration/source_code/test_cells.py
new file mode 100644
index 0000000000..a1621ad94a
--- /dev/null
+++ b/tests/integration/source_code/test_cells.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+
+from databricks.sdk.service.workspace import Language
+
+from databricks.labs.ucx.source_code.base import CurrentSessionState
+from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph
+from databricks.labs.ucx.source_code.linters.files import FileLoader
+from databricks.labs.ucx.source_code.notebooks.sources import Notebook
+
+
+def test_malformed_pip_cell_is_supported(simple_ctx):
+ source = """# Databricks notebook source
+# MAGIC %md This notebook sets up the companion cluster(s) to run the solution accelerator. It also creates the Workflow to illustrate the order of execution. Happy exploring!
+
+# COMMAND ----------
+
+# DBTITLE 0,Install util packages
+# MAGIC %pip install git+https://github.com/databricks-academy/dbacademy@v1.0.13 git+https://github.com/databricks-industry-solutions/notebook-solution-companion@safe-print-html --quiet --disable-pip-version-check
+
+"""
+ notebook = Notebook.parse(Path(""), source=source, default_language=Language.PYTHON)
+ dependency = Dependency(FileLoader(), Path(""))
+ parent = DependencyGraph(
+ dependency, None, simple_ctx.dependency_resolver, simple_ctx.path_lookup, CurrentSessionState()
+ )
+ problems = notebook.build_dependency_graph(parent)
+ assert not problems