From 0fdfcc718776f7f855e6ff6b54f05f749634e6c7 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 19 Sep 2024 17:01:02 +0200 Subject: [PATCH 1/4] Log the full stacktrace when an error occurs crossing the Py4J bridge to describe a table. Prior to this only the JVM stacktrace was logged; after this change the python side is also included. --- src/databricks/labs/ucx/hive_metastore/tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 795950e02c..f76e9c66a3 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -555,8 +555,8 @@ def _describe(self, catalog, database, table) -> Table | None: storage_properties=self._format_properties_list(list(self._iterator(raw_table.properties()))), is_partitioned=is_partitioned, ) - except Exception as e: # pylint: disable=broad-exception-caught - logger.warning(f"Couldn't fetch information for table {full_name} : {e}") + except Exception: # pylint: disable=broad-exception-caught + logger.warning(f"Couldn't fetch information for table: {full_name}", exc_info=True) return None def _crawl(self) -> Iterable[Table]: From 8e00e0548c227c529331901d6fc657870c84e579 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 19 Sep 2024 17:02:21 +0200 Subject: [PATCH 2/4] Debug logging for the table being described before we start. --- src/databricks/labs/ucx/hive_metastore/tables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index f76e9c66a3..14b6c6e5de 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -536,6 +536,7 @@ def _describe(self, catalog, database, table) -> Table | None: catalog and database. """ full_name = f"{catalog}.{database}.{table}" + logger.debug(f"Fetching metadata for table: {full_name}") try: raw_table = self._external_catalog.getTable(database, table) table_format = raw_table.provider().getOrElse(None) or "UNKNOWN" From 0562be604be485aed61fbaac755949c813a5088e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 19 Sep 2024 17:03:39 +0200 Subject: [PATCH 3/4] Explicit handling of the catalog parameter: only hive_catalog is supported. --- src/databricks/labs/ucx/hive_metastore/tables.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 14b6c6e5de..3405e40817 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -536,6 +536,9 @@ def _describe(self, catalog, database, table) -> Table | None: catalog and database. """ full_name = f"{catalog}.{database}.{table}" + if catalog != "hive_metastore": + msg = f"Only tables in the hive_metastore catalog can be described: {full_name}" + raise ValueError(msg) logger.debug(f"Fetching metadata for table: {full_name}") try: raw_table = self._external_catalog.getTable(database, table) @@ -546,7 +549,7 @@ def _describe(self, catalog, database, table) -> Table | None: is_partitioned = len(list(self._iterator(raw_table.partitionColumnNames()))) > 0 return Table( - catalog='hive_metastore', + catalog=catalog, database=database, name=table, object_type=raw_table.tableType().name(), From afcb94648cdfddf3564eadca68d148b12fc0d29f Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 19 Sep 2024 17:04:22 +0200 Subject: [PATCH 4/4] Fix Scala-Option unpacking and ensure that each property is fetched in its own statement. This aids debugging and pinpointing the origin of errors. --- .../labs/ucx/hive_metastore/tables.py | 39 +++++++++++-------- tests/unit/hive_metastore/test_tables.py | 8 ++-- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 3405e40817..5eff7a1815 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -499,6 +499,10 @@ def _iterator(self, result: typing.Any) -> Iterator: while iterator.hasNext(): yield iterator.next() + @staticmethod + def _option_as_python(scala_option: typing.Any): + return scala_option.get() if scala_option.isDefined() else None + def _all_databases(self) -> list[str]: if not self._include_database: return list(self._iterator(self._external_catalog.listDatabases())) @@ -540,28 +544,31 @@ def _describe(self, catalog, database, table) -> Table | None: msg = f"Only tables in the hive_metastore catalog can be described: {full_name}" raise ValueError(msg) logger.debug(f"Fetching metadata for table: {full_name}") - try: + try: # pylint: disable=too-many-try-statements raw_table = self._external_catalog.getTable(database, table) - table_format = raw_table.provider().getOrElse(None) or "UNKNOWN" - location_uri = raw_table.storage().locationUri().getOrElse(None) + table_format = self._option_as_python(raw_table.provider()) or "UNKNOWN" + location_uri = self._option_as_python(raw_table.storage().locationUri()) if location_uri: location_uri = location_uri.toString() - is_partitioned = len(list(self._iterator(raw_table.partitionColumnNames()))) > 0 - - return Table( - catalog=catalog, - database=database, - name=table, - object_type=raw_table.tableType().name(), - table_format=table_format, - location=location_uri, - view_text=raw_table.viewText(), - storage_properties=self._format_properties_list(list(self._iterator(raw_table.properties()))), - is_partitioned=is_partitioned, - ) + is_partitioned = raw_table.partitionColumnNames().iterator().hasNext() + object_type = raw_table.tableType().name() + view_text = self._option_as_python(raw_table.viewText()) + table_properties = list(self._iterator(raw_table.properties())) + formatted_table_properties = self._format_properties_list(table_properties) except Exception: # pylint: disable=broad-exception-caught logger.warning(f"Couldn't fetch information for table: {full_name}", exc_info=True) return None + return Table( + catalog=catalog, + database=database, + name=table, + object_type=object_type, + table_format=table_format, + location=location_uri, + view_text=view_text, + storage_properties=formatted_table_properties, + is_partitioned=is_partitioned, + ) def _crawl(self) -> Iterable[Table]: """Crawls and lists tables within the specified catalog and database.""" diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index 747f648130..b22f678a5a 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -616,10 +616,12 @@ def product_element_side_effect(index): mock_partition_col_iterator.iterator.return_value = CustomIterator(["age", "name"]) get_table_mock = mocker.Mock() - get_table_mock.provider().getOrElse.return_value = "delta" - get_table_mock.storage().locationUri().getOrElse.return_value = None + get_table_mock.provider().isDefined.return_value = True + get_table_mock.provider().get.return_value = "delta" + get_table_mock.storage().locationUri().isDefined.return_value = False - get_table_mock.viewText.return_value = "mock table text" + get_table_mock.viewText().isDefined.return_value = True + get_table_mock.viewText().get.return_value = "mock table text" get_table_mock.properties.return_value = mock_properties_iterator get_table_mock.partitionColumnNames.return_value = mock_partition_col_iterator