Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast table crawler: suggestions/fixes for describing tables #2684

Merged
merged 6 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 29 additions & 18 deletions src/databricks/labs/ucx/hive_metastore/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,10 @@ def _iterator(self, result: typing.Any) -> Iterator:
while iterator.hasNext():
yield iterator.next()

@staticmethod
def _option_as_python(scala_option: typing.Any):
JCZuurmond marked this conversation as resolved.
Show resolved Hide resolved
return scala_option.get() if scala_option.isDefined() else None

def _all_databases(self) -> list[str]:
if not self._include_database:
return list(self._iterator(self._external_catalog.listDatabases()))
Expand Down Expand Up @@ -536,28 +540,35 @@ def _describe(self, catalog, database, table) -> Table | None:
catalog and database.
"""
full_name = f"{catalog}.{database}.{table}"
JCZuurmond marked this conversation as resolved.
Show resolved Hide resolved
try:
if catalog != "hive_metastore":
msg = f"Only tables in the hive_metastore catalog can be described: {full_name}"
raise ValueError(msg)
logger.debug(f"Fetching metadata for table: {full_name}")
try: # pylint: disable=too-many-try-statements
raw_table = self._external_catalog.getTable(database, table)
table_format = raw_table.provider().getOrElse(None) or "UNKNOWN"
location_uri = raw_table.storage().locationUri().getOrElse(None)
table_format = self._option_as_python(raw_table.provider()) or "UNKNOWN"
location_uri = self._option_as_python(raw_table.storage().locationUri())
if location_uri:
location_uri = location_uri.toString()
is_partitioned = len(list(self._iterator(raw_table.partitionColumnNames()))) > 0

return Table(
catalog='hive_metastore',
database=database,
name=table,
object_type=raw_table.tableType().name(),
table_format=table_format,
location=location_uri,
view_text=raw_table.viewText(),
storage_properties=self._format_properties_list(list(self._iterator(raw_table.properties()))),
is_partitioned=is_partitioned,
)
except Exception as e: # pylint: disable=broad-exception-caught
logger.warning(f"Couldn't fetch information for table {full_name} : {e}")
is_partitioned = raw_table.partitionColumnNames().iterator().hasNext()
object_type = raw_table.tableType().name()
view_text = self._option_as_python(raw_table.viewText())
table_properties = list(self._iterator(raw_table.properties()))
formatted_table_properties = self._format_properties_list(table_properties)
except Exception: # pylint: disable=broad-exception-caught
logger.warning(f"Couldn't fetch information for table: {full_name}", exc_info=True)
Comment on lines +558 to +559
Copy link
Member

@JCZuurmond JCZuurmond Sep 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has my preference since it is more clear which exception info is added to the log message. Though, it does not matter

Suggested change
except Exception: # pylint: disable=broad-exception-caught
logger.warning(f"Couldn't fetch information for table: {full_name}", exc_info=True)
except Exception as e: # pylint: disable=broad-exception-caught
logger.warning(f"Couldn't fetch information for table: {full_name}", exc_info=e)

return None
return Table(
catalog=catalog,
database=database,
name=table,
object_type=object_type,
table_format=table_format,
location=location_uri,
view_text=view_text,
storage_properties=formatted_table_properties,
is_partitioned=is_partitioned,
)

def _crawl(self) -> Iterable[Table]:
"""Crawls and lists tables within the specified catalog and database."""
Expand Down
8 changes: 5 additions & 3 deletions tests/unit/hive_metastore/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,10 +616,12 @@ def product_element_side_effect(index):
mock_partition_col_iterator.iterator.return_value = CustomIterator(["age", "name"])

get_table_mock = mocker.Mock()
get_table_mock.provider().getOrElse.return_value = "delta"
get_table_mock.storage().locationUri().getOrElse.return_value = None
get_table_mock.provider().isDefined.return_value = True
get_table_mock.provider().get.return_value = "delta"
get_table_mock.storage().locationUri().isDefined.return_value = False

get_table_mock.viewText.return_value = "mock table text"
get_table_mock.viewText().isDefined.return_value = True
get_table_mock.viewText().get.return_value = "mock table text"
get_table_mock.properties.return_value = mock_properties_iterator
get_table_mock.partitionColumnNames.return_value = mock_partition_col_iterator

Expand Down
Loading