Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make delta format case insensitive #2861

Merged
merged 8 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/databricks/labs/ucx/hive_metastore/locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,10 @@ class TableInMount:
format: str
is_partitioned: bool

def __post_init__(self) -> None:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nfx: I added this fall safe to always have the format as an upper case string.

if isinstance(self.format, str): # Should not happen according to type hint, still safer
self.format = self.format.upper()


class TablesInMounts(CrawlerBase[Table]):
"""Experimental scanner for tables that can be found on mounts.
Expand Down
18 changes: 14 additions & 4 deletions src/databricks/labs/ucx/hive_metastore/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class AclMigrationWhat(Enum):


@dataclass
class Table:
class Table: # pylint: disable=too-many-public-methods
catalog: str
database: str
name: str
Expand Down Expand Up @@ -80,12 +80,22 @@ class Table:

UPGRADED_FROM_WS_PARAM: typing.ClassVar[str] = "upgraded_from_workspace_id"

def __post_init__(self) -> None:
if isinstance(self.table_format, str): # Should not happen according to type hint, still safer
self.table_format = self.table_format.upper()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay...

@dataclass
class Foo:
    table_format: str = dataclasses.field()
    def __post_init__(self):
        if isinstance(self.table_format, str):
            self.table_format = self.table_format.upper()

def test_x():
    f = Foo("parquet")
    assert f.table_format == "PARQUET"

    f = dataclasses.replace(f, table_format="csv")
    assert f.table_format == "CSV"


@property
def is_delta(self) -> bool:
if self.table_format is None:
return False
return self.table_format.upper() == "DELTA"

@property
def is_hive(self) -> bool:
if self.table_format is None:
return False
return self.table_format.upper() == "HIVE"

@property
def key(self) -> str:
if self.is_table_in_mount:
Expand Down Expand Up @@ -163,13 +173,13 @@ def what(self) -> What:
return What.DB_DATASET
if self.is_table_in_mount:
return What.TABLE_IN_MOUNT
if self.is_dbfs_root and self.table_format == "DELTA":
if self.is_dbfs_root and self.is_delta:
return What.DBFS_ROOT_DELTA
if self.is_dbfs_root:
return What.DBFS_ROOT_NON_DELTA
if self.kind == "TABLE" and self.is_format_supported_for_sync:
return What.EXTERNAL_SYNC
if self.kind == "TABLE" and self.table_format.upper() == "HIVE":
if self.kind == "TABLE" and self.is_hive:
return What.EXTERNAL_HIVESERDE
if self.kind == "TABLE":
return What.EXTERNAL_NO_SYNC
Expand All @@ -194,7 +204,7 @@ def sql_migrate_ctas_managed(self, target_table_key) -> str:
)

def hiveserde_type(self, backend: SqlBackend) -> HiveSerdeType:
if self.table_format != "HIVE":
if not self.is_hive:
return HiveSerdeType.NOT_HIVESERDE
# Extract hive serde info, ideally this should be done by table crawler.
# But doing here to avoid breaking change to the `tables` table in the inventory schema.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ WITH table_stats AS (
SELECT
`database`,
object_type,
table_format AS `format`,
UPPER(table_format) AS `format`,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Required for backwards compatibility, i.e. users that already populated the UCX inventory

`location`,
IF(object_type IN ('MANAGED', 'EXTERNAL'), 1, 0) AS is_table,
IF(object_type = 'VIEW', 1, 0) AS is_view,
Expand All @@ -15,7 +15,7 @@ WITH table_stats AS (
ELSE 0
END AS is_dbfs_root,
CASE WHEN STARTSWITH(location, 'wasb') THEN 1 WHEN STARTSWITH(location, 'adl') THEN 1 ELSE 0 END AS is_unsupported,
IF(table_format = 'DELTA', 1, 0) AS is_delta
IF(UPPER(table_format) = 'DELTA', 1, 0) AS is_delta
FROM inventory.tables
), database_stats AS (
SELECT
Expand Down Expand Up @@ -71,4 +71,4 @@ FROM database_stats
FULL JOIN grant_stats
USING (`database`)
ORDER BY
tables DESC
tables DESC
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
SELECT
CONCAT(tables.`database`, '.', tables.name) AS name,
object_type AS type,
table_format AS format,
UPPER(table_format) AS format,
CASE
WHEN STARTSWITH(location, 'dbfs:/mnt')
THEN 'DBFS MOUNT'
Expand All @@ -22,7 +22,7 @@ SELECT
THEN 'UNSUPPORTED'
ELSE 'EXTERNAL'
END AS storage,
IF(table_format = 'DELTA', 'Yes', 'No') AS is_delta,
IF(UPPER(table_format) = 'DELTA', 'Yes', 'No') AS is_delta,
location,
CASE
WHEN size_in_bytes IS NULL
Expand All @@ -41,4 +41,4 @@ SELECT
END AS table_size
FROM inventory.tables AS tables
LEFT OUTER JOIN inventory.table_size AS table_size
ON tables.catalog = table_size.catalog AND tables.database = table_size.database AND tables.name = table_size.name
ON tables.catalog = table_size.catalog AND tables.database = table_size.database AND tables.name = table_size.name
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/queries/views/objects.sql
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ FROM (
TO_JSON(
FILTER(
ARRAY(
IF(NOT STARTSWITH(t.table_format, 'DELTA') AND t.object_type <> 'VIEW', CONCAT('Non-DELTA format: ', t.table_format), NULL),
IF(NOT STARTSWITH(UPPER(t.table_format), 'DELTA') AND t.object_type <> 'VIEW', CONCAT('Non-DELTA format: ', UPPER(t.table_format)), NULL),
IF(STARTSWITH(t.location, 'wasb'), 'Unsupported Storage Type: wasb://', NULL),
IF(STARTSWITH(t.location, 'adl'), 'Unsupported Storage Type: adl://', NULL),
CASE
Expand Down Expand Up @@ -75,4 +75,4 @@ SELECT
'permissions' AS object_type,
object_id,
failures
FROM $inventory.grant_detail
FROM $inventory.grant_detail
13 changes: 11 additions & 2 deletions src/databricks/labs/ucx/queries/views/table_estimates.sql
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ SELECT
THEN 2 /* Can vary depending of view complexity and number of tables used in the view */
ELSE NULL
END AS estimated_hours
FROM $inventory.tables
FROM
(
SELECT
catalog,
database,
name,
UPPER(object_type) AS object_type,
UPPER(table_format) AS table_format
FROM $inventory.tables
)
WHERE
NOT STARTSWITH(name, '__apply_changes')
NOT STARTSWITH(name, '__apply_changes')
4 changes: 2 additions & 2 deletions tests/unit/hive_metastore/test_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ def my_side_effect(path, **_):
client.dbutils.fs.ls.assert_has_calls([call('/mnt/test_mount'), call('/mnt/test_mount/table1/')])


def test_historical_data_should_be_overwritten():
def test_historical_data_should_be_overwritten() -> None:
client = create_autospec(WorkspaceClient)

first_folder = FileInfo("dbfs:/mnt/test_mount/table1/", "table1/", "", "")
Expand Down Expand Up @@ -590,7 +590,7 @@ def my_side_effect(path, **_):
database='database',
name='name',
object_type='object_type',
table_format='table_format',
table_format='TABLE_FORMAT',
location='location',
view_text=None,
upgraded_to=None,
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/hive_metastore/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def test_tables_returning_error_when_show_tables(caplog):
'table,dbfs_root,what',
[
(Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/somelocation/tablename"), True, What.DBFS_ROOT_DELTA),
(Table("a", "b", "c", "MANAGED", "delta", location="dbfs:/somelocation/tablename"), True, What.DBFS_ROOT_DELTA),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@HariGS-DB : This test fails without the code changes in the PR. Could be responsible for #2840

(
Table("a", "b", "c", "MANAGED", "PARQUET", location="dbfs:/somelocation/tablename"),
True,
Expand Down Expand Up @@ -225,7 +226,7 @@ def test_tables_returning_error_when_show_tables(caplog):
(Table("a", "b", "c", "MANAGED", "DELTA", location="adls:/somelocation/tablename"), False, What.EXTERNAL_SYNC),
],
)
def test_is_dbfs_root(table, dbfs_root, what):
def test_is_dbfs_root(table, dbfs_root, what) -> None:
assert table.is_dbfs_root == dbfs_root
assert table.what == what

Expand Down
Loading