Skip to content

Commit

Permalink
Make delta format case sensitive (#2861)
Browse files Browse the repository at this point in the history
## Changes
Make delta format case sensitive

### Linked issues
Resolves #2858
Relevant to #2840

### Functionality

- [x] all table format related code

### Tests

- [x] added unit tests
- [ ] verified on staging environment (screenshot attached)
  • Loading branch information
JCZuurmond authored Oct 8, 2024
1 parent 2531122 commit 4162830
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 17 deletions.
4 changes: 4 additions & 0 deletions src/databricks/labs/ucx/hive_metastore/locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,10 @@ class TableInMount:
format: str
is_partitioned: bool

def __post_init__(self) -> None:
if isinstance(self.format, str): # Should not happen according to type hint, still safer
self.format = self.format.upper()


class TablesInMounts(CrawlerBase[Table]):
"""Experimental scanner for tables that can be found on mounts.
Expand Down
18 changes: 14 additions & 4 deletions src/databricks/labs/ucx/hive_metastore/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class AclMigrationWhat(Enum):


@dataclass
class Table:
class Table: # pylint: disable=too-many-public-methods
catalog: str
database: str
name: str
Expand Down Expand Up @@ -80,12 +80,22 @@ class Table:

UPGRADED_FROM_WS_PARAM: typing.ClassVar[str] = "upgraded_from_workspace_id"

def __post_init__(self) -> None:
if isinstance(self.table_format, str): # Should not happen according to type hint, still safer
self.table_format = self.table_format.upper()

@property
def is_delta(self) -> bool:
if self.table_format is None:
return False
return self.table_format.upper() == "DELTA"

@property
def is_hive(self) -> bool:
if self.table_format is None:
return False
return self.table_format.upper() == "HIVE"

@property
def key(self) -> str:
if self.is_table_in_mount:
Expand Down Expand Up @@ -163,13 +173,13 @@ def what(self) -> What:
return What.DB_DATASET
if self.is_table_in_mount:
return What.TABLE_IN_MOUNT
if self.is_dbfs_root and self.table_format == "DELTA":
if self.is_dbfs_root and self.is_delta:
return What.DBFS_ROOT_DELTA
if self.is_dbfs_root:
return What.DBFS_ROOT_NON_DELTA
if self.kind == "TABLE" and self.is_format_supported_for_sync:
return What.EXTERNAL_SYNC
if self.kind == "TABLE" and self.table_format.upper() == "HIVE":
if self.kind == "TABLE" and self.is_hive:
return What.EXTERNAL_HIVESERDE
if self.kind == "TABLE":
return What.EXTERNAL_NO_SYNC
Expand All @@ -194,7 +204,7 @@ def sql_migrate_ctas_managed(self, target_table_key) -> str:
)

def hiveserde_type(self, backend: SqlBackend) -> HiveSerdeType:
if self.table_format != "HIVE":
if not self.is_hive:
return HiveSerdeType.NOT_HIVESERDE
# Extract hive serde info, ideally this should be done by table crawler.
# But doing here to avoid breaking change to the `tables` table in the inventory schema.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ WITH table_stats AS (
SELECT
`database`,
object_type,
table_format AS `format`,
UPPER(table_format) AS `format`,
`location`,
IF(object_type IN ('MANAGED', 'EXTERNAL'), 1, 0) AS is_table,
IF(object_type = 'VIEW', 1, 0) AS is_view,
Expand All @@ -15,7 +15,7 @@ WITH table_stats AS (
ELSE 0
END AS is_dbfs_root,
CASE WHEN STARTSWITH(location, 'wasb') THEN 1 WHEN STARTSWITH(location, 'adl') THEN 1 ELSE 0 END AS is_unsupported,
IF(table_format = 'DELTA', 1, 0) AS is_delta
IF(UPPER(table_format) = 'DELTA', 1, 0) AS is_delta
FROM inventory.tables
), database_stats AS (
SELECT
Expand Down Expand Up @@ -71,4 +71,4 @@ FROM database_stats
FULL JOIN grant_stats
USING (`database`)
ORDER BY
tables DESC
tables DESC
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
SELECT
CONCAT(tables.`database`, '.', tables.name) AS name,
object_type AS type,
table_format AS format,
UPPER(table_format) AS format,
CASE
WHEN STARTSWITH(location, 'dbfs:/mnt')
THEN 'DBFS MOUNT'
Expand All @@ -22,7 +22,7 @@ SELECT
THEN 'UNSUPPORTED'
ELSE 'EXTERNAL'
END AS storage,
IF(table_format = 'DELTA', 'Yes', 'No') AS is_delta,
IF(UPPER(table_format) = 'DELTA', 'Yes', 'No') AS is_delta,
location,
CASE
WHEN size_in_bytes IS NULL
Expand All @@ -41,4 +41,4 @@ SELECT
END AS table_size
FROM inventory.tables AS tables
LEFT OUTER JOIN inventory.table_size AS table_size
ON tables.catalog = table_size.catalog AND tables.database = table_size.database AND tables.name = table_size.name
ON tables.catalog = table_size.catalog AND tables.database = table_size.database AND tables.name = table_size.name
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/queries/views/objects.sql
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ FROM (
TO_JSON(
FILTER(
ARRAY(
IF(NOT STARTSWITH(t.table_format, 'DELTA') AND t.object_type <> 'VIEW', CONCAT('Non-DELTA format: ', t.table_format), NULL),
IF(NOT STARTSWITH(UPPER(t.table_format), 'DELTA') AND t.object_type <> 'VIEW', CONCAT('Non-DELTA format: ', UPPER(t.table_format)), NULL),
IF(STARTSWITH(t.location, 'wasb'), 'Unsupported Storage Type: wasb://', NULL),
IF(STARTSWITH(t.location, 'adl'), 'Unsupported Storage Type: adl://', NULL),
CASE
Expand Down Expand Up @@ -75,4 +75,4 @@ SELECT
'permissions' AS object_type,
object_id,
failures
FROM $inventory.grant_detail
FROM $inventory.grant_detail
13 changes: 11 additions & 2 deletions src/databricks/labs/ucx/queries/views/table_estimates.sql
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ SELECT
THEN 2 /* Can vary depending of view complexity and number of tables used in the view */
ELSE NULL
END AS estimated_hours
FROM $inventory.tables
FROM
(
SELECT
catalog,
database,
name,
UPPER(object_type) AS object_type,
UPPER(table_format) AS table_format
FROM $inventory.tables
)
WHERE
NOT STARTSWITH(name, '__apply_changes')
NOT STARTSWITH(name, '__apply_changes')
4 changes: 2 additions & 2 deletions tests/unit/hive_metastore/test_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ def my_side_effect(path, **_):
client.dbutils.fs.ls.assert_has_calls([call('/mnt/test_mount'), call('/mnt/test_mount/table1/')])


def test_historical_data_should_be_overwritten():
def test_historical_data_should_be_overwritten() -> None:
client = create_autospec(WorkspaceClient)

first_folder = FileInfo("dbfs:/mnt/test_mount/table1/", "table1/", "", "")
Expand Down Expand Up @@ -590,7 +590,7 @@ def my_side_effect(path, **_):
database='database',
name='name',
object_type='object_type',
table_format='table_format',
table_format='TABLE_FORMAT',
location='location',
view_text=None,
upgraded_to=None,
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/hive_metastore/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def test_tables_returning_error_when_show_tables(caplog):
'table,dbfs_root,what',
[
(Table("a", "b", "c", "MANAGED", "DELTA", location="dbfs:/somelocation/tablename"), True, What.DBFS_ROOT_DELTA),
(Table("a", "b", "c", "MANAGED", "delta", location="dbfs:/somelocation/tablename"), True, What.DBFS_ROOT_DELTA),
(
Table("a", "b", "c", "MANAGED", "PARQUET", location="dbfs:/somelocation/tablename"),
True,
Expand Down Expand Up @@ -225,7 +226,7 @@ def test_tables_returning_error_when_show_tables(caplog):
(Table("a", "b", "c", "MANAGED", "DELTA", location="adls:/somelocation/tablename"), False, What.EXTERNAL_SYNC),
],
)
def test_is_dbfs_root(table, dbfs_root, what):
def test_is_dbfs_root(table, dbfs_root, what) -> None:
assert table.is_dbfs_root == dbfs_root
assert table.what == what

Expand Down

0 comments on commit 4162830

Please sign in to comment.