From baafbb4755ff91e884a428733a2c430fde76a064 Mon Sep 17 00:00:00 2001 From: Amin Movahed Date: Tue, 24 Sep 2024 15:15:52 +1000 Subject: [PATCH 01/19] Adding unskip CLI command to undo a skip on schema or a table --- src/databricks/labs/ucx/cli.py | 11 +++++++++++ src/databricks/labs/ucx/hive_metastore/mapping.py | 11 +++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index a8a821d586..c24de41242 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -97,6 +97,17 @@ def skip(w: WorkspaceClient, schema: str | None = None, table: str | None = None return ctx.table_mapping.skip_table_or_view(schema, table, ctx.tables_crawler.load_one) return ctx.table_mapping.skip_schema(schema) +def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = None): + """Create a unskip comment on a schema or a table""" + logger.info("Running unskip command") + if not schema: + logger.error("--schema is a required parameter.") + return None + ctx = WorkspaceContext(w) + if table: + return ctx.table_mapping.skip_table_or_view(schema, table, ctx.tables_crawler.load_one, unskip=True) + return ctx.table_mapping.skip_schema(schema, unskip=True) + @ucx.command(is_account=True) def sync_workspace_info(a: AccountClient): diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 655172e388..4b9825ecab 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -116,14 +116,15 @@ def load(self) -> list[Rule]: msg = "Please run: databricks labs ucx table-mapping" raise ValueError(msg) from None - def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): + def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None], unskip: bool = False): # Marks a table to be skipped in the migration process by applying a table property + unskip = "false" if unskip else "true" try: table = load_table(schema_name, table_name) if table is None: raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") self._sql_backend.execute( - f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} SET TBLPROPERTIES('{self.UCX_SKIP_PROPERTY}' = true)" + f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} SET TBLPROPERTIES('{self.UCX_SKIP_PROPERTY}' = {unskip})" ) except NotFound as err: if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): @@ -135,11 +136,12 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) - def skip_schema(self, schema: str): + def skip_schema(self, schema: str, unskip: bool = False): # Marks a schema to be skipped in the migration process by applying a table property + unskip = "false" if unskip else "true" try: self._sql_backend.execute( - f"ALTER SCHEMA {escape_sql_identifier(schema)} SET DBPROPERTIES('{self.UCX_SKIP_PROPERTY}' = true)" + f"ALTER SCHEMA {escape_sql_identifier(schema)} SET DBPROPERTIES('{self.UCX_SKIP_PROPERTY}' = {unskip})" ) except NotFound as err: if "[SCHEMA_NOT_FOUND]" in str(err): @@ -209,6 +211,7 @@ def _get_table_in_scope_task(self, table_to_migrate: TableToMigrate) -> TableToM return None for 
value in properties: + #TODO : there should be a check to see if this property is equal to true then skip and if false then ignore if value["key"] == self.UCX_SKIP_PROPERTY: logger.info(f"{table.key} is marked to be skipped") return None From 13c9ef65e5a37bea6e90cc8501563b9c0a037941 Mon Sep 17 00:00:00 2001 From: Amin Movahed Date: Tue, 24 Sep 2024 15:21:30 +1000 Subject: [PATCH 02/19] Added a missing cli command decorator --- src/databricks/labs/ucx/cli.py | 2 ++ src/databricks/labs/ucx/hive_metastore/mapping.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index c24de41242..ae6185c12c 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -97,6 +97,8 @@ def skip(w: WorkspaceClient, schema: str | None = None, table: str | None = None return ctx.table_mapping.skip_table_or_view(schema, table, ctx.tables_crawler.load_one) return ctx.table_mapping.skip_schema(schema) + +@ucx.command def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = None): """Create a unskip comment on a schema or a table""" logger.info("Running unskip command") diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 4b9825ecab..7e364949c6 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -116,15 +116,17 @@ def load(self) -> list[Rule]: msg = "Please run: databricks labs ucx table-mapping" raise ValueError(msg) from None - def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None], unskip: bool = False): + def skip_table_or_view( + self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None], unskip: bool = False + ): # Marks a table to be skipped in the migration process by applying a table property - unskip = "false" if unskip else "true" + skip_flag = "false" if unskip else "true" try: table = load_table(schema_name, table_name) if table is None: raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") self._sql_backend.execute( - f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} SET TBLPROPERTIES('{self.UCX_SKIP_PROPERTY}' = {unskip})" + f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} SET TBLPROPERTIES('{self.UCX_SKIP_PROPERTY}' = {skip_flag})" ) except NotFound as err: if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): @@ -138,10 +140,10 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call def skip_schema(self, schema: str, unskip: bool = False): # Marks a schema to be skipped in the migration process by applying a table property - unskip = "false" if unskip else "true" + skip_flag = "false" if unskip else "true" try: self._sql_backend.execute( - f"ALTER SCHEMA {escape_sql_identifier(schema)} SET DBPROPERTIES('{self.UCX_SKIP_PROPERTY}' = {unskip})" + f"ALTER SCHEMA {escape_sql_identifier(schema)} SET DBPROPERTIES('{self.UCX_SKIP_PROPERTY}' = {skip_flag})" ) except NotFound as err: if "[SCHEMA_NOT_FOUND]" in str(err): @@ -211,7 +213,7 @@ def _get_table_in_scope_task(self, table_to_migrate: TableToMigrate) -> TableToM return None for value in properties: - #TODO : there should be a check to see if this property is equal to true then skip and if false then ignore + # TODO : there should be a check to see if this 
property is equal to true then skip and if false then ignore if value["key"] == self.UCX_SKIP_PROPERTY: logger.info(f"{table.key} is marked to be skipped") return None From ca95837ff3a1289655adf2a8964fa2361ff158e8 Mon Sep 17 00:00:00 2001 From: Amin Movahed Date: Tue, 24 Sep 2024 16:07:28 +1000 Subject: [PATCH 03/19] Another approach added --- src/databricks/labs/ucx/cli.py | 4 +- .../labs/ucx/hive_metastore/mapping.py | 45 +++++++++++++++---- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index ae6185c12c..9b438ee408 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -107,8 +107,8 @@ def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = No return None ctx = WorkspaceContext(w) if table: - return ctx.table_mapping.skip_table_or_view(schema, table, ctx.tables_crawler.load_one, unskip=True) - return ctx.table_mapping.skip_schema(schema, unskip=True) + return ctx.table_mapping.unskip_table_or_view(schema, table, ctx.tables_crawler.load_one) + return ctx.table_mapping.unskip_schema(schema) @ucx.command(is_account=True) diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 7e364949c6..7dc5fd2325 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -116,17 +116,14 @@ def load(self) -> list[Rule]: msg = "Please run: databricks labs ucx table-mapping" raise ValueError(msg) from None - def skip_table_or_view( - self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None], unskip: bool = False - ): + def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): # Marks a table to be skipped in the migration process by applying a table property - skip_flag = "false" if unskip else "true" try: table = load_table(schema_name, table_name) if table is None: raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") self._sql_backend.execute( - f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} SET TBLPROPERTIES('{self.UCX_SKIP_PROPERTY}' = {skip_flag})" + f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} SET TBLPROPERTIES('{self.UCX_SKIP_PROPERTY}' = true)" ) except NotFound as err: if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): @@ -138,12 +135,30 @@ def skip_table_or_view( except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) - def skip_schema(self, schema: str, unskip: bool = False): + def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): + # Removes skip mark from the table property + try: + table = load_table(schema_name, table_name) + if table is None: + raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") + self._sql_backend.execute( + f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}' );" + ) + except NotFound as err: + if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): + logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}. 
Table not found.") + else: + logger.error( + f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True + ) + except BadRequest as err: + logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + + def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property - skip_flag = "false" if unskip else "true" try: self._sql_backend.execute( - f"ALTER SCHEMA {escape_sql_identifier(schema)} SET DBPROPERTIES('{self.UCX_SKIP_PROPERTY}' = {skip_flag})" + f"ALTER SCHEMA {escape_sql_identifier(schema)} SET DBPROPERTIES('{self.UCX_SKIP_PROPERTY}' = true)" ) except NotFound as err: if "[SCHEMA_NOT_FOUND]" in str(err): @@ -153,6 +168,20 @@ def skip_schema(self, schema: str, unskip: bool = False): except BadRequest as err: logger.error(err) + def unskip_schema(self, schema: str): + # Removes skip mark from the schema property + try: + self._sql_backend.execute( + f"ALTER SCHEMA {escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" + ) + except NotFound as err: + if "[SCHEMA_NOT_FOUND]" in str(err): + logger.error(f"Failed to remove skip marker for Schema {schema}. Schema not found.") + else: + logger.error(err) + except BadRequest as err: + logger.error(err) + def get_tables_to_migrate(self, tables_crawler: TablesCrawler) -> Collection[TableToMigrate]: rules = self.load() # Getting all the source tables from the rules From b26e1f493d8cebf37d42d6ae59feb258f6706726 Mon Sep 17 00:00:00 2001 From: Amin Movahed Date: Tue, 24 Sep 2024 16:14:43 +1000 Subject: [PATCH 04/19] Another approach added --- src/databricks/labs/ucx/hive_metastore/mapping.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 7dc5fd2325..94f33a97fe 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -242,7 +242,6 @@ def _get_table_in_scope_task(self, table_to_migrate: TableToMigrate) -> TableToM return None for value in properties: - # TODO : there should be a check to see if this property is equal to true then skip and if false then ignore if value["key"] == self.UCX_SKIP_PROPERTY: logger.info(f"{table.key} is marked to be skipped") return None From 6b463a6e212e152de4561d61995e8bde39cd2c39 Mon Sep 17 00:00:00 2001 From: Amin Movahed Date: Tue, 24 Sep 2024 21:38:32 +1000 Subject: [PATCH 05/19] Add documentation --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 5589fbe36d..ed069c2261 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. 
* [`migrate-locations` command](#migrate-locations-command) * [`create-table-mapping` command](#create-table-mapping-command) * [`skip` command](#skip-command) + * [`unskip` command](#unskip-command) * [`create-catalogs-schemas` command](#create-catalogs-schemas-command) * [`migrate-tables` command](#migrate-tables-command) * [`revert-migrated-tables` command](#revert-migrated-tables-command) @@ -1420,6 +1421,15 @@ Once you're done with table migration, proceed to the [code migration](#code-mig [[back to top](#databricks-labs-ucx)] +## `unskip` command + +```text +databricks labs ucx unskip --schema X [--table Y] +``` +This command is useful to re-enable migration of a particular schema or table that has been already marked by [`skip` command](#skip-command). + +[[back to top](#databricks-labs-ucx)] + ## `create-catalogs-schemas` command ```text From 9641aa56997d354286bb8cb60e21d6b37709e01f Mon Sep 17 00:00:00 2001 From: Amin Movahed Date: Tue, 24 Sep 2024 22:15:16 +1000 Subject: [PATCH 06/19] Unit test added --- .../labs/ucx/hive_metastore/mapping.py | 4 ++-- tests/unit/hive_metastore/test_mapping.py | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 94f33a97fe..9a192e218b 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -142,8 +142,8 @@ def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Ca if table is None: raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") self._sql_backend.execute( - f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}' );" - ) + f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" +) except NotFound as err: if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}. 
Table not found.") diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index 0ea2951131..d6fb393f10 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -211,6 +211,29 @@ def test_skip_happy_path(caplog): assert len(caplog.records) == 0 +def test_unskip(caplog): + ws = create_autospec(WorkspaceClient) + sbe = create_autospec(SqlBackend) + installation = MockInstallation() + mapping = TableMapping(installation, ws, sbe) + table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv") + mapping.unskip_table_or_view(schema_name="schema", table_name="table", load_table=lambda _schema, _table: table) + sbe.execute.assert_called_with( + f"ALTER TABLE `schema`.`table` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + ) + view = Table( + catalog="catalog", database="schema", name="table", object_type="table", table_format="csv", view_text="stuff" + ) + mapping.unskip_table_or_view(schema_name="schema", table_name="view", load_table=lambda _schema, _table: view) + sbe.execute.assert_called_with( + f"ALTER VIEW `schema`.`view` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + ) + assert len(caplog.records) == 0 + mapping.unskip_schema(schema="schema") + sbe.execute.assert_called_with(f"ALTER SCHEMA `schema` UNSET DBPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');") + assert len(caplog.records) == 0 + + def test_skip_missing_schema(caplog): ws = create_autospec(WorkspaceClient) sbe = create_autospec(SqlBackend) From e62d9bcaba613454ef143708fe2ac0068c1842e9 Mon Sep 17 00:00:00 2001 From: Amin Movahed Date: Wed, 25 Sep 2024 15:18:38 +1000 Subject: [PATCH 07/19] Incorporating the review comments --- README.md | 4 +- src/databricks/labs/ucx/cli.py | 2 +- .../labs/ucx/hive_metastore/mapping.py | 43 +++++++++++-------- tests/unit/hive_metastore/test_mapping.py | 30 +++++++------ 4 files changed, 46 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index ed069c2261..bb455d311c 100644 --- a/README.md +++ b/README.md @@ -1423,10 +1423,10 @@ Once you're done with table migration, proceed to the [code migration](#code-mig ## `unskip` command -```text +```commandline databricks labs ucx unskip --schema X [--table Y] ``` -This command is useful to re-enable migration of a particular schema or table that has been already marked by [`skip` command](#skip-command). +This command removes the mark set by the [`skip` command](#skip-command) on the given schema or table. 
[[back to top](#databricks-labs-ucx)] diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 9b438ee408..ef046ecc26 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -100,7 +100,7 @@ def skip(w: WorkspaceClient, schema: str | None = None, table: str | None = None @ucx.command def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = None): - """Create a unskip comment on a schema or a table""" + """Unset the skip mark from a schema or a table""" logger.info("Running unskip command") if not schema: logger.error("--schema is a required parameter.") diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 9a192e218b..bf17f5437b 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -135,8 +135,13 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) - def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): - # Removes skip mark from the table property + def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]) -> None: + """Removes skip mark from the table property.. + Args: + schema_name (String): The schema name of the table to be unskipped. + table_name (String): The table name of the table to be unskipped. + load_table (Callable): A function that loads a table from the metastore. + """ try: table = load_table(schema_name, table_name) if table is None: @@ -144,15 +149,14 @@ def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Ca self._sql_backend.execute( f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" ) - except NotFound as err: - if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): - logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}. Table not found.") + except NotFound as e: + if "[TABLE_OR_VIEW_NOT_FOUND]" in str(e) or "[DELTA_TABLE_NOT_FOUND]" in str(e): + logger.error(f"Failed to remove skip marker from table: {schema_name}.{table_name}. Table not found.", exc_info=e) else: logger.error( - f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True - ) - except BadRequest as err: - logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + f"Failed to remove skip marker from table: {schema_name}.{table_name}", exc_info=e) + except BadRequest as e: + logger.error(f"Failed to remove skip marker from table: {schema_name}.{table_name}: {e!s}", exc_info=e) def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property @@ -168,19 +172,22 @@ def skip_schema(self, schema: str): except BadRequest as err: logger.error(err) - def unskip_schema(self, schema: str): - # Removes skip mark from the schema property + def unskip_schema(self, schema: str) -> None: + """Removes skip mark from the schema property. + Args: + schema_name (String): The schema name of the table to be unskipped. 
+ """ try: self._sql_backend.execute( - f"ALTER SCHEMA {escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" + f"ALTER SCHEMA hive_metastore.{escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" ) - except NotFound as err: - if "[SCHEMA_NOT_FOUND]" in str(err): - logger.error(f"Failed to remove skip marker for Schema {schema}. Schema not found.") + except NotFound as e: + if "[SCHEMA_NOT_FOUND]" in str(e): + logger.error(f"Failed to remove skip marker from schema: {schema}. Schema not found.", exc_info=e) else: - logger.error(err) - except BadRequest as err: - logger.error(err) + logger.error(f"Failed to remove skip marker from schema: {schema}.", exc_info=e) + except BadRequest as e: + logger.error(f"Failed to remove skip marker from schema: {schema}.", exc_info=e) def get_tables_to_migrate(self, tables_crawler: TablesCrawler) -> Collection[TableToMigrate]: rules = self.load() diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index d6fb393f10..2d54e83206 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -211,27 +211,33 @@ def test_skip_happy_path(caplog): assert len(caplog.records) == 0 -def test_unskip(caplog): +def test_unskip_on_table(): ws = create_autospec(WorkspaceClient) - sbe = create_autospec(SqlBackend) + mock_backend = MockBackend() installation = MockInstallation() - mapping = TableMapping(installation, ws, sbe) + mapping = TableMapping(installation, ws, mock_backend) table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv") mapping.unskip_table_or_view(schema_name="schema", table_name="table", load_table=lambda _schema, _table: table) - sbe.execute.assert_called_with( - f"ALTER TABLE `schema`.`table` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" - ) + f"ALTER TABLE `schema`.`table` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" in mock_backend.queries + +def test_unskip_on_view(): + ws = create_autospec(WorkspaceClient) + mock_backend = MockBackend() + installation = MockInstallation() + mapping = TableMapping(installation, ws, mock_backend) view = Table( catalog="catalog", database="schema", name="table", object_type="table", table_format="csv", view_text="stuff" ) mapping.unskip_table_or_view(schema_name="schema", table_name="view", load_table=lambda _schema, _table: view) - sbe.execute.assert_called_with( - f"ALTER VIEW `schema`.`view` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" - ) - assert len(caplog.records) == 0 + f"ALTER VIEW `schema`.`view` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" in mock_backend.queries + +def test_unskip_on_schema(): + ws = create_autospec(WorkspaceClient) + mock_backend = MockBackend() + installation = MockInstallation() + mapping = TableMapping(installation, ws, mock_backend) mapping.unskip_schema(schema="schema") - sbe.execute.assert_called_with(f"ALTER SCHEMA `schema` UNSET DBPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');") - assert len(caplog.records) == 0 + f"ALTER SCHEMA hive_metastore.`schema` UNSET DBPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" in mock_backend.queries def test_skip_missing_schema(caplog): From de1fbc4a06b56c3289060a1f66b447c321f51214 Mon Sep 17 00:00:00 2001 From: Amin Movahed Date: Wed, 25 Sep 2024 15:45:13 +1000 Subject: [PATCH 08/19] Incorporating the review comments --- .../labs/ucx/hive_metastore/mapping.py 
| 31 ++++++++++--------- tests/unit/hive_metastore/test_mapping.py | 20 ++++++++++-- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index bf17f5437b..bde1cdaf89 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -135,26 +135,29 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) - def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]) -> None: + def unskip_table_or_view( + self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None] + ) -> None: """Removes skip mark from the table property.. - Args: - schema_name (String): The schema name of the table to be unskipped. - table_name (String): The table name of the table to be unskipped. - load_table (Callable): A function that loads a table from the metastore. + Args: + schema_name (String): The schema name of the table to be unskipped. + table_name (String): The table name of the table to be unskipped. + load_table (Callable): A function that loads a table from the metastore. """ + table = load_table(schema_name, table_name) + if table is None: + raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") try: - table = load_table(schema_name, table_name) - if table is None: - raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") self._sql_backend.execute( f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" -) + ) except NotFound as e: if "[TABLE_OR_VIEW_NOT_FOUND]" in str(e) or "[DELTA_TABLE_NOT_FOUND]" in str(e): - logger.error(f"Failed to remove skip marker from table: {schema_name}.{table_name}. Table not found.", exc_info=e) - else: logger.error( - f"Failed to remove skip marker from table: {schema_name}.{table_name}", exc_info=e) + f"Failed to remove skip marker from table: {schema_name}.{table_name}. Table not found.", exc_info=e + ) + else: + logger.error(f"Failed to remove skip marker from table: {schema_name}.{table_name}", exc_info=e) except BadRequest as e: logger.error(f"Failed to remove skip marker from table: {schema_name}.{table_name}: {e!s}", exc_info=e) @@ -174,8 +177,8 @@ def skip_schema(self, schema: str): def unskip_schema(self, schema: str) -> None: """Removes skip mark from the schema property. - Args: - schema_name (String): The schema name of the table to be unskipped. + Args: + schema (String): The schema name of the table to be unskipped. 
""" try: self._sql_backend.execute( diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index 2d54e83206..0734a9ae03 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -218,7 +218,12 @@ def test_unskip_on_table(): mapping = TableMapping(installation, ws, mock_backend) table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv") mapping.unskip_table_or_view(schema_name="schema", table_name="table", load_table=lambda _schema, _table: table) - f"ALTER TABLE `schema`.`table` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" in mock_backend.queries + ws.tables.get.assert_not_called() + assert ( + f"ALTER TABLE `schema`.`table` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) + def test_unskip_on_view(): ws = create_autospec(WorkspaceClient) @@ -229,7 +234,12 @@ def test_unskip_on_view(): catalog="catalog", database="schema", name="table", object_type="table", table_format="csv", view_text="stuff" ) mapping.unskip_table_or_view(schema_name="schema", table_name="view", load_table=lambda _schema, _table: view) - f"ALTER VIEW `schema`.`view` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" in mock_backend.queries + ws.tables.get.assert_not_called() + assert ( + f"ALTER VIEW `schema`.`view` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) + def test_unskip_on_schema(): ws = create_autospec(WorkspaceClient) @@ -237,7 +247,11 @@ def test_unskip_on_schema(): installation = MockInstallation() mapping = TableMapping(installation, ws, mock_backend) mapping.unskip_schema(schema="schema") - f"ALTER SCHEMA hive_metastore.`schema` UNSET DBPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" in mock_backend.queries + ws.tables.get.assert_not_called() + assert ( + f"ALTER SCHEMA hive_metastore.`schema` UNSET DBPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" + in mock_backend.queries + ) def test_skip_missing_schema(caplog): From 589d4eb9876d20521d314e5997507e740aec3ec6 Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 07:02:14 +0100 Subject: [PATCH 09/19] initial draft --- src/databricks/labs/ucx/aws/locations.py | 4 ---- src/databricks/labs/ucx/azure/locations.py | 4 ---- src/databricks/labs/ucx/cli.py | 1 + src/databricks/labs/ucx/contexts/workspace_cli.py | 2 -- tests/integration/aws/test_access.py | 4 ++-- tests/integration/azure/test_locations.py | 8 ++++---- tests/unit/aws/test_access.py | 4 ++-- tests/unit/azure/test_locations.py | 10 ++-------- 8 files changed, 11 insertions(+), 26 deletions(-) diff --git a/src/databricks/labs/ucx/aws/locations.py b/src/databricks/labs/ucx/aws/locations.py index c87c1f79a8..af10fb2e0d 100644 --- a/src/databricks/labs/ucx/aws/locations.py +++ b/src/databricks/labs/ucx/aws/locations.py @@ -5,7 +5,6 @@ from databricks.labs.ucx.assessment.aws import AWSRoleAction from databricks.labs.ucx.aws.access import AWSResourcePermissions from databricks.labs.ucx.hive_metastore import ExternalLocations -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL from databricks.labs.ucx.hive_metastore.locations import ExternalLocation from databricks.sdk import WorkspaceClient @@ -20,12 +19,10 @@ def __init__( ws: WorkspaceClient, external_locations: ExternalLocations, aws_resource_permissions: AWSResourcePermissions, - principal_acl: PrincipalACL, ): self._ws = ws 
self._external_locations = external_locations self._aws_resource_permissions = aws_resource_permissions - self._principal_acl = principal_acl def run(self) -> None: """ @@ -53,7 +50,6 @@ def run(self) -> None: credential_dict[role_arn], skip_validation=True, ) - self._principal_acl.apply_location_acl() @staticmethod def _generate_external_location_name(path: str) -> str: diff --git a/src/databricks/labs/ucx/azure/locations.py b/src/databricks/labs/ucx/azure/locations.py index 295e3cc32e..42c952c2dc 100644 --- a/src/databricks/labs/ucx/azure/locations.py +++ b/src/databricks/labs/ucx/azure/locations.py @@ -6,7 +6,6 @@ from databricks.labs.ucx.azure.access import AzureResourcePermissions from databricks.labs.ucx.azure.resources import AzureResources from databricks.labs.ucx.hive_metastore import ExternalLocations -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL logger = logging.getLogger(__name__) @@ -19,13 +18,11 @@ def __init__( hms_locations: ExternalLocations, resource_permissions: AzureResourcePermissions, azurerm: AzureResources, - principal_acl: PrincipalACL, ): self._ws = ws self._hms_locations = hms_locations self._resource_permissions = resource_permissions self._azurerm = azurerm - self._principal_acl = principal_acl def _app_id_credential_name_mapping(self) -> tuple[dict[str, str], dict[str, str]]: # list all storage credentials. @@ -201,7 +198,6 @@ def run(self) -> list[str]: migrated_loc_urls.append(migrated_loc_url) leftover_loc_urls = [url for url in missing_loc_urls if url not in migrated_loc_urls] - self._principal_acl.apply_location_acl() if leftover_loc_urls: logger.info( "External locations below are not created in UC. You may check following cases and rerun this command:" diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 9b438ee408..78600e2d82 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -506,6 +506,7 @@ def migrate_locations( for workspace_context in workspace_contexts: if workspace_context.is_azure or workspace_context.is_aws: workspace_context.external_locations_migration.run() + workspace_context.principal_acl.apply_location_acl() else: raise ValueError("Unsupported cloud provider") diff --git a/src/databricks/labs/ucx/contexts/workspace_cli.py b/src/databricks/labs/ucx/contexts/workspace_cli.py index 0ef0edc3aa..6f1429c0d1 100644 --- a/src/databricks/labs/ucx/contexts/workspace_cli.py +++ b/src/databricks/labs/ucx/contexts/workspace_cli.py @@ -112,7 +112,6 @@ def external_locations_migration(self): self.workspace_client, self.external_locations, self.aws_resource_permissions, - self.principal_acl, ) if self.is_azure: return ExternalLocationsMigration( @@ -120,7 +119,6 @@ def external_locations_migration(self): self.external_locations, self.azure_resource_permissions, self.azure_resources, - self.principal_acl, ) raise NotImplementedError diff --git a/tests/integration/aws/test_access.py b/tests/integration/aws/test_access.py index b0d3dcc330..67cee91c39 100644 --- a/tests/integration/aws/test_access.py +++ b/tests/integration/aws/test_access.py @@ -53,9 +53,9 @@ def test_create_external_location(ws, env_or_skip, make_random, inventory_schema ws, external_location, aws_permissions, - aws_cli_ctx.principal_acl, ) external_location_migration.run() + aws_cli_ctx.principal_acl.apply_location_acl() external_location = [ external_location for external_location in list(ws.external_locations.list()) @@ -150,10 +150,10 @@ def test_create_external_location_validate_acl( ws, 
external_location, aws_permissions, - aws_cli_ctx.principal_acl, ) try: external_location_migration.run() + aws_cli_ctx.principal_acl.apply_location_acl() permissions = ws.grants.get( SecurableType.EXTERNAL_LOCATION, external_location_name, principal=cluster_user.user_name ) diff --git a/tests/integration/azure/test_locations.py b/tests/integration/azure/test_locations.py index 60d6de47c1..9dc458d0d9 100644 --- a/tests/integration/azure/test_locations.py +++ b/tests/integration/azure/test_locations.py @@ -72,10 +72,10 @@ def test_run(caplog, ws, sql_backend, inventory_schema, az_cli_ctx): location_crawler, azure_resource_permissions, azurerm, - az_cli_ctx.principal_acl, ) try: location_migration.run() + az_cli_ctx.principal_acl.apply_location_acl() assert "All UC external location are created." in caplog.text assert ws.external_locations.get("uctest_ziyuanqintest_one").credential_name == "oneenv-adls" assert ws.external_locations.get("uctest_ziyuanqintest_two").credential_name == "oneenv-adls" @@ -119,10 +119,10 @@ def test_read_only_location(caplog, ws, sql_backend, inventory_schema, az_cli_ct location_crawler, azure_resource_permissions, azurerm, - az_cli_ctx.principal_acl, ) try: location_migration.run() + az_cli_ctx.principal_acl.apply_location_acl() assert ws.external_locations.get("ucx1_ziyuanqintest").credential_name == "ziyuanqin-uc-test-ac" assert ws.external_locations.get("ucx1_ziyuanqintest").read_only finally: @@ -164,9 +164,9 @@ def test_missing_credential(caplog, ws, sql_backend, inventory_schema, az_cli_ct location_crawler, azure_resource_permissions, azurerm, - az_cli_ctx.principal_acl, ) leftover_loc = location_migration.run() + az_cli_ctx.principal_acl.apply_location_acl() assert "External locations below are not created in UC" in caplog.text assert len(leftover_loc) == 2 @@ -212,10 +212,10 @@ def test_overlapping_location(caplog, ws, sql_backend, inventory_schema, az_cli_ location_crawler, azure_resource_permissions, azurerm, - az_cli_ctx.principal_acl, ) try: leftover_loc_urls = location_migration.run() + az_cli_ctx.principal_acl.apply_location_acl() assert "abfss://uctest@ziyuanqintest.dfs.core.windows.net/" in leftover_loc_urls assert "overlaps with an existing external location" in caplog.text finally: diff --git a/tests/unit/aws/test_access.py b/tests/unit/aws/test_access.py index 93c9c57b84..7a0e32f384 100644 --- a/tests/unit/aws/test_access.py +++ b/tests/unit/aws/test_access.py @@ -133,7 +133,6 @@ def test_create_external_locations(mock_ws, installation_multiple_roles, backend mock_ws, locations, aws_resource_permissions, - principal_acl, ) external_locations_migration.run() calls = [ @@ -141,6 +140,7 @@ def test_create_external_locations(mock_ws, installation_multiple_roles, backend call('bucket2_folder2', 's3://BUCKET2/FOLDER2', 'cred1', skip_validation=True), call('bucketx_folderx', 's3://BUCKETX/FOLDERX', 'credx', skip_validation=True), ] + principal_acl.apply_location_acl() mock_ws.external_locations.create.assert_has_calls(calls, any_order=True) aws.get_role_policy.assert_not_called() principal_acl.apply_location_acl.assert_called() @@ -187,9 +187,9 @@ def test_create_external_locations_skip_existing(mock_ws, backend, locations): mock_ws, locations, aws_resource_permissions, - principal_acl, ) external_locations_migration.run() + principal_acl.apply_location_acl() calls = [ call("bucket1_folder1", 's3://BUCKET1/FOLDER1', 'cred1', skip_validation=True), ] diff --git a/tests/unit/azure/test_locations.py b/tests/unit/azure/test_locations.py index 
2b6e8a177a..28605f1f55 100644 --- a/tests/unit/azure/test_locations.py +++ b/tests/unit/azure/test_locations.py @@ -16,8 +16,7 @@ from databricks.labs.ucx.azure.access import AzureResourcePermissions from databricks.labs.ucx.azure.locations import ExternalLocationsMigration from databricks.labs.ucx.azure.resources import AzureResource, AzureResources, StorageAccount -from databricks.labs.ucx.hive_metastore import ExternalLocations, TablesCrawler, Mounts -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL +from databricks.labs.ucx.hive_metastore import ExternalLocations from tests.unit.azure import azure_api_client @@ -28,12 +27,7 @@ def location_migration_for_test(ws, mock_backend, mock_installation, azurerm=Non azurerm = azurerm or AzureResources(azure_api_client(), azure_api_client()) location_crawler = ExternalLocations(ws, mock_backend, "location_test") azure_resource_permissions = AzureResourcePermissions(mock_installation, ws, azurerm, location_crawler) - tables_crawler = TablesCrawler(mock_backend, 'ucx') - mounts_crawler = Mounts(mock_backend, ws, 'ucx') - principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, {}) - external_locations_migration = ExternalLocationsMigration( - ws, location_crawler, azure_resource_permissions, azurerm, principal_acl - ) + external_locations_migration = ExternalLocationsMigration(ws, location_crawler, azure_resource_permissions, azurerm) return external_locations_migration From 59f63ddf7ef8ae32e3d32964197db94425df3258 Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 11:38:13 +0100 Subject: [PATCH 10/19] changing the design as per review comments --- .../labs/ucx/contexts/application.py | 20 ++++++++----------- .../labs/ucx/hive_metastore/grants.py | 6 +++--- tests/unit/azure/test_locations.py | 2 +- .../hive_metastore/test_principal_grants.py | 14 ++++++++----- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index db05e36132..08b6dffd89 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -18,7 +18,6 @@ from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core -from databricks.sdk.errors import ResourceDoesNotExist from databricks.sdk.service import sql from databricks.labs.ucx.account.workspaces import WorkspaceInfo @@ -305,18 +304,15 @@ def aws_acl(self): ) @cached_property - def principal_locations(self): - eligible_locations = {} - try: + def principal_locations_retriever(self): + def inner(): if self.is_azure: - eligible_locations = self.azure_acl.get_eligible_locations_principals() + return self.azure_acl.get_eligible_locations_principals() if self.is_aws: - eligible_locations = self.aws_acl.get_eligible_locations_principals() - if self.is_gcp: - raise NotImplementedError("Not implemented for GCP.") - except ResourceDoesNotExist: - pass - return eligible_locations + return self.aws_acl.get_eligible_locations_principals() + raise NotImplementedError("Not implemented for GCP.") + + return inner @cached_property def principal_acl(self): @@ -326,7 +322,7 @@ def principal_acl(self): self.installation, self.tables_crawler, self.mounts_crawler, - self.principal_locations, + self.principal_locations_retriever, ) 
@cached_property diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index a2afa473f3..8673779697 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -579,7 +579,7 @@ def __init__( installation: Installation, tables_crawler: TablesCrawler, mounts_crawler: Mounts, - cluster_locations: list[ComputeLocations], + cluster_locations: Callable[[], list[ComputeLocations]], ): self._backend = backend self._ws = ws @@ -593,7 +593,7 @@ def get_interactive_cluster_grants(self) -> list[Grant]: mounts = list(self._mounts_crawler.snapshot()) grants: set[Grant] = set() - for compute_location in self._compute_locations: + for compute_location in self._compute_locations(): principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type) if len(principals) == 0: continue @@ -697,7 +697,7 @@ def apply_location_acl(self): "CREATE EXTERNAL VOLUME and READ_FILES for existing eligible interactive cluster users" ) # get the eligible location mapped for each interactive cluster - for compute_location in self._compute_locations: + for compute_location in self._compute_locations(): # get interactive cluster users principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type) if len(principals) == 0: diff --git a/tests/unit/azure/test_locations.py b/tests/unit/azure/test_locations.py index 2b6e8a177a..f1b901638b 100644 --- a/tests/unit/azure/test_locations.py +++ b/tests/unit/azure/test_locations.py @@ -30,7 +30,7 @@ def location_migration_for_test(ws, mock_backend, mock_installation, azurerm=Non azure_resource_permissions = AzureResourcePermissions(mock_installation, ws, azurerm, location_crawler) tables_crawler = TablesCrawler(mock_backend, 'ucx') mounts_crawler = Mounts(mock_backend, ws, 'ucx') - principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, {}) + principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, lambda: []) external_locations_migration = ExternalLocationsMigration( ws, location_crawler, azure_resource_permissions, azurerm, principal_acl ) diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py index e62a998905..bef584fdab 100644 --- a/tests/unit/hive_metastore/test_principal_grants.py +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -157,11 +157,15 @@ def principal_acl(w, install, cluster_spn: list, warehouse_spn: list): spn_crawler = create_autospec(AzureServicePrincipalCrawler) spn_crawler.get_cluster_to_storage_mapping.return_value = cluster_spn - locations = [] - if w.config.is_azure: - locations = azure_acl(w, install, cluster_spn, warehouse_spn).get_eligible_locations_principals() - if w.config.is_aws: - locations = aws_acl(w, install).get_eligible_locations_principals() + + def inner(): + if w.config.is_azure: + return azure_acl(w, install, cluster_spn, warehouse_spn).get_eligible_locations_principals() + if w.config.is_aws: + return aws_acl(w, install).get_eligible_locations_principals() + return None + + locations = inner return PrincipalACL(w, sql_backend, install, table_crawler, mount_crawler, locations) From 4a196463e0c078999bd52c5e02f46ebdd390950e Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 11:46:25 +0100 Subject: [PATCH 11/19] merging --- src/databricks/labs/ucx/cli.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 04f0b22d0b..0c9fb46f4f 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -100,7 +100,7 @@ def skip(w: WorkspaceClient, schema: str | None = None, table: str | None = None @ucx.command def unskip(w: WorkspaceClient, schema: str | None = None, table: str | None = None): - """Unset the skip mark from a schema or a table""" + """Create a unskip comment on a schema or a table""" logger.info("Running unskip command") if not schema: logger.error("--schema is a required parameter.") From 44cffe520b31dcfc5ecfcdd7f4e0cc6a301f205d Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 11:47:29 +0100 Subject: [PATCH 12/19] merging --- README.md | 312 ++++++++++++++++++++++++++---------------------------- 1 file changed, 151 insertions(+), 161 deletions(-) diff --git a/README.md b/README.md index d78d5729fb..db1e7e3b03 100644 --- a/README.md +++ b/README.md @@ -22,112 +22,111 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [Databricks Labs UCX](#databricks-labs-ucx) * [Installation](#installation) - * [Authenticate Databricks CLI](#authenticate-databricks-cli) - * [Install UCX](#install-ucx) - * [[ADVANCED] Force install over existing UCX](#advanced-force-install-over-existing-ucx) - * [[ADVANCED] Installing UCX on all workspaces within a Databricks account](#advanced-installing-ucx-on-all-workspaces-within-a-databricks-account) - * [[ADVANCED] Installing UCX with company hosted PYPI mirror](#advanced-installing-ucx-with-company-hosted-pypi-mirror) - * [Upgrading UCX for newer versions](#upgrading-ucx-for-newer-versions) - * [Uninstall UCX](#uninstall-ucx) + * [Authenticate Databricks CLI](#authenticate-databricks-cli) + * [Install UCX](#install-ucx) + * [[ADVANCED] Force install over existing UCX](#advanced-force-install-over-existing-ucx) + * [[ADVANCED] Installing UCX on all workspaces within a Databricks account](#advanced-installing-ucx-on-all-workspaces-within-a-databricks-account) + * [[ADVANCED] Installing UCX with company hosted PYPI mirror](#advanced-installing-ucx-with-company-hosted-pypi-mirror) + * [Upgrading UCX for newer versions](#upgrading-ucx-for-newer-versions) + * [Uninstall UCX](#uninstall-ucx) * [Migration process](#migration-process) * [Workflows](#workflows) - * [Readme notebook](#readme-notebook) - * [Assessment workflow](#assessment-workflow) - * [Group migration workflow](#group-migration-workflow) - * [Debug notebook](#debug-notebook) - * [Debug logs](#debug-logs) - * [Table Migration](#table-migration) - * [Prerequisites](#prerequisites) - * [Upgrade Process](#upgrade-process) - * [Step 1: Mapping Metastore Tables (UCX Only)](#step-1-mapping-metastore-tables-ucx-only) - * [Step 1.1: Create the mapping file](#step-11-create-the-mapping-file) - * [Step 1.2: Update the mapping file](#step-12-update-the-mapping-file) - * [Step 2: Create the necessary cloud principals for the upgrade](#step-2-create-the-necessary-cloud-principals-for-the-upgrade) - * [Step 2.1: Map the cloud principals to the cloud "prefixes"](#step-21-map-the-cloud-principals-to-the-cloud-prefixes) - * [Step 2.2: Create/Modify Cloud Principals and Credentials](#step-22-createmodify-cloud-principals-and-credentials) - * [Step 2.3: Create External Locations](#step-23-create-external-locations) - * [Step 2.4: Create "Uber Principal"](#step-24-create-uber-principal) - * [Step 2.5: Create Catalogs and 
Schemas](#step-25-create-catalogs-and-schemas) - * [Step 3: Upgrade the Metastore](#step-3-upgrade-the-metastore) - * [Table migration workflows](#table-migration-workflows) - * [Step 4: Odds and Ends](#step-4-odds-and-ends) - * [Step 4.1: Skipping Table/Schema](#step-41-skipping-tableschema) - * [Step 4.2: Moving objects](#step-42-moving-objects) - * [Step 4.2: Aliasing objects](#step-42-aliasing-objects) - * [Step 4.3: Reverting objects](#step-43-reverting-objects) - * [Post Migration Data Reconciliation Task](#post-migration-data-reconciliation-task) - * [Other considerations](#other-considerations) - * [[EXPERIMENTAL] Scan tables in mounts Workflow](#experimental-scan-tables-in-mounts-workflow) - * [Always run this workflow AFTER the assessment has finished](#balways-run-this-workflow-after-the-assessment-has-finishedb) - * [[EXPERIMENTAL] Migrate tables in mounts Workflow](#experimental-migrate-tables-in-mounts-workflow) - * [Jobs Static Code Analysis Workflow](#jobs-static-code-analysis-workflow) - * [Linter message codes](#linter-message-codes) - * [`cannot-autofix-table-reference`](#cannot-autofix-table-reference) - * [`catalog-api-in-shared-clusters`](#catalog-api-in-shared-clusters) - * [`changed-result-format-in-uc`](#changed-result-format-in-uc) - * [`direct-filesystem-access-in-sql-query`](#direct-filesystem-access-in-sql-query) - * [`direct-filesystem-access`](#direct-filesystem-access) - * [`dependency-not-found`](#dependency-not-found) - * [`jvm-access-in-shared-clusters`](#jvm-access-in-shared-clusters) - * [`legacy-context-in-shared-clusters`](#legacy-context-in-shared-clusters) - * [`not-supported`](#not-supported) - * [`notebook-run-cannot-compute-value`](#notebook-run-cannot-compute-value) - * [`python-udf-in-shared-clusters`](#python-udf-in-shared-clusters) - * [`rdd-in-shared-clusters`](#rdd-in-shared-clusters) - * [`spark-logging-in-shared-clusters`](#spark-logging-in-shared-clusters) - * [`sql-parse-error`](#sql-parse-error) - * [`sys-path-cannot-compute-value`](#sys-path-cannot-compute-value) - * [`table-migrated-to-uc`](#table-migrated-to-uc) - * [`to-json-in-shared-clusters`](#to-json-in-shared-clusters) - * [`unsupported-magic-line`](#unsupported-magic-line) + * [Readme notebook](#readme-notebook) + * [Assessment workflow](#assessment-workflow) + * [Group migration workflow](#group-migration-workflow) + * [Debug notebook](#debug-notebook) + * [Debug logs](#debug-logs) + * [Table Migration](#table-migration) + * [Prerequisites](#prerequisites) + * [Upgrade Process](#upgrade-process) + * [Step 1: Mapping Metastore Tables (UCX Only)](#step-1-mapping-metastore-tables-ucx-only) + * [Step 1.1: Create the mapping file](#step-11-create-the-mapping-file) + * [Step 1.2: Update the mapping file](#step-12-update-the-mapping-file) + * [Step 2: Create the necessary cloud principals for the upgrade](#step-2-create-the-necessary-cloud-principals-for-the-upgrade) + * [Step 2.1: Map the cloud principals to the cloud "prefixes"](#step-21-map-the-cloud-principals-to-the-cloud-prefixes) + * [Step 2.2: Create/Modify Cloud Principals and Credentials](#step-22-createmodify-cloud-principals-and-credentials) + * [Step 2.3: Create External Locations](#step-23-create-external-locations) + * [Step 2.4: Create "Uber Principal"](#step-24-create-uber-principal) + * [Step 2.5: Create Catalogs and Schemas](#step-25-create-catalogs-and-schemas) + * [Step 3: Upgrade the Metastore](#step-3-upgrade-the-metastore) + * [Table migration workflows](#table-migration-workflows) + * [Step 4: Odds and 
Ends](#step-4-odds-and-ends) + * [Step 4.1: Skipping Table/Schema](#step-41-skipping-tableschema) + * [Step 4.2: Moving objects](#step-42-moving-objects) + * [Step 4.2: Aliasing objects](#step-42-aliasing-objects) + * [Step 4.3: Reverting objects](#step-43-reverting-objects) + * [Post Migration Data Reconciliation Task](#post-migration-data-reconciliation-task) + * [Other considerations](#other-considerations) + * [[EXPERIMENTAL] Scan tables in mounts Workflow](#experimental-scan-tables-in-mounts-workflow) + * [Always run this workflow AFTER the assessment has finished](#balways-run-this-workflow-after-the-assessment-has-finishedb) + * [[EXPERIMENTAL] Migrate tables in mounts Workflow](#experimental-migrate-tables-in-mounts-workflow) + * [Jobs Static Code Analysis Workflow](#jobs-static-code-analysis-workflow) + * [Linter message codes](#linter-message-codes) + * [`cannot-autofix-table-reference`](#cannot-autofix-table-reference) + * [`catalog-api-in-shared-clusters`](#catalog-api-in-shared-clusters) + * [`changed-result-format-in-uc`](#changed-result-format-in-uc) + * [`direct-filesystem-access-in-sql-query`](#direct-filesystem-access-in-sql-query) + * [`direct-filesystem-access`](#direct-filesystem-access) + * [`dependency-not-found`](#dependency-not-found) + * [`jvm-access-in-shared-clusters`](#jvm-access-in-shared-clusters) + * [`legacy-context-in-shared-clusters`](#legacy-context-in-shared-clusters) + * [`not-supported`](#not-supported) + * [`notebook-run-cannot-compute-value`](#notebook-run-cannot-compute-value) + * [`python-udf-in-shared-clusters`](#python-udf-in-shared-clusters) + * [`rdd-in-shared-clusters`](#rdd-in-shared-clusters) + * [`spark-logging-in-shared-clusters`](#spark-logging-in-shared-clusters) + * [`sql-parse-error`](#sql-parse-error) + * [`sys-path-cannot-compute-value`](#sys-path-cannot-compute-value) + * [`table-migrated-to-uc`](#table-migrated-to-uc) + * [`to-json-in-shared-clusters`](#to-json-in-shared-clusters) + * [`unsupported-magic-line`](#unsupported-magic-line) * [Utility commands](#utility-commands) - * [`logs` command](#logs-command) - * [`ensure-assessment-run` command](#ensure-assessment-run-command) - * [`update-migration-progress` command](#update-migration-progress-command) - * [`repair-run` command](#repair-run-command) - * [`workflows` command](#workflows-command) - * [`open-remote-config` command](#open-remote-config-command) - * [`installations` command](#installations-command) - * [`report-account-compatibility` command](#report-account-compatibility-command) + * [`logs` command](#logs-command) + * [`ensure-assessment-run` command](#ensure-assessment-run-command) + * [`update-migration-progress` command](#update-migration-progress-command) + * [`repair-run` command](#repair-run-command) + * [`workflows` command](#workflows-command) + * [`open-remote-config` command](#open-remote-config-command) + * [`installations` command](#installations-command) + * [`report-account-compatibility` command](#report-account-compatibility-command) * [Metastore related commands](#metastore-related-commands) - * [`show-all-metastores` command](#show-all-metastores-command) - * [`assign-metastore` command](#assign-metastore-command) - * [`create-ucx-catalog` command](#create-ucx-catalog-command) + * [`show-all-metastores` command](#show-all-metastores-command) + * [`assign-metastore` command](#assign-metastore-command) + * [`create-ucx-catalog` command](#create-ucx-catalog-command) * [Table migration commands](#table-migration-commands) - * 
[`principal-prefix-access` command](#principal-prefix-access-command) - * [Access for AWS S3 Buckets](#access-for-aws-s3-buckets) - * [Access for Azure Storage Accounts](#access-for-azure-storage-accounts) - * [`create-missing-principals` command (AWS Only)](#create-missing-principals-command-aws-only) - * [`delete-missing-principals` command (AWS Only)](#delete-missing-principals-command-aws-only) - * [`create-uber-principal` command](#create-uber-principal-command) - * [`migrate-credentials` command](#migrate-credentials-command) - * [`validate-external-locations` command](#validate-external-locations-command) - * [`migrate-locations` command](#migrate-locations-command) - * [`create-table-mapping` command](#create-table-mapping-command) - * [`skip` command](#skip-command) - * [`unskip` command](#unskip-command) - * [`create-catalogs-schemas` command](#create-catalogs-schemas-command) - * [`migrate-tables` command](#migrate-tables-command) - * [`revert-migrated-tables` command](#revert-migrated-tables-command) - * [`move` command](#move-command) - * [`alias` command](#alias-command) + * [`principal-prefix-access` command](#principal-prefix-access-command) + * [Access for AWS S3 Buckets](#access-for-aws-s3-buckets) + * [Access for Azure Storage Accounts](#access-for-azure-storage-accounts) + * [`create-missing-principals` command (AWS Only)](#create-missing-principals-command-aws-only) + * [`delete-missing-principals` command (AWS Only)](#delete-missing-principals-command-aws-only) + * [`create-uber-principal` command](#create-uber-principal-command) + * [`migrate-credentials` command](#migrate-credentials-command) + * [`validate-external-locations` command](#validate-external-locations-command) + * [`migrate-locations` command](#migrate-locations-command) + * [`create-table-mapping` command](#create-table-mapping-command) + * [`skip` command](#skip-command) + * [`create-catalogs-schemas` command](#create-catalogs-schemas-command) + * [`migrate-tables` command](#migrate-tables-command) + * [`revert-migrated-tables` command](#revert-migrated-tables-command) + * [`move` command](#move-command) + * [`alias` command](#alias-command) * [Code migration commands](#code-migration-commands) - * [`lint-local-code` command](#lint-local-code-command) - * [`migrate-local-code` command](#migrate-local-code-command) - * [`migrate-dbsql-dashboards` command](#migrate-dbsql-dashboards-command) - * [`revert-dbsql-dashboards` command](#revert-dbsql-dashboards-command) + * [`lint-local-code` command](#lint-local-code-command) + * [`migrate-local-code` command](#migrate-local-code-command) + * [`migrate-dbsql-dashboards` command](#migrate-dbsql-dashboards-command) + * [`revert-dbsql-dashboards` command](#revert-dbsql-dashboards-command) * [Cross-workspace installations](#cross-workspace-installations) - * [`sync-workspace-info` command](#sync-workspace-info-command) - * [`manual-workspace-info` command](#manual-workspace-info-command) - * [`create-account-groups` command](#create-account-groups-command) - * [`validate-groups-membership` command](#validate-groups-membership-command) - * [`validate-table-locations` command](#validate-table-locations-command) - * [`cluster-remap` command](#cluster-remap-command) - * [`revert-cluster-remap` command](#revert-cluster-remap-command) - * [`upload` command](#upload-command) - * [`download` command](#download-command) - * [`join-collection` command](#join-collection command) - * [collection eligible command](#collection-eligible-command) + * [`sync-workspace-info` 
command](#sync-workspace-info-command) + * [`manual-workspace-info` command](#manual-workspace-info-command) + * [`create-account-groups` command](#create-account-groups-command) + * [`validate-groups-membership` command](#validate-groups-membership-command) + * [`validate-table-locations` command](#validate-table-locations-command) + * [`cluster-remap` command](#cluster-remap-command) + * [`revert-cluster-remap` command](#revert-cluster-remap-command) + * [`upload` command](#upload-command) + * [`download` command](#download-command) + * [`join-collection` command](#join-collection command) + * [collection eligible command](#collection-eligible-command) * [Common Challenges and the Solutions](#common-challenges-and-the-solutions) * [Network Connectivity Issues](#network-connectivity-issues) * [Insufficient Privileges](#insufficient-privileges) @@ -462,8 +461,8 @@ To effectively upgrade the metastores, four principal operations are required: 1. Assess - In this step, you evaluate the existing HMS tables identified for upgrade to determine the right approach for upgrade. This step is a prerequisite and is performed by the assessment workflow. 2. Create - In this step, you create the required UC assets such as, Metastore, Catalog, Schema, Storage Credentials, External Locations. This step is part of the upgrade process. 3. Upgrade/Grant these are two steps that UCX combine. - 4. Upgrade - The metastores objects (tables/views) will be converted to a format supported by UC - 4. Grant - The table upgrade the newly created object the same permission as the original object. + 4. Upgrade - The metastores objects (tables/views) will be converted to a format supported by UC + 4. Grant - The table upgrade the newly created object the same permission as the original object. ### Prerequisites For UCX to be able to upgrade the metastore. The following prerequisites must be met: @@ -663,12 +662,12 @@ This command will remove the upgraded table and reset the `upgraded_from` proper ### Post Migration Data Reconciliation Task UCX also provides `migrate-data-reconciliation` workflow to validate the integrity of the migrated tables: - Compare the schema of the source and target tables. The result is `schema_matches`, and column by column comparison -is stored as `column_comparison` struct. + is stored as `column_comparison` struct. - Compare the row counts of the source and target tables. If the row count is within the reconciliation threshold -(defaults to 5%), `data_matches` is True. + (defaults to 5%), `data_matches` is True. - Compare the content of individual row between source and target tables to identify any discrepancies (when `compare_rows` -flag is enabled). This is done using hash comparison, and number of missing rows are stored as `source_missing_count` -and `target_missing_count` + flag is enabled). This is done using hash comparison, and number of missing rows are stored as `source_missing_count` + and `target_missing_count` Once the workflow completes, the output will be stored in `$inventory_database.reconciliation_results` view, and displayed in the Migration dashboard. @@ -678,8 +677,8 @@ in the Migration dashboard. ### Other considerations - You may need to run the workflow multiple times to ensure all the tables are migrated successfully in phases. - If your Delta tables in DBFS root have a large number of files, consider: - - Setting higher `Min` and `Max workers for auto-scale` when being asked during the UCX installation. 
More cores in the cluster means more concurrency for calling cloud storage API to copy files when deep cloning the Delta tables. - - Setting higher `Parallelism for migrating DBFS root Delta tables with deep clone` (default 200) when being asked during the UCX installation. This controls the number of Spark tasks/partitions to be created for deep clone. + - Setting higher `Min` and `Max workers for auto-scale` when being asked during the UCX installation. More cores in the cluster means more concurrency for calling cloud storage API to copy files when deep cloning the Delta tables. + - Setting higher `Parallelism for migrating DBFS root Delta tables with deep clone` (default 200) when being asked during the UCX installation. This controls the number of Spark tasks/partitions to be created for deep clone. - Consider creating an instance pool, and setting its id when prompted during the UCX installation. This instance pool will be specified in the cluster policy used by all UCX workflows job clusters. - You may also manually edit the job cluster configration per job or per task after the workflows are deployed. @@ -690,20 +689,20 @@ in the Migration dashboard. - It writes all results to `hive_metastore..tables`, you can query those tables found by filtering on database values that starts with `mounted_` - This command is incremental, meaning that each time you run it, it will overwrite the previous tables in mounts found. - Current format are supported: - - DELTA - PARQUET - CSV - JSON - - Also detects partitioned DELTA and PARQUET + - DELTA - PARQUET - CSV - JSON + - Also detects partitioned DELTA and PARQUET - You can configure these workflows with the following options available on conf.yml: - - include_mounts : A list of mount points to scans, by default the workflow scans for all mount points - - exclude_paths_in_mount : A list of paths to exclude in all mount points - - include_paths_in_mount : A list of paths to include in all mount points + - include_mounts : A list of mount points to scans, by default the workflow scans for all mount points + - exclude_paths_in_mount : A list of paths to exclude in all mount points + - include_paths_in_mount : A list of paths to include in all mount points ### [EXPERIMENTAL] Migrate tables in mounts Workflow - An experimental workflow that migrates tables in mount points using a `CREATE TABLE` command, optinally sets a default tables owner if provided in `default_table_owner` conf parameter. - You must do the following in order to make this work: - - run the Assessment [workflow](#assessment-workflow) - - run the scan tables in mounts [workflow](#EXPERIMENTAL-scan-tables-in-mounts-workflow) - - run the [`create-table-mapping` command](#create-table-mapping-command) - - or manually create a `mapping.csv` file in Workspace -> Applications -> ucx + - run the Assessment [workflow](#assessment-workflow) + - run the scan tables in mounts [workflow](#EXPERIMENTAL-scan-tables-in-mounts-workflow) + - run the [`create-table-mapping` command](#create-table-mapping-command) + - or manually create a `mapping.csv` file in Workspace -> Applications -> ucx [[back to top](#databricks-labs-ucx)] @@ -1075,26 +1074,26 @@ and opens it using the `webbrowser.open()` method. This command is useful for de edit the remote configuration file without having to manually navigate to it in the workspace. It can also be used to quickly access the configuration file from the command line. 
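If you prefer to inspect the remote configuration from a script rather than the browser, the same file can be fetched with the Databricks SDK. A minimal sketch follows; the workspace path is an assumption (shown here as `/Applications/ucx/config.yml`) and should be adjusted to wherever your UCX installation actually lives:

```python
import yaml  # PyYAML
from databricks.sdk import WorkspaceClient

# Assumed location: point this at the config.yml inside your UCX installation folder.
CONFIG_PATH = "/Applications/ucx/config.yml"

w = WorkspaceClient()  # uses the same authentication as your Databricks CLI profile
with w.workspace.download(CONFIG_PATH) as f:
    config = yaml.safe_load(f)

# A few of the properties described below.
print(config.get("inventory_database"))
print(config.get("default_catalog"))
print(config.get("num_threads"))
```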
Here's the description of configuration properties: - * `inventory_database`: A string representing the name of the inventory database. - * `workspace_group_regex`: An optional string representing the regular expression to match workspace group names. - * `workspace_group_replace`: An optional string to replace the matched group names with. - * `account_group_regex`: An optional string representing the regular expression to match account group names. - * `group_match_by_external_id`: A boolean value indicating whether to match groups by their external IDs. - * `include_group_names`: An optional list of strings representing the names of groups to include for migration. - * `renamed_group_prefix`: An optional string representing the prefix to add to renamed group names. - * `instance_pool_id`: An optional string representing the ID of the instance pool. - * `warehouse_id`: An optional string representing the ID of the warehouse. - * `connect`: An optional `Config` object representing the configuration for connecting to the warehouse. - * `num_threads`: An optional integer representing the number of threads to use for migration. - * `database_to_catalog_mapping`: An optional dictionary mapping source database names to target catalog names. - * `default_catalog`: An optional string representing the default catalog name. - * `log_level`: An optional string representing the log level. - * `workspace_start_path`: A string representing the starting path for notebooks and directories crawler in the workspace. - * `instance_profile`: An optional string representing the name of the instance profile. - * `spark_conf`: An optional dictionary of Spark configuration properties. - * `override_clusters`: An optional dictionary mapping job cluster names to existing cluster IDs. - * `policy_id`: An optional string representing the ID of the cluster policy. - * `include_databases`: An optional list of strings representing the names of databases to include for migration. +* `inventory_database`: A string representing the name of the inventory database. +* `workspace_group_regex`: An optional string representing the regular expression to match workspace group names. +* `workspace_group_replace`: An optional string to replace the matched group names with. +* `account_group_regex`: An optional string representing the regular expression to match account group names. +* `group_match_by_external_id`: A boolean value indicating whether to match groups by their external IDs. +* `include_group_names`: An optional list of strings representing the names of groups to include for migration. +* `renamed_group_prefix`: An optional string representing the prefix to add to renamed group names. +* `instance_pool_id`: An optional string representing the ID of the instance pool. +* `warehouse_id`: An optional string representing the ID of the warehouse. +* `connect`: An optional `Config` object representing the configuration for connecting to the warehouse. +* `num_threads`: An optional integer representing the number of threads to use for migration. +* `database_to_catalog_mapping`: An optional dictionary mapping source database names to target catalog names. +* `default_catalog`: An optional string representing the default catalog name. +* `log_level`: An optional string representing the log level. +* `workspace_start_path`: A string representing the starting path for notebooks and directories crawler in the workspace. +* `instance_profile`: An optional string representing the name of the instance profile. 
+* `spark_conf`: An optional dictionary of Spark configuration properties. +* `override_clusters`: An optional dictionary mapping job cluster names to existing cluster IDs. +* `policy_id`: An optional string representing the ID of the cluster policy. +* `include_databases`: An optional list of strings representing the names of databases to include for migration. [[back to top](#databricks-labs-ucx)] @@ -1327,7 +1326,7 @@ databricks labs ucx migrate-credentials For Azure, this command prompts to confirm performing the following credential migration steps: 1. [RECOMMENDED] For each storage account, create access connectors with managed identities that have the `Storage Blob Data Contributor` role on the respective storage account. A storage credential is created for each - access connector. + access connector. 2. Migrate Azure Service Principals, which have `Storage Blob Data Contributor`, `Storage Blob Data Reader`, `Storage Blob Data Owner`, or custom roles on ADLS Gen2 locations that are being used in Databricks, to UC storage credentials. The Azure Service Principals to location mapping are listed @@ -1336,8 +1335,8 @@ For Azure, this command prompts to confirm performing the following credential m Principals you do not want to be migrated. The command will only migrate the Service Principals that have client secret stored in Databricks Secret. - **Warning**: Service principals used to access storage accounts behind firewalls might cause connectivity issues. We - recommend to use access connectors instead. +**Warning**: Service principals used to access storage accounts behind firewalls might cause connectivity issues. We +recommend to use access connectors instead. For AWS, this command migrates AWS Instance Profiles that are being used in Databricks, to UC storage credentials. The AWS Instance Profiles to location mapping are listed in @@ -1438,15 +1437,6 @@ Once you're done with table migration, proceed to the [code migration](#code-mig [[back to top](#databricks-labs-ucx)] -## `unskip` command - -```commandline -databricks labs ucx unskip --schema X [--table Y] -``` -This command removes the mark set by the [`skip` command](#skip-command) on the given schema or table. - -[[back to top](#databricks-labs-ucx)] - ## `create-catalogs-schemas` command ```text @@ -1517,7 +1507,7 @@ Users will be prompted whether tables/view are dropped after moving to new schem This command moves different table types differently: - `MANAGED` tables are deep-cloned to the new schema. - `EXTERNAL` tables are dropped from the original schema, then created in the target schema using the same location. -This is due to Unity Catalog not supporting multiple tables with overlapping paths + This is due to Unity Catalog not supporting multiple tables with overlapping paths - `VIEW` are recreated using the same view definition. This command supports moving multiple tables at once, by specifying `*` as the table name. @@ -1739,14 +1729,14 @@ Options to resolve tables with overlapping locations are: Considerations when resolving tables with overlapping locations are: - Migrate the tables one workspace at a time: - - Let later migrated workspaces read tables from the earlier migrated workspace catalogs. - - [Move](#move-command) tables between schemas and catalogs when it fits the data management model. + - Let later migrated workspaces read tables from the earlier migrated workspace catalogs. + - [Move](#move-command) tables between schemas and catalogs when it fits the data management model. 
- The tables might have different: - - Metadata, like: - - Column schema (names, types, order) - - Description - - Tags - - ACLs + - Metadata, like: + - Column schema (names, types, order) + - Description + - Tags + - ACLs [[back to top](#databricks-labs-ucx)] @@ -1822,10 +1812,10 @@ $ databricks labs ucx join-collection --workspace-ids Date: Wed, 25 Sep 2024 11:47:55 +0100 Subject: [PATCH 13/19] merging --- README.md | 312 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 161 insertions(+), 151 deletions(-) diff --git a/README.md b/README.md index db1e7e3b03..d78d5729fb 100644 --- a/README.md +++ b/README.md @@ -22,111 +22,112 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [Databricks Labs UCX](#databricks-labs-ucx) * [Installation](#installation) - * [Authenticate Databricks CLI](#authenticate-databricks-cli) - * [Install UCX](#install-ucx) - * [[ADVANCED] Force install over existing UCX](#advanced-force-install-over-existing-ucx) - * [[ADVANCED] Installing UCX on all workspaces within a Databricks account](#advanced-installing-ucx-on-all-workspaces-within-a-databricks-account) - * [[ADVANCED] Installing UCX with company hosted PYPI mirror](#advanced-installing-ucx-with-company-hosted-pypi-mirror) - * [Upgrading UCX for newer versions](#upgrading-ucx-for-newer-versions) - * [Uninstall UCX](#uninstall-ucx) + * [Authenticate Databricks CLI](#authenticate-databricks-cli) + * [Install UCX](#install-ucx) + * [[ADVANCED] Force install over existing UCX](#advanced-force-install-over-existing-ucx) + * [[ADVANCED] Installing UCX on all workspaces within a Databricks account](#advanced-installing-ucx-on-all-workspaces-within-a-databricks-account) + * [[ADVANCED] Installing UCX with company hosted PYPI mirror](#advanced-installing-ucx-with-company-hosted-pypi-mirror) + * [Upgrading UCX for newer versions](#upgrading-ucx-for-newer-versions) + * [Uninstall UCX](#uninstall-ucx) * [Migration process](#migration-process) * [Workflows](#workflows) - * [Readme notebook](#readme-notebook) - * [Assessment workflow](#assessment-workflow) - * [Group migration workflow](#group-migration-workflow) - * [Debug notebook](#debug-notebook) - * [Debug logs](#debug-logs) - * [Table Migration](#table-migration) - * [Prerequisites](#prerequisites) - * [Upgrade Process](#upgrade-process) - * [Step 1: Mapping Metastore Tables (UCX Only)](#step-1-mapping-metastore-tables-ucx-only) - * [Step 1.1: Create the mapping file](#step-11-create-the-mapping-file) - * [Step 1.2: Update the mapping file](#step-12-update-the-mapping-file) - * [Step 2: Create the necessary cloud principals for the upgrade](#step-2-create-the-necessary-cloud-principals-for-the-upgrade) - * [Step 2.1: Map the cloud principals to the cloud "prefixes"](#step-21-map-the-cloud-principals-to-the-cloud-prefixes) - * [Step 2.2: Create/Modify Cloud Principals and Credentials](#step-22-createmodify-cloud-principals-and-credentials) - * [Step 2.3: Create External Locations](#step-23-create-external-locations) - * [Step 2.4: Create "Uber Principal"](#step-24-create-uber-principal) - * [Step 2.5: Create Catalogs and Schemas](#step-25-create-catalogs-and-schemas) - * [Step 3: Upgrade the Metastore](#step-3-upgrade-the-metastore) - * [Table migration workflows](#table-migration-workflows) - * [Step 4: Odds and Ends](#step-4-odds-and-ends) - * [Step 4.1: Skipping Table/Schema](#step-41-skipping-tableschema) - * [Step 4.2: Moving objects](#step-42-moving-objects) - * [Step 4.2: Aliasing 
objects](#step-42-aliasing-objects) - * [Step 4.3: Reverting objects](#step-43-reverting-objects) - * [Post Migration Data Reconciliation Task](#post-migration-data-reconciliation-task) - * [Other considerations](#other-considerations) - * [[EXPERIMENTAL] Scan tables in mounts Workflow](#experimental-scan-tables-in-mounts-workflow) - * [Always run this workflow AFTER the assessment has finished](#balways-run-this-workflow-after-the-assessment-has-finishedb) - * [[EXPERIMENTAL] Migrate tables in mounts Workflow](#experimental-migrate-tables-in-mounts-workflow) - * [Jobs Static Code Analysis Workflow](#jobs-static-code-analysis-workflow) - * [Linter message codes](#linter-message-codes) - * [`cannot-autofix-table-reference`](#cannot-autofix-table-reference) - * [`catalog-api-in-shared-clusters`](#catalog-api-in-shared-clusters) - * [`changed-result-format-in-uc`](#changed-result-format-in-uc) - * [`direct-filesystem-access-in-sql-query`](#direct-filesystem-access-in-sql-query) - * [`direct-filesystem-access`](#direct-filesystem-access) - * [`dependency-not-found`](#dependency-not-found) - * [`jvm-access-in-shared-clusters`](#jvm-access-in-shared-clusters) - * [`legacy-context-in-shared-clusters`](#legacy-context-in-shared-clusters) - * [`not-supported`](#not-supported) - * [`notebook-run-cannot-compute-value`](#notebook-run-cannot-compute-value) - * [`python-udf-in-shared-clusters`](#python-udf-in-shared-clusters) - * [`rdd-in-shared-clusters`](#rdd-in-shared-clusters) - * [`spark-logging-in-shared-clusters`](#spark-logging-in-shared-clusters) - * [`sql-parse-error`](#sql-parse-error) - * [`sys-path-cannot-compute-value`](#sys-path-cannot-compute-value) - * [`table-migrated-to-uc`](#table-migrated-to-uc) - * [`to-json-in-shared-clusters`](#to-json-in-shared-clusters) - * [`unsupported-magic-line`](#unsupported-magic-line) + * [Readme notebook](#readme-notebook) + * [Assessment workflow](#assessment-workflow) + * [Group migration workflow](#group-migration-workflow) + * [Debug notebook](#debug-notebook) + * [Debug logs](#debug-logs) + * [Table Migration](#table-migration) + * [Prerequisites](#prerequisites) + * [Upgrade Process](#upgrade-process) + * [Step 1: Mapping Metastore Tables (UCX Only)](#step-1-mapping-metastore-tables-ucx-only) + * [Step 1.1: Create the mapping file](#step-11-create-the-mapping-file) + * [Step 1.2: Update the mapping file](#step-12-update-the-mapping-file) + * [Step 2: Create the necessary cloud principals for the upgrade](#step-2-create-the-necessary-cloud-principals-for-the-upgrade) + * [Step 2.1: Map the cloud principals to the cloud "prefixes"](#step-21-map-the-cloud-principals-to-the-cloud-prefixes) + * [Step 2.2: Create/Modify Cloud Principals and Credentials](#step-22-createmodify-cloud-principals-and-credentials) + * [Step 2.3: Create External Locations](#step-23-create-external-locations) + * [Step 2.4: Create "Uber Principal"](#step-24-create-uber-principal) + * [Step 2.5: Create Catalogs and Schemas](#step-25-create-catalogs-and-schemas) + * [Step 3: Upgrade the Metastore](#step-3-upgrade-the-metastore) + * [Table migration workflows](#table-migration-workflows) + * [Step 4: Odds and Ends](#step-4-odds-and-ends) + * [Step 4.1: Skipping Table/Schema](#step-41-skipping-tableschema) + * [Step 4.2: Moving objects](#step-42-moving-objects) + * [Step 4.2: Aliasing objects](#step-42-aliasing-objects) + * [Step 4.3: Reverting objects](#step-43-reverting-objects) + * [Post Migration Data Reconciliation Task](#post-migration-data-reconciliation-task) + * [Other 
considerations](#other-considerations) + * [[EXPERIMENTAL] Scan tables in mounts Workflow](#experimental-scan-tables-in-mounts-workflow) + * [Always run this workflow AFTER the assessment has finished](#balways-run-this-workflow-after-the-assessment-has-finishedb) + * [[EXPERIMENTAL] Migrate tables in mounts Workflow](#experimental-migrate-tables-in-mounts-workflow) + * [Jobs Static Code Analysis Workflow](#jobs-static-code-analysis-workflow) + * [Linter message codes](#linter-message-codes) + * [`cannot-autofix-table-reference`](#cannot-autofix-table-reference) + * [`catalog-api-in-shared-clusters`](#catalog-api-in-shared-clusters) + * [`changed-result-format-in-uc`](#changed-result-format-in-uc) + * [`direct-filesystem-access-in-sql-query`](#direct-filesystem-access-in-sql-query) + * [`direct-filesystem-access`](#direct-filesystem-access) + * [`dependency-not-found`](#dependency-not-found) + * [`jvm-access-in-shared-clusters`](#jvm-access-in-shared-clusters) + * [`legacy-context-in-shared-clusters`](#legacy-context-in-shared-clusters) + * [`not-supported`](#not-supported) + * [`notebook-run-cannot-compute-value`](#notebook-run-cannot-compute-value) + * [`python-udf-in-shared-clusters`](#python-udf-in-shared-clusters) + * [`rdd-in-shared-clusters`](#rdd-in-shared-clusters) + * [`spark-logging-in-shared-clusters`](#spark-logging-in-shared-clusters) + * [`sql-parse-error`](#sql-parse-error) + * [`sys-path-cannot-compute-value`](#sys-path-cannot-compute-value) + * [`table-migrated-to-uc`](#table-migrated-to-uc) + * [`to-json-in-shared-clusters`](#to-json-in-shared-clusters) + * [`unsupported-magic-line`](#unsupported-magic-line) * [Utility commands](#utility-commands) - * [`logs` command](#logs-command) - * [`ensure-assessment-run` command](#ensure-assessment-run-command) - * [`update-migration-progress` command](#update-migration-progress-command) - * [`repair-run` command](#repair-run-command) - * [`workflows` command](#workflows-command) - * [`open-remote-config` command](#open-remote-config-command) - * [`installations` command](#installations-command) - * [`report-account-compatibility` command](#report-account-compatibility-command) + * [`logs` command](#logs-command) + * [`ensure-assessment-run` command](#ensure-assessment-run-command) + * [`update-migration-progress` command](#update-migration-progress-command) + * [`repair-run` command](#repair-run-command) + * [`workflows` command](#workflows-command) + * [`open-remote-config` command](#open-remote-config-command) + * [`installations` command](#installations-command) + * [`report-account-compatibility` command](#report-account-compatibility-command) * [Metastore related commands](#metastore-related-commands) - * [`show-all-metastores` command](#show-all-metastores-command) - * [`assign-metastore` command](#assign-metastore-command) - * [`create-ucx-catalog` command](#create-ucx-catalog-command) + * [`show-all-metastores` command](#show-all-metastores-command) + * [`assign-metastore` command](#assign-metastore-command) + * [`create-ucx-catalog` command](#create-ucx-catalog-command) * [Table migration commands](#table-migration-commands) - * [`principal-prefix-access` command](#principal-prefix-access-command) - * [Access for AWS S3 Buckets](#access-for-aws-s3-buckets) - * [Access for Azure Storage Accounts](#access-for-azure-storage-accounts) - * [`create-missing-principals` command (AWS Only)](#create-missing-principals-command-aws-only) - * [`delete-missing-principals` command (AWS 
Only)](#delete-missing-principals-command-aws-only) - * [`create-uber-principal` command](#create-uber-principal-command) - * [`migrate-credentials` command](#migrate-credentials-command) - * [`validate-external-locations` command](#validate-external-locations-command) - * [`migrate-locations` command](#migrate-locations-command) - * [`create-table-mapping` command](#create-table-mapping-command) - * [`skip` command](#skip-command) - * [`create-catalogs-schemas` command](#create-catalogs-schemas-command) - * [`migrate-tables` command](#migrate-tables-command) - * [`revert-migrated-tables` command](#revert-migrated-tables-command) - * [`move` command](#move-command) - * [`alias` command](#alias-command) + * [`principal-prefix-access` command](#principal-prefix-access-command) + * [Access for AWS S3 Buckets](#access-for-aws-s3-buckets) + * [Access for Azure Storage Accounts](#access-for-azure-storage-accounts) + * [`create-missing-principals` command (AWS Only)](#create-missing-principals-command-aws-only) + * [`delete-missing-principals` command (AWS Only)](#delete-missing-principals-command-aws-only) + * [`create-uber-principal` command](#create-uber-principal-command) + * [`migrate-credentials` command](#migrate-credentials-command) + * [`validate-external-locations` command](#validate-external-locations-command) + * [`migrate-locations` command](#migrate-locations-command) + * [`create-table-mapping` command](#create-table-mapping-command) + * [`skip` command](#skip-command) + * [`unskip` command](#unskip-command) + * [`create-catalogs-schemas` command](#create-catalogs-schemas-command) + * [`migrate-tables` command](#migrate-tables-command) + * [`revert-migrated-tables` command](#revert-migrated-tables-command) + * [`move` command](#move-command) + * [`alias` command](#alias-command) * [Code migration commands](#code-migration-commands) - * [`lint-local-code` command](#lint-local-code-command) - * [`migrate-local-code` command](#migrate-local-code-command) - * [`migrate-dbsql-dashboards` command](#migrate-dbsql-dashboards-command) - * [`revert-dbsql-dashboards` command](#revert-dbsql-dashboards-command) + * [`lint-local-code` command](#lint-local-code-command) + * [`migrate-local-code` command](#migrate-local-code-command) + * [`migrate-dbsql-dashboards` command](#migrate-dbsql-dashboards-command) + * [`revert-dbsql-dashboards` command](#revert-dbsql-dashboards-command) * [Cross-workspace installations](#cross-workspace-installations) - * [`sync-workspace-info` command](#sync-workspace-info-command) - * [`manual-workspace-info` command](#manual-workspace-info-command) - * [`create-account-groups` command](#create-account-groups-command) - * [`validate-groups-membership` command](#validate-groups-membership-command) - * [`validate-table-locations` command](#validate-table-locations-command) - * [`cluster-remap` command](#cluster-remap-command) - * [`revert-cluster-remap` command](#revert-cluster-remap-command) - * [`upload` command](#upload-command) - * [`download` command](#download-command) - * [`join-collection` command](#join-collection command) - * [collection eligible command](#collection-eligible-command) + * [`sync-workspace-info` command](#sync-workspace-info-command) + * [`manual-workspace-info` command](#manual-workspace-info-command) + * [`create-account-groups` command](#create-account-groups-command) + * [`validate-groups-membership` command](#validate-groups-membership-command) + * [`validate-table-locations` command](#validate-table-locations-command) + * [`cluster-remap` 
command](#cluster-remap-command) + * [`revert-cluster-remap` command](#revert-cluster-remap-command) + * [`upload` command](#upload-command) + * [`download` command](#download-command) + * [`join-collection` command](#join-collection command) + * [collection eligible command](#collection-eligible-command) * [Common Challenges and the Solutions](#common-challenges-and-the-solutions) * [Network Connectivity Issues](#network-connectivity-issues) * [Insufficient Privileges](#insufficient-privileges) @@ -461,8 +462,8 @@ To effectively upgrade the metastores, four principal operations are required: 1. Assess - In this step, you evaluate the existing HMS tables identified for upgrade to determine the right approach for upgrade. This step is a prerequisite and is performed by the assessment workflow. 2. Create - In this step, you create the required UC assets such as, Metastore, Catalog, Schema, Storage Credentials, External Locations. This step is part of the upgrade process. 3. Upgrade/Grant these are two steps that UCX combine. - 4. Upgrade - The metastores objects (tables/views) will be converted to a format supported by UC - 4. Grant - The table upgrade the newly created object the same permission as the original object. + 4. Upgrade - The metastores objects (tables/views) will be converted to a format supported by UC + 4. Grant - The table upgrade the newly created object the same permission as the original object. ### Prerequisites For UCX to be able to upgrade the metastore. The following prerequisites must be met: @@ -662,12 +663,12 @@ This command will remove the upgraded table and reset the `upgraded_from` proper ### Post Migration Data Reconciliation Task UCX also provides `migrate-data-reconciliation` workflow to validate the integrity of the migrated tables: - Compare the schema of the source and target tables. The result is `schema_matches`, and column by column comparison - is stored as `column_comparison` struct. +is stored as `column_comparison` struct. - Compare the row counts of the source and target tables. If the row count is within the reconciliation threshold - (defaults to 5%), `data_matches` is True. +(defaults to 5%), `data_matches` is True. - Compare the content of individual row between source and target tables to identify any discrepancies (when `compare_rows` - flag is enabled). This is done using hash comparison, and number of missing rows are stored as `source_missing_count` - and `target_missing_count` +flag is enabled). This is done using hash comparison, and number of missing rows are stored as `source_missing_count` +and `target_missing_count` Once the workflow completes, the output will be stored in `$inventory_database.reconciliation_results` view, and displayed in the Migration dashboard. @@ -677,8 +678,8 @@ in the Migration dashboard. ### Other considerations - You may need to run the workflow multiple times to ensure all the tables are migrated successfully in phases. - If your Delta tables in DBFS root have a large number of files, consider: - - Setting higher `Min` and `Max workers for auto-scale` when being asked during the UCX installation. More cores in the cluster means more concurrency for calling cloud storage API to copy files when deep cloning the Delta tables. - - Setting higher `Parallelism for migrating DBFS root Delta tables with deep clone` (default 200) when being asked during the UCX installation. This controls the number of Spark tasks/partitions to be created for deep clone. 
+ - Setting higher `Min` and `Max workers for auto-scale` when being asked during the UCX installation. More cores in the cluster means more concurrency for calling cloud storage API to copy files when deep cloning the Delta tables. + - Setting higher `Parallelism for migrating DBFS root Delta tables with deep clone` (default 200) when being asked during the UCX installation. This controls the number of Spark tasks/partitions to be created for deep clone. - Consider creating an instance pool, and setting its id when prompted during the UCX installation. This instance pool will be specified in the cluster policy used by all UCX workflows job clusters. - You may also manually edit the job cluster configration per job or per task after the workflows are deployed. @@ -689,20 +690,20 @@ in the Migration dashboard. - It writes all results to `hive_metastore..tables`, you can query those tables found by filtering on database values that starts with `mounted_` - This command is incremental, meaning that each time you run it, it will overwrite the previous tables in mounts found. - Current format are supported: - - DELTA - PARQUET - CSV - JSON - - Also detects partitioned DELTA and PARQUET + - DELTA - PARQUET - CSV - JSON + - Also detects partitioned DELTA and PARQUET - You can configure these workflows with the following options available on conf.yml: - - include_mounts : A list of mount points to scans, by default the workflow scans for all mount points - - exclude_paths_in_mount : A list of paths to exclude in all mount points - - include_paths_in_mount : A list of paths to include in all mount points + - include_mounts : A list of mount points to scans, by default the workflow scans for all mount points + - exclude_paths_in_mount : A list of paths to exclude in all mount points + - include_paths_in_mount : A list of paths to include in all mount points ### [EXPERIMENTAL] Migrate tables in mounts Workflow - An experimental workflow that migrates tables in mount points using a `CREATE TABLE` command, optinally sets a default tables owner if provided in `default_table_owner` conf parameter. - You must do the following in order to make this work: - - run the Assessment [workflow](#assessment-workflow) - - run the scan tables in mounts [workflow](#EXPERIMENTAL-scan-tables-in-mounts-workflow) - - run the [`create-table-mapping` command](#create-table-mapping-command) - - or manually create a `mapping.csv` file in Workspace -> Applications -> ucx + - run the Assessment [workflow](#assessment-workflow) + - run the scan tables in mounts [workflow](#EXPERIMENTAL-scan-tables-in-mounts-workflow) + - run the [`create-table-mapping` command](#create-table-mapping-command) + - or manually create a `mapping.csv` file in Workspace -> Applications -> ucx [[back to top](#databricks-labs-ucx)] @@ -1074,26 +1075,26 @@ and opens it using the `webbrowser.open()` method. This command is useful for de edit the remote configuration file without having to manually navigate to it in the workspace. It can also be used to quickly access the configuration file from the command line. Here's the description of configuration properties: -* `inventory_database`: A string representing the name of the inventory database. -* `workspace_group_regex`: An optional string representing the regular expression to match workspace group names. -* `workspace_group_replace`: An optional string to replace the matched group names with. -* `account_group_regex`: An optional string representing the regular expression to match account group names. 
-* `group_match_by_external_id`: A boolean value indicating whether to match groups by their external IDs. -* `include_group_names`: An optional list of strings representing the names of groups to include for migration. -* `renamed_group_prefix`: An optional string representing the prefix to add to renamed group names. -* `instance_pool_id`: An optional string representing the ID of the instance pool. -* `warehouse_id`: An optional string representing the ID of the warehouse. -* `connect`: An optional `Config` object representing the configuration for connecting to the warehouse. -* `num_threads`: An optional integer representing the number of threads to use for migration. -* `database_to_catalog_mapping`: An optional dictionary mapping source database names to target catalog names. -* `default_catalog`: An optional string representing the default catalog name. -* `log_level`: An optional string representing the log level. -* `workspace_start_path`: A string representing the starting path for notebooks and directories crawler in the workspace. -* `instance_profile`: An optional string representing the name of the instance profile. -* `spark_conf`: An optional dictionary of Spark configuration properties. -* `override_clusters`: An optional dictionary mapping job cluster names to existing cluster IDs. -* `policy_id`: An optional string representing the ID of the cluster policy. -* `include_databases`: An optional list of strings representing the names of databases to include for migration. + * `inventory_database`: A string representing the name of the inventory database. + * `workspace_group_regex`: An optional string representing the regular expression to match workspace group names. + * `workspace_group_replace`: An optional string to replace the matched group names with. + * `account_group_regex`: An optional string representing the regular expression to match account group names. + * `group_match_by_external_id`: A boolean value indicating whether to match groups by their external IDs. + * `include_group_names`: An optional list of strings representing the names of groups to include for migration. + * `renamed_group_prefix`: An optional string representing the prefix to add to renamed group names. + * `instance_pool_id`: An optional string representing the ID of the instance pool. + * `warehouse_id`: An optional string representing the ID of the warehouse. + * `connect`: An optional `Config` object representing the configuration for connecting to the warehouse. + * `num_threads`: An optional integer representing the number of threads to use for migration. + * `database_to_catalog_mapping`: An optional dictionary mapping source database names to target catalog names. + * `default_catalog`: An optional string representing the default catalog name. + * `log_level`: An optional string representing the log level. + * `workspace_start_path`: A string representing the starting path for notebooks and directories crawler in the workspace. + * `instance_profile`: An optional string representing the name of the instance profile. + * `spark_conf`: An optional dictionary of Spark configuration properties. + * `override_clusters`: An optional dictionary mapping job cluster names to existing cluster IDs. + * `policy_id`: An optional string representing the ID of the cluster policy. + * `include_databases`: An optional list of strings representing the names of databases to include for migration. 
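To make the interplay between `database_to_catalog_mapping` and `default_catalog` concrete: the documented behaviour amounts to a per-database lookup with a fallback. The sketch below illustrates that intent only; it is not UCX's actual resolution code, and the `"main"` fallback is an arbitrary choice for the example:

```python
# Illustrative only: mirrors the documented semantics of the two config properties.
def resolve_target_catalog(
    src_database: str,
    database_to_catalog_mapping: dict[str, str] | None,
    default_catalog: str | None,
) -> str:
    """Pick the UC catalog a Hive metastore database should land in."""
    if database_to_catalog_mapping and src_database in database_to_catalog_mapping:
        return database_to_catalog_mapping[src_database]
    # Fall back to the configured default catalog ("main" here when neither is set).
    return default_catalog or "main"


assert resolve_target_catalog("sales", {"sales": "prod_sales"}, "main") == "prod_sales"
assert resolve_target_catalog("hr", {"sales": "prod_sales"}, None) == "main"
```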
[[back to top](#databricks-labs-ucx)] @@ -1326,7 +1327,7 @@ databricks labs ucx migrate-credentials For Azure, this command prompts to confirm performing the following credential migration steps: 1. [RECOMMENDED] For each storage account, create access connectors with managed identities that have the `Storage Blob Data Contributor` role on the respective storage account. A storage credential is created for each - access connector. + access connector. 2. Migrate Azure Service Principals, which have `Storage Blob Data Contributor`, `Storage Blob Data Reader`, `Storage Blob Data Owner`, or custom roles on ADLS Gen2 locations that are being used in Databricks, to UC storage credentials. The Azure Service Principals to location mapping are listed @@ -1335,8 +1336,8 @@ For Azure, this command prompts to confirm performing the following credential m Principals you do not want to be migrated. The command will only migrate the Service Principals that have client secret stored in Databricks Secret. -**Warning**: Service principals used to access storage accounts behind firewalls might cause connectivity issues. We -recommend to use access connectors instead. + **Warning**: Service principals used to access storage accounts behind firewalls might cause connectivity issues. We + recommend to use access connectors instead. For AWS, this command migrates AWS Instance Profiles that are being used in Databricks, to UC storage credentials. The AWS Instance Profiles to location mapping are listed in @@ -1437,6 +1438,15 @@ Once you're done with table migration, proceed to the [code migration](#code-mig [[back to top](#databricks-labs-ucx)] +## `unskip` command + +```commandline +databricks labs ucx unskip --schema X [--table Y] +``` +This command removes the mark set by the [`skip` command](#skip-command) on the given schema or table. + +[[back to top](#databricks-labs-ucx)] + ## `create-catalogs-schemas` command ```text @@ -1507,7 +1517,7 @@ Users will be prompted whether tables/view are dropped after moving to new schem This command moves different table types differently: - `MANAGED` tables are deep-cloned to the new schema. - `EXTERNAL` tables are dropped from the original schema, then created in the target schema using the same location. - This is due to Unity Catalog not supporting multiple tables with overlapping paths +This is due to Unity Catalog not supporting multiple tables with overlapping paths - `VIEW` are recreated using the same view definition. This command supports moving multiple tables at once, by specifying `*` as the table name. @@ -1729,14 +1739,14 @@ Options to resolve tables with overlapping locations are: Considerations when resolving tables with overlapping locations are: - Migrate the tables one workspace at a time: - - Let later migrated workspaces read tables from the earlier migrated workspace catalogs. - - [Move](#move-command) tables between schemas and catalogs when it fits the data management model. + - Let later migrated workspaces read tables from the earlier migrated workspace catalogs. + - [Move](#move-command) tables between schemas and catalogs when it fits the data management model. 
- The tables might have different: - - Metadata, like: - - Column schema (names, types, order) - - Description - - Tags - - ACLs + - Metadata, like: + - Column schema (names, types, order) + - Description + - Tags + - ACLs [[back to top](#databricks-labs-ucx)] @@ -1812,10 +1822,10 @@ $ databricks labs ucx join-collection --workspace-ids Date: Wed, 25 Sep 2024 11:50:03 +0100 Subject: [PATCH 14/19] merging --- README.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/README.md b/README.md index d78d5729fb..e7a75f7363 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,6 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`migrate-locations` command](#migrate-locations-command) * [`create-table-mapping` command](#create-table-mapping-command) * [`skip` command](#skip-command) - * [`unskip` command](#unskip-command) * [`create-catalogs-schemas` command](#create-catalogs-schemas-command) * [`migrate-tables` command](#migrate-tables-command) * [`revert-migrated-tables` command](#revert-migrated-tables-command) @@ -1438,15 +1437,6 @@ Once you're done with table migration, proceed to the [code migration](#code-mig [[back to top](#databricks-labs-ucx)] -## `unskip` command - -```commandline -databricks labs ucx unskip --schema X [--table Y] -``` -This command removes the mark set by the [`skip` command](#skip-command) on the given schema or table. - -[[back to top](#databricks-labs-ucx)] - ## `create-catalogs-schemas` command ```text From afb8a3e3e7577d06486aedbf4f609f12c8022dc7 Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 11:51:37 +0100 Subject: [PATCH 15/19] merging --- .../labs/ucx/hive_metastore/mapping.py | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index bde1cdaf89..f01ce1014e 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -135,32 +135,6 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) - def unskip_table_or_view( - self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None] - ) -> None: - """Removes skip mark from the table property.. - Args: - schema_name (String): The schema name of the table to be unskipped. - table_name (String): The table name of the table to be unskipped. - load_table (Callable): A function that loads a table from the metastore. - """ - table = load_table(schema_name, table_name) - if table is None: - raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") - try: - self._sql_backend.execute( - f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" - ) - except NotFound as e: - if "[TABLE_OR_VIEW_NOT_FOUND]" in str(e) or "[DELTA_TABLE_NOT_FOUND]" in str(e): - logger.error( - f"Failed to remove skip marker from table: {schema_name}.{table_name}. 
Table not found.", exc_info=e - ) - else: - logger.error(f"Failed to remove skip marker from table: {schema_name}.{table_name}", exc_info=e) - except BadRequest as e: - logger.error(f"Failed to remove skip marker from table: {schema_name}.{table_name}: {e!s}", exc_info=e) - def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property try: From 6915b2f05911323a0f3882dc6bc87575a548a9d3 Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 11:54:32 +0100 Subject: [PATCH 16/19] merging --- .../labs/ucx/hive_metastore/mapping.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index f01ce1014e..e5c3911b0e 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -135,6 +135,24 @@ def skip_table_or_view(self, schema_name: str, table_name: str, load_table: Call except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Callable[[str, str], Table | None]): + # Removes skip mark from the table property + try: + table = load_table(schema_name, table_name) + if table is None: + raise NotFound("[TABLE_OR_VIEW_NOT_FOUND]") + self._sql_backend.execute( + f"ALTER {table.kind} {escape_sql_identifier(schema_name)}.{escape_sql_identifier(table_name)} UNSET TBLPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}' );" + ) + except NotFound as err: + if "[TABLE_OR_VIEW_NOT_FOUND]" in str(err) or "[DELTA_TABLE_NOT_FOUND]" in str(err): + logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}. Table not found.") + else: + logger.error( + f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True + ) + except BadRequest as err: + logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property try: From 8a1893c9b524bda3d41d659dd4eab4b111371050 Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 11:55:42 +0100 Subject: [PATCH 17/19] merging --- .../labs/ucx/hive_metastore/mapping.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index e5c3911b0e..94f33a97fe 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -153,6 +153,7 @@ def unskip_table_or_view(self, schema_name: str, table_name: str, load_table: Ca ) except BadRequest as err: logger.error(f"Failed to apply skip marker for Table {schema_name}.{table_name}: {err!s}", exc_info=True) + def skip_schema(self, schema: str): # Marks a schema to be skipped in the migration process by applying a table property try: @@ -167,22 +168,19 @@ def skip_schema(self, schema: str): except BadRequest as err: logger.error(err) - def unskip_schema(self, schema: str) -> None: - """Removes skip mark from the schema property. - Args: - schema (String): The schema name of the table to be unskipped. 
- """ + def unskip_schema(self, schema: str): + # Removes skip mark from the schema property try: self._sql_backend.execute( - f"ALTER SCHEMA hive_metastore.{escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" + f"ALTER SCHEMA {escape_sql_identifier(schema)} UNSET DBPROPERTIES IF EXISTS('{self.UCX_SKIP_PROPERTY}');" ) - except NotFound as e: - if "[SCHEMA_NOT_FOUND]" in str(e): - logger.error(f"Failed to remove skip marker from schema: {schema}. Schema not found.", exc_info=e) + except NotFound as err: + if "[SCHEMA_NOT_FOUND]" in str(err): + logger.error(f"Failed to remove skip marker for Schema {schema}. Schema not found.") else: - logger.error(f"Failed to remove skip marker from schema: {schema}.", exc_info=e) - except BadRequest as e: - logger.error(f"Failed to remove skip marker from schema: {schema}.", exc_info=e) + logger.error(err) + except BadRequest as err: + logger.error(err) def get_tables_to_migrate(self, tables_crawler: TablesCrawler) -> Collection[TableToMigrate]: rules = self.load() From 34aab8ffa8566ac4736fdbaa83ba5afe5581e9bb Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 11:56:35 +0100 Subject: [PATCH 18/19] merging --- tests/unit/hive_metastore/test_mapping.py | 61 ++++------------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index 0734a9ae03..492c5f17e3 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -175,15 +175,15 @@ def test_load_mapping(): ws.tables.get.assert_not_called() assert [ - Rule( - workspace_name="foo-bar", - catalog_name="foo_bar", - src_schema="foo", - dst_schema="foo", - src_table="bar", - dst_table="bar", - ) - ] == rules + Rule( + workspace_name="foo-bar", + catalog_name="foo_bar", + src_schema="foo", + dst_schema="foo", + src_table="bar", + dst_table="bar", + ) + ] == rules def test_skip_happy_path(caplog): @@ -211,49 +211,6 @@ def test_skip_happy_path(caplog): assert len(caplog.records) == 0 -def test_unskip_on_table(): - ws = create_autospec(WorkspaceClient) - mock_backend = MockBackend() - installation = MockInstallation() - mapping = TableMapping(installation, ws, mock_backend) - table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv") - mapping.unskip_table_or_view(schema_name="schema", table_name="table", load_table=lambda _schema, _table: table) - ws.tables.get.assert_not_called() - assert ( - f"ALTER TABLE `schema`.`table` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" - in mock_backend.queries - ) - - -def test_unskip_on_view(): - ws = create_autospec(WorkspaceClient) - mock_backend = MockBackend() - installation = MockInstallation() - mapping = TableMapping(installation, ws, mock_backend) - view = Table( - catalog="catalog", database="schema", name="table", object_type="table", table_format="csv", view_text="stuff" - ) - mapping.unskip_table_or_view(schema_name="schema", table_name="view", load_table=lambda _schema, _table: view) - ws.tables.get.assert_not_called() - assert ( - f"ALTER VIEW `schema`.`view` UNSET TBLPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" - in mock_backend.queries - ) - - -def test_unskip_on_schema(): - ws = create_autospec(WorkspaceClient) - mock_backend = MockBackend() - installation = MockInstallation() - mapping = TableMapping(installation, ws, mock_backend) - mapping.unskip_schema(schema="schema") - 
ws.tables.get.assert_not_called() - assert ( - f"ALTER SCHEMA hive_metastore.`schema` UNSET DBPROPERTIES IF EXISTS('{mapping.UCX_SKIP_PROPERTY}');" - in mock_backend.queries - ) - - def test_skip_missing_schema(caplog): ws = create_autospec(WorkspaceClient) sbe = create_autospec(SqlBackend) From 45f27efe555081991c83c8b836b6af54ce2e8979 Mon Sep 17 00:00:00 2001 From: hari-selvarajan_data Date: Wed, 25 Sep 2024 12:04:32 +0100 Subject: [PATCH 19/19] merging --- .../labs/ucx/contexts/application.py | 3 +-- tests/unit/azure/test_locations.py | 2 +- tests/unit/hive_metastore/test_mapping.py | 18 +++++++++--------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index cd138596ae..95944a3d2a 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -18,7 +18,6 @@ from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core -from databricks.sdk.errors import NotFound from databricks.sdk.service import sql from databricks.labs.ucx.account.workspaces import WorkspaceInfo @@ -323,7 +322,7 @@ def principal_acl(self): self.installation, self.tables_crawler, self.mounts_crawler, - self.principal_locations, + self.principal_locations_retriever, ) @cached_property diff --git a/tests/unit/azure/test_locations.py b/tests/unit/azure/test_locations.py index 95c9ff8bfd..f1b901638b 100644 --- a/tests/unit/azure/test_locations.py +++ b/tests/unit/azure/test_locations.py @@ -44,7 +44,7 @@ def test_run_service_principal(): # mock crawled HMS external locations mock_backend = MockBackend( rows={ - r"SELECT \* FROM `hive_metastore`.`location_test`.`external_locations`": EXTERNAL_LOCATIONS[ + r"SELECT \* FROM `hive_metastore`.`location_test`.`external_locations`": EXTERNAL_LOCATIONS[ ("abfss://container1@test.dfs.core.windows.net/one/", 1), ("abfss://container2@test.dfs.core.windows.net/", 2), ] diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index 492c5f17e3..0ea2951131 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -175,15 +175,15 @@ def test_load_mapping(): ws.tables.get.assert_not_called() assert [ - Rule( - workspace_name="foo-bar", - catalog_name="foo_bar", - src_schema="foo", - dst_schema="foo", - src_table="bar", - dst_table="bar", - ) - ] == rules + Rule( + workspace_name="foo-bar", + catalog_name="foo_bar", + src_schema="foo", + dst_schema="foo", + src_table="bar", + dst_table="bar", + ) + ] == rules def test_skip_happy_path(caplog):
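With the final shape of `unskip_table_or_view` above, a regression test can mirror the existing skip tests. A minimal sketch follows; the import paths are assumed from the usual UCX test layout, and the SQL assertion is kept deliberately loose since the exact statement text is an implementation detail:

```python
from unittest.mock import create_autospec

from databricks.labs.blueprint.installation import MockInstallation  # assumed import path
from databricks.labs.lsql.backends import MockBackend  # assumed import path
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.hive_metastore.mapping import TableMapping
from databricks.labs.ucx.hive_metastore.tables import Table  # assumed import path


def test_unskip_table_emits_unset_tblproperties():
    ws = create_autospec(WorkspaceClient)
    mock_backend = MockBackend()
    mapping = TableMapping(MockInstallation(), ws, mock_backend)
    table = Table(catalog="catalog", database="schema", name="table", object_type="table", table_format="csv")

    mapping.unskip_table_or_view("schema", "table", load_table=lambda _schema, _table: table)

    ws.tables.get.assert_not_called()
    # Keep the assertion loose: only check that the skip property is being unset.
    assert any(
        "UNSET TBLPROPERTIES" in query and mapping.UCX_SKIP_PROPERTY in query
        for query in mock_backend.queries
    )
```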