Skip to content

Commit

Permalink
Improved cleanup for workspace backup groups by adding more retries o…
Browse files Browse the repository at this point in the history
…n errors (#375)

Fixed deletion of backup groups [issue #374].
Added rate limits and retries to group operations [issue #353].
Temp fix for issue #359
Added log messages for better visibility.
Added useful troubleshooting snippets to the docs.
  • Loading branch information
mwojtyczka authored Oct 4, 2023
1 parent 17db2ef commit b0e82a0
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 36 deletions.
38 changes: 37 additions & 1 deletion docs/local-group-migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,4 +202,40 @@ To apply the permissions, we use the following logic:
2. Map the items to the relevant `support` object. If no relevant `support` object is found, an exception is raised.
3. Deserialize the items using the relevant applier.
4. Generate a list of callables that will apply the permissions.
5. Execute the callables in parallel.
5. Execute the callables in parallel.

## Troubleshooting

Below are some useful code snippets that can be useful for troubleshooting.
Make sure to install [databricks-sdk](https://docs.databricks.com/en/dev-tools/sdk-python.html) on the cluster to run it.

1. Find workspace-local groups that are eligible for migration to the account:
```
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import iam
ws = WorkspaceClient()
workspace_groups = [
g
for g in ws.groups.list(attributes='id,displayName,meta')
if g.meta.resource_type == "WorkspaceGroup"
]
print(f'Found {len(workspace_groups)} workspace-local groups')
account_groups = [
iam.Group.from_dict(r)
for r in ws.api_client.do(
"get",
"/api/2.0/account/scim/v2/Groups",
query={"attributes": "id,displayName,meta,members"},
).get("Resources", [])
]
account_groups = [g for g in account_groups if g.display_name not in ["users", "admins", "account users"]]
print(f"Found {len(account_groups)} account groups")
ws_group_names = {_.display_name for _ in workspace_groups}
ac_group_names = {_.display_name for _ in account_groups}
group_names = list(ws_group_names.intersection(ac_group_names))
print(f"Found {len(group_names)} groups to migrate")
```
1 change: 0 additions & 1 deletion src/databricks/labs/ucx/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,6 @@ def migrate_permissions(cfg: WorkspaceConfig):
def delete_backup_groups(cfg: WorkspaceConfig):
"""Removes workspace-level backup groups"""
toolkit = GroupMigrationToolkit(cfg)
toolkit.prepare_environment()
toolkit.delete_backup_groups()


Expand Down
9 changes: 8 additions & 1 deletion src/databricks/labs/ucx/workspace_access/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from databricks.sdk import WorkspaceClient
from databricks.sdk.core import DatabricksError
from databricks.sdk.retries import retried
from databricks.sdk.service import iam, ml, workspace

from databricks.labs.ucx.mixins.hardening import rate_limited
Expand All @@ -26,6 +27,10 @@ class GenericPermissionsInfo:
request_type: str


class RetryableError(DatabricksError):
pass


class GenericPermissionsSupport(Crawler, Applier):
def __init__(self, ws: WorkspaceClient, listings: list[Callable[..., Iterator[GenericPermissionsInfo]]]):
self._ws = ws
Expand Down Expand Up @@ -68,6 +73,8 @@ def _crawler_task(self, object_type: str, object_id: str) -> Permissions | None:
raw=json.dumps(permissions.as_dict()),
)

# TODO remove after ES-892977 is fixed
@retried(on=[RetryableError])
def _safe_get_permissions(self, object_type: str, object_id: str) -> iam.ObjectPermissions | None:
try:
return self._ws.permissions.get(object_type, object_id)
Expand All @@ -81,7 +88,7 @@ def _safe_get_permissions(self, object_type: str, object_id: str) -> iam.ObjectP
logger.warning(f"Could not get permissions for {object_type} {object_id} due to {e.error_code}")
return None
else:
raise e
raise RetryableError() from e

def _prepare_new_acl(
self, permissions: iam.ObjectPermissions, migration_state: GroupMigrationState, destination: Destination
Expand Down
68 changes: 47 additions & 21 deletions src/databricks/labs/ucx/workspace_access/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from functools import partial

from databricks.sdk import WorkspaceClient
from databricks.sdk.core import DatabricksError
from databricks.sdk.retries import retried
from databricks.sdk.service import iam
from databricks.sdk.service.iam import Group

Expand Down Expand Up @@ -59,19 +61,19 @@ def __init__(self, ws: WorkspaceClient, groups: GroupsConfig):
self._workspace_groups = self._list_workspace_groups()

def _list_workspace_groups(self) -> list[iam.Group]:
logger.debug("Listing workspace groups...")
logger.info("Listing workspace groups...")
workspace_groups = [
g
for g in self._ws.groups.list(attributes=self.SCIM_ATTRIBUTES)
if g.meta.resource_type == "WorkspaceGroup" and g.display_name not in self.SYSTEM_GROUPS
]
logger.debug(f"Found {len(workspace_groups)} workspace groups")
logger.info(f"Found {len(workspace_groups)} workspace groups")
return sorted(workspace_groups, key=lambda _: _.display_name)

def _list_account_groups(self) -> list[iam.Group]:
# TODO: we should avoid using this method, as it's not documented
# get account-level groups even if they're not (yet) assigned to a workspace
logger.debug("Listing account groups...")
logger.info("Listing account groups...")
account_groups = [
iam.Group.from_dict(r)
for r in self._ws.api_client.do(
Expand All @@ -81,7 +83,7 @@ def _list_account_groups(self) -> list[iam.Group]:
).get("Resources", [])
]
account_groups = [g for g in account_groups if g.display_name not in self.SYSTEM_GROUPS]
logger.debug(f"Found {len(account_groups)} account groups")
logger.info(f"Found {len(account_groups)} account groups")
return sorted(account_groups, key=lambda _: _.display_name)

def _get_group(self, group_name, level: GroupLevel) -> iam.Group | None:
Expand All @@ -90,6 +92,8 @@ def _get_group(self, group_name, level: GroupLevel) -> iam.Group | None:
if group.display_name == group_name:
return group

@retried(on=[DatabricksError])
@rate_limited(max_requests=5)
def _get_or_create_backup_group(self, source_group_name: str, source_group: iam.Group) -> iam.Group:
backup_group_name = f"{self.config.backup_group_prefix}{source_group_name}"
backup_group = self._get_group(backup_group_name, "workspace")
Expand Down Expand Up @@ -131,17 +135,24 @@ def get_group_info(name: str):
def _replace_group(self, migration_info: MigrationGroupInfo):
ws_group = migration_info.workspace

logger.info(f"Deleting the workspace-level group {ws_group.display_name} with id {ws_group.id}")
self._ws.groups.delete(ws_group.id)
self._delete_workspace_group(ws_group)

# delete ws_group from the list of workspace groups
self._workspace_groups = [g for g in self._workspace_groups if g.id != ws_group.id]

logger.info(f"Workspace-level group {ws_group.display_name} with id {ws_group.id} was deleted")

self._reflect_account_group_to_workspace(migration_info.account)

@rate_limited(max_requests=5) # assumption
@retried(on=[DatabricksError])
@rate_limited(max_requests=5)
def _delete_workspace_group(self, ws_group: iam.Group) -> None:
logger.info(f"Deleting the workspace-level group {ws_group.display_name} with id {ws_group.id}")

self._ws.groups.delete(id=ws_group.id)

logger.info(f"Workspace-level group {ws_group.display_name} with id {ws_group.id} was deleted")

@retried(on=[DatabricksError])
@rate_limited(max_requests=10)
def _reflect_account_group_to_workspace(self, acc_group: iam.Group) -> None:
logger.info(f"Reflecting group {acc_group.display_name} to workspace")

Expand All @@ -153,6 +164,24 @@ def _reflect_account_group_to_workspace(self, acc_group: iam.Group) -> None:

logger.info(f"Group {acc_group.display_name} successfully reflected to workspace")

def _get_backup_groups(self) -> list[iam.Group]:
if self.config.selected:
ac_group_names = {_.display_name for _ in self._account_groups if _.display_name in self.config.selected}
else:
ac_group_names = {_.display_name for _ in self._account_groups}

backup_groups = [
g
for g in self._workspace_groups
if g.display_name.startswith(self.config.backup_group_prefix)
# backup groups are only created for workspace groups that have corresponding account group
and g.display_name.removeprefix(self.config.backup_group_prefix) in ac_group_names
]

logger.info(f"Found {len(backup_groups)} backup groups")

return backup_groups

# please keep the public methods below this line

def prepare_groups_in_environment(self):
Expand Down Expand Up @@ -187,6 +216,7 @@ def prepare_groups_in_environment(self):
ws_group_names = {_.display_name for _ in self._workspace_groups}
ac_group_names = {_.display_name for _ in self._account_groups}
valid_group_names = list(ws_group_names.intersection(ac_group_names))
logger.info(f"Found {len(valid_group_names)} workspace groups that have corresponding account groups")

self._set_migration_groups(valid_group_names)
logger.info("Environment prepared successfully")
Expand All @@ -212,21 +242,17 @@ def replace_workspace_groups_with_account_groups(self):
logger.info("Workspace groups were successfully replaced with account-level groups")

def delete_backup_groups(self):
if len(self._migration_state.groups) == 0:
backup_groups = self._get_backup_groups()

if len(backup_groups) == 0:
logger.info("No backup group found, nothing to do")
return

logger.info(
f"Deleting the workspace-level backup groups. "
f"In total, {len(self.migration_groups_provider.groups)} group(s) to be deleted"
f"Deleting the workspace-level backup groups. In total, {len(backup_groups)} group(s) to be deleted"
)

for migration_info in self.migration_groups_provider.groups:
try:
self._ws.groups.delete(id=migration_info.backup.id)
except Exception as e:
logger.warning(
f"Failed to delete backup group {migration_info.backup.display_name} "
f"with id {migration_info.backup.id}"
)
logger.warning(f"Original exception {e}")
for group in backup_groups:
self._delete_workspace_group(group)

logger.info("Backup groups were successfully deleted")
9 changes: 8 additions & 1 deletion src/databricks/labs/ucx/workspace_access/scim.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from functools import partial

from databricks.sdk import WorkspaceClient
from databricks.sdk.core import DatabricksError
from databricks.sdk.retries import retried
from databricks.sdk.service import iam

from databricks.labs.ucx.mixins.hardening import rate_limited
Expand All @@ -22,14 +24,19 @@ def is_item_relevant(self, item: Permissions, migration_state: GroupMigrationSta
return any(g.workspace.id == item.object_id for g in migration_state.groups)

def get_crawler_tasks(self):
groups = self._ws.groups.list(attributes="id,displayName,roles,entitlements")
groups = self._get_groups()
with_roles = [g for g in groups if g.roles and len(g.roles) > 0]
with_entitlements = [g for g in groups if g.entitlements and len(g.entitlements) > 0]
for g in with_roles:
yield partial(self._crawler_task, g, "roles")
for g in with_entitlements:
yield partial(self._crawler_task, g, "entitlements")

# TODO remove after ES-892977 is fixed
@retried(on=[DatabricksError])
def _get_groups(self):
return self._ws.groups.list(attributes="id,displayName,roles,entitlements")

def _get_apply_task(self, item: Permissions, migration_state: GroupMigrationState, destination: Destination):
value = [iam.ComplexValue.from_dict(e) for e in json.loads(item.raw)]
target_info = [g for g in migration_state.groups if g.workspace.id == item.object_id]
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_installation.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def test_jobs_with_no_inventory_database(
)

try:
for step in ["assessment", "migrate-groups"]:
for step in ["assessment", "migrate-groups", "migrate-groups-cleanup"]:
logger.debug(f"starting {step} job: {ws.config.host}#job/{install._deployed_steps[step]}")
ws.jobs.run_now(install._deployed_steps[step]).result()

Expand Down
8 changes: 4 additions & 4 deletions tests/unit/workspace_access/test_generic.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
from unittest.mock import MagicMock

import pytest
from databricks.sdk.core import DatabricksError
from databricks.sdk.service import compute, iam, ml

Expand Down Expand Up @@ -111,9 +110,10 @@ def test_safe_get():
result = sup._safe_get_permissions("clusters", "test")
assert result is None

ws.permissions.get.side_effect = DatabricksError(error_code="SOMETHING_UNEXPECTED")
with pytest.raises(DatabricksError):
sup._safe_get_permissions("clusters", "test")
# TODO uncomment after ES-892977 is fixed. The code now is retried.
# ws.permissions.get.side_effect = DatabricksError(error_code="SOMETHING_UNEXPECTED")
# with pytest.raises(DatabricksError):
# sup._safe_get_permissions("clusters", "test")


def test_no_permissions():
Expand Down
41 changes: 35 additions & 6 deletions tests/unit/workspace_access/test_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def test_replace_workspace_groups_with_account_groups_should_call_delete_and_do(
manager._migration_state.groups = [group_info]
manager.replace_workspace_groups_with_account_groups()

client.groups.delete.assert_called_with(test_ws_group_id)
client.groups.delete.assert_called_with(id=test_ws_group_id)
client.api_client.do.assert_called_with(
"PUT",
f"/api/2.0/preview/permissionassignments/principals/{test_acc_group_id}",
Expand Down Expand Up @@ -372,19 +372,48 @@ def test_workspace_only_groups():
def test_delete_backup_groups():
client = Mock()

test_ws_group = Group(display_name="de", meta=ResourceMeta(resource_type="WorkspaceGroup"))
test_acc_group = Group(display_name="de", meta=ResourceMeta(resource_type="Group"))
backup_group_id = "100"
client.groups.list.return_value = [test_ws_group]
client.groups.create.return_value = Group(
ws_group = Group(display_name="de", meta=ResourceMeta(resource_type="Group"))
test_ws_backup_group = Group(
display_name="dbr_backup_de", meta=ResourceMeta(resource_type="WorkspaceGroup"), id=backup_group_id
)

client.groups.list.return_value = [ws_group, test_ws_backup_group]

test_acc_group = Group(display_name="de", meta=ResourceMeta(resource_type="Group"))
client.api_client.do.return_value = {
"Resources": [g.as_dict() for g in [test_acc_group]],
}

group_conf = GroupsConfig(backup_group_prefix="dbr_backup_", auto=True)
manager = GroupManager(client, group_conf)
manager.delete_backup_groups()
client.groups.delete.assert_called_with(id=backup_group_id)


def test_delete_selected_backup_groups():
client = Mock()

backup_group_id = "100"
ws_group = Group(display_name="de", meta=ResourceMeta(resource_type="Group"))
test_ws_backup_group = Group(
display_name="dbr_backup_de", meta=ResourceMeta(resource_type="WorkspaceGroup"), id=backup_group_id
)

ws_group_to_skip = Group(display_name="de2", meta=ResourceMeta(resource_type="Group"))
test_ws_backup_group_to_skip = Group(
display_name="dbr_backup_de2", meta=ResourceMeta(resource_type="WorkspaceGroup"), id="1"
)

client.groups.list.return_value = [ws_group, test_ws_backup_group, ws_group_to_skip, test_ws_backup_group_to_skip]

test_acc_group = Group(display_name="de", meta=ResourceMeta(resource_type="Group"))
test_acc_group_to_skip = Group(display_name="de2", meta=ResourceMeta(resource_type="Group"))
client.api_client.do.return_value = {
"Resources": [g.as_dict() for g in [test_acc_group, test_acc_group_to_skip]],
}

group_conf = GroupsConfig(backup_group_prefix="dbr_backup_", selected=["de"])
manager = GroupManager(client, group_conf)
manager.prepare_groups_in_environment()
manager.delete_backup_groups()
client.groups.delete.assert_called_with(id=backup_group_id)

0 comments on commit b0e82a0

Please sign in to comment.