Added databricks labs ucx create-uber-principal command to create Azure Service Principal for migration (#976)

## Changes
- Added a new CLI command `create-uber-principal` to labs.yml and cli.py (see the sketch after this list)
- Added a separate `AzureAPIClient` class to isolate the Azure API calls
- Added logic to create the SPN, its secret and the role assignments in resources.py, and to update the workspace config with the SPN client_id
- Added logic to call the SPN creation, update the RBAC of every storage account used by external tables to grant that SPN access, and update the UCX cluster policy with the SPN secret for each storage account
- Added unit and integration test cases
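
For context, a minimal sketch of the flow the command drives, assuming the `for_cli` wiring shown in the `access.py` diff below; the standalone `WorkspaceClient`/`Prompts` setup and the subscription-id placeholder are illustrative rather than the exact `cli.py` code:

```python
from databricks.labs.blueprint.tui import Prompts
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.azure.access import AzureResourcePermissions

ws = WorkspaceClient()  # authenticate against the target workspace
prompts = Prompts()

# for_cli builds AzureAPIClient instances for ARM and Microsoft Graph, wraps them
# in AzureResources, and binds the result to the current UCX installation.
resource_permissions = AzureResourcePermissions.for_cli(ws, include_subscriptions="<subscription-id>")

# Creates the uber service principal and secret, grants it STORAGE_BLOB_DATA_READER
# on the storage accounts used by external tables, and writes the SPN details into
# the UCX cluster policy.
resource_permissions.create_uber_principal(prompts)
```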

Resolves #881 

Related issues: 
- #993
- #693

### Functionality 

- [ ] added relevant user documentation
- [X] added new CLI command
- [ ] modified existing command: `databricks labs ucx ...`
- [ ] added a new workflow
- [ ] modified existing workflow: `...`
- [ ] added a new table
- [ ] modified existing table: `...`

### Tests

- [X] manually tested
- [X] added unit tests
- [X] added integration tests
- [ ] verified on staging environment (screenshot attached)
HariGS-DB authored Mar 5, 2024
1 parent 2c137f7 commit 18a7d2d
Showing 17 changed files with 796 additions and 133 deletions.
7 changes: 7 additions & 0 deletions labs.yml
@@ -108,6 +108,13 @@ commands:
- name: aws-profile
description: AWS Profile to use for authentication

- name: create-uber-principal
description: For Azure cloud, creates a service principal and gives it STORAGE BLOB READER access on all the storage accounts
used by tables in the workspace, and stores the SPN info in the UCX cluster policy.
flags:
- name: subscription-id
description: Subscription to scan storage account in
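# Illustrative invocation (the subscription id is a placeholder):
#   databricks labs ucx create-uber-principal --subscription-id <subscription-id>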

- name: validate-groups-membership
description: Validate groups to check if the groups at account level and workspace level have different memberships
table_template: |-
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -27,7 +27,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
]
dependencies = ["databricks-sdk~=0.20.0",
"databricks-labs-blueprint~=0.3.0",
"databricks-labs-blueprint~=0.3.1",
"PyYAML>=6.0.0,<7.0.0"]

[project.entry-points.databricks]
133 changes: 131 additions & 2 deletions src/databricks/labs/ucx/azure/access.py
@@ -1,11 +1,20 @@
import json
import uuid
from dataclasses import dataclass

from databricks.labs.blueprint.installation import Installation
from databricks.labs.blueprint.tui import Prompts
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import NotFound, ResourceAlreadyExists
from databricks.sdk.service.catalog import Privilege

from databricks.labs.ucx.assessment.crawlers import logger
from databricks.labs.ucx.azure.resources import AzureResource, AzureResources
from databricks.labs.ucx.azure.resources import (
AzureAPIClient,
AzureResource,
AzureResources,
PrincipalSecret,
)
from databricks.labs.ucx.config import WorkspaceConfig
from databricks.labs.ucx.framework.crawlers import StatementExecutionBackend
from databricks.labs.ucx.hive_metastore.locations import ExternalLocations
@@ -46,7 +55,12 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx', include_subscriptions=None)
installation = Installation.current(ws, product)
config = installation.load(WorkspaceConfig)
sql_backend = StatementExecutionBackend(ws, config.warehouse_id)
azurerm = AzureResources(ws, include_subscriptions=include_subscriptions)
azure_mgmt_client = AzureAPIClient(
ws.config.arm_environment.resource_manager_endpoint,
ws.config.arm_environment.service_management_endpoint,
)
graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com")
azurerm = AzureResources(azure_mgmt_client, graph_client, include_subscriptions)
locations = ExternalLocations(ws, sql_backend, config.inventory_database)
return cls(installation, ws, azurerm, locations)

@@ -91,6 +105,121 @@ def save_spn_permissions(self) -> str | None:
return None
return self._installation.save(storage_account_infos, filename=self._filename)

def _update_cluster_policy_definition(
self,
policy_definition: str,
storage_accounts: list[AzureResource],
uber_principal: PrincipalSecret,
inventory_database: str,
) -> str:
policy_dict = json.loads(policy_definition)
tenant_id = self._azurerm.tenant_id()
endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
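        # For every storage account below, five fixed Spark confs are added to the
        # policy definition (OAuth client id, provider type, token endpoint, auth
        # type and client secret), each keyed by the account's
        # <storage account>.dfs.core.windows.net suffix.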
for storage in storage_accounts:
policy_dict[
f"spark_conf.fs.azure.account.oauth2.client.id.{storage.storage_account}.dfs.core.windows.net"
] = self._policy_config(uber_principal.client.client_id)
policy_dict[
f"spark_conf.fs.azure.account.oauth.provider.type.{storage.storage_account}.dfs.core.windows.net"
] = self._policy_config("org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
policy_dict[
f"spark_conf.fs.azure.account.oauth2.client.endpoint.{storage.storage_account}.dfs.core.windows.net"
] = self._policy_config(endpoint)
policy_dict[f"spark_conf.fs.azure.account.auth.type.{storage.storage_account}.dfs.core.windows.net"] = (
self._policy_config("OAuth")
)
policy_dict[
f"spark_conf.fs.azure.account.oauth2.client.secret.{storage.storage_account}.dfs.core.windows.net"
] = self._policy_config(f"{{secrets/{inventory_database}/uber_principal_secret}}")
return json.dumps(policy_dict)

@staticmethod
def _policy_config(value: str):
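        # Cluster-policy "fixed" elements pin the value so it cannot be overridden
        # on clusters created from the policy.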
return {"type": "fixed", "value": value}

def _update_cluster_policy_with_spn(
self,
policy_id: str,
storage_accounts: list[AzureResource],
uber_principal: PrincipalSecret,
inventory_database: str,
):
try:
policy_definition = ""
cluster_policy = self._ws.cluster_policies.get(policy_id)

self._installation.save(cluster_policy, filename="policy-backup.json")

if cluster_policy.definition is not None:
policy_definition = self._update_cluster_policy_definition(
cluster_policy.definition, storage_accounts, uber_principal, inventory_database
)
if cluster_policy.name is not None:
self._ws.cluster_policies.edit(policy_id, cluster_policy.name, definition=policy_definition)
except NotFound:
msg = f"cluster policy {policy_id} not found, please run UCX installation to create UCX cluster policy"
raise NotFound(msg) from None

def create_uber_principal(self, prompts: Prompts):
config = self._installation.load(WorkspaceConfig)
inventory_database = config.inventory_database
display_name = f"unity-catalog-migration-{inventory_database}-{self._ws.get_workspace_id()}"
uber_principal_name = prompts.question(
"Enter a name for the uber service principal to be created", default=display_name
)
policy_id = config.policy_id
if policy_id is None:
msg = "UCX cluster policy not found in config. Please run latest UCX installation to set cluster policy"
logger.error(msg)
raise ValueError(msg) from None
if config.uber_spn_id is not None:
logger.warning("Uber service principal already created for this workspace.")
return
used_storage_accounts = self._get_storage_accounts()
if len(used_storage_accounts) == 0:
logger.warning(
"There are no external table present with azure storage account. "
"Please check if assessment job is run"
)
return
storage_account_info = []
for storage in self._azurerm.storage_accounts():
if storage.storage_account in used_storage_accounts:
storage_account_info.append(storage)
logger.info("Creating service principal")
uber_principal = self._azurerm.create_service_principal(uber_principal_name)
self._create_scope(uber_principal, inventory_database)
config.uber_spn_id = uber_principal.client.client_id
logger.info(
f"Created service principal of client_id {config.uber_spn_id}. " f"Applying permission on storage accounts"
)
try:
self._apply_storage_permission(storage_account_info, uber_principal)
self._installation.save(config)
self._update_cluster_policy_with_spn(policy_id, storage_account_info, uber_principal, inventory_database)
except PermissionError:
self._azurerm.delete_service_principal(uber_principal.client.object_id)
logger.info(f"Update UCX cluster policy {policy_id} with spn connection details for storage accounts")

def _apply_storage_permission(self, storage_account_info: list[AzureResource], uber_principal: PrincipalSecret):
for storage in storage_account_info:
role_name = str(uuid.uuid4())
self._azurerm.apply_storage_permission(
uber_principal.client.object_id, storage, "STORAGE_BLOB_DATA_READER", role_name
)
logger.debug(
f"Storage Data Blob Reader permission applied for spn {uber_principal.client.client_id} "
f"to storage account {storage.storage_account}"
)

def _create_scope(self, uber_principal: PrincipalSecret, inventory_database: str):
logger.info(f"Creating secret scope {inventory_database}.")
try:
self._ws.secrets.create_scope(inventory_database)
except ResourceAlreadyExists:
logger.warning(f"Secret scope {inventory_database} already exists, using the same")
self._ws.secrets.put_secret(inventory_database, "uber_principal_secret", string_value=uber_principal.secret)

def load(self):
return self._installation.load(list[StoragePermissionMapping], filename=self._filename)

9 changes: 7 additions & 2 deletions src/databricks/labs/ucx/azure/credentials.py
@@ -18,7 +18,7 @@
AzureResourcePermissions,
StoragePermissionMapping,
)
from databricks.labs.ucx.azure.resources import AzureResources
from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources
from databricks.labs.ucx.config import WorkspaceConfig
from databricks.labs.ucx.framework.crawlers import StatementExecutionBackend
from databricks.labs.ucx.hive_metastore.locations import ExternalLocations
@@ -171,7 +171,12 @@ def for_cli(cls, ws: WorkspaceClient, installation: Installation, prompts: Promp

config = installation.load(WorkspaceConfig)
sql_backend = StatementExecutionBackend(ws, config.warehouse_id)
azurerm = AzureResources(ws)
azure_mgmt_client = AzureAPIClient(
ws.config.arm_environment.resource_manager_endpoint,
ws.config.arm_environment.service_management_endpoint,
)
graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com")
azurerm = AzureResources(azure_mgmt_client, graph_client)
locations = ExternalLocations(ws, sql_backend, config.inventory_database)

resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations)