Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Support for Migrating Table ACL of Interactive clusters using SPN #1077

Merged
merged 32 commits into from
Mar 31, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
11f85bc
initial commit
HariGS-DB Mar 20, 2024
a2c41ea
merge
HariGS-DB Mar 23, 2024
4f4dccd
moving code to Grants and generating Grant objects
HariGS-DB Mar 24, 2024
3f7a461
moving code to Grants and generating Grant objects
HariGS-DB Mar 25, 2024
30847b8
Merge branch 'main' into feature/907
HariGS-DB Mar 26, 2024
d60d266
azure spn test case
HariGS-DB Mar 27, 2024
b8d3095
Merge branch 'main' into feature/907
HariGS-DB Mar 27, 2024
7c73578
added ACLPrincipal test cases
HariGS-DB Mar 28, 2024
0b76645
merging
HariGS-DB Mar 28, 2024
72671a9
adding changes to TableMigrate and unit test cases
HariGS-DB Mar 28, 2024
4b6bd95
fmting
HariGS-DB Mar 28, 2024
54ff3cb
integration test
HariGS-DB Mar 29, 2024
3798cdd
merges
HariGS-DB Mar 29, 2024
f8e5725
formating
HariGS-DB Mar 29, 2024
1bbe118
handling scenarios for mounts
HariGS-DB Mar 29, 2024
390d646
Merge branch 'main' into feature/907
HariGS-DB Mar 29, 2024
a65ea07
review comments
HariGS-DB Mar 30, 2024
4e71d6a
merge
HariGS-DB Mar 30, 2024
fb281e7
passing sql_backend in the cli
HariGS-DB Mar 30, 2024
69cd19a
calling init directly in runtime
HariGS-DB Mar 30, 2024
bd561e6
fmting
HariGS-DB Mar 30, 2024
4f6c7f4
spiliting big method into two to reduce pylint warning
HariGS-DB Mar 30, 2024
752797d
review comments
HariGS-DB Mar 30, 2024
be7e757
naming standards
HariGS-DB Mar 30, 2024
4abe7af
fixing storage account extraction in AzureACL
HariGS-DB Mar 30, 2024
cf65f47
timeout change to 3 mins
HariGS-DB Mar 30, 2024
b23d2d6
fixes to int test failues
HariGS-DB Mar 31, 2024
4d2ba61
removing circular references for Grants crawler and fixing test_insta…
HariGS-DB Mar 31, 2024
f2af302
removing circular references for Grants crawler and fixing test_insta…
HariGS-DB Mar 31, 2024
543f07c
Merge branch 'main' into feature/907
HariGS-DB Mar 31, 2024
50bb6b2
removing connection parameter in config.yml
HariGS-DB Mar 31, 2024
a19aaeb
Update __init__.py
nfx Mar 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion src/databricks/labs/ucx/assessment/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from databricks.labs.lsql.backends import SqlBackend
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import NotFound
from databricks.sdk.service.compute import ClusterSource, Policy
from databricks.sdk.service.compute import ClusterSource, DataSecurityMode, Policy

from databricks.labs.ucx.assessment.crawlers import azure_sp_conf_present_check, logger
from databricks.labs.ucx.assessment.jobs import JobsMixin
Expand All @@ -30,6 +30,15 @@ class AzureServicePrincipalInfo:
storage_account: str | None = None


@dataclass
class ServicePrincipalClusterMapping:
    """Ties one interactive cluster to the Azure service principals it uses.

    Kept as a separate class (rather than extending AzureServicePrincipalInfo)
    because we need the cluster-to-spn association, not just the principals.
    """

    # id of the cluster where the spn(s) are configured
    cluster_id: str
    # set of service principals detected for this cluster
    spn_info: set[AzureServicePrincipalInfo]


class AzureServicePrincipalCrawler(CrawlerBase[AzureServicePrincipalInfo], JobsMixin, SecretsMixin):
def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
super().__init__(sbe, "hive_metastore", schema, "azure_service_principals", AzureServicePrincipalInfo)
Expand Down Expand Up @@ -171,3 +180,16 @@ def _get_azure_spn_from_config(self, config: dict) -> set[AzureServicePrincipalI
)
)
return set_service_principals

def get_cluster_to_storage_mapping(self) -> list[ServicePrincipalClusterMapping]:
    """Map each eligible interactive cluster to the service principals it uses.

    A cluster is eligible when it is not a job cluster and runs without UC
    data security (legacy single-user or no-isolation mode). The spn may come
    either directly from the cluster config or through a cluster policy.

    Returns:
        One ServicePrincipalClusterMapping per eligible cluster (the spn set
        may be empty when no spn is configured on the cluster).
    """
    spn_cluster_mapping = []
    for cluster in self._ws.clusters.list():
        # fix: dropped the dead `set_service_principals = set[...]()` seed —
        # it was unconditionally overwritten before any use
        if cluster.cluster_source != ClusterSource.JOB and (
            cluster.data_security_mode in [DataSecurityMode.LEGACY_SINGLE_USER, DataSecurityMode.NONE]
        ):
            spns = self._get_azure_spn_from_cluster_config(cluster)
            spn_cluster_mapping.append(ServicePrincipalClusterMapping(cluster.cluster_id, spns))
    return spn_cluster_mapping
212 changes: 209 additions & 3 deletions src/databricks/labs/ucx/hive_metastore/grants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,27 @@
from dataclasses import dataclass
from functools import partial

from databricks.labs.blueprint.installation import Installation
from databricks.labs.blueprint.parallel import ManyError, Threads
from databricks.sdk.service.catalog import SchemaInfo, TableInfo

from databricks.labs.lsql.backends import SqlBackend, StatementExecutionBackend
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import ResourceDoesNotExist
from databricks.sdk.service.catalog import ExternalLocationInfo, SchemaInfo, TableInfo

from databricks.labs.ucx.assessment.azure import (
AzureServicePrincipalCrawler,
AzureServicePrincipalInfo,
)
from databricks.labs.ucx.azure.access import (
AzureResourcePermissions,
StoragePermissionMapping,
)
from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources
from databricks.labs.ucx.config import WorkspaceConfig
from databricks.labs.ucx.framework.crawlers import CrawlerBase
from databricks.labs.ucx.framework.utils import escape_sql_identifier
from databricks.labs.ucx.hive_metastore.tables import TablesCrawler
from databricks.labs.ucx.hive_metastore.locations import ExternalLocations
from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler
from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -127,6 +142,7 @@
("TABLE", "SELECT"): self._uc_action("SELECT"),
("TABLE", "MODIFY"): self._uc_action("MODIFY"),
("TABLE", "READ_METADATA"): self._uc_action("BROWSE"),
("TABLE", "ALL PRIVILEGES"): self._uc_action("ALL PRIVILEGES"),
("TABLE", "OWN"): self._set_owner_sql,
("VIEW", "SELECT"): self._uc_action("SELECT"),
("VIEW", "READ_METADATA"): self._uc_action("BROWSE"),
Expand Down Expand Up @@ -307,3 +323,193 @@
# TODO: https://github.com/databrickslabs/ucx/issues/406
logger.error(f"Couldn't fetch grants for object {on_type} {key}: {e}")
return []


class PrincipalACL:
    """Derives Unity Catalog grants from principals that use interactive clusters."""

    def __init__(
        self,
        ws: WorkspaceClient,
        backend: SqlBackend,
        installation: Installation,
        table_crawler: TablesCrawler,
        spn_crawler: AzureServicePrincipalCrawler | None = None,
        resource_permission: AzureResourcePermissions | None = None,
    ):
        # plain dependency injection; no work happens at construction time
        self._ws = ws
        self._backend = backend
        self._installation = installation
        self._table_crawler = table_crawler
        # Azure-specific collaborators; None on non-Azure workspaces
        self._spn_crawler = spn_crawler
        self._resource_permission = resource_permission

@classmethod
def for_cli(cls, ws: WorkspaceClient, installation: Installation):
    """Construct a PrincipalACL from an installation (CLI entry point).

    Only Azure is wired up; AWS and GCP (and anything else) yield None,
    with an error logged for GCP since it is not supported yet.
    """
    cfg = installation.load(WorkspaceConfig)
    backend = StatementExecutionBackend(ws, cfg.warehouse_id)
    ext_locations = ExternalLocations(ws, backend, cfg.inventory_database)
    tables = TablesCrawler(backend, cfg.inventory_database)
    if ws.config.is_azure:
        mgmt_client = AzureAPIClient(
            ws.config.arm_environment.resource_manager_endpoint,
            ws.config.arm_environment.service_management_endpoint,
        )
        graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com")
        azure_resources = AzureResources(mgmt_client, graph_client)
        permissions = AzureResourcePermissions(installation, ws, azure_resources, ext_locations)
        spns = AzureServicePrincipalCrawler(ws, backend, cfg.inventory_database)
        return cls(ws, backend, installation, tables, spns, permissions)
    if ws.config.is_aws:
        return None
    if ws.config.is_gcp:
        logger.error("UCX is not supported for GCP yet. Please run it on azure or aws")
    return None

Check warning on line 366 in src/databricks/labs/ucx/hive_metastore/grants.py

View check run for this annotation

Codecov / codecov/patch

src/databricks/labs/ucx/hive_metastore/grants.py#L366

Added line #L366 was not covered by tests

def get_interactive_cluster_grants(self) -> list[Grant]:
    """Return grants derived from interactive-cluster principals.

    Only Azure workspaces are handled; every other cloud yields no grants.
    """
    if not self._ws.config.is_azure:
        return []
    return self._get_azure_grants()

Check warning on line 371 in src/databricks/labs/ucx/hive_metastore/grants.py

View check run for this annotation

Codecov / codecov/patch

src/databricks/labs/ucx/hive_metastore/grants.py#L371

Added line #L371 was not covered by tests

def _get_azure_grants(self) -> list[Grant]:
    # NOTE(review): builds UC grants for the principals of Azure interactive
    # clusters; raises ResourceDoesNotExist when the outputs of the
    # migrate_locations / principal_prefix_access cli commands are missing.
assert self._spn_crawler is not None
assert self._resource_permission is not None
HariGS-DB marked this conversation as resolved.
Show resolved Hide resolved
spn_cluster_mapping = self._spn_crawler.get_cluster_to_storage_mapping()
if len(spn_cluster_mapping) == 0:
# if there are no interactive clusters , then return empty grants
logger.info("No interactive cluster found with spn configured")
return []
external_locations = list(self._ws.external_locations.list())
if len(external_locations) == 0:
# if there are no external locations, then throw an error to run migrate_locations cli command
msg = (
"No external location found, If hive metastore tables are created in external storage, "
"ensure migrate_locations cli cmd is run to create the required locations."
HariGS-DB marked this conversation as resolved.
Show resolved Hide resolved
)
logger.error(msg)
raise ResourceDoesNotExist(msg) from None
HariGS-DB marked this conversation as resolved.
Show resolved Hide resolved

permission_mappings = self._resource_permission.load()
if len(permission_mappings) == 0:
# if permission mapping is empty, raise an error to run principal_prefix cmd
msg = "No storage permission file found. Please ensure principal_prefix_access cli cmd is run to create the access permission file."
logger.error(msg)
raise ResourceDoesNotExist(msg) from None
tables = self._table_crawler.snapshot()
grants: list[Grant] = []

for cluster_spn in spn_cluster_mapping:
principals = self._get_cluster_principal_mapping(cluster_spn.cluster_id)
if len(principals) == 0:
continue
for spn in cluster_spn.spn_info:
eligible_locations = self._get_external_location(spn, external_locations, permission_mappings)
if len(eligible_locations) == 0:
continue

Check warning on line 407 in src/databricks/labs/ucx/hive_metastore/grants.py

View check run for this annotation

Codecov / codecov/patch

src/databricks/labs/ucx/hive_metastore/grants.py#L407

Added line #L407 was not covered by tests
grant = self._get_grants(eligible_locations, principals, tables)
grants.extend(grant)
catalog_grants = [Grant(principal, "USE", "hive_metastore") for principal in principals]
grants.extend(catalog_grants)

# NOTE(review): de-duplicates via set() — presumably Grant is a hashable
# (frozen or eq/hash-defined) dataclass; confirm against its definition.
return list(set(grants))
HariGS-DB marked this conversation as resolved.
Show resolved Hide resolved

def _get_aws_grants(self) -> list[Grant]:
    # TODO: AWS support is not implemented yet — get_interactive_cluster_grants
    # currently only dispatches to the Azure path, so this stub is never reached.
    return []

Check warning on line 417 in src/databricks/labs/ucx/hive_metastore/grants.py

View check run for this annotation

Codecov / codecov/patch

src/databricks/labs/ucx/hive_metastore/grants.py#L417

Added line #L417 was not covered by tests

def _get_privilege(self, table: Table, locations: dict[str, str]):
    """Pick the storage privilege for *table* from the location->privilege map.

    Views yield None so they fall through to the dedicated view handling;
    tables without a location or rooted in DBFS default to WRITE_FILES;
    otherwise the first matching external-location prefix decides, and no
    match yields None.
    """
    # views are granted separately — report no storage privilege here
    if table.view_text is not None:
        return None
    location = table.location
    if location is None or location.startswith(('dbfs:/', '/dbfs/')):
        return "WRITE_FILES"
    for prefix, privilege in locations.items():
        if prefix is not None and location.startswith(prefix):
            return privilege
    return None

def _get_database_grants(self, tables: list[Table], principals: list[str]) -> list[Grant]:
    """Grant USE on every distinct database of *tables* to every principal."""
    # dict.fromkeys de-duplicates while preserving first-seen order,
    # matching the original append-if-absent loop
    databases = list(dict.fromkeys(table.database for table in tables))
    grants = []
    for database in databases:
        for principal in principals:
            grants.append(Grant(principal, "USE", "hive_metastore", database))
    return grants

def _get_grants(
    self,
    locations: dict[str, str],
    principals: list[str],
    tables: list[Table],
) -> list[Grant]:
    """Translate per-table storage privileges into UC grants for *principals*.

    READ_FILES maps to SELECT and WRITE_FILES to ALL PRIVILEGES on the table;
    views always receive ALL PRIVILEGES. Every table that received a grant
    also contributes a database-level USE grant via _get_database_grants.
    """
    grants: list[Grant] = []
    granted_tables: list[Table] = []
    for table in tables:
        privilege = self._get_privilege(table, locations)
        if privilege == "READ_FILES":
            for principal in principals:
                grants.append(Grant(principal, "SELECT", table.catalog, table.database, table.name))
            granted_tables.append(table)
        if privilege == "WRITE_FILES":
            for principal in principals:
                grants.append(Grant(principal, "ALL PRIVILEGES", table.catalog, table.database, table.name))
            granted_tables.append(table)
        if table.view_text is not None:
            for principal in principals:
                grants.append(Grant(principal, "ALL PRIVILEGES", table.catalog, table.database, view=table.name))
            granted_tables.append(table)
    grants.extend(self._get_database_grants(granted_tables, principals))
    return grants

def _get_external_location(
    self,
    spn: AzureServicePrincipalInfo,
    external_locations: list[ExternalLocationInfo],
    permission_mappings: list[StoragePermissionMapping],
) -> dict[str, str]:
    """Map external-location URLs reachable by *spn* to its privilege there.

    A location matches when its URL starts with a permission-mapping prefix
    belonging to this spn's application id and storage account.
    """
    matching: dict[str, str] = {}
    for location in external_locations:
        url = location.url
        if url is None:
            continue
        for mapping in permission_mappings:
            same_spn = mapping.client_id == spn.application_id
            same_account = spn.storage_account is not None and spn.storage_account in mapping.prefix
            if same_spn and same_account and url.startswith(mapping.prefix):
                matching[url] = mapping.privilege
    return matching

def _get_cluster_principal_mapping(self, cluster_id: str) -> list[str]:
    """List users, groups and service principals permitted on the cluster.

    The workspace-local "admins" group is deliberately excluded.
    """
    cluster_permission = self._ws.permissions.get("clusters", cluster_id)
    acl_entries = cluster_permission.access_control_list
    if acl_entries is None:
        return []
    principals: list[str] = []
    for entry in acl_entries:
        if entry.user_name is not None:
            principals.append(entry.user_name)
        if entry.group_name is not None and entry.group_name != "admins":
            principals.append(entry.group_name)
        if entry.service_principal_name is not None:
            principals.append(entry.service_principal_name)
    return principals
17 changes: 15 additions & 2 deletions src/databricks/labs/ucx/hive_metastore/table_migrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from databricks.labs.ucx.framework.crawlers import CrawlerBase
from databricks.labs.ucx.framework.utils import escape_sql_identifier
from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler
from databricks.labs.ucx.hive_metastore.grants import Grant
from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL
from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping
from databricks.labs.ucx.hive_metastore.tables import (
AclMigrationWhat,
Expand Down Expand Up @@ -52,6 +52,7 @@ def __init__(
table_mapping: TableMapping,
group_manager: GroupManager,
migration_status_refresher,
principal_grants: PrincipalACL,
):
self._tc = table_crawler
self._gc = grant_crawler
Expand All @@ -61,6 +62,7 @@ def __init__(
self._group = group_manager
self._migration_status_refresher = migration_status_refresher
self._seen_tables: dict[str, str] = {}
self._principal_grants = principal_grants

@classmethod
def for_cli(cls, ws: WorkspaceClient, product='ucx'):
Expand All @@ -72,9 +74,17 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx'):
grants_crawler = GrantsCrawler(table_crawler, udfs_crawler)
table_mapping = TableMapping(installation, ws, sql_backend)
group_manager = GroupManager(sql_backend, ws, config.inventory_database)
principal_grants = PrincipalACL.for_cli(ws, installation)
migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, config.inventory_database, table_crawler)
return cls(
table_crawler, grants_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher
table_crawler,
grants_crawler,
ws,
sql_backend,
table_mapping,
group_manager,
migration_status_refresher,
principal_grants,
)

def index(self):
Expand All @@ -87,6 +97,7 @@ def migrate_tables(self, *, what: What | None = None, acl_strategy: list[AclMigr
if acl_strategy is not None:
grants_to_migrate = self._gc.snapshot()
migrated_groups = self._group.snapshot()
principal_grants = self._principal_grants.get_interactive_cluster_grants()
else:
acl_strategy = []
for table in tables_to_migrate:
Expand All @@ -95,6 +106,8 @@ def migrate_tables(self, *, what: What | None = None, acl_strategy: list[AclMigr
continue
if AclMigrationWhat.LEGACY_TACL in acl_strategy:
grants.extend(self._match_grants(table.src, grants_to_migrate, migrated_groups))
if AclMigrationWhat.PRINCIPAL in acl_strategy:
grants.extend(self._match_grants(table.src, principal_grants, migrated_groups))
tasks.append(partial(self._migrate_table, table.src, table.rule, grants))
Threads.strict("migrate tables", tasks)

Expand Down
27 changes: 22 additions & 5 deletions src/databricks/labs/ucx/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
Mounts,
TablesCrawler,
)
from databricks.labs.ucx.hive_metastore.grants import PrincipalACL
from databricks.labs.ucx.hive_metastore.mapping import TableMapping
from databricks.labs.ucx.hive_metastore.table_migrate import (
MigrationStatusRefresher,
TablesMigrate,
)
from databricks.labs.ucx.hive_metastore.table_size import TableSizeCrawler
from databricks.labs.ucx.hive_metastore.tables import AclMigrationWhat, What
from databricks.labs.ucx.hive_metastore.tables import What
from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler
from databricks.labs.ucx.hive_metastore.verification import VerifyHasMetastore
from databricks.labs.ucx.workspace_access.generic import WorkspaceListing
Expand Down Expand Up @@ -434,11 +435,19 @@ def migrate_external_tables_sync(
table_crawler = TablesCrawler(sql_backend, cfg.inventory_database)
udf_crawler = UdfsCrawler(sql_backend, cfg.inventory_database)
grant_crawler = GrantsCrawler(table_crawler, udf_crawler)
table_mapping = TableMapping(install, ws, sql_backend)
table_mappings = TableMapping(install, ws, sql_backend)
HariGS-DB marked this conversation as resolved.
Show resolved Hide resolved
migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler)
group_manager = GroupManager(sql_backend, ws, cfg.inventory_database)
interactive_grants = PrincipalACL.for_cli(ws, install)
HariGS-DB marked this conversation as resolved.
Show resolved Hide resolved
TablesMigrate(
table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher
table_crawler,
grant_crawler,
ws,
sql_backend,
table_mappings,
group_manager,
migration_status_refresher,
interactive_grants,
).migrate_tables(what=What.EXTERNAL_SYNC)


Expand All @@ -454,11 +463,19 @@ def migrate_dbfs_root_delta_tables(
table_crawler = TablesCrawler(sql_backend, cfg.inventory_database)
udf_crawler = UdfsCrawler(sql_backend, cfg.inventory_database)
grant_crawler = GrantsCrawler(table_crawler, udf_crawler)
table_mapping = TableMapping(install, ws, sql_backend)
table_mappings = TableMapping(install, ws, sql_backend)
HariGS-DB marked this conversation as resolved.
Show resolved Hide resolved
migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler)
group_manager = GroupManager(sql_backend, ws, cfg.inventory_database)
interactive_grants = PrincipalACL.for_cli(ws, install)
TablesMigrate(
HariGS-DB marked this conversation as resolved.
Show resolved Hide resolved
table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher
table_crawler,
grant_crawler,
ws,
sql_backend,
table_mappings,
group_manager,
migration_status_refresher,
interactive_grants,
).migrate_tables(what=What.DBFS_ROOT_DELTA)


Expand Down
Loading
Loading