From 11f85bcd9b11c1ce9dca576f2a95821263417661 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Wed, 20 Mar 2024 22:16:08 +0000 Subject: [PATCH 01/24] initial commit --- labs.yml | 5 + src/databricks/labs/ucx/assessment/azure.py | 24 +++ src/databricks/labs/ucx/cli.py | 19 ++ .../ucx/hive_metastore/table_acl_migrate.py | 174 ++++++++++++++++++ 4 files changed, 222 insertions(+) create mode 100644 src/databricks/labs/ucx/hive_metastore/table_acl_migrate.py diff --git a/labs.yml b/labs.yml index b863f57aa2..59ec2ea60c 100644 --- a/labs.yml +++ b/labs.yml @@ -143,3 +143,8 @@ commands: - name: create-catalogs-schemas description: Create UC external catalogs and schemas based on the destinations created from create_table_mapping command. This command is supposed to be run before migrating tables to UC. + + - name: migrate-table-acl + description: Migrates legacy ACL on pre-uc clusters to the new UC ACL model. Interactive clusters reading/writing to external storage + use service principal / instance profiles to access the underlying data. Users get access to the data by having permission to the cluster. + This command will convert those access to UC ACL for the identified tables and users groups. 
diff --git a/src/databricks/labs/ucx/assessment/azure.py b/src/databricks/labs/ucx/assessment/azure.py index 69b75c398e..04576821a4 100644 --- a/src/databricks/labs/ucx/assessment/azure.py +++ b/src/databricks/labs/ucx/assessment/azure.py @@ -30,6 +30,15 @@ class AzureServicePrincipalInfo: storage_account: str | None = None +@dataclass() +class AzureServicePrincipalClusterMapping: + # this class is created separately as we need cluster to spn mapping + # Cluster id where the spn is used + cluster_id: str + # spn info data class + spn_info: set[AzureServicePrincipalInfo] + + class AzureServicePrincipalCrawler(CrawlerBase[AzureServicePrincipalInfo], JobsMixin, SecretsMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): super().__init__(sbe, "hive_metastore", schema, "azure_service_principals", AzureServicePrincipalInfo) @@ -171,3 +180,18 @@ def _get_azure_spn_from_config(self, config: dict) -> set[AzureServicePrincipalI ) ) return set_service_principals + + def get_cluster_to_storage_mapping(self): + # this function gives a mapping between an interactive cluster and the spn used by it + # either directly or through a cluster policy. 
+ set_service_principals = set[AzureServicePrincipalInfo]() + spn_cluster_mapping = [] + for cluster in self._ws.clusters.list(): + if cluster.cluster_source != ClusterSource.JOB and ( + cluster.data_security_mode.LEGACY_SINGLE_USER or cluster.data_security_mode.NONE + ): + set_service_principals = self._get_azure_spn_from_cluster_config(cluster) + spn_cluster_mapping.append( + AzureServicePrincipalClusterMapping(cluster.cluster_id, set_service_principals) + ) + return spn_cluster_mapping diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 9199136d6c..e1fc1317ab 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -23,6 +23,7 @@ from databricks.labs.ucx.hive_metastore import ExternalLocations, TablesCrawler from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema from databricks.labs.ucx.hive_metastore.mapping import TableMapping +from databricks.labs.ucx.hive_metastore.table_acl_migrate import TableACLMigrate from databricks.labs.ucx.hive_metastore.table_migrate import TablesMigrate from databricks.labs.ucx.hive_metastore.table_move import TableMove from databricks.labs.ucx.install import WorkspaceInstallation @@ -504,5 +505,23 @@ def create_catalogs_schemas(w: WorkspaceClient, prompts: Prompts): catalog_schema.create_catalog_schema() +@ucx.command +def migrate_table_acl(w: WorkspaceClient, prompts: Prompts): + """This command migrates legacy ACL on pre-uc clusters to the new UC ACL model. Interactive clusters reading/writing + to external storage use service principal / instance profiles to access the underlying data. Users get access to + the data by having permission to the cluster. This command will convert those access to UC ACL for the identified + tables and users groups. 
+ """ + installation = Installation.current(w, 'ucx') + if w.config.is_azure: + logger.info("Running migrate_table_acl for Azure") + table_acl_migrate = TableACLMigrate.for_cli(w, installation, prompts) + table_acl_migrate.migrate_cluster_acl() + if w.config.is_aws: + logger.error("Running migrate_table_acl for AWS") + if w.config.is_gcp: + logger.error("migrate_table_acl is not yet supported in GCP") + + if __name__ == "__main__": ucx() diff --git a/src/databricks/labs/ucx/hive_metastore/table_acl_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_acl_migrate.py new file mode 100644 index 0000000000..63c1bf2c17 --- /dev/null +++ b/src/databricks/labs/ucx/hive_metastore/table_acl_migrate.py @@ -0,0 +1,174 @@ +import logging +from dataclasses import dataclass + +from databricks.labs.blueprint.installation import Installation +from databricks.labs.blueprint.tui import Prompts +from databricks.labs.lsql.backends import SqlBackend, StatementExecutionBackend +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.catalog import ExternalLocationInfo + +from databricks.labs.ucx.assessment.azure import ( + AzureServicePrincipalClusterMapping, + AzureServicePrincipalCrawler, + AzureServicePrincipalInfo, +) +from databricks.labs.ucx.azure.access import ( + AzureResourcePermissions, + StoragePermissionMapping, +) +from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources +from databricks.labs.ucx.config import WorkspaceConfig +from databricks.labs.ucx.hive_metastore import ExternalLocations +from databricks.labs.ucx.hive_metastore.mapping import TableMapping + +logger = logging.getLogger(__name__) + + +@dataclass +class IAMPrincipal: + # user or group or spn + principal_type: str + # id of the principal + principal_id: str + + +@dataclass +class ClusterPrincipalMapping: + # cluster id + cluster_id: str + # list of all principals that has access to this cluster id + principals: list[IAMPrincipal] + + +class TableACLMigrate: + def 
__init__( + self, + ws: WorkspaceClient, + backend: SqlBackend, + installation: Installation, + spn_crawler: AzureServicePrincipalCrawler, + resource_permission: AzureResourcePermissions, + table_mapping: TableMapping, + ): + self._backend = backend + self._ws = ws + self._spn_crawler = spn_crawler + self._installation = installation + self._resource_permission = resource_permission + self._table_mapping = table_mapping + + @classmethod + def for_cli(cls, ws: WorkspaceClient, installation: Installation, prompts: Prompts): + msg = ( + "This cmd will migrate acl for all interactive clusters to the related storage account tables, " + "Please confirm " + ) + if not prompts.confirm(msg): + return None + + config = installation.load(WorkspaceConfig) + sql_backend = StatementExecutionBackend(ws, config.warehouse_id) + azure_client = AzureAPIClient( + ws.config.arm_environment.resource_manager_endpoint, + ws.config.arm_environment.service_management_endpoint, + ) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + azurerm = AzureResources(azure_client, graph_client) + locations = ExternalLocations(ws, sql_backend, config.inventory_database) + + resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) + spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) + table_mapping = TableMapping(installation, ws, sql_backend) + + return cls(ws, sql_backend, installation, spn_crawler, resource_permissions, table_mapping) + + def migrate_cluster_acl(self): + spn_cluster_mapping = self._spn_crawler.get_cluster_to_storage_mapping() + external_locations = self._get_principal_prefix(spn_cluster_mapping) + + if len(spn_cluster_mapping) == 0: + logger.info("There are no interactive clusters configured with service principals") + return + for cluster_spn in spn_cluster_mapping: + # user_mapping = self._get_cluster_principal_mapping(cluster_spn.cluster_id) + logger.info(f"Applying UC ACL 
for cluster id {cluster_spn.cluster_id}") + for spn in cluster_spn.spn_info: + self._apply_uc_permission(external_locations) + + def _get_principal_prefix( + self, spn_cluster_mapping: list[AzureServicePrincipalClusterMapping] + ) -> list[StoragePermissionMapping]: + set_service_principals = set[AzureServicePrincipalInfo]() + for spn_mapping in spn_cluster_mapping: + set_service_principals.update(spn_mapping.spn_info) + external_locations = [] + permission_mappings = self._resource_permission.load() + for spn in set_service_principals: + for perm_mapping in permission_mappings: + if perm_mapping.client_id == spn.application_id and spn.storage_account in perm_mapping.prefix: + external_locations.append(perm_mapping) + return external_locations + + def _get_tables(self, locations: list[ExternalLocationInfo]): + matching_tables = [] + tables = self._table_mapping.load() + for table in tables: + table_name = f"{table.catalog_name}.{table.dst_schema}.{table.dst_table}" + table_info = self._ws.tables.get(table_name) + for location in locations: + assert location.url is not None + if table_info.storage_location is not None and table_info.storage_location.startswith(location.url): + matching_tables.append(location) + return matching_tables + + def _get_external_location(self, locations: list[StoragePermissionMapping]) -> list[ExternalLocationInfo]: + matching_location = [] + for location in self._ws.external_locations.list(): + for principal_prefix in locations: + assert location.url is not None + if location.url.startswith(principal_prefix.prefix): + matching_location.append(location) + return matching_location + + def _apply_uc_permission( + self, + locations: list[StoragePermissionMapping], + ): + matching_external_locations = self._get_external_location(locations) + matching_tables = self._get_tables(matching_external_locations) + print(matching_tables) + + def _get_cluster_principal_mapping(self, cluster_id: str) -> list[IAMPrincipal]: + # gets all the 
users,groups,spn which have access to the clusters and returns a dataclass of that mapping + principal_list = [] + cluster_permission = self._ws.permissions.get("clusters", cluster_id) + if cluster_permission.access_control_list is None: + return [] + for acl in cluster_permission.access_control_list: + if acl.user_name is not None: + principal_list.append(IAMPrincipal("user", acl.user_name)) + if acl.group_name is not None: + principal_list.append(IAMPrincipal("group", acl.group_name)) + if acl.service_principal_name is not None: + principal_list.append(IAMPrincipal("spn", acl.service_principal_name)) + return principal_list + + def _get_spn_permission(self, spn_cluster_mapping: list[AzureServicePrincipalClusterMapping]): + set_service_principals = set[AzureServicePrincipalInfo]() + for spn in spn_cluster_mapping: + set_service_principals.update(spn.spn_info) + cluster_user_mapping = [] + for cluster in spn_cluster_mapping: + principal_list = [] + cluster_permission = self._ws.permissions.get("clusters", cluster.cluster_id) + if cluster_permission.access_control_list is None: + return [] + for acl in cluster_permission.access_control_list: + if acl.user_name is not None: + principal_list.append(IAMPrincipal("user", acl.user_name)) + if acl.group_name is not None: + principal_list.append(IAMPrincipal("group", acl.group_name)) + if acl.service_principal_name is not None: + principal_list.append(IAMPrincipal("spn", acl.service_principal_name)) + cluster_user_mapping.append(ClusterPrincipalMapping(cluster.cluster_id, principal_list)) + return cluster_user_mapping From 4f4dccd84257dd01d8bb17d59f8d75f6d31ce86f Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sun, 24 Mar 2024 10:27:46 +0000 Subject: [PATCH 02/24] moving code to Grants and generating Grant objects --- src/databricks/labs/ucx/assessment/azure.py | 12 +- .../labs/ucx/hive_metastore/grants.py | 174 +++++++++++++++++- .../ucx/hive_metastore/table_acl_migrate.py | 174 ------------------ 
.../labs/ucx/hive_metastore/table_migrate.py | 18 +- src/databricks/labs/ucx/runtime.py | 21 ++- 5 files changed, 210 insertions(+), 189 deletions(-) delete mode 100644 src/databricks/labs/ucx/hive_metastore/table_acl_migrate.py diff --git a/src/databricks/labs/ucx/assessment/azure.py b/src/databricks/labs/ucx/assessment/azure.py index 04576821a4..1af38db7ab 100644 --- a/src/databricks/labs/ucx/assessment/azure.py +++ b/src/databricks/labs/ucx/assessment/azure.py @@ -6,7 +6,7 @@ from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound -from databricks.sdk.service.compute import ClusterSource, Policy +from databricks.sdk.service.compute import ClusterSource, DataSecurityMode, Policy from databricks.labs.ucx.assessment.crawlers import azure_sp_conf_present_check, logger from databricks.labs.ucx.assessment.jobs import JobsMixin @@ -30,8 +30,8 @@ class AzureServicePrincipalInfo: storage_account: str | None = None -@dataclass() -class AzureServicePrincipalClusterMapping: +@dataclass +class ServicePrincipalClusterMapping: # this class is created separately as we need cluster to spn mapping # Cluster id where the spn is used cluster_id: str @@ -188,10 +188,8 @@ def get_cluster_to_storage_mapping(self): spn_cluster_mapping = [] for cluster in self._ws.clusters.list(): if cluster.cluster_source != ClusterSource.JOB and ( - cluster.data_security_mode.LEGACY_SINGLE_USER or cluster.data_security_mode.NONE + cluster.data_security_mode in [DataSecurityMode.LEGACY_SINGLE_USER, DataSecurityMode.NONE] ): set_service_principals = self._get_azure_spn_from_cluster_config(cluster) - spn_cluster_mapping.append( - AzureServicePrincipalClusterMapping(cluster.cluster_id, set_service_principals) - ) + spn_cluster_mapping.append(ServicePrincipalClusterMapping(cluster.cluster_id, set_service_principals)) return spn_cluster_mapping diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py 
b/src/databricks/labs/ucx/hive_metastore/grants.py index b6b2913d64..1abc00292b 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -4,12 +4,27 @@ from dataclasses import dataclass from functools import partial +from databricks.labs.blueprint.installation import Installation from databricks.labs.blueprint.parallel import ManyError, Threads -from databricks.sdk.service.catalog import SchemaInfo, TableInfo - +from databricks.labs.lsql.backends import SqlBackend, StatementExecutionBackend +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.catalog import ExternalLocationInfo, SchemaInfo, TableInfo + +from databricks.labs.ucx.assessment.azure import ( + AzureServicePrincipalCrawler, + AzureServicePrincipalInfo, +) +from databricks.labs.ucx.azure.access import ( + AzureResourcePermissions, + Privilege, + StoragePermissionMapping, +) +from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources +from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier -from databricks.labs.ucx.hive_metastore.tables import TablesCrawler +from databricks.labs.ucx.hive_metastore.locations import ExternalLocations +from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler logger = logging.getLogger(__name__) @@ -307,3 +322,156 @@ def grants( # TODO: https://github.com/databrickslabs/ucx/issues/406 logger.error(f"Couldn't fetch grants for object {on_type} {key}: {e}") return [] + + +class PrincipalACL: + def __init__( + self, + ws: WorkspaceClient, + backend: SqlBackend, + installation: Installation, + spn_crawler: AzureServicePrincipalCrawler, + resource_permission: AzureResourcePermissions, + table_crawler: TablesCrawler, + ): + self._backend = backend + self._ws = ws + 
self._spn_crawler = spn_crawler + self._installation = installation + self._resource_permission = resource_permission + self._table_crawler = table_crawler + + @classmethod + def for_cli(cls, ws: WorkspaceClient, installation: Installation): + + config = installation.load(WorkspaceConfig) + sql_backend = StatementExecutionBackend(ws, config.warehouse_id) + azure_client = AzureAPIClient( + ws.config.arm_environment.resource_manager_endpoint, + ws.config.arm_environment.service_management_endpoint, + ) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + azurerm = AzureResources(azure_client, graph_client) + locations = ExternalLocations(ws, sql_backend, config.inventory_database) + table_crawler = TablesCrawler(sql_backend, config.inventory_database) + resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) + spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) + + return cls(ws, sql_backend, installation, spn_crawler, resource_permissions, table_crawler) + + def get_interactive_cluster_grants(self) -> list[Grant]: + spn_cluster_mapping = self._spn_crawler.get_cluster_to_storage_mapping() + external_locations = list(self._ws.external_locations.list()) + permission_mappings = self._resource_permission.load() + tables = self._table_crawler.snapshot() + grants = [] + + if len(spn_cluster_mapping) == 0: + return [] + for cluster_spn in spn_cluster_mapping: + principals = self._get_cluster_principal_mapping(cluster_spn.cluster_id) + if len(principals) == 0: + continue + for spn in cluster_spn.spn_info: + eligible_locations = self._get_external_location(spn, external_locations, permission_mappings) + if len(eligible_locations) == 0: + continue + grant = self._get_grants(eligible_locations, principals, tables) + grants.extend(grant) + return grants + + def _get_privilege(self, location: str | None, locations: dict[ExternalLocationInfo, str]): + if location is None: + 
return None + for loc, privilege in locations.items(): + if loc.url is not None and location.startswith(loc.url): + return privilege + return None + + def _get_database_grants(self, tables: list[Table], principals: list[str]): + databases = [] + for table in tables: + if table.database not in databases: + databases.append(table.database) + return [ + Grant(principal, "USE", "hive_metastore", database) for database in databases for principal in principals + ] + + def _get_grants( + self, + locations: dict[ExternalLocationInfo, str], + principals: list[str], + tables: list[Table], + ) -> list[Grant]: + grants = [] + for table in tables: + if self._get_privilege(table.location, locations) is not None: + privilege = self._get_privilege(table.location, locations) + if privilege == Privilege.READ_FILES: + grants.extend( + [ + Grant(principal, "SELECT", table.catalog, table.database, table.name) + for principal in principals + ] + ) + if privilege == Privilege.WRITE_FILES: + grants.extend( + [ + Grant(principal, "ALL_PRIVILEGES", table.catalog, table.database, table.name) + for principal in principals + ] + ) + if table.location is None: + grants.extend( + [ + Grant(principal, "ALL_PRIVILEGES", table.catalog, table.database, view=table.name) + for principal in principals + ] + ) + if table.view_text is not None: + grants.extend( + [ + Grant(principal, "ALL_PRIVILEGES", table.catalog, table.database, view=table.name) + for principal in principals + ] + ) + database_grants = self._get_database_grants(tables, principals) + catalog_grants = [Grant(principal, "USE", "hive_metastore") for principal in principals] + grants.extend(database_grants) + grants.extend(catalog_grants) + return grants + + def _get_external_location( + self, + spn: AzureServicePrincipalInfo, + external_locations: list[ExternalLocationInfo], + permission_mappings: list[StoragePermissionMapping], + ) -> dict[ExternalLocationInfo, str]: + matching_location = {} + for location in external_locations: + if 
location.url is None: + continue + for permission_mapping in permission_mappings: + if ( + location.url.startswith(permission_mapping.prefix) + and permission_mapping.client_id == spn.application_id + and spn.storage_account is not None + and spn.storage_account in permission_mapping.prefix + ): + matching_location[location] = permission_mapping.privilege + return matching_location + + def _get_cluster_principal_mapping(self, cluster_id: str) -> list[str]: + # gets all the users,groups,spn which have access to the clusters and returns a dataclass of that mapping + principal_list = [] + cluster_permission = self._ws.permissions.get("clusters", cluster_id) + if cluster_permission.access_control_list is None: + return [] + for acl in cluster_permission.access_control_list: + if acl.user_name is not None: + principal_list.append(acl.user_name) + if acl.group_name is not None: + principal_list.append(acl.group_name) + if acl.service_principal_name is not None: + principal_list.append(acl.service_principal_name) + return principal_list diff --git a/src/databricks/labs/ucx/hive_metastore/table_acl_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_acl_migrate.py deleted file mode 100644 index 63c1bf2c17..0000000000 --- a/src/databricks/labs/ucx/hive_metastore/table_acl_migrate.py +++ /dev/null @@ -1,174 +0,0 @@ -import logging -from dataclasses import dataclass - -from databricks.labs.blueprint.installation import Installation -from databricks.labs.blueprint.tui import Prompts -from databricks.labs.lsql.backends import SqlBackend, StatementExecutionBackend -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.catalog import ExternalLocationInfo - -from databricks.labs.ucx.assessment.azure import ( - AzureServicePrincipalClusterMapping, - AzureServicePrincipalCrawler, - AzureServicePrincipalInfo, -) -from databricks.labs.ucx.azure.access import ( - AzureResourcePermissions, - StoragePermissionMapping, -) -from databricks.labs.ucx.azure.resources 
import AzureAPIClient, AzureResources -from databricks.labs.ucx.config import WorkspaceConfig -from databricks.labs.ucx.hive_metastore import ExternalLocations -from databricks.labs.ucx.hive_metastore.mapping import TableMapping - -logger = logging.getLogger(__name__) - - -@dataclass -class IAMPrincipal: - # user or group or spn - principal_type: str - # id of the principal - principal_id: str - - -@dataclass -class ClusterPrincipalMapping: - # cluster id - cluster_id: str - # list of all principals that has access to this cluster id - principals: list[IAMPrincipal] - - -class TableACLMigrate: - def __init__( - self, - ws: WorkspaceClient, - backend: SqlBackend, - installation: Installation, - spn_crawler: AzureServicePrincipalCrawler, - resource_permission: AzureResourcePermissions, - table_mapping: TableMapping, - ): - self._backend = backend - self._ws = ws - self._spn_crawler = spn_crawler - self._installation = installation - self._resource_permission = resource_permission - self._table_mapping = table_mapping - - @classmethod - def for_cli(cls, ws: WorkspaceClient, installation: Installation, prompts: Prompts): - msg = ( - "This cmd will migrate acl for all interactive clusters to the related storage account tables, " - "Please confirm " - ) - if not prompts.confirm(msg): - return None - - config = installation.load(WorkspaceConfig) - sql_backend = StatementExecutionBackend(ws, config.warehouse_id) - azure_client = AzureAPIClient( - ws.config.arm_environment.resource_manager_endpoint, - ws.config.arm_environment.service_management_endpoint, - ) - graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") - azurerm = AzureResources(azure_client, graph_client) - locations = ExternalLocations(ws, sql_backend, config.inventory_database) - - resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) - spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) - table_mapping = 
TableMapping(installation, ws, sql_backend) - - return cls(ws, sql_backend, installation, spn_crawler, resource_permissions, table_mapping) - - def migrate_cluster_acl(self): - spn_cluster_mapping = self._spn_crawler.get_cluster_to_storage_mapping() - external_locations = self._get_principal_prefix(spn_cluster_mapping) - - if len(spn_cluster_mapping) == 0: - logger.info("There are no interactive clusters configured with service principals") - return - for cluster_spn in spn_cluster_mapping: - # user_mapping = self._get_cluster_principal_mapping(cluster_spn.cluster_id) - logger.info(f"Applying UC ACL for cluster id {cluster_spn.cluster_id}") - for spn in cluster_spn.spn_info: - self._apply_uc_permission(external_locations) - - def _get_principal_prefix( - self, spn_cluster_mapping: list[AzureServicePrincipalClusterMapping] - ) -> list[StoragePermissionMapping]: - set_service_principals = set[AzureServicePrincipalInfo]() - for spn_mapping in spn_cluster_mapping: - set_service_principals.update(spn_mapping.spn_info) - external_locations = [] - permission_mappings = self._resource_permission.load() - for spn in set_service_principals: - for perm_mapping in permission_mappings: - if perm_mapping.client_id == spn.application_id and spn.storage_account in perm_mapping.prefix: - external_locations.append(perm_mapping) - return external_locations - - def _get_tables(self, locations: list[ExternalLocationInfo]): - matching_tables = [] - tables = self._table_mapping.load() - for table in tables: - table_name = f"{table.catalog_name}.{table.dst_schema}.{table.dst_table}" - table_info = self._ws.tables.get(table_name) - for location in locations: - assert location.url is not None - if table_info.storage_location is not None and table_info.storage_location.startswith(location.url): - matching_tables.append(location) - return matching_tables - - def _get_external_location(self, locations: list[StoragePermissionMapping]) -> list[ExternalLocationInfo]: - matching_location = [] - 
for location in self._ws.external_locations.list(): - for principal_prefix in locations: - assert location.url is not None - if location.url.startswith(principal_prefix.prefix): - matching_location.append(location) - return matching_location - - def _apply_uc_permission( - self, - locations: list[StoragePermissionMapping], - ): - matching_external_locations = self._get_external_location(locations) - matching_tables = self._get_tables(matching_external_locations) - print(matching_tables) - - def _get_cluster_principal_mapping(self, cluster_id: str) -> list[IAMPrincipal]: - # gets all the users,groups,spn which have access to the clusters and returns a dataclass of that mapping - principal_list = [] - cluster_permission = self._ws.permissions.get("clusters", cluster_id) - if cluster_permission.access_control_list is None: - return [] - for acl in cluster_permission.access_control_list: - if acl.user_name is not None: - principal_list.append(IAMPrincipal("user", acl.user_name)) - if acl.group_name is not None: - principal_list.append(IAMPrincipal("group", acl.group_name)) - if acl.service_principal_name is not None: - principal_list.append(IAMPrincipal("spn", acl.service_principal_name)) - return principal_list - - def _get_spn_permission(self, spn_cluster_mapping: list[AzureServicePrincipalClusterMapping]): - set_service_principals = set[AzureServicePrincipalInfo]() - for spn in spn_cluster_mapping: - set_service_principals.update(spn.spn_info) - cluster_user_mapping = [] - for cluster in spn_cluster_mapping: - principal_list = [] - cluster_permission = self._ws.permissions.get("clusters", cluster.cluster_id) - if cluster_permission.access_control_list is None: - return [] - for acl in cluster_permission.access_control_list: - if acl.user_name is not None: - principal_list.append(IAMPrincipal("user", acl.user_name)) - if acl.group_name is not None: - principal_list.append(IAMPrincipal("group", acl.group_name)) - if acl.service_principal_name is not None: - 
principal_list.append(IAMPrincipal("spn", acl.service_principal_name)) - cluster_user_mapping.append(ClusterPrincipalMapping(cluster.cluster_id, principal_list)) - return cluster_user_mapping diff --git a/src/databricks/labs/ucx/hive_metastore/table_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_migrate.py index 57b19c9424..249c83325f 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migrate.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migrate.py @@ -15,7 +15,7 @@ from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import ( AclMigrationWhat, @@ -52,6 +52,7 @@ def __init__( table_mapping: TableMapping, group_manager: GroupManager, migration_status_refresher, + principal_grants: PrincipalACL, ): self._tc = table_crawler self._gc = grant_crawler @@ -61,6 +62,7 @@ def __init__( self._group = group_manager self._migration_status_refresher = migration_status_refresher self._seen_tables: dict[str, str] = {} + self._principal_grants = principal_grants @classmethod def for_cli(cls, ws: WorkspaceClient, product='ucx'): @@ -72,9 +74,17 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx'): grants_crawler = GrantsCrawler(table_crawler, udfs_crawler) table_mapping = TableMapping(installation, ws, sql_backend) group_manager = GroupManager(sql_backend, ws, config.inventory_database) + principal_grants = PrincipalACL.for_cli(ws, installation) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, config.inventory_database, table_crawler) return cls( - table_crawler, grants_crawler, ws, sql_backend, table_mapping, group_manager, 
migration_status_refresher + table_crawler, + grants_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) def index(self): @@ -86,6 +96,7 @@ def migrate_tables(self, *, what: What | None = None, acl_strategy: AclMigration if acl_strategy is not None: grants_to_migrate = self._gc.snapshot() migrated_groups = self._group.snapshot() + principal_grants = self._principal_grants.get_interactive_cluster_grants() tasks = [] for table in tables_to_migrate: if what is not None and table.src.what != what: @@ -98,7 +109,8 @@ def migrate_tables(self, *, what: What | None = None, acl_strategy: AclMigration tasks.append(partial(self._migrate_table, table.src, table.rule, grants)) case AclMigrationWhat.PRINCIPAL: # TODO: Implement principal-based ACL migration - pass + grants = self._match_grants(table.src, principal_grants, migrated_groups) + tasks.append(partial(self._migrate_table, table.src, table.rule, grants)) Threads.strict("migrate tables", tasks) def _migrate_table(self, src_table: Table, rule: Rule, grants: list[Grant] | None = None): diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index dcfea4499f..405956600f 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -19,6 +19,7 @@ Mounts, TablesCrawler, ) +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import TableMapping from databricks.labs.ucx.hive_metastore.table_migrate import ( MigrationStatusRefresher, @@ -435,9 +436,17 @@ def migrate_external_tables_sync( grant_crawler = GrantsCrawler(table_crawler, udf_crawler) table_mapping = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) + principal_grants = PrincipalACL.for_cli(ws, install) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) TablesMigrate( - 
table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ).migrate_tables(what=What.EXTERNAL_SYNC) @@ -456,8 +465,16 @@ def migrate_dbfs_root_delta_tables( table_mapping = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) + principal_grants = PrincipalACL.for_cli(ws, install) TablesMigrate( - table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ).migrate_tables(what=What.DBFS_ROOT_DELTA) From 3f7a46123d7e1c082fb374dd9cebe73b984e049e Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Mon, 25 Mar 2024 12:14:19 +0000 Subject: [PATCH 03/24] moving code to Grants and generating Grant objects --- pyproject.toml | 4 +- .../labs/ucx/hive_metastore/grants.py | 20 ++++++--- .../labs/ucx/hive_metastore/table_migrate.py | 17 ++++---- src/databricks/labs/ucx/runtime.py | 9 ++-- .../hive_metastore/test_principal_grants.py | 42 +++++++++++++++++++ .../unit/hive_metastore/test_table_migrate.py | 8 +++- 6 files changed, 78 insertions(+), 22 deletions(-) create mode 100644 tests/unit/hive_metastore/test_principal_grants.py diff --git a/pyproject.toml b/pyproject.toml index 69760117fa..e76bc23ee2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,8 +82,8 @@ python="3.10" path = ".venv" [tool.hatch.envs.default.scripts] -test = "pytest -n 4 --cov src --cov-report=xml --timeout 30 tests/unit --durations 20" -coverage = "pytest -n auto --cov src tests/unit --timeout 30 --cov-report=html --durations 20" +test = "pytest -n 4 --cov src 
--cov-report=xml --timeout 30 tests/unit/hive_metastore/test_principal_grants.py --durations 20" +coverage = "pytest -n auto --cov src tests/unit/hive_metastore/test_principal_grants.py --timeout 30 --cov-report=html --durations 20" integration = "pytest -n 10 --cov src tests/integration --durations 20" fmt = ["isort .", "black .", diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 1abc00292b..6ee4420f3c 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -343,7 +343,6 @@ def __init__( @classmethod def for_cli(cls, ws: WorkspaceClient, installation: Installation): - config = installation.load(WorkspaceConfig) sql_backend = StatementExecutionBackend(ws, config.warehouse_id) azure_client = AzureAPIClient( @@ -356,18 +355,25 @@ def for_cli(cls, ws: WorkspaceClient, installation: Installation): table_crawler = TablesCrawler(sql_backend, config.inventory_database) resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) - return cls(ws, sql_backend, installation, spn_crawler, resource_permissions, table_crawler) def get_interactive_cluster_grants(self) -> list[Grant]: spn_cluster_mapping = self._spn_crawler.get_cluster_to_storage_mapping() + if len(spn_cluster_mapping) == 0: + logger.info("No interactive cluster found with spn configured") + return [] external_locations = list(self._ws.external_locations.list()) + if len(external_locations) == 0: + logger.warning("No external location found, If hive metastore tables are created in external storage, " + "ensure migrate_locations cli cmd is run to create the required locations.") + return [] permission_mappings = self._resource_permission.load() + if len(permission_mappings) == 0: + logger.warning("Please ensure principal_prefix_access cli cmd is run to create the access 
permission file.") tables = self._table_crawler.snapshot() - grants = [] + grants: list[Grant] = [] + - if len(spn_cluster_mapping) == 0: - return [] for cluster_spn in spn_cluster_mapping: principals = self._get_cluster_principal_mapping(cluster_spn.cluster_id) if len(principals) == 0: @@ -377,6 +383,8 @@ def get_interactive_cluster_grants(self) -> list[Grant]: if len(eligible_locations) == 0: continue grant = self._get_grants(eligible_locations, principals, tables) + if len(grants) == 0: + continue grants.extend(grant) return grants @@ -388,7 +396,7 @@ def _get_privilege(self, location: str | None, locations: dict[ExternalLocationI return privilege return None - def _get_database_grants(self, tables: list[Table], principals: list[str]): + def _get_database_grants(self, tables: list[Table], principals: list[str]) -> list[Grant]: databases = [] for table in tables: if table.database not in databases: diff --git a/src/databricks/labs/ucx/hive_metastore/table_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_migrate.py index 249c83325f..febacdece2 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migrate.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migrate.py @@ -15,7 +15,7 @@ from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL +from databricks.labs.ucx.hive_metastore.grants import Grant from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import ( AclMigrationWhat, @@ -52,7 +52,7 @@ def __init__( table_mapping: TableMapping, group_manager: GroupManager, migration_status_refresher, - principal_grants: PrincipalACL, + # principal_grants: PrincipalACL, ): self._tc = table_crawler self._gc = grant_crawler @@ -62,7 +62,7 @@ def __init__( self._group = 
group_manager self._migration_status_refresher = migration_status_refresher self._seen_tables: dict[str, str] = {} - self._principal_grants = principal_grants + # self._principal_grants = principal_grants @classmethod def for_cli(cls, ws: WorkspaceClient, product='ucx'): @@ -74,7 +74,7 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx'): grants_crawler = GrantsCrawler(table_crawler, udfs_crawler) table_mapping = TableMapping(installation, ws, sql_backend) group_manager = GroupManager(sql_backend, ws, config.inventory_database) - principal_grants = PrincipalACL.for_cli(ws, installation) + # principal_grants = PrincipalACL.for_cli(ws, installation) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, config.inventory_database, table_crawler) return cls( table_crawler, @@ -84,7 +84,7 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx'): table_mapping, group_manager, migration_status_refresher, - principal_grants, + # principal_grants, ) def index(self): @@ -96,7 +96,7 @@ def migrate_tables(self, *, what: What | None = None, acl_strategy: AclMigration if acl_strategy is not None: grants_to_migrate = self._gc.snapshot() migrated_groups = self._group.snapshot() - principal_grants = self._principal_grants.get_interactive_cluster_grants() + # principal_grants = self._principal_grants.get_interactive_cluster_grants() tasks = [] for table in tables_to_migrate: if what is not None and table.src.what != what: @@ -108,9 +108,10 @@ def migrate_tables(self, *, what: What | None = None, acl_strategy: AclMigration grants = self._match_grants(table.src, grants_to_migrate, migrated_groups) tasks.append(partial(self._migrate_table, table.src, table.rule, grants)) case AclMigrationWhat.PRINCIPAL: + pass # TODO: Implement principal-based ACL migration - grants = self._match_grants(table.src, principal_grants, migrated_groups) - tasks.append(partial(self._migrate_table, table.src, table.rule, grants)) + # grants = self._match_grants(table.src, principal_grants, 
migrated_groups) + # tasks.append(partial(self._migrate_table, table.src, table.rule, grants)) Threads.strict("migrate tables", tasks) def _migrate_table(self, src_table: Table, rule: Rule, grants: list[Grant] | None = None): diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 405956600f..28b7916198 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -19,7 +19,6 @@ Mounts, TablesCrawler, ) -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import TableMapping from databricks.labs.ucx.hive_metastore.table_migrate import ( MigrationStatusRefresher, @@ -436,7 +435,7 @@ def migrate_external_tables_sync( grant_crawler = GrantsCrawler(table_crawler, udf_crawler) table_mapping = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, install) + # principal_grants = PrincipalACL.for_cli(ws, install) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) TablesMigrate( table_crawler, @@ -446,7 +445,7 @@ def migrate_external_tables_sync( table_mapping, group_manager, migration_status_refresher, - principal_grants, + # principal_grants, ).migrate_tables(what=What.EXTERNAL_SYNC) @@ -465,7 +464,7 @@ def migrate_dbfs_root_delta_tables( table_mapping = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) - principal_grants = PrincipalACL.for_cli(ws, install) + # principal_grants = PrincipalACL.for_cli(ws, install) TablesMigrate( table_crawler, grant_crawler, @@ -474,7 +473,7 @@ def migrate_dbfs_root_delta_tables( table_mapping, group_manager, migration_status_refresher, - principal_grants, + # principal_grants, 
).migrate_tables(what=What.DBFS_ROOT_DELTA) diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py new file mode 100644 index 0000000000..00ae07f333 --- /dev/null +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -0,0 +1,42 @@ +from unittest.mock import create_autospec + +import pytest +from databricks.labs.blueprint.installation import MockInstallation +from databricks.sdk import WorkspaceClient + +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL + + +@pytest.fixture +def ws(): + return create_autospec(WorkspaceClient) + + +@pytest.fixture +def installation(): + return MockInstallation( + { + "config.yml": { + 'warehouse_id': 'abc', + 'connect': {'host': 'a', 'token': 'b'}, + 'inventory_database':'ucx' + }, + "azure_storage_account_info.csv": [ + { + 'prefix': 'prefix1', + 'client_id': 'app_secret1', + 'principal': 'principal_1', + 'privilege': 'WRITE_FILES', + 'type': 'Application', + 'directory_id': 'directory_id_ss1', + }, + ], + } + ) + + +def test_for_cli(ws, installation): + assert isinstance(PrincipalACL.for_cli(ws, installation), PrincipalACL) + +def test_interactive_cluster(ws, installation): + assert isinstance(PrincipalACL.for_cli(ws, installation), PrincipalACL) diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index cc684be0ab..6fa812d144 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -310,7 +310,13 @@ def get_table_migrate(backend: SqlBackend) -> TablesMigrate: table_mapping = table_mapping_mock() migration_status_refresher = MigrationStatusRefresher(client, backend, "inventory_database", table_crawler) table_migrate = TablesMigrate( - table_crawler, grant_crawler, client, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + client, + backend, + table_mapping, + group_manager, + 
migration_status_refresher, ) return table_migrate From d60d266ad093abf9e2574b0075409070f0607fbf Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Wed, 27 Mar 2024 13:39:28 +0000 Subject: [PATCH 04/24] azure spn test case --- pyproject.toml | 10 +-- .../labs/ucx/hive_metastore/grants.py | 7 ++- .../azure-spn-secret-interactive.json | 20 ++++++ tests/unit/assessment/test_azure.py | 61 ++++++++++++++++++- .../hive_metastore/test_principal_grants.py | 7 +-- 5 files changed, 90 insertions(+), 15 deletions(-) create mode 100644 tests/unit/assessment/clusters/azure-spn-secret-interactive.json diff --git a/pyproject.toml b/pyproject.toml index d36fc966e1..69760117fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ classifiers = [ "Topic :: Software Development :: Libraries", "Topic :: Utilities", ] -dependencies = ["databricks-sdk==0.23.0", +dependencies = ["databricks-sdk==0.22.0", "databricks-labs-lsql~=0.2.2", "databricks-labs-blueprint~=0.4.3", "PyYAML>=6.0.0,<7.0.0"] @@ -69,7 +69,7 @@ dependencies = [ "pytest-mock>=3.0.0,<4.0.0", "pytest-timeout", "black>=23.1.0", - "ruff>=0.3.0", + "ruff>=0.0.243", "isort>=2.5.0", "mypy", "types-PyYAML", @@ -82,12 +82,12 @@ python="3.10" path = ".venv" [tool.hatch.envs.default.scripts] -test = "pytest -n 4 --cov src --cov-report=xml --timeout 30 tests/unit/hive_metastore/test_principal_grants.py --durations 20" -coverage = "pytest -n auto --cov src tests/unit/hive_metastore/test_principal_grants.py --timeout 30 --cov-report=html --durations 20" +test = "pytest -n 4 --cov src --cov-report=xml --timeout 30 tests/unit --durations 20" +coverage = "pytest -n auto --cov src tests/unit --timeout 30 --cov-report=html --durations 20" integration = "pytest -n 10 --cov src tests/integration --durations 20" fmt = ["isort .", "black .", - "ruff check . --fix", + "ruff . 
--fix", "mypy .", "pylint --output-format=colorized -j 0 src tests"] verify = ["black --check .", diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 6ee4420f3c..8b72b11664 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -364,8 +364,10 @@ def get_interactive_cluster_grants(self) -> list[Grant]: return [] external_locations = list(self._ws.external_locations.list()) if len(external_locations) == 0: - logger.warning("No external location found, If hive metastore tables are created in external storage, " - "ensure migrate_locations cli cmd is run to create the required locations.") + logger.warning( + "No external location found, If hive metastore tables are created in external storage, " + "ensure migrate_locations cli cmd is run to create the required locations." + ) return [] permission_mappings = self._resource_permission.load() if len(permission_mappings) == 0: @@ -373,7 +375,6 @@ def get_interactive_cluster_grants(self) -> list[Grant]: tables = self._table_crawler.snapshot() grants: list[Grant] = [] - for cluster_spn in spn_cluster_mapping: principals = self._get_cluster_principal_mapping(cluster_spn.cluster_id) if len(principals) == 0: diff --git a/tests/unit/assessment/clusters/azure-spn-secret-interactive.json b/tests/unit/assessment/clusters/azure-spn-secret-interactive.json new file mode 100644 index 0000000000..fc0bdc4995 --- /dev/null +++ b/tests/unit/assessment/clusters/azure-spn-secret-interactive.json @@ -0,0 +1,20 @@ +{ + "autoscale": { + "min_workers": 1, + "max_workers": 6 + }, + "cluster_id": "azure-spn-secret-interactive", + "cluster_name": "Azure SPN Secret", + "data_security_mode": "NONE", + "cluster_source": "UI", + "spark_conf": { + "spark.hadoop.fs.azure.account.oauth2.client.id.abcde.dfs.core.windows.net": "{{secrets/abcff/sp_app_client_id}}", + 
"spark.hadoop.fs.azure.account.oauth2.client.endpoint.abcde.dfs.core.windows.net": "https://login.microsoftonline.com/dedededede/oauth2/token", + "spark.hadoop.fs.azure.account.oauth2.client.secret.abcde.dfs.core.windows.net": "{{secrets/abcff/sp_secret}}", + "spark.hadoop.fs.azure.account.oauth2.client.id.fgh.dfs.core.windows.net": "{{secrets/fgh/sp_app_client_id2}}", + "spark.hadoop.fs.azure.account.oauth2.client.endpoint.fgh.dfs.core.windows.net": "https://login.microsoftonline.com/dedededede/oauth2/token", + "spark.hadoop.fs.azure.account.oauth2.client.secret.fgh.dfs.core.windows.net": "{{secrets/fgh/sp_secret2}}" + }, + "spark_context_id": 5134472582179565315, + "spark_version": "13.3.x-cpu-ml-scala2.12" +} \ No newline at end of file diff --git a/tests/unit/assessment/test_azure.py b/tests/unit/assessment/test_azure.py index ec6b1bb562..002380c36f 100644 --- a/tests/unit/assessment/test_azure.py +++ b/tests/unit/assessment/test_azure.py @@ -1,6 +1,17 @@ -from databricks.labs.lsql.backends import MockBackend +from unittest.mock import create_autospec -from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler +from databricks.labs.lsql.backends import MockBackend +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.compute import ( + ClusterDetails, + ClusterSource, + DataSecurityMode, +) + +from databricks.labs.ucx.assessment.azure import ( + AzureServicePrincipalCrawler, + AzureServicePrincipalInfo, +) from .. 
import workspace_client_mock @@ -199,3 +210,49 @@ def test_jobs_assessment_with_spn_cluster_policy_not_found(): ws = workspace_client_mock(job_ids=['policy-not-found']) crawler = AzureServicePrincipalCrawler(ws, MockBackend(), "ucx").snapshot() assert len(crawler) == 1 + + +def test_get_cluster_to_storage_mapping_no_cluster_return_empty(): + ws = create_autospec(WorkspaceClient) + ws.clusters.list.return_value = [] + crawler = AzureServicePrincipalCrawler(ws, MockBackend(), "ucx") + assert not crawler.get_cluster_to_storage_mapping() + + +def test_get_cluster_to_storage_mapping_no_interactive_cluster_return_empty(): + ws = workspace_client_mock(cluster_ids=['azure-spn-secret']) + ws.clusters.list.return_value = [ + ClusterDetails(cluster_source=ClusterSource.JOB), + ClusterDetails(cluster_source=ClusterSource.UI, data_security_mode=DataSecurityMode.SINGLE_USER), + ] + crawler = AzureServicePrincipalCrawler(ws, MockBackend(), "ucx") + assert not crawler.get_cluster_to_storage_mapping() + + +def test_get_cluster_to_storage_mapping_interactive_cluster_no_spn_return_empty(): + ws = workspace_client_mock(cluster_ids=['azure-spn-secret-interactive']) + + crawler = AzureServicePrincipalCrawler(ws, MockBackend(), "ucx") + cluster_spn_info = crawler.get_cluster_to_storage_mapping() + spn_info = set( + [ + AzureServicePrincipalInfo( + application_id='Hello, World!', + secret_scope='abcff', + secret_key='sp_secret', + tenant_id='dedededede', + storage_account='abcde', + ), + AzureServicePrincipalInfo( + application_id='Hello, World!', + secret_scope='fgh', + secret_key='sp_secret2', + tenant_id='dedededede', + storage_account='fgh', + ), + ] + ) + + assert cluster_spn_info[0].cluster_id == "azure-spn-secret-interactive" + assert len(cluster_spn_info[0].spn_info) == 2 + assert cluster_spn_info[0].spn_info == spn_info diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py index 00ae07f333..131ffd751c 100644 --- 
a/tests/unit/hive_metastore/test_principal_grants.py +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -16,11 +16,7 @@ def ws(): def installation(): return MockInstallation( { - "config.yml": { - 'warehouse_id': 'abc', - 'connect': {'host': 'a', 'token': 'b'}, - 'inventory_database':'ucx' - }, + "config.yml": {'warehouse_id': 'abc', 'connect': {'host': 'a', 'token': 'b'}, 'inventory_database': 'ucx'}, "azure_storage_account_info.csv": [ { 'prefix': 'prefix1', @@ -38,5 +34,6 @@ def installation(): def test_for_cli(ws, installation): assert isinstance(PrincipalACL.for_cli(ws, installation), PrincipalACL) + def test_interactive_cluster(ws, installation): assert isinstance(PrincipalACL.for_cli(ws, installation), PrincipalACL) From 7c7357839403fb4f7ed3eadf71fc8bb17d981ff1 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Thu, 28 Mar 2024 14:27:23 +0000 Subject: [PATCH 05/24] added ACLPrincipal test cases --- pyproject.toml | 2 +- .../labs/ucx/hive_metastore/grants.py | 116 +++++---- .../hive_metastore/test_principal_grants.py | 225 +++++++++++++++++- 3 files changed, 290 insertions(+), 53 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8251b55de3..67ea6b386a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ classifiers = [ "Topic :: Software Development :: Libraries", "Topic :: Utilities", ] -dependencies = ["databricks-sdk==0.22.0", +dependencies = ["databricks-sdk==0.23.0", "databricks-labs-lsql~=0.2.2", "databricks-labs-blueprint~=0.4.3", "PyYAML>=6.0.0,<7.0.0"] diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 8b72b11664..db805ceba9 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -8,6 +8,7 @@ from databricks.labs.blueprint.parallel import ManyError, Threads from databricks.labs.lsql.backends import SqlBackend, StatementExecutionBackend from databricks.sdk import WorkspaceClient 
+from databricks.sdk.errors import ResourceDoesNotExist from databricks.sdk.service.catalog import ExternalLocationInfo, SchemaInfo, TableInfo from databricks.labs.ucx.assessment.azure import ( @@ -16,7 +17,6 @@ ) from databricks.labs.ucx.azure.access import ( AzureResourcePermissions, - Privilege, StoragePermissionMapping, ) from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources @@ -330,9 +330,9 @@ def __init__( ws: WorkspaceClient, backend: SqlBackend, installation: Installation, - spn_crawler: AzureServicePrincipalCrawler, - resource_permission: AzureResourcePermissions, table_crawler: TablesCrawler, + spn_crawler: AzureServicePrincipalCrawler | None = None, + resource_permission: AzureResourcePermissions | None = None, ): self._backend = backend self._ws = ws @@ -345,33 +345,54 @@ def __init__( def for_cli(cls, ws: WorkspaceClient, installation: Installation): config = installation.load(WorkspaceConfig) sql_backend = StatementExecutionBackend(ws, config.warehouse_id) - azure_client = AzureAPIClient( - ws.config.arm_environment.resource_manager_endpoint, - ws.config.arm_environment.service_management_endpoint, - ) - graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") - azurerm = AzureResources(azure_client, graph_client) locations = ExternalLocations(ws, sql_backend, config.inventory_database) table_crawler = TablesCrawler(sql_backend, config.inventory_database) - resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) - spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) - return cls(ws, sql_backend, installation, spn_crawler, resource_permissions, table_crawler) + if ws.config.is_azure: + azure_client = AzureAPIClient( + ws.config.arm_environment.resource_manager_endpoint, + ws.config.arm_environment.service_management_endpoint, + ) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + 
azurerm = AzureResources(azure_client, graph_client) + resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) + spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) + return cls(ws, sql_backend, installation, table_crawler, spn_crawler, resource_permissions) + if ws.config.is_aws: + return None + if ws.config.is_gcp: + logger.error("UCX is not supported for GCP yet. Please run it on azure or aws") + return None + return None def get_interactive_cluster_grants(self) -> list[Grant]: + if self._ws.config.is_azure: + return self._get_azure_grants() + return [] + + def _get_azure_grants(self) -> list[Grant]: + assert self._spn_crawler is not None + assert self._resource_permission is not None spn_cluster_mapping = self._spn_crawler.get_cluster_to_storage_mapping() if len(spn_cluster_mapping) == 0: + # if there are no interactive clusters , then return empty grants logger.info("No interactive cluster found with spn configured") return [] external_locations = list(self._ws.external_locations.list()) if len(external_locations) == 0: - logger.warning( + # if there are no external locations, then throw an error to run migrate_locations cli command + msg = ( "No external location found, If hive metastore tables are created in external storage, " "ensure migrate_locations cli cmd is run to create the required locations." ) - return [] + logger.error(msg) + raise ResourceDoesNotExist(msg) from None + permission_mappings = self._resource_permission.load() if len(permission_mappings) == 0: - logger.warning("Please ensure principal_prefix_access cli cmd is run to create the access permission file.") + # if permission mapping is empty, raise an error to run principal_prefix cmd + msg = "No storage permission file found. Please ensure principal_prefix_access cli cmd is run to create the access permission file." 
+ logger.error(msg) + raise ResourceDoesNotExist(msg) from None tables = self._table_crawler.snapshot() grants: list[Grant] = [] @@ -384,16 +405,27 @@ def get_interactive_cluster_grants(self) -> list[Grant]: if len(eligible_locations) == 0: continue grant = self._get_grants(eligible_locations, principals, tables) - if len(grants) == 0: - continue grants.extend(grant) - return grants + catalog_grants = [Grant(principal, "USE", "hive_metastore") for principal in principals] + grants.extend(catalog_grants) - def _get_privilege(self, location: str | None, locations: dict[ExternalLocationInfo, str]): - if location is None: + return list(set(grants)) + + def _get_aws_grants(self) -> list[Grant]: + # TODO + return [] + + def _get_privilege(self, table: Table, locations: dict[str, str]): + if table.view_text is not None: + # return nothing for view so that it goes to the seperate view logic return None + if table.location is None: + return "WRITE_FILES" + if table.location.startswith('dbfs://') or table.location.startswith('/dbfs/'): + return "WRITE_FILES" + for loc, privilege in locations.items(): - if loc.url is not None and location.startswith(loc.url): + if loc is not None and table.location.startswith(loc): return privilege return None @@ -408,35 +440,27 @@ def _get_database_grants(self, tables: list[Table], principals: list[str]) -> li def _get_grants( self, - locations: dict[ExternalLocationInfo, str], + locations: dict[str, str], principals: list[str], tables: list[Table], ) -> list[Grant]: grants = [] + filtered_tables = [] for table in tables: - if self._get_privilege(table.location, locations) is not None: - privilege = self._get_privilege(table.location, locations) - if privilege == Privilege.READ_FILES: - grants.extend( - [ - Grant(principal, "SELECT", table.catalog, table.database, table.name) - for principal in principals - ] - ) - if privilege == Privilege.WRITE_FILES: - grants.extend( - [ - Grant(principal, "ALL_PRIVILEGES", table.catalog, table.database, 
table.name) - for principal in principals - ] - ) - if table.location is None: + privilege = self._get_privilege(table, locations) + if privilege == "READ_FILES": + grants.extend( + [Grant(principal, "SELECT", table.catalog, table.database, table.name) for principal in principals] + ) + filtered_tables.append(table) + if privilege == "WRITE_FILES": grants.extend( [ - Grant(principal, "ALL_PRIVILEGES", table.catalog, table.database, view=table.name) + Grant(principal, "ALL_PRIVILEGES", table.catalog, table.database, table.name) for principal in principals ] ) + filtered_tables.append(table) if table.view_text is not None: grants.extend( [ @@ -444,10 +468,12 @@ def _get_grants( for principal in principals ] ) - database_grants = self._get_database_grants(tables, principals) - catalog_grants = [Grant(principal, "USE", "hive_metastore") for principal in principals] + filtered_tables.append(table) + + database_grants = self._get_database_grants(filtered_tables, principals) + grants.extend(database_grants) - grants.extend(catalog_grants) + return grants def _get_external_location( @@ -455,7 +481,7 @@ def _get_external_location( spn: AzureServicePrincipalInfo, external_locations: list[ExternalLocationInfo], permission_mappings: list[StoragePermissionMapping], - ) -> dict[ExternalLocationInfo, str]: + ) -> dict[str, str]: matching_location = {} for location in external_locations: if location.url is None: @@ -467,7 +493,7 @@ def _get_external_location( and spn.storage_account is not None and spn.storage_account in permission_mapping.prefix ): - matching_location[location] = permission_mapping.privilege + matching_location[location.url] = permission_mapping.privilege return matching_location def _get_cluster_principal_mapping(self, cluster_id: str) -> list[str]: diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py index 131ffd751c..1fd3d9c737 100644 --- a/tests/unit/hive_metastore/test_principal_grants.py +++ 
b/tests/unit/hive_metastore/test_principal_grants.py @@ -2,14 +2,109 @@ import pytest from databricks.labs.blueprint.installation import MockInstallation +from databricks.labs.lsql.backends import StatementExecutionBackend from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import ResourceDoesNotExist +from databricks.sdk.service import iam +from databricks.sdk.service.catalog import ExternalLocationInfo -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL +from databricks.labs.ucx.assessment.azure import ( + AzureServicePrincipalCrawler, + AzureServicePrincipalInfo, + ServicePrincipalClusterMapping, +) +from databricks.labs.ucx.azure.access import AzureResourcePermissions +from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources +from databricks.labs.ucx.config import WorkspaceConfig +from databricks.labs.ucx.hive_metastore import ExternalLocations, TablesCrawler +from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL +from databricks.labs.ucx.hive_metastore.tables import Table @pytest.fixture def ws(): - return create_autospec(WorkspaceClient) + w = create_autospec(WorkspaceClient) + w.config.is_azure = True + w.external_locations.list.return_value = [ + ExternalLocationInfo(url="abfss://container1@storage1.dfs.core.windows.net/folder1"), + ExternalLocationInfo(url="abfss://container1@storage2.dfs.core.windows.net/folder2"), + ExternalLocationInfo(url="abfss://container1@storage3.dfs.core.windows.net/folder3"), + ] + + permissions = { + 'cluster1': iam.ObjectPermissions( + object_id='cluster1', + object_type="clusters", + access_control_list=[ + iam.AccessControlResponse(group_name='group1', all_permissions=[iam.Permission(inherited=False)]), + iam.AccessControlResponse( + user_name='foo.bar@imagine.com', + all_permissions=[iam.Permission(permission_level=iam.PermissionLevel.CAN_USE)], + ), + ], + ), + 'cluster2': iam.ObjectPermissions( + object_id='cluster2', + object_type="clusters", + 
access_control_list=[ + iam.AccessControlResponse( + service_principal_name='spn1', + all_permissions=[iam.Permission(permission_level=iam.PermissionLevel.CAN_USE)], + ), + ], + ), + 'cluster3': iam.ObjectPermissions(object_id='cluster2', object_type="clusters"), + } + w.permissions.get.side_effect = lambda _, object_id: permissions[object_id] + return w + + +def principal_acl(w, install, cluster_spn: list): + + config = install.load(WorkspaceConfig) + sql_backend = StatementExecutionBackend(w, config.warehouse_id) + locations = create_autospec(ExternalLocations) + table_crawler = create_autospec(TablesCrawler) + tables = [ + Table( + 'hive_metastore', + 'schema1', + 'table1', + 'TABLE', + 'delta', + location='abfss://container1@storage1.dfs.core.windows.net/folder1/table1', + ), + Table('hive_metastore', 'schema1', 'view1', 'VIEW', 'delta', view_text="select * from table1"), + Table( + 'hive_metastore', + 'schema1', + 'table2', + 'TABLE', + 'delta', + location='abfss://container1@storage2.dfs.core.windows.net/folder2/table2', + ), + Table('hive_metastore', 'schema1', 'table3', 'TABLE', 'delta'), + Table('hive_metastore', 'schema1', 'table5', 'TABLE', 'delta', location='dbfs://hms/folder1/table1'), + Table( + 'hive_metastore', + 'schema2', + 'table4', + 'TABLE', + 'delta', + location='abfss://container1@storage3.dfs.core.windows.net/folder3/table3', + ), + ] + table_crawler.snapshot.return_value = tables + azure_client = AzureAPIClient( + w.config.arm_environment.resource_manager_endpoint, + w.config.arm_environment.service_management_endpoint, + ) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + azurerm = AzureResources(azure_client, graph_client) + resource_permissions = AzureResourcePermissions(install, w, azurerm, locations) + spn_crawler = create_autospec(AzureServicePrincipalCrawler) + spn_crawler.get_cluster_to_storage_mapping.return_value = cluster_spn + return PrincipalACL(w, sql_backend, install, 
table_crawler, spn_crawler, resource_permissions) @pytest.fixture @@ -19,8 +114,24 @@ def installation(): "config.yml": {'warehouse_id': 'abc', 'connect': {'host': 'a', 'token': 'b'}, 'inventory_database': 'ucx'}, "azure_storage_account_info.csv": [ { - 'prefix': 'prefix1', - 'client_id': 'app_secret1', + 'prefix': 'abfss://container1@storage1.dfs.core.windows.net', + 'client_id': 'client1', + 'principal': 'principal_1', + 'privilege': 'WRITE_FILES', + 'type': 'Application', + 'directory_id': 'directory_id_ss1', + }, + { + 'prefix': 'abfss://container1@storage2.dfs.core.windows.net', + 'client_id': 'client2', + 'principal': 'principal_1', + 'privilege': 'READ_FILES', + 'type': 'Application', + 'directory_id': 'directory_id_ss1', + }, + { + 'prefix': 'abfss://container1@storage3.dfs.core.windows.net', + 'client_id': 'client2', 'principal': 'principal_1', 'privilege': 'WRITE_FILES', 'type': 'Application', @@ -31,9 +142,109 @@ def installation(): ) -def test_for_cli(ws, installation): +def test_for_cli_azure(ws, installation): + ws.config.is_azure = True assert isinstance(PrincipalACL.for_cli(ws, installation), PrincipalACL) -def test_interactive_cluster(ws, installation): - assert isinstance(PrincipalACL.for_cli(ws, installation), PrincipalACL) +def test_for_cli_aws(ws, installation): + ws.config.is_azure = False + ws.config.is_aws = True + assert PrincipalACL.for_cli(ws, installation) is None + + +def test_for_cli_gcp(ws, installation): + ws.config.is_azure = False + ws.config.is_aws = False + ws.config.is_gcp = True + assert PrincipalACL.for_cli(ws, installation) is None + + +def test_interactive_cluster_no_cluster_mapping(ws, installation): + grants = principal_acl(ws, installation, []) + grants.get_interactive_cluster_grants() + ws.external_locations.list.assert_not_called() + + +def test_interactive_cluster_no_external_location(ws, installation): + cluster_spn = ServicePrincipalClusterMapping( + 'abc', {AzureServicePrincipalInfo(application_id='Hello, World!', 
storage_account='abcde')} + ) + grants = principal_acl(ws, installation, [cluster_spn]) + ws.external_locations.list.return_value = [] + with pytest.raises(ResourceDoesNotExist): + grants = grants.get_interactive_cluster_grants() + + +def test_interactive_cluster_no_permission_mapping(ws): + cluster_spn = ServicePrincipalClusterMapping( + 'abc', {AzureServicePrincipalInfo(application_id='Hello, World!', storage_account='abcde')} + ) + install = MockInstallation( + { + "config.yml": {'warehouse_id': 'abc', 'connect': {'host': 'a', 'token': 'b'}, 'inventory_database': 'ucx'}, + "azure_storage_account_info.csv": [], + } + ) + grants = principal_acl(ws, install, [cluster_spn]) + + with pytest.raises(ResourceDoesNotExist): + grants.get_interactive_cluster_grants() + + +def test_interactive_cluster_no_acl(ws, installation): + cluster_spn = ServicePrincipalClusterMapping( + 'cluster3', {AzureServicePrincipalInfo(application_id='client1', storage_account='storage1')} + ) + grants = principal_acl(ws, installation, [cluster_spn]) + actual_grants = grants.get_interactive_cluster_grants() + assert len(actual_grants) == 0 + + +def test_interactive_cluster_single_spn(ws, installation): + cluster_spn = ServicePrincipalClusterMapping( + 'cluster1', + {AzureServicePrincipalInfo(application_id='client1', storage_account='storage1')}, + ) + grants = principal_acl(ws, installation, [cluster_spn]) + expected_grants = [ + Grant('group1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table1'), + Grant('foo.bar@imagine.com', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table1'), + Grant('group1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', view='view1'), + Grant('foo.bar@imagine.com', "ALL_PRIVILEGES", "hive_metastore", 'schema1', view='view1'), + Grant('group1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table3'), + Grant('foo.bar@imagine.com', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table3'), + Grant('group1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 
'table5'), + Grant('foo.bar@imagine.com', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table5'), + Grant('group1', "USE", "hive_metastore", 'schema1'), + Grant('foo.bar@imagine.com', "USE", "hive_metastore", 'schema1'), + Grant('group1', "USE", "hive_metastore"), + Grant('foo.bar@imagine.com', "USE", "hive_metastore"), + ] + actual_grants = grants.get_interactive_cluster_grants() + for grant in expected_grants: + assert grant in actual_grants + + +def test_interactive_cluster_multiple_spn(ws, installation): + cluster_spn = ServicePrincipalClusterMapping( + 'cluster2', + { + AzureServicePrincipalInfo(application_id='client2', storage_account='storage2'), + AzureServicePrincipalInfo(application_id='client2', storage_account='storage3'), + }, + ) + grants = principal_acl(ws, installation, [cluster_spn]) + expected_grants = [ + Grant('spn1', "SELECT", "hive_metastore", 'schema1', 'table2'), + Grant('spn1', "ALL_PRIVILEGES", "hive_metastore", 'schema2', 'table4'), + Grant('spn1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table3'), + Grant('spn1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table5'), + Grant('spn1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', view='view1'), + Grant('spn1', "USE", "hive_metastore", 'schema1'), + Grant('spn1', "USE", "hive_metastore", 'schema2'), + Grant('spn1', "USE", "hive_metastore"), + ] + actual_grants = grants.get_interactive_cluster_grants() + for grant in expected_grants: + assert grant in actual_grants From 72671a9f842aadda96f38a62e2b22901575ce602 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Thu, 28 Mar 2024 17:41:45 +0000 Subject: [PATCH 06/24] adding changes to TableMigrate and unit test cases --- .../labs/ucx/hive_metastore/table_migrate.py | 14 +- src/databricks/labs/ucx/runtime.py | 29 +++- .../hive_metastore/test_migrate.py | 67 +++++++- .../unit/hive_metastore/test_table_migrate.py | 147 ++++++++++++++++-- tests/unit/test_runtime.py | 3 +- 5 files changed, 229 insertions(+), 31 deletions(-) diff 
--git a/src/databricks/labs/ucx/hive_metastore/table_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_migrate.py index 9b6d2633f9..6e6cdd99de 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migrate.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migrate.py @@ -15,7 +15,7 @@ from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import ( AclMigrationWhat, @@ -52,7 +52,7 @@ def __init__( table_mapping: TableMapping, group_manager: GroupManager, migration_status_refresher, - # principal_grants: PrincipalACL, + principal_grants: PrincipalACL, ): self._tc = table_crawler self._gc = grant_crawler @@ -62,7 +62,7 @@ def __init__( self._group = group_manager self._migration_status_refresher = migration_status_refresher self._seen_tables: dict[str, str] = {} - # self._principal_grants = principal_grants + self._principal_grants = principal_grants @classmethod def for_cli(cls, ws: WorkspaceClient, product='ucx'): @@ -74,7 +74,7 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx'): grants_crawler = GrantsCrawler(table_crawler, udfs_crawler) table_mapping = TableMapping(installation, ws, sql_backend) group_manager = GroupManager(sql_backend, ws, config.inventory_database) - # principal_grants = PrincipalACL.for_cli(ws, installation) + principal_grants = PrincipalACL.for_cli(ws, installation) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, config.inventory_database, table_crawler) return cls( table_crawler, @@ -84,7 +84,7 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx'): table_mapping, group_manager, 
migration_status_refresher, - # principal_grants, + principal_grants, ) def index(self): @@ -97,7 +97,7 @@ def migrate_tables(self, *, what: What | None = None, acl_strategy: list[AclMigr if acl_strategy is not None: grants_to_migrate = self._gc.snapshot() migrated_groups = self._group.snapshot() - # principal_grants = self._principal_grants.get_interactive_cluster_grants() + principal_grants = self._principal_grants.get_interactive_cluster_grants() else: acl_strategy = [] for table in tables_to_migrate: @@ -106,6 +106,8 @@ def migrate_tables(self, *, what: What | None = None, acl_strategy: list[AclMigr continue if AclMigrationWhat.LEGACY_TACL in acl_strategy: grants.extend(self._match_grants(table.src, grants_to_migrate, migrated_groups)) + if AclMigrationWhat.PRINCIPAL in acl_strategy: + grants.extend(self._match_grants(table.src, principal_grants, migrated_groups)) tasks.append(partial(self._migrate_table, table.src, table.rule, grants)) Threads.strict("migrate tables", tasks) diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index b407d244df..d60f7c897c 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -19,6 +19,7 @@ Mounts, TablesCrawler, ) +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import TableMapping from databricks.labs.ucx.hive_metastore.table_migrate import ( MigrationStatusRefresher, @@ -434,12 +435,20 @@ def migrate_external_tables_sync( table_crawler = TablesCrawler(sql_backend, cfg.inventory_database) udf_crawler = UdfsCrawler(sql_backend, cfg.inventory_database) grant_crawler = GrantsCrawler(table_crawler, udf_crawler) - table_mapping = TableMapping(install, ws, sql_backend) + table_mappings = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, 
cfg.inventory_database) + interactive_grants = PrincipalACL.for_cli(ws, install) TablesMigrate( - table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher - ).migrate_tables(what=What.EXTERNAL_SYNC, acl_strategy=[AclMigrationWhat.LEGACY_TACL]) + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mappings, + group_manager, + migration_status_refresher, + interactive_grants, + ).migrate_tables(what=What.EXTERNAL_SYNC, acl_strategy=[AclMigrationWhat.LEGACY_TACL, AclMigrationWhat.PRINCIPAL]) @task("migrate-tables", job_cluster="table_migration") @@ -454,12 +463,20 @@ def migrate_dbfs_root_delta_tables( table_crawler = TablesCrawler(sql_backend, cfg.inventory_database) udf_crawler = UdfsCrawler(sql_backend, cfg.inventory_database) grant_crawler = GrantsCrawler(table_crawler, udf_crawler) - table_mapping = TableMapping(install, ws, sql_backend) + table_mappings = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) + interactive_grants = PrincipalACL.for_cli(ws, install) TablesMigrate( - table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher - ).migrate_tables(what=What.DBFS_ROOT_DELTA, acl_strategy=[AclMigrationWhat.LEGACY_TACL]) + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mappings, + group_manager, + migration_status_refresher, + interactive_grants, + ).migrate_tables(what=What.DBFS_ROOT_DELTA, acl_strategy=[AclMigrationWhat.LEGACY_TACL, AclMigrationWhat.PRINCIPAL]) @task("migrate-groups-experimental", depends_on=[crawl_groups]) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 6b5d623c7a..5eee461ca4 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py 
@@ -3,12 +3,13 @@ from unittest.mock import create_autospec import pytest +from databricks.labs.blueprint.installation import Installation from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried from databricks.sdk.service.catalog import Privilege, SecurableType from databricks.labs.ucx.hive_metastore import GrantsCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import Rule from databricks.labs.ucx.hive_metastore.table_migrate import ( MigrationStatusRefresher, @@ -55,9 +56,19 @@ def test_migrate_managed_tables(ws, sql_backend, inventory_schema, make_catalog, ] table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) - migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) + + install = Installation.current(ws, 'ucx') + principal_grants = PrincipalACL.for_cli(ws, install) + migration_status_refresh = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresh, + principal_grants, ) table_migrate.migrate_tables() @@ -117,8 +128,17 @@ def test_migrate_tables_with_cache_should_not_create_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) + install = Installation.current(ws, 'ucx') + principal_grants = PrincipalACL.for_cli(ws, install) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, sql_backend, table_mapping, 
group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) # FIXME: flaky: databricks.sdk.errors.platform.NotFound: Catalog 'ucx_cjazg' does not exist. @@ -169,6 +189,8 @@ def test_migrate_external_table( # pylint: disable=too-many-locals ] group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) + installation = Installation.current(ws, 'ucx') + principal_grants = PrincipalACL.for_cli(ws, installation) table_migrate = TablesMigrate( table_crawler, grant_crawler, @@ -177,6 +199,7 @@ def test_migrate_external_table( # pylint: disable=too-many-locals StaticTableMapping(ws, sql_backend, rules=rules), group_manager, migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -227,6 +250,8 @@ def test_migrate_external_table_failed_sync( ] group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) + installation = Installation.current(ws, 'ucx') + principal_grants = PrincipalACL.for_cli(ws, installation) table_migrate = TablesMigrate( table_crawler, grant_crawler, @@ -235,6 +260,7 @@ def test_migrate_external_table_failed_sync( StaticTableMapping(ws, sql_backend, rules=rules), group_manager, migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -281,8 +307,17 @@ def test_revert_migrated_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) + installation = Installation.current(ws, 'ucx') + principal_grants = PrincipalACL.for_cli(ws, installation) table_migrate = TablesMigrate( - table_crawler, 
grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -393,8 +428,17 @@ def test_mapping_reverts_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) group_manager = GroupManager(sql_backend, ws, inventory_schema) + installation = Installation.current(ws, 'ucx') + principal_grants = PrincipalACL.for_cli(ws, installation) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -482,8 +526,17 @@ def test_migrate_managed_tables_with_acl( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) + installation = Installation.current(ws, 'ucx') + principal_grants = PrincipalACL.for_cli(ws, installation) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, sql_backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables(acl_strategy=[AclMigrationWhat.LEGACY_TACL]) diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 4da7d5616a..db0ee63caa 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -9,6 +9,7 @@ from 
databricks.sdk.service.catalog import CatalogInfo, SchemaInfo, TableInfo from databricks.labs.ucx.hive_metastore import GrantsCrawler +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant from databricks.labs.ucx.hive_metastore.mapping import ( Rule, TableMapping, @@ -50,8 +51,16 @@ def test_migrate_dbfs_root_tables_should_produce_proper_queries(ws): table_mapping = table_mapping_mock(["managed_dbfs", "managed_mnt", "managed_other"]) group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -83,8 +92,16 @@ def test_migrate_dbfs_root_tables_should_be_skipped_when_upgrading_external(ws): table_mapping = table_mapping_mock(["managed_dbfs"]) group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables(what=What.EXTERNAL_SYNC) @@ -102,8 +119,16 @@ def test_migrate_external_tables_should_produce_proper_queries(ws): table_mapping = table_mapping_mock(["external_src"]) group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + 
principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -128,8 +153,16 @@ def test_migrate_external_table_failed_sync(ws, caplog): table_mapping = table_mapping_mock(["external_src"]) group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() assert "SYNC command failed to migrate" in caplog.text @@ -166,8 +199,16 @@ def test_migrate_already_upgraded_table_should_produce_no_queries(ws): ] group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -185,8 +226,16 @@ def test_migrate_unsupported_format_table_should_produce_no_queries(ws): table_mapping = table_mapping_mock(["external_src_unsupported"]) group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + 
principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -203,8 +252,16 @@ def test_migrate_view_should_produce_proper_queries(ws): table_mapping = table_mapping_mock(["view"]) group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() @@ -309,6 +366,7 @@ def get_table_migrate(backend: SqlBackend) -> TablesMigrate: group_manager = GroupManager(backend, client, "inventory_database") table_mapping = table_mapping_mock() migration_status_refresher = MigrationStatusRefresher(client, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( table_crawler, grant_crawler, @@ -317,6 +375,7 @@ def get_table_migrate(backend: SqlBackend) -> TablesMigrate: table_mapping, group_manager, migration_status_refresher, + principal_grants, ) return table_migrate @@ -378,8 +437,16 @@ def test_no_migrated_tables(ws): ] group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + 
table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() table_migrate.revert_migrated_tables("test_schema1", "test_table1") @@ -412,8 +479,16 @@ def test_empty_revert_report(ws): table_mapping = table_mapping_mock() group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() assert not table_migrate.print_revert_report(delete_managed=False) @@ -435,8 +510,16 @@ def test_is_upgraded(ws): table_mapping = table_mapping_mock() group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( - table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables() assert table_migrate.is_upgraded("schema1", "table1") @@ -631,8 +714,16 @@ def test_migrate_acls_should_produce_proper_queries(ws, caplog): table_mapping = table_mapping_mock(["managed_dbfs", "managed_mnt", "managed_other", "view"]) group_manager = GroupManager(backend, ws, "inventory_database") migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) table_migrate = TablesMigrate( 
- table_crawler, grant_crawler, ws, backend, table_mapping, group_manager, migration_status_refresher + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, ) table_migrate.migrate_tables(acl_strategy=[AclMigrationWhat.LEGACY_TACL]) @@ -648,3 +739,37 @@ def test_migrate_acls_should_produce_proper_queries(ws, caplog): assert "GRANT MODIFY ON VIEW ucx_default.db1_dst.view_dst TO `account group`" not in backend.queries assert "Cannot identify UC grant" in caplog.text + +def test_migrate_principal_acls_should_produce_proper_queries(ws): + errors = {} + rows = { + r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")] + } + backend = MockBackend(fails_on_first=errors, rows=rows) + table_crawler = TablesCrawler(backend, "inventory_database") + udf_crawler = UdfsCrawler(backend, "inventory_database") + grant_crawler = GrantsCrawler(table_crawler, udf_crawler) + table_mapping = table_mapping_mock(["managed_dbfs", "managed_mnt", "managed_other", "view"]) + group_manager = GroupManager(backend, ws, "inventory_database") + migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) + principal_grants = create_autospec(PrincipalACL) + expected_grants = [ + Grant('spn1', "SELECT", "hive_metastore", 'db1_src', 'managed_dbfs'), + Grant('Group2', "ALL_PRIVILEGES", "hive_metastore", 'db1_src', 'managed_other'), + ] + principal_grants.get_interactive_cluster_grants.return_value = expected_grants + table_migrate = TablesMigrate( + table_crawler, + grant_crawler, + ws, + backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, + ) + table_migrate.migrate_tables(acl_strategy=[AclMigrationWhat.PRINCIPAL]) + + assert "GRANT SELECT ON TABLE ucx_default.db1_dst.managed_dbfs TO `spn1`" in backend.queries + assert "GRANT ALL_PRIVILEGES ON TABLE ucx_default.db1_dst.managed_other TO `Group2`" not in 
backend.queries + diff --git a/tests/unit/test_runtime.py b/tests/unit/test_runtime.py index 43b9364787..6ae49eee21 100644 --- a/tests/unit/test_runtime.py +++ b/tests/unit/test_runtime.py @@ -56,6 +56,7 @@ def azure_mock_config() -> WorkspaceConfig: def mock_installation() -> MockInstallation: return MockInstallation( { + 'config.yml': {'warehouse_id': 'abc', 'connect': {'host': 'a', 'token': 'b'}, 'inventory_database': 'ucx'}, 'mapping.csv': [ { 'catalog_name': 'catalog', @@ -65,7 +66,7 @@ def mock_installation() -> MockInstallation: 'src_table': 'table', 'workspace_name': 'workspace', }, - ] + ], } ) From 4b6bd953699610d6560b022975a45a652c24c546 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Thu, 28 Mar 2024 17:42:58 +0000 Subject: [PATCH 07/24] fmting --- tests/unit/hive_metastore/test_table_migrate.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index db0ee63caa..b4647778ea 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -9,7 +9,7 @@ from databricks.sdk.service.catalog import CatalogInfo, SchemaInfo, TableInfo from databricks.labs.ucx.hive_metastore import GrantsCrawler -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant +from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import ( Rule, TableMapping, @@ -740,11 +740,10 @@ def test_migrate_acls_should_produce_proper_queries(ws, caplog): assert "Cannot identify UC grant" in caplog.text + def test_migrate_principal_acls_should_produce_proper_queries(ws): errors = {} - rows = { - r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")] - } + rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]} backend = MockBackend(fails_on_first=errors, rows=rows) table_crawler = 
TablesCrawler(backend, "inventory_database") udf_crawler = UdfsCrawler(backend, "inventory_database") @@ -772,4 +771,3 @@ def test_migrate_principal_acls_should_produce_proper_queries(ws): assert "GRANT SELECT ON TABLE ucx_default.db1_dst.managed_dbfs TO `spn1`" in backend.queries assert "GRANT ALL_PRIVILEGES ON TABLE ucx_default.db1_dst.managed_other TO `Group2`" not in backend.queries - From 54ff3cb0aba3541cf742421abed6f0aedd19f777 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Fri, 29 Mar 2024 01:25:35 +0000 Subject: [PATCH 08/24] integration test --- .../labs/ucx/hive_metastore/grants.py | 11 +- .../hive_metastore/test_migrate.py | 113 +++++++++++++++++- .../hive_metastore/test_principal_grants.py | 24 ++-- .../unit/hive_metastore/test_table_migrate.py | 8 +- 4 files changed, 135 insertions(+), 21 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index db805ceba9..3e757077cf 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -142,6 +142,7 @@ def uc_grant_sql(self, object_type: str | None = None, object_key: str | None = ("TABLE", "SELECT"): self._uc_action("SELECT"), ("TABLE", "MODIFY"): self._uc_action("MODIFY"), ("TABLE", "READ_METADATA"): self._uc_action("BROWSE"), + ("TABLE", "ALL PRIVILEGES"): self._uc_action("ALL PRIVILEGES"), ("TABLE", "OWN"): self._set_owner_sql, ("VIEW", "SELECT"): self._uc_action("SELECT"), ("VIEW", "READ_METADATA"): self._uc_action("BROWSE"), @@ -417,11 +418,11 @@ def _get_aws_grants(self) -> list[Grant]: def _get_privilege(self, table: Table, locations: dict[str, str]): if table.view_text is not None: - # return nothing for view so that it goes to the seperate view logic + # return nothing for view so that it goes to the separate view logic return None if table.location is None: return "WRITE_FILES" - if table.location.startswith('dbfs://') or table.location.startswith('/dbfs/'): + if 
table.location.startswith('dbfs:/') or table.location.startswith('/dbfs/'): return "WRITE_FILES" for loc, privilege in locations.items(): @@ -456,7 +457,7 @@ def _get_grants( if privilege == "WRITE_FILES": grants.extend( [ - Grant(principal, "ALL_PRIVILEGES", table.catalog, table.database, table.name) + Grant(principal, "ALL PRIVILEGES", table.catalog, table.database, table.name) for principal in principals ] ) @@ -464,7 +465,7 @@ def _get_grants( if table.view_text is not None: grants.extend( [ - Grant(principal, "ALL_PRIVILEGES", table.catalog, table.database, view=table.name) + Grant(principal, "ALL PRIVILEGES", table.catalog, table.database, view=table.name) for principal in principals ] ) @@ -506,6 +507,8 @@ def _get_cluster_principal_mapping(self, cluster_id: str) -> list[str]: if acl.user_name is not None: principal_list.append(acl.user_name) if acl.group_name is not None: + if acl.group_name == "admins": + continue principal_list.append(acl.group_name) if acl.service_principal_name is not None: principal_list.append(acl.service_principal_name) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 5eee461ca4..6600051483 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -3,10 +3,12 @@ from unittest.mock import create_autospec import pytest -from databricks.labs.blueprint.installation import Installation +from databricks.labs.blueprint.installation import Installation, MockInstallation from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried from databricks.sdk.service.catalog import Privilege, SecurableType +from databricks.sdk.service.compute import DataSecurityMode +from databricks.sdk.service.iam import PermissionLevel from databricks.labs.ucx.hive_metastore import GrantsCrawler from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL @@ -26,6 +28,17 @@ ) logger = 
logging.getLogger(__name__) +_SPARK_CONF = { + "spark.databricks.cluster.profile": "singleNode", + "spark.master": "local[*]", + "fs.azure.account.auth.type.labsazurethings.dfs.core.windows.net": "OAuth", + "fs.azure.account.oauth.provider.type.labsazurethings.dfs.core.windows.net": "org.apache.hadoop.fs" + ".azurebfs.oauth2.ClientCredsTokenProvider", + "fs.azure.account.oauth2.client.id.labsazurethings.dfs.core.windows.net": "dummy_application_id", + "fs.azure.account.oauth2.client.secret.labsazurethings.dfs.core.windows.net": "dummy", + "fs.azure.account.oauth2.client.endpoint.labsazurethings.dfs.core.windows.net": "https://login" + ".microsoftonline.com/directory_12345/oauth2/token", +} @retried(on=[NotFound], timeout=timedelta(minutes=2)) @@ -550,3 +563,101 @@ def test_migrate_managed_tables_with_acl( assert target_table_properties[Table.UPGRADED_FROM_WS_PARAM] == str(ws.get_workspace_id()) assert target_table_grants.privilege_assignments[0].principal == user.user_name assert target_table_grants.privilege_assignments[0].privileges == [Privilege.MODIFY, Privilege.SELECT] + + +@retried(on=[NotFound], timeout=timedelta(minutes=3)) +def test_migrate_managed_tables_with_principal_acl_azure( # pylint: disable=too-many-arguments + ws, + sql_backend, + inventory_schema, + make_catalog, + make_schema, + make_table, + make_user, + make_cluster, + make_cluster_permissions, + env_or_skip, +): # pylint: disable=too-many-locals + if not ws.config.is_azure: + pytest.skip("temporary: only works in azure test env") + user = make_user() + cluster = make_cluster(single_node=True, spark_conf=_SPARK_CONF, data_security_mode=DataSecurityMode.NONE) + make_cluster_permissions( + object_id=cluster.cluster_id, + permission_level=PermissionLevel.CAN_MANAGE, + user_name=user.user_name, + ) + src_schema = make_schema(catalog_name="hive_metastore") + + src_managed_table = make_table( + catalog_name=src_schema.catalog_name, + schema_name=src_schema.name, + ) + + dst_catalog = make_catalog() 
+ dst_schema = make_schema(catalog_name=dst_catalog.name, name=src_schema.name) + + logger.info(f"dst_catalog={dst_catalog.name}, managed_table={src_managed_table.full_name}") + + table_crawler = StaticTablesCrawler(sql_backend, inventory_schema, [src_managed_table]) + udf_crawler = StaticUdfsCrawler(sql_backend, inventory_schema, []) + grant_crawler = StaticGrantsCrawler(table_crawler, udf_crawler, []) + rules = [ + Rule( + "workspace", + dst_catalog.name, + src_schema.name, + dst_schema.name, + src_managed_table.name, + src_managed_table.name, + ), + ] + table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) + group_manager = GroupManager(sql_backend, ws, inventory_schema) + migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) + installation = MockInstallation( + { + "config.yml": { + 'warehouse_id': env_or_skip("TEST_DEFAULT_WAREHOUSE_ID"), + 'connect': {'host': 'a', 'token': 'b'}, + 'inventory_database': inventory_schema, + }, + "azure_storage_account_info.csv": [ + { + 'prefix': 'abfss://things@labsazurethings.dfs.core.windows.net', + 'client_id': 'dummy_application_id', + 'principal': 'principal_1', + 'privilege': 'WRITE_FILES', + 'type': 'Application', + 'directory_id': 'directory_id_ss1', + } + ], + } + ) + principal_grants = PrincipalACL.for_cli(ws, installation) + table_migrate = TablesMigrate( + table_crawler, + grant_crawler, + ws, + sql_backend, + table_mapping, + group_manager, + migration_status_refresher, + principal_grants, + ) + + table_migrate.migrate_tables(acl_strategy=[AclMigrationWhat.PRINCIPAL]) + + target_tables = list(sql_backend.fetch(f"SHOW TABLES IN {dst_schema.full_name}")) + assert len(target_tables) == 1 + + target_table_properties = ws.tables.get(f"{dst_schema.full_name}.{src_managed_table.name}").properties + target_table_grants = ws.grants.get(SecurableType.TABLE, f"{dst_schema.full_name}.{src_managed_table.name}") + assert target_table_properties["upgraded_from"] == 
src_managed_table.full_name + assert target_table_properties[Table.UPGRADED_FROM_WS_PARAM] == str(ws.get_workspace_id()) + match = False + for assignment in target_table_grants.privilege_assignments: + if assignment.principal == user.user_name and assignment.privileges == [Privilege.ALL_PRIVILEGES]: + match = True + break + assert match diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py index 1fd3d9c737..42abb86796 100644 --- a/tests/unit/hive_metastore/test_principal_grants.py +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -208,14 +208,14 @@ def test_interactive_cluster_single_spn(ws, installation): ) grants = principal_acl(ws, installation, [cluster_spn]) expected_grants = [ - Grant('group1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table1'), - Grant('foo.bar@imagine.com', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table1'), - Grant('group1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', view='view1'), - Grant('foo.bar@imagine.com', "ALL_PRIVILEGES", "hive_metastore", 'schema1', view='view1'), - Grant('group1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table3'), - Grant('foo.bar@imagine.com', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table3'), - Grant('group1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table5'), - Grant('foo.bar@imagine.com', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table5'), + Grant('group1', "ALL PRIVILEGES", "hive_metastore", 'schema1', 'table1'), + Grant('foo.bar@imagine.com', "ALL PRIVILEGES", "hive_metastore", 'schema1', 'table1'), + Grant('group1', "ALL PRIVILEGES", "hive_metastore", 'schema1', view='view1'), + Grant('foo.bar@imagine.com', "ALL PRIVILEGES", "hive_metastore", 'schema1', view='view1'), + Grant('group1', "ALL PRIVILEGES", "hive_metastore", 'schema1', 'table3'), + Grant('foo.bar@imagine.com', "ALL PRIVILEGES", "hive_metastore", 'schema1', 'table3'), + Grant('group1', "ALL PRIVILEGES", "hive_metastore", 
'schema1', 'table5'), + Grant('foo.bar@imagine.com', "ALL PRIVILEGES", "hive_metastore", 'schema1', 'table5'), Grant('group1', "USE", "hive_metastore", 'schema1'), Grant('foo.bar@imagine.com', "USE", "hive_metastore", 'schema1'), Grant('group1', "USE", "hive_metastore"), @@ -237,10 +237,10 @@ def test_interactive_cluster_multiple_spn(ws, installation): grants = principal_acl(ws, installation, [cluster_spn]) expected_grants = [ Grant('spn1', "SELECT", "hive_metastore", 'schema1', 'table2'), - Grant('spn1', "ALL_PRIVILEGES", "hive_metastore", 'schema2', 'table4'), - Grant('spn1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table3'), - Grant('spn1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', 'table5'), - Grant('spn1', "ALL_PRIVILEGES", "hive_metastore", 'schema1', view='view1'), + Grant('spn1', "ALL PRIVILEGES", "hive_metastore", 'schema2', 'table4'), + Grant('spn1', "ALL PRIVILEGES", "hive_metastore", 'schema1', 'table3'), + Grant('spn1', "ALL PRIVILEGES", "hive_metastore", 'schema1', 'table5'), + Grant('spn1', "ALL PRIVILEGES", "hive_metastore", 'schema1', view='view1'), Grant('spn1', "USE", "hive_metastore", 'schema1'), Grant('spn1', "USE", "hive_metastore", 'schema2'), Grant('spn1', "USE", "hive_metastore"), diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index b4647778ea..2b64859667 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -753,8 +753,9 @@ def test_migrate_principal_acls_should_produce_proper_queries(ws): migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) principal_grants = create_autospec(PrincipalACL) expected_grants = [ - Grant('spn1', "SELECT", "hive_metastore", 'db1_src', 'managed_dbfs'), - Grant('Group2', "ALL_PRIVILEGES", "hive_metastore", 'db1_src', 'managed_other'), + Grant('spn1', "ALL PRIVILEGES", "hive_metastore", 'db1_src', 'managed_dbfs'), + 
Grant('spn1', "USE", "hive_metastore", 'db1_src'), + Grant('spn1', "USE", "hive_metastore"), ] principal_grants.get_interactive_cluster_grants.return_value = expected_grants table_migrate = TablesMigrate( @@ -769,5 +770,4 @@ def test_migrate_principal_acls_should_produce_proper_queries(ws): ) table_migrate.migrate_tables(acl_strategy=[AclMigrationWhat.PRINCIPAL]) - assert "GRANT SELECT ON TABLE ucx_default.db1_dst.managed_dbfs TO `spn1`" in backend.queries - assert "GRANT ALL_PRIVILEGES ON TABLE ucx_default.db1_dst.managed_other TO `Group2`" not in backend.queries + assert "GRANT ALL PRIVILEGES ON TABLE ucx_default.db1_dst.managed_dbfs TO `spn1`" in backend.queries From f8e57253838f69bfef190c586586ba2c4fda8d00 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Fri, 29 Mar 2024 01:30:21 +0000 Subject: [PATCH 09/24] formating --- src/databricks/labs/ucx/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 1073bdc851..2a414acecf 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -26,7 +26,7 @@ TablesMigrate, ) from databricks.labs.ucx.hive_metastore.table_size import TableSizeCrawler -from databricks.labs.ucx.hive_metastore.tables import AclMigrationWhat, What +from databricks.labs.ucx.hive_metastore.tables import What from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.hive_metastore.verification import VerifyHasMetastore from databricks.labs.ucx.workspace_access.generic import WorkspaceListing From 1bbe118f68da79dbbf6ab22fcfd4275e79f535c3 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Fri, 29 Mar 2024 08:57:47 +0000 Subject: [PATCH 10/24] handling scenarios for mounts --- .../labs/ucx/hive_metastore/grants.py | 39 +++++++++++---- tests/integration/conftest.py | 10 ++++ .../hive_metastore/test_migrate.py | 50 ++++++++++++++----- .../hive_metastore/test_principal_grants.py | 14 
++++-- 4 files changed, 85 insertions(+), 28 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 3e757077cf..945dc9fa6f 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -23,7 +23,11 @@ from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier -from databricks.labs.ucx.hive_metastore.locations import ExternalLocations +from databricks.labs.ucx.hive_metastore.locations import ( + ExternalLocations, + Mount, + Mounts, +) from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler @@ -332,6 +336,7 @@ def __init__( backend: SqlBackend, installation: Installation, table_crawler: TablesCrawler, + mount_crawler: Mounts, spn_crawler: AzureServicePrincipalCrawler | None = None, resource_permission: AzureResourcePermissions | None = None, ): @@ -341,6 +346,7 @@ def __init__( self._installation = installation self._resource_permission = resource_permission self._table_crawler = table_crawler + self._mount_crawler = mount_crawler @classmethod def for_cli(cls, ws: WorkspaceClient, installation: Installation): @@ -348,6 +354,7 @@ def for_cli(cls, ws: WorkspaceClient, installation: Installation): sql_backend = StatementExecutionBackend(ws, config.warehouse_id) locations = ExternalLocations(ws, sql_backend, config.inventory_database) table_crawler = TablesCrawler(sql_backend, config.inventory_database) + mount_crawler = Mounts(sql_backend, ws, config.inventory_database) if ws.config.is_azure: azure_client = AzureAPIClient( ws.config.arm_environment.resource_manager_endpoint, @@ -357,7 +364,7 @@ def for_cli(cls, ws: WorkspaceClient, installation: Installation): azurerm = AzureResources(azure_client, graph_client) 
resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) - return cls(ws, sql_backend, installation, table_crawler, spn_crawler, resource_permissions) + return cls(ws, sql_backend, installation, table_crawler, mount_crawler, spn_crawler, resource_permissions) if ws.config.is_aws: return None if ws.config.is_gcp: @@ -395,6 +402,7 @@ def _get_azure_grants(self) -> list[Grant]: logger.error(msg) raise ResourceDoesNotExist(msg) from None tables = self._table_crawler.snapshot() + mounts = list(self._mount_crawler.snapshot()) grants: list[Grant] = [] for cluster_spn in spn_cluster_mapping: @@ -405,7 +413,7 @@ def _get_azure_grants(self) -> list[Grant]: eligible_locations = self._get_external_location(spn, external_locations, permission_mappings) if len(eligible_locations) == 0: continue - grant = self._get_grants(eligible_locations, principals, tables) + grant = self._get_grants(eligible_locations, principals, tables, mounts) grants.extend(grant) catalog_grants = [Grant(principal, "USE", "hive_metastore") for principal in principals] grants.extend(catalog_grants) @@ -416,12 +424,26 @@ def _get_aws_grants(self) -> list[Grant]: # TODO return [] - def _get_privilege(self, table: Table, locations: dict[str, str]): + def _get_mount_location(self, location: str, mounts: list[Mount]): + for mount in mounts: + if location.startswith(f"dbfs:{mount.name}") or location.startswith(f"/dbfs{mount.name}"): + return mount.source + return None + + def _get_privilege(self, table: Table, locations: dict[str, str], mounts: list[Mount]): if table.view_text is not None: # return nothing for view so that it goes to the separate view logic return None if table.location is None: - return "WRITE_FILES" + return None + if table.location.startswith('dbfs:/mnt') or table.location.startswith('/dbfs/mnt'): + mount_location = self._get_mount_location(table.location, mounts) + if 
mount_location is None: + return None + for loc, privilege in locations.items(): + if loc is not None and mount_location.startswith(loc): + return privilege + return None if table.location.startswith('dbfs:/') or table.location.startswith('/dbfs/'): return "WRITE_FILES" @@ -440,15 +462,12 @@ def _get_database_grants(self, tables: list[Table], principals: list[str]) -> li ] def _get_grants( - self, - locations: dict[str, str], - principals: list[str], - tables: list[Table], + self, locations: dict[str, str], principals: list[str], tables: list[Table], mounts: list[Mount] ) -> list[Grant]: grants = [] filtered_tables = [] for table in tables: - privilege = self._get_privilege(table, locations) + privilege = self._get_privilege(table, locations, mounts) if privilege == "READ_FILES": grants.extend( [Grant(principal, "SELECT", table.catalog, table.database, table.name) for principal in principals] diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 328efed7df..d51d84a1d8 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -17,6 +17,7 @@ ) from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.locations import Mount, Mounts from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import Table from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfsCrawler @@ -190,3 +191,12 @@ def __init__(self, spn_infos: list[AzureServicePrincipalInfo], *args): def snapshot(self) -> list[AzureServicePrincipalInfo]: return self._spn_infos + + +class StaticMountCrawler(Mounts): + def __init__(self, mounts: list[Mount], *args): + super().__init__(*args) + self._mounts = mounts + + def snapshot(self) -> list[Mount]: + return self._mounts diff --git a/tests/integration/hive_metastore/test_migrate.py 
b/tests/integration/hive_metastore/test_migrate.py index 6600051483..c6078a8803 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -10,8 +10,12 @@ from databricks.sdk.service.compute import DataSecurityMode from databricks.sdk.service.iam import PermissionLevel +from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler +from databricks.labs.ucx.azure.access import AzureResourcePermissions +from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.hive_metastore import GrantsCrawler from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL +from databricks.labs.ucx.hive_metastore.locations import ExternalLocations, Mount from databricks.labs.ucx.hive_metastore.mapping import Rule from databricks.labs.ucx.hive_metastore.table_migrate import ( MigrationStatusRefresher, @@ -22,6 +26,7 @@ from ..conftest import ( StaticGrantsCrawler, + StaticMountCrawler, StaticTableMapping, StaticTablesCrawler, StaticUdfsCrawler, @@ -442,6 +447,7 @@ def test_mapping_reverts_table( migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) group_manager = GroupManager(sql_backend, ws, inventory_schema) installation = Installation.current(ws, 'ucx') + principal_grants = PrincipalACL.for_cli(ws, installation) table_migrate = TablesMigrate( table_crawler, @@ -577,6 +583,8 @@ def test_migrate_managed_tables_with_principal_acl_azure( # pylint: disable=too make_cluster, make_cluster_permissions, env_or_skip, + make_dbfs_data_copy, + make_random, ): # pylint: disable=too-many-locals if not ws.config.is_azure: pytest.skip("temporary: only works in azure test env") @@ -588,28 +596,33 @@ def test_migrate_managed_tables_with_principal_acl_azure( # pylint: disable=too user_name=user.user_name, ) src_schema = make_schema(catalog_name="hive_metastore") - - src_managed_table = make_table( - 
catalog_name=src_schema.catalog_name, - schema_name=src_schema.name, - ) + existing_mounted_location = f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/c' + new_mounted_location = f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/{make_random(4)}' + make_dbfs_data_copy(src_path=existing_mounted_location, dst_path=new_mounted_location) + src_external_table = make_table(schema_name=src_schema.name, external_csv=new_mounted_location) dst_catalog = make_catalog() dst_schema = make_schema(catalog_name=dst_catalog.name, name=src_schema.name) - logger.info(f"dst_catalog={dst_catalog.name}, managed_table={src_managed_table.full_name}") + logger.info(f"dst_catalog={dst_catalog.name}, managed_table={src_external_table.full_name}") - table_crawler = StaticTablesCrawler(sql_backend, inventory_schema, [src_managed_table]) + table_crawler = StaticTablesCrawler(sql_backend, inventory_schema, [src_external_table]) udf_crawler = StaticUdfsCrawler(sql_backend, inventory_schema, []) grant_crawler = StaticGrantsCrawler(table_crawler, udf_crawler, []) + mount_crawler = StaticMountCrawler( + [Mount(f'/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a', 'abfss://things@labsazurethings.dfs.core.windows.net/a')], + sql_backend, + ws, + inventory_schema, + ) rules = [ Rule( "workspace", dst_catalog.name, src_schema.name, dst_schema.name, - src_managed_table.name, - src_managed_table.name, + src_external_table.name, + src_external_table.name, ), ] table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) @@ -634,7 +647,18 @@ def test_migrate_managed_tables_with_principal_acl_azure( # pylint: disable=too ], } ) - principal_grants = PrincipalACL.for_cli(ws, installation) + azure_client = AzureAPIClient( + ws.config.arm_environment.resource_manager_endpoint, + ws.config.arm_environment.service_management_endpoint, + ) + locations = ExternalLocations(ws, sql_backend, inventory_schema) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + azurerm = 
AzureResources(azure_client, graph_client) + resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) + spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, inventory_schema) + principal_grants = PrincipalACL( + ws, sql_backend, installation, table_crawler, mount_crawler, spn_crawler, resource_permissions + ) table_migrate = TablesMigrate( table_crawler, grant_crawler, @@ -651,9 +675,9 @@ def test_migrate_managed_tables_with_principal_acl_azure( # pylint: disable=too target_tables = list(sql_backend.fetch(f"SHOW TABLES IN {dst_schema.full_name}")) assert len(target_tables) == 1 - target_table_properties = ws.tables.get(f"{dst_schema.full_name}.{src_managed_table.name}").properties - target_table_grants = ws.grants.get(SecurableType.TABLE, f"{dst_schema.full_name}.{src_managed_table.name}") - assert target_table_properties["upgraded_from"] == src_managed_table.full_name + target_table_properties = ws.tables.get(f"{dst_schema.full_name}.{src_external_table.name}").properties + target_table_grants = ws.grants.get(SecurableType.TABLE, f"{dst_schema.full_name}.{src_external_table.name}") + assert target_table_properties["upgraded_from"] == src_external_table.full_name assert target_table_properties[Table.UPGRADED_FROM_WS_PARAM] == str(ws.get_workspace_id()) match = False for assignment in target_table_grants.privilege_assignments: diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py index 42abb86796..fc1d8f1fa6 100644 --- a/tests/unit/hive_metastore/test_principal_grants.py +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -16,8 +16,9 @@ from databricks.labs.ucx.azure.access import AzureResourcePermissions from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.config import WorkspaceConfig -from databricks.labs.ucx.hive_metastore import ExternalLocations, TablesCrawler +from databricks.labs.ucx.hive_metastore 
import ExternalLocations, Mounts, TablesCrawler from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL +from databricks.labs.ucx.hive_metastore.locations import Mount from databricks.labs.ucx.hive_metastore.tables import Table @@ -83,8 +84,8 @@ def principal_acl(w, install, cluster_spn: list): 'delta', location='abfss://container1@storage2.dfs.core.windows.net/folder2/table2', ), - Table('hive_metastore', 'schema1', 'table3', 'TABLE', 'delta'), - Table('hive_metastore', 'schema1', 'table5', 'TABLE', 'delta', location='dbfs://hms/folder1/table1'), + Table('hive_metastore', 'schema1', 'table3', 'TABLE', 'delta', location='dbfs:/mnt/folder1/table3'), + Table('hive_metastore', 'schema1', 'table5', 'TABLE', 'delta', location='dbfs:/hms/folder1/table1'), Table( 'hive_metastore', 'schema2', @@ -95,6 +96,10 @@ def principal_acl(w, install, cluster_spn: list): ), ] table_crawler.snapshot.return_value = tables + mount_crawler = create_autospec(Mounts) + mount_crawler.snapshot.return_value = [ + Mount('/mnt/folder1', 'abfss://container1@storage1.dfs.core.windows.net/folder1') + ] azure_client = AzureAPIClient( w.config.arm_environment.resource_manager_endpoint, w.config.arm_environment.service_management_endpoint, @@ -104,7 +109,7 @@ def principal_acl(w, install, cluster_spn: list): resource_permissions = AzureResourcePermissions(install, w, azurerm, locations) spn_crawler = create_autospec(AzureServicePrincipalCrawler) spn_crawler.get_cluster_to_storage_mapping.return_value = cluster_spn - return PrincipalACL(w, sql_backend, install, table_crawler, spn_crawler, resource_permissions) + return PrincipalACL(w, sql_backend, install, table_crawler, mount_crawler, spn_crawler, resource_permissions) @pytest.fixture @@ -238,7 +243,6 @@ def test_interactive_cluster_multiple_spn(ws, installation): expected_grants = [ Grant('spn1', "SELECT", "hive_metastore", 'schema1', 'table2'), Grant('spn1', "ALL PRIVILEGES", "hive_metastore", 'schema2', 'table4'), - Grant('spn1', 
"ALL PRIVILEGES", "hive_metastore", 'schema1', 'table3'), Grant('spn1', "ALL PRIVILEGES", "hive_metastore", 'schema1', 'table5'), Grant('spn1', "ALL PRIVILEGES", "hive_metastore", 'schema1', view='view1'), Grant('spn1', "USE", "hive_metastore", 'schema1'), From a65ea07c78444ea7be5c8cf792859985f4fc8619 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 13:57:25 +0000 Subject: [PATCH 11/24] review comments --- .../labs/ucx/hive_metastore/grants.py | 194 ++++++++++-------- .../labs/ucx/hive_metastore/locations.py | 5 +- .../hive_metastore/test_migrate.py | 24 +-- ...-spn-secret-interactive-multiple-spn.json} | 0 tests/unit/assessment/test_azure.py | 36 ++-- .../hive_metastore/test_principal_grants.py | 69 +++++-- 6 files changed, 184 insertions(+), 144 deletions(-) rename tests/unit/assessment/clusters/{azure-spn-secret-interactive.json => azure-spn-secret-interactive-multiple-spn.json} (100%) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 945dc9fa6f..3c42c8e4e9 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -34,6 +34,12 @@ logger = logging.getLogger(__name__) +@dataclass +class ClusterLocationMapping: + cluster_id: str + locations: dict[str, str] + + @dataclass(frozen=True) class Grant: principal: str @@ -329,62 +335,42 @@ def grants( return [] -class PrincipalACL: +class AzureACL: def __init__( self, ws: WorkspaceClient, backend: SqlBackend, - installation: Installation, - table_crawler: TablesCrawler, - mount_crawler: Mounts, - spn_crawler: AzureServicePrincipalCrawler | None = None, - resource_permission: AzureResourcePermissions | None = None, + spn_crawler: AzureServicePrincipalCrawler, + resource_permission: AzureResourcePermissions, ): self._backend = backend self._ws = ws self._spn_crawler = spn_crawler - self._installation = installation self._resource_permission = resource_permission - 
self._table_crawler = table_crawler - self._mount_crawler = mount_crawler @classmethod def for_cli(cls, ws: WorkspaceClient, installation: Installation): config = installation.load(WorkspaceConfig) sql_backend = StatementExecutionBackend(ws, config.warehouse_id) locations = ExternalLocations(ws, sql_backend, config.inventory_database) - table_crawler = TablesCrawler(sql_backend, config.inventory_database) - mount_crawler = Mounts(sql_backend, ws, config.inventory_database) - if ws.config.is_azure: - azure_client = AzureAPIClient( - ws.config.arm_environment.resource_manager_endpoint, - ws.config.arm_environment.service_management_endpoint, - ) - graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") - azurerm = AzureResources(azure_client, graph_client) - resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) - spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) - return cls(ws, sql_backend, installation, table_crawler, mount_crawler, spn_crawler, resource_permissions) - if ws.config.is_aws: - return None - if ws.config.is_gcp: - logger.error("UCX is not supported for GCP yet. 
Please run it on azure or aws") - return None - return None - - def get_interactive_cluster_grants(self) -> list[Grant]: - if self._ws.config.is_azure: - return self._get_azure_grants() - return [] - - def _get_azure_grants(self) -> list[Grant]: - assert self._spn_crawler is not None - assert self._resource_permission is not None + azure_client = AzureAPIClient( + ws.config.arm_environment.resource_manager_endpoint, + ws.config.arm_environment.service_management_endpoint, + ) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + azurerm = AzureResources(azure_client, graph_client) + resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) + spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, config.inventory_database) + return cls(ws, sql_backend, spn_crawler, resource_permissions) + + def get_eligible_locations_principals(self) -> dict[str, dict]: + cluster_locations = {} + eligible_locations = {} spn_cluster_mapping = self._spn_crawler.get_cluster_to_storage_mapping() if len(spn_cluster_mapping) == 0: # if there are no interactive clusters , then return empty grants logger.info("No interactive cluster found with spn configured") - return [] + return {} external_locations = list(self._ws.external_locations.list()) if len(external_locations) == 0: # if there are no external locations, then throw an error to run migrate_locations cli command @@ -398,38 +384,97 @@ def _get_azure_grants(self) -> list[Grant]: permission_mappings = self._resource_permission.load() if len(permission_mappings) == 0: # if permission mapping is empty, raise an error to run principal_prefix cmd - msg = "No storage permission file found. Please ensure principal_prefix_access cli cmd is run to create the access permission file." + msg = ( + "No storage permission file found. Please ensure principal_prefix_access cli " + "cmd is run to create the access permission file." 
+ ) logger.error(msg) raise ResourceDoesNotExist(msg) from None - tables = self._table_crawler.snapshot() - mounts = list(self._mount_crawler.snapshot()) - grants: list[Grant] = [] for cluster_spn in spn_cluster_mapping: - principals = self._get_cluster_principal_mapping(cluster_spn.cluster_id) - if len(principals) == 0: - continue for spn in cluster_spn.spn_info: - eligible_locations = self._get_external_location(spn, external_locations, permission_mappings) - if len(eligible_locations) == 0: - continue - grant = self._get_grants(eligible_locations, principals, tables, mounts) - grants.extend(grant) - catalog_grants = [Grant(principal, "USE", "hive_metastore") for principal in principals] - grants.extend(catalog_grants) + eligible_locations.update(self._get_external_location(spn, external_locations, permission_mappings)) + cluster_locations[cluster_spn.cluster_id] = eligible_locations + return cluster_locations - return list(set(grants)) + def _get_external_location( + self, + spn: AzureServicePrincipalInfo, + external_locations: list[ExternalLocationInfo], + permission_mappings: list[StoragePermissionMapping], + ) -> dict[str, str]: + matching_location = {} + for location in external_locations: + if location.url is None: + continue + for permission_mapping in permission_mappings: + if ( + location.url.startswith(permission_mapping.prefix) + and permission_mapping.client_id == spn.application_id + and spn.storage_account is not None + and spn.storage_account in permission_mapping.prefix + ): + matching_location[location.url] = permission_mapping.privilege + return matching_location + + +class PrincipalACL: + def __init__( + self, + ws: WorkspaceClient, + backend: SqlBackend, + installation: Installation, + table_crawler: TablesCrawler, + mount_crawler: Mounts, + cluster_locations: dict[str, dict], + ): + self._backend = backend + self._ws = ws + self._installation = installation + self._table_crawler = table_crawler + self._mount_crawler = mount_crawler + 
self._cluster_locations = cluster_locations - def _get_aws_grants(self) -> list[Grant]: - # TODO - return [] + @classmethod + def for_cli(cls, ws: WorkspaceClient, installation: Installation): + config = installation.load(WorkspaceConfig) - def _get_mount_location(self, location: str, mounts: list[Mount]): - for mount in mounts: - if location.startswith(f"dbfs:{mount.name}") or location.startswith(f"/dbfs{mount.name}"): - return mount.source + sql_backend = StatementExecutionBackend(ws, config.warehouse_id) + table_crawler = TablesCrawler(sql_backend, config.inventory_database) + mount_crawler = Mounts(sql_backend, ws, config.inventory_database) + if ws.config.is_azure: + azure_acl = AzureACL.for_cli(ws, installation) + return cls( + ws, + sql_backend, + installation, + table_crawler, + mount_crawler, + azure_acl.get_eligible_locations_principals(), + ) + if ws.config.is_aws: + return None + if ws.config.is_gcp: + logger.error("UCX is not supported for GCP yet. Please run it on azure or aws") + return None return None + def get_interactive_cluster_grants(self) -> list[Grant]: + tables = self._table_crawler.snapshot() + mounts = list(self._mount_crawler.snapshot()) + grants: set[Grant] = set() + + for cluster_id, locations in self._cluster_locations.items(): + principals = self._get_cluster_principal_mapping(cluster_id) + if len(principals) == 0: + continue + grant = self._get_grants(locations, principals, tables, mounts) + grants.update(grant) + catalog_grants = [Grant(principal, "USE", "hive_metastore") for principal in principals] + grants.update(catalog_grants) + + return list(grants) + def _get_privilege(self, table: Table, locations: dict[str, str], mounts: list[Mount]): if table.view_text is not None: # return nothing for view so that it goes to the separate view logic @@ -437,9 +482,7 @@ def _get_privilege(self, table: Table, locations: dict[str, str], mounts: list[M if table.location is None: return None if table.location.startswith('dbfs:/mnt') or 
table.location.startswith('/dbfs/mnt'): - mount_location = self._get_mount_location(table.location, mounts) - if mount_location is None: - return None + mount_location = ExternalLocations.resolve_mount(table.location, mounts) for loc, privilege in locations.items(): if loc is not None and mount_location.startswith(loc): return privilege @@ -453,10 +496,7 @@ def _get_privilege(self, table: Table, locations: dict[str, str], mounts: list[M return None def _get_database_grants(self, tables: list[Table], principals: list[str]) -> list[Grant]: - databases = [] - for table in tables: - if table.database not in databases: - databases.append(table.database) + databases = {table.database for table in tables} return [ Grant(principal, "USE", "hive_metastore", database) for database in databases for principal in principals ] @@ -473,6 +513,7 @@ def _get_grants( [Grant(principal, "SELECT", table.catalog, table.database, table.name) for principal in principals] ) filtered_tables.append(table) + continue if privilege == "WRITE_FILES": grants.extend( [ @@ -481,6 +522,7 @@ def _get_grants( ] ) filtered_tables.append(table) + continue if table.view_text is not None: grants.extend( [ @@ -496,26 +538,6 @@ def _get_grants( return grants - def _get_external_location( - self, - spn: AzureServicePrincipalInfo, - external_locations: list[ExternalLocationInfo], - permission_mappings: list[StoragePermissionMapping], - ) -> dict[str, str]: - matching_location = {} - for location in external_locations: - if location.url is None: - continue - for permission_mapping in permission_mappings: - if ( - location.url.startswith(permission_mapping.prefix) - and permission_mapping.client_id == spn.application_id - and spn.storage_account is not None - and spn.storage_account in permission_mapping.prefix - ): - matching_location[location.url] = permission_mapping.privilege - return matching_location - def _get_cluster_principal_mapping(self, cluster_id: str) -> list[str]: # gets all the users,groups,spn 
which have access to the clusters and returns a dataclass of that mapping principal_list = [] diff --git a/src/databricks/labs/ucx/hive_metastore/locations.py b/src/databricks/labs/ucx/hive_metastore/locations.py index 7c5760b483..fa94107940 100644 --- a/src/databricks/labs/ucx/hive_metastore/locations.py +++ b/src/databricks/labs/ucx/hive_metastore/locations.py @@ -44,7 +44,7 @@ def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLoc if not location: continue if location.startswith("dbfs:/mnt"): - location = self._resolve_mount(location, mounts) + location = self.resolve_mount(location, mounts) if ( not location.startswith("dbfs") and (self._prefix_size[0] < location.find(":/") < self._prefix_size[1]) @@ -55,7 +55,8 @@ def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLoc self._add_jdbc_location(external_locations, location, table) return external_locations - def _resolve_mount(self, location, mounts): + @staticmethod + def resolve_mount(location, mounts): for mount in mounts: if location[5:].startswith(mount.name.lower()): location = location[5:].replace(mount.name, mount.source) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index c6078a8803..48c46f4637 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -10,12 +10,9 @@ from databricks.sdk.service.compute import DataSecurityMode from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler -from databricks.labs.ucx.azure.access import AzureResourcePermissions -from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.hive_metastore import GrantsCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL -from databricks.labs.ucx.hive_metastore.locations import ExternalLocations, Mount +from 
databricks.labs.ucx.hive_metastore.grants import AzureACL, Grant, PrincipalACL +from databricks.labs.ucx.hive_metastore.locations import Mount from databricks.labs.ucx.hive_metastore.mapping import Rule from databricks.labs.ucx.hive_metastore.table_migrate import ( MigrationStatusRefresher, @@ -647,17 +644,14 @@ def test_migrate_managed_tables_with_principal_acl_azure( # pylint: disable=too ], } ) - azure_client = AzureAPIClient( - ws.config.arm_environment.resource_manager_endpoint, - ws.config.arm_environment.service_management_endpoint, - ) - locations = ExternalLocations(ws, sql_backend, inventory_schema) - graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") - azurerm = AzureResources(azure_client, graph_client) - resource_permissions = AzureResourcePermissions(installation, ws, azurerm, locations) - spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, inventory_schema) + principal_grants = PrincipalACL( - ws, sql_backend, installation, table_crawler, mount_crawler, spn_crawler, resource_permissions + ws, + sql_backend, + installation, + table_crawler, + mount_crawler, + AzureACL.for_cli(ws, installation), ) table_migrate = TablesMigrate( table_crawler, diff --git a/tests/unit/assessment/clusters/azure-spn-secret-interactive.json b/tests/unit/assessment/clusters/azure-spn-secret-interactive-multiple-spn.json similarity index 100% rename from tests/unit/assessment/clusters/azure-spn-secret-interactive.json rename to tests/unit/assessment/clusters/azure-spn-secret-interactive-multiple-spn.json diff --git a/tests/unit/assessment/test_azure.py b/tests/unit/assessment/test_azure.py index 002380c36f..7e5b2fce04 100644 --- a/tests/unit/assessment/test_azure.py +++ b/tests/unit/assessment/test_azure.py @@ -230,28 +230,26 @@ def test_get_cluster_to_storage_mapping_no_interactive_cluster_return_empty(): def test_get_cluster_to_storage_mapping_interactive_cluster_no_spn_return_empty(): - ws = 
workspace_client_mock(cluster_ids=['azure-spn-secret-interactive']) + ws = workspace_client_mock(cluster_ids=['azure-spn-secret-interactive-multiple-spn']) crawler = AzureServicePrincipalCrawler(ws, MockBackend(), "ucx") cluster_spn_info = crawler.get_cluster_to_storage_mapping() - spn_info = set( - [ - AzureServicePrincipalInfo( - application_id='Hello, World!', - secret_scope='abcff', - secret_key='sp_secret', - tenant_id='dedededede', - storage_account='abcde', - ), - AzureServicePrincipalInfo( - application_id='Hello, World!', - secret_scope='fgh', - secret_key='sp_secret2', - tenant_id='dedededede', - storage_account='fgh', - ), - ] - ) + spn_info = { + AzureServicePrincipalInfo( + application_id='Hello, World!', + secret_scope='abcff', + secret_key='sp_secret', + tenant_id='dedededede', + storage_account='abcde', + ), + AzureServicePrincipalInfo( + application_id='Hello, World!', + secret_scope='fgh', + secret_key='sp_secret2', + tenant_id='dedededede', + storage_account='fgh', + ), + } assert cluster_spn_info[0].cluster_id == "azure-spn-secret-interactive" assert len(cluster_spn_info[0].spn_info) == 2 diff --git a/tests/unit/hive_metastore/test_principal_grants.py b/tests/unit/hive_metastore/test_principal_grants.py index fc1d8f1fa6..6789dcbe70 100644 --- a/tests/unit/hive_metastore/test_principal_grants.py +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -16,9 +16,9 @@ from databricks.labs.ucx.azure.access import AzureResourcePermissions from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.config import WorkspaceConfig -from databricks.labs.ucx.hive_metastore import ExternalLocations, Mounts, TablesCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL -from databricks.labs.ucx.hive_metastore.locations import Mount +from databricks.labs.ucx.hive_metastore import Mounts, TablesCrawler +from databricks.labs.ucx.hive_metastore.grants import AzureACL, Grant, PrincipalACL 
+from databricks.labs.ucx.hive_metastore.locations import ExternalLocations, Mount from databricks.labs.ucx.hive_metastore.tables import Table @@ -60,11 +60,25 @@ def ws(): return w -def principal_acl(w, install, cluster_spn: list): - +def azure_acl(w, install, cluster_spn: list): config = install.load(WorkspaceConfig) sql_backend = StatementExecutionBackend(w, config.warehouse_id) locations = create_autospec(ExternalLocations) + azure_client = AzureAPIClient( + w.config.arm_environment.resource_manager_endpoint, + w.config.arm_environment.service_management_endpoint, + ) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + azurerm = AzureResources(azure_client, graph_client) + resource_permissions = AzureResourcePermissions(install, w, azurerm, locations) + spn_crawler = create_autospec(AzureServicePrincipalCrawler) + spn_crawler.get_cluster_to_storage_mapping.return_value = cluster_spn + return AzureACL(w, sql_backend, spn_crawler, resource_permissions) + + +def principal_acl(w, install, cluster_spn: list): + config = install.load(WorkspaceConfig) + sql_backend = StatementExecutionBackend(w, config.warehouse_id) table_crawler = create_autospec(TablesCrawler) tables = [ Table( @@ -100,16 +114,13 @@ def principal_acl(w, install, cluster_spn: list): mount_crawler.snapshot.return_value = [ Mount('/mnt/folder1', 'abfss://container1@storage1.dfs.core.windows.net/folder1') ] - azure_client = AzureAPIClient( - w.config.arm_environment.resource_manager_endpoint, - w.config.arm_environment.service_management_endpoint, - ) - graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") - azurerm = AzureResources(azure_client, graph_client) - resource_permissions = AzureResourcePermissions(install, w, azurerm, locations) + spn_crawler = create_autospec(AzureServicePrincipalCrawler) spn_crawler.get_cluster_to_storage_mapping.return_value = cluster_spn - return PrincipalACL(w, sql_backend, install, 
table_crawler, mount_crawler, spn_crawler, resource_permissions) + azure_locations = azure_acl(w, install, cluster_spn) + return PrincipalACL( + w, sql_backend, install, table_crawler, mount_crawler, azure_locations.get_eligible_locations_principals() + ) @pytest.fixture @@ -147,6 +158,10 @@ def installation(): ) +def test_for_cli_azure_acl(ws, installation): + assert isinstance(AzureACL.for_cli(ws, installation), AzureACL) + + def test_for_cli_azure(ws, installation): ws.config.is_azure = True assert isinstance(PrincipalACL.for_cli(ws, installation), PrincipalACL) @@ -165,23 +180,23 @@ def test_for_cli_gcp(ws, installation): assert PrincipalACL.for_cli(ws, installation) is None -def test_interactive_cluster_no_cluster_mapping(ws, installation): - grants = principal_acl(ws, installation, []) - grants.get_interactive_cluster_grants() +def test_get_eligible_locations_principals_no_cluster_mapping(ws, installation): + locations = azure_acl(ws, installation, []) + locations.get_eligible_locations_principals() ws.external_locations.list.assert_not_called() -def test_interactive_cluster_no_external_location(ws, installation): +def test_get_eligible_locations_principals_no_external_location(ws, installation): cluster_spn = ServicePrincipalClusterMapping( 'abc', {AzureServicePrincipalInfo(application_id='Hello, World!', storage_account='abcde')} ) - grants = principal_acl(ws, installation, [cluster_spn]) + locations = azure_acl(ws, installation, [cluster_spn]) ws.external_locations.list.return_value = [] with pytest.raises(ResourceDoesNotExist): - grants = grants.get_interactive_cluster_grants() + locations.get_eligible_locations_principals() -def test_interactive_cluster_no_permission_mapping(ws): +def test_get_eligible_locations_principals_no_permission_mapping(ws): cluster_spn = ServicePrincipalClusterMapping( 'abc', {AzureServicePrincipalInfo(application_id='Hello, World!', storage_account='abcde')} ) @@ -191,10 +206,20 @@ def 
test_interactive_cluster_no_permission_mapping(ws): "azure_storage_account_info.csv": [], } ) - grants = principal_acl(ws, install, [cluster_spn]) + locations = azure_acl(ws, install, [cluster_spn]) with pytest.raises(ResourceDoesNotExist): - grants.get_interactive_cluster_grants() + locations.get_eligible_locations_principals() + + +def test_get_eligible_locations_principals(ws, installation): + cluster_spn = ServicePrincipalClusterMapping( + 'abc', {AzureServicePrincipalInfo(application_id='client1', storage_account='storage1')} + ) + locations = azure_acl(ws, installation, [cluster_spn]) + eligible_locations = locations.get_eligible_locations_principals() + assert len(eligible_locations) == 1 + assert eligible_locations['abc'] == {'abfss://container1@storage1.dfs.core.windows.net/folder1': 'WRITE_FILES'} def test_interactive_cluster_no_acl(ws, installation): From fb281e7e1f6a55d1453433e0a40b1fe252ba339a Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 14:49:05 +0000 Subject: [PATCH 12/24] passing sql_backend in the cli --- src/databricks/labs/ucx/hive_metastore/grants.py | 3 +-- .../labs/ucx/hive_metastore/table_migrate.py | 2 +- src/databricks/labs/ucx/runtime.py | 4 ++-- tests/integration/hive_metastore/test_migrate.py | 14 +++++++------- tests/unit/hive_metastore/test_principal_grants.py | 9 ++++++--- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 3c42c8e4e9..6448918bfd 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -436,10 +436,9 @@ def __init__( self._cluster_locations = cluster_locations @classmethod - def for_cli(cls, ws: WorkspaceClient, installation: Installation): + def for_cli(cls, ws: WorkspaceClient, installation: Installation, sql_backend: SqlBackend): config = installation.load(WorkspaceConfig) - sql_backend = 
StatementExecutionBackend(ws, config.warehouse_id) table_crawler = TablesCrawler(sql_backend, config.inventory_database) mount_crawler = Mounts(sql_backend, ws, config.inventory_database) if ws.config.is_azure: diff --git a/src/databricks/labs/ucx/hive_metastore/table_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_migrate.py index 28c085bb78..ff69a824f4 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migrate.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migrate.py @@ -74,7 +74,7 @@ def for_cli(cls, ws: WorkspaceClient, product='ucx'): grants_crawler = GrantsCrawler(table_crawler, udfs_crawler) table_mapping = TableMapping(installation, ws, sql_backend) group_manager = GroupManager(sql_backend, ws, config.inventory_database) - principal_grants = PrincipalACL.for_cli(ws, installation) + principal_grants = PrincipalACL.for_cli(ws, installation, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, config.inventory_database, table_crawler) return cls( table_crawler, diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 85bae6880b..1a0e67d13a 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -438,7 +438,7 @@ def migrate_external_tables_sync( table_mappings = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) - interactive_grants = PrincipalACL.for_cli(ws, install) + interactive_grants = PrincipalACL.for_cli(ws, install, sql_backend) TablesMigrator( table_crawler, grant_crawler, @@ -466,7 +466,7 @@ def migrate_dbfs_root_delta_tables( table_mappings = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) - 
interactive_grants = PrincipalACL.for_cli(ws, install) + interactive_grants = PrincipalACL.for_cli(ws, install, sql_backend) TablesMigrator( table_crawler, grant_crawler, diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 440a20f1ae..34f8983709 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -72,7 +72,7 @@ def test_migrate_managed_tables(ws, sql_backend, inventory_schema, make_catalog, table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation()) + principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -141,7 +141,7 @@ def test_migrate_tables_with_cache_should_not_create_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation()) + principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -201,7 +201,7 @@ def test_migrate_external_table( # pylint: disable=too-many-locals ] group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation()) + principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -261,7 +261,7 @@ def 
test_migrate_external_table_failed_sync( ] group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation()) + principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -317,7 +317,7 @@ def test_revert_migrated_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation()) + principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -437,7 +437,7 @@ def test_mapping_reverts_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) group_manager = GroupManager(sql_backend, ws, inventory_schema) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation()) + principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -534,7 +534,7 @@ def test_migrate_managed_tables_with_acl( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation()) + principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, diff --git a/tests/unit/hive_metastore/test_principal_grants.py 
b/tests/unit/hive_metastore/test_principal_grants.py index 6789dcbe70..e485b34897 100644 --- a/tests/unit/hive_metastore/test_principal_grants.py +++ b/tests/unit/hive_metastore/test_principal_grants.py @@ -164,20 +164,23 @@ def test_for_cli_azure_acl(ws, installation): def test_for_cli_azure(ws, installation): ws.config.is_azure = True - assert isinstance(PrincipalACL.for_cli(ws, installation), PrincipalACL) + sql_backend = StatementExecutionBackend(ws, ws.config.warehouse_id) + assert isinstance(PrincipalACL.for_cli(ws, installation, sql_backend), PrincipalACL) def test_for_cli_aws(ws, installation): ws.config.is_azure = False ws.config.is_aws = True - assert PrincipalACL.for_cli(ws, installation) is None + sql_backend = StatementExecutionBackend(ws, ws.config.warehouse_id) + assert PrincipalACL.for_cli(ws, installation, sql_backend) is None def test_for_cli_gcp(ws, installation): ws.config.is_azure = False ws.config.is_aws = False ws.config.is_gcp = True - assert PrincipalACL.for_cli(ws, installation) is None + sql_backend = StatementExecutionBackend(ws, ws.config.warehouse_id) + assert PrincipalACL.for_cli(ws, installation, sql_backend) is None def test_get_eligible_locations_principals_no_cluster_mapping(ws, installation): From 69cd19afdcd5803024f382bf6104543b765bd16c Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 15:05:56 +0000 Subject: [PATCH 13/24] calling init directly in runtime --- src/databricks/labs/ucx/runtime.py | 42 ++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 1a0e67d13a..6cc7cd62fb 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -11,6 +11,8 @@ from databricks.labs.ucx.assessment.init_scripts import GlobalInitScriptCrawler from databricks.labs.ucx.assessment.jobs import JobsCrawler, SubmitRunsCrawler from databricks.labs.ucx.assessment.pipelines import 
PipelinesCrawler +from databricks.labs.ucx.azure.access import AzureResourcePermissions +from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.framework.tasks import task, trigger from databricks.labs.ucx.hive_metastore import ( @@ -19,7 +21,7 @@ Mounts, TablesCrawler, ) -from databricks.labs.ucx.hive_metastore.grants import PrincipalACL +from databricks.labs.ucx.hive_metastore.grants import AzureACL, PrincipalACL from databricks.labs.ucx.hive_metastore.locations import TablesInMounts from databricks.labs.ucx.hive_metastore.mapping import TableMapping from databricks.labs.ucx.hive_metastore.table_migrate import ( @@ -438,7 +440,22 @@ def migrate_external_tables_sync( table_mappings = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) - interactive_grants = PrincipalACL.for_cli(ws, install, sql_backend) + mount_crawler = Mounts(sql_backend, ws, cfg.inventory_database) + cluster_locations = {} + if ws.config.is_azure: + locations = ExternalLocations(ws, sql_backend, cfg.inventory_database) + azure_client = AzureAPIClient( + ws.config.arm_environment.resource_manager_endpoint, + ws.config.arm_environment.service_management_endpoint, + ) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + azurerm = AzureResources(azure_client, graph_client) + resource_permissions = AzureResourcePermissions(install, ws, azurerm, locations) + spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, cfg.inventory_database) + cluster_locations = AzureACL( + ws, sql_backend, spn_crawler, resource_permissions + ).get_eligible_locations_principals() + interactive_grants = PrincipalACL(ws, sql_backend, install, table_crawler, mount_crawler, cluster_locations) TablesMigrator( 
table_crawler, grant_crawler, @@ -463,16 +480,31 @@ def migrate_dbfs_root_delta_tables( table_crawler = TablesCrawler(sql_backend, cfg.inventory_database) udf_crawler = UdfsCrawler(sql_backend, cfg.inventory_database) grant_crawler = GrantsCrawler(table_crawler, udf_crawler) - table_mappings = TableMapping(install, ws, sql_backend) + table_mapping = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) - interactive_grants = PrincipalACL.for_cli(ws, install, sql_backend) + mount_crawler = Mounts(sql_backend, ws, cfg.inventory_database) + cluster_locations = {} + if ws.config.is_azure: + locations = ExternalLocations(ws, sql_backend, cfg.inventory_database) + azure_client = AzureAPIClient( + ws.config.arm_environment.resource_manager_endpoint, + ws.config.arm_environment.service_management_endpoint, + ) + graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com") + azurerm = AzureResources(azure_client, graph_client) + resource_permissions = AzureResourcePermissions(install, ws, azurerm, locations) + spn_crawler = AzureServicePrincipalCrawler(ws, sql_backend, cfg.inventory_database) + cluster_locations = AzureACL( + ws, sql_backend, spn_crawler, resource_permissions + ).get_eligible_locations_principals() + interactive_grants = PrincipalACL(ws, sql_backend, install, table_crawler, mount_crawler, cluster_locations) TablesMigrator( table_crawler, grant_crawler, ws, sql_backend, - table_mappings, + table_mapping, group_manager, migration_status_refresher, interactive_grants, From bd561e6ec0658b22f59c599c7d8ae0da5e041e22 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 15:09:15 +0000 Subject: [PATCH 14/24] formatting --- src/databricks/labs/ucx/runtime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/runtime.py 
b/src/databricks/labs/ucx/runtime.py index 6cc7cd62fb..ddf6272dd9 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -480,7 +480,7 @@ def migrate_dbfs_root_delta_tables( table_crawler = TablesCrawler(sql_backend, cfg.inventory_database) udf_crawler = UdfsCrawler(sql_backend, cfg.inventory_database) grant_crawler = GrantsCrawler(table_crawler, udf_crawler) - table_mapping = TableMapping(install, ws, sql_backend) + table_mappings = TableMapping(install, ws, sql_backend) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, cfg.inventory_database, table_crawler) group_manager = GroupManager(sql_backend, ws, cfg.inventory_database) mount_crawler = Mounts(sql_backend, ws, cfg.inventory_database) @@ -504,7 +504,7 @@ def migrate_dbfs_root_delta_tables( grant_crawler, ws, sql_backend, - table_mapping, + table_mappings, group_manager, migration_status_refresher, interactive_grants, From 4f6c7f4b0225af817f2a03303b91ed4e499a2b31 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 19:33:07 +0000 Subject: [PATCH 15/24] splitting big method into two to reduce pylint warning --- .../hive_metastore/test_migrate.py | 127 ++++++++++-------- 1 file changed, 70 insertions(+), 57 deletions(-) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 34f8983709..579d8878bc 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -559,63 +559,32 @@ def test_migrate_managed_tables_with_acl( assert target_table_grants.privilege_assignments[0].privileges == [Privilege.MODIFY, Privilege.SELECT] -@retried(on=[NotFound], timeout=timedelta(minutes=3)) -def test_migrate_managed_tables_with_principal_acl_azure( # pylint: disable=too-many-arguments
make_cluster, - make_cluster_permissions, env_or_skip, make_dbfs_data_copy, - make_random, -): # pylint: disable=too-many-locals - if not ws.config.is_azure: - pytest.skip("temporary: only works in azure test env") - user = make_user() - cluster = make_cluster(single_node=True, spark_conf=_SPARK_CONF, data_security_mode=DataSecurityMode.NONE) - make_cluster_permissions( - object_id=cluster.cluster_id, - permission_level=PermissionLevel.CAN_MANAGE, - user_name=user.user_name, - ) - src_schema = make_schema(catalog_name="hive_metastore") + make_table, + make_catalog, +): existing_mounted_location = f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/c' - new_mounted_location = f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/{make_random(4)}' + new_mounted_location = f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/{inventory_schema}' make_dbfs_data_copy(src_path=existing_mounted_location, dst_path=new_mounted_location) - src_external_table = make_table(schema_name=src_schema.name, external_csv=new_mounted_location) - + src_external_table = make_table(external_csv=new_mounted_location) + src_schema = src_external_table.schema_name dst_catalog = make_catalog() - dst_schema = make_schema(catalog_name=dst_catalog.name, name=src_schema.name) - - logger.info(f"dst_catalog={dst_catalog.name}, managed_table={src_external_table.full_name}") - - table_crawler = StaticTablesCrawler(sql_backend, inventory_schema, [src_external_table]) - udf_crawler = StaticUdfsCrawler(sql_backend, inventory_schema, []) - grant_crawler = StaticGrantsCrawler(table_crawler, udf_crawler, []) - mount_crawler = StaticMountCrawler( - [Mount(f'/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a', 'abfss://things@labsazurethings.dfs.core.windows.net/a')], - sql_backend, - ws, - inventory_schema, - ) rules = [ Rule( "workspace", dst_catalog.name, - src_schema.name, - dst_schema.name, + src_schema, + src_schema, src_external_table.name, src_external_table.name, ), ] - table_mapping = StaticTableMapping(ws, sql_backend, 
rules=rules) - group_manager = GroupManager(sql_backend, ws, inventory_schema) - migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) installation = MockInstallation( { "config.yml": { @@ -640,33 +609,77 @@ def test_migrate_managed_tables_with_principal_acl_azure( # pylint: disable=too ws, sql_backend, installation, - table_crawler, - mount_crawler, + StaticTablesCrawler(sql_backend, inventory_schema, [src_external_table]), + StaticMountCrawler( + [ + Mount( + f'/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a', 'abfss://things@labsazurethings.dfs.core.windows.net/a' + ) + ], + sql_backend, + ws, + inventory_schema, + ), AzureACL.for_cli(ws, installation), ) table_migrate = TablesMigrator( - table_crawler, - grant_crawler, + StaticTablesCrawler(sql_backend, inventory_schema, [src_external_table]), + StaticGrantsCrawler( + StaticTablesCrawler(sql_backend, inventory_schema, [src_external_table]), + StaticUdfsCrawler(sql_backend, inventory_schema, []), + [], + ), ws, sql_backend, - table_mapping, - group_manager, - migration_status_refresher, + StaticTableMapping(ws, sql_backend, rules=rules), + GroupManager(sql_backend, ws, inventory_schema), + MigrationStatusRefresher( + ws, sql_backend, inventory_schema, StaticTablesCrawler(sql_backend, inventory_schema, [src_external_table]) + ), principal_grants, ) + return table_migrate - table_migrate.migrate_tables(acl_strategy=[AclMigrationWhat.PRINCIPAL]) - target_tables = list(sql_backend.fetch(f"SHOW TABLES IN {dst_schema.full_name}")) - assert len(target_tables) == 1 +@retried(on=[NotFound], timeout=timedelta(minutes=3)) +def test_migrate_managed_tables_with_principal_acl_azure( + ws, + inventory_schema, + make_catalog, + make_table, + make_user, + make_dbfs_data_copy, + test_prepare_principal_acl, + make_cluster_permissions, + make_cluster, +): + if not ws.config.is_azure: + pytest.skip("temporary: only works in azure test env") + table_migrate = test_prepare_principal_acl + user 
= make_user() + cluster = make_cluster(single_node=True, spark_conf=_SPARK_CONF, data_security_mode=DataSecurityMode.NONE) + make_cluster_permissions( + object_id=cluster.cluster_id, + permission_level=PermissionLevel.CAN_USE, + user_name=user.user_name, + ) + new_mounted_location = f'dbfs:/mnt/things/a/b/{inventory_schema}' + make_dbfs_data_copy(src_path='dbfs:/mnt/things/a/b/c', dst_path=new_mounted_location) + src_external_table = make_table(external_csv=new_mounted_location) + src_schema = src_external_table.schema_name - target_table_properties = ws.tables.get(f"{dst_schema.full_name}.{src_external_table.name}").properties - target_table_grants = ws.grants.get(SecurableType.TABLE, f"{dst_schema.full_name}.{src_external_table.name}") - assert target_table_properties["upgraded_from"] == src_external_table.full_name - assert target_table_properties[Table.UPGRADED_FROM_WS_PARAM] == str(ws.get_workspace_id()) + dst_catalog = make_catalog() + + logger.info(f"dst_catalog={dst_catalog.name}, managed_table={src_external_table.full_name}") + + table_migrate.migrate_tables(acl_strategy=[AclMigrationWhat.PRINCIPAL]) + + target_table_grants = ws.grants.get( + SecurableType.TABLE, f"{dst_catalog.name}.{src_schema}.{src_external_table.name}" + ) match = False - for assignment in target_table_grants.privilege_assignments: - if assignment.principal == user.user_name and assignment.privileges == [Privilege.ALL_PRIVILEGES]: + for _ in target_table_grants.privilege_assignments: + if _.principal == user.user_name and _.privileges == [Privilege.ALL_PRIVILEGES]: match = True break assert match From 752797de79e751f2b414a782ebc653c7b26d75a9 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 19:53:11 +0000 Subject: [PATCH 16/24] review comments --- .../labs/ucx/hive_metastore/grants.py | 30 +++++++++---------- src/databricks/labs/ucx/runtime.py | 4 +-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py 
b/src/databricks/labs/ucx/hive_metastore/grants.py index 6448918bfd..3626630d8d 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -341,12 +341,12 @@ def __init__( ws: WorkspaceClient, backend: SqlBackend, spn_crawler: AzureServicePrincipalCrawler, - resource_permission: AzureResourcePermissions, + resource_permissions: AzureResourcePermissions, ): self._backend = backend self._ws = ws self._spn_crawler = spn_crawler - self._resource_permission = resource_permission + self._resource_permissions = resource_permissions @classmethod def for_cli(cls, ws: WorkspaceClient, installation: Installation): @@ -376,16 +376,16 @@ def get_eligible_locations_principals(self) -> dict[str, dict]: # if there are no external locations, then throw an error to run migrate_locations cli command msg = ( "No external location found, If hive metastore tables are created in external storage, " - "ensure migrate_locations cli cmd is run to create the required locations." + "ensure migrate-locations cli cmd is run to create the required locations." ) logger.error(msg) raise ResourceDoesNotExist(msg) from None - permission_mappings = self._resource_permission.load() + permission_mappings = self._resource_permissions.load() if len(permission_mappings) == 0: # if permission mapping is empty, raise an error to run principal_prefix cmd msg = ( - "No storage permission file found. Please ensure principal_prefix_access cli " + "No storage permission file found. Please ensure principal-prefix-access cli " "cmd is run to create the access permission file." 
) logger.error(msg) @@ -424,22 +424,22 @@ def __init__( ws: WorkspaceClient, backend: SqlBackend, installation: Installation, - table_crawler: TablesCrawler, - mount_crawler: Mounts, + tables_crawler: TablesCrawler, + mounts_crawler: Mounts, cluster_locations: dict[str, dict], ): self._backend = backend self._ws = ws self._installation = installation - self._table_crawler = table_crawler - self._mount_crawler = mount_crawler + self._tables_crawler = tables_crawler + self._mounts_crawler = mounts_crawler self._cluster_locations = cluster_locations @classmethod def for_cli(cls, ws: WorkspaceClient, installation: Installation, sql_backend: SqlBackend): config = installation.load(WorkspaceConfig) - table_crawler = TablesCrawler(sql_backend, config.inventory_database) + tables_crawler = TablesCrawler(sql_backend, config.inventory_database) mount_crawler = Mounts(sql_backend, ws, config.inventory_database) if ws.config.is_azure: azure_acl = AzureACL.for_cli(ws, installation) @@ -447,7 +447,7 @@ def for_cli(cls, ws: WorkspaceClient, installation: Installation, sql_backend: S ws, sql_backend, installation, - table_crawler, + tables_crawler, mount_crawler, azure_acl.get_eligible_locations_principals(), ) @@ -459,16 +459,16 @@ def for_cli(cls, ws: WorkspaceClient, installation: Installation, sql_backend: S return None def get_interactive_cluster_grants(self) -> list[Grant]: - tables = self._table_crawler.snapshot() - mounts = list(self._mount_crawler.snapshot()) + tables = self._tables_crawler.snapshot() + mounts = list(self._mounts_crawler.snapshot()) grants: set[Grant] = set() for cluster_id, locations in self._cluster_locations.items(): principals = self._get_cluster_principal_mapping(cluster_id) if len(principals) == 0: continue - grant = self._get_grants(locations, principals, tables, mounts) - grants.update(grant) + cluster_usage = self._get_grants(locations, principals, tables, mounts) + grants.update(cluster_usage) catalog_grants = [Grant(principal, "USE", 
"hive_metastore") for principal in principals] grants.update(catalog_grants) diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index ddf6272dd9..6f37654412 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -465,7 +465,7 @@ def migrate_external_tables_sync( group_manager, migration_status_refresher, interactive_grants, - ).migrate_tables(what=What.EXTERNAL_SYNC, acl_strategy=[AclMigrationWhat.LEGACY_TACL]) + ).migrate_tables(what=What.EXTERNAL_SYNC, acl_strategy=[AclMigrationWhat.LEGACY_TACL, AclMigrationWhat.PRINCIPAL]) @task("migrate-tables", job_cluster="table_migration") @@ -508,7 +508,7 @@ def migrate_dbfs_root_delta_tables( group_manager, migration_status_refresher, interactive_grants, - ).migrate_tables(what=What.DBFS_ROOT_DELTA, acl_strategy=[AclMigrationWhat.LEGACY_TACL]) + ).migrate_tables(what=What.DBFS_ROOT_DELTA, acl_strategy=[AclMigrationWhat.LEGACY_TACL, AclMigrationWhat.PRINCIPAL]) @task("migrate-groups-experimental", depends_on=[crawl_groups]) From be7e757035986006e9ee9e56994a3bc5a3e3beb6 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 19:56:38 +0000 Subject: [PATCH 17/24] naming standards --- src/databricks/labs/ucx/hive_metastore/grants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 3626630d8d..316f4250ca 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -393,11 +393,11 @@ def get_eligible_locations_principals(self) -> dict[str, dict]: for cluster_spn in spn_cluster_mapping: for spn in cluster_spn.spn_info: - eligible_locations.update(self._get_external_location(spn, external_locations, permission_mappings)) + eligible_locations.update(self._get_external_locations(spn, external_locations, permission_mappings)) cluster_locations[cluster_spn.cluster_id] 
= eligible_locations return cluster_locations - def _get_external_location( + def _get_external_locations( self, spn: AzureServicePrincipalInfo, external_locations: list[ExternalLocationInfo], From 4abe7afd3ba4a8d29d8d89d43298898848a3c7a0 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 20:14:47 +0000 Subject: [PATCH 18/24] fixing storage account extraction in AzureACL --- src/databricks/labs/ucx/hive_metastore/grants.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 316f4250ca..6800b00537 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -408,11 +408,13 @@ def _get_external_locations( if location.url is None: continue for permission_mapping in permission_mappings: + prefix = permission_mapping.prefix if ( location.url.startswith(permission_mapping.prefix) and permission_mapping.client_id == spn.application_id and spn.storage_account is not None - and spn.storage_account in permission_mapping.prefix + # check for storage account name starting after @ in the prefix url + and prefix[prefix.index('@') + 1 :].startswith(spn.storage_account) ): matching_location[location.url] = permission_mapping.privilege return matching_location From cf65f47fb13a4fee4032482ff41b12f4555da50e Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sat, 30 Mar 2024 21:38:36 +0000 Subject: [PATCH 19/24] timeout change to 3 mins --- tests/integration/hive_metastore/test_migrate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 579d8878bc..bc9ed12ad8 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -499,7 +499,7 @@ def test_mapping_reverts_table( assert "upgraded_to" not in results2 
-@retried(on=[NotFound], timeout=timedelta(minutes=2)) +@retried(on=[NotFound], timeout=timedelta(minutes=3)) def test_migrate_managed_tables_with_acl( ws, sql_backend, inventory_schema, make_catalog, make_schema, make_table, make_user ): # pylint: disable=too-many-locals From b23d2d675a2060d0813b5ee229d4bfda0469da0b Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sun, 31 Mar 2024 08:22:24 +0100 Subject: [PATCH 20/24] fixes to int test failues --- .../hive_metastore/test_migrate.py | 109 ++++++++++++------ 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index bc9ed12ad8..00f9a289e2 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -43,6 +43,27 @@ } +def principal_acl(ws, inventory_schema, sql_backend): + installation = MockInstallation( + { + "config.yml": { + 'inventory_database': inventory_schema, + }, + "azure_storage_account_info.csv": [ + { + 'prefix': 'dummy_prefix', + 'client_id': 'dummy_application_id', + 'principal': 'dummy_principal', + 'privilege': 'WRITE_FILES', + 'type': 'Application', + 'directory_id': 'dummy_directory', + } + ], + } + ) + return PrincipalACL.for_cli(ws, installation, sql_backend) + + @retried(on=[NotFound], timeout=timedelta(minutes=2)) def test_migrate_managed_tables(ws, sql_backend, inventory_schema, make_catalog, make_schema, make_table): # pylint: disable=too-many-locals @@ -72,7 +93,7 @@ def test_migrate_managed_tables(ws, sql_backend, inventory_schema, make_catalog, table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) + principal_grants = principal_acl(ws, inventory_schema, sql_backend) 
table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -141,7 +162,7 @@ def test_migrate_tables_with_cache_should_not_create_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) + principal_grants = principal_acl(ws, inventory_schema, sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -201,7 +222,7 @@ def test_migrate_external_table( # pylint: disable=too-many-locals ] group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) + principal_grants = principal_acl(ws, inventory_schema, sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -261,7 +282,7 @@ def test_migrate_external_table_failed_sync( ] group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) + principal_grants = principal_acl(ws, inventory_schema, sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -317,7 +338,7 @@ def test_revert_migrated_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) + principal_grants = principal_acl(ws, inventory_schema, sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ 
-437,7 +458,7 @@ def test_mapping_reverts_table( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) group_manager = GroupManager(sql_backend, ws, inventory_schema) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) + principal_grants = principal_acl(ws, inventory_schema, sql_backend) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -534,7 +555,36 @@ def test_migrate_managed_tables_with_acl( table_mapping = StaticTableMapping(ws, sql_backend, rules=rules) group_manager = GroupManager(sql_backend, ws, inventory_schema) migration_status_refresher = MigrationStatusRefresher(ws, sql_backend, inventory_schema, table_crawler) - principal_grants = PrincipalACL.for_cli(ws, MockInstallation(), sql_backend) + installation = MockInstallation( + { + "config.yml": { + 'inventory_database': inventory_schema, + }, + "azure_storage_account_info.csv": [ + { + 'prefix': 'dummy_prefix', + 'client_id': 'dummy_application_id', + 'principal': 'dummy_principal', + 'privilege': 'WRITE_FILES', + 'type': 'Application', + 'directory_id': 'dummy_directory', + } + ], + } + ) + principal_grants = PrincipalACL( + ws, + sql_backend, + installation, + StaticTablesCrawler(sql_backend, inventory_schema, [src_managed_table]), + StaticMountCrawler( + [Mount('dummy_mount', 'abfss://dummy@dummy.dfs.core.windows.net/a')], + sql_backend, + ws, + inventory_schema, + ), + AzureACL.for_cli(ws, installation).get_eligible_locations_principals(), + ) table_migrate = TablesMigrator( table_crawler, grant_crawler, @@ -568,19 +618,24 @@ def test_prepare_principal_acl( make_dbfs_data_copy, make_table, make_catalog, + make_schema, + make_cluster, ): - existing_mounted_location = f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/c' + cluster = make_cluster(single_node=True, spark_conf=_SPARK_CONF, data_security_mode=DataSecurityMode.NONE) new_mounted_location = 
f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/{inventory_schema}' - make_dbfs_data_copy(src_path=existing_mounted_location, dst_path=new_mounted_location) - src_external_table = make_table(external_csv=new_mounted_location) - src_schema = src_external_table.schema_name + make_dbfs_data_copy(src_path=f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/c', dst_path=new_mounted_location) + src_schema = make_schema(catalog_name="hive_metastore") + src_external_table = make_table( + catalog_name=src_schema.catalog_name, schema_name=src_schema.name, external_csv=new_mounted_location + ) dst_catalog = make_catalog() + dst_schema = make_schema(catalog_name=dst_catalog.name, name=src_schema.name) rules = [ Rule( "workspace", dst_catalog.name, - src_schema, - src_schema, + src_schema.name, + dst_schema.name, src_external_table.name, src_external_table.name, ), @@ -620,7 +675,7 @@ def test_prepare_principal_acl( ws, inventory_schema, ), - AzureACL.for_cli(ws, installation), + AzureACL.for_cli(ws, installation).get_eligible_locations_principals(), ) table_migrate = TablesMigrator( StaticTablesCrawler(sql_backend, inventory_schema, [src_external_table]), @@ -638,45 +693,29 @@ def test_prepare_principal_acl( ), principal_grants, ) - return table_migrate + return table_migrate, f"{dst_catalog.name}.{dst_schema.name}.{src_external_table.name}", cluster.cluster_id @retried(on=[NotFound], timeout=timedelta(minutes=3)) def test_migrate_managed_tables_with_principal_acl_azure( ws, - inventory_schema, - make_catalog, - make_table, make_user, - make_dbfs_data_copy, test_prepare_principal_acl, make_cluster_permissions, make_cluster, ): if not ws.config.is_azure: pytest.skip("temporary: only works in azure test env") - table_migrate = test_prepare_principal_acl + table_migrate, table_full_name, cluster_id = test_prepare_principal_acl user = make_user() - cluster = make_cluster(single_node=True, spark_conf=_SPARK_CONF, data_security_mode=DataSecurityMode.NONE) make_cluster_permissions( - 
object_id=cluster.cluster_id, - permission_level=PermissionLevel.CAN_USE, + object_id=cluster_id, + permission_level=PermissionLevel.CAN_ATTACH_TO, user_name=user.user_name, ) - new_mounted_location = f'dbfs:/mnt/things/a/b/{inventory_schema}' - make_dbfs_data_copy(src_path='dbfs:/mnt/things/a/b/c', dst_path=new_mounted_location) - src_external_table = make_table(external_csv=new_mounted_location) - src_schema = src_external_table.schema_name - - dst_catalog = make_catalog() - - logger.info(f"dst_catalog={dst_catalog.name}, managed_table={src_external_table.full_name}") - table_migrate.migrate_tables(acl_strategy=[AclMigrationWhat.PRINCIPAL]) - target_table_grants = ws.grants.get( - SecurableType.TABLE, f"{dst_catalog.name}.{src_schema}.{src_external_table.name}" - ) + target_table_grants = ws.grants.get(SecurableType.TABLE, table_full_name) match = False for _ in target_table_grants.privilege_assignments: if _.principal == user.user_name and _.privileges == [Privilege.ALL_PRIVILEGES]: From 4d2ba619c75c745257c5801d77c174f2c6d8ac2e Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sun, 31 Mar 2024 17:51:19 +0100 Subject: [PATCH 21/24] removing circular references for Grants crawler and fixing test_installation failures --- .../labs/ucx/hive_metastore/__init__.py | 4 +- .../labs/ucx/hive_metastore/table_migrate.py | 4 +- src/databricks/labs/ucx/runtime.py | 9 ++- .../labs/ucx/workspace_access/manager.py | 3 +- .../labs/ucx/workspace_access/tacl.py | 3 +- tests/integration/conftest.py | 4 +- .../integration/hive_metastore/test_grants.py | 2 +- .../hive_metastore/test_migrate.py | 8 ++- tests/integration/test_installation.py | 71 +++++++++++++------ .../workspace_access/test_groups.py | 3 +- .../integration/workspace_access/test_tacl.py | 2 +- .../unit/hive_metastore/test_table_migrate.py | 3 +- tests/unit/workspace_access/test_tacl.py | 4 +- 13 files changed, 76 insertions(+), 44 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/__init__.py 
b/src/databricks/labs/ucx/hive_metastore/__init__.py index df09a1be5a..7390e15854 100644 --- a/src/databricks/labs/ucx/hive_metastore/__init__.py +++ b/src/databricks/labs/ucx/hive_metastore/__init__.py @@ -1,4 +1,4 @@ -from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler +# from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler from databricks.labs.ucx.hive_metastore.locations import ( ExternalLocations, Mounts, @@ -6,4 +6,4 @@ ) from databricks.labs.ucx.hive_metastore.tables import TablesCrawler -__all__ = ["TablesCrawler", "GrantsCrawler", "Mounts", "ExternalLocations", "TablesInMounts"] +__all__ = ["TablesCrawler", "Mounts", "ExternalLocations", "TablesInMounts"] diff --git a/src/databricks/labs/ucx/hive_metastore/table_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_migrate.py index ff69a824f4..11bcfa37c8 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migrate.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migrate.py @@ -14,8 +14,8 @@ from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier -from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL +from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler, PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import ( AclMigrationWhat, diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 6f37654412..928c56fa41 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -15,13 +15,12 @@ from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.config import 
WorkspaceConfig from databricks.labs.ucx.framework.tasks import task, trigger -from databricks.labs.ucx.hive_metastore import ( - ExternalLocations, +from databricks.labs.ucx.hive_metastore import ExternalLocations, Mounts, TablesCrawler +from databricks.labs.ucx.hive_metastore.grants import ( + AzureACL, GrantsCrawler, - Mounts, - TablesCrawler, + PrincipalACL, ) -from databricks.labs.ucx.hive_metastore.grants import AzureACL, PrincipalACL from databricks.labs.ucx.hive_metastore.locations import TablesInMounts from databricks.labs.ucx.hive_metastore.mapping import TableMapping from databricks.labs.ucx.hive_metastore.table_migrate import ( diff --git a/src/databricks/labs/ucx/workspace_access/manager.py b/src/databricks/labs/ucx/workspace_access/manager.py index f37762e90d..832d7e7471 100644 --- a/src/databricks/labs/ucx/workspace_access/manager.py +++ b/src/databricks/labs/ucx/workspace_access/manager.py @@ -14,7 +14,8 @@ Dataclass, DataclassInstance, ) -from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler +from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.workspace_access import generic, redash, scim, secrets from databricks.labs.ucx.workspace_access.base import AclSupport, Permissions diff --git a/src/databricks/labs/ucx/workspace_access/tacl.py b/src/databricks/labs/ucx/workspace_access/tacl.py index b15e9f085e..32c3e8a3bd 100644 --- a/src/databricks/labs/ucx/workspace_access/tacl.py +++ b/src/databricks/labs/ucx/workspace_access/tacl.py @@ -9,8 +9,7 @@ from databricks.labs.lsql.backends import SqlBackend from databricks.sdk.retries import retried -from databricks.labs.ucx.hive_metastore import GrantsCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler from 
databricks.labs.ucx.workspace_access.base import AclSupport, Permissions from databricks.labs.ucx.workspace_access.groups import MigrationState diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index d51d84a1d8..9f3ee74b93 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -15,8 +15,8 @@ AzureServicePrincipalCrawler, AzureServicePrincipalInfo, ) -from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler from databricks.labs.ucx.hive_metastore.locations import Mount, Mounts from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import Table diff --git a/tests/integration/hive_metastore/test_grants.py b/tests/integration/hive_metastore/test_grants.py index c739e8f1c3..0615b4f8af 100644 --- a/tests/integration/hive_metastore/test_grants.py +++ b/tests/integration/hive_metastore/test_grants.py @@ -5,7 +5,7 @@ from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried -from databricks.labs.ucx.hive_metastore import GrantsCrawler +from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler from ..conftest import StaticTablesCrawler, StaticUdfsCrawler diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 00f9a289e2..e456302906 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -10,8 +10,12 @@ from databricks.sdk.service.compute import DataSecurityMode from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.hive_metastore import GrantsCrawler -from databricks.labs.ucx.hive_metastore.grants import AzureACL, Grant, PrincipalACL +from 
databricks.labs.ucx.hive_metastore.grants import ( + AzureACL, + Grant, + GrantsCrawler, + PrincipalACL, +) from databricks.labs.ucx.hive_metastore.locations import Mount from databricks.labs.ucx.hive_metastore.mapping import Rule from databricks.labs.ucx.hive_metastore.table_migrate import ( diff --git a/tests/integration/test_installation.py b/tests/integration/test_installation.py index a6a6f1095c..bbe45332c0 100644 --- a/tests/integration/test_installation.py +++ b/tests/integration/test_installation.py @@ -24,8 +24,10 @@ from databricks.sdk.service.iam import PermissionLevel import databricks +from databricks.labs.ucx.azure.access import StoragePermissionMapping from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.locations import Mount from databricks.labs.ucx.hive_metastore.mapping import Rule from databricks.labs.ucx.install import WorkspaceInstallation, WorkspaceInstaller from databricks.labs.ucx.installer.workflows import WorkflowsInstallation @@ -556,8 +558,17 @@ def test_check_inventory_database_exists(ws, new_installation): @retried(on=[NotFound], timeout=timedelta(minutes=10)) def test_table_migration_job( - ws, new_installation, make_catalog, make_schema, make_table, env_or_skip, make_random, make_dbfs_data_copy + ws, + new_installation, + make_catalog, + make_schema, + make_table, + env_or_skip, + make_random, + make_dbfs_data_copy, + sql_backend, ): + # skip this test if not in nightly test job or debug mode if os.path.basename(sys.argv[0]) not in {"_jb_pytest_runner.py", "testlauncher.py"}: env_or_skip("TEST_NIGHTLY") @@ -585,26 +596,46 @@ def test_table_migration_job( inventory_schema_suffix="_migrate_inventory", ) installation = product_info.current_installation(ws) - migrate_rules = [ - Rule( - "ws_name", - dst_catalog.name, - src_schema.name, - dst_schema.name, - src_managed_table.name, - src_managed_table.name, - ), - Rule( - "ws_name", - 
dst_catalog.name, - src_schema.name, - dst_schema.name, - src_external_table.name, - src_external_table.name, - ), - ] - installation.save(migrate_rules, filename='mapping.csv') + installation.save( + [ + Rule( + "ws_name", + dst_catalog.name, + src_schema.name, + dst_schema.name, + src_managed_table.name, + src_managed_table.name, + ), + Rule( + "ws_name", + dst_catalog.name, + src_schema.name, + dst_schema.name, + src_external_table.name, + src_external_table.name, + ), + ], + filename='mapping.csv', + ) + sql_backend.save_table( + f"{installation.load(WorkspaceConfig).inventory_database}.mounts", + [Mount(f'/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a', 'abfss://things@labsazurethings.dfs.core.windows.net/a')], + Mount, + ) + installation.save( + [ + StoragePermissionMapping( + 'abfss://things@labsazurethings.dfs.core.windows.net', + 'dummy_application_id', + 'principal_1', + 'WRITE_FILES', + 'Application', + 'directory_id_ss1', + ) + ], + filename='azure_storage_account_info.csv', + ) workflows_install.run_workflow("migrate-tables") # assert the workflow is successful assert workflows_install.validate_step("migrate-tables") diff --git a/tests/integration/workspace_access/test_groups.py b/tests/integration/workspace_access/test_groups.py index fc8ab45ef4..a158ded514 100644 --- a/tests/integration/workspace_access/test_groups.py +++ b/tests/integration/workspace_access/test_groups.py @@ -7,8 +7,7 @@ from databricks.sdk.retries import retried from databricks.sdk.service.iam import Group, PermissionLevel, ResourceMeta -from databricks.labs.ucx.hive_metastore import GrantsCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler from databricks.labs.ucx.workspace_access.generic import ( GenericPermissionsSupport, Listing, diff --git a/tests/integration/workspace_access/test_tacl.py b/tests/integration/workspace_access/test_tacl.py index 6851784c88..9e15907a4a 100644 --- 
a/tests/integration/workspace_access/test_tacl.py +++ b/tests/integration/workspace_access/test_tacl.py @@ -2,7 +2,7 @@ import logging from collections import defaultdict -from databricks.labs.ucx.hive_metastore import GrantsCrawler +from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler from databricks.labs.ucx.workspace_access.base import Permissions from databricks.labs.ucx.workspace_access.groups import MigratedGroup, MigrationState from databricks.labs.ucx.workspace_access.tacl import TableAclSupport diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 268e44f0f8..0d8dee2b32 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -8,8 +8,7 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.service.catalog import CatalogInfo, SchemaInfo, TableInfo -from databricks.labs.ucx.hive_metastore import GrantsCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant, PrincipalACL +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler, PrincipalACL from databricks.labs.ucx.hive_metastore.mapping import ( Rule, TableMapping, diff --git a/tests/unit/workspace_access/test_tacl.py b/tests/unit/workspace_access/test_tacl.py index 1bceef9060..e8c6b1519c 100644 --- a/tests/unit/workspace_access/test_tacl.py +++ b/tests/unit/workspace_access/test_tacl.py @@ -3,8 +3,8 @@ import pytest from databricks.labs.lsql.backends import MockBackend -from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.workspace_access.base import Permissions from databricks.labs.ucx.workspace_access.groups import 
MigratedGroup, MigrationState From f2af3020f3312454250bde85ce739cab2874aede Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sun, 31 Mar 2024 18:51:43 +0100 Subject: [PATCH 22/24] removing circular references for Grants crawler and fixing test_installation failures --- tests/integration/test_installation.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_installation.py b/tests/integration/test_installation.py index bbe45332c0..2fd329e577 100644 --- a/tests/integration/test_installation.py +++ b/tests/integration/test_installation.py @@ -651,7 +651,7 @@ def test_table_migration_job( @retried(on=[NotFound], timeout=timedelta(minutes=5)) def test_table_migration_job_cluster_override( # pylint: disable=too-many-locals - ws, new_installation, make_catalog, make_schema, make_table, env_or_skip, make_random, make_dbfs_data_copy + ws, new_installation, make_catalog, make_schema, make_table, env_or_skip, make_random, make_dbfs_data_copy, sql_backend ): # create external and managed tables to be migrated src_schema = make_schema(catalog_name="hive_metastore", name=f"migrate_{make_random(5).lower()}") @@ -686,7 +686,24 @@ def test_table_migration_job_cluster_override( # pylint: disable=too-many-local ), ] installation.save(migrate_rules, filename='mapping.csv') - + sql_backend.save_table( + f"{installation.load(WorkspaceConfig).inventory_database}.mounts", + [Mount(f'/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a', 'abfss://things@labsazurethings.dfs.core.windows.net/a')], + Mount, + ) + installation.save( + [ + StoragePermissionMapping( + 'abfss://things@labsazurethings.dfs.core.windows.net', + 'dummy_application_id', + 'principal_1', + 'WRITE_FILES', + 'Application', + 'directory_id_ss1', + ) + ], + filename='azure_storage_account_info.csv', + ) workflows_install.run_workflow("migrate-tables") # assert the workflow is successful assert workflows_install.validate_step("migrate-tables") From 
50bb6b20fe73ccd5e466c18e66d10ea108bd6e88 Mon Sep 17 00:00:00 2001 From: Hari Selvarajan Date: Sun, 31 Mar 2024 21:53:57 +0100 Subject: [PATCH 23/24] removing connection parameter in config.yml --- tests/integration/hive_metastore/test_migrate.py | 1 - tests/integration/test_installation.py | 10 +++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index e456302906..b9bec4bd05 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -648,7 +648,6 @@ def test_prepare_principal_acl( { "config.yml": { 'warehouse_id': env_or_skip("TEST_DEFAULT_WAREHOUSE_ID"), - 'connect': {'host': 'a', 'token': 'b'}, 'inventory_database': inventory_schema, }, "azure_storage_account_info.csv": [ diff --git a/tests/integration/test_installation.py b/tests/integration/test_installation.py index 2fd329e577..fcf9a1e772 100644 --- a/tests/integration/test_installation.py +++ b/tests/integration/test_installation.py @@ -651,7 +651,15 @@ def test_table_migration_job( @retried(on=[NotFound], timeout=timedelta(minutes=5)) def test_table_migration_job_cluster_override( # pylint: disable=too-many-locals - ws, new_installation, make_catalog, make_schema, make_table, env_or_skip, make_random, make_dbfs_data_copy, sql_backend + ws, + new_installation, + make_catalog, + make_schema, + make_table, + env_or_skip, + make_random, + make_dbfs_data_copy, + sql_backend, ): # create external and managed tables to be migrated src_schema = make_schema(catalog_name="hive_metastore", name=f"migrate_{make_random(5).lower()}") From a19aaebcc7b472fc1cc23f366d7dc09cd9510aad Mon Sep 17 00:00:00 2001 From: Serge Smertin <259697+nfx@users.noreply.github.com> Date: Sun, 31 Mar 2024 23:44:30 +0200 Subject: [PATCH 24/24] Update __init__.py --- src/databricks/labs/ucx/hive_metastore/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/src/databricks/labs/ucx/hive_metastore/__init__.py b/src/databricks/labs/ucx/hive_metastore/__init__.py index 7390e15854..a65ba1d779 100644 --- a/src/databricks/labs/ucx/hive_metastore/__init__.py +++ b/src/databricks/labs/ucx/hive_metastore/__init__.py @@ -1,4 +1,3 @@ -# from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler from databricks.labs.ucx.hive_metastore.locations import ( ExternalLocations, Mounts,