Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip unsupported locations while migrating to external location in Azure #1066

Merged
merged 3 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/databricks/labs/ucx/azure/locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,22 @@ def _create_external_location(
# if no credential found
return None

def _filter_unsupported_location(self, location_urls: list[str]) -> list[str]:
# remove unsupported external location
supported_urls = []
for url in location_urls:
if url.startswith("abfss://"):
supported_urls.append(url)
continue
logger.warning(f"Skip unsupported location: {url}")
return supported_urls

def run(self):
# list missing external locations in UC
_, missing_locations = self._hms_locations.match_table_external_locations()
# Extract the location URLs from the missing locations
missing_loc_urls = [loc.location for loc in missing_locations]
missing_loc_urls = self._filter_unsupported_location(missing_loc_urls)

# get prefix to storage credential name mapping
prefix_mapping_write, prefix_mapping_read = self._prefix_credential_name_mapping()
Expand Down
131 changes: 92 additions & 39 deletions tests/unit/azure/test_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import pytest
from databricks.labs.blueprint.installation import MockInstallation
from databricks.labs.lsql import Row
from databricks.labs.lsql.backends import MockBackend
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors.platform import InvalidParameterValue, PermissionDenied
Expand All @@ -16,9 +15,9 @@

from databricks.labs.ucx.azure.access import AzureResourcePermissions
from databricks.labs.ucx.azure.locations import ExternalLocationsMigration
from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources
from databricks.labs.ucx.azure.resources import AzureResources
from databricks.labs.ucx.hive_metastore import ExternalLocations
from tests.unit.azure import get_az_api_mapping
from tests.unit.azure import azure_api_client


@pytest.fixture
Expand All @@ -27,13 +26,7 @@ def ws():


def location_migration_for_test(ws, mock_backend, mock_installation):
azure_mgmt_client = AzureAPIClient(
ws.config.arm_environment.resource_manager_endpoint,
ws.config.arm_environment.service_management_endpoint,
)
graph_client = AzureAPIClient("https://graph.microsoft.com", "https://graph.microsoft.com")
azurerm = AzureResources(azure_mgmt_client, graph_client)

azurerm = AzureResources(azure_api_client(), azure_api_client())
location_crawler = ExternalLocations(ws, mock_backend, "location_test")

return ExternalLocationsMigration(
Expand All @@ -45,12 +38,11 @@ def test_run_service_principal(ws):
"""test run with service principal based storage credentials"""

# mock crawled HMS external locations
row_factory = type("Row", (Row,), {"__columns__": ["location", "table_count"]})
mock_backend = MockBackend(
rows={
r"SELECT \* FROM location_test.external_locations": [
row_factory(["abfss://[email protected]/one/", 1]),
row_factory(["abfss://[email protected]/", 2]),
r"SELECT \* FROM location_test.external_locations": MockBackend.rows("location", "table_count")[
("abfss://[email protected]/one/", 1),
("abfss://[email protected]/", 2),
]
}
)
Expand Down Expand Up @@ -120,16 +112,89 @@ def test_run_service_principal(ws):
)


def test_skip_unsupported_location(ws, caplog):
# mock crawled HMS external locations with two unsupported locations adl and wasbs
mock_backend = MockBackend(
rows={
r"SELECT \* FROM location_test.external_locations": MockBackend.rows("location", "table_count")[
("abfss://[email protected]/one/", 1),
("adl://[email protected]/", 2),
("wasbs://[email protected]/", 2),
]
}
)

# mock listing storage credentials
ws.storage_credentials.list.return_value = [
StorageCredentialInfo(
name="credential_sp1",
azure_service_principal=AzureServicePrincipal(
"directory_id_1",
"application_id_1",
"test_secret",
),
)
]

# mock listing UC external locations, no HMS external location will be matched
ws.external_locations.list.return_value = [ExternalLocationInfo(name="none", url="none")]

# mock installation with permission mapping
mock_installation = MockInstallation(
{
"azure_storage_account_info.csv": [
{
'prefix': 'abfss://[email protected]/',
'client_id': 'application_id_1',
'principal': 'credential_sp1',
'privilege': 'WRITE_FILES',
'type': 'Application',
'directory_id': 'directory_id_1',
},
{
'prefix': 'adl://[email protected]/',
'client_id': 'application_id_1',
'principal': 'credential_sp1',
'privilege': 'WRITE_FILES',
'type': 'Application',
'directory_id': 'directory_id_1',
},
{
'prefix': 'wasbs://[email protected]/',
'client_id': 'application_id_1',
'principal': 'credential_sp1',
'privilege': 'WRITE_FILES',
'type': 'Application',
'directory_id': 'directory_id_1',
},
],
}
)

location_migration = location_migration_for_test(ws, mock_backend, mock_installation)
location_migration.run()

ws.external_locations.create.assert_called_once_with(
"container1_test_one",
"abfss://[email protected]/one/",
"credential_sp1",
comment="Created by UCX",
read_only=False,
skip_validation=False,
)
assert "Skip unsupported location: adl://[email protected]" in caplog.text
assert "Skip unsupported location: wasbs://[email protected]" in caplog.text


def test_run_managed_identity(ws, mocker):
"""test run with managed identity based storage credentials"""

# mock crawled HMS external locations
row_factory = type("Row", (Row,), {"__columns__": ["location", "table_count"]})
mock_backend = MockBackend(
rows={
r"SELECT \* FROM location_test.external_locations": [
row_factory(["abfss://[email protected]/", 4]),
row_factory(["abfss://[email protected]/a/b/", 5]),
r"SELECT \* FROM location_test.external_locations": MockBackend.rows("location", "table_count")[
("abfss://[email protected]/", 4),
("abfss://[email protected]/a/b/", 5),
]
}
)
Expand Down Expand Up @@ -177,11 +242,6 @@ def test_run_managed_identity(ws, mocker):
}
)

# mock Azure resource manager and graph API calls for getting application_id of managed identity
# TODO: (qziyuan) use a better way to mock the API calls
# pylint: disable-next=prohibited-patch
mocker.patch("databricks.sdk.core.ApiClient.do", side_effect=get_az_api_mapping)

location_migration = location_migration_for_test(ws, mock_backend, mock_installation)
location_migration.run()

Expand Down Expand Up @@ -220,12 +280,11 @@ def test_location_failed_to_read(ws):
"""If read-only location is empty, READ permission check will fail with PermissionDenied"""

# mock crawled HMS external locations
row_factory = type("Row", (Row,), {"__columns__": ["location", "table_count"]})
mock_backend = MockBackend(
rows={
r"SELECT \* FROM location_test.external_locations": [
row_factory(["abfss://[email protected]/", 1]),
row_factory(["abfss://[email protected]/", 2]),
r"SELECT \* FROM location_test.external_locations": MockBackend.rows("location", "table_count")[
("abfss://[email protected]/", 1),
("abfss://[email protected]/", 2),
]
}
)
Expand Down Expand Up @@ -288,12 +347,11 @@ def test_overlapping_locations(ws, caplog):
caplog.set_level(logging.INFO)

# mock crawled HMS external locations
row_factory = type("Row", (Row,), {"__columns__": ["location", "table_count"]})
mock_backend = MockBackend(
rows={
r"SELECT \* FROM location_test.external_locations": [
row_factory(["abfss://[email protected]/a/", 1]),
row_factory(["abfss://[email protected]/a/", 1]),
r"SELECT \* FROM location_test.external_locations": MockBackend.rows("location", "table_count")[
("abfss://[email protected]/a/", 1),
("abfss://[email protected]/a/", 1),
]
}
)
Expand Down Expand Up @@ -353,12 +411,11 @@ def test_corner_cases_with_missing_fields(ws, caplog, mocker):
caplog.set_level(logging.INFO)

# mock crawled HMS external locations
row_factory = type("Row", (Row,), {"__columns__": ["location", "table_count"]})
mock_backend = MockBackend(
rows={
r"SELECT \* FROM location_test.external_locations": [
row_factory(["abfss://[email protected]/", 1]),
row_factory(["abfss://[email protected]/", 2]),
r"SELECT \* FROM location_test.external_locations": MockBackend.rows("location", "table_count")[
("abfss://[email protected]/", 1),
("abfss://[email protected]/", 2),
]
}
)
Expand Down Expand Up @@ -399,10 +456,6 @@ def test_corner_cases_with_missing_fields(ws, caplog, mocker):
],
}
)
# return None when getting application_id of managed identity
# TODO: (qziyuan) use a better way to mock the API calls
# pylint: disable-next=prohibited-patch
mocker.patch("databricks.sdk.core.ApiClient.do", return_value={"dummy": "dummy"})

location_migration = location_migration_for_test(ws, mock_backend, mock_installation)
location_migration.run()
Expand Down
Loading