From a3dd531dc9bef65df19a4b1c14d289e92d060cb1 Mon Sep 17 00:00:00 2001 From: Honah J Date: Fri, 12 Jul 2024 13:14:40 -0700 Subject: [PATCH] Glue endpoint config variable, continue #530 (#920) Co-authored-by: Seb Pretzer <24555985+sebpretzer@users.noreply.github.com> --- mkdocs/docs/configuration.md | 10 ++++++++++ pyiceberg/catalog/glue.py | 6 +++++- tests/catalog/integration_test_glue.py | 8 +++++--- tests/catalog/test_glue.py | 10 ++++++++++ tests/conftest.py | 5 +++++ 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 5346e82c25..76e1816c3a 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -288,6 +288,16 @@ catalog: region_name: ``` + + +| Key | Example | Description | +| ----------------- | ------------------------------------ | ------------------------------------------------------------------------------- | +| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog | +| glue.skip-archive | true | Configure whether to skip the archival of older table versions. Defaults to true | +| glue.endpoint | https://glue.us-east-1.amazonaws.com | Configure an alternative endpoint of the Glue service for GlueCatalog to access | + + + ## DynamoDB Catalog If you want to use AWS DynamoDB as the catalog, you can use the last two ways to configure the pyiceberg and refer diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 8819c2e266..bc85b977f9 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -109,6 +109,10 @@ GLUE_SKIP_ARCHIVE = "glue.skip-archive" GLUE_SKIP_ARCHIVE_DEFAULT = True +# Configure an alternative endpoint of the Glue service for GlueCatalog to access.
+# This allows GlueCatalog to be used with any Glue-compatible metastore service that has a different endpoint. +GLUE_CATALOG_ENDPOINT = "glue.endpoint" + ICEBERG_FIELD_ID = "iceberg.field.id" ICEBERG_FIELD_OPTIONAL = "iceberg.field.optional" ICEBERG_FIELD_CURRENT = "iceberg.field.current" @@ -289,7 +293,7 @@ def __init__(self, name: str, **properties: Any): aws_secret_access_key=properties.get("aws_secret_access_key"), aws_session_token=properties.get("aws_session_token"), ) - self.glue: GlueClient = session.client("glue") + self.glue: GlueClient = session.client("glue", endpoint_url=properties.get(GLUE_CATALOG_ENDPOINT)) if glue_catalog_id := properties.get(GLUE_ID): _register_glue_catalog_id_with_glue_client(self.glue, glue_catalog_id) diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index c69bc86ca8..a5293e38f2 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -25,7 +25,7 @@ from botocore.exceptions import ClientError from pyiceberg.catalog import Catalog, MetastoreCatalog -from pyiceberg.catalog.glue import GlueCatalog +from pyiceberg.catalog.glue import GLUE_CATALOG_ENDPOINT, GlueCatalog from pyiceberg.exceptions import ( NamespaceAlreadyExistsError, NamespaceNotEmptyError, @@ -36,7 +36,7 @@ from pyiceberg.io.pyarrow import _dataframe_to_data_files, schema_to_pyarrow from pyiceberg.schema import Schema from pyiceberg.types import IntegerType -from tests.conftest import clean_up, get_bucket_name, get_s3_path +from tests.conftest import clean_up, get_bucket_name, get_glue_endpoint, get_s3_path # The number of tables/databases used in list_table/namespace test LIST_TEST_NUMBER = 2 @@ -51,7 +51,9 @@ def fixture_glue_client() -> boto3.client: @pytest.fixture(name="test_catalog", scope="module") def fixture_test_catalog() -> Generator[Catalog, None, None]: """Configure the pre- and post-setting of aws integration test.""" - test_catalog = GlueCatalog(CATALOG_NAME,
warehouse=get_s3_path(get_bucket_name())) + test_catalog = GlueCatalog( + CATALOG_NAME, **{"warehouse": get_s3_path(get_bucket_name()), GLUE_CATALOG_ENDPOINT: get_glue_endpoint()} + ) yield test_catalog clean_up(test_catalog) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 6b57f1dfe6..c4afa50c52 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -862,3 +862,13 @@ def test_register_table_with_given_location( table = test_catalog.register_table(identifier, location) assert table.identifier == (catalog_name,) + identifier assert test_catalog.table_exists(identifier) is True + + +@mock_aws +def test_glue_endpoint_override(_bucket_initialize: None, moto_endpoint_url: str, database_name: str) -> None: + catalog_name = "glue" + test_endpoint = "https://test-endpoint" + test_catalog = GlueCatalog( + catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}", "glue.endpoint": test_endpoint} + ) + assert test_catalog.glue.meta.endpoint_url == test_endpoint diff --git a/tests/conftest.py b/tests/conftest.py index 6b1a2b43e2..91ab8f2e56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2043,6 +2043,11 @@ def get_bucket_name() -> str: return bucket_name +def get_glue_endpoint() -> Optional[str]: + """Set the optional environment variable AWS_TEST_GLUE_ENDPOINT for a glue endpoint to test.""" + return os.getenv("AWS_TEST_GLUE_ENDPOINT") + + def get_s3_path(bucket_name: str, database_name: Optional[str] = None, table_name: Optional[str] = None) -> str: result_path = f"s3://{bucket_name}" if database_name is not None: