Skip to content

Commit

Permalink
Glue endpoint config variable, continue apache#530 (apache#920)
Browse files Browse the repository at this point in the history
Co-authored-by: Seb Pretzer <[email protected]>
  • Loading branch information
HonahX and sebpretzer authored Jul 12, 2024
1 parent 32e8f88 commit a3dd531
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 4 deletions.
10 changes: 10 additions & 0 deletions mkdocs/docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,16 @@ catalog:
region_name: <REGION_NAME>
```

<!-- markdown-link-check-disable -->

| Key | Example | Description |
| ----------------- | ------------------------------------ | ------------------------------------------------------------------------------- |
| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog |
| glue.skip-archive | true | Configure whether to skip the archival of older table versions. Defaults to true |
| glue.endpoint | https://glue.us-east-1.amazonaws.com | Configure an alternative endpoint of the Glue service for GlueCatalog to access |

<!-- markdown-link-check-enable -->

## DynamoDB Catalog

If you want to use AWS DynamoDB as the catalog, you can use the last two ways to configure the pyiceberg and refer
Expand Down
6 changes: 5 additions & 1 deletion pyiceberg/catalog/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@
GLUE_SKIP_ARCHIVE = "glue.skip-archive"
GLUE_SKIP_ARCHIVE_DEFAULT = True

# Configure an alternative endpoint of the Glue service for GlueCatalog to access.
# This can be used to point GlueCatalog at any Glue-compatible metastore service that has a different endpoint.
GLUE_CATALOG_ENDPOINT = "glue.endpoint"

ICEBERG_FIELD_ID = "iceberg.field.id"
ICEBERG_FIELD_OPTIONAL = "iceberg.field.optional"
ICEBERG_FIELD_CURRENT = "iceberg.field.current"
Expand Down Expand Up @@ -289,7 +293,7 @@ def __init__(self, name: str, **properties: Any):
aws_secret_access_key=properties.get("aws_secret_access_key"),
aws_session_token=properties.get("aws_session_token"),
)
self.glue: GlueClient = session.client("glue")
self.glue: GlueClient = session.client("glue", endpoint_url=properties.get(GLUE_CATALOG_ENDPOINT))

if glue_catalog_id := properties.get(GLUE_ID):
_register_glue_catalog_id_with_glue_client(self.glue, glue_catalog_id)
Expand Down
8 changes: 5 additions & 3 deletions tests/catalog/integration_test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from botocore.exceptions import ClientError

from pyiceberg.catalog import Catalog, MetastoreCatalog
from pyiceberg.catalog.glue import GlueCatalog
from pyiceberg.catalog.glue import GLUE_CATALOG_ENDPOINT, GlueCatalog
from pyiceberg.exceptions import (
NamespaceAlreadyExistsError,
NamespaceNotEmptyError,
Expand All @@ -36,7 +36,7 @@
from pyiceberg.io.pyarrow import _dataframe_to_data_files, schema_to_pyarrow
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType
from tests.conftest import clean_up, get_bucket_name, get_s3_path
from tests.conftest import clean_up, get_bucket_name, get_glue_endpoint, get_s3_path

# The number of tables/databases used in list_table/namespace test
LIST_TEST_NUMBER = 2
Expand All @@ -51,7 +51,9 @@ def fixture_glue_client() -> boto3.client:
@pytest.fixture(name="test_catalog", scope="module")
def fixture_test_catalog() -> Generator[Catalog, None, None]:
    """Provide a module-scoped GlueCatalog for the AWS integration tests.

    The catalog uses the test bucket as its warehouse and the Glue endpoint
    returned by ``get_glue_endpoint()``. Once the fixture is torn down,
    ``clean_up`` is called on the catalog.
    """
    catalog_properties = {
        "warehouse": get_s3_path(get_bucket_name()),
        GLUE_CATALOG_ENDPOINT: get_glue_endpoint(),
    }
    glue_catalog = GlueCatalog(CATALOG_NAME, **catalog_properties)
    yield glue_catalog
    clean_up(glue_catalog)

Expand Down
10 changes: 10 additions & 0 deletions tests/catalog/test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,3 +862,13 @@ def test_register_table_with_given_location(
table = test_catalog.register_table(identifier, location)
assert table.identifier == (catalog_name,) + identifier
assert test_catalog.table_exists(identifier) is True


@mock_aws
def test_glue_endpoint_override(_bucket_initialize: None, moto_endpoint_url: str, database_name: str) -> None:
    """Check that the "glue.endpoint" property sets the endpoint URL of the catalog's Glue client."""
    expected_endpoint = "https://test-endpoint"
    catalog_properties = {
        "s3.endpoint": moto_endpoint_url,
        "warehouse": f"s3://{BUCKET_NAME}",
        "glue.endpoint": expected_endpoint,
    }
    glue_catalog = GlueCatalog("glue", **catalog_properties)
    assert glue_catalog.glue.meta.endpoint_url == expected_endpoint
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2043,6 +2043,11 @@ def get_bucket_name() -> str:
return bucket_name


def get_glue_endpoint() -> Optional[str]:
    """Return the Glue endpoint the integration tests should use, if one is configured.

    The value is read from the optional environment variable AWS_TEST_GLUE_ENDPOINT.

    Returns:
        The configured endpoint, or None when the variable is unset or empty.
    """
    # A defined-but-empty variable is treated like an unset one, so callers
    # never receive "" as an endpoint.
    return os.getenv("AWS_TEST_GLUE_ENDPOINT") or None


def get_s3_path(bucket_name: str, database_name: Optional[str] = None, table_name: Optional[str] = None) -> str:
result_path = f"s3://{bucket_name}"
if database_name is not None:
Expand Down

0 comments on commit a3dd531

Please sign in to comment.