Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 37 additions & 23 deletions pyiceberg/catalog/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,32 +303,46 @@ def add_glue_catalog_id(params: Dict[str, str], **kwargs: Any) -> None:


class GlueCatalog(MetastoreCatalog):
def __init__(self, name: str, **properties: Any):
super().__init__(name, **properties)
glue: GlueClient

retry_mode_prop_value = get_first_property_value(properties, GLUE_RETRY_MODE)
def __init__(self, name: str, client: Optional[GlueClient] = None, **properties: Any):
"""Glue Catalog.

session = boto3.Session(
profile_name=properties.get(GLUE_PROFILE_NAME),
region_name=get_first_property_value(properties, GLUE_REGION, AWS_REGION),
botocore_session=properties.get(BOTOCORE_SESSION),
aws_access_key_id=get_first_property_value(properties, GLUE_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID),
aws_secret_access_key=get_first_property_value(properties, GLUE_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY),
aws_session_token=get_first_property_value(properties, GLUE_SESSION_TOKEN, AWS_SESSION_TOKEN),
)
self.glue: GlueClient = session.client(
"glue",
endpoint_url=properties.get(GLUE_CATALOG_ENDPOINT),
config=Config(
retries={
"max_attempts": properties.get(GLUE_MAX_RETRIES, MAX_RETRIES),
"mode": retry_mode_prop_value if retry_mode_prop_value in EXISTING_RETRY_MODES else STANDARD_RETRY_MODE,
}
),
)
You either need to provide a boto3 glue client, or one will be constructed from the properties.

Args:
name: Name to identify the catalog.
client: An optional boto3 glue client.
properties: Properties for glue client construction and configuration.
"""
super().__init__(name, **properties)

if client:
self.glue = client
else:
retry_mode_prop_value = get_first_property_value(properties, GLUE_RETRY_MODE)

session = boto3.Session(
profile_name=properties.get(GLUE_PROFILE_NAME),
region_name=get_first_property_value(properties, GLUE_REGION, AWS_REGION),
botocore_session=properties.get(BOTOCORE_SESSION),
aws_access_key_id=get_first_property_value(properties, GLUE_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID),
aws_secret_access_key=get_first_property_value(properties, GLUE_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY),
aws_session_token=get_first_property_value(properties, GLUE_SESSION_TOKEN, AWS_SESSION_TOKEN),
)
self.glue: GlueClient = session.client(
"glue",
endpoint_url=properties.get(GLUE_CATALOG_ENDPOINT),
config=Config(
retries={
"max_attempts": properties.get(GLUE_MAX_RETRIES, MAX_RETRIES),
"mode": retry_mode_prop_value if retry_mode_prop_value in EXISTING_RETRY_MODES else STANDARD_RETRY_MODE,
}
),
)

if glue_catalog_id := properties.get(GLUE_ID):
_register_glue_catalog_id_with_glue_client(self.glue, glue_catalog_id)
if glue_catalog_id := properties.get(GLUE_ID):
_register_glue_catalog_id_with_glue_client(self.glue, glue_catalog_id)
Comment on lines +344 to +345
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we also need to call this when the client is passed in?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kevinjqliu Good question. I left this out intentionally so that we do not modify the customer's input client or interfere with their existing event handlers. Since the catalog_id parameter is optional, it doesn't make a functional difference, AFAIK. Something to consider is using the unique_id arg when registering an event. Let me know what you think, and I can follow up 👍

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 I think it's a good idea not to modify the input client. It's assumed that a custom client has already been pre-configured before it is passed in here.


def _convert_glue_to_iceberg(self, glue_table: TableTypeDef) -> Table:
properties: Properties = glue_table["Parameters"]
Expand Down
8 changes: 8 additions & 0 deletions tests/catalog/test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -932,3 +932,11 @@ def test_glue_endpoint_override(_bucket_initialize: None, moto_endpoint_url: str
catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}", "glue.endpoint": test_endpoint}
)
assert test_catalog.glue.meta.endpoint_url == test_endpoint


@mock_aws
def test_glue_client_override() -> None:
    """Check that a caller-supplied boto3 Glue client is the one the catalog holds."""
    injected_client = boto3.client("glue", region_name="us-west-2")
    catalog = GlueCatalog("glue", injected_client)
    # Identity check (not equality): the catalog must expose the very same
    # client object that was passed in, not a freshly constructed one.
    assert catalog.glue is injected_client