[COST-5551] add config for local Glue Data Catalog utilization (#5471)
maskarb authored Feb 12, 2025
1 parent aa8f220 commit a44d344
Showing 6 changed files with 92 additions and 20 deletions.
12 changes: 10 additions & 2 deletions .env.example
@@ -40,13 +40,21 @@ AWS_RESOURCE_NAME=YOUR_COST_MANAGEMENT_AWS_ARN

 # Glue
 SCHEMA_SUFFIX="" # if DEVELOPMENT=True, this can be left empty and will default to $USER; otherwise, set this value to something unique
-TRINO_S3A_OR_S3=s3
+# set this to s3 if you want to utilize real s3 instead of minio
+TRINO_S3A_OR_S3=s3a
+# set HIVE_PROPERTIES_FILE=glue.properties to utilize AWS Glue Data Catalog (swap HIVE_PROPERTIES_FILE and GLUE_PROPERTIES_FILE)
+HIVE_PROPERTIES_FILE=hive.properties
+GLUE_PROPERTIES_FILE=glue.properties

+AWS_CATALOG_ID=589173575009
+# set MINIO_ENDPOINT and S3_ENDPOINT to https://s3.amazonaws.com to utilize real S3
 MINIO_ENDPOINT=http://koku-minio:9000
 S3_ENDPOINT=http://localhost:9000
+# set S3_ACCESS_KEY and S3_SECRET equal to AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY to utilize real S3
 S3_ACCESS_KEY=kokuminioaccess
 S3_SECRET=kokuminiosecret
+# AWS_ credentials are used by Trino to connect to AWS Glue
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=

 S3_BUCKET_NAME=koku-bucket
 S3_BUCKET_NAME_OCP_INGRESS=ocp-ingress
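The comments above encode three related switches: object storage (local MinIO vs. real S3), metastore (local Hive vs. AWS Glue Data Catalog), and the credentials each backend expects. A half-flipped combination, for example Glue selected while AWS_ACCESS_KEY_ID is still empty, only surfaces when Trino starts. A minimal pre-flight sketch, assuming the same variable names as this .env; check_glue_env is a hypothetical helper, not part of this commit:

    import os

    def check_glue_env():
        # Hypothetical helper: warn when .env is only partially switched to Glue/S3.
        if os.environ.get("HIVE_PROPERTIES_FILE") != "glue.properties":
            return  # still on the local Hive metastore; nothing to check
        for var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_CATALOG_ID"):
            if not os.environ.get(var):
                print(f"warning: Glue selected but {var} is empty")
        if "koku-minio" in os.environ.get("MINIO_ENDPOINT", ""):
            print("warning: Glue selected but MINIO_ENDPOINT still points at local MinIO")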
5 changes: 5 additions & 0 deletions Makefile
@@ -98,6 +98,8 @@ help:
 	@echo "  requirements generate Pipfile.lock"
 	@echo "  clowdapp generates a new clowdapp.yaml"
 	@echo "  delete-db delete local directory $(TOPDIR)/dev/containers/postgresql/data"
+	@echo "  delete-glue-data delete s3 files + database created in AWS/glue"
+	@echo "      @param schema - (required) specify the schema to delete from catalog"
 	@echo "  delete-test-db delete the django test db"
 	@echo "  reset-db-statistics clear the pg_stat_statements statistics"
 	@echo "  run-migrations run migrations against database"
@@ -216,6 +218,9 @@ make-migrations:
 delete-db:
 	@$(PREFIX) rm -rf $(TOPDIR)/dev/containers/postgresql/data/

+delete-glue-data:
+	@$(PYTHON) $(SCRIPTDIR)/delete_glue.py $(schema)
+
 delete-test-db:
 	@PGPASSWORD=$$DATABASE_PASSWORD psql -h $$POSTGRES_SQL_SERVICE_HOST \
 	-p $$POSTGRES_SQL_SERVICE_PORT \
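As the help text documents, the new delete-glue-data target takes a required schema parameter and simply forwards it to the cleanup script added below, for example (org1234567 is a placeholder schema name):

    make delete-glue-data schema=org1234567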
20 changes: 10 additions & 10 deletions dev/containers/trino/etc/catalog/glue.properties
@@ -1,21 +1,21 @@
 connector.name=hive
 hive.metastore=glue
-hive.storage-format=Parquet
-hive.compression-codec=SNAPPY
 hive.collect-column-statistics-on-write=true
-hive.recursive-directories=true
+hive.compression-codec=SNAPPY
+hive.non-managed-table-writes-enabled=true
+hive.recursive-directories=true
+hive.storage-format=Parquet

 fs.hadoop.enabled=false
 fs.native-s3.enabled=true
-s3.region=${ENV:S3_REGION}
-s3.endpoint=${ENV:S3_ENDPOINT}
+s3.region=${ENV:AWS_REGION}
+s3.endpoint=${ENV:MINIO_ENDPOINT}
 s3.aws-access-key=${ENV:S3_ACCESS_KEY}
 s3.aws-secret-key=${ENV:S3_SECRET}
 s3.path-style-access=true

-hive.metastore.glue.default-warehouse-dir=s3://${ENV:S3_BUCKET_NAME}/data
-hive.metastore.glue.region=${ENV:S3_REGION}
-hive.metastore.glue.aws-access-key=${ENV:S3_ACCESS_KEY}
-hive.metastore.glue.aws-secret-key=${ENV:S3_SECRET}
+hive.metastore.glue.catalogid=${ENV:AWS_CATALOG_ID}
+hive.metastore.glue.default-warehouse-dir=${ENV:TRINO_S3A_OR_S3}://${ENV:S3_BUCKET_NAME}/data
+hive.metastore.glue.region=${ENV:AWS_REGION}
+hive.metastore.glue.aws-access-key=${ENV:AWS_ACCESS_KEY_ID}
+hive.metastore.glue.aws-secret-key=${ENV:AWS_SECRET_ACCESS_KEY}
 # hive.metastore.glue.skip-archive=true
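With hive.metastore=glue, Trino resolves every ${ENV:...} reference from the container environment at startup, so the AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY pair must be able to read the catalog named by AWS_CATALOG_ID. Those credentials can be confirmed outside Trino with a one-off boto3 call; a sketch, assuming the same variables are exported in your shell:

    import os

    import boto3

    # List databases in the shared Glue catalog with the credentials Trino will use.
    glue = boto3.client("glue", region_name=os.environ.get("AWS_REGION", "us-east-1"))
    databases = glue.get_databases(CatalogId=os.environ["AWS_CATALOG_ID"])
    print([db["Name"] for db in databases["DatabaseList"]])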
4 changes: 2 additions & 2 deletions dev/containers/trino/etc/catalog/hive.properties
@@ -11,7 +11,7 @@ fs.native-s3.enabled=true

 s3.aws-access-key=${ENV:S3_ACCESS_KEY}
 s3.aws-secret-key=${ENV:S3_SECRET}
-s3.endpoint=${ENV:S3_ENDPOINT}
+s3.endpoint=${ENV:MINIO_ENDPOINT}
 s3.path-style-access=true
-s3.region=us-east-1
+s3.region=${ENV:AWS_REGION}
 s3.sse.type=None
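Both catalogs now read their endpoint and region from the shared MINIO_ENDPOINT and AWS_REGION variables instead of a one-off name and a hardcoded value. When a catalog fails to load, the cause is usually an unresolved ${ENV:...} reference; the snippet below emulates Trino's substitution for a properties file to surface the missing variable (an illustrative debugging aid, not Trino code):

    import os
    import re

    def resolve_env_refs(text: str) -> str:
        # Replace each ${ENV:VAR} with VAR's value; raises KeyError when VAR is unset.
        return re.sub(r"\$\{ENV:(\w+)\}", lambda m: os.environ[m.group(1)], text)

    with open("dev/containers/trino/etc/catalog/hive.properties") as f:
        print(resolve_env_refs(f.read()))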
56 changes: 56 additions & 0 deletions dev/scripts/delete_glue.py
@@ -0,0 +1,56 @@
#
# Copyright 2024 Red Hat Inc.
# SPDX-License-Identifier: Apache-2.0
#
"""Clear out our glue testing data."""
import argparse
import os

import boto3


def delete_glue_data(schema):
    # this client requires AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY set in env
    glue_client = boto3.client("glue", region_name="us-east-1")
    try:
        glue_client.delete_database(Name=schema)
        print(f"Deleted database: {schema}")
    except Exception as e:
        print(f"Failed to delete db: {schema}; it's possible it was already deleted: {e}")

    # cleanup S3 data
    endpoint = os.environ.get("S3_ENDPOINT")
    bucket_name = os.environ.get("S3_BUCKET_NAME")
    credentials = {
        "aws_access_key_id": os.environ.get("S3_ACCESS_KEY"),
        "aws_secret_access_key": os.environ.get("S3_SECRET"),
        "region_name": os.environ.get("S3_REGION") or "us-east-1",
    }
    path_prefixes = {
        "s3_csv_path": f"data/csv/{schema}",
        "s3_parquet_path": f"data/parquet/{schema}",
        "s3_daily_parquet": f"data/parquet/daily/{schema}",
        "s3_schema_db_path": f"data/{schema}",
    }

    s3_client = boto3.client("s3", endpoint_url=endpoint, **credentials)
    for file_prefix in path_prefixes.values():
        paginator = s3_client.get_paginator("list_objects_v2")
        for obj_list in paginator.paginate(Bucket=bucket_name, Prefix=file_prefix):
            if "Contents" in obj_list:
                s3_client.delete_objects(
                    Bucket=bucket_name, Delete={"Objects": [{"Key": x["Key"]} for x in obj_list["Contents"]]}
                )
        print(f"Removed s3 files for prefix: {file_prefix}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("schema")
    args = parser.parse_args()

    delete_glue_data(args.schema)


if __name__ == "__main__":
    main()
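The script can be run directly, or through the make target above (again with a placeholder schema name):

    python dev/scripts/delete_glue.py org1234567

One detail worth noting: list_objects_v2 returns at most 1,000 keys per page, which matches the 1,000-key cap on a single delete_objects request, so deleting page by page as the loop does stays within the S3 API limits.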
15 changes: 9 additions & 6 deletions docker-compose.yml
@@ -654,7 +654,7 @@ services:

   trino:
     container_name: trino
-    image: quay.io/redhat-services-prod/cost-mgmt-dev-tenant/ubi-trino:461-007
+    image: quay.io/redhat-services-prod/cost-mgmt-dev-tenant/ubi-trino:461-009
     user: root
     ports:
       - 8080:8080
@@ -663,12 +663,14 @@
       - MY_NODE_ID=${MY_NODE_ID-localhost}
       - LOCAL=TRUE
       - TRINO_LOG_LEVEL=${LOG_LEVEL-INFO}
-      - S3_ENDPOINT=${MINIO_ENDPOINT-http://koku-minio:9000}
-      - S3_BUCKET_NAME=${S3_BUCKET_NAME-koku-bucket}
+      - TRINO_S3A_OR_S3=${TRINO_S3A_OR_S3-s3a}
+      - MINIO_ENDPOINT=${MINIO_ENDPOINT-http://koku-minio:9000}
       - S3_ACCESS_KEY=${S3_ACCESS_KEY-kokuminioaccess}
       - S3_SECRET=${S3_SECRET-kokuminiosecret}
-      - S3_REGION=${S3_REGION-us-east-1}
+      - AWS_CATALOG_ID=${AWS_CATALOG_ID-}
+      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID-kokuminioaccess}
+      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY-kokuminiosecret}
+      - AWS_REGION=us-east-1
+      - S3_BUCKET_NAME=${S3_BUCKET_NAME-koku-bucket}
       - POSTGRES_SQL_SERVICE_HOST=db
       - POSTGRES_SQL_SERVICE_PORT=5432
       - DATABASE_NAME=${DATABASE_NAME-postgres}
@@ -678,7 +680,8 @@
       - ./dev/containers/trino/etc/config.properties:/etc/trino/config.properties
       - ./dev/containers/trino/etc/jvm.config:/etc/trino/jvm.config
       - ./dev/containers/trino/etc/log.properties:/etc/trino/log.properties
-      - ./dev/containers/trino/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties
+      - ./dev/containers/trino/etc/catalog/hive.properties:/etc/trino/catalog/${HIVE_PROPERTIES_FILE-hive.properties}
+      - ./dev/containers/trino/etc/catalog/glue.properties:/etc/trino/catalog/${GLUE_PROPERTIES_FILE-glue.properties}
      - ./dev/containers/trino/etc/catalog/postgres.properties:/etc/trino/catalog/postgres.properties
       - ./dev/containers/trino/data:/data/trino/data
       - ./dev/containers/trino/logs:/data/trino/logs
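The two parameterized catalog mounts are what make the swap described in .env.example work: each local properties file is mounted at whatever container path its variable names. Following the comment in .env.example, setting

    HIVE_PROPERTIES_FILE=glue.properties
    GLUE_PROPERTIES_FILE=hive.properties

places the local glue.properties at /etc/trino/catalog/hive.properties inside the container, so the existing hive catalog resolves tables through the AWS Glue Data Catalog with no query-side changes.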
