[COST-5551] add config for local Glue Data Catalog utilization #5471

Merged: 17 commits, Feb 12, 2025
12 changes: 10 additions & 2 deletions .env.example
@@ -40,13 +40,21 @@ AWS_RESOURCE_NAME=YOUR_COST_MANAGEMENT_AWS_ARN

# Glue
SCHEMA_SUFFIX="" # if DEVELOPMENT=True, this can be left empty and will default to $USER; otherwise, set this value to something unique
TRINO_S3A_OR_S3=s3
# set this to s3 if you want to utilize real s3 instead of minio
TRINO_S3A_OR_S3=s3a
# set HIVE_PROPERTIES_FILE=glue.properties to utilize AWS Glue Data Catalog (swap HIVE_PROPERTIES_FILE and GLUE_PROPERTIES_FILE)
HIVE_PROPERTIES_FILE=hive.properties
GLUE_PROPERTIES_FILE=glue.properties

AWS_CATALOG_ID=589173575009
# set MINIO_ENDPOINT and S3_ENDPOINT to https://s3.amazonaws.com to utilize real S3
MINIO_ENDPOINT=http://koku-minio:9000
S3_ENDPOINT=http://localhost:9000
# set S3_ACCESS_KEY and S3_SECRET equal to AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY to utilize real S3
S3_ACCESS_KEY=kokuminioaccess
S3_SECRET=kokuminiosecret
# AWS_ credentials are used by Trino to connect to AWS Glue
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=

S3_BUCKET_NAME=koku-bucket
S3_BUCKET_NAME_OCP_INGRESS=ocp-ingress
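Taken together, these are the settings that flip local development from MinIO over to real AWS. A minimal sketch of a Glue-enabled .env, following the comments above (every value below is a placeholder, not a real credential):

    TRINO_S3A_OR_S3=s3
    HIVE_PROPERTIES_FILE=glue.properties
    GLUE_PROPERTIES_FILE=hive.properties
    MINIO_ENDPOINT=https://s3.amazonaws.com
    S3_ENDPOINT=https://s3.amazonaws.com
    S3_ACCESS_KEY=<same as AWS_ACCESS_KEY_ID>
    S3_SECRET=<same as AWS_SECRET_ACCESS_KEY>
    AWS_ACCESS_KEY_ID=<your key id>
    AWS_SECRET_ACCESS_KEY=<your secret key>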
5 changes: 5 additions & 0 deletions Makefile
@@ -98,6 +98,8 @@ help:
@echo " requirements generate Pipfile.lock"
@echo " clowdapp generates a new clowdapp.yaml"
@echo " delete-db delete local directory $(TOPDIR)/dev/containers/postgresql/data"
@echo " delete-glue-data delete s3 files + database created in AWS/glue"
@echo " @param schema - (required) specify the schema to delete from catalog"
@echo " delete-test-db delete the django test db"
@echo " reset-db-statistics clear the pg_stat_statements statistics"
@echo " run-migrations run migrations against database"
@@ -216,6 +218,9 @@ make-migrations:
delete-db:
@$(PREFIX) rm -rf $(TOPDIR)/dev/containers/postgresql/data/

delete-glue-data:
@$(PYTHON) $(SCRIPTDIR)/delete_glue.py $(schema)

delete-test-db:
@PGPASSWORD=$$DATABASE_PASSWORD psql -h $$POSTGRES_SQL_SERVICE_HOST \
-p $$POSTGRES_SQL_SERVICE_PORT \
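Usage of the new target (the schema value here is a placeholder; the script expects the AWS_/S3_ credential variables described in dev/scripts/delete_glue.py to be set in the environment):

    make delete-glue-data schema=org1234567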
20 changes: 10 additions & 10 deletions dev/containers/trino/etc/catalog/glue.properties
@@ -1,21 +1,21 @@
connector.name=hive
hive.metastore=glue
hive.storage-format=Parquet
hive.compression-codec=SNAPPY
hive.collect-column-statistics-on-write=true
hive.recursive-directories=true
hive.compression-codec=SNAPPY
hive.non-managed-table-writes-enabled=true
hive.recursive-directories=true
hive.storage-format=Parquet

fs.hadoop.enabled=false
fs.native-s3.enabled=true
s3.region=${ENV:S3_REGION}
s3.endpoint=${ENV:S3_ENDPOINT}
s3.region=${ENV:AWS_REGION}
s3.endpoint=${ENV:MINIO_ENDPOINT}
s3.aws-access-key=${ENV:S3_ACCESS_KEY}
s3.aws-secret-key=${ENV:S3_SECRET}
s3.path-style-access=true

hive.metastore.glue.default-warehouse-dir=s3://${ENV:S3_BUCKET_NAME}/data
hive.metastore.glue.region=${ENV:S3_REGION}
hive.metastore.glue.aws-access-key=${ENV:S3_ACCESS_KEY}
hive.metastore.glue.aws-secret-key=${ENV:S3_SECRET}
hive.metastore.glue.catalogid=${ENV:AWS_CATALOG_ID}
hive.metastore.glue.default-warehouse-dir=${ENV:TRINO_S3A_OR_S3}://${ENV:S3_BUCKET_NAME}/data
hive.metastore.glue.region=${ENV:AWS_REGION}
hive.metastore.glue.aws-access-key=${ENV:AWS_ACCESS_KEY_ID}
hive.metastore.glue.aws-secret-key=${ENV:AWS_SECRET_ACCESS_KEY}
# hive.metastore.glue.skip-archive=true
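For reference, with the docker-compose defaults later in this diff, the ${ENV:...} placeholders resolve roughly to the following (a sketch; AWS_CATALOG_ID defaults to empty):

    s3.region=us-east-1
    s3.endpoint=http://koku-minio:9000
    s3.aws-access-key=kokuminioaccess
    s3.aws-secret-key=kokuminiosecret
    hive.metastore.glue.catalogid=
    hive.metastore.glue.default-warehouse-dir=s3a://koku-bucket/data
    hive.metastore.glue.region=us-east-1
    hive.metastore.glue.aws-access-key=kokuminioaccess
    hive.metastore.glue.aws-secret-key=kokuminiosecret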
4 changes: 2 additions & 2 deletions dev/containers/trino/etc/catalog/hive.properties
@@ -11,7 +11,7 @@ fs.native-s3.enabled=true

s3.aws-access-key=${ENV:S3_ACCESS_KEY}
s3.aws-secret-key=${ENV:S3_SECRET}
s3.endpoint=${ENV:S3_ENDPOINT}
s3.endpoint=${ENV:MINIO_ENDPOINT}
s3.path-style-access=true
s3.region=us-east-1
s3.region=${ENV:AWS_REGION}
s3.sse.type=None
56 changes: 56 additions & 0 deletions dev/scripts/delete_glue.py
@@ -0,0 +1,56 @@
#
# Copyright 2024 Red Hat Inc.
# SPDX-License-Identifier: Apache-2.0
#
"""Clear out our glue testing data."""
import argparse
import os

import boto3


def delete_glue_data(schema):
    # this client requires AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY set in env
    glue_client = boto3.client("glue", region_name="us-east-1")
    try:
        glue_client.delete_database(Name=schema)
        print(f"Deleted database: {schema}")
    except Exception as e:
        print(f"Failed to delete db: {schema}; it's possible it was already deleted: {e}")

    # cleanup S3 data
    endpoint = os.environ.get("S3_ENDPOINT")
    bucket_name = os.environ.get("S3_BUCKET_NAME")
    credentials = {
        "aws_access_key_id": os.environ.get("S3_ACCESS_KEY"),
        "aws_secret_access_key": os.environ.get("S3_SECRET"),
        "region_name": os.environ.get("S3_REGION") or "us-east-1",
    }
    path_prefixes = {
        "s3_csv_path": f"data/csv/{schema}",
        "s3_parquet_path": f"data/parquet/{schema}",
        "s3_daily_parquet": f"data/parquet/daily/{schema}",
        "s3_schema_db_path": f"data/{schema}",
    }

    s3_client = boto3.client("s3", endpoint_url=endpoint, **credentials)
    for _, file_prefix in path_prefixes.items():
        # delete every object under the prefix, one page at a time
        paginator = s3_client.get_paginator("list_objects_v2")
        for obj_list in paginator.paginate(Bucket=bucket_name, Prefix=file_prefix):
            if "Contents" in obj_list:
                s3_client.delete_objects(
                    Bucket=bucket_name, Delete={"Objects": [{"Key": x["Key"]} for x in obj_list["Contents"]]}
                )
        print(f"Removed s3 files for prefix: {file_prefix}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("schema")
    args = parser.parse_args()

    delete_glue_data(args.schema)


if __name__ == "__main__":
    main()
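The script can also be run directly, assuming S3_ENDPOINT, S3_BUCKET_NAME, and the S3_/AWS_ credential variables are exported (the schema name here is a placeholder):

    python dev/scripts/delete_glue.py org1234567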
15 changes: 9 additions & 6 deletions docker-compose.yml
@@ -654,7 +654,7 @@ services:

trino:
container_name: trino
image: quay.io/redhat-services-prod/cost-mgmt-dev-tenant/ubi-trino:461-007
image: quay.io/redhat-services-prod/cost-mgmt-dev-tenant/ubi-trino:461-009
user: root
ports:
- 8080:8080
@@ -663,12 +663,14 @@
- MY_NODE_ID=${MY_NODE_ID-localhost}
- LOCAL=TRUE
- TRINO_LOG_LEVEL=${LOG_LEVEL-INFO}
- S3_ENDPOINT=${MINIO_ENDPOINT-http://koku-minio:9000}
- S3_BUCKET_NAME=${S3_BUCKET_NAME-koku-bucket}
- TRINO_S3A_OR_S3=${TRINO_S3A_OR_S3-s3a}
- MINIO_ENDPOINT=${MINIO_ENDPOINT-http://koku-minio:9000}
- S3_ACCESS_KEY=${S3_ACCESS_KEY-kokuminioaccess}
- S3_SECRET=${S3_SECRET-kokuminiosecret}
- S3_REGION=${S3_REGION-us-east-1}
- AWS_CATALOG_ID=${AWS_CATALOG_ID-}
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID-kokuminioaccess}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY-kokuminiosecret}
- AWS_REGION=us-east-1
- S3_BUCKET_NAME=${S3_BUCKET_NAME-koku-bucket}
- POSTGRES_SQL_SERVICE_HOST=db
- POSTGRES_SQL_SERVICE_PORT=5432
- DATABASE_NAME=${DATABASE_NAME-postgres}
@@ -678,7 +680,8 @@
- ./dev/containers/trino/etc/config.properties:/etc/trino/config.properties
- ./dev/containers/trino/etc/jvm.config:/etc/trino/jvm.config
- ./dev/containers/trino/etc/log.properties:/etc/trino/log.properties
- ./dev/containers/trino/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties
- ./dev/containers/trino/etc/catalog/hive.properties:/etc/trino/catalog/${HIVE_PROPERTIES_FILE-hive.properties}
- ./dev/containers/trino/etc/catalog/glue.properties:/etc/trino/catalog/${GLUE_PROPERTIES_FILE-glue.properties}
- ./dev/containers/trino/etc/catalog/postgres.properties:/etc/trino/catalog/postgres.properties
- ./dev/containers/trino/data:/data/trino/data
- ./dev/containers/trino/logs:/data/trino/logs
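Because the mount targets are parameterized, setting HIVE_PROPERTIES_FILE=glue.properties and GLUE_PROPERTIES_FILE=hive.properties in .env swaps which file Trino loads as the hive catalog. One way to verify which configuration is active (a hedged example; trino is the container_name defined above):

    docker exec trino cat /etc/trino/catalog/hive.properties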