diff --git a/.github/workflows/LakekeeperTesting.yml b/.github/workflows/LakekeeperTesting.yml
new file mode 100644
index 000000000..d84a37131
--- /dev/null
+++ b/.github/workflows/LakekeeperTesting.yml
@@ -0,0 +1,279 @@
+name: Local Lakekeeper Testing
+on: [push, pull_request, repository_dispatch]
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
+  cancel-in-progress: true
+defaults:
+  run:
+    shell: bash
+
+env:
+  BASE_BRANCH: ${{ github.base_ref || (endsWith(github.ref, '_feature') && 'feature' || 'main') }}
+
+jobs:
+  rest:
+    name: Test against Lakekeeper Catalog
+    runs-on: ubuntu-latest
+    env:
+      VCPKG_TARGET_TRIPLET: 'x64-linux'
+      GEN: ninja
+      VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
+      PIP_BREAK_SYSTEM_PACKAGES: 1
+
+    steps:
+      - name: Install required ubuntu packages
+        run: |
+          sudo apt-get update -y -qq
+          sudo apt-get install -y -qq software-properties-common
+          sudo add-apt-repository ppa:git-core/ppa
+          sudo apt-get update -y -qq
+          sudo apt-get install -y -qq \
+            ninja-build \
+            make gcc-multilib \
+            g++-multilib \
+            libssl-dev \
+            wget \
+            openjdk-8-jdk \
+            zip \
+            maven \
+            unixodbc-dev \
+            libc6-dev-i386 \
+            lib32readline6-dev \
+            libcurl4-gnutls-dev \
+            libexpat1-dev \
+            gettext \
+            unzip \
+            build-essential \
+            checkinstall \
+            libffi-dev \
+            curl \
+            libz-dev \
+            openssh-client
+          sudo apt-get install -y -qq tar pkg-config
+          sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+          sudo chmod +x /usr/local/bin/docker-compose
+
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: 'true'
+
+      - name: Setup vcpkg
+        uses: lukka/run-vcpkg@v11.1
+        with:
+          vcpkgGitCommitId: 5e5d0e1cd7785623065e77eff011afdeec1a3574
+
+      - name: Setup Ccache
+        uses: hendrikmuhs/ccache-action@main
+        continue-on-error: true
+
+      - name: Build extension
+        env:
+          GEN: ninja
+          STATIC_LIBCPP: 1
+        run: |
+          make release
+
+      - name: Set up for Lakekeeper
+        run: |
+          # install Java 21; the Spark-based catalog setup and data generation steps below need it
+          sudo apt-get install -y -qq openjdk-21-jre-headless
+          sudo apt-get install -y -qq openjdk-21-jdk-headless
+          sudo apt-get install -y -qq python3-venv
+          git clone https://github.com/lakekeeper/lakekeeper.git lakekeeper
+
+      - name: Start Keycloak
+        run: |
+          docker run -d \
+            --name keycloak \
+            -p 30080:8080 \
+            -e KC_BOOTSTRAP_ADMIN_USERNAME=admin \
+            -e KC_BOOTSTRAP_ADMIN_PASSWORD=admin \
+            -v $(pwd)/lakekeeper/examples/access-control-simple/keycloak/realm.json:/opt/keycloak/data/import/realm.json \
+            quay.io/keycloak/keycloak:26.0.7 \
+            start-dev --metrics-enabled=true --health-enabled=true --import-realm --verbose --log-level=INFO --features=token-exchange
+
+      - name: Wait for Keycloak
+        run: |
+          max_attempts=30
+          attempt=1
+          while ! curl -sf "http://localhost:30080/realms/master/.well-known/openid-configuration"; do
+            if [ $attempt -gt $max_attempts ]; then
+              echo "Keycloak failed to initialize after $max_attempts attempts"
+              exit 1
+            fi
+            echo "Waiting for Keycloak to initialize (attempt $attempt/$max_attempts)..."
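+            # probe every 5s; 30 attempts gives Keycloak roughly 2.5 minutes to come up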
+ sleep 5 + attempt=$((attempt + 1)) + done + echo "Keycloak is healthy" + + - name: Start PostgreSQL + run: | + docker run -d \ + --name postgres \ + -p 5433:5432 \ + -e POSTGRESQL_USERNAME=postgres \ + -e POSTGRESQL_PASSWORD=postgres \ + -e POSTGRESQL_DATABASE=postgres \ + bitnami/postgresql:16.3.0 + + - name: Wait for PostgreSQL + run: | + max_attempts=30 + attempt=1 + while ! docker exec postgres pg_isready -U postgres -p 5432 -d postgres; do + if [ $attempt -gt $max_attempts ]; then + echo "PostgreSQL failed to initialize after $max_attempts attempts" + exit 1 + fi + echo "Waiting for PostgreSQL to initialize (attempt $attempt/$max_attempts)..." + sleep 5 + attempt=$((attempt + 1)) + done + echo "PostgreSQL is healthy" + + - name: Run OpenFGA DB migrations + run: | + docker run --rm \ + --network="host" \ + -e OPENFGA_DATASTORE_ENGINE=postgres \ + -e OPENFGA_DATASTORE_URI="postgres://postgres:postgres@localhost:5433/postgres?sslmode=disable" \ + openfga/openfga:v1.8 migrate + + - name: Start OpenFGA + run: | + docker run -d \ + --name openfga \ + -p 8081:8081 \ + -e OPENFGA_DATASTORE_ENGINE=postgres \ + -e OPENFGA_DATASTORE_URI="postgres://postgres:postgres@localhost:5433/postgres?sslmode=disable" \ + -e OPENFGA_AUTHN_METHOD=oidc \ + -e OPENFGA_AUTHN_OIDC_ISSUER=http://localhost:30080/realms/iceberg \ + -e OPENFGA_AUTHN_OIDC_AUDIENCE=openfga \ + -e OPENFGA_HTTP_TLS_ENABLED=false \ + --network="host" \ + openfga/openfga:v1.8 run --playground-enabled=false + + - name: Wait for OpenFGA + run: | + max_attempts=30 + attempt=1 + while ! curl -sf http://localhost:8080/healthz; do + if [ $attempt -gt $max_attempts ]; then + echo "OpenFGA failed to initialize after $max_attempts attempts" + exit 1 + fi + echo "Waiting for OpenFGA to initialize (attempt $attempt/$max_attempts)..." + sleep 5 + attempt=$((attempt + 1)) + done + echo "OpenFGA is healthy" + + - name: Start Minio + run: | + docker run -d \ + --name minio \ + --network="host" \ + -e MINIO_ROOT_USER=minio-root-user \ + -e MINIO_ROOT_PASSWORD=minio-root-password \ + -e MINIO_API_PORT_NUMBER=9000 \ + -e MINIO_CONSOLE_PORT_NUMBER=9001 \ + -e MINIO_SCHEME=http \ + -e MINIO_DEFAULT_BUCKETS=examples \ + --health-cmd="mc ls local | grep examples" \ + --health-interval=2s \ + --health-timeout=10s \ + --health-retries=2 \ + --health-start-period=15s \ + -p 9000:9000 \ + -p 9001:9001 \ + bitnami/minio:latest + + - name: Wait for Minio + run: | + max_attempts=30 + attempt=1 + while ! docker exec minio mc ls local | grep examples; do + if [ $attempt -gt $max_attempts ]; then + echo "Minio failed to initialize after $max_attempts attempts" + exit 1 + fi + echo "Waiting for Minio to initialize (attempt $attempt/$max_attempts)..." + sleep 5 + attempt=$((attempt + 1)) + done + echo "Minio is healthy" + + - name: Run Lakekeeper DB Migrations + run: | + docker run --rm \ + --network="host" \ + -e LAKEKEEPER__PG_ENCRYPTION_KEY=This-is-NOT-Secure! 
\ + -e LAKEKEEPER__PG_DATABASE_URL_READ=postgresql://postgres:postgres@localhost:5433/postgres?sslmode=disable \ + -e LAKEKEEPER__PG_DATABASE_URL_WRITE=postgresql://postgres:postgres@localhost:5433/postgres?sslmode=disable \ + -e LAKEKEEPER__AUTHZ_BACKEND=openfga \ + -e LAKEKEEPER__OPENFGA__ENDPOINT=http://localhost:8081 \ + -e LAKEKEEPER__OPENFGA__CLIENT_ID=openfga \ + -e LAKEKEEPER__OPENFGA__CLIENT_SECRET=xqE1vUrifVDKAZdLuz6JAnDxMYLdGu5z \ + -e LAKEKEEPER__OPENFGA__TOKEN_ENDPOINT=http://localhost:30080/realms/iceberg/protocol/openid-connect/token \ + -e RUST_LOG=info,iceberg-catalog=info,middle=trace \ + quay.io/lakekeeper/catalog:latest-main migrate + + - name: Start Lakekeeper + run: | + docker run -d \ + --name lakekeeper \ + --network="host" \ + -e LAKEKEEPER__PG_ENCRYPTION_KEY=This-is-NOT-Secure! \ + -e LAKEKEEPER__PG_DATABASE_URL_READ=postgresql://postgres:postgres@localhost:5433/postgres?sslmode=disable \ + -e LAKEKEEPER__PG_DATABASE_URL_WRITE=postgresql://postgres:postgres@localhost:5433/postgres?sslmode=disable \ + -e LAKEKEEPER__AUTHZ_BACKEND=openfga \ + -e LAKEKEEPER__OPENFGA__ENDPOINT=http://localhost:8081 \ + -e LAKEKEEPER__OPENID_PROVIDER_URI=http://localhost:30080/realms/iceberg \ + -e LAKEKEEPER__OPENID_AUDIENCE=lakekeeper \ + -e LAKEKEEPER__OPENFGA__CLIENT_ID=openfga \ + -e LAKEKEEPER__OPENFGA__CLIENT_SECRET=xqE1vUrifVDKAZdLuz6JAnDxMYLdGu5z \ + -e LAKEKEEPER__OPENFGA__TOKEN_ENDPOINT=http://localhost:30080/realms/iceberg/protocol/openid-connect/token \ + -e LAKEKEEPER__METRICS_PORT=9002 \ + quay.io/lakekeeper/catalog:latest-main serve + + - name: Wait for Lakekeeper + run: | + max_attempts=30 + attempt=1 + while ! docker exec lakekeeper /home/nonroot/iceberg-catalog healthcheck; do + if [ $attempt -gt $max_attempts ]; then + echo "Lakekeeper failed to initialize after $max_attempts attempts" + exit 1 + fi + echo "Waiting for Lakekeeper to initialize (attempt $attempt/$max_attempts)..." + sleep 5 + attempt=$((attempt + 1)) + done + echo "Lakekeeper is healthy" + + - name: Populate Lakekeeper + env: + JAVA_HOME: /usr/lib/jvm/java-21-openjdk-amd64 + run: | + sudo apt-get install -y -qq python3-venv + python3 -m venv . 
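+          # the venv is created in the workspace root ("."), so its activate script lives at ./bin/activate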
+ source ./bin/activate + python3 -m pip install poetry + python3 -m pip install pyspark==3.5.0 + python3 -m pip install duckdb + python3 -m pip install pandas + python3 -m pip install setuptools + python3 scripts/lakekeeper/setup_lakekeeper_catalog.py + python3 scripts/data_generators/generate_data.py lakekeeper + + - name: Test with rest catalog + env: + LAKEKEEPER_SERVER_AVAILABLE: 1 + run: | + ./build/release/test/unittest "$PWD/test/*" --list-test-names-only || true + ./build/release/test/unittest "$PWD/test/*" + diff --git a/.github/workflows/PolarisTesting.yml b/.github/workflows/PolarisTesting.yml index 66ee86cef..d1646e418 100644 --- a/.github/workflows/PolarisTesting.yml +++ b/.github/workflows/PolarisTesting.yml @@ -9,7 +9,6 @@ defaults: env: BASE_BRANCH: ${{ github.base_ref || (endsWith(github.ref, '_feature') && 'feature' || 'main') }} - CMAKE_POLICY_VERSION_MINIMUM: 3.5 jobs: rest: @@ -55,11 +54,6 @@ jobs: sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose sudo chmod +x /usr/local/bin/docker-compose - - name: Install CMake 3.x - run: | - sudo apt-get remove -y cmake cmake-data - sudo apt-get install --allow-downgrades -y -qq 'cmake=3.*' 'cmake-data=3.*' - - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -86,14 +80,19 @@ jobs: # install java sudo apt install -y -qq openjdk-21-jre-headless sudo apt install -y -qq openjdk-21-jdk-headless - # install python virtual environment (is this needed?) sudo apt-get install -y -qq python3-venv - name: Wait for polaris initialization env: JAVA_HOME: /usr/lib/jvm/java-21-openjdk-amd64 run: | - make setup_polaris_ci + mkdir polaris_catalog + git clone https://github.com/apache/polaris.git polaris_catalog + cd polaris_catalog + ./gradlew clean :polaris-quarkus-server:assemble -Dquarkus.container-image.build=true --no-build-cache + ./gradlew --stop + nohup ./gradlew run > polaris-server.log 2> polaris-error.log & + cd .. # let polaris initialize max_attempts=50 attempt=1 @@ -132,4 +131,5 @@ jobs: run: | export POLARIS_CLIENT_ID=$(cat polaris_client_id.txt) export POLARIS_CLIENT_SECRET=$(cat polaris_client_secret.txt) - make test_release \ No newline at end of file + ./build/release/test/unittest "$PWD/test/*" --list-test-names-only || true + ./build/release/test/unittest "$PWD/test/*" diff --git a/Makefile b/Makefile index e98b0b8a9..1b0d4ad0f 100644 --- a/Makefile +++ b/Makefile @@ -23,14 +23,6 @@ data: data_clean start-rest-catalog data_large: data data_clean python3 scripts/data_generators/generate_data.py spark-rest local -# setup polaris server. See PolarisTesting.yml to see instructions for a specific machine. 
-setup_polaris_ci: - mkdir polaris_catalog - git clone https://github.com/apache/polaris.git polaris_catalog - cd polaris_catalog && ./gradlew clean :polaris-quarkus-server:assemble -Dquarkus.container-image.build=true --no-build-cache - cd polaris_catalog && ./gradlew --stop - cd polaris_catalog && nohup ./gradlew run > polaris-server.log 2> polaris-error.log & - data_clean: rm -rf data/generated diff --git a/scripts/data_generators/generate_data.py b/scripts/data_generators/generate_data.py index ee47f2306..8fac2fd53 100644 --- a/scripts/data_generators/generate_data.py +++ b/scripts/data_generators/generate_data.py @@ -1,6 +1,7 @@ from generate_spark_local.generate_iceberg_spark_local import IcebergSparkLocal from generate_spark_rest.generate_iceberg_spark_rest import IcebergSparkRest from generate_polaris_rest.generate_iceberg_polaris_rest import IcebergPolarisRest +from generate_lakekeeper_rest.generate_iceberg_lakekeeper_rest import IcebergLakekeeperRest import sys @@ -31,12 +32,24 @@ def GeneratePolarisData(): del conn +def GenerateLakekeeperData(): + db = IcebergLakekeeperRest() + conn = db.GetConnection() + db.GenerateTables(conn) + db.CloseConnection(conn) + del db + del conn + + if __name__ == "__main__": argv = sys.argv for i in range(1, len(argv)): if argv[i] == "polaris": print("generating polaris data") GeneratePolarisData() + elif argv[i] == "lakekeeper": + print("generating lakekeeper data") + GenerateLakekeeperData() elif argv[i] == "local": print("generating local iceberg data") GenerateSparkLocal() diff --git a/scripts/data_generators/generate_lakekeeper_rest/__init__.py b/scripts/data_generators/generate_lakekeeper_rest/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/scripts/data_generators/generate_lakekeeper_rest/__init__.py @@ -0,0 +1 @@ + diff --git a/scripts/data_generators/generate_lakekeeper_rest/generate_iceberg_lakekeeper_rest.py b/scripts/data_generators/generate_lakekeeper_rest/generate_iceberg_lakekeeper_rest.py new file mode 100644 index 000000000..e8e8619ee --- /dev/null +++ b/scripts/data_generators/generate_lakekeeper_rest/generate_iceberg_lakekeeper_rest.py @@ -0,0 +1,129 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
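+
+# Lakekeeper flavor of the data generators: connects Spark to the local
+# Lakekeeper REST catalog brought up by LakekeeperTesting.yml (Keycloak on
+# :30080, Lakekeeper on :8181, MinIO on :9000) and creates the Iceberg
+# tables that the lakekeeper tests read.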
+
+import os
+
+import duckdb
+import pyspark
+from pyspark.conf import SparkConf
+from pyspark.sql import SparkSession
+
+PARQUET_SRC_FILE = "scripts/data_generators/tmp_data/tmp.parquet"
+
+CATALOG_URL = "http://localhost:8181/catalog"
+MANAGEMENT_URL = "http://localhost:8181/management"
+KEYCLOAK_TOKEN_URL = "http://localhost:30080/realms/iceberg/protocol/openid-connect/token"
+WAREHOUSE = "demo"
+
+# client credentials for the "spark" client in the example Keycloak realm
+CLIENT_ID = "spark"
+CLIENT_SECRET = "2OR3eRvYfSZzzZ16MlPd95jhLnOaLM52"
+
+SPARK_VERSION = pyspark.__version__
+SPARK_MINOR_VERSION = '.'.join(SPARK_VERSION.split('.')[:2])
+ICEBERG_VERSION = "1.7.0"
+
+
+class IcebergLakekeeperRest:
+    def __init__(self):
+        pass
+
+    ###
+    ### Configure everyone's favorite Apache product
+    ###
+    def GetConnection(self):
+        conf = {
+            "spark.jars.packages": f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION},org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}",
+            "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+            "spark.sql.catalog.lakekeeper": "org.apache.iceberg.spark.SparkCatalog",
+            "spark.sql.catalog.lakekeeper.type": "rest",
+            "spark.sql.catalog.lakekeeper.uri": CATALOG_URL,
+            "spark.sql.catalog.lakekeeper.credential": f"{CLIENT_ID}:{CLIENT_SECRET}",
+            "spark.sql.catalog.lakekeeper.warehouse": WAREHOUSE,
+            "spark.sql.catalog.lakekeeper.scope": "lakekeeper",
+            "spark.sql.catalog.lakekeeper.oauth2-server-uri": KEYCLOAK_TOKEN_URL,
+        }
+
+        spark_config = SparkConf().setMaster('local').setAppName("Iceberg-REST")
+        for k, v in conf.items():
+            spark_config = spark_config.set(k, v)
+
+        spark = SparkSession.builder.config(conf=spark_config).getOrCreate()
+
+        spark.sql("USE lakekeeper")
+        spark.sql("CREATE NAMESPACE IF NOT EXISTS default")
+        spark.sql("USE NAMESPACE default")
+        return spark
+
+    def GetSQLFiles(self, table_dir):
+        sql_files = [f for f in os.listdir(table_dir) if f.endswith('.sql')]
+        sql_files.sort()  # execution order matters
+        return sql_files
+
+    def GetTableDirs(self):
+        dir = "./scripts/data_generators/generate_lakekeeper_rest/"
+        subdirectories = [d for d in os.listdir(dir) if os.path.isdir(dir + d) and d != "__pycache__"]
+        return subdirectories
+
+    def GetSetupFile(self, dir):
+        setup_files = [f for f in os.listdir(dir) if 'setup' in f.lower()]
+        if len(setup_files) == 0:
+            return ""
+        return setup_files[0]
+
+    def GenerateTPCH(self, con):
+        duckdb_con = duckdb.connect()
+        duckdb_con.execute("call dbgen(sf=1)")
+
+        # round-trip each TPC-H table through a Parquet file: DuckDB writes it,
+        # Spark reads it back and materializes it as an Iceberg table
+        for tbl in ['lineitem', 'customer', 'nation', 'orders', 'part', 'partsupp', 'region', 'supplier']:
+            create_statement = f"""
+            CREATE OR REPLACE TABLE default.{tbl}
+            TBLPROPERTIES (
+                'format-version'='2',
+                'write.update.mode'='merge-on-read'
+            )
+            AS SELECT * FROM parquet_file_view;
+            """
+            duckdb_con.execute(f"copy {tbl} to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
+            con.read.parquet(PARQUET_SRC_FILE).createOrReplaceTempView('parquet_file_view')
+            con.sql(create_statement)
+
+    def GenerateTables(self, con):
+        # generate the TPC-H tables, then a small table in a second namespace
+        self.GenerateTPCH(con)
+        con.sql("CREATE NAMESPACE IF NOT EXISTS COLLADO_TEST")
+        con.sql("USE NAMESPACE COLLADO_TEST")
+        con.sql(
+            """
+            CREATE TABLE IF NOT EXISTS quickstart_table (
+                id BIGINT, data STRING
+            )
+            USING ICEBERG
+            """
+        )
+        con.sql("INSERT INTO quickstart_table VALUES (1, 'some data'), (2, 'more data'), (3, 'yet more data')")
+
+    def CloseConnection(self, con):
+        del con
diff --git a/scripts/data_generators/generate_polaris_rest/generate_iceberg_polaris_rest.py b/scripts/data_generators/generate_polaris_rest/generate_iceberg_polaris_rest.py
index 9f658c2a0..b4a5f0820 100644
--- a/scripts/data_generators/generate_polaris_rest/generate_iceberg_polaris_rest.py
+++ b/scripts/data_generators/generate_polaris_rest/generate_iceberg_polaris_rest.py
@@ -25,7 +25,6 @@
 import os
 from pyspark import SparkContext
 from pathlib import Path
-import duckdb
 import shutil
 
 DATA_GENERATION_DIR = f"./data/generated/iceberg/polaris-rest/"
diff --git a/scripts/data_generators/generate_spark_local/generate_iceberg_spark_local.py b/scripts/data_generators/generate_spark_local/generate_iceberg_spark_local.py
index b00a00a5c..b5ffed535 100644
--- a/scripts/data_generators/generate_spark_local/generate_iceberg_spark_local.py
+++ b/scripts/data_generators/generate_spark_local/generate_iceberg_spark_local.py
@@ -2,11 +2,9 @@
 import pyspark
 import pyspark.sql
 import sys
-import duckdb
 import os
 from pyspark import SparkContext
 from pathlib import Path
-import duckdb
 import shutil
 
 
diff --git a/scripts/data_generators/generate_spark_rest/generate_iceberg_spark_rest.py b/scripts/data_generators/generate_spark_rest/generate_iceberg_spark_rest.py
index 37cb5f6e6..8091bc946 100644
--- a/scripts/data_generators/generate_spark_rest/generate_iceberg_spark_rest.py
+++ b/scripts/data_generators/generate_spark_rest/generate_iceberg_spark_rest.py
@@ -25,7 +25,6 @@
 import os
 from pyspark import SparkContext
 from pathlib import Path
-import duckdb
 import shutil
 
 DATA_GENERATION_DIR = f"./data/generated/iceberg/spark-rest/"
diff --git a/scripts/lakekeeper/setup_lakekeeper_catalog.py b/scripts/lakekeeper/setup_lakekeeper_catalog.py
new file mode 100644
index 000000000..d7891ec24
--- /dev/null
+++ b/scripts/lakekeeper/setup_lakekeeper_catalog.py
@@ -0,0 +1,137 @@
+import requests
+
+import pyspark
+from pyspark.conf import SparkConf
+from pyspark.sql import SparkSession
+import pandas as pd
+
+CATALOG_URL = "http://localhost:8181/catalog"
+MANAGEMENT_URL = "http://localhost:8181/management"
+KEYCLOAK_TOKEN_URL = "http://localhost:30080/realms/iceberg/protocol/openid-connect/token"
+WAREHOUSE = "demo"
+
+CLIENT_ID = "spark"
+CLIENT_SECRET = "2OR3eRvYfSZzzZ16MlPd95jhLnOaLM52"
+
+SPARK_VERSION = pyspark.__version__
+SPARK_MINOR_VERSION = '.'.join(SPARK_VERSION.split('.')[:2])
+ICEBERG_VERSION = "1.7.0"
+
+# Get an access token from Keycloak (the IdP)
+response = requests.post(
+    url=KEYCLOAK_TOKEN_URL,
+    data={
+        "grant_type": "client_credentials",
+        "client_id": CLIENT_ID,
+        "client_secret": CLIENT_SECRET,
+        "scope": "lakekeeper",
+    },
+    headers={"Content-type": "application/x-www-form-urlencoded"},
+)
+response.raise_for_status()
+access_token = response.json()['access_token']
+print("obtained access token from Keycloak")  # avoid echoing the raw token into CI logs
+
+# Check the 'info' endpoint of the Lakekeeper management API
+
+response = requests.get(
+    url=f"{MANAGEMENT_URL}/v1/info",
+    headers={"Authorization": f"Bearer {access_token}"},
+)
+response.raise_for_status()
+print(response.json())
+
+# Bootstrap the fresh Lakekeeper instance
+
+response = requests.post(
+    url=f"{MANAGEMENT_URL}/v1/bootstrap",
+    headers={"Authorization": f"Bearer {access_token}"},
+    json={
+        "accept-terms-of-use": True,
+        # Optionally, we can override the name / type of the user:
+        # "user-email": "user@example.com",
+        # "user-name": "Roald Amundsen",
+        # "user-type": "human"
+    },
+)
+response.raise_for_status()
+
+# Grant the example user server- and project-admin rights
+
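+# NOTE: the UUID below is presumably the user pre-provisioned by the example
+# realm.json that the workflow mounts into Keycloak; Lakekeeper addresses
+# OIDC principals as "oidc~<subject-id>".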
+response = requests.post( + url=f"{MANAGEMENT_URL}/v1/permissions/server/assignments", + headers={"Authorization": f"Bearer {access_token}"}, + json={"writes": [{"type": "admin", "user": "oidc~cfb55bf6-fcbb-4a1e-bfec-30c6649b52f8"}]}, +) +response.raise_for_status() + +response = requests.post( + url=f"{MANAGEMENT_URL}/v1/permissions/project/assignments", + headers={"Authorization": f"Bearer {access_token}"}, + json={"writes": [{"type": "project_admin", "user": "oidc~cfb55bf6-fcbb-4a1e-bfec-30c6649b52f8"}]}, +) +response.raise_for_status() + +# Check the users, should have a result + +response = requests.get( + url=f"{MANAGEMENT_URL}/v1/user", + headers={"Authorization": f"Bearer {access_token}"}, +) +response.raise_for_status() +print(response.json()) + +# Create a warehouse + +response = requests.post( + url=f"{MANAGEMENT_URL}/v1/warehouse", + headers={"Authorization": f"Bearer {access_token}"}, + json={ + "warehouse-name": WAREHOUSE, + "storage-profile": { + "type": "s3", + "bucket": "examples", + "key-prefix": "initial-warehouse", + "endpoint": "http://localhost:9000", + "region": "local-01", + "path-style-access": True, + "flavor": "minio", + "sts-enabled": True, + }, + "storage-credential": { + "type": "s3", + "credential-type": "access-key", + "aws-access-key-id": "minio-root-user", + "aws-secret-access-key": "minio-root-password", + }, + }, +) +response.raise_for_status() +print(response.json()) + +# Populate the warehouse with Spark + +conf = { + "spark.jars.packages": f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION},org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}", + "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", + "spark.sql.catalog.lakekeeper": "org.apache.iceberg.spark.SparkCatalog", + "spark.sql.catalog.lakekeeper.type": "rest", + "spark.sql.catalog.lakekeeper.uri": CATALOG_URL, + "spark.sql.catalog.lakekeeper.credential": f"{CLIENT_ID}:{CLIENT_SECRET}", + "spark.sql.catalog.lakekeeper.warehouse": WAREHOUSE, + "spark.sql.catalog.lakekeeper.scope": "lakekeeper", + "spark.sql.catalog.lakekeeper.oauth2-server-uri": KEYCLOAK_TOKEN_URL, +} + +spark_config = SparkConf().setMaster('local').setAppName("Iceberg-REST") +for k, v in conf.items(): + spark_config = spark_config.set(k, v) + +spark = SparkSession.builder.config(conf=spark_config).getOrCreate() + +spark.sql("USE lakekeeper") + +spark.sql(f"CREATE NAMESPACE IF NOT EXISTS my_namespace") +data = pd.DataFrame([[1, 'a-string', 2.2]], columns=['id', 'strings', 'floats']) +sdf = spark.createDataFrame(data) +sdf.writeTo(f"my_namespace.my_table").createOrReplace() diff --git a/test/sql/local/irc/test_lakekeeper.test b/test/sql/local/irc/test_lakekeeper.test new file mode 100644 index 000000000..99e90867b --- /dev/null +++ b/test/sql/local/irc/test_lakekeeper.test @@ -0,0 +1,33 @@ +# name: test/sql/local/irc/test_lakekeeper.test +# group: [irc] + +require httpfs + +require avro + +require parquet + +require iceberg + +require-env LAKEKEEPER_SERVER_AVAILABLE + +statement ok +create secret my_secret ( + TYPE s3, + KEY_ID 'spark', + SECRET '2OR3eRvYfSZzzZ16MlPd95jhLnOaLM52' +) + +statement ok +attach 'demo' as my_datalake ( + type ICEBERG, + ENDPOINT 'http://localhost:8181/catalog', + OAUTH2_SCOPE 'lakekeeper', + OAUTH2_SERVER_URI 'http://localhost:30080/realms/iceberg/protocol/openid-connect/token', + SECRET 'my_secret' +); + +query III +select * from my_datalake.my_namespace.my_table; +---- +1 a-string 2.2 diff --git 
a/test/sql/local/irc/test_lakekeeper_tpch.test b/test/sql/local/irc/test_lakekeeper_tpch.test new file mode 100644 index 000000000..b0bdce8c7 --- /dev/null +++ b/test/sql/local/irc/test_lakekeeper_tpch.test @@ -0,0 +1,55 @@ +# name: test/sql/local/irc/test_lakekeeper_tpch.test +# description: test integration with iceberg catalog read +# group: [irc] + +require avro + +require parquet + +require httpfs + +require iceberg + +require aws + +require tpch + +require-env LAKEKEEPER_SERVER_AVAILABLE + +statement ok +create secret my_secret ( + TYPE s3, + KEY_ID 'spark', + SECRET '2OR3eRvYfSZzzZ16MlPd95jhLnOaLM52' +) + +statement ok +attach 'demo' as my_datalake ( + type ICEBERG, + ENDPOINT 'http://localhost:8181/catalog', + OAUTH2_SCOPE 'lakekeeper', + OAUTH2_SERVER_URI 'http://localhost:30080/realms/iceberg/protocol/openid-connect/token', + SECRET 'my_secret' +); + +statement ok +use my_datalake.default; + +# FIXME: run smaller scale with full dataset +loop i 1 9 + +query I +PRAGMA tpch(${i}) +---- +:duckdb/extension/tpch/dbgen/answers/sf1/q0${i}.csv + +endloop + +loop i 10 23 + +query I +PRAGMA tpch(${i}) +---- +:duckdb/extension/tpch/dbgen/answers/sf1/q${i}.csv + +endloop \ No newline at end of file