Commit 6ac69bc

use spark connect
1 parent 23f4736 commit 6ac69bc

11 files changed: +150 -250 lines changed

.gitignore

Lines changed: 0 additions & 7 deletions
@@ -37,13 +37,6 @@ coverage.xml
 bin/
 .vscode/
 
-# Hive/metastore files
-metastore_db/
-
-# Spark/metastore files
-spark-warehouse/
-derby.log
-
 # Python stuff
 .mypy_cache/
 htmlcov

Makefile

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 # Configuration Variables
 # ========================
 
-PYTEST_ARGS ?= -vvv -s -x # Override with e.g. PYTEST_ARGS="-vv --tb=short"
+PYTEST_ARGS ?= -v -x # Override with e.g. PYTEST_ARGS="-vv --tb=short"
 COVERAGE ?= 0 # Set COVERAGE=1 to enable coverage: make test COVERAGE=1
 COVERAGE_FAIL_UNDER ?= 85 # Minimum coverage % to pass: make coverage-report COVERAGE_FAIL_UNDER=70
 KEEP_COMPOSE ?= 0 # Set KEEP_COMPOSE=1 to keep containers after integration tests
@@ -93,7 +93,7 @@ lint: ## Run code linters via pre-commit
 ##@ Testing
 
 test: ## Run all unit tests (excluding integration)
-	$(TEST_RUNNER) pytest tests/ -m "(unmarked or parametrize) and not integration" $(PYTEST_ARGS)
+	$(TEST_RUNNER) pytest tests/io/test_pyarrow.py -m "(unmarked or parametrize) and not integration" $(PYTEST_ARGS)
 
 test-integration: test-integration-setup test-integration-exec test-integration-cleanup ## Run integration tests

dev/Dockerfile

Lines changed: 18 additions & 5 deletions
@@ -36,25 +36,38 @@ ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$
 RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
 WORKDIR ${SPARK_HOME}
 
-# Remember to also update `tests/conftest`'s spark setting
 ENV SPARK_VERSION=3.5.6
-ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
-ENV ICEBERG_VERSION=1.9.1
+ENV SCALA_VERSION=2.12
+ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_${SCALA_VERSION}
+ENV ICEBERG_VERSION=1.9.2
 ENV PYICEBERG_VERSION=0.10.0
+ENV HADOOP_VERSION=3.3.4
+ENV AWS_SDK_VERSION=1.12.753
 
-RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
+RUN curl --retry 5 -s -C - https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
  && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
  && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz
 
+# Download Spark Connect server JAR
+RUN curl --retry 5 -s -L https://repo1.maven.org/maven2/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
+ -Lo /opt/spark/jars/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar
+
 # Download iceberg spark runtime
 RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
 -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
 
-
 # Download AWS bundle
 RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
 -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
 
+# Download hadoop-aws (required for S3 support)
+RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
+ -Lo /opt/spark/jars/hadoop-aws-${HADOOP_VERSION}.jar
+
+# Download AWS SDK bundle
+RUN curl --retry 5 -s https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar \
+ -Lo /opt/spark/jars/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar
+
 COPY spark-defaults.conf /opt/spark/conf
 ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"

dev/docker-compose-integration.yml

Lines changed: 2 additions & 2 deletions
@@ -31,8 +31,8 @@ services:
       - AWS_SECRET_ACCESS_KEY=password
       - AWS_REGION=us-east-1
     ports:
-      - 8888:8888
-      - 8080:8080
+      - 15002:15002 # Spark Connect
+      - 4040:4040 # Spark UI
     links:
       - rest:rest
       - hive:hive

dev/entrypoint.sh

Lines changed: 1 addition & 3 deletions
@@ -18,8 +18,6 @@
 # under the License.
 #
 
-start-master.sh -p 7077
-start-worker.sh spark://spark-iceberg:7077
-start-history-server.sh
+start-connect-server.sh
 
 tail -f /dev/null
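
Note: with the standalone master, worker, and history server replaced by start-connect-server.sh, clients attach over Spark Connect rather than a spark:// master URL. A minimal client-side sketch, assuming pyspark[connect] is installed on the host and the compose stack is running; the actual session setup in tests/conftest may differ:

from pyspark.sql import SparkSession

# Open a Spark Connect session against the server started by entrypoint.sh;
# sc://localhost:15002 matches the 15002:15002 mapping in docker-compose-integration.yml.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# Quick sanity check that the server is reachable and can execute SQL.
spark.sql("SELECT 1 AS ok").show()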

dev/spark-defaults.conf

Lines changed: 17 additions & 2 deletions
@@ -16,20 +16,35 @@
 #
 
 spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+
+# Configure Iceberg REST catalog
 spark.sql.catalog.rest org.apache.iceberg.spark.SparkCatalog
 spark.sql.catalog.rest.type rest
 spark.sql.catalog.rest.uri http://rest:8181
 spark.sql.catalog.rest.io-impl org.apache.iceberg.aws.s3.S3FileIO
 spark.sql.catalog.rest.warehouse s3://warehouse/rest/
 spark.sql.catalog.rest.s3.endpoint http://minio:9000
+spark.sql.catalog.rest.cache-enabled false
+
+# Configure Iceberg Hive catalog
 spark.sql.catalog.hive org.apache.iceberg.spark.SparkCatalog
 spark.sql.catalog.hive.type hive
 spark.sql.catalog.hive.uri thrift://hive:9083
 spark.sql.catalog.hive.io-impl org.apache.iceberg.aws.s3.S3FileIO
 spark.sql.catalog.hive.warehouse s3://warehouse/hive/
 spark.sql.catalog.hive.s3.endpoint http://minio:9000
+
+# Configure Spark's default session catalog (spark_catalog) to use Iceberg backed by the Hive Metastore
+spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
+spark.sql.catalog.spark_catalog.type hive
+spark.sql.catalog.spark_catalog.uri thrift://hive:9083
+spark.hadoop.fs.s3a.endpoint http://minio:9000
+spark.sql.catalogImplementation hive
+spark.sql.warehouse.dir s3a://warehouse/hive/
+
 spark.sql.defaultCatalog rest
+
+# Configure Spark UI and event logging
+spark.ui.enabled true
 spark.eventLog.enabled true
 spark.eventLog.dir /home/iceberg/spark-events
-spark.history.fs.logDirectory /home/iceberg/spark-events
-spark.sql.catalogImplementation in-memory
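
Note: since spark.sql.defaultCatalog is set to rest and the rest, hive, and spark_catalog entries are all Iceberg-aware catalogs, a connected client can exercise the stack with plain SQL. A small illustrative sketch continuing the session from the previous note; the default namespace and connect_smoke table name are made up for the example:

# Unqualified names resolve against the Iceberg REST catalog configured above.
spark.sql("CREATE NAMESPACE IF NOT EXISTS rest.default")
spark.sql("CREATE TABLE IF NOT EXISTS rest.default.connect_smoke (id BIGINT, data STRING) USING iceberg")
spark.sql("INSERT INTO rest.default.connect_smoke VALUES (1, 'a')")
spark.sql("SELECT * FROM rest.default.connect_smoke").show()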

0 commit comments
