diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9c4d2a6e..b22840bb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,6 +20,8 @@ jobs:
         test: spark3-iceberg
       - image: spark3-delta
         test: spark3-delta
+      - image: spark3-hudi
+        test: spark3-hudi
       - image: kerberos
         test: kerberos
       - image: gpdb-6
diff --git a/etc/compose/spark3-hudi/docker-compose.yml b/etc/compose/spark3-hudi/docker-compose.yml
new file mode 100644
index 00000000..55a183df
--- /dev/null
+++ b/etc/compose/spark3-hudi/docker-compose.yml
@@ -0,0 +1,4 @@
+version: '2.0'
+services:
+  spark:
+    image: testing/spark3-hudi:latest
diff --git a/testing/spark3-hudi/Dockerfile b/testing/spark3-hudi/Dockerfile
new file mode 100644
index 00000000..29742be3
--- /dev/null
+++ b/testing/spark3-hudi/Dockerfile
@@ -0,0 +1,55 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM testing/centos7-oj11:unlabelled
+
+ARG SPARK_VERSION=3.2.1
+ARG HADOOP_VERSION=3.2
+ARG HUDI_VERSION=0.11.1
+ARG SCALA_VERSION=2.12
+
+ARG SPARK_ARTIFACT="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
+
+ENV SPARK_HOME=/spark
+
+RUN set -xeu; \
+    wget -nv "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARTIFACT}.tgz"; \
+    tar -xf ${SPARK_ARTIFACT}.tgz; \
+    rm ${SPARK_ARTIFACT}.tgz; \
+    ln -sn /${SPARK_ARTIFACT} ${SPARK_HOME}
+
+WORKDIR ${SPARK_HOME}/jars
+
+# install AWS SDK so we can access S3; the version must match the hadoop-* jars which are part of SPARK distribution
+RUN wget -nv "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar"
+RUN wget -nv "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.48/aws-java-sdk-bundle-1.12.48.jar"
+
+# install Hudi
+RUN wget -nv "https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark3-bundle_${SCALA_VERSION}/${HUDI_VERSION}/hudi-spark3-bundle_${SCALA_VERSION}-${HUDI_VERSION}.jar"
+
+# Create Hive user to match Hive container
+RUN adduser hive
+
+ENV PATH="${SPARK_HOME}/bin:${PATH}"
+
+EXPOSE 10213
+
+# NOTE(review): curl -f assumes an HTTP endpoint, but the Spark Thrift server defaults to the binary Thrift transport — confirm this probe actually succeeds
+HEALTHCHECK --interval=10s --timeout=5s --start-period=10s \
+  CMD curl -f http://localhost:10213/
+# exec form so spark-submit runs as PID 1 and receives SIGTERM from `docker stop` directly
+CMD ["spark-submit", \
+     "--master", "local[*]", \
+     "--class", "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2", \
+     "--name", "Thrift JDBC/ODBC Server", \
+     "--conf", "spark.hive.server2.thrift.port=10213", \
+     "spark-internal"]