# Upgrade the flytekit Spark plugin image, install script and pyspark floor to Spark 3.4.0.
#
# hadoop-aws is pinned to 3.3.4 (not 3.4.0): it has to be the same version as the Hadoop
# client jars bundled with Spark 3.4.0, and 3.3.4 is the release built against the
# aws-java-sdk-bundle 1.12.262 (AWS SDK v1) jar that this patch installs next to it.
#
# NOTE(review): this patch was reconstructed from a whitespace-collapsed copy. Blank context
# lines were placed from the hunk line counts, and continuation indents / checksum separators
# were restored to their conventional form -- confirm against the target files, or apply with
# `git apply --ignore-whitespace`. The `index` blob ids are carried over unchanged and no
# longer describe the edited post-images.
diff --git a/plugins/flytekit-spark/Dockerfile b/plugins/flytekit-spark/Dockerfile
index 59ca1dfdac..06891d2679 100644
--- a/plugins/flytekit-spark/Dockerfile
+++ b/plugins/flytekit-spark/Dockerfile
@@ -1,5 +1,5 @@
 # https://github.com/apache/spark/blob/master/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile
-FROM apache/spark-py:3.3.1
+FROM apache/spark-py:v3.4.0
 LABEL org.opencontainers.image.source=https://github.com/flyteorg/flytekit
 
 USER 0
@@ -12,11 +12,17 @@ ARG VERSION
 RUN pip install uv --no-cache-dir \
   && uv pip install --system --no-cache-dir -U flytekitplugins-spark==$VERSION flytekit==$VERSION
 
-RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.2/hadoop-aws-3.2.2.jar -P /opt/spark/jars && \
-    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar -P /opt/spark/jars
+# hadoop-aws must match the Hadoop client jars bundled with Spark 3.4.0 (Hadoop 3.3.4);
+# hadoop-aws 3.3.4 is built against aws-java-sdk-bundle 1.12.262 (AWS SDK v1).
+RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -P /opt/spark/jars && \
+    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar -P /opt/spark/jars && \
+    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.4.2/iceberg-spark-runtime-3.4_2.12-1.4.2.jar -P /opt/spark/jars && \
+    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/1.4.2/iceberg-aws-bundle-1.4.2.jar -P /opt/spark/jars
 
 RUN wget https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.17/util-hadoop-hadoop3-2.2.17.jar -P /opt/spark/jars
 
 RUN chown -R ${spark_uid}:${spark_uid} /root
+# Ability to write to jars directory
+RUN chown -R ${spark_uid}:${spark_uid} /opt/spark/jars
 WORKDIR /root
 USER ${spark_uid}
diff --git a/plugins/flytekit-spark/scripts/flytekit_install_spark3.sh b/plugins/flytekit-spark/scripts/flytekit_install_spark3.sh
old mode 100644
new mode 100755
index a2c32f876c..49d48f9cc4
--- a/plugins/flytekit-spark/scripts/flytekit_install_spark3.sh
+++ b/plugins/flytekit-spark/scripts/flytekit_install_spark3.sh
@@ -22,8 +22,8 @@ mkdir -p /opt/spark/work-dir
 touch /opt/spark/RELEASE
 
 # Fetch Spark Distribution
-wget https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz -O spark-dist.tgz
-echo '224e058cb0c6fb68b39896427a3ccd11ae2246e9bf465b5e29e4fb192d39a59c  spark-dist.tgz' | sha256sum --check
+wget https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz -O spark-dist.tgz
+echo '67bc912e9192ef2159540cb480820e5466dfd91e907c97c5a4787587e3020be042b76c40c51854f2a5dbeb8c3775fe12d9021c1200c4704463ec644132243a69  spark-dist.tgz' > spark-dist.tgz.sha512 && sha512sum --check spark-dist.tgz.sha512
 mkdir -p spark-dist
 tar -xvf spark-dist.tgz -C spark-dist --strip-components 1
 
@@ -43,5 +43,6 @@ rm -rf spark-dist
 
 # Hadoop dist (via Apache) has older AWS SDK version. Fetch required AWS jars from maven directly (not-ideal) to support IAM role
 # https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts-minimum-sdk.html
-wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar -P /opt/spark/jars
-wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.901/aws-java-sdk-bundle-1.11.901.jar -P /opt/spark/jars
+# hadoop-aws must match the Hadoop version bundled with the Spark distribution fetched above (3.3.4 for Spark 3.4.0).
+wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -P /opt/spark/jars
+wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar -P /opt/spark/jars
diff --git a/plugins/flytekit-spark/setup.py b/plugins/flytekit-spark/setup.py
index e591980d20..892f5fba87 100644
--- a/plugins/flytekit-spark/setup.py
+++ b/plugins/flytekit-spark/setup.py
@@ -4,7 +4,7 @@
 
 microlib_name = f"flytekitplugins-{PLUGIN_NAME}"
 
-plugin_requires = ["flytekit>=1.15.1", "pyspark>=3.0.0", "aiohttp", "flyteidl>=1.11.0b1", "pandas"]
+plugin_requires = ["flytekit>=1.15.1", "pyspark>=3.4.0", "aiohttp", "flyteidl>=1.11.0b1", "pandas"]
 
 __version__ = "0.0.0+develop"
 