diff --git a/images/pyspark-notebook/Dockerfile b/images/pyspark-notebook/Dockerfile
index ee7a780ae2..37cceb0ce1 100644
--- a/images/pyspark-notebook/Dockerfile
+++ b/images/pyspark-notebook/Dockerfile
@@ -15,10 +15,10 @@ USER root
 # Spark dependencies
 # Default values can be overridden at build time
 # (ARGS are in lower case to distinguish them from ENV)
-ARG spark_version="3.4.1"
+ARG spark_version="3.5.0"
 ARG hadoop_version="3"
 ARG scala_version
-ARG spark_checksum="5a21295b4c3d1d3f8fc85375c711c7c23e3eeb3ec9ea91778f149d8d321e3905e2f44cf19c69a28df693cffd536f7316706c78932e7e148d224424150f18b2c5"
+ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
 ARG openjdk_version="17"
 
 ENV APACHE_SPARK_VERSION="${spark_version}" \
@@ -66,9 +66,14 @@ RUN fix-permissions "/etc/ipython/"
 USER ${NB_UID}
 
 # Install pyarrow
-# Temporarily pin pandas to version 1.5.3, see: https://github.com/jupyter/docker-stacks/issues/1924
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
 RUN mamba install --yes \
-    'pandas>=1.5.3,<2.0.0' \
+    'pandas=2.0.3' \
     'pyarrow' && \
     mamba clean --all -f -y && \
     fix-permissions "${CONDA_DIR}" && \
diff --git a/tests/pyspark-notebook/units/unit_pandas_version.py b/tests/pyspark-notebook/units/unit_pandas_version.py
index 1728effa35..03920db4b4 100644
--- a/tests/pyspark-notebook/units/unit_pandas_version.py
+++ b/tests/pyspark-notebook/units/unit_pandas_version.py
@@ -2,4 +2,4 @@
 # Distributed under the terms of the Modified BSD License.
 import pandas
 
-assert pandas.__version__ == "1.5.3"
+assert pandas.__version__ == "2.0.3"
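
The Dockerfile comment above pins pandas to the version that the pandas API on Spark is built against (found in `spark/dev/infra/Dockerfile` on the matching Spark branch). A minimal sketch, not part of this patch, of how one might verify that pin inside the built image: the `2.0.3` literal mirrors the pin above, and `pyspark.pandas` is the pandas API on Spark shipped with Spark 3.2+.

```python
# Run inside the built pyspark-notebook image to confirm the installed pandas
# matches the pin and that the pandas API on Spark works against it.
import pandas
import pyspark.pandas as ps  # pandas API on Spark

# Same check as tests/pyspark-notebook/units/unit_pandas_version.py
assert pandas.__version__ == "2.0.3", pandas.__version__

# Smoke test: a round trip through pyspark.pandas exercises the
# pandas/pyarrow integration that the pin is meant to keep compatible.
psdf = ps.DataFrame({"a": [1, 2, 3]})
assert psdf.to_pandas()["a"].sum() == 6
```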