From 27719fd60b6632b47b6bb0d658247978ff1403a4 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 24 Jun 2025 11:21:45 +0800 Subject: [PATCH 01/16] test --- .github/workflows/build_python_minimum.yml | 2 +- dev/spark-test-image/python-minimum/Dockerfile | 6 ++++-- python/docs/source/getting_started/install.rst | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_python_minimum.yml b/.github/workflows/build_python_minimum.yml index 4e65503006489..3514a82f6217c 100644 --- a/.github/workflows/build_python_minimum.yml +++ b/.github/workflows/build_python_minimum.yml @@ -38,7 +38,7 @@ jobs: envs: >- { "PYSPARK_IMAGE_TO_TEST": "python-minimum", - "PYTHON_TO_TEST": "python3.9" + "PYTHON_TO_TEST": "python3.10" } jobs: >- { diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 59d9ebed4e40f..3f8f2f3218f64 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -24,11 +24,12 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE=20250327 +ENV FULL_REFRESH_DATE=20250624 ENV DEBIAN_FRONTEND=noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Should keep the installation consistent with https://apache.github.io/spark/api/python/getting_started/install.html RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ @@ -52,6 +53,8 @@ RUN apt-get update && apt-get install -y \ libxml2-dev \ openjdk-17-jdk-headless \ pkg-config \ + python3.10 \ + python3-psutil \ qpdf \ tzdata \ software-properties-common \ @@ -59,7 +62,6 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -# Should keep the installation consistent with https://apache.github.io/spark/api/python/getting_started/install.html # Install Python 3.9 RUN add-apt-repository ppa:deadsnakes/ppa diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 73a80ce014a83..7e1a87eafffd4 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -30,7 +30,7 @@ and building from the source. Python Versions Supported ------------------------- -Python 3.9 and above. +Python 3.10 and above. Using PyPI @@ -143,7 +143,7 @@ the same session as pyspark (you can install in several steps too). .. code-block:: bash - conda install -c conda-forge pyspark # can also add "python=3.9 some_package [etc.]" here + conda install -c conda-forge pyspark # can also add "python=3.10 some_package [etc.]" here Note that `PySpark for conda `_ is maintained separately by the community; while new versions generally get packaged quickly, the From c65a576f759734e71e7a696931d4442a5fe7646b Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 24 Jun 2025 11:22:20 +0800 Subject: [PATCH 02/16] fix --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7580393ec0635..a116dcc04b426 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -41,7 +41,7 @@ on: description: Additional environment variables to set when running the tests. Should be in JSON format. required: false type: string - default: '{"PYSPARK_IMAGE_TO_TEST": "python-311", "PYTHON_TO_TEST": "python3.11"}' + default: '{"PYSPARK_IMAGE_TO_TEST": "python-minimum", "PYTHON_TO_TEST": "python3.10"}' jobs: description: >- Jobs to run, and should be in JSON format. The values should be matched with the job's key defined From d651304253f9ccc2080dfe6637f6e4fb89c68f16 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 24 Jun 2025 11:25:16 +0800 Subject: [PATCH 03/16] fix --- dev/spark-test-image/python-minimum/Dockerfile | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 3f8f2f3218f64..9eaa9892a632a 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,22 +62,11 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev - -# Install Python 3.9 -RUN add-apt-repository ppa:deadsnakes/ppa -RUN apt-get update && apt-get install -y \ - python3.9 \ - python3.9-distutils \ - && apt-get autoremove --purge -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - - ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.0.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" # Install Python 3.9 packages -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 -RUN python3.9 -m pip install --force $BASIC_PIP_PKGS $CONNECT_PIP_PKGS && \ - python3.9 -m pip cache purge +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 +RUN python3.10 -m pip install --force $BASIC_PIP_PKGS $CONNECT_PIP_PKGS && \ + python3.10 -m pip cache purge From ca91696849541e84d9d90882c56d70c11aad2139 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Wed, 25 Jun 2025 11:37:44 +0800 Subject: [PATCH 04/16] also try to upgrade pandas --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 9eaa9892a632a..9d50c631254ab 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.0.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From 37a9ec84b631ed7bc820261382af5dfb3fca5ac3 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Wed, 25 Jun 2025 11:57:44 +0800 Subject: [PATCH 05/16] fix numpy --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 9d50c631254ab..3e1b8a3796ed3 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.22 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From c849b4f8208f6e8f02287f4b9d1eaf254ac5ee75 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Wed, 25 Jun 2025 12:05:14 +0800 Subject: [PATCH 06/16] fix numpy --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 3e1b8a3796ed3..8b3f118bd4e5a 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.22 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.23 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From 81b560e33242c99821d82a00615115ec3946ca35 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 26 Jun 2025 12:28:24 +0800 Subject: [PATCH 07/16] try pandas 203 --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 8b3f118bd4e5a..cd6f683f430db 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.23 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.0.3 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From 1010ea461e1da2d2eac193e0d492678ac4f2f9b2 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 3 Jul 2025 10:52:42 +0800 Subject: [PATCH 08/16] test pandas 2.2.0 --- dev/spark-test-image/python-minimum/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index cd6f683f430db..3f798ebc86786 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE=20250624 +ENV FULL_REFRESH_DATE=20250703 ENV DEBIAN_FRONTEND=noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN=true @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.0.3 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From d1254e4b302d7fb269ecc544d8ba8a4c43f5c0b1 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 3 Jul 2025 11:19:59 +0800 Subject: [PATCH 09/16] test numpy 1.22.4 --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 3f798ebc86786..c21786ed3e695 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.21 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From 497c02345fe692707245260927fcff47d551e1d7 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 3 Jul 2025 12:25:33 +0800 Subject: [PATCH 10/16] test numpy 2 --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index c21786ed3e695..8207a40f4b51f 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==2.0.0 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From 28f5a67d0ed526ab5f155db10eadbe16cfb57ec7 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 3 Jul 2025 16:38:36 +0800 Subject: [PATCH 11/16] revert numpy 2 --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 8207a40f4b51f..c21786ed3e695 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==2.0.0 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From da6451b47ca6becc96d58e4bfd123f38ed932b31 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 3 Jul 2025 16:44:34 +0800 Subject: [PATCH 12/16] test pandas 2.1 --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index c21786ed3e695..a32090d842c2e 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From 1ae00b3a18cfff864a5c2abeacdb92a85e2f85f2 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 4 Jul 2025 08:08:41 +0800 Subject: [PATCH 13/16] revert pandas 2.1 --- dev/spark-test-image/python-minimum/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index a32090d842c2e..0b688b9de0e2f 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -61,8 +61,7 @@ RUN apt-get update && apt-get install -y \ wget \ zlib1g-dev - -ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.1.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From 505becf7a9321940a1fa779eea99b67c3412c16c Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 4 Jul 2025 08:09:51 +0800 Subject: [PATCH 14/16] nit --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 0b688b9de0e2f..ab4ec9895a3a1 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -61,7 +61,7 @@ RUN apt-get update && apt-get install -y \ wget \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.22 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From 86fa47bce0c6d3a125f2038b069b84f54a2f765e Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 4 Jul 2025 10:31:12 +0800 Subject: [PATCH 15/16] numpy 1.22.4 --- dev/spark-test-image/python-minimum/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index ab4ec9895a3a1..0b688b9de0e2f 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -61,7 +61,7 @@ RUN apt-get update && apt-get install -y \ wget \ zlib1g-dev -ARG BASIC_PIP_PKGS="numpy==1.22 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" +ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==11.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting" # Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 googleapis-common-protos==1.65.0 graphviz==0.20 protobuf" From c22ee2cad3118c0e899021e12ad525adfaf7f712 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 4 Jul 2025 12:47:09 +0800 Subject: [PATCH 16/16] restore PR builder --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a116dcc04b426..7580393ec0635 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -41,7 +41,7 @@ on: description: Additional environment variables to set when running the tests. Should be in JSON format. required: false type: string - default: '{"PYSPARK_IMAGE_TO_TEST": "python-minimum", "PYTHON_TO_TEST": "python3.10"}' + default: '{"PYSPARK_IMAGE_TO_TEST": "python-311", "PYTHON_TO_TEST": "python3.11"}' jobs: description: >- Jobs to run, and should be in JSON format. The values should be matched with the job's key defined