2 changes: 1 addition & 1 deletion .github/workflows/build_python.yml
@@ -17,7 +17,7 @@
 # under the License.
 #

-name: "Build / Python-only (master, PyPy 3.8/Python 3.10/Python 3.11/Python 3.12)"
+name: "Build / Python-only (master, PyPy 3.9/Python 3.10/Python 3.11/Python 3.12)"

 on:
   schedule:
8 changes: 4 additions & 4 deletions dev/infra/Dockerfile
@@ -81,10 +81,10 @@ ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library


 RUN add-apt-repository ppa:pypy/ppa
-RUN mkdir -p /usr/local/pypy/pypy3.8 && \
-    curl -sqL https://downloads.python.org/pypy/pypy3.8-v7.3.11-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.8 --strip-components=1 && \
-    ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \
-    ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3
+RUN mkdir -p /usr/local/pypy/pypy3.9 && \
+    curl -sqL https://downloads.python.org/pypy/pypy3.9-v7.3.16-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.9 --strip-components=1 && \
+    ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.9 && \
+    ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
 RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.0.3' scipy coverage matplotlib lxml
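As an aside (not part of the diff): a minimal hedged sanity check, runnable inside the built image with `pypy3`, to confirm that the `pypy3` symlink now resolves to the intended PyPy 3.9 interpreter.

    # Hypothetical sanity check; run as: pypy3 check_interpreter.py
    import platform
    import sys

    assert platform.python_implementation() == "PyPy"
    assert sys.version_info[:2] == (3, 9), sys.version  # pypy3.9-v7.3.16
    print(sys.version)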
4 changes: 2 additions & 2 deletions python/docs/source/development/contributing.rst
@@ -129,7 +129,7 @@ If you are using Conda, the development environment can be set as follows.

 .. code-block:: bash

-    # Python 3.8+ is required
+    # Python 3.9+ is required
     conda create --name pyspark-dev-env python=3.9
     conda activate pyspark-dev-env
     pip install --upgrade -r dev/requirements.txt
@@ -145,7 +145,7 @@ Now, you can start developing and `running the tests <testing.rst>`_.
 pip
 ~~~

-With Python 3.8+, pip can be used as below to install and set up the development environment.
+With Python 3.9+, pip can be used as below to install and set up the development environment.

 .. code-block:: bash

4 changes: 2 additions & 2 deletions python/docs/source/getting_started/install.rst
@@ -30,7 +30,7 @@ and building from the source.
 Python Versions Supported
 -------------------------

-Python 3.8 and above.
+Python 3.9 and above.


 Using PyPI
@@ -124,7 +124,7 @@ the same session as pyspark (you can install in several steps too).

 .. code-block:: bash

-    conda install -c conda-forge pyspark  # can also add "python=3.8 some_package [etc.]" here
+    conda install -c conda-forge pyspark  # can also add "python=3.9 some_package [etc.]" here

 Note that `PySpark for conda <https://anaconda.org/conda-forge/pyspark>`_ is maintained
 separately by the community; while new versions generally get packaged quickly, the
python/docs/source/user_guide/pandas_on_spark/typehints.rst
@@ -62,7 +62,7 @@ it as a Spark schema. As an example, you can specify the return type hint as below.
 Notice that the function ``pandas_div`` actually takes and outputs a pandas DataFrame instead of
 pandas-on-Spark :class:`DataFrame`. So, technically the correct types should be of pandas.

-With Python 3.8+, you can specify the type hints by using pandas instances as follows:
+With Python 3.9+, you can specify the type hints by using pandas instances as follows:

 .. code-block:: python
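The example itself is elided above; as a rough illustrative sketch only (the exact code in typehints.rst may differ, and `pandas_div` here is merely modeled on the function named in the surrounding prose):

    import pandas as pd

    # Illustrative sketch: annotating with pandas types directly, as the
    # surrounding docs describe for Python 3.9+.
    def pandas_div(pdf: pd.DataFrame) -> pd.DataFrame:
        return pdf / pdf.max()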
3 changes: 1 addition & 2 deletions python/packaging/classic/setup.py
@@ -359,11 +359,10 @@ def run(self):
                 "numpy>=%s" % _minimum_numpy_version,
             ],
         },
-        python_requires=">=3.8",
+        python_requires=">=3.9",
         classifiers=[
             "Development Status :: 5 - Production/Stable",
             "License :: OSI Approved :: Apache Software License",
-            "Programming Language :: Python :: 3.8",
             "Programming Language :: Python :: 3.9",
             "Programming Language :: Python :: 3.10",
             "Programming Language :: Python :: 3.11",
3 changes: 1 addition & 2 deletions python/packaging/connect/setup.py
@@ -191,11 +191,10 @@
         "googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version,
         "numpy>=%s" % _minimum_numpy_version,
     ],
-    python_requires=">=3.8",
+    python_requires=">=3.9",
     classifiers=[
         "Development Status :: 5 - Production/Stable",
         "License :: OSI Approved :: Apache Software License",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
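For context (an illustration, not part of the diff): the `python_requires` metadata above is what makes pip refuse to install new releases on Python 3.8, and the `packaging` library that pip relies on evaluates the specifier like this:

    # Illustrative: how the new ">=3.9" floor is evaluated by the
    # `packaging` library that pip uses for python_requires.
    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet(">=3.9")
    print("3.8.18" in spec)  # False -> pip refuses to install
    print("3.9.19" in spec)  # True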
4 changes: 2 additions & 2 deletions python/pyspark/sql/connect/session.py
@@ -76,7 +76,7 @@
     from_arrow_type,
 )
 from pyspark.sql.profiler import Profile
-from pyspark.sql.session import classproperty, SparkSession as PySparkSession
+from pyspark.sql.session import SparkSession as PySparkSession
 from pyspark.sql.types import (
     _infer_schema,
     _has_nulltype,
@@ -248,7 +248,7 @@ def getOrCreate(self) -> "SparkSession":

     # SPARK-47544: Explicitly declaring this as an identifier instead of a method.
     # If changing, make sure this bug is not reintroduced.
-    builder: Builder = classproperty(lambda cls: cls.Builder())  # type: ignore
+    builder: Builder = classmethod(property(lambda cls: cls.Builder()))  # type: ignore
     builder.__doc__ = PySparkSession.builder.__doc__

     def __init__(self, connection: Union[str, DefaultChannelBuilder], userId: Optional[str] = None):
50 changes: 1 addition & 49 deletions python/pyspark/sql/session.py
@@ -136,45 +136,6 @@ def toDF(self, schema=None, sampleRatio=None):
 RDD.toDF = toDF  # type: ignore[method-assign]


-# TODO(SPARK-38912): This method can be dropped once support for Python 3.8 is dropped
-# In Python 3.9, the @property decorator has been made compatible with the
-# @classmethod decorator (https://docs.python.org/3.9/library/functions.html#classmethod)
-#
-# @classmethod + @property is also affected by a bug in Python's docstring which was backported
-# to Python 3.9.6 (https://github.com/python/cpython/pull/28838)
-class classproperty(property):
-    """Same as Python's @property decorator, but for class attributes.
-
-    Examples
-    --------
-    >>> class Builder:
-    ...     def build(self):
-    ...         return MyClass()
-    ...
-    >>> class MyClass:
-    ...     @classproperty
-    ...     def builder(cls):
-    ...         print("instantiating new builder")
-    ...         return Builder()
-    ...
-    >>> c1 = MyClass.builder
-    instantiating new builder
-    >>> c2 = MyClass.builder
-    instantiating new builder
-    >>> c1 == c2
-    False
-    >>> isinstance(c1.build(), MyClass)
-    True
-    """
-
-    def __get__(self, instance: Any, owner: Any = None) -> "SparkSession.Builder":
-        # The "type: ignore" below silences the following error from mypy:
-        # error: Argument 1 to "classmethod" has incompatible
-        # type "Optional[Callable[[Any], Any]]";
-        # expected "Callable[..., Any]" [arg-type]
-        return classmethod(self.fget).__get__(None, owner)()  # type: ignore
-
-
 class SparkSession(SparkConversionMixin):
     """The entry point to programming Spark with the Dataset and DataFrame API.

@@ -591,18 +552,9 @@ def create(self) -> "SparkSession":
                 message_parameters={"feature": "SparkSession.builder.create"},
             )

-    # TODO(SPARK-38912): Replace classproperty with @classmethod + @property once support for
-    # Python 3.8 is dropped.
-    #
-    # In Python 3.9, the @property decorator has been made compatible with the
-    # @classmethod decorator (https://docs.python.org/3.9/library/functions.html#classmethod)
-    #
-    # @classmethod + @property is also affected by a bug in Python's docstring which was backported
-    # to Python 3.9.6 (https://github.com/python/cpython/pull/28838)
-    #
     # SPARK-47544: Explicitly declaring this as an identifier instead of a method.
     # If changing, make sure this bug is not reintroduced.
-    builder: Builder = classproperty(lambda cls: cls.Builder())  # type: ignore
+    builder: Builder = classmethod(property(lambda cls: cls.Builder()))  # type: ignore
"""Creates a :class:`Builder` for constructing a :class:`SparkSession`.

.. versionchanged:: 3.4.0
Expand Down
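Background for the replacement above, as a minimal sketch (not PySpark code): Python 3.9 made `classmethod` forward descriptor access to a wrapped `property`, which is exactly what the removed TODO anticipated and what lets the custom `classproperty` helper be deleted. Note that this chaining was deprecated again in Python 3.11.

    # Minimal sketch of classmethod/property chaining (Python 3.9+).
    class Builder:
        pass

    class MyClass:
        # Equivalent in spirit to the classproperty it replaces: attribute
        # access on the *class* returns a fresh Builder each time.
        builder = classmethod(property(lambda cls: Builder()))

    b1 = MyClass.builder
    b2 = MyClass.builder
    print(isinstance(b1, Builder), b1 is b2)  # True False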
2 changes: 0 additions & 2 deletions python/pyspark/sql/tests/connect/test_parity_arrow.py
@@ -16,7 +16,6 @@
 #

 import unittest
-import sys

 from pyspark.sql.tests.test_arrow import ArrowTestsMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
@@ -121,7 +120,6 @@ def test_createDataFrame_nested_timestamp(self):
     def test_toPandas_nested_timestamp(self):
         self.check_toPandas_nested_timestamp(True)

-    @unittest.skipIf(sys.version_info < (3, 9), "zoneinfo is available from Python 3.9+")
     def test_toPandas_timestmap_tzinfo(self):
         self.check_toPandas_timestmap_tzinfo(True)

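The deleted guard existed because zoneinfo only entered the standard library in Python 3.9 (PEP 615); with 3.9 as the floor it is dead code. For reference, a minimal use:

    from datetime import datetime
    from zoneinfo import ZoneInfo  # stdlib since Python 3.9 (PEP 615)

    ts = datetime(2024, 6, 1, 12, 0, tzinfo=ZoneInfo("America/Los_Angeles"))
    print(ts.isoformat())  # 2024-06-01T12:00:00-07:00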
2 changes: 0 additions & 2 deletions python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import sys
 import unittest
 from inspect import signature
 from typing import Union, Iterator, Tuple, cast, get_type_hints
@@ -114,7 +113,6 @@ def func(iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]]) -> Iterator
             infer_eval_type(signature(func), get_type_hints(func)), PandasUDFType.SCALAR_ITER
         )

-    @unittest.skipIf(sys.version_info < (3, 9), "Type hinting generics require Python 3.9.")
     def test_type_annotation_tuple_generics(self):
         def func(iter: Iterator[tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]:
             pass
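For reference (a sketch, not from the test file): the removed skip guarded PEP 585 syntax, under which builtins like `tuple` and `list` are subscriptable in annotations from Python 3.9 on, with no need for `typing.Tuple`:

    from collections.abc import Iterator

    # PEP 585 (Python 3.9+): builtin generics directly in annotations.
    def pairs_to_names(pairs: Iterator[tuple[int, str]]) -> list[str]:
        return [name for _, name in pairs]

    print(pairs_to_names(iter([(1, "a"), (2, "b")])))  # ['a', 'b']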
python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py
@@ -16,7 +16,6 @@
 #
 from __future__ import annotations

-import sys
 import unittest
 from inspect import signature
 from typing import Union, Iterator, Tuple, cast, get_type_hints
@@ -308,10 +307,6 @@ def pandas_plus_one(iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
         expected = df.selectExpr("id + 1 as id")
         assert_frame_equal(expected.toPandas(), actual.toPandas())

-    @unittest.skipIf(
-        sys.version_info < (3, 9),
-        "string annotations with future annotations do not work under Python<3.9",
-    )
     def test_string_type_annotation(self):
         def func(col: "pd.Series") -> "pd.Series":
             pass
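Context for the removed skip, as an illustrative sketch: under `from __future__ import annotations` (PEP 563) every annotation is stored as a string, so an annotation that is itself a string literal ends up stringified twice; per the removed skip message, resolving that only works on Python 3.9+.

    from __future__ import annotations  # PEP 563: annotations become strings

    from typing import get_type_hints

    import pandas as pd

    def func(col: "pd.Series") -> "pd.Series":  # quoted hint + future import
        return col

    # Per the removed skip, this resolution fails under Python < 3.9.
    print(get_type_hints(func))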
2 changes: 0 additions & 2 deletions python/pyspark/sql/tests/test_arrow.py
@@ -23,7 +23,6 @@
 import unittest
 from typing import cast
 from collections import namedtuple
-import sys

 from pyspark import SparkConf
 from pyspark.sql import Row, SparkSession
@@ -997,7 +996,6 @@ def check_createDataFrame_nested_timestamp(self, arrow_enabled):

         self.assertEqual(df.first(), expected)

-    @unittest.skipIf(sys.version_info < (3, 9), "zoneinfo is available from Python 3.9+")
     def test_toPandas_timestmap_tzinfo(self):
         for arrow_enabled in [True, False]:
             with self.subTest(arrow_enabled=arrow_enabled):
4 changes: 2 additions & 2 deletions python/run-tests
@@ -21,9 +21,9 @@
 FWDIR="$(cd "`dirname $0`"/..; pwd)"
 cd "$FWDIR"

-PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 8, 0))')
+PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 9, 0))')
 if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then
     echo "Python versions prior to 3.9 are not supported."
     exit -1
 fi
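For completeness (an illustrative equivalent, not in the PR): the same version floor expressed directly in Python, rather than through the shell indirection above:

    import sys

    # Same check as the run-tests guard above, in plain Python.
    if sys.version_info < (3, 9):
        raise SystemExit("Python versions prior to 3.9 are not supported.")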