2 changes: 1 addition & 1 deletion dev/lint-python
@@ -278,7 +278,7 @@ function black_test {
fi

echo "starting black test..."
BLACK_REPORT=$( ($BLACK_BUILD --config dev/pyproject.toml --check python/pyspark dev python/setup.py) 2>&1)
BLACK_REPORT=$( ($BLACK_BUILD --config dev/pyproject.toml --check python/pyspark dev python/packaging) 2>&1)
BLACK_STATUS=$?

if [ "$BLACK_STATUS" -ne 0 ]; then
2 changes: 1 addition & 1 deletion dev/make-distribution.sh
@@ -248,7 +248,7 @@ if [ "$MAKE_PIP" == "true" ]; then
pushd "$SPARK_HOME/python" > /dev/null
# Delete the egg info file if it exists, as it can cache older setup files.
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
python3 setup.py sdist
python3 packaging/classic/setup.py sdist
popd > /dev/null
else
echo "Skipping building python distribution package"
2 changes: 1 addition & 1 deletion dev/reformat-python
@@ -29,4 +29,4 @@ if [ $? -ne 0 ]; then
exit 1
fi

$BLACK_BUILD --config dev/pyproject.toml python/pyspark dev python/setup.py
$BLACK_BUILD --config dev/pyproject.toml python/pyspark dev python/packaging
6 changes: 3 additions & 3 deletions dev/run-pip-tests
@@ -73,7 +73,7 @@ PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz"
PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall"
# Test both regular user and edit/dev install modes.
PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST"
"pip install $PIP_OPTIONS -e python/")
"pip install $PIP_OPTIONS -e python/packaging/classic")

# Jenkins has PySpark installed under user sitepackages shared for some reason.
# In this test, explicitly exclude user sitepackages to prevent side effects
@@ -103,7 +103,7 @@ for python in "${PYTHON_EXECS[@]}"; do
cd "$FWDIR"/python
# Delete the egg info file if it exists, as it can cache the setup file.
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
python3 setup.py sdist
python3 packaging/classic/setup.py sdist


echo "Installing dist into virtual env"
@@ -125,7 +125,7 @@ for python in "${PYTHON_EXECS[@]}"; do
echo "Run basic sanity check with import based"
python3 "$FWDIR"/dev/pip-sanity-check.py
echo "Run the tests for context.py"
python3 "$FWDIR"/python/pyspark/context.py
python3 "$FWDIR"/python/pyspark/core/context.py

cd "$FWDIR"

10 changes: 5 additions & 5 deletions dev/sparktestsupport/modules.py
@@ -430,12 +430,12 @@ def __hash__(self):
source_file_regexes=["python/(?!pyspark/(ml|mllib|sql|streaming))"],
python_test_goals=[
# doctests
"pyspark.rdd",
"pyspark.context",
"pyspark.conf",
"pyspark.broadcast",
"pyspark.core.rdd",
"pyspark.core.context",
"pyspark.core.conf",
"pyspark.core.broadcast",
"pyspark.accumulators",
"pyspark.files",
"pyspark.core.files",
"pyspark.serializers",
"pyspark.profiler",
"pyspark.shuffle",
2 changes: 1 addition & 1 deletion docs/building-spark.md
@@ -216,7 +216,7 @@ For information about how to run individual tests, refer to the

If you are building Spark for use in a Python environment and you wish to pip install it, you will first need to build the Spark JARs as described above. Then you can construct an sdist package that can be installed with pip.

cd python; python setup.py sdist
cd python; python packaging/classic/setup.py sdist

**Note:** Due to packaging requirements you can not directly pip install from the Python directory, rather you must first build the sdist package as described above.
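
For example, the end-to-end flow is sketched below (the install command mirrors the one in setup.py's packaging instructions; the exact archive name under `dist/` depends on the version being built):

    ./build/mvn -DskipTests clean package
    cd python
    python packaging/classic/setup.py sdist
    pip install dist/*.tar.gz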

2 changes: 1 addition & 1 deletion docs/rdd-programming-guide.md
@@ -1321,7 +1321,7 @@ method. The code below shows this:

{% highlight python %}
>>> broadcastVar = sc.broadcast([1, 2, 3])
<pyspark.broadcast.Broadcast object at 0x102789f10>
<pyspark.core.broadcast.Broadcast object at 0x102789f10>

>>> broadcastVar.value
[1, 2, 3]
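
>>> # A hypothetical follow-up (not part of the original example): tasks read the
>>> # broadcast data through broadcastVar.value instead of capturing the list directly.
>>> sc.parallelize([1, 2, 3]).map(lambda x: x + sum(broadcastVar.value)).collect()
[7, 8, 9]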
2 changes: 1 addition & 1 deletion examples/src/main/python/avro_inputformat.py
@@ -47,7 +47,7 @@
from typing import Any, Tuple

from functools import reduce
from pyspark.rdd import RDD
from pyspark import RDD
from pyspark.sql import SparkSession

if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/src/main/python/parquet_inputformat.py
@@ -32,7 +32,7 @@
import sys
from typing import Any, Tuple

from pyspark.rdd import RDD
from pyspark import RDD
from pyspark.sql import SparkSession

if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/src/main/python/sort.py
@@ -18,7 +18,7 @@
import sys
from typing import Tuple

from pyspark.rdd import RDD
from pyspark import RDD
from pyspark.sql import SparkSession


@@ -34,7 +34,7 @@
from typing import Tuple

from pyspark import SparkContext
from pyspark.rdd import RDD
from pyspark import RDD
from pyspark.streaming import DStream, StreamingContext


@@ -40,10 +40,7 @@
import sys
from typing import List, Tuple

from pyspark import SparkContext
from pyspark.accumulators import Accumulator
from pyspark.broadcast import Broadcast
from pyspark.rdd import RDD
from pyspark import SparkContext, Accumulator, Broadcast, RDD
from pyspark.streaming import StreamingContext


3 changes: 1 addition & 2 deletions examples/src/main/python/streaming/sql_network_wordcount.py
@@ -30,8 +30,7 @@
import sys
import datetime

from pyspark import SparkConf, SparkContext
from pyspark.rdd import RDD
from pyspark import SparkConf, SparkContext, RDD
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession

3 changes: 2 additions & 1 deletion pom.xml
@@ -222,7 +222,8 @@
<icu4j.version>72.1</icu4j.version>
<!--
If you are changing Arrow version specification, please check
./python/pyspark/sql/pandas/utils.py, and ./python/setup.py too.
./python/pyspark/sql/pandas/utils.py, ./python/packaging/classic/setup.py
and ./python/packaging/connect/setup.py too.
-->
<arrow.version>15.0.2</arrow.version>
<ammonite.version>3.0.0-M1</ammonite.version>
3 changes: 3 additions & 0 deletions python/.gitignore
@@ -1,5 +1,8 @@
*.pyc
docs/_build/
pyspark.egg-info
pyspark_connect.egg-info
build/
dist/
./setup.py
./setup.cfg
File renamed without changes.
34 changes: 30 additions & 4 deletions python/setup.py → python/packaging/classic/setup.py
@@ -24,6 +24,25 @@
from setuptools import setup
from setuptools.command.install import install
from shutil import copyfile, copytree, rmtree
from pathlib import Path

if (
# When we package, the current working directory is the 'classic' directory
# containing this file (as we pip install -e python/packaging/classic)
os.getcwd() == str(Path(__file__).parent.absolute())
and str(Path(__file__).parent.name) == "classic"
):
# This branch is taken for:
# - pip install -e python/packaging/classic
#   (pip moves the current working directory to 'classic')
# - cd python/packaging/classic; python setup.py sdist
#
# It is not taken for:
# - python packaging/classic/setup.py sdist
#
# Move to spark/python
os.chdir(Path(__file__).parent.parent.parent.absolute())

try:
exec(open("pyspark/version.py").read())
@@ -58,7 +77,7 @@
./build/mvn -DskipTests clean package
Building the source dist is done in the Python directory:
cd python
python setup.py sdist
python packaging/classic/setup.py sdist
pip install dist/*.tar.gz"""

# Figure out where the jars are we need to package with PySpark.
Expand Down Expand Up @@ -129,7 +148,8 @@ def _supports_symlinks():
# If you are changing the versions here, please also change ./python/pyspark/sql/pandas/utils.py
# For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
# binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
# Also don't forget to update python/docs/source/getting_started/install.rst.
# Also don't forget to update python/docs/source/getting_started/install.rst, and
# python/packaging/connect/setup.py
_minimum_pandas_version = "1.4.4"
_minimum_numpy_version = "1.21"
_minimum_pyarrow_version = "4.0.0"
@@ -184,8 +204,11 @@ def run(self):
copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")

if in_spark:
# Construct the symlink farm - this is necessary since we can't refer to the path above the
# package root and we need to copy the jars and scripts which are up above the python root.
copyfile("packaging/classic/setup.py", "setup.py")
copyfile("packaging/classic/setup.cfg", "setup.cfg")
# Construct the symlink farm - this is necessary since we can't refer to
# the path above the package root and we need to copy the jars and scripts which
# are up above the python root.
if _supports_symlinks():
os.symlink(JARS_PATH, JARS_TARGET)
os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
@@ -234,6 +257,7 @@ def run(self):
url="https://github.com/apache/spark/tree/master/python",
packages=[
"pyspark",
"pyspark.core",
"pyspark.cloudpickle",
"pyspark.mllib",
"pyspark.mllib.linalg",
@@ -352,6 +376,8 @@ def run(self):
# We only clean up the symlink farm if we were in Spark, otherwise we are installing rather than
# packaging.
if in_spark:
os.remove("setup.py")
os.remove("setup.cfg")
# Depending on cleaning up the symlink farm or copied version
if _supports_symlinks():
os.remove(os.path.join(TEMP_PATH, "jars"))
22 changes: 22 additions & 0 deletions python/packaging/connect/setup.cfg
@@ -0,0 +1,22 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

[bdist_wheel]
universal = 1

[metadata]
description_file = README.md