2 changes: 1 addition & 1 deletion dev/lint-python
@@ -278,7 +278,7 @@ function black_test {
fi

echo "starting black test..."
BLACK_REPORT=$( ($BLACK_BUILD --config dev/pyproject.toml --check python/pyspark dev python/setup.py) 2>&1)
BLACK_REPORT=$( ($BLACK_BUILD --config dev/pyproject.toml --check python/pyspark dev python/packaging) 2>&1)
BLACK_STATUS=$?

if [ "$BLACK_STATUS" -ne 0 ]; then
2 changes: 1 addition & 1 deletion dev/make-distribution.sh
@@ -248,7 +248,7 @@ if [ "$MAKE_PIP" == "true" ]; then
pushd "$SPARK_HOME/python" > /dev/null
# Delete the egg info file if it exists, as it can cache older setup files.
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
python3 setup.py sdist
python3 packaging/classic/setup.py sdist
popd > /dev/null
else
echo "Skipping building python distribution package"
2 changes: 1 addition & 1 deletion dev/reformat-python
@@ -29,4 +29,4 @@ if [ $? -ne 0 ]; then
exit 1
fi

$BLACK_BUILD --config dev/pyproject.toml python/pyspark dev python/setup.py
$BLACK_BUILD --config dev/pyproject.toml python/pyspark dev python/packaging
6 changes: 3 additions & 3 deletions dev/run-pip-tests
@@ -73,7 +73,7 @@ PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz"
PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall"
# Test both regular user and edit/dev install modes.
PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST"
"pip install $PIP_OPTIONS -e python/")
"pip install $PIP_OPTIONS -e python/packaging/classic")

# Jenkins has PySpark installed under user sitepackages shared for some reason.
# In this test, explicitly exclude user sitepackages to prevent side effects
@@ -103,7 +103,7 @@ for python in "${PYTHON_EXECS[@]}"; do
cd "$FWDIR"/python
# Delete the egg info file if it exists, as it can cache the setup file.
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
python3 setup.py sdist
python3 packaging/classic/setup.py sdist


echo "Installing dist into virtual env"
@@ -125,7 +125,7 @@ for python in "${PYTHON_EXECS[@]}"; do
echo "Run basic sanity check with import based"
python3 "$FWDIR"/dev/pip-sanity-check.py
echo "Run the tests for context.py"
python3 "$FWDIR"/python/pyspark/context.py
python3 "$FWDIR"/python/pyspark/core/context.py

cd "$FWDIR"

10 changes: 5 additions & 5 deletions dev/sparktestsupport/modules.py
@@ -430,12 +430,12 @@ def __hash__(self):
source_file_regexes=["python/(?!pyspark/(ml|mllib|sql|streaming))"],
python_test_goals=[
# doctests
"pyspark.rdd",
"pyspark.context",
"pyspark.conf",
"pyspark.broadcast",
"pyspark.core.rdd",
"pyspark.core.context",
"pyspark.core.conf",
"pyspark.core.broadcast",
"pyspark.accumulators",
"pyspark.files",
"pyspark.core.files",
"pyspark.serializers",
"pyspark.profiler",
"pyspark.shuffle",
2 changes: 1 addition & 1 deletion docs/building-spark.md
@@ -216,7 +216,7 @@ For information about how to run individual tests, refer to the

If you are building Spark for use in a Python environment and you wish to pip install it, you will first need to build the Spark JARs as described above. Then you can construct an sdist package that can be installed with pip.

cd python; python setup.py sdist
cd python; python packaging/classic/setup.py sdist

**Note:** Due to packaging requirements you can not directly pip install from the Python directory, rather you must first build the sdist package as described above.
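
For example, the end-to-end flow is sketched below (the install command mirrors the one in setup.py's packaging instructions; the exact archive name under `dist/` depends on the version being built):

    ./build/mvn -DskipTests clean package
    cd python
    python packaging/classic/setup.py sdist
    pip install dist/*.tar.gz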

2 changes: 1 addition & 1 deletion docs/rdd-programming-guide.md
@@ -1321,7 +1321,7 @@ method. The code below shows this:

{% highlight python %}
>>> broadcastVar = sc.broadcast([1, 2, 3])
<pyspark.broadcast.Broadcast object at 0x102789f10>
<pyspark.core.broadcast.Broadcast object at 0x102789f10>

>>> broadcastVar.value
[1, 2, 3]
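
>>> # A hypothetical follow-up (not part of the original example): tasks read the
>>> # broadcast data through broadcastVar.value instead of capturing the list directly.
>>> sc.parallelize([1, 2, 3]).map(lambda x: x + sum(broadcastVar.value)).collect()
[7, 8, 9]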
2 changes: 1 addition & 1 deletion examples/src/main/python/avro_inputformat.py
@@ -47,7 +47,7 @@
from typing import Any, Tuple

from functools import reduce
from pyspark.rdd import RDD
from pyspark import RDD
from pyspark.sql import SparkSession

if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/src/main/python/parquet_inputformat.py
@@ -32,7 +32,7 @@
import sys
from typing import Any, Tuple

from pyspark.rdd import RDD
from pyspark import RDD
from pyspark.sql import SparkSession

if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/src/main/python/sort.py
@@ -18,7 +18,7 @@
import sys
from typing import Tuple

from pyspark.rdd import RDD
from pyspark import RDD
from pyspark.sql import SparkSession


@@ -34,7 +34,7 @@
from typing import Tuple

from pyspark import SparkContext
from pyspark.rdd import RDD
from pyspark import RDD
from pyspark.streaming import DStream, StreamingContext


@@ -40,10 +40,7 @@
import sys
from typing import List, Tuple

from pyspark import SparkContext
from pyspark.accumulators import Accumulator
from pyspark.broadcast import Broadcast
from pyspark.rdd import RDD
from pyspark import SparkContext, Accumulator, Broadcast, RDD
from pyspark.streaming import StreamingContext


3 changes: 1 addition & 2 deletions examples/src/main/python/streaming/sql_network_wordcount.py
@@ -30,8 +30,7 @@
import sys
import datetime

from pyspark import SparkConf, SparkContext
from pyspark.rdd import RDD
from pyspark import SparkConf, SparkContext, RDD
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession

3 changes: 2 additions & 1 deletion pom.xml
@@ -222,7 +222,8 @@
<icu4j.version>72.1</icu4j.version>
<!--
If you are changing Arrow version specification, please check
./python/pyspark/sql/pandas/utils.py, and ./python/setup.py too.
./python/pyspark/sql/pandas/utils.py, ./python/packaging/classic/setup.py
and ./python/packaging/connect/setup.py too.
-->
<arrow.version>15.0.2</arrow.version>
<ammonite.version>3.0.0-M1</ammonite.version>
3 changes: 3 additions & 0 deletions python/.gitignore
@@ -1,5 +1,8 @@
*.pyc
docs/_build/
pyspark.egg-info
pyspark_connect.egg-info
build/
dist/
./setup.py
./setup.cfg
File renamed without changes.
34 changes: 30 additions & 4 deletions python/setup.py → python/packaging/classic/setup.py
@@ -24,6 +24,25 @@
from setuptools import setup
from setuptools.command.install import install
from shutil import copyfile, copytree, rmtree
from pathlib import Path

if (
# When we package, the current working directory is the 'classic' directory
# containing this file (as we pip install -e python/packaging/classic)
os.getcwd() == str(Path(__file__).parent.absolute())
and str(Path(__file__).parent.name) == "classic"
):
# This branch is taken for:
# - pip install -e python/packaging/classic
#   (pip moves the current working directory to 'classic')
# - cd python/packaging/classic; python setup.py sdist
#
# It is not taken for:
# - python packaging/classic/setup.py sdist
#
# Move to spark/python
os.chdir(Path(__file__).parent.parent.parent.absolute())

try:
exec(open("pyspark/version.py").read())
@@ -58,7 +77,7 @@
./build/mvn -DskipTests clean package
Building the source dist is done in the Python directory:
cd python
python setup.py sdist
python packaging/classic/setup.py sdist
pip install dist/*.tar.gz"""

# Figure out where the jars are we need to package with PySpark.
Expand Down Expand Up @@ -129,7 +148,8 @@ def _supports_symlinks():
# If you are changing the versions here, please also change ./python/pyspark/sql/pandas/utils.py
# For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
# binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
# Also don't forget to update python/docs/source/getting_started/install.rst.
# Also don't forget to update python/docs/source/getting_started/install.rst, and
# python/packaging/connect/setup.py
_minimum_pandas_version = "1.4.4"
_minimum_numpy_version = "1.21"
_minimum_pyarrow_version = "4.0.0"
@@ -184,8 +204,11 @@ def run(self):
copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")

if in_spark:
# Construct the symlink farm - this is necessary since we can't refer to the path above the
# package root and we need to copy the jars and scripts which are up above the python root.
copyfile("packaging/classic/setup.py", "setup.py")
copyfile("packaging/classic/setup.cfg", "setup.cfg")
# Construct the symlink farm - this is necessary since we can't refer to
# the path above the package root and we need to copy the jars and scripts which
# are up above the python root.
if _supports_symlinks():
os.symlink(JARS_PATH, JARS_TARGET)
os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
@@ -234,6 +257,7 @@ def run(self):
url="https://github.com/apache/spark/tree/master/python",
packages=[
"pyspark",
"pyspark.core",
"pyspark.cloudpickle",
"pyspark.mllib",
"pyspark.mllib.linalg",
@@ -352,6 +376,8 @@ def run(self):
# We only clean up the symlink farm if we were in Spark, otherwise we are installing rather than
# packaging.
if in_spark:
os.remove("setup.py")
os.remove("setup.cfg")
# Depending on cleaning up the symlink farm or copied version
if _supports_symlinks():
os.remove(os.path.join(TEMP_PATH, "jars"))
22 changes: 22 additions & 0 deletions python/packaging/connect/setup.cfg
@@ -0,0 +1,22 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

[bdist_wheel]
universal = 1

[metadata]
description_file = README.md