
Commit c4f5778

[SPARK-3772] Allow ipython to be used by PySpark workers; IPython fixes:

- Fix the remaining uses of the '-u' flag, which IPython doesn't support.
- Rename PYSPARK_PYTHON_OPTS to PYSPARK_DRIVER_PYTHON_OPTS, so that the old name is reserved in case we ever want to allow the worker Python options to be customized.
1 parent 1eb8389 commit c4f5778
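
For context, the intended usage after this change, taken from the docs/programming-guide.md hunk below; the `notebook` argument is just an example IPython option:

    # Use IPython for the driver and the workers alike:
    PYSPARK_PYTHON=ipython ./bin/pyspark

    # Pass options to the driver's Python process only:
    PYSPARK_PYTHON=ipython PYSPARK_DRIVER_PYTHON_OPTS="notebook" ./bin/pyspark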

File tree

4 files changed (+41, -17 lines)

bin/pyspark

Lines changed: 32 additions & 13 deletions
@@ -50,21 +50,36 @@ fi
 
 . "$FWDIR"/bin/load-spark-env.sh
 
-# Figure out which Python executable to use
-if [[ -z "$PYSPARK_PYTHON" ]]; then
-  if [[ "$IPYTHON" = "1" || -n "$IPYTHON_OPTS" ]]; then
-    # for backward compatibility
-    PYSPARK_PYTHON="ipython"
-  else
-    PYSPARK_PYTHON="python"
-  fi
+# In Spark <= 1.1, setting IPYTHON=1 would cause the driver to be launched using the `ipython`
+# executable, while the worker would still be launched using PYSPARK_PYTHON. This allowed users to
+# launch IPython notebooks with PySpark without requiring IPython to be installed on the workers.
+# Unfortunately, this approach had a few drawbacks:
+#
+# - It wasn't easy to use a custom IPython executable (SPARK-3265).
+# - There was a risk that the `ipython` and `PYSPARK_PYTHON` executables might run different
+#   Python versions (e.g. 2.6 on the driver and 2.7 on the workers), which might lead to issues
+#   when using certain Python serializers that are incompatible across releases (e.g. marshal).
+#
+# In Spark 1.2, we removed the documentation of the IPYTHON and IPYTHON_OPTS variables, since
+# we've made the necessary changes to allow `ipython` to be used on the workers, too. Now,
+# users can simply set PYSPARK_PYTHON=ipython to use IPython and set PYSPARK_DRIVER_PYTHON_OPTS to
+# pass options when starting the Python driver (e.g. PYSPARK_DRIVER_PYTHON_OPTS='notebook').
+#
+# For backwards-compatibility, we retain the old IPYTHON and IPYTHON_OPTS variables.
+
+# If IPython options are specified, assume the user wants to run IPython (for backwards-compatibility)
+if [[ -n "$IPYTHON_OPTS" ]]; then
+  IPYTHON=1
+  # For backwards-compatibility:
+  PYSPARK_DRIVER_PYTHON_OPTS="$PYSPARK_DRIVER_PYTHON_OPTS $IPYTHON_OPTS"
 fi
-export PYSPARK_PYTHON
 
-if [[ -z "$PYSPARK_PYTHON_OPTS" && -n "$IPYTHON_OPTS" ]]; then
-  # for backward compatibility
-  PYSPARK_PYTHON_OPTS="$IPYTHON_OPTS"
+# Figure out which Python executable to use.
+# If we're not running in the legacy IPYTHON mode, then use a default PYSPARK_PYTHON
+if [[ "$IPYTHON" != "1" && -z "$PYSPARK_PYTHON" ]]; then
+  PYSPARK_PYTHON="python"
 fi
+export PYSPARK_PYTHON
 
 # Add the PySpark classes to the Python path:
 export PYTHONPATH="$SPARK_HOME/python/:$PYTHONPATH"

@@ -111,5 +126,9 @@ if [[ "$1" =~ \.py$ ]]; then
 else
   # PySpark shell requires special handling downstream
   export PYSPARK_SHELL=1
-  exec "$PYSPARK_PYTHON" $PYSPARK_PYTHON_OPTS
+  if [[ "$IPYTHON" = "1" ]]; then
+    exec "${PYSPARK_PYTHON:-ipython}" $PYSPARK_DRIVER_PYTHON_OPTS
+  else
+    exec "$PYSPARK_PYTHON" $PYSPARK_DRIVER_PYTHON_OPTS
+  fi
 fi
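
The hunks above also keep the legacy entry point alive: setting IPYTHON_OPTS forces IPYTHON=1 and folds the old options into PYSPARK_DRIVER_PYTHON_OPTS. A sketch of the two equivalent invocations, reusing the same `notebook` example as above:

    # Legacy (Spark <= 1.1) style, still accepted for backwards-compatibility:
    IPYTHON=1 IPYTHON_OPTS="notebook" ./bin/pyspark

    # New style introduced by this commit:
    PYSPARK_PYTHON=ipython PYSPARK_DRIVER_PYTHON_OPTS="notebook" ./bin/pyspark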

core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala

Lines changed: 6 additions & 2 deletions
@@ -108,10 +108,12 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
       serverSocket = new ServerSocket(0, 1, InetAddress.getByAddress(Array(127, 0, 0, 1)))
 
       // Create and start the worker
-      val pb = new ProcessBuilder(Seq(pythonExec, "-u", "-m", "pyspark.worker"))
+      val pb = new ProcessBuilder(Seq(pythonExec, "-m", "pyspark.worker"))
       val workerEnv = pb.environment()
       workerEnv.putAll(envVars)
       workerEnv.put("PYTHONPATH", pythonPath)
+      // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u:
+      workerEnv.put("PYTHONUNBUFFERED", "YES")
       val worker = pb.start()
 
       // Redirect worker stdout and stderr

@@ -149,10 +151,12 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
 
     try {
       // Create and start the daemon
-      val pb = new ProcessBuilder(Seq(pythonExec, "-u", "-m", "pyspark.daemon"))
+      val pb = new ProcessBuilder(Seq(pythonExec, "-m", "pyspark.daemon"))
       val workerEnv = pb.environment()
       workerEnv.putAll(envVars)
       workerEnv.put("PYTHONPATH", pythonPath)
+      // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u:
+      workerEnv.put("PYTHONUNBUFFERED", "YES")
       daemon = pb.start()
 
       val in = new DataInputStream(daemon.getInputStream)
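
The PYTHONUNBUFFERED substitution can be checked outside Spark. A minimal sketch, assuming a hypothetical script.py that writes to stdout: both commands below disable Python's output buffering, but only the environment-variable form also works when the executable is `ipython`, which (per this commit) does not support -u:

    # Equivalent ways to force unbuffered output; any non-empty value works
    # for PYTHONUNBUFFERED, as noted in PythonRunner.scala below:
    python -u script.py
    PYTHONUNBUFFERED=YES python script.py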

core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ object PythonRunner {
     val builder = new ProcessBuilder(Seq(pythonExec, formattedPythonFile) ++ otherArgs)
     val env = builder.environment()
     env.put("PYTHONPATH", pythonPath)
+    // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u:
     env.put("PYTHONUNBUFFERED", "YES") // value is needed to be set to a non-empty string
     env.put("PYSPARK_GATEWAY_PORT", "" + gatewayServer.getListeningPort)
     builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize

docs/programming-guide.md

Lines changed: 2 additions & 2 deletions
@@ -217,11 +217,11 @@ use IPython, set the `PYSPARK_PYTHON` variable to `ipython` when running `bin/py
 $ PYSPARK_PYTHON=ipython ./bin/pyspark
 {% endhighlight %}
 
-You can customize the `ipython` command by setting `PYSPARK_PYTHON_OPTS`. For example, to launch
+You can customize the `ipython` command by setting `PYSPARK_DRIVER_PYTHON_OPTS`. For example, to launch
 the [IPython Notebook](http://ipython.org/notebook.html) with PyLab plot support:
 
 {% highlight bash %}
-$ PYSPARK_PYTHON=ipython PYSPARK_PYTHON_OPTS="notebook --pylab inline" ./bin/pyspark
+$ PYSPARK_PYTHON=ipython PYSPARK_DRIVER_PYTHON_OPTS="notebook --pylab inline" ./bin/pyspark
 {% endhighlight %}
 
 </div>
