Commit 547fd95

Update pyspark.zip auto

1 parent 3a0ec77

6 files changed: 34 additions & 15 deletions

core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala

Lines changed: 0 additions & 4 deletions
@@ -35,10 +35,6 @@ private[spark] object PythonUtils {
       pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.8.2.1-src.zip").mkString(File.separator)
     }
     pythonPath ++= SparkContext.jarOfObject(this)
-    sys.env.get("PYSPARK_ARCHIVES_PATH") match {
-      case Some(path) => pythonPath += path
-      case None => // do nothing
-    }
     pythonPath.mkString(File.pathSeparator)
   }

core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String

   val pythonPath = PythonUtils.mergePythonPaths(
     PythonUtils.sparkPythonPath,
-    envVars.getOrElse("PYTHONPATH", ""),
+    envVars.getOrElse("PYTHONPATH", sys.env.getOrElse("PYSPARK_ARCHIVES_PATH", "")),
     sys.env.getOrElse("PYTHONPATH", ""))

   def create(): Socket = {
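
The net effect of this one-line change is a fallback chain: a PYTHONPATH passed in the worker's envVars still takes precedence, and the shipped archives path is consulted only when envVars carries no PYTHONPATH of its own. Here is a minimal, self-contained sketch of that precedence; it assumes PythonUtils.mergePythonPaths simply drops empty fragments and joins the rest with the platform path separator, and every path below is made up.

import java.io.File

object PythonPathFallbackSketch {
  // Assumed behavior of PythonUtils.mergePythonPaths: drop empty
  // fragments, join the rest with the platform path separator.
  def mergePythonPaths(paths: String*): String =
    paths.filter(_.nonEmpty).mkString(File.pathSeparator)

  def main(args: Array[String]): Unit = {
    // Hypothetical worker environment that carries no PYTHONPATH.
    val envVars = Map("SPARK_HOME" -> "/opt/spark")
    val merged = mergePythonPaths(
      "/opt/spark/python", // stand-in for PythonUtils.sparkPythonPath
      // Only when envVars lacks PYTHONPATH does the shipped archive apply.
      envVars.getOrElse("PYTHONPATH", sys.env.getOrElse("PYSPARK_ARCHIVES_PATH", "")),
      sys.env.getOrElse("PYTHONPATH", ""))
    println(merged)
  }
}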

core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ object PythonRunner {
     pathElements ++= formattedPyFiles
     pathElements += PythonUtils.sparkPythonPath
     pathElements += sys.env.getOrElse("PYTHONPATH", "")
+    pathElements += sys.env.getOrElse("PYSPARK_ARCHIVES_PATH", "")
     val pythonPath = PythonUtils.mergePythonPaths(pathElements: _*)

     // Launch Python process

docs/submitting-applications.md

Lines changed: 0 additions & 3 deletions
@@ -22,9 +22,6 @@ For Python, you can use the `--py-files` argument of `spark-submit` to add `.py`
 files to be distributed with your application. If you depend on multiple Python files we recommend
 packaging them into a `.zip` or `.egg`.

-As Python can not read files from assembly jar which packaged by JDK1.7+, so packaging pyspark into a
-`.zip`(the name contains "pyspark") and use `--py-files` argument of `spark-submit` to distribute it.
-
 # Launching Applications with spark-submit

 Once a user application is bundled, it can be launched using the `bin/spark-submit` script.

yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala

Lines changed: 25 additions & 1 deletion
@@ -247,6 +247,7 @@ private[spark] class Client(
     List(
       (SPARK_JAR, sparkJar(sparkConf), CONF_SPARK_JAR),
       (APP_JAR, args.userJar, CONF_SPARK_USER_JAR),
+      (PYSPARK_ARCHIVES, pysparkArchives(sparkConf), CONF_PYSPARK_ARCHIVES),
       ("log4j.properties", oldLog4jConf.orNull, null)
     ).foreach { case (destName, _localPath, confKey) =>
       val localPath: String = if (_localPath != null) _localPath.trim() else ""

@@ -386,6 +387,12 @@ private[spark] class Client(
     val appStagingDir = getAppStagingDir(appId)
     val localResources = prepareLocalResources(appStagingDir)
     val launchEnv = setupLaunchEnv(appStagingDir)
+    // From SPARK-1920 and SPARK-1520 we know PySpark on Yarn cannot work when the assembly jar is
+    // packaged by JDK 1.7+, so we ship the PySpark archives to executors alongside the assembly jar
+    // and add this path to PYTHONPATH.
+    for ((resPath, res) <- localResources if resPath.contains(PYSPARK_ARCHIVES)) {
+      launchEnv("PYSPARK_ARCHIVES_PATH") = resPath
+    }
     val amContainer = Records.newRecord(classOf[ContainerLaunchContext])
     amContainer.setLocalResources(localResources)
     amContainer.setEnvironment(launchEnv)

@@ -681,9 +688,10 @@ object Client extends Logging {
     new Client(args, sparkConf).run()
   }

-  // Alias for the Spark assembly jar and the user jar
+  // Alias for the Spark assembly jar, the user jar and PySpark archives
   val SPARK_JAR: String = "__spark__.jar"
   val APP_JAR: String = "__app__.jar"
+  val PYSPARK_ARCHIVES: String = "__pyspark__.zip"

   // URI scheme that identifies local resources
   val LOCAL_SCHEME = "local"

@@ -695,6 +703,9 @@ object Client extends Logging {
   val CONF_SPARK_JAR = "spark.yarn.jar"
   val ENV_SPARK_JAR = "SPARK_JAR"

+  // Location of any user-defined PySpark archives
+  val CONF_PYSPARK_ARCHIVES = "spark.pyspark.archives"
+
   // Internal config to propagate the location of the user's jar to the driver/executors
   val CONF_SPARK_USER_JAR = "spark.yarn.user.jar"

@@ -733,6 +744,19 @@ object Client extends Logging {
     }
   }

+  /**
+   * Find the user-defined PySpark archives if configured, or return the default.
+   * The default pyspark.zip lives in the same directory as the assembly jar.
+   */
+  private def pysparkArchives(conf: SparkConf): String = {
+    if (conf.contains(CONF_PYSPARK_ARCHIVES)) {
+      conf.get(CONF_PYSPARK_ARCHIVES)
+    } else {
+      val sparkJarPath = SparkContext.jarOfClass(this.getClass).head
+      sparkJarPath.substring(0, sparkJarPath.lastIndexOf('/')) + "/pyspark.zip"
+    }
+  }
+
   /**
    * Return the path to the given application's staging directory.
    */
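
The default branch of pysparkArchives is plain string surgery: take the assembly jar's path, cut off the file name, and append pyspark.zip; setting spark.pyspark.archives bypasses the derivation entirely. A standalone sketch of that derivation, with a made-up jar path standing in for SparkContext.jarOfClass(this.getClass).head:

object DefaultArchivesPathSketch extends App {
  // Hypothetical assembly jar location, standing in for
  // SparkContext.jarOfClass(this.getClass).head.
  val sparkJarPath = "/opt/spark/lib/spark-assembly-hadoop2.4.0.jar"

  // Same derivation as pysparkArchives: keep the directory, swap in pyspark.zip.
  val defaultArchives =
    sparkJarPath.substring(0, sparkJarPath.lastIndexOf('/')) + "/pyspark.zip"

  println(defaultArchives) // prints /opt/spark/lib/pyspark.zip
}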

yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala

Lines changed: 7 additions & 6 deletions
@@ -75,12 +75,7 @@ class ExecutorRunnable(

     val localResources = prepareLocalResources
     ctx.setLocalResources(localResources)
-    // From SPARK-1920 and SPARK-1520 we know PySpark on Yarn can not work when the assembly jar are
-    // package by JDK 1.7+, so we ship PySpark archives to executors by Yarn with --py-files, and
-    // add this path to PYTHONPATH.
-    for ((resPath, res) <- localResources if resPath.contains("pyspark")) {
-      env("PYSPARK_ARCHIVES_PATH") = resPath
-    }
+
     ctx.setEnvironment(env)

     val credentials = UserGroupInformation.getCurrentUser().getCredentials()

@@ -304,6 +299,12 @@ class ExecutorRunnable(
     }

     System.getenv().filterKeys(_.startsWith("SPARK")).foreach { case (k, v) => env(k) = v }
+
+    // Add PySpark archives path
+    sys.env.get("PYSPARK_ARCHIVES_PATH") match {
+      case Some(pythonArchivesPath) => env("PYSPARK_ARCHIVES_PATH") = pythonArchivesPath
+      case None =>
+    }
     env
   }
 }