
Commit 1c04652

SPARK-1843: Replace assemble-deps with env variable.
(This change is actually small; I moved some logic into compute-classpath that was previously in spark-class.)

Assemble-deps has existed for a while to allow developers to run local code with new changes quickly. When I'm developing I typically use a simpler approach, which just prepends the Spark classes to the classpath ahead of the assembly jar. This is well defined in the JVM, and the Spark classes take precedence over those in the assembly. This approach is portable across both builds, which is the main reason I'd like to switch to it. It's also a bit easier to toggle on and off quickly.

You use it as follows:

```
$ ./bin/spark-shell                    # Use Spark with the normal assembly
$ export SPARK_PREPEND_CLASSES=true
$ ./bin/spark-shell                    # Now it's using compiled classes
$ unset SPARK_PREPEND_CLASSES
$ ./bin/spark-shell                    # Back to normal
```

Author: Patrick Wendell <[email protected]>

Closes #877 from pwendell/assemble-deps and squashes the following commits:

8a11345 [Patrick Wendell] Merge remote-tracking branch 'apache/master' into assemble-deps
faa3168 [Patrick Wendell] Adding a warning for compatibility
3f151a7 [Patrick Wendell] Small fix
bbfb73c [Patrick Wendell] Review feedback
328e9f8 [Patrick Wendell] SPARK-1843: Replace assemble-deps with env variable.
1 parent ecde5b8 · commit 1c04652
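To make the classpath-precedence point above concrete, here is a minimal sketch of the ordering that results when SPARK_PREPEND_CLASSES is set. The paths and the final java invocation are simplified for illustration; the real logic is the compute-classpath.sh change below.

```bash
# Illustrative sketch only -- compute-classpath.sh builds a much longer classpath.
# The JVM resolves each class from the first matching classpath entry, so locally
# compiled classes listed ahead of the assembly jar shadow the copies inside it.
CLASSPATH="$FWDIR/core/target/scala-$SCALA_VERSION/classes"   # freshly compiled classes first
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"                          # full assembly jar second
java -cp "$CLASSPATH" org.apache.spark.repl.Main              # compiled classes win on conflict
```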

File tree

4 files changed: 40 additions, 30 deletions

  bin/compute-classpath.sh
  bin/spark-class
  core/src/main/scala/org/apache/spark/SparkContext.scala
  project/SparkBuild.scala


bin/compute-classpath.sh

Lines changed: 25 additions & 9 deletions
@@ -38,8 +38,10 @@ else
   JAR_CMD="jar"
 fi
 
-# First check if we have a dependencies jar. If so, include binary classes with the deps jar
-if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
+# A developer option to prepend more recently compiled Spark classes
+if [ -n "$SPARK_PREPEND_CLASSES" ]; then
+  echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
+    "classes ahead of assembly." >&2
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
@@ -51,17 +53,31 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
+fi
 
-ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
+# Use spark-assembly jar from either RELEASE or assembly directory
+if [ -f "$FWDIR/RELEASE" ]; then
+  assembly_folder="$FWDIR"/lib
 else
-  # Else use spark-assembly jar from either RELEASE or assembly directory
-  if [ -f "$FWDIR/RELEASE" ]; then
-    ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
-  else
-    ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
-  fi
+  assembly_folder="$ASSEMBLY_DIR"
 fi
 
+num_jars=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar" | wc -l)
+if [ "$num_jars" -eq "0" ]; then
+  echo "Failed to find Spark assembly in $assembly_folder"
+  echo "You need to build Spark before running this program."
+  exit 1
+fi
+if [ "$num_jars" -gt "1" ]; then
+  jars_list=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar")
+  echo "Found multiple Spark assembly jars in $assembly_folder:"
+  echo "$jars_list"
+  echo "Please remove all but one jar."
+  exit 1
+fi
+
+ASSEMBLY_JAR=$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)
+
 # Verify that versions of java used to build the jars and run Spark are compatible
 jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
 if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then

bin/spark-class

Lines changed: 0 additions & 17 deletions
@@ -108,23 +108,6 @@ fi
 export JAVA_OPTS
 # Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
 
-if [ ! -f "$FWDIR/RELEASE" ]; then
-  # Exit if the user hasn't compiled Spark
-  num_jars=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar" | wc -l)
-  jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar")
-  if [ "$num_jars" -eq "0" ]; then
-    echo "Failed to find Spark assembly in $FWDIR/assembly/target/scala-$SCALA_VERSION/" >&2
-    echo "You need to build Spark before running this program." >&2
-    exit 1
-  fi
-  if [ "$num_jars" -gt "1" ]; then
-    echo "Found multiple Spark assembly jars in $FWDIR/assembly/target/scala-$SCALA_VERSION:" >&2
-    echo "$jars_list"
-    echo "Please remove all but one jar."
-    exit 1
-  fi
-fi
-
 TOOLS_DIR="$FWDIR"/tools
 SPARK_TOOLS_JAR=""
 if [ -e "$TOOLS_DIR"/target/scala-$SCALA_VERSION/*assembly*[0-9Tg].jar ]; then

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 3 additions & 0 deletions
@@ -290,6 +290,9 @@ class SparkContext(config: SparkConf) extends Logging {
       value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} {
       executorEnvs(envKey) = value
     }
+    Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v =>
+      executorEnvs("SPARK_PREPEND_CLASSES") = v
+    }
     // The Mesos scheduler backend relies on this environment variable to set executor memory.
     // TODO: Set this only in the Mesos scheduler.
     executorEnvs("SPARK_EXECUTOR_MEMORY") = executorMemory + "m"

project/SparkBuild.scala

Lines changed: 12 additions & 4 deletions
@@ -90,7 +90,16 @@ object SparkBuild extends Build {
   lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings)
     .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*)
 
-  lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects")
+  lazy val assembleDepsTask = TaskKey[Unit]("assemble-deps")
+  lazy val assembleDeps = assembleDepsTask := {
+    println()
+    println("**** NOTE ****")
+    println("'sbt/sbt assemble-deps' is no longer supported.")
+    println("Instead create a normal assembly and:")
+    println("  export SPARK_PREPEND_CLASSES=1 (toggle on)")
+    println("  unset SPARK_PREPEND_CLASSES (toggle off)")
+    println()
+  }
 
   // A configuration to set an alternative publishLocalConfiguration
   lazy val MavenCompile = config("m2r") extend(Compile)
@@ -373,6 +382,7 @@ object SparkBuild extends Build {
       "net.sf.py4j" % "py4j" % "0.8.1"
     ),
     libraryDependencies ++= maybeAvro,
+    assembleDeps,
     previousArtifact := sparkPreviousArtifact("spark-core")
   )
 
@@ -584,9 +594,7 @@ object SparkBuild extends Build {
 
   def assemblyProjSettings = sharedSettings ++ Seq(
     name := "spark-assembly",
-    assembleDeps in Compile <<= (packageProjects.map(packageBin in Compile in _) ++ Seq(packageDependency in Compile)).dependOn,
-    jarName in assembly <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" },
-    jarName in packageDependency <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" }
+    jarName in assembly <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" }
   ) ++ assemblySettings ++ extraAssemblySettings
 
   def extraAssemblySettings() = Seq(
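With this change, the old sbt task no longer builds a -deps jar; it just prints a pointer to the new workflow. Roughly what a developer would now see (sbt's own startup and log output elided):

```
$ sbt/sbt assemble-deps
...

**** NOTE ****
'sbt/sbt assemble-deps' is no longer supported.
Instead create a normal assembly and:
  export SPARK_PREPEND_CLASSES=1 (toggle on)
  unset SPARK_PREPEND_CLASSES (toggle off)
```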
