Skip to content

Commit 328e9f8

Browse files
committed
SPARK-1843: Replace assemble-deps with env variable.
(This change is actually small; I moved some logic into compute-classpath that was previously in spark-class.) Assemble deps has existed for a while to allow developers to run local code with new changes quickly. When I'm developing, I typically use a simpler approach which just prepends the Spark classes to the classpath before the assembly jar. This is well-defined in the JVM, and the Spark classes take precedence over those in the assembly. This approach is portable across both builds, which is the main reason I'd like to switch to it. It's also faster than creating a new assembly.
1 parent 55fddf9 commit 328e9f8

File tree

4 files changed

+28
-29
lines changed

4 files changed

+28
-29
lines changed

bin/compute-classpath.sh

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,10 @@ else
3838
JAR_CMD="jar"
3939
fi
4040

41-
# First check if we have a dependencies jar. If so, include binary classes with the deps jar
42-
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
41+
# A developer option to prepend more recently compiled Spark classes
42+
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
43+
echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
44+
"classes ahead of assembly." >&2
4345
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
4446
CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
4547
CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
@@ -51,17 +53,31 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
5153
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
5254
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
5355
CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
56+
fi
5457

55-
ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
58+
# Use spark-assembly jar from either RELEASE or assembly directory
59+
if [ -f "$FWDIR/RELEASE" ]; then
60+
assembly_folder="$FWDIR"/lib
5661
else
57-
# Else use spark-assembly jar from either RELEASE or assembly directory
58-
if [ -f "$FWDIR/RELEASE" ]; then
59-
ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
60-
else
61-
ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
62-
fi
62+
assembly_folder="$ASSEMBLY_DIR"
6363
fi
6464

65+
num_jars=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar" | wc -l)
66+
if [ "$num_jars" -eq "0" ]; then
67+
echo "Failed to find Spark assembly in $assembly_folder"
68+
echo "You need to build Spark before running this program."
69+
exit 1
70+
fi
71+
if [ "$num_jars" -gt "1" ]; then
72+
jars_list=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar")
73+
echo "Found multiple Spark assembly jars in $assembly_folder:"
74+
echo "$jars_list"
75+
echo "Please remove all but one jar."
76+
exit 1
77+
fi
78+
79+
ASSEMBLY_JAR=$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)
80+
6581
# Verify that versions of java used to build the jars and run Spark are compatible
6682
jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
6783
if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then

bin/spark-class

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -108,23 +108,6 @@ fi
108108
export JAVA_OPTS
109109
# Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
110110

111-
if [ ! -f "$FWDIR/RELEASE" ]; then
112-
# Exit if the user hasn't compiled Spark
113-
num_jars=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar" | wc -l)
114-
jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar")
115-
if [ "$num_jars" -eq "0" ]; then
116-
echo "Failed to find Spark assembly in $FWDIR/assembly/target/scala-$SCALA_VERSION/" >&2
117-
echo "You need to build Spark before running this program." >&2
118-
exit 1
119-
fi
120-
if [ "$num_jars" -gt "1" ]; then
121-
echo "Found multiple Spark assembly jars in $FWDIR/assembly/target/scala-$SCALA_VERSION:" >&2
122-
echo "$jars_list"
123-
echo "Please remove all but one jar."
124-
exit 1
125-
fi
126-
fi
127-
128111
TOOLS_DIR="$FWDIR"/tools
129112
SPARK_TOOLS_JAR=""
130113
if [ -e "$TOOLS_DIR"/target/scala-$SCALA_VERSION/*assembly*[0-9Tg].jar ]; then

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,9 @@ class SparkContext(config: SparkConf) extends Logging {
290290
value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} {
291291
executorEnvs(envKey) = value
292292
}
293+
Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v =>
294+
executorEnvs("SPARK_PREPEND_CLASSES") = v
295+
}
293296
// The Mesos scheduler backend relies on this environment variable to set executor memory.
294297
// TODO: Set this only in the Mesos scheduler.
295298
executorEnvs("SPARK_EXECUTOR_MEMORY") = executorMemory + "m"

project/SparkBuild.scala

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ object SparkBuild extends Build {
8484
lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings)
8585
.dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*)
8686

87-
lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects")
88-
8987
// A configuration to set an alternative publishLocalConfiguration
9088
lazy val MavenCompile = config("m2r") extend(Compile)
9189
lazy val publishLocalBoth = TaskKey[Unit]("publish-local", "publish local for m2 and ivy")
@@ -574,7 +572,6 @@ object SparkBuild extends Build {
574572

575573
def assemblyProjSettings = sharedSettings ++ Seq(
576574
name := "spark-assembly",
577-
assembleDeps in Compile <<= (packageProjects.map(packageBin in Compile in _) ++ Seq(packageDependency in Compile)).dependOn,
578575
jarName in assembly <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" },
579576
jarName in packageDependency <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" }
580577
) ++ assemblySettings ++ extraAssemblySettings

0 commit comments

Comments
 (0)