Skip to content

Commit dd1a365

Browse files
committed
Eliminate need for SPARK_HIVE at runtime by downloading datanucleus from Maven
1 parent a9269b5 commit dd1a365

File tree

4 files changed

+49
-17
lines changed

4 files changed

+49
-17
lines changed

bin/compute-classpath.sh

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
3030
# Build up classpath
3131
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
3232

33-
# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
34-
# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
35-
# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
36-
# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
37-
# the future.
38-
if [ "$SPARK_HIVE" = "true" ]; then
39-
echo 1>&2 "SPARK_HIVE is set, including Hive support."
40-
41-
# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
42-
DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
43-
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
44-
fi
45-
46-
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
33+
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
4734

4835
# First check if we have a dependencies jar. If so, include binary classes with the deps jar
4936
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
@@ -70,6 +57,23 @@ else
7057
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
7158
fi
7259

60+
# When Hive support is needed, Datanucleus jars must be included on the classpath.
61+
# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
62+
# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
63+
# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
64+
# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
65+
# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
66+
num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
67+
if [ $num_datanucleus_jars -gt 0 ]; then
68+
AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
69+
num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
70+
if [ $num_hive_files -gt 0 ]; then
71+
echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
72+
DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
73+
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
74+
fi
75+
fi
76+
7377
# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
7478
if [[ $SPARK_TESTING == 1 ]]; then
7579
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"

pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,12 @@
575575
</exclusion>
576576
</exclusions>
577577
</dependency>
578+
<dependency>
579+
<!-- Matches the version of jackson-core-asl pulled in by avro -->
580+
<groupId>org.codehaus.jackson</groupId>
581+
<artifactId>jackson-mapper-asl</artifactId>
582+
<version>1.8.8</version>
583+
</dependency>
578584
</dependencies>
579585
</dependencyManagement>
580586

project/SparkBuild.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ object SparkBuild extends Build {
419419

420420
// Since we don't include hive in the main assembly this project also acts as an alternative
421421
// assembly jar.
422-
def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq(
422+
def hiveSettings = sharedSettings ++ Seq(
423423
name := "spark-hive",
424424
javaOptions += "-XX:MaxPermSize=1g",
425425
libraryDependencies ++= Seq(

sql/hive/pom.xml

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,8 @@
6464
<version>${hive.version}</version>
6565
</dependency>
6666
<dependency>
67-
<!-- Matches the version of jackson-core-asl pulled in by avro -->
6867
<groupId>org.codehaus.jackson</groupId>
6968
<artifactId>jackson-mapper-asl</artifactId>
70-
<version>1.8.8</version>
7169
</dependency>
7270
<dependency>
7371
<groupId>org.apache.hive</groupId>
@@ -93,6 +91,30 @@
9391
<groupId>org.scalatest</groupId>
9492
<artifactId>scalatest-maven-plugin</artifactId>
9593
</plugin>
94+
95+
<!-- Deploy datanucleus jars to the spark/lib_managed/jars directory -->
96+
<plugin>
97+
<groupId>org.apache.maven.plugins</groupId>
98+
<artifactId>maven-dependency-plugin</artifactId>
99+
<version>2.4</version>
100+
<executions>
101+
<execution>
102+
<id>copy-dependencies</id>
103+
<phase>package</phase>
104+
<goals>
105+
<goal>copy-dependencies</goal>
106+
</goals>
107+
<configuration>
108+
<!-- basedir is spark/sql/hive/ -->
109+
<outputDirectory>${basedir}/../../lib_managed/jars</outputDirectory>
110+
<overWriteReleases>false</overWriteReleases>
111+
<overWriteSnapshots>false</overWriteSnapshots>
112+
<overWriteIfNewer>true</overWriteIfNewer>
113+
<includeGroupIds>org.datanucleus</includeGroupIds>
114+
</configuration>
115+
</execution>
116+
</executions>
117+
</plugin>
96118
</plugins>
97119
</build>
98120
</project>

0 commit comments

Comments
 (0)