Skip to content

Commit dd1a365

Browse files
committed
Eliminate need for SPARK_HIVE at runtime by downloading datanucleus from Maven
1 parent a9269b5 commit dd1a365

File tree

4 files changed

+49
-17
lines changed

4 files changed

+49
-17
lines changed

bin/compute-classpath.sh

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
3030
# Build up classpath
3131
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
3232

33-
# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
34-
# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
35-
# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
36-
# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
37-
# the future.
38-
if [ "$SPARK_HIVE" = "true" ]; then
39-
echo 1>&2 "SPARK_HIVE is set, including Hive support."
40-
41-
# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
42-
DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
43-
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
44-
fi
45-
46-
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
33+
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
4734

4835
# First check if we have a dependencies jar. If so, include binary classes with the deps jar
4936
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
@@ -70,6 +57,23 @@ else
7057
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
7158
fi
7259

60+
# When Hive support is needed, Datanucleus jars must be included on the classpath.
61+
# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
62+
# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
63+
# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
64+
# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
65+
# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
66+
num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
67+
if [ $num_datanucleus_jars -gt 0 ]; then
68+
AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
69+
num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
70+
if [ $num_hive_files -gt 0 ]; then
71+
echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
72+
DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
73+
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
74+
fi
75+
fi
76+
7377
# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
7478
if [[ $SPARK_TESTING == 1 ]]; then
7579
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"

pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,12 @@
575575
</exclusion>
576576
</exclusions>
577577
</dependency>
578+
<dependency>
579+
<!-- Matches the version of jackson-core-asl pulled in by avro -->
580+
<groupId>org.codehaus.jackson</groupId>
581+
<artifactId>jackson-mapper-asl</artifactId>
582+
<version>1.8.8</version>
583+
</dependency>
578584
</dependencies>
579585
</dependencyManagement>
580586

project/SparkBuild.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ object SparkBuild extends Build {
419419

420420
// Since we don't include hive in the main assembly this project also acts as an alternative
421421
// assembly jar.
422-
def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq(
422+
def hiveSettings = sharedSettings ++ Seq(
423423
name := "spark-hive",
424424
javaOptions += "-XX:MaxPermSize=1g",
425425
libraryDependencies ++= Seq(

sql/hive/pom.xml

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,8 @@
6464
<version>${hive.version}</version>
6565
</dependency>
6666
<dependency>
67-
<!-- Matches the version of jackson-core-asl pulled in by avro -->
6867
<groupId>org.codehaus.jackson</groupId>
6968
<artifactId>jackson-mapper-asl</artifactId>
70-
<version>1.8.8</version>
7169
</dependency>
7270
<dependency>
7371
<groupId>org.apache.hive</groupId>
@@ -93,6 +91,30 @@
9391
<groupId>org.scalatest</groupId>
9492
<artifactId>scalatest-maven-plugin</artifactId>
9593
</plugin>
94+
95+
<!-- Deploy datanucleus jars to the spark/lib_managed/jars directory -->
96+
<plugin>
97+
<groupId>org.apache.maven.plugins</groupId>
98+
<artifactId>maven-dependency-plugin</artifactId>
99+
<version>2.4</version>
100+
<executions>
101+
<execution>
102+
<id>copy-dependencies</id>
103+
<phase>package</phase>
104+
<goals>
105+
<goal>copy-dependencies</goal>
106+
</goals>
107+
<configuration>
108+
<!-- basedir is spark/sql/hive/ -->
109+
<outputDirectory>${basedir}/../../lib_managed/jars</outputDirectory>
110+
<overWriteReleases>false</overWriteReleases>
111+
<overWriteSnapshots>false</overWriteSnapshots>
112+
<overWriteIfNewer>true</overWriteIfNewer>
113+
<includeGroupIds>org.datanucleus</includeGroupIds>
114+
</configuration>
115+
</execution>
116+
</executions>
117+
</plugin>
96118
</plugins>
97119
</build>
98120
</project>

0 commit comments

Comments
 (0)