Commit 2409af9

sryza authored and pwendell committed
SPARK-1064

This reopens PR 649 from incubator-spark against the new repo.

Author: Sandy Ryza <[email protected]>

Closes apache#102 from sryza/sandy-spark-1064 and squashes the following commits:

270e490 [Sandy Ryza] Handle different application classpath variables in different versions
88b04e0 [Sandy Ryza] SPARK-1064. Make it possible to run on YARN without bundling Hadoop jars in Spark assembly
1 parent 16788a6

3 files changed, +94 -1 lines changed

docs/building-with-maven.md

Lines changed: 6 additions & 0 deletions

@@ -88,3 +88,9 @@ Running only java 8 tests and nothing else.
 Java 8 tests are run when -Pjava8-tests profile is enabled, they will run in spite of -DskipTests.
 For these tests to run your system must have a JDK 8 installation.
 If you have JDK 8 installed but it is not the system default, you can set JAVA_HOME to point to JDK 8 before running the tests.
+
+## Packaging without Hadoop dependencies for deployment on YARN ##
+
+The assembly jar produced by "mvn package" will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with yarn.application.classpath. The "hadoop-provided" profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself.
+
+
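A usage note beyond the diff itself: Maven profiles are selected with -P on the command line, so a build that targets YARN while leaving Hadoop out of the assembly would be invoked along the lines of "mvn -Pyarn -Phadoop-provided -Dhadoop.version=2.2.0 -DskipTests clean package". This command is illustrative only; the right YARN profile and hadoop.version depend on the target cluster and are covered elsewhere in building-with-maven.md.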

pom.xml

Lines changed: 46 additions & 0 deletions

@@ -807,5 +807,51 @@
       </modules>

     </profile>
+
+    <!-- Build without Hadoop dependencies that are included in some runtime environments. -->
+    <profile>
+      <id>hadoop-provided</id>
+      <activation>
+        <activeByDefault>false</activeByDefault>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-client</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-yarn-api</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-yarn-common</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-yarn-client</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.avro</groupId>
+          <artifactId>avro</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.avro</groupId>
+          <artifactId>avro-ipc</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.zookeeper</groupId>
+          <artifactId>zookeeper</artifactId>
+          <scope>provided</scope>
+        </dependency>
+      </dependencies>
+    </profile>
+
   </profiles>
 </project>
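A note on the design choice, beyond what the diff states: Maven's provided scope keeps these artifacts on the compile and test classpaths but excludes them from the packaged assembly, on the assumption that the runtime environment, here the YARN cluster, supplies them. Since activeByDefault is false, the profile changes nothing unless explicitly enabled; whether a build actually picked it up can be checked with the stock maven-help-plugin goal, e.g. "mvn help:active-profiles -Phadoop-provided".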

yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala

Lines changed: 42 additions & 1 deletion

@@ -29,8 +29,10 @@ import org.apache.hadoop.fs._
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.io.DataOutputBuffer
 import org.apache.hadoop.mapred.Master
+import org.apache.hadoop.mapreduce.MRJobConfig
 import org.apache.hadoop.net.NetUtils
 import org.apache.hadoop.security.UserGroupInformation
+import org.apache.hadoop.util.StringUtils
 import org.apache.hadoop.yarn.api._
 import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
 import org.apache.hadoop.yarn.api.protocolrecords._

@@ -379,9 +381,48 @@ object ClientBase {

   // Based on code from org.apache.hadoop.mapreduce.v2.util.MRApps
   def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String]) {
-    for (c <- conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH)) {
+    val classpathEntries = Option(conf.getStrings(
+      YarnConfiguration.YARN_APPLICATION_CLASSPATH)).getOrElse(
+        getDefaultYarnApplicationClasspath())
+    for (c <- classpathEntries) {
       Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
     }
+
+    val mrClasspathEntries = Option(conf.getStrings(
+      "mapreduce.application.classpath")).getOrElse(
+        getDefaultMRApplicationClasspath())
+    if (mrClasspathEntries != null) {
+      for (c <- mrClasspathEntries) {
+        Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
+      }
+    }
+  }
+
+  def getDefaultYarnApplicationClasspath(): Array[String] = {
+    try {
+      val field = classOf[MRJobConfig].getField("DEFAULT_YARN_APPLICATION_CLASSPATH")
+      field.get(null).asInstanceOf[Array[String]]
+    } catch {
+      case err: NoSuchFieldError => null
+    }
+  }
+
+  /**
+   * In Hadoop 0.23, the MR application classpath comes with the YARN application
+   * classpath. In Hadoop 2.0, it's an array of Strings, and in 2.2+ it's a String.
+   * So we need to use reflection to retrieve it.
+   */
+  def getDefaultMRApplicationClasspath(): Array[String] = {
+    try {
+      val field = classOf[MRJobConfig].getField("DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH")
+      if (field.getType == classOf[String]) {
+        StringUtils.getStrings(field.get(null).asInstanceOf[String])
+      } else {
+        field.get(null).asInstanceOf[Array[String]]
+      }
+    } catch {
+      case err: NoSuchFieldError => null
+    }
   }

   def populateClasspath(conf: Configuration, sparkConf: SparkConf, addLog4j: Boolean, env: HashMap[String, String]) {

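For orientation, a minimal sketch, not part of the commit, of how the new fallback path might be exercised. It assumes the ClientBase object above is reachable from the caller and builds a Configuration that loads no *-default.xml resources, so yarn.application.classpath is genuinely unset; the demo object name is hypothetical:

    import scala.collection.mutable.HashMap

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.yarn.api.ApplicationConstants.Environment

    import org.apache.spark.deploy.yarn.ClientBase

    // Hypothetical demo object, not part of the commit or of Spark.
    object ClasspathFallbackDemo {
      def main(args: Array[String]): Unit = {
        // loadDefaults = false: no *-default.xml is read, so
        // yarn.application.classpath is unset and getStrings returns null.
        val conf = new Configuration(false)
        val env = new HashMap[String, String]()

        // With the key unset, the change above falls back to the defaults
        // declared on MRJobConfig, resolved reflectively because their type
        // and presence vary across Hadoop versions.
        ClientBase.populateHadoopClasspath(conf, env)

        env.get(Environment.CLASSPATH.name).foreach(println)
      }
    }

Returning null from the getDefault* helpers stands in for a missing field, which is why populateHadoopClasspath null-checks mrClasspathEntries before iterating. One hedge worth noting: a reflective Field lookup signals absence with the checked NoSuchFieldException rather than NoSuchFieldError, so whether the catch clauses as written engage on Hadoop versions lacking these constants is worth verifying.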