Commit 66b93b9

jongyoul authored and Leemoonsoo committed
[ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node
- Since Spark 1.4, Spark supports pyspark on a yarn cluster without deploying python libraries
  - https://issues.apache.org/jira/browse/SPARK-6869
  - apache/spark#5580, apache/spark#5478

Author: Jongyoul Lee <[email protected]>

Closes #118 from jongyoul/ZEPPELIN-18 and squashes the following commits:

a47e27c [Jongyoul Lee] - Fixed test script for spark 1.4.0
72a65fd [Jongyoul Lee] - Fixed test script for spark 1.4.0
ee6d100 [Jongyoul Lee] - Cleanup codes
47fd9c9 [Jongyoul Lee] - Cleanup codes
248e330 [Jongyoul Lee] - Cleanup codes
4cd10b5 [Jongyoul Lee] - Removed meaningless codes comments
c9cda29 [Jongyoul Lee] - Removed setting SPARK_HOME - Changed the location of pyspark's directory into interpreter/spark
ef240f5 [Jongyoul Lee] - Fixed typo
06002fd [Jongyoul Lee] - Fixed typo
4b35c8d [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - Dummy for trigger
682986e [Jongyoul Lee] rebased
8a7bf47 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - rebasing
ad610fb [Jongyoul Lee] rebased
94bdf30 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - Fixed checkstyle
929333d [Jongyoul Lee] rebased
64b8195 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - rebasing
0a2d90e [Jongyoul Lee] rebased
b05ae6e [Jongyoul Lee] [ZEPPELIN-18] Remove setting SPARK_HOME for PySpark - Excludes python/** from apache-rat
71e2a92 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - Removed verbose setting
0ddb436 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - Followed spark's way to support pyspark - https://issues.apache.org/jira/browse/SPARK-6869 - apache/spark#5580 - https://github.com/apache/spark/pull/5478/files
1b192f6 [Jongyoul Lee] [ZEPPELIN-18] Remove setting SPARK_HOME for PySpark - Removed redundant dependency setting
32fd9e1 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - rebasing

(cherry picked from commit 3bd2b21)
Signed-off-by: Lee moon soo <[email protected]>
1 parent 8d15d7c commit 66b93b9
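
For context, the Spark side of this works roughly as follows: from Spark 1.4 (SPARK-6869, apache/spark#5580), a driver can ship pyspark.zip and the py4j zip to YARN containers as distributed files and have Spark put those archives on the executors' PYTHONPATH, so nothing has to be pre-installed on the nodes. Below is a minimal, hypothetical sketch of that mechanism; the class name, master setting, and file paths are illustrative and are not part of this commit.

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

// Minimal sketch of the Spark 1.4 pyspark-on-YARN mechanism this commit relies on.
// Paths and application name below are made up for illustration.
public class PySparkOnYarnSketch {
  public static void main(String[] args) {
    String pysparkZip = "/opt/zeppelin/interpreter/spark/pyspark/pyspark.zip";
    String py4jZip = "/opt/zeppelin/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip";

    SparkConf conf = new SparkConf()
        .setAppName("pyspark-on-yarn-sketch")
        .setMaster("yarn-client")
        // YARN copies these local files into every executor container.
        .set("spark.yarn.dist.files", pysparkZip + "," + py4jZip)
        .set("spark.files", pysparkZip + "," + py4jZip)
        // Spark adds these archive names to the executors' PYTHONPATH.
        .set("spark.submit.pyArchives", "pyspark.zip:py4j-0.8.2.1-src.zip");

    SparkContext sc = new SparkContext(conf);
    // ... PySpark work would go through py4j against this context ...
    sc.stop();
  }
}

The SparkInterpreter.java change at the bottom of this commit does essentially this, but only after checking that both archives actually exist on disk.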

6 files changed (+117, −27 lines)

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -73,3 +73,4 @@ auto-save-list
 tramp
 .\#*
 *.swp
+**/dependency-reduced-pom.xml

.travis.yml

Lines changed: 3 additions & 3 deletions

@@ -22,16 +22,16 @@ before_install:
   - "sh -e /etc/init.d/xvfb start"
 
 install:
-  - mvn package -DskipTests -Phadoop-2.3 -B
+  - mvn package -DskipTests -Phadoop-2.3 -Ppyspark -B
 
 before_script:
   -
 
 script:
 # spark 1.4
-  - mvn package -Pbuild-distr -Phadoop-2.3 -B
+  - mvn package -Pbuild-distr -Phadoop-2.3 -Ppyspark -B
   - ./testing/startSparkCluster.sh 1.4.0 2.3
-  - SPARK_HOME=./spark-1.4.1-bin-hadoop2.3 mvn verify -Pusing-packaged-distr -Phadoop-2.3 -B
+  - SPARK_HOME=`pwd`/spark-1.4.0-bin-hadoop2.3 mvn verify -Pusing-packaged-distr -Phadoop-2.3 -Ppyspark -B
   - ./testing/stopSparkCluster.sh 1.4.0 2.3
 # spark 1.3
   - mvn clean package -DskipTests -Pspark-1.3 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,spark'

bin/interpreter.sh

Lines changed: 13 additions & 0 deletions

@@ -73,6 +73,19 @@ if [[ ! -d "${ZEPPELIN_LOG_DIR}" ]]; then
   $(mkdir -p "${ZEPPELIN_LOG_DIR}")
 fi
 
+if [[ ! -z "${SPARK_HOME}" ]]; then
+  PYSPARKPATH="${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip"
+else
+  PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip"
+fi
+
+if [[ x"" == x"${PYTHONPATH}" ]]; then
+  export PYTHONPATH="${PYSPARKPATH}"
+else
+  export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}"
+fi
+
+unset PYSPARKPATH
 
 ${ZEPPELIN_RUNNER} ${JAVA_INTP_OPTS} -cp ${CLASSPATH} ${ZEPPELIN_SERVER} ${PORT} &
 pid=$!
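
In effect, when SPARK_HOME is set the interpreter process reuses the pyspark and py4j archives shipped with that Spark distribution; otherwise it falls back to the copies that the pyspark build profile places under interpreter/spark/pyspark. Either way the archives are appended to any existing PYTHONPATH before the interpreter JVM is launched.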

spark/pom.xml

Lines changed: 70 additions & 7 deletions

@@ -48,6 +48,8 @@
 
     <akka.group>org.spark-project.akka</akka.group>
     <akka.version>2.3.4-spark</akka.version>
+
+    <spark.download.url>http://www.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz</spark.download.url>
   </properties>
 
   <repositories>
@@ -473,13 +475,6 @@
       </exclusions>
     </dependency>
 
-    <!-- pyspark -->
-    <dependency>
-      <groupId>net.sf.py4j</groupId>
-      <artifactId>py4j</artifactId>
-      <version>0.8.2.1</version>
-    </dependency>
-
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-exec</artifactId>
@@ -723,6 +718,74 @@
       </dependencies>
     </profile>
 
+    <profile>
+      <id>pyspark</id>
+      <properties>
+        <spark.download.url>http://www.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz
+        </spark.download.url>
+      </properties>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>com.googlecode.maven-download-plugin</groupId>
+            <artifactId>download-maven-plugin</artifactId>
+            <version>1.2.1</version>
+            <executions>
+              <execution>
+                <id>download-pyspark-files</id>
+                <phase>validate</phase>
+                <goals>
+                  <goal>wget</goal>
+                </goals>
+                <configuration>
+                  <url>${spark.download.url}</url>
+                  <unpack>true</unpack>
+                  <outputDirectory>${project.build.directory}/spark-dist</outputDirectory>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+          <plugin>
+            <artifactId>maven-clean-plugin</artifactId>
+            <configuration>
+              <filesets>
+                <fileset>
+                  <directory>${basedir}/../python/build</directory>
+                </fileset>
+                <fileset>
+                  <directory>${project.build.directory}/spark-dist</directory>
+                </fileset>
+              </filesets>
+            </configuration>
+          </plugin>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-antrun-plugin</artifactId>
+            <version>1.7</version>
+            <executions>
+              <execution>
+                <id>download-and-zip-pyspark-files</id>
+                <phase>generate-resources</phase>
+                <goals>
+                  <goal>run</goal>
+                </goals>
+                <configuration>
+                  <target>
+                    <delete dir="../interpreter/spark/pyspark"/>
+                    <copy todir="../interpreter/spark/pyspark"
+                          file="${project.build.directory}/spark-dist/spark-${spark.version}/python/lib/py4j-0.8.2.1-src.zip"/>
+                    <zip destfile="${project.build.directory}/../../interpreter/spark/pyspark/pyspark.zip"
+                         basedir="${project.build.directory}/spark-dist/spark-${spark.version}/python"
+                         includes="pyspark/*.py,pyspark/**/*.py"/>
+                  </target>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+
     <!-- Build without Hadoop dependencies that are included in some runtime environments. -->
     <profile>
       <id>hadoop-provided</id>
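
With the py4j Maven dependency removed above, the new pyspark profile supplies the Python artifacts instead: when the profile is enabled (the -Ppyspark flag added to the .travis.yml commands earlier in this commit), the build downloads the Spark ${spark.version} distribution named by spark.download.url, zips its python/pyspark sources into interpreter/spark/pyspark/pyspark.zip, and copies the bundled py4j-0.8.2.1-src.zip next to it, which is exactly where bin/interpreter.sh looks when SPARK_HOME is not set.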

spark/src/main/java/org/apache/zeppelin/spark/PySparkInterpreter.java

Lines changed: 0 additions & 12 deletions

@@ -159,18 +159,6 @@ public void open() {
     try {
       Map env = EnvironmentUtils.getProcEnvironment();
 
-      String pythonPath = (String) env.get("PYTHONPATH");
-      if (pythonPath == null) {
-        pythonPath = "";
-      } else {
-        pythonPath += ":";
-      }
-
-      pythonPath += getSparkHome() + "/python/lib/py4j-0.8.2.1-src.zip:"
-          + getSparkHome() + "/python";
-
-      env.put("PYTHONPATH", pythonPath);
-
       executor.execute(cmd, env, this);
       pythonscriptRunning = true;
     } catch (IOException e) {
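
The PYTHONPATH wiring that used to happen here now comes from two places instead: bin/interpreter.sh exports it for the local Python process, and SparkInterpreter (next file) sets the corresponding Spark properties for the YARN executors, so PySparkInterpreter no longer needs SPARK_HOME to locate the libraries.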

spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java

Lines changed: 30 additions & 5 deletions

@@ -26,12 +26,9 @@
 import java.lang.reflect.Method;
 import java.net.URL;
 import java.net.URLClassLoader;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
+import java.util.*;
 
+import com.google.common.base.Joiner;
 import org.apache.spark.HttpServer;
 import org.apache.spark.SparkConf;
 import org.apache.spark.SparkContext;
@@ -273,6 +270,34 @@ public SparkContext createSparkContext() {
       }
     }
 
+    //TODO(jongyoul): Move these codes into PySparkInterpreter.java
+
+    String pysparkBasePath = getSystemDefault("SPARK_HOME", "spark.home", null);
+    File pysparkPath;
+    if (null == pysparkBasePath) {
+      pysparkBasePath = getSystemDefault("ZEPPELIN_HOME", "zeppelin.home", "../");
+      pysparkPath = new File(pysparkBasePath,
+          "interpreter" + File.separator + "spark" + File.separator + "pyspark");
+    } else {
+      pysparkPath = new File(pysparkBasePath,
+          "python" + File.separator + "lib");
+    }
+
+    String[] pythonLibs = new String[]{"pyspark.zip", "py4j-0.8.2.1-src.zip"};
+    ArrayList<String> pythonLibUris = new ArrayList<>();
+    for (String lib : pythonLibs) {
+      File libFile = new File(pysparkPath, lib);
+      if (libFile.exists()) {
+        pythonLibUris.add(libFile.toURI().toString());
+      }
+    }
+    pythonLibUris.trimToSize();
+    if (pythonLibs.length == pythonLibUris.size()) {
+      conf.set("spark.yarn.dist.files", Joiner.on(",").join(pythonLibUris));
+      conf.set("spark.files", conf.get("spark.yarn.dist.files"));
+      conf.set("spark.submit.pyArchives", Joiner.on(":").join(pythonLibs));
+    }
+
     SparkContext sparkContext = new SparkContext(conf);
     return sparkContext;
   }
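
Note the guard at the end of the added block: spark.yarn.dist.files, spark.files, and spark.submit.pyArchives are only set when both pyspark.zip and py4j-0.8.2.1-src.zip are actually found on disk, so a build made without the pyspark profile (or an unexpected SPARK_HOME layout) leaves the SparkConf untouched rather than pointing it at missing files.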
