1 change: 1 addition & 0 deletions docs/_includes/themes/zeppelin/_navigation.html
@@ -105,6 +105,7 @@
<li class="title"><span><b>Advanced</b></span></li>
<li><a href="{{BASE_PATH}}/install/virtual_machine.html">Zeppelin on Vagrant VM</a></li>
<li><a href="{{BASE_PATH}}/install/spark_cluster_mode.html#spark-standalone-mode">Zeppelin on Spark Cluster Mode (Standalone)</a></li>
<li><a href="{{BASE_PATH}}/install/spark_cluster_mode.html#spark-yarn-mode">Zeppelin on Spark Cluster Mode (YARN)</a></li>
<li role="separator" class="divider"></li>
<li class="title"><span><b>Contribute</b></span></li>
<li><a href="{{BASE_PATH}}/development/writingzeppelininterpreter.html">Writing Zeppelin Interpreter</a></li>
1 change: 1 addition & 0 deletions docs/index.md
@@ -170,6 +170,7 @@ Join our [Mailing list](https://zeppelin.apache.org/community.html) and repor
* Advanced
* [Apache Zeppelin on Vagrant VM](./install/virtual_machine.html)
* [Zeppelin on Spark Cluster Mode (Standalone via Docker)](./install/spark_cluster_mode.html#spark-standalone-mode)
* [Zeppelin on Spark Cluster Mode (YARN via Docker)](./install/spark_cluster_mode.html#spark-yarn-mode)
* Contribute
* [Writing Zeppelin Interpreter](./development/writingzeppelininterpreter.html)
* [Writing Zeppelin Application (Experimental)](./development/writingzeppelinapplication.html)
74 changes: 71 additions & 3 deletions docs/install/spark_cluster_mode.md
@@ -1,7 +1,7 @@
---
layout: page
title: "Apache Zeppelin on Spark cluster mode"
description: ""
description: "This document explains how to build and configure an environment for three types of Spark cluster managers with Apache Zeppelin, using Docker scripts."
group: install
---
<!--
@@ -56,12 +56,12 @@ spark_standalone bash;
```

### 3. Configure Spark interpreter in Zeppelin
Set the Spark master to `spark://localhost:7077` on the Zeppelin **Interpreters** setting page.
Set the Spark master to `spark://<hostname>:7077` on the Zeppelin **Interpreters** setting page.

<img src="../assets/themes/zeppelin/img/docs-img/standalone_conf.png" />

### 4. Run Zeppelin with Spark interpreter
After running a single paragraph with the Spark interpreter in Zeppelin, browse `https://localhost:8080` and check whether the Spark cluster is running.
After running a single paragraph with the Spark interpreter in Zeppelin, browse `https://<hostname>:8080` and check whether the Spark cluster is running.

<img src="../assets/themes/zeppelin/img/docs-img/spark_ui.png" />

@@ -72,3 +72,71 @@ ps -ef | grep spark
```


## Spark on YARN mode
You can simply set up a [Spark on YARN](http://spark.apache.org/docs/latest/running-on-yarn.html) Docker environment with the steps below.

> **Note:** Since Apache Zeppelin and Spark both use port `8080` for their web UIs by default, you might need to change `zeppelin.server.port` in `conf/zeppelin-site.xml`.
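For example, moving Zeppelin off `8080` could look like this in `conf/zeppelin-site.xml` (a sketch; `zeppelin.server.port` is the standard property, while `8180` is just an arbitrary free port):

```
<property>
  <name>zeppelin.server.port</name>
  <value>8180</value>
</property>
```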

### 1. Build Docker file
You can find the Docker script files under `scripts/docker/spark-cluster-managers`.

```
cd $ZEPPELIN_HOME/scripts/docker/spark-cluster-managers/spark_yarn
docker build -t "spark_yarn" .
```
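If the build succeeds, the new image should show up in your local image list (standard Docker CLI; the repository name matches the `-t` flag above):

```
docker images spark_yarn
```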

### 2. Run docker

```
docker run -it \
-p 5000:5000 \
-p 9000:9000 \
-p 9001:9001 \
-p 8088:8088 \
-p 8042:8042 \
-p 8030:8030 \
-p 8031:8031 \
-p 8032:8032 \
-p 8033:8033 \
-p 8080:8080 \
-p 7077:7077 \
-p 8888:8888 \
-p 8081:8081 \
-p 50010:50010 \
-p 50075:50075 \
-p 50020:50020 \
-p 50070:50070 \
--name spark_yarn \
-h sparkmaster \
spark_yarn bash;
```
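For reference, the published ports group as follows (mirroring the `EXPOSE` comments in the Dockerfile added later in this PR; `5000` is published here without a matching `EXPOSE`):

```
# HDFS ports:   50010 50020 50070 50075
# Mapred ports: 9000 9001
# YARN ports:   8030 8031 8032 8033 8042 8088
# Spark ports:  8080 7077 8888 8081
```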

### 3. Verify Spark on YARN is running

You can verify that the Spark and YARN processes are running inside the container with the command below.

```
ps -ef
```

> **Review thread on this command:**
>
> **Contributor:** you mean `ps -ef | grep spark`?
>
> **@astroshim** (author, Aug 11, 2016): hadoop is also running so just `ps -ef` is the best way?
>
> **@AhyoungRyu** (Aug 11, 2016): Ah right. But I just wanted to filter the processes list.
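If you do want a filtered view, a single pattern covering both stacks works (a sketch; exact process strings vary):

```
ps -ef | egrep -i 'spark|hadoop|yarn'
```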

You can also check each application's web UI: HDFS at `http://<hostname>:50070/`, YARN at `http://<hostname>:8088/cluster`, and Spark at `http://<hostname>:8080/`.

### 4. Configure Spark interpreter in Zeppelin
Add the following configuration to `conf/zeppelin-env.sh`.

```
export MASTER=yarn-client
export HADOOP_CONF_DIR=[your_hadoop_conf_path]
export SPARK_HOME=[your_spark_home_path]
```

The Hadoop configuration that `HADOOP_CONF_DIR` should point to is defined in `/scripts/docker/spark-cluster-managers/spark_yarn_cluster/hdfs_conf`.
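Concretely, if Zeppelin runs on the Docker host, the settings might look like this (a sketch: the `hdfs_conf` path follows from the sentence above; the `SPARK_HOME` value is an assumption about your local Spark install):

```
export MASTER=yarn-client
export HADOOP_CONF_DIR=$ZEPPELIN_HOME/scripts/docker/spark-cluster-managers/spark_yarn_cluster/hdfs_conf
export SPARK_HOME=/usr/local/spark  # assumption: your local Spark, matching the cluster's version
```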

Don't forget to set the Spark `master` property to `yarn-client` on the Zeppelin **Interpreters** setting page, as shown below.

<img src="../assets/themes/zeppelin/img/docs-img/zeppelin_yarn_conf.png" />

### 5. Run Zeppelin with Spark interpreter
After running a single paragraph with the Spark interpreter in Zeppelin, browse `http://<hostname>:8088/cluster/apps` and check whether the Zeppelin application is running.

<img src="../assets/themes/zeppelin/img/docs-img/yarn_applications.png" />
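As a minimal smoke test, any paragraph that triggers a Spark action will do, for example (a sketch, using the default Scala interpreter):

```
// a trivial Spark action: forces YARN to launch the application and allocate executors
val rdd = sc.parallelize(1 to 1000)
println(rdd.sum())  // prints 500500.0 once executors are up
```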
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM centos:centos6
MAINTAINER [email protected]

ENV SPARK_PROFILE 1.6
ENV SPARK_VERSION 1.6.2
107 changes: 107 additions & 0 deletions scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
@@ -0,0 +1,107 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM centos:centos6

ENV SPARK_PROFILE 2.0
ENV SPARK_VERSION 2.0.0
ENV HADOOP_PROFILE 2.7
ENV HADOOP_VERSION 2.7.0

# Update the image with the latest packages
RUN yum update -y; yum clean all

# Get utils
RUN yum install -y \
wget \
tar \
curl \
&& \
yum clean all

# Remove old jdk
RUN yum remove -y java; yum remove -y jdk

# install jdk7
RUN yum install -y java-1.7.0-openjdk-devel
ENV JAVA_HOME /usr/lib/jvm/java
ENV PATH $PATH:$JAVA_HOME/bin

# install hadoop
RUN yum install -y curl which tar sudo openssh-server openssh-clients rsync

# hadoop
RUN curl -s https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz | tar -xz -C /usr/local/
RUN cd /usr/local && ln -s ./hadoop-$HADOOP_VERSION hadoop

ENV HADOOP_PREFIX /usr/local/hadoop
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop

RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/jre-1.7.0-openjdk.x86_64\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh

RUN mkdir $HADOOP_PREFIX/input
RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input

# hadoop configurations
ADD hdfs_conf/core-site.xml $HADOOP_PREFIX/etc/hadoop/core-site.xml
ADD hdfs_conf/hdfs-site.xml $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml
ADD hdfs_conf/mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml
ADD hdfs_conf/yarn-site.xml $HADOOP_PREFIX/etc/hadoop/yarn-site.xml

RUN mkdir /data/
RUN chmod 777 /data/
RUN $HADOOP_PREFIX/bin/hdfs namenode -format

RUN rm /usr/local/hadoop/lib/native/*
RUN curl -Ls http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-$HADOOP_VERSION.tar|tar -x -C /usr/local/hadoop/lib/native/

# install spark
RUN curl -s http://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz | tar -xz -C /usr/local/
RUN cd /usr/local && ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE spark
ENV SPARK_HOME /usr/local/spark

ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
ENV PATH $PATH:$SPARK_HOME/bin:$HADOOP_PREFIX/bin

# passwordless ssh
RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key
RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys

ADD ssh_config /root/.ssh/config
RUN chmod 600 /root/.ssh/config
RUN chown root:root /root/.ssh/config
RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh

# update boot script
COPY entrypoint.sh /etc/entrypoint.sh
RUN chown root:root /etc/entrypoint.sh
RUN chmod 700 /etc/entrypoint.sh

# HDFS ports
EXPOSE 50010 50020 50070 50075 50090
# Mapred ports
EXPOSE 9000 9001
# YARN ports
EXPOSE 8030 8031 8032 8033 8040 8042 8088
# Spark ports
EXPOSE 8080 7077 8888 8081

ENTRYPOINT ["/etc/entrypoint.sh"]
60 changes: 60 additions & 0 deletions scripts/docker/spark-cluster-managers/spark_yarn_cluster/entrypoint.sh
@@ -0,0 +1,60 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

echo 'hadoop' |passwd root --stdin

: ${HADOOP_PREFIX:=/usr/local/hadoop}

. $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh  # source it so its environment variables take effect in this shell

rm -f /tmp/*.pid  # -f: don't fail when there are no stale pid files

# install extra libraries, if any (resource URLs are passed comma-separated in the ACP environment variable)
cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -

cp $SPARK_HOME/conf/metrics.properties.template $SPARK_HOME/conf/metrics.properties

# start hadoop
service sshd start
$HADOOP_PREFIX/sbin/start-dfs.sh
$HADOOP_PREFIX/sbin/start-yarn.sh

$HADOOP_PREFIX/bin/hdfs dfsadmin -safemode leave && $HADOOP_PREFIX/bin/hdfs dfs -put $SPARK_HOME-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE/lib /spark

# start spark
export SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002
-Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004
-Dspark.blockManager.port=7005 -Dspark.executor.port=7006
-Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
export SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002
-Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004
-Dspark.blockManager.port=7005 -Dspark.executor.port=7006
-Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"

export SPARK_MASTER_PORT=7077

cd /usr/local/spark/sbin
./start-master.sh
./start-slave.sh spark://`hostname`:$SPARK_MASTER_PORT

CMD=${1:-"exit 0"}
if [[ "$CMD" == "-d" ]];
then
service sshd stop
/usr/sbin/sshd -D -d
else
/bin/bash -c "$*"
fi
22 changes: 22 additions & 0 deletions scripts/docker/spark-cluster-managers/spark_yarn_cluster/hdfs_conf/core-site.xml
@@ -0,0 +1,22 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://0.0.0.0:9000</value>
</property>
</configuration>
78 changes: 78 additions & 0 deletions scripts/docker/spark-cluster-managers/spark_yarn_cluster/hdfs_conf/hdfs-site.xml
@@ -0,0 +1,78 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>

<property>
<name>dfs.data.dir</name>
<value>/data/hdfs</value>
<final>true</final>
</property>

<property>
<name>dfs.permissions</name>
<value>false</value>
</property>

<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
<description>Whether clients should use datanode hostnames when
connecting to datanodes.
</description>
</property>

<property>
<name>dfs.datanode.use.datanode.hostname</name>
<value>true</value>
<description>Whether datanodes should use datanode hostnames when
connecting to other datanodes for data transfer.
</description>
</property>

<property>
<name>dfs.datanode.address</name>
<value>0.0.0.0:50010</value>
<description>
The address the datanode server will listen on.
If the port is 0 then the server will start on a free port.
</description>
</property>

<property>
<name>dfs.datanode.http.address</name>
<value>0.0.0.0:50075</value>
<description>
The datanode http server address and port.
If the port is 0 then the server will start on a free port.
</description>
</property>

<property>
<name>dfs.datanode.ipc.address</name>
<value>0.0.0.0:50020</value>
<description>
The datanode ipc server address and port.
If the port is 0 then the server will start on a free port.
</description>
</property>

</configuration>
