Commit ee6c17a (parent 82a7746) — 17 changed files, 713 additions, 2 deletions.
File: Dockerfile

```dockerfile
# Based on the bitnami/spark image, which is built on a slim Debian 11 system
# Spark 3.5.0
# Compatible with Hadoop 3.3+
FROM bitnami/spark:3.5.0

LABEL maintainer="somebottle <[email protected]>"
LABEL description="Docker image with Spark 3.5.0 and Hadoop 3.3.6, based on bitnami/spark image. For my graduation project."

# Environment variables
# Hostnames of all nodes, used for SSH configuration
ENV SH_HOSTS="shmain shworker1 shworker2"
# Hadoop version
ENV HADOOP_VER="3.3.6"
# Hadoop installation directory
ENV HADOOP_HOME="/opt/hadoop"
# Hadoop configuration directory
ENV HADOOP_CONF_DIR="/opt/hadoop/etc/hadoop"
# Hadoop log directory
ENV HADOOP_LOG_DIR="/var/log/hadoop"
# Add the Hadoop directories to PATH
ENV PATH="$HADOOP_HOME/sbin:$HADOOP_HOME/bin:$PATH"

# Perform the following steps as root
USER root

# Generate a temporary SSH password, used to exchange SSH keys on first startup
RUN echo $(openssl rand -base64 32) > /root/temp.pass
# Set the root user's password to the temporary password
RUN echo -e "$(cat /root/temp.pass)\n$(cat /root/temp.pass)" | passwd root
# Write the hostnames to a file
RUN echo "$SH_HOSTS" > /root/exchange_hosts

# Create the .ssh directory if it does not exist
RUN [ -d /root/.ssh ] || mkdir -p /root/.ssh
# Create the flag directory for key exchange
RUN mkdir -p /root/.ssh/exchange_flags

# Switch to a different apt mirror
COPY resources/sources.list /tmp/sources.list
RUN mv /tmp/sources.list /etc/apt/sources.list

# Update the apt index and install openssh-server, wget, vim and sshpass
RUN apt-get update && apt-get install -y openssh-server wget vim sshpass

# Switch to the installation directory /opt
WORKDIR /opt
# Download Hadoop from the Tsinghua mirror and extract it to /opt/hadoop
RUN wget https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VER}/hadoop-${HADOOP_VER}.tar.gz \
    && tar -zxf hadoop-${HADOOP_VER}.tar.gz \
    && mv hadoop-${HADOOP_VER} hadoop \
    && rm -f hadoop-${HADOOP_VER}.tar.gz

# Temporary directory for configuration files
RUN mkdir /tmp/tmp_configs

# Copy in the configuration files
COPY configs/* /tmp/tmp_configs/

# Move each configuration file into place
RUN mv /tmp/tmp_configs/core-site.xml ${HADOOP_CONF_DIR}/core-site.xml \
    && mv /tmp/tmp_configs/hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml \
    && mv /tmp/tmp_configs/mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml \
    && mv /tmp/tmp_configs/yarn-site.xml ${HADOOP_CONF_DIR}/yarn-site.xml \
    && mv /tmp/tmp_configs/hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh \
    && mv /tmp/tmp_configs/workers ${HADOOP_CONF_DIR}/workers \
    && mv /tmp/tmp_configs/ssh_config /root/.ssh/config \
    && mv /tmp/tmp_configs/sshd_config /etc/ssh/sshd_config \
    && rm -rf /tmp/tmp_configs

# Tighten permissions on the .ssh directory
RUN chmod 600 /root/.ssh/config \
    && chmod 700 /root/.ssh

# Copy in the startup scripts
COPY scripts/* /opt/

# Make the scripts executable
RUN chmod +x /opt/start-hadoop.sh \
    && chmod +x /opt/stop-hadoop.sh \
    && chmod +x /opt/entry.sh \
    && chmod +x /opt/ssh_key_exchange.sh \
    && chmod +x $HADOOP_HOME/sbin/start-dfs.sh \
    && chmod +x $HADOOP_HOME/sbin/start-yarn.sh \
    && chmod +x $HADOOP_HOME/sbin/stop-dfs.sh \
    && chmod +x $HADOOP_HOME/sbin/stop-yarn.sh

# Create the HDFS directories
RUN mkdir -p /root/hdfs/name \
    && mkdir -p /root/hdfs/data

# Format the HDFS NameNode
RUN hdfs namenode -format

# Script to run on container startup
ENTRYPOINT [ "/opt/entry.sh" ]
```
File: README.md

# Hadoop + Spark Pseudo-Distributed Containerized Deployment
This image is based on `bitnami/spark:3.5.0`, runs on `Debian 11`, and executes as the `root` user.

It targets local cluster testing, i.e. a **pseudo-distributed** setup.

* Once the containers are brought up with Docker Compose, the nodes **automatically exchange SSH public keys, enabling passwordless SSH logins between nodes**.
* The image has been tested on **WSL**.
* [Docker hub](https://hub.docker.com/r/somebottle/haspark)

## Versions

* Hadoop `3.3.6`
* Spark `3.5.0`

## Node Layout

1 master + 2 workers.

> To change this, you need to [edit several files](#changing-the-number-of-nodes) and rebuild the image.

## Additional Environment Variables

The following environment variable is added on top of those provided by `bitnami/spark`:

| Name | Description | Default |
| --- | --- | --- |
| HADOOP_MODE | Hadoop mode; if set to `master`, this container runs the cluster startup commands | (empty) |

## Deployment

### 1. Pull the image

```bash
docker pull somebottle/haspark[:tag]
```
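
For example, to pull the tag used in the sample Compose file below:

```bash
docker pull somebottle/haspark:3.0.1
```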

### 2. Write the Docker Compose configuration

On **first startup**, several Docker volumes are created and the NameNode data formatted inside the image is copied into them.

These volumes remain mapped to the HDFS `NameNode` and `DataNode` directories, so HDFS data persists across container restarts (unless you remove the volumes).
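
To see what was actually created, you can list and inspect the volumes. Note that Compose normally prefixes volume names with the project (directory) name, so the exact names may differ from those in the file. A quick check:

```bash
docker volume ls | grep haspark       # list the HDFS volumes
docker volume inspect <volume-name>   # replace with a listed name to see its host mountpoint
```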

> Docker Compose volume docs:
> https://docs.docker.com/storage/volumes/#use-a-volume-with-docker-compose

Create a `docker-compose.yml` in a new directory.

A sample configuration for the 1 master + 2 workers layout:

<details>
<summary>Expand to view</summary>

```yaml
version: '3'

services:
  haspark-main:
    image: somebottle/haspark:3.0.1
    hostname: shmain
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - HADOOP_MODE=master # start the Hadoop cluster from this (master) container
    volumes:
      - haspark-hdfs-name-data:/root/hdfs/name:copy # map a Docker volume to /root/hdfs/name; on creation it copies the NameNode data formatted in the image
      - ~/docker/spark/share:/opt/share # shared directory mounted into all three containers
    ports:
      - '8080:8080'
      - '4040:4040'
      - '8088:8088'
      - '8042:8042'
      - '9870:9870'
      - '19888:19888'
  haspark-worker-1:
    image: somebottle/haspark:3.0.1
    hostname: shworker1
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://shmain:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
      - haspark-hdfs-worker1-data:/root/hdfs/data # DataNode data
    ports:
      - '8081:8081'
  haspark-worker-2:
    image: somebottle/haspark:3.0.1
    hostname: shworker2
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://shmain:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
      - haspark-hdfs-worker2-data:/root/hdfs/data # DataNode data
    ports:
      - '8082:8081'

volumes:
  haspark-hdfs-name-data:
  haspark-hdfs-worker1-data:
  haspark-hdfs-worker2-data:
```

</details>

**You can also simply use the `docker-compose.yml` from this repository.**

### 3. Bring the containers up

Run this in the directory containing `docker-compose.yml`:

```bash
docker compose up -d
```
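
To confirm all three services are running (a quick sanity check, not part of the original setup steps):

```bash
docker compose ps   # haspark-main, haspark-worker-1 and haspark-worker-2 should all be running
```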

### 4. Take the containers down

Run these in the directory containing `docker-compose.yml`.

To take the containers down while keeping the HDFS data:

```bash
docker compose down
```

If you want to wipe the HDFS data as well

(this removes all of the associated Docker volumes):

```bash
docker compose down -v # v stands for volumes
```

### 5. Starting and stopping Hadoop

After the containers start, the Hadoop cluster startup script **should run automatically once passwordless SSH has been set up**. If it does not, you can run it manually:

```bash
/opt/start-hadoop.sh
```

To stop the Hadoop cluster:

```bash
/opt/stop-hadoop.sh
```
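
To verify that the daemons actually came up, you can check the Java processes and the HDFS report from the master container — a sketch, assuming the JDK's `jps` tool is on the `PATH` inside the container:

```bash
docker compose exec haspark-main jps                   # expect NameNode, SecondaryNameNode, ResourceManager
docker compose exec haspark-main hdfs dfsadmin -report # should list 2 live DataNodes
```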

## Rebuilding the image

### Changing the number of nodes

The default node hostnames are:

- `shmain` (master)
- `shworker1` (worker 1)
- `shworker2` (worker 2)

To change the hostnames or add worker nodes:

1. Update `hostname`, `SPARK_MASTER_URL`, the volume mounts, etc. in `docker-compose.yml`.
2. Update the `SH_HOSTS` environment variable at the top of the `Dockerfile`.
3. Update the Hadoop configuration: mainly `core-site.xml` and the `workers` file; `yarn-site.xml` may also need changes.
4. Update the `ssh_config` file.
5. Rebuild the image:

```bash
docker build -t somebottle/haspark[:tag] . --network host
```

> `--network host` is particularly useful on WSL: the build shares the host's network, without which the container may have no network access during the build.
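
As a concrete sketch, adding a hypothetical third worker `shworker3` would touch at least the host list and the `workers` file (assuming the `workers` file follows the stock Hadoop format of one worker hostname per line):

```bash
# Dockerfile — extend the host list used for SSH key exchange:
#   ENV SH_HOSTS="shmain shworker1 shworker2 shworker3"

# configs/workers — one worker hostname per line (stock Hadoop format):
cat > configs/workers <<'EOF'
shworker1
shworker2
shworker3
EOF
```

A matching `Host shworker3` entry in `ssh_config` and a new service block in `docker-compose.yml` would complete the change.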

### Changing directories

If you want to run the containers as a non-root user, fairly extensive changes are required.

Files you will likely need to modify:

1. `docker-compose.yml`
2. `Dockerfile`
3. Hadoop configuration: `hdfs-site.xml`
4. The `ssh_key_exchange.sh` script
5. The `start-hadoop.sh` script

Then rebuild the image.

## Acknowledgements

* [Quickly deploying a Spark + Hadoop big-data cluster with Docker — an article by s1mple on Zhihu](https://zhuanlan.zhihu.com/p/421375012)
File: configs/core-site.xml

```xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <!-- HDFS endpoint; shmain is the master node's hostname -->
        <name>fs.defaultFS</name>
        <value>hdfs://shmain:9000</value>
    </property>
</configuration>
```
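
With `fs.defaultFS` set, HDFS commands can use scheme-less paths that resolve against `hdfs://shmain:9000`. A usage sketch:

```bash
hdfs dfs -mkdir -p /user/root   # resolves to hdfs://shmain:9000/user/root
hdfs dfs -ls /
```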
File: configs/hadoop-env.sh

```bash
export JAVA_HOME=/opt/bitnami/java # use the JDK bundled with the bitnami image
export HADOOP_HOME=/opt/hadoop # installation directory
export HADOOP_MAPRED_HOME=/opt/hadoop
export HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
# Hadoop startup options required under JDK 17
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib --add-opens java.base/java.lang=ALL-UNNAMED"

# Run the daemons as root
export HDFS_NAMENODE_USER="root"
export HDFS_DATANODE_USER="root"
export HDFS_SECONDARYNAMENODE_USER="root"
export YARN_RESOURCEMANAGER_USER="root"
export YARN_NODEMANAGER_USER="root"
```
File: configs/hdfs-site.xml

```xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
    <!-- The HDFS NameNode and DataNode directories are placed under the root user's home directory -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///root/hdfs/name</value>
        <description>NameNode directory for namespace and transaction logs storage.</description>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///root/hdfs/data</value>
        <description>DataNode directory</description>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
</configuration>
```
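
`dfs.replication` is set to 2 to match the two DataNodes. You can confirm the replication actually applied to stored files with (a usage sketch):

```bash
hdfs fsck / -files -blocks   # prints each file's block count and replication factor
```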
File: configs/mapred-site.xml

```xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.application.classpath</name>
        <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*</value>
    </property>
</configuration>
```
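
This configuration routes MapReduce jobs through YARN; a quick way to exercise it is the examples jar that ships with Hadoop (assuming the stock location under `$HADOOP_HOME`):

```bash
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar pi 2 10
```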
File: configs/ssh_config

```
Host localhost
    StrictHostKeyChecking no

Host 0.0.0.0
    StrictHostKeyChecking no

Host shmain
    StrictHostKeyChecking no

Host shworker1
    StrictHostKeyChecking no

Host shworker2
    StrictHostKeyChecking no
```