Init commit
SomeBottle committed Jan 23, 2024
1 parent 82a7746 commit ee6c17a
Showing 17 changed files with 713 additions and 2 deletions.
Empty file added .dockerignore
96 changes: 96 additions & 0 deletions Dockerfile
@@ -0,0 +1,96 @@
# Based on the bitnami/spark image, which uses a slimmed-down Debian 11 system
# Built on Spark 3.5.0
# Compatible with Hadoop 3.3+
FROM bitnami/spark:3.5.0

LABEL maintainer="somebottle <[email protected]>"
LABEL description="Docker image with Spark 3.5.0 and Hadoop 3.3.6, based on bitnami/spark image. For my graduation project."

# Environment variables
# Hostnames of all nodes, used for the SSH configuration
ENV SH_HOSTS="shmain shworker1 shworker2"
# Hadoop version
ENV HADOOP_VER="3.3.6"
# Hadoop installation directory
ENV HADOOP_HOME="/opt/hadoop"
# Hadoop configuration directory
ENV HADOOP_CONF_DIR="/opt/hadoop/etc/hadoop"
# Hadoop log directory
ENV HADOOP_LOG_DIR="/var/log/hadoop"
# Add the Hadoop bin and sbin directories to PATH
ENV PATH="$HADOOP_HOME/sbin:$HADOOP_HOME/bin:$PATH"

# The following steps run as root
USER root

# Generate a temporary SSH password, used to exchange SSH keys on first startup
RUN echo $(openssl rand -base64 32) > /root/temp.pass
# Set the root user's password to the temporary password
RUN echo -e "$(cat /root/temp.pass)\n$(cat /root/temp.pass)" | passwd root
# Write the hostnames to a file
RUN echo "$SH_HOSTS" > /root/exchange_hosts


# Create the .ssh directory if it does not exist
RUN [ -d /root/.ssh ] || mkdir -p /root/.ssh
# Create the flag directory used during key exchange
RUN mkdir -p /root/.ssh/exchange_flags

# Replace the APT sources list with a mirror
COPY resources/sources.list /tmp/sources.list
RUN mv /tmp/sources.list /etc/apt/sources.list

# Update the package index and install openssh-server, wget, vim and sshpass
RUN apt-get update && apt-get install -y openssh-server wget vim sshpass

# Switch to the installation directory /opt
WORKDIR /opt
# Download Hadoop from the Tsinghua mirror and extract it to /opt/hadoop
RUN wget https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VER}/hadoop-${HADOOP_VER}.tar.gz \
&& tar -zxf hadoop-${HADOOP_VER}.tar.gz \
&& mv hadoop-${HADOOP_VER} hadoop \
&& rm -f hadoop-${HADOOP_VER}.tar.gz

# Temporary directory for configuration files
RUN mkdir /tmp/tmp_configs

# Copy the configuration files
COPY configs/* /tmp/tmp_configs/

# Move each configuration file to its destination
RUN mv /tmp/tmp_configs/core-site.xml ${HADOOP_CONF_DIR}/core-site.xml \
&& mv /tmp/tmp_configs/hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml \
&& mv /tmp/tmp_configs/mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml \
&& mv /tmp/tmp_configs/yarn-site.xml ${HADOOP_CONF_DIR}/yarn-site.xml \
&& mv /tmp/tmp_configs/hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh \
&& mv /tmp/tmp_configs/workers ${HADOOP_CONF_DIR}/workers \
&& mv /tmp/tmp_configs/ssh_config /root/.ssh/config \
&& mv /tmp/tmp_configs/sshd_config /etc/ssh/sshd_config \
&& rm -rf /tmp/tmp_configs

# Fix permissions on the .ssh directory and its contents
RUN chmod 600 /root/.ssh/config \
&& chmod 700 /root/.ssh

# Copy the startup scripts
COPY scripts/* /opt/

# Make the scripts executable
RUN chmod +x /opt/start-hadoop.sh \
&& chmod +x /opt/stop-hadoop.sh \
&& chmod +x /opt/entry.sh \
&& chmod +x /opt/ssh_key_exchange.sh \
&& chmod +x $HADOOP_HOME/sbin/start-dfs.sh \
&& chmod +x $HADOOP_HOME/sbin/start-yarn.sh \
&& chmod +x $HADOOP_HOME/sbin/stop-dfs.sh \
&& chmod +x $HADOOP_HOME/sbin/stop-yarn.sh

# Create the HDFS directories
RUN mkdir -p /root/hdfs/name \
&& mkdir -p /root/hdfs/data

# Format the HDFS NameNode
RUN hdfs namenode -format

# Script to run when the container starts
ENTRYPOINT [ "/opt/entry.sh" ]
205 changes: 203 additions & 2 deletions README.md
@@ -1,2 +1,203 @@
# haspark
Hadoop + Spark pseudo-distributed containerized deployment
# Hadoop + Spark Pseudo-Distributed Containerized Deployment

This image is based on `bitnami/spark:3.5.0`, runs on `Debian 11`, and executes as the `root` user.

It is intended for local cluster testing, i.e. a **pseudo-distributed** setup.

* The image is fully configured: once the containers are brought up with docker compose, the nodes **automatically exchange SSH public keys to enable passwordless SSH between them** (see the verification sketch below).
* The image has been tested on **WSL**.
* [Docker hub](https://hub.docker.com/r/somebottle/haspark)
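
A quick way to check that the key exchange worked, assuming the service name `haspark-main` from the example `docker-compose.yml` further below, is to SSH from the master container to a worker without a password:

```bash
# Run from the directory containing docker-compose.yml.
# If the key exchange succeeded, this prints "shworker1" without asking for a password.
docker compose exec haspark-main ssh shworker1 hostname
```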

## Versions

* Hadoop `3.3.6`
* Spark `3.5.0`

## Node Layout

1 master + 2 workers.

> To change this, you need to [edit several files](#changing-the-node-count) and rebuild the image.

## Special Environment Variables

The following environment variables are added on top of those provided by `bitnami/spark`:

| Name | Description | Default |
| --- | --- | --- |
| HADOOP_MODE | Hadoop mode; if set to `master`, the commands that start the Hadoop cluster are run in this container | |

## Container Deployment

### 1. Pull the image

```bash
docker pull somebottle/haspark[:tag]
```
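
For example, to pull the tag used in the compose file below (assuming `3.0.1` is the tag you want):

```bash
docker pull somebottle/haspark:3.0.1
```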

### 2. Write the Docker Compose configuration

On **first startup**, several Docker volumes are created, and the NameNode data that was formatted inside the image is copied into them.

These volumes then stay mapped to the HDFS `NameNode` and `DataNode` directories, so HDFS data persists across restarts (unless you remove the volumes).

> Docker Compose volume configuration docs:
> https://docs.docker.com/storage/volumes/#use-a-volume-with-docker-compose

Create a `docker-compose.yml` in a new directory.

An example configuration with 1 master + 2 workers is shown below.

<details>
<summary>Click to expand</summary>

```yaml
version: '3'

services:
  haspark-main:
    image: somebottle/haspark:3.0.1
    hostname: shmain
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - HADOOP_MODE=master # start the Hadoop cluster from this (master) container
    volumes:
      - haspark-hdfs-name-data:/root/hdfs/name:copy # map a Docker volume to /root/hdfs/name; the namenode data formatted inside the image is copied in when the volume is created
      - ~/docker/spark/share:/opt/share # all three containers mount the same shared host directory
    ports:
      - '8080:8080'
      - '4040:4040'
      - '8088:8088'
      - '8042:8042'
      - '9870:9870'
      - '19888:19888'
  haspark-worker-1:
    image: somebottle/haspark:3.0.1
    hostname: shworker1
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://shmain:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
      - haspark-hdfs-worker1-data:/root/hdfs/data # datanode data
    ports:
      - '8081:8081'
  haspark-worker-2:
    image: somebottle/haspark:3.0.1
    hostname: shworker2
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://shmain:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
      - haspark-hdfs-worker2-data:/root/hdfs/data # datanode data
    ports:
      - '8082:8081'

volumes:
  haspark-hdfs-name-data:
  haspark-hdfs-worker1-data:
  haspark-hdfs-worker2-data:
```
</details>

**Of course, you can also use the `docker-compose.yml` from this repository directly.**

### 3. Bring the containers up

Run the following in the directory containing `docker-compose.yml`:

```bash
docker compose up -d
```
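
To check that all three containers are up and to watch the master's startup output (the SSH key exchange and Hadoop startup happen there), for example:

```bash
docker compose ps                    # all three services should be running
docker compose logs -f haspark-main  # follow the key exchange and Hadoop startup logs
```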

### 4. Take the containers down

Run the following in the directory containing `docker-compose.yml`.

To take the containers down while keeping the HDFS data:

```bash
docker compose down
```

If you want to wipe the HDFS data as well (this removes all of the related Docker volumes):

```bash
docker compose down -v # -v stands for volumes
```
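
If you only want to clear the data of a single node, you can instead remove the corresponding named volume after the containers are down. The volume names come from the compose file above; Compose prefixes them with the project name (usually the directory name), so the full name may differ on your machine:

```bash
docker volume ls | grep haspark-hdfs              # list the HDFS volumes and their full names
docker volume rm mydir_haspark-hdfs-worker1-data  # hypothetical full name; adjust the prefix to yours
```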

### 5. Start and stop Hadoop

Normally, after the containers start, the Hadoop cluster startup script **runs automatically once passwordless SSH has been configured**. If it does not, you can run it manually:

```bash
/opt/start-hadoop.sh
```

Script to stop the Hadoop cluster:

```bash
/opt/stop-hadoop.sh
```
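
To sanity-check the cluster after startup, the standard Hadoop and JDK commands (not scripts shipped by this image) can be run inside the master container, for example:

```bash
jps                    # on the master you should typically see NameNode, SecondaryNameNode and ResourceManager
hdfs dfsadmin -report  # should report 2 live DataNodes
yarn node -list        # should list 2 NodeManagers
```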

## Rebuilding the Image

### Changing the node count

The default node hostnames are:

- `shmain` (master)
- `shworker1` (worker1)
- `shworker2` (worker2)

To change the node hostnames or add worker nodes (see the sketch after the build command below):

1. Edit the `hostname`, `SPARK_MASTER_URL`, volume mounts and related settings in `docker-compose.yml`.
2. Edit the `SH_HOSTS` environment variable at the top of the `Dockerfile`.
3. Edit the Hadoop configuration: mainly `core-site.xml` and the `workers` file; `yarn-site.xml` may also need changes.
4. Edit the `ssh_config` file.
5. Rebuild the image.

```bash
docker build -t somebottle/haspark[:tag] . --network host
```

> `--network host` is very useful on WSL: the build uses the same network as the host; otherwise the build may have no network access from inside the container.
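
As a rough sketch of the edits involved (assuming a hypothetical third worker named `shworker3` and a hypothetical image tag `custom`):

```bash
# Dockerfile: add the new hostname to SH_HOSTS
#   ENV SH_HOSTS="shmain shworker1 shworker2 shworker3"
# configs/workers: append the new hostname on its own line
#   shworker3
# configs/ssh_config: add a "Host shworker3" block with "StrictHostKeyChecking no"
# docker-compose.yml: add a haspark-worker-3 service (hostname: shworker3) and a
#   haspark-hdfs-worker3-data volume, mirroring the existing worker services
docker build -t somebottle/haspark:custom . --network host
```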

### Changing directories

If you want to run the containers as a non-root user, fairly extensive changes are required.

Files you may need to modify:

1. `docker-compose.yml`
2. `Dockerfile`
3. Hadoop configuration: `hdfs-site.xml`
4. The `ssh_key_exchange.sh` script
5. The `start-hadoop.sh` script

Then rebuild the image.

## Acknowledgements

* [Quickly deploying a Spark + Hadoop big data cluster with Docker - an article by s1mple - Zhihu](https://zhuanlan.zhihu.com/p/421375012)
9 changes: 9 additions & 0 deletions configs/core-site.xml
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <!-- HDFS endpoint; the master node's hostname is shmain -->
        <name>fs.defaultFS</name>
        <value>hdfs://shmain:9000</value>
    </property>
</configuration>
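
With `fs.defaultFS` set to `hdfs://shmain:9000`, HDFS clients inside any of the three containers resolve paths against this NameNode, whether written as short paths or as full URIs. A quick sanity check once the cluster is up might look like:

```bash
hdfs dfs -mkdir -p /user/root                      # create a directory on HDFS
echo hello | hdfs dfs -put - /user/root/hello.txt  # write a small file from stdin
hdfs dfs -ls hdfs://shmain:9000/user/root          # the full URI and the short form refer to the same path
```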
14 changes: 14 additions & 0 deletions configs/hadoop-env.sh
@@ -0,0 +1,14 @@
export JAVA_HOME=/opt/bitnami/java # the JDK bundled with the bitnami image
export HADOOP_HOME=/opt/hadoop # installation directory
export HADOOP_MAPRED_HOME=/opt/hadoop
export HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
# Hadoop startup options for running under JDK 17
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib --add-opens java.base/java.lang=ALL-UNNAMED"

# Run the daemons as the root user
export HDFS_NAMENODE_USER="root"
export HDFS_DATANODE_USER="root"
export HDFS_SECONDARYNAMENODE_USER="root"
export YARN_RESOURCEMANAGER_USER="root"
export YARN_NODEMANAGER_USER="root"
21 changes: 21 additions & 0 deletions configs/hdfs-site.xml
@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
    <!-- The HDFS NameNode and DataNode directories are placed under root's home directory -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///root/hdfs/name</value>
        <description>NameNode directory for namespace and transaction logs storage.</description>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///root/hdfs/data</value>
        <description>DataNode directory</description>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
</configuration>
25 changes: 25 additions & 0 deletions configs/mapred-site.xml
@@ -0,0 +1,25 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.application.classpath</name>
        <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*</value>
    </property>
</configuration>
14 changes: 14 additions & 0 deletions configs/ssh_config
@@ -0,0 +1,14 @@
Host localhost
StrictHostKeyChecking no

Host 0.0.0.0
StrictHostKeyChecking no

Host shmain
StrictHostKeyChecking no

Host shworker1
StrictHostKeyChecking no

Host shworker2
StrictHostKeyChecking no