From ee6c17aefc498ed978025e1ee4bf10e7b73ae7ed Mon Sep 17 00:00:00 2001 From: SomeBottle Date: Tue, 23 Jan 2024 20:52:15 +0800 Subject: [PATCH] Init commit --- .dockerignore | 0 Dockerfile | 96 +++++++++++++++++ README.md | 205 +++++++++++++++++++++++++++++++++++- configs/core-site.xml | 9 ++ configs/hadoop-env.sh | 14 +++ configs/hdfs-site.xml | 21 ++++ configs/mapred-site.xml | 25 +++++ configs/ssh_config | 14 +++ configs/sshd_config | 122 +++++++++++++++++++++ configs/workers | 2 + configs/yarn-site.xml | 21 ++++ docker-compose.yml | 62 +++++++++++ resources/sources.list | 13 +++ scripts/entry.sh | 22 ++++ scripts/ssh_key_exchange.sh | 75 +++++++++++++ scripts/start-hadoop.sh | 11 ++ scripts/stop-hadoop.sh | 3 + 17 files changed, 713 insertions(+), 2 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 configs/core-site.xml create mode 100644 configs/hadoop-env.sh create mode 100644 configs/hdfs-site.xml create mode 100644 configs/mapred-site.xml create mode 100644 configs/ssh_config create mode 100644 configs/sshd_config create mode 100644 configs/workers create mode 100644 configs/yarn-site.xml create mode 100644 docker-compose.yml create mode 100644 resources/sources.list create mode 100644 scripts/entry.sh create mode 100644 scripts/ssh_key_exchange.sh create mode 100644 scripts/start-hadoop.sh create mode 100644 scripts/stop-hadoop.sh diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..e69de29 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b32f1e6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,96 @@ +# 采用bitnami/spark镜像,此镜像基于精简Debian 11系统 +# 基于Spark 3.5.0版本 +# 适配Hadoop 3.3+ +FROM bitnami/spark:3.5.0 + +LABEL maintainer="somebottle " +LABEL description="Docker image with Spark 3.5.0 and Hadoop 3.3.6, based on bitnami/spark image. For my graduation project." 
+
+# Environment variables
+# Hostnames of all nodes, used for the SSH configuration
+ENV SH_HOSTS="shmain shworker1 shworker2"
+# Hadoop version
+ENV HADOOP_VER="3.3.6"
+# Hadoop installation directory
+ENV HADOOP_HOME="/opt/hadoop"
+# Hadoop configuration directory
+ENV HADOOP_CONF_DIR="/opt/hadoop/etc/hadoop"
+# Hadoop log directory
+ENV HADOOP_LOG_DIR="/var/log/hadoop"
+# Add the Hadoop directories to PATH
+ENV PATH="$HADOOP_HOME/sbin:$HADOOP_HOME/bin:$PATH"
+
+# Everything below runs as root
+USER root
+
+# Generate a temporary SSH password, used to exchange SSH keys on first startup
+RUN echo $(openssl rand -base64 32) > /root/temp.pass
+# Set the root password to the temporary password
+RUN echo -e "$(cat /root/temp.pass)\n$(cat /root/temp.pass)" | passwd root
+# Write the hostnames to a file
+RUN echo "$SH_HOSTS" > /root/exchange_hosts
+
+
+# Create the .ssh directory if it does not exist
+RUN [ -d /root/.ssh ] || mkdir -p /root/.ssh
+# Create the flag directory used to mark finished key exchanges
+RUN mkdir -p /root/.ssh/exchange_flags
+
+# Switch apt to a mirror
+COPY resources/sources.list /tmp/sources.list
+RUN mv /tmp/sources.list /etc/apt/sources.list
+
+# Update apt indexes and install openssh-server, wget, vim and sshpass
+RUN apt-get update && apt-get install -y openssh-server wget vim sshpass
+
+# Switch to the installation directory /opt
+WORKDIR /opt
+# Download Hadoop from the Tsinghua mirror and extract it to /opt/hadoop
+RUN wget https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VER}/hadoop-${HADOOP_VER}.tar.gz \
+    && tar -zxf hadoop-${HADOOP_VER}.tar.gz \
+    && mv hadoop-${HADOOP_VER} hadoop \
+    && rm -f hadoop-${HADOOP_VER}.tar.gz
+
+# Temporary configuration directory
+RUN mkdir /tmp/tmp_configs
+
+# Copy the configuration files
+COPY configs/* /tmp/tmp_configs/
+
+# Move the configuration files to their destinations
+RUN mv /tmp/tmp_configs/core-site.xml ${HADOOP_CONF_DIR}/core-site.xml \
+    && mv /tmp/tmp_configs/hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml \
+    && mv /tmp/tmp_configs/mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml \
+    && mv /tmp/tmp_configs/yarn-site.xml ${HADOOP_CONF_DIR}/yarn-site.xml \
+    && mv /tmp/tmp_configs/hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh \
+    && mv /tmp/tmp_configs/workers ${HADOOP_CONF_DIR}/workers \
+    && mv /tmp/tmp_configs/ssh_config /root/.ssh/config \
+    && mv /tmp/tmp_configs/sshd_config /etc/ssh/sshd_config \
+    && rm -rf /tmp/tmp_configs
+
+# Tighten permissions under .ssh
+RUN chmod 600 /root/.ssh/config \
+    && chmod 700 /root/.ssh
+
+# Copy the startup scripts
+COPY scripts/* /opt/
+
+# Make the scripts executable
+RUN chmod +x /opt/start-hadoop.sh \
+    && chmod +x /opt/stop-hadoop.sh \
+    && chmod +x /opt/entry.sh \
+    && chmod +x /opt/ssh_key_exchange.sh \
+    && chmod +x $HADOOP_HOME/sbin/start-dfs.sh \
+    && chmod +x $HADOOP_HOME/sbin/start-yarn.sh \
+    && chmod +x $HADOOP_HOME/sbin/stop-dfs.sh \
+    && chmod +x $HADOOP_HOME/sbin/stop-yarn.sh
+
+# Create the HDFS directories
+RUN mkdir -p /root/hdfs/name \
+    && mkdir -p /root/hdfs/data
+
+# Format the NameNode so the image ships with an initialized HDFS
+RUN hdfs namenode -format
+
+# Script executed when the container starts
+ENTRYPOINT [ "/opt/entry.sh" ]
\ No newline at end of file
diff --git a/README.md b/README.md
index 3c20ec0..c89a7f4 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,203 @@
-# haspark
-Hadoop + Spark 伪分布式容器化部署
+# Hadoop + Spark Pseudo-Distributed Containerized Deployment
+
+This image is based on `bitnami/spark:3.5.0` (Debian 11); everything runs as `root`.
+
+It targets local cluster testing, i.e. a **pseudo-distributed** setup.
+
+* Once the containers are brought up with docker compose, the nodes **automatically exchange SSH public keys so they can SSH into each other without passwords**.
+* The image has been tested on **WSL**.
+* [Docker hub](https://hub.docker.com/r/somebottle/haspark)
+
+## Versions
+
+* Hadoop `3.3.6`
+* Spark `3.5.0`
+
+## Node layout
+
+1 master + 2 workers.
+
+> To change this, you need to [edit several files](#changing-the-node-count) and rebuild the image.
+
+## Extra environment variables
+
+On top of `bitnami/spark`, the image adds the following environment variable:
+
+| Name | Description | Default |
+| --- | --- | --- |
+| HADOOP_MODE | Hadoop mode; if set to `master`, this container runs the commands that start the Hadoop cluster | (empty) |
+
+## Deployment
+
+### 1. Pull the image
+
+```bash
+docker pull somebottle/haspark[:tag]
+```
+
+### 2. Write the Docker Compose configuration
+
+On **first startup**, a few Docker volumes are created, and the NameNode data that was already formatted inside the image is copied into them.
+
+These volumes stay mapped to the HDFS `NameNode` and `DataNode` directories, so HDFS data persists across container lifecycles (unless you remove the volumes).
+
+> Docker Compose volume documentation:
+> https://docs.docker.com/storage/volumes/#use-a-volume-with-docker-compose
+
+Create a `docker-compose.yml` in a new directory.
+
+An example configuration with the 1 master + 2 workers layout follows.
+
+<details>
+<summary>Click to expand</summary>
+
+```yaml
+version: '3'
+
+services:
+  haspark-main:
+    image: somebottle/haspark:3.0.1
+    hostname: shmain
+    environment:
+      - SPARK_MODE=master
+      - SPARK_RPC_AUTHENTICATION_ENABLED=no
+      - SPARK_RPC_ENCRYPTION_ENABLED=no
+      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
+      - SPARK_SSL_ENABLED=no
+      - HADOOP_MODE=master # Start the Hadoop cluster in this (master) container
+    volumes:
+      - haspark-hdfs-name-data:/root/hdfs/name:copy # Named volume for /root/hdfs/name; on creation it is populated with the NameNode data formatted inside the image
+      - ~/docker/spark/share:/opt/share # All three containers mount the same shared directory
+    ports:
+      - '8080:8080'
+      - '4040:4040'
+      - '8088:8088'
+      - '8042:8042'
+      - '9870:9870'
+      - '19888:19888'
+  haspark-worker-1:
+    image: somebottle/haspark:3.0.1
+    hostname: shworker1
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://shmain:7077
+      - SPARK_WORKER_MEMORY=1G
+      - SPARK_WORKER_CORES=1
+      - SPARK_RPC_AUTHENTICATION_ENABLED=no
+      - SPARK_RPC_ENCRYPTION_ENABLED=no
+      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
+      - SPARK_SSL_ENABLED=no
+    volumes:
+      - ~/docker/spark/share:/opt/share
+      - haspark-hdfs-worker1-data:/root/hdfs/data # DataNode data
+    ports:
+      - '8081:8081'
+  haspark-worker-2:
+    image: somebottle/haspark:3.0.1
+    hostname: shworker2
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://shmain:7077
+      - SPARK_WORKER_MEMORY=1G
+      - SPARK_WORKER_CORES=1
+      - SPARK_RPC_AUTHENTICATION_ENABLED=no
+      - SPARK_RPC_ENCRYPTION_ENABLED=no
+      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
+      - SPARK_SSL_ENABLED=no
+    volumes:
+      - ~/docker/spark/share:/opt/share
+      - haspark-hdfs-worker2-data:/root/hdfs/data # DataNode data
+    ports:
+      - '8082:8081'
+
+volumes:
+  haspark-hdfs-name-data:
+  haspark-hdfs-worker1-data:
+  haspark-hdfs-worker2-data:
+```
+
+</details>
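+
+To confirm the persistence wiring after the first `docker compose up`, you can inspect the named volumes from the host (a quick sketch; the actual volume names carry your compose project prefix, written here as `<project>`):
+
+```bash
+# List the named volumes declared above
+docker volume ls | grep hdfs
+# Show where the NameNode volume lives on the host
+docker volume inspect <project>_haspark-hdfs-name-data
+```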
+
+**Of course, you can also just use the `docker-compose.yml` from this repository.**
+
+### 3. Bring the containers up
+
+Run in the directory containing `docker-compose.yml`:
+
+```bash
+docker compose up -d
+```
+
+### 4. Take the containers down
+
+Run in the directory containing `docker-compose.yml`.
+
+Take the containers down but keep the HDFS data:
+
+```bash
+docker compose down
+```
+
+If you also want to wipe the HDFS data
+
+(this removes all of the related Docker volumes):
+
+```bash
+docker compose down -v # v stands for volumes
+```
+
+### 5. Starting and stopping Hadoop
+
+Normally, once the containers are up and passwordless SSH has been set up, the Hadoop cluster startup script **runs automatically**. If it did not, you can run it manually:
+
+```bash
+/opt/start-hadoop.sh
+```
+
+Script to stop the Hadoop cluster:
+
+```bash
+/opt/stop-hadoop.sh
+```
+
+## Rebuilding the image
+
+### Changing the node count
+
+The default node hostnames are:
+
+- `shmain` (master)
+- `shworker1` (worker1)
+- `shworker2` (worker2)
+
+If you want to change the hostnames or add worker nodes:
+
+1. Adjust `hostname`, `SPARK_MASTER_URL`, the volume mounts, etc. in `docker-compose.yml`.
+2. Adjust the `SH_HOSTS` environment variable at the top of the `Dockerfile`.
+3. Adjust the Hadoop configuration, mainly `core-site.xml` and the `workers` file; `yarn-site.xml` may need changes as well.
+4. Adjust the `ssh_config` configuration file.
+5. Rebuild the image.
+
+   ```bash
+   docker build -t somebottle/haspark[:tag] . --network host
+   ```
+
+   > `--network host` is very useful on WSL: the build shares the host's network, otherwise the build may have no network access inside the container.
+
+### Changing the directories
+
+If you want to run the containers as a non-root user, fairly extensive changes are required.
+
+Files you will probably need to touch:
+
+1. `docker-compose.yml`
+2. `Dockerfile`
+3. Hadoop configuration: `hdfs-site.xml`
+4. The `ssh_key_exchange.sh` script
+5. The `start-hadoop.sh` script
+
+Then rebuild the image.
+
+## Thanks
+
+* [使用 Docker 快速部署 Spark + Hadoop 大数据集群 - s1mple的文章 - 知乎](https://zhuanlan.zhihu.com/p/421375012)
diff --git a/configs/core-site.xml b/configs/core-site.xml
new file mode 100644
index 0000000..f9cd04b
--- /dev/null
+++ b/configs/core-site.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+    <property>
+        <name>fs.defaultFS</name>
+        <value>hdfs://shmain:9000</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/configs/hadoop-env.sh b/configs/hadoop-env.sh
new file mode 100644
index 0000000..410bd14
--- /dev/null
+++ b/configs/hadoop-env.sh
@@ -0,0 +1,14 @@
+export JAVA_HOME=/opt/bitnami/java # Use the JDK bundled with the bitnami image
+export HADOOP_HOME=/opt/hadoop # Installation directory
+export HADOOP_MAPRED_HOME=/opt/hadoop
+export HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop
+export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
+# Hadoop launch options needed on JDK 17
+export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib --add-opens java.base/java.lang=ALL-UNNAMED"
+
+# Run the daemons as root
+export HDFS_NAMENODE_USER="root"
+export HDFS_DATANODE_USER="root"
+export HDFS_SECONDARYNAMENODE_USER="root"
+export YARN_RESOURCEMANAGER_USER="root"
+export YARN_NODEMANAGER_USER="root"
\ No newline at end of file
diff --git a/configs/hdfs-site.xml b/configs/hdfs-site.xml
new file mode 100644
index 0000000..17e9cf0
--- /dev/null
+++ b/configs/hdfs-site.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+    <property>
+        <name>dfs.namenode.name.dir</name>
+        <value>file:///root/hdfs/name</value>
+        <description>NameNode directory for namespace and transaction logs storage.</description>
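+        <!-- file:///root/hdfs/name is backed by the haspark-hdfs-name-data volume declared in docker-compose.yml -->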
+    </property>
+    <property>
+        <name>dfs.datanode.data.dir</name>
+        <value>file:///root/hdfs/data</value>
+        <description>DataNode directory</description>
+    </property>
+    <property>
+        <name>dfs.replication</name>
+        <value>2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/configs/mapred-site.xml b/configs/mapred-site.xml
new file mode 100644
index 0000000..86d7039
--- /dev/null
+++ b/configs/mapred-site.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+    <property>
+        <name>mapreduce.framework.name</name>
+        <value>yarn</value>
+    </property>
+    <property>
+        <name>yarn.app.mapreduce.am.env</name>
+        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
+    </property>
+    <property>
+        <name>mapreduce.map.env</name>
+        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
+    </property>
+    <property>
+        <name>mapreduce.reduce.env</name>
+        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
+    </property>
+    <property>
+        <name>mapreduce.application.classpath</name>
+        <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/configs/ssh_config b/configs/ssh_config
new file mode 100644
index 0000000..3e86204
--- /dev/null
+++ b/configs/ssh_config
@@ -0,0 +1,14 @@
+Host localhost
+    StrictHostKeyChecking no
+
+Host 0.0.0.0
+    StrictHostKeyChecking no
+
+Host shmain
+    StrictHostKeyChecking no
+
+Host shworker1
+    StrictHostKeyChecking no
+
+Host shworker2
+    StrictHostKeyChecking no
\ No newline at end of file
diff --git a/configs/sshd_config b/configs/sshd_config
new file mode 100644
index 0000000..0d1bcea
--- /dev/null
+++ b/configs/sshd_config
@@ -0,0 +1,122 @@
+
+# This is the sshd server system-wide configuration file. See
+# sshd_config(5) for more information.
+
+# This sshd was compiled with PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games
+
+# The strategy used for options in the default sshd_config shipped with
+# OpenSSH is to specify options with their default value where
+# possible, but leave them commented. Uncommented options override the
+# default value.
+
+Include /etc/ssh/sshd_config.d/*.conf
+
+#Port 22
+#AddressFamily any
+#ListenAddress 0.0.0.0
+#ListenAddress ::
+
+#HostKey /etc/ssh/ssh_host_rsa_key
+#HostKey /etc/ssh/ssh_host_ecdsa_key
+#HostKey /etc/ssh/ssh_host_ed25519_key
+
+# Ciphers and keying
+#RekeyLimit default none
+
+# Logging
+#SyslogFacility AUTH
+#LogLevel INFO
+
+# Authentication:
+
+#LoginGraceTime 2m
+PermitRootLogin yes
+#StrictModes yes
+#MaxAuthTries 6
+#MaxSessions 10
+
+PubkeyAuthentication yes
+
+# Expect .ssh/authorized_keys2 to be disregarded by default in future.
+#AuthorizedKeysFile .ssh/authorized_keys .ssh/authorized_keys2
+
+#AuthorizedPrincipalsFile none
+
+#AuthorizedKeysCommand none
+#AuthorizedKeysCommandUser nobody
+
+# For this to work you will also need host keys in /etc/ssh/ssh_known_hosts
+#HostbasedAuthentication no
+# Change to yes if you don't trust ~/.ssh/known_hosts for
+# HostbasedAuthentication
+#IgnoreUserKnownHosts no
+# Don't read the user's ~/.rhosts and ~/.shosts files
+#IgnoreRhosts yes
+
+# To disable tunneled clear text passwords, change to no here!
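+# Password login is only needed for the first boot: ssh_key_exchange.sh switches this to "no" and restarts sshd once all public keys have been exchanged.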
+PasswordAuthentication yes
+#PermitEmptyPasswords no
+
+# Change to yes to enable challenge-response passwords (beware issues with
+# some PAM modules and threads)
+KbdInteractiveAuthentication no
+
+# Kerberos options
+#KerberosAuthentication no
+#KerberosOrLocalPasswd yes
+#KerberosTicketCleanup yes
+#KerberosGetAFSToken no
+
+# GSSAPI options
+#GSSAPIAuthentication no
+#GSSAPICleanupCredentials yes
+#GSSAPIStrictAcceptorCheck yes
+#GSSAPIKeyExchange no
+
+# Set this to 'yes' to enable PAM authentication, account processing,
+# and session processing. If this is enabled, PAM authentication will
+# be allowed through the KbdInteractiveAuthentication and
+# PasswordAuthentication. Depending on your PAM configuration,
+# PAM authentication via KbdInteractiveAuthentication may bypass
+# the setting of "PermitRootLogin without-password".
+# If you just want the PAM account and session checks to run without
+# PAM authentication, then enable this but set PasswordAuthentication
+# and KbdInteractiveAuthentication to 'no'.
+UsePAM yes
+
+#AllowAgentForwarding yes
+#AllowTcpForwarding yes
+#GatewayPorts no
+X11Forwarding yes
+#X11DisplayOffset 10
+#X11UseLocalhost yes
+#PermitTTY yes
+PrintMotd no
+#PrintLastLog yes
+#TCPKeepAlive yes
+#PermitUserEnvironment no
+#Compression delayed
+#ClientAliveInterval 0
+#ClientAliveCountMax 3
+#UseDNS no
+#PidFile /run/sshd.pid
+#MaxStartups 10:30:100
+#PermitTunnel no
+#ChrootDirectory none
+#VersionAddendum none
+
+# no default banner path
+#Banner none
+
+# Allow client to pass locale environment variables
+AcceptEnv LANG LC_*
+
+# override default of no subsystems
+Subsystem sftp /usr/lib/openssh/sftp-server
+
+# Example of overriding settings on a per-user basis
+#Match User anoncvs
+# X11Forwarding no
+# AllowTcpForwarding no
+# PermitTTY no
+# ForceCommand cvs server
diff --git a/configs/workers b/configs/workers
new file mode 100644
index 0000000..d19adc1
--- /dev/null
+++ b/configs/workers
@@ -0,0 +1,2 @@
+shworker1
+shworker2
\ No newline at end of file
diff --git a/configs/yarn-site.xml b/configs/yarn-site.xml
new file mode 100644
index 0000000..c0e26b7
--- /dev/null
+++ b/configs/yarn-site.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<configuration>
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
+        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>shmain</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.env-whitelist</name>
+        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..29ab027
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,62 @@
+version: '3'
+
+services:
+  haspark-main:
+    image: somebottle/haspark:3.0.1
+    hostname: shmain
+    environment:
+      - SPARK_MODE=master
+      - SPARK_RPC_AUTHENTICATION_ENABLED=no
+      - SPARK_RPC_ENCRYPTION_ENABLED=no
+      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
+      - SPARK_SSL_ENABLED=no
+      - HADOOP_MODE=master # Start the Hadoop cluster in this (master) container
+    volumes:
+      - haspark-hdfs-name-data:/root/hdfs/name:copy # Named volume for /root/hdfs/name; on creation it is populated with the NameNode data formatted inside the image
+      - ~/docker/spark/share:/opt/share # All three containers mount the same shared directory
+    ports:
+      - '8080:8080'
+      - '4040:4040'
+      - '8088:8088'
+      - '8042:8042'
+      - '9870:9870'
+      - '19888:19888'
+  haspark-worker-1:
+    image: somebottle/haspark:3.0.1
+    hostname: shworker1
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://shmain:7077
+      - SPARK_WORKER_MEMORY=1G
+      - SPARK_WORKER_CORES=1
+      - SPARK_RPC_AUTHENTICATION_ENABLED=no
+      - SPARK_RPC_ENCRYPTION_ENABLED=no
+      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
+      - SPARK_SSL_ENABLED=no
+    volumes:
+      - ~/docker/spark/share:/opt/share
+      - haspark-hdfs-worker1-data:/root/hdfs/data # DataNode data
+    ports:
+      - '8081:8081'
+  haspark-worker-2:
+    image: somebottle/haspark:3.0.1
+    hostname: shworker2
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://shmain:7077
+      - SPARK_WORKER_MEMORY=1G
+      - SPARK_WORKER_CORES=1
+      - SPARK_RPC_AUTHENTICATION_ENABLED=no
+      - SPARK_RPC_ENCRYPTION_ENABLED=no
+      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
+      - SPARK_SSL_ENABLED=no
+    volumes:
+      - ~/docker/spark/share:/opt/share
+      - haspark-hdfs-worker2-data:/root/hdfs/data # DataNode data
+    ports:
+      - '8082:8081'
+
+volumes:
+  haspark-hdfs-name-data:
+  haspark-hdfs-worker1-data:
+  haspark-hdfs-worker2-data:
\ No newline at end of file
diff --git a/resources/sources.list b/resources/sources.list
new file mode 100644
index 0000000..d827972
--- /dev/null
+++ b/resources/sources.list
@@ -0,0 +1,13 @@
+# Replace the apt sources, see https://mirrors.tuna.tsinghua.edu.cn/help/debian/
+
+deb https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm main contrib non-free non-free-firmware
+# deb-src https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm main contrib non-free non-free-firmware
+
+deb https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm-updates main contrib non-free non-free-firmware
+# deb-src https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm-updates main contrib non-free non-free-firmware
+
+deb https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm-backports main contrib non-free non-free-firmware
+# deb-src https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm-backports main contrib non-free non-free-firmware
+
+deb https://mirrors.tuna.tsinghua.edu.cn/debian-security bookworm-security main contrib non-free non-free-firmware
+# deb-src https://mirrors.tuna.tsinghua.edu.cn/debian-security bookworm-security main contrib non-free non-free-firmware
\ No newline at end of file
diff --git a/scripts/entry.sh b/scripts/entry.sh
new file mode 100644
index 0000000..49843a5
--- /dev/null
+++ b/scripts/entry.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Script executed when the container starts
+
+# Fix the home directory; the bitnami image leaves it pointing at the filesystem root
+export HOME="$(eval echo ~$(whoami))"
+
+# Start SSH
+/etc/init.d/ssh start
+# Run the SSH key exchange script in the background to set up passwordless login
+nohup /opt/ssh_key_exchange.sh > exchange.log 2>&1 &
+
+# If HADOOP_MODE is "master", start the Hadoop cluster
+if [ "$HADOOP_MODE" = "master" ]; then
+    # Launch Hadoop from the master container
+    nohup /opt/start-hadoop.sh > hadoop_launch.log 2>&1 &
+else
+    echo "Hadoop will not automatically start in this container. Set HADOOP_MODE to 'master' to start."
+fi
+
+# Hand over to bitnami's entrypoint script
+
+source /opt/bitnami/scripts/spark/entrypoint.sh /opt/bitnami/scripts/spark/run.sh
\ No newline at end of file
diff --git a/scripts/ssh_key_exchange.sh b/scripts/ssh_key_exchange.sh
new file mode 100644
index 0000000..8e5b496
--- /dev/null
+++ b/scripts/ssh_key_exchange.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+HOSTS_FILE="/root/exchange_hosts"
+FLAG_DIR="/root/.ssh/exchange_flags"
+TEMP_PASS_FILE="/root/temp.pass"
+# Number of retries for public key distribution
+# Distribution can fail when the SSH service in another container is not fully up yet
+MAX_RETRY=5
+
+# If the temporary password file no longer exists, the keys have already been exchanged
+if [ ! -e $TEMP_PASS_FILE ]; then
+    echo "SSH KEY has been exchanged before, exit."
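+    # temp.pass is deleted at the very end of a successful exchange (see the bottom of this script), so its absence marks the exchange as already done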
+    exit 0
+fi
+
+# Generate an RSA key pair first
+ssh-keygen -t rsa -f /root/.ssh/id_rsa -N ''
+
+retryCnt=0
+# Copy this host's public key to the other containers
+# sshpass combined with ssh-copy-id distributes the key non-interactively
+for i in $(cat $HOSTS_FILE); do
+    retryCnt=0
+    if [ $i != "$(hostname)" ]; then
+        while [ $retryCnt -lt $MAX_RETRY ]; do
+            # Distribute the public key,
+            # then drop a flag file on the remote container to mark that this host's key has been delivered.
+            # StrictHostKeyChecking must be disabled in .ssh/config, otherwise the first connection
+            # shows a host-key prompt that sshpass cannot answer.
+            sshpass -p $(cat $TEMP_PASS_FILE) ssh-copy-id -i /root/.ssh/id_rsa.pub root@$i && \
+            sshpass -p $(cat $TEMP_PASS_FILE) ssh root@$i "touch $FLAG_DIR/$(hostname)" && \
+            echo "Key sent: $(hostname) -> $i"
+            if [ $? -eq 0 ]; then
+                break
+            else
+                # Retry if distribution failed
+                ((retryCnt++))
+                echo "Failed to send key. Will retry $retryCnt/$MAX_RETRY after 5 seconds..."
+                sleep 5 # Wait 5 seconds between retries
+            fi
+        done
+        if [ $retryCnt -ge $MAX_RETRY ]; then # Distribution ultimately failed
+            echo "Failed to send key to $i !"
+            exit 1
+        fi
+    fi
+done
+
+# Add this host's own public key to authorized_keys, since Hadoop also SSHes into the local host on startup
+cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
+# Mark this host as done
+touch $FLAG_DIR/$(hostname)
+
+# Wait until the whole exchange has converged
+while true; do
+    # After exchanging keys with this host, every container drops a flag file named after its hostname in $FLAG_DIR
+    finished=true
+    for i in $(cat $HOSTS_FILE); do
+        if [ ! -e $FLAG_DIR/$i ]; then # Some hostname is still missing, so the exchange has not converged yet
+            finished=false
+            break
+        fi
+    done
+    # Converged
+    if $finished; then
+        break
+    fi
+    sleep 1
+done
+
+# Remove the temporary password file once distribution is finished
+rm -f $TEMP_PASS_FILE
+
+# Disable password login
+sed -i 's/PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config
+/etc/init.d/ssh restart
\ No newline at end of file
diff --git a/scripts/start-hadoop.sh b/scripts/start-hadoop.sh
new file mode 100644
index 0000000..0555c73
--- /dev/null
+++ b/scripts/start-hadoop.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+TEMP_PASS_FILE="/root/temp.pass"
+
+# While the temporary password file still exists, the SSH key exchange is not finished; wait before starting Hadoop
+while [ -e $TEMP_PASS_FILE ]; do
+    sleep 3
+done
+
+$HADOOP_HOME/sbin/start-dfs.sh
+$HADOOP_HOME/sbin/start-yarn.sh
diff --git a/scripts/stop-hadoop.sh b/scripts/stop-hadoop.sh
new file mode 100644
index 0000000..cf9890e
--- /dev/null
+++ b/scripts/stop-hadoop.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+$HADOOP_HOME/sbin/stop-dfs.sh
+$HADOOP_HOME/sbin/stop-yarn.sh
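Once the containers are up, a minimal way to exercise these scripts from the host is via `docker compose exec` (a sketch, assuming the service names from the compose file above and that the key exchange has already finished):

```bash
# Ask HDFS which DataNodes have registered with the NameNode
docker compose exec haspark-main hdfs dfsadmin -report
# Ask YARN which NodeManagers have registered
docker compose exec haspark-main yarn node -list
# Stop and restart the Hadoop daemons without touching the containers themselves
docker compose exec haspark-main /opt/stop-hadoop.sh
docker compose exec haspark-main /opt/start-hadoop.sh
```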