diff --git a/Dockerfile b/Dockerfile index 28d0eee..58ddc1c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,9 +44,10 @@ ENV HADOOP_LAUNCH_MODE="general" \ GN_NODEMANAGER_WITH_RESOURCEMANAGER="false" \ GN_HDFS_SETUP_ON_STARTUP="false" \ GN_YARN_SETUP_ON_STARTUP="false" \ + GN_ZOOKEEPER_START_ON_STARTUP="false" \ HA_HDFS_NAMESERVICE="hacluster" \ HA_HDFS_SETUP_ON_STARTUP="false" \ - HA_YARN_SETUP_ON_STARTUP="false" + HA_YARN_SETUP_ON_STARTUP="false" # 以Root用户完成 USER root @@ -57,7 +58,9 @@ COPY resources/sources.list /tmp/sources.list # 将路径环境变量写入/etc/profile.d/path_env.sh RUN echo -e "#!/bin/bash\nexport PATH=$PATH\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH" > /etc/profile.d/path_env.sh && \ # 将Hadoop部分环境变量写入/etc/profile.d/hadoop.sh - echo -e "#!/bin/bash\nexport HADOOP_HOME=$HADOOP_HOME\nexport HADOOP_CONF_DIR=$HADOOP_CONF_DIR" >> /etc/profile.d/hadoop.sh && \ + echo -e "#!/bin/bash\nexport HADOOP_HOME=$HADOOP_HOME\nexport HADOOP_CONF_DIR=$HADOOP_CONF_DIR\nexport HADOOP_LOG_DIR=$HADOOP_LOG_DIR\nexport HADOOP_VER=$HADOOP_VER" >> /etc/profile.d/hadoop.sh && \ + # 将Zookeeper部分环境变量写入/etc/profile.d/zookeeper.sh + echo -e "#!/bin/bash\nexport ZOOKEEPER_HOME=$ZOOKEEPER_HOME\nexport ZOOKEEPER_CONF_DIR=$ZOOKEEPER_CONF_DIR\nexport ZOOKEEPER_VER=$ZOOKEEPER_VER\nexport ZOOKEEPER_DATA_DIR=$ZOOKEEPER_DATA_DIR" >> /etc/profile.d/zookeeper.sh && \ # 创建容器启动标识文件 touch $INIT_FLAG_FILE && \ # 先生成一个临时SSH密码,用于首次启动时交换ssh密钥 @@ -99,6 +102,8 @@ RUN wget https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-${HADO tar -zxf hadoop-${HADOOP_VER}.tar.gz && \ mv hadoop-${HADOOP_VER} hadoop && \ rm -f hadoop-${HADOOP_VER}.tar.gz && \ + # 删除hadoop的docs,可以省下很多空间 + rm -rf ${HADOOP_HOME}/share/doc && \ # 移动配置文件到对应目录 mv /tmp/tmp_configs/core-site.xml ${HADOOP_CONF_DIR}/core-site.xml && \ mv /tmp/tmp_configs/hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml && \ @@ -114,6 +119,8 @@ RUN wget https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-${HADO tar -zxf 
apache-zookeeper-${ZOOKEEPER_VER}-bin.tar.gz && \ mv apache-zookeeper-${ZOOKEEPER_VER}-bin zookeeper && \ rm -f apache-zookeeper-${ZOOKEEPER_VER}-bin.tar.gz && \ + # 删除zookeeper的docs + rm -rf ${ZOOKEEPER_HOME}/docs && \ # 拷贝Zookeeper基础配置文件 cp /opt/zookeeper/conf/zoo_sample.cfg /opt/zookeeper/conf/zoo.cfg && \ # 修改Zookeeper数据目录 diff --git a/README.md b/README.md index 3a38792..4fb6c9d 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ docker pull somebottle/haspark | `GN_NODEMANAGER_WITH_RESOURCEMANAGER` | 在ResourceManager所在节点是否启动NodeManager | `"false"` | | `GN_HDFS_SETUP_ON_STARTUP` | 是否在容器启动时自动启动HDFS各个节点的守护进程 | `"false"` | | `GN_YARN_SETUP_ON_STARTUP` | 是否在容器启动时自动启动Yarn各个节点的守护进程 | `"false"` | +| `GN_ZOOKEEPER_START_ON_STARTUP` | 是否在容器启动时自动启动Zookeeper各个节点的守护进程 | `"false"` | ### 3.4. Hadoop高可用(HA)分布式 @@ -83,6 +84,8 @@ docker pull somebottle/haspark 除了 `bitnami/spark` 提供的只读环境变量外,本镜像还提供了: +(可以调用 `source /etc/profile` 来载入这些环境变量到当前 Shell 中) + | 名称 | 说明 | | --- | --- | |`ZOOKEEPER_VER` | Zookeeper版本 | @@ -93,8 +96,8 @@ docker pull somebottle/haspark |`HADOOP_HOME` | Hadoop安装目录 | |`HADOOP_CONF_DIR` | Hadoop配置文件目录 | |`HADOOP_LOG_DIR` | Hadoop日志目录 | - - +|`HDFS_SERVICE_ADDR`| HDFS 服务地址。示例: 普通分布式-> `host:port`; HA 分布式-> `hacluster` | +|`ZOOKEEPER_QUORUM`| Zookeeper集群各节点地址,逗号分隔。示例: `host1:2181,host2:2181,host3:2181` | ## 4. 
提供的脚本 @@ -160,7 +163,7 @@ version: '3' services: haspark-main: - image: somebottle/haspark:3.1.3 + image: somebottle/haspark:3.1.4 hostname: shmain env_file: ./conf.env environment: @@ -178,7 +181,7 @@ services: - '9870:9870' - '19888:19888' haspark-worker-1: - image: somebottle/haspark:3.1.3 + image: somebottle/haspark:3.1.4 hostname: shworker1 env_file: ./conf.env environment: @@ -194,7 +197,7 @@ services: ports: - '8081:8081' haspark-worker-2: - image: somebottle/haspark:3.1.3 + image: somebottle/haspark:3.1.4 hostname: shworker2 env_file: ./conf.env environment: diff --git a/conf.env b/conf.env index 16c98f5..f36b0d6 100644 --- a/conf.env +++ b/conf.env @@ -70,6 +70,10 @@ GN_HDFS_SETUP_ON_STARTUP=true # 容器集群启动时顺带启动Yarn集群 GN_YARN_SETUP_ON_STARTUP=true +# Whether to start Zookeeper on container startup +# 容器集群启动时是否启动Zookeeper集群 +GN_ZOOKEEPER_START_ON_STARTUP=false + # ***********Hadoop High Availability Section - Hadoop高可用分布式配置部分*********** diff --git a/docker-compose.yml b/docker-compose.yml index 46ccb3c..e64c37d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,7 @@ version: '3' services: haspark-main: - image: somebottle/haspark:3.1.3 + image: somebottle/haspark:3.1.4 hostname: shmain env_file: ./conf.env environment: @@ -20,7 +20,7 @@ services: - '9870:9870' - '19888:19888' haspark-worker-1: - image: somebottle/haspark:3.1.3 + image: somebottle/haspark:3.1.4 hostname: shworker1 env_file: ./conf.env environment: @@ -36,7 +36,7 @@ services: ports: - '8081:8081' haspark-worker-2: - image: somebottle/haspark:3.1.3 + image: somebottle/haspark:3.1.4 hostname: shworker2 env_file: ./conf.env environment: diff --git a/scripts/entry.sh b/scripts/entry.sh index 35c6004..6c82b70 100644 --- a/scripts/entry.sh +++ b/scripts/entry.sh @@ -1,6 +1,8 @@ #!/bin/bash # 容器启动时执行的脚本 +. 
/opt/somebottle/haspark/utils.sh # 导入工具函数 + # 指定家目录 # 不指定的话,ssh-copy-id没法正常运作 export HOME="$(eval echo ~$(whoami))" @@ -8,6 +10,9 @@ export HOME="$(eval echo ~$(whoami))" export HDFS_DAEMON_SEQ_FILE=/opt/somebottle/haspark/daemon_sequence/hdfs.seq export YARN_DAEMON_SEQ_FILE=/opt/somebottle/haspark/daemon_sequence/yarn.seq +# Zookeeper Quorum列表 +export ZOOKEEPER_QUORUM=$(join_by "$SH_HOSTS" ',' ':2181') + # 创建容器部署日志目录 mkdir -p /opt/somebottle/haspark/logs # 创建守护进程启动记录目录 @@ -23,7 +28,8 @@ export HOME='$HOME'\n\ export HDFS_DAEMON_SEQ_FILE='$HDFS_DAEMON_SEQ_FILE'\n\ export YARN_DAEMON_SEQ_FILE='$YARN_DAEMON_SEQ_FILE'\n\ export TEMP_PASS_FILE='$TEMP_PASS_FILE'\n\ -export INIT_FLAG_FILE='$INIT_FLAG_FILE'\n" >/etc/profile.d/sh_basics.sh +export INIT_FLAG_FILE='$INIT_FLAG_FILE'\n\ +export ZOOKEEPER_QUORUM='$ZOOKEEPER_QUORUM'\n" >/etc/profile.d/sh_basics.sh # 把JAVA_HOME也输出到/etc/profile echo "export JAVA_HOME=$JAVA_HOME" >/etc/profile.d/java.sh diff --git a/scripts/hadoop-general-setup.sh b/scripts/hadoop-general-setup.sh index 827144d..29f69ba 100644 --- a/scripts/hadoop-general-setup.sh +++ b/scripts/hadoop-general-setup.sh @@ -4,6 +4,11 @@ . /opt/somebottle/haspark/utils.sh # 导入工具函数 +if [[ "$GN_ZOOKEEPER_START_ON_STARTUP" == "true" ]]; then + # 容器启动时,启动Zookeeper守护进程 + $ZOOKEEPER_HOME/bin/zkServer.sh start +fi + if [ -e $INIT_FLAG_FILE ]; then # 仅在容器初次启动时执行 echo "Initializing Hadoop (General)." 
@@ -15,6 +20,8 @@ if [ -e $INIT_FLAG_FILE ]; then remove_ha_conf $HADOOP_CONF_DIR/mapred-site.xml # 修改core-site.xml sed -i "s/%%HDFS_DEF_HOST%%/$GN_NAMENODE_HOST:8020/g" $HADOOP_CONF_DIR/core-site.xml + # 将HDFS服务地址加入持久环境变量 + echo "export HDFS_SERVICE_ADDR='${GN_NAMENODE_HOST}:8020'" >>/etc/profile.d/sh_basics.sh # 修改hdfs-site.xml sed -i "s/%%HDFS_REPLICATION%%/$HADOOP_HDFS_REPLICATION/g" $HADOOP_CONF_DIR/hdfs-site.xml # 修改mapred-site.xml diff --git a/scripts/hadoop-ha-setup.sh b/scripts/hadoop-ha-setup.sh index 0a4e2ea..b7e0b8e 100644 --- a/scripts/hadoop-ha-setup.sh +++ b/scripts/hadoop-ha-setup.sh @@ -16,9 +16,6 @@ $ZOOKEEPER_HOME/bin/zkServer.sh start # 协调: 等待所有结点的Zookeeper守护进程启动 wait_for_java_process_on_specified_nodes QuorumPeerMain "$SH_HOSTS" -# Zookeeper Quorum列表 -zookeeper_nodes=$(join_by "$SH_HOSTS" ',' ':2181') - # **************************************************** 如果需要HDFS高可用 if [[ "$HA_HDFS_SETUP_ON_STARTUP" == "true" ]]; then @@ -33,9 +30,11 @@ if [[ "$HA_HDFS_SETUP_ON_STARTUP" == "true" ]]; then # ***********修改core-site.xml*********** # HDFS的NameNode的NameService名 sed -i "s/%%HDFS_DEF_HOST%%/$HA_HDFS_NAMESERVICE/g" $HADOOP_CONF_DIR/core-site.xml + # 将HDFS服务地址加入持久环境变量 + echo "export HDFS_SERVICE_ADDR='$HA_HDFS_NAMESERVICE'" >>/etc/profile.d/sh_basics.sh # 修改hdfs-site.xml sed -i "s/%%HDFS_NAMESERVICE%%/$HA_HDFS_NAMESERVICE/g" $HADOOP_CONF_DIR/hdfs-site.xml - sed -i "s/%%ZK_ADDRS%%/$zookeeper_nodes/g" $HADOOP_CONF_DIR/core-site.xml + sed -i "s/%%ZK_ADDRS%%/$ZOOKEEPER_QUORUM/g" $HADOOP_CONF_DIR/core-site.xml # ***********修改hdfs-site.xml*********** # HDFS副本数 @@ -201,7 +200,7 @@ if [[ "$HA_YARN_SETUP_ON_STARTUP" == "true" ]]; then # 处理完成后把HA_REPEAT_XXX_START/END部分用生成的配置替换 replace_repeat_conf 'RESOURCEMANAGER' "$generated_rm_conf" $HADOOP_CONF_DIR/yarn-site.xml # Zookeeper节点地址 - sed -i "s/%%ZK_ADDRS%%/$zookeeper_nodes/g" $HADOOP_CONF_DIR/yarn-site.xml + sed -i "s/%%ZK_ADDRS%%/$ZOOKEEPER_QUORUM/g" $HADOOP_CONF_DIR/yarn-site.xml fi # ################# 
容器每次启动都执行的部分 SECTION-START #################