diff --git a/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java b/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java index a8b76204bd..aff76220e5 100644 --- a/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java +++ b/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java @@ -2429,7 +2429,7 @@ public static void main(String[] args) { Objects.requireNonNull(appSubmitTimeStr, ApplicationConstants.APP_SUBMIT_TIME_ENV + " is null"); - Configuration conf = new Configuration(); + Configuration conf = new TezConfiguration(); AMExtensions amExtensions = getFrameworkService(conf).getAMExtensions(); DAGProtos.ConfigurationProto confProto = amExtensions.loadConfigurationProto(); diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml index 9777d0c0b9..31dae3a28e 100644 --- a/tez-dist/pom.xml +++ b/tez-dist/pom.xml @@ -118,6 +118,38 @@ + + docker + + + + org.codehaus.mojo + exec-maven-plugin + + + build-docker-image + package + + exec + + + /bin/bash + + ${project.basedir}/src/docker/build-docker.sh + -hadoop + ${hadoop.version} + -tez + ${project.version} + -repo + apache + + + + + + + + diff --git a/tez-dist/src/docker/Dockerfile b/tez-dist/src/docker/Dockerfile new file mode 100644 index 0000000000..a0170af795 --- /dev/null +++ b/tez-dist/src/docker/Dockerfile @@ -0,0 +1,85 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Multi-stage build of the Tez AM image:
+#   unarchive - base stage; its ONBUILD triggers copy the Hadoop and Tez tarballs
+#               from the build context into /opt of the stage that derives from it
+#   env       - unpacks both tarballs into /opt/hadoop and /opt/tez
+#   run       - JRE runtime image that receives the two unpacked trees
+ARG BUILD_ENV=unarchive
+
+FROM ubuntu AS unarchive
+ONBUILD COPY hadoop-*.tar.gz /opt
+# Picks up the Tez distribution tarball, e.g. tez-1.0.0-SNAPSHOT.tar.gz
+ONBUILD COPY tez-*.tar.gz /opt
+
+FROM ${BUILD_ENV} AS env
+ARG HADOOP_VERSION
+ARG TEZ_VERSION
+
+# Fail fast with a readable message when a version build-arg is missing; without the
+# check tar fails on a nonexistent "/opt/hadoop-.tar.gz".
+# The Hadoop tarball is unpacked without its top-level directory and without docs,
+# jdiff, sources, test jars and webapps; the Tez tarball is unpacked as-is.
+RUN : "${HADOOP_VERSION:?HADOOP_VERSION build-arg is required}" \
+    && : "${TEZ_VERSION:?TEZ_VERSION build-arg is required}" \
+    && mkdir -p /opt/hadoop \
+    && tar -xz \
+        --exclude="hadoop-$HADOOP_VERSION/share/doc" \
+        --exclude="*/jdiff" \
+        --exclude="*/sources" \
+        --exclude="*tests.jar" \
+        --exclude="*/webapps" \
+        -f "/opt/hadoop-$HADOOP_VERSION.tar.gz" \
+        -C /opt/hadoop --strip-components 1 \
+    && mkdir -p /opt/tez \
+    && tar -xz \
+        -f "/opt/tez-$TEZ_VERSION.tar.gz" \
+        -C /opt/tez \
+    && rm -rf "/opt/hadoop-$HADOOP_VERSION.tar.gz" "/opt/tez-$TEZ_VERSION.tar.gz"
+
+FROM eclipse-temurin:21.0.3_9-jre-ubi9-minimal AS run
+
+ARG UID=1000
+ARG HADOOP_VERSION
+ARG TEZ_VERSION
+
+# Install dependencies (gettext provides envsubst and findutils provides find, both
+# used by entrypoint.sh) and create the unprivileged "tez" user
+RUN set -ex; \
+    microdnf update -y; \
+    microdnf -y install procps gettext findutils; \
+    microdnf clean all; \
+    useradd --no-create-home -s /sbin/nologin -c "" --uid $UID tez
+
+# Set necessary environment variables.
+# HADOOP_CONF_DIR deliberately points at the Tez conf directory.
+ENV HADOOP_HOME=/opt/hadoop \
+    TEZ_HOME=/opt/tez \
+    TEZ_CONF_DIR=/opt/tez/conf \
+    HADOOP_CONF_DIR=/opt/tez/conf
+
+ENV PATH=$TEZ_HOME/bin:$HADOOP_HOME/bin:$PATH
+
+COPY --from=env --chown=tez /opt/hadoop $HADOOP_HOME
+# /opt/tez is the directory the 'env' stage unpacked the Tez tarball into
+COPY --from=env --chown=tez /opt/tez $TEZ_HOME
+
+RUN mkdir -p $TEZ_CONF_DIR && chown tez:tez $TEZ_CONF_DIR
+
+COPY --chown=tez entrypoint.sh /
+COPY --chown=tez conf $TEZ_CONF_DIR
+
+# Create the plugin extension-point directory and make the entrypoint executable
+# (entrypoint.sh is added to the repository with mode 100644) in a single layer
+RUN mkdir -p /opt/tez/plugins \
+    && chown tez:tez /opt/tez/plugins \
+    && chmod 755 /opt/tez/plugins \
+    && chmod +x /entrypoint.sh
+
+USER tez
+WORKDIR $TEZ_HOME
+
+# Expose AM ports via -p flag in docker command
+# EXPOSE 10001 10002 10003 8042
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/tez-dist/src/docker/README.md b/tez-dist/src/docker/README.md
new file mode 100644
index 0000000000..1d8bf1e8dc
--- /dev/null
+++ b/tez-dist/src/docker/README.md
@@ -0,0 +1,30 @@
+# Tez AM Docker
+---
+
+1. Build the Docker image:
+```bash
+mvn clean install -DskipTests -Pdocker,tools
+```
+2. Install and start ZooKeeper (macOS, using Homebrew):
+```bash
+brew install zookeeper
+zkServer start
+```
+
+3. Run the Tez AM container:
+```bash
+docker run \
+    -p 10001:10001 -p 8042:8042 \
+    --name tez-am \
+    apache/tez-am:1.0.0-SNAPSHOT
+```
+
+4. Debug the Tez AM container:
+```bash
+docker run \
+    -p 10001:10001 -p 8042:8042 -p 5005:5005 \
+    -e TEZ_FRAMEWORK_MODE="STANDALONE_ZOOKEEPER" \
+    -e JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005' \
+    --name tez-am \
+    apache/tez-am:1.0.0-SNAPSHOT
+```
diff --git a/tez-dist/src/docker/build-docker.sh b/tez-dist/src/docker/build-docker.sh
new file mode 100755
index 0000000000..fabe94ed77
--- /dev/null
+++ b/tez-dist/src/docker/build-docker.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.
You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Builds the Tez AM Docker image.
+#
+# Steps: resolve the Hadoop and Tez versions (from the command line, else from the
+# project pom.xml), download the Hadoop tarball into a local cache if it is not there
+# yet, take the Tez tarball from the local tez-dist build output, assemble a temporary
+# build context and run `docker build` on it.
+#
+# Environment overrides: DIST_DIR, PROJECT_ROOT, HADOOP_URL.
+
+set -xeou pipefail
+
+HADOOP_VERSION=
+TEZ_VERSION=
+REPO=
+
+# Prints the command-line help to stderr.
+usage() {
+  cat <<EOF >&2
+Usage: $0 [-h] [-hadoop <version>] [-tez <version>] [-repo <repo>]
+Build the Apache Tez AM Docker image
+-h, -help            Display help
+-hadoop <version>    Build image with the specified Hadoop version
+-tez <version>       Build image with the specified Tez version
+-repo <repo>         Docker repository
+EOF
+}
+
+# Exits with a readable error when the option in $1 is not followed by a value.
+# Call as: require_value "$@"
+require_value() {
+  if [ $# -lt 2 ]; then
+    echo "Error: option $1 requires a value" >&2
+    usage
+    exit 1
+  fi
+}
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    -h | -help | --help)
+      usage
+      exit 0
+      ;;
+    -hadoop)
+      require_value "$@"
+      HADOOP_VERSION=$2
+      shift 2
+      ;;
+    -tez)
+      require_value "$@"
+      TEZ_VERSION=$2
+      shift 2
+      ;;
+    -repo)
+      require_value "$@"
+      REPO=$2
+      shift 2
+      ;;
+    *)
+      # Unknown arguments are still ignored, but no longer silently
+      echo "Warning: ignoring unknown argument: $1" >&2
+      shift
+      ;;
+  esac
+done
+
+SCRIPT_DIR=$(
+  cd "$(dirname "$0")"
+  pwd
+)
+
+DIST_DIR=${DIST_DIR:-"$SCRIPT_DIR/../.."}
+PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../.."}
+
+repo=${REPO:-apache}
+WORK_DIR="$(mktemp -d)"
+# The build context holds a copy of the Hadoop tarball; remove it on every exit path,
+# including failures that abort the script through `set -e`.
+trap 'rm -rf "$WORK_DIR"' EXIT
+CACHE_DIR="$SCRIPT_DIR/cache"
+mkdir -p "$CACHE_DIR"
+
+# Defaults Hadoop and Tez versions from pom.xml if not provided
+HADOOP_VERSION=${HADOOP_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=hadoop.version -DforceStdout)}
+TEZ_VERSION=${TEZ_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=project.version -DforceStdout)}
+
+######################
+# HADOOP FETCH LOGIC #
+######################
+HADOOP_FILE_NAME="hadoop-$HADOOP_VERSION.tar.gz"
+HADOOP_URL=${HADOOP_URL:-"https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/$HADOOP_FILE_NAME"}
+if [ ! -f "$CACHE_DIR/$HADOOP_FILE_NAME" ]; then
+  echo "Downloading Hadoop from $HADOOP_URL..."
+  # Download to a .tmp name and rename afterwards so an interrupted download is never
+  # mistaken for a complete cached tarball.
+  if ! curl --fail -L "$HADOOP_URL" -o "$CACHE_DIR/$HADOOP_FILE_NAME.tmp"; then
+    rm -f "$CACHE_DIR/$HADOOP_FILE_NAME.tmp"
+    echo "Fail to download Hadoop, exiting...."
+    exit 1
+  fi
+  mv "$CACHE_DIR/$HADOOP_FILE_NAME.tmp" "$CACHE_DIR/$HADOOP_FILE_NAME"
+fi
+
+#####################################
+# Pick tez tarball from local build #
+#####################################
+TEZ_FILE_NAME="tez-$TEZ_VERSION.tar.gz"
+LOCAL_DIST_PATH="$DIST_DIR/target/$TEZ_FILE_NAME"
+
+if [ -f "$LOCAL_DIST_PATH" ]; then
+  echo "--> Found local Tez build artifact at: $LOCAL_DIST_PATH"
+  cp "$LOCAL_DIST_PATH" "$WORK_DIR/"
+else
+  echo "--> Error: Local Tez artifact not found at $LOCAL_DIST_PATH"
+  echo "--> Please build the project first (e.g., mvn clean install -DskipTests)."
+  exit 1
+fi
+
+# -------------------------------------------------------------------------
+# BUILD CONTEXT PREPARATION
+# -------------------------------------------------------------------------
+cp "$CACHE_DIR/$HADOOP_FILE_NAME" "$WORK_DIR/"
+# The Dockerfile always copies a conf directory; provide an empty one when the
+# script directory has none, but let a genuine copy failure abort the build.
+if [ -d "$SCRIPT_DIR/conf" ]; then
+  cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/"
+else
+  mkdir -p "$WORK_DIR/conf"
+fi
+cp "$SCRIPT_DIR/entrypoint.sh" "$WORK_DIR/"
+cp "$SCRIPT_DIR/Dockerfile" "$WORK_DIR/"
+
+echo "Building Docker image..."
+docker build \
+  "$WORK_DIR" \
+  -f "$WORK_DIR/Dockerfile" \
+  -t "$repo/tez-am:$TEZ_VERSION" \
+  --build-arg "BUILD_ENV=unarchive" \
+  --build-arg "HADOOP_VERSION=$HADOOP_VERSION" \
+  --build-arg "TEZ_VERSION=$TEZ_VERSION"
+
+echo "Docker image $repo/tez-am:$TEZ_VERSION built successfully."
diff --git a/tez-dist/src/docker/conf/log4j2.properties b/tez-dist/src/docker/conf/log4j2.properties
new file mode 100644
index 0000000000..a9f3559f85
--- /dev/null
+++ b/tez-dist/src/docker/conf/log4j2.properties
@@ -0,0 +1,20 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{ISO8601} %5p [%t] %c{2}: %m%n + +rootLogger.level = INFO +rootLogger.appenderRef.console.ref = console diff --git a/tez-dist/src/docker/conf/tez-site.xml b/tez-dist/src/docker/conf/tez-site.xml new file mode 100644 index 0000000000..1da499adc0 --- /dev/null +++ b/tez-dist/src/docker/conf/tez-site.xml @@ -0,0 +1,57 @@ + + + + + + + tez.am.client.am.port-range + 10001-10003 + + + + tez.am.resource.memory.mb + 1024 + + + + tez.framework.mode + STANDALONE_ZOOKEEPER + + + + tez.am.tez-ui.webservice.enable + false + + + + tez.am.zookeeper.quorum + host.docker.internal:2181 + + + + tez.am.log.level + DEBUG + + + + tez.am.mode.session + true + + + + diff --git a/tez-dist/src/docker/entrypoint.sh b/tez-dist/src/docker/entrypoint.sh new file mode 100644 index 0000000000..0ce6272d2c --- /dev/null +++ b/tez-dist/src/docker/entrypoint.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Container entrypoint for the Tez AM image.
+#
+# Supplies defaults for the environment variables listed below, assembles the
+# configuration directory and the classpath, then replaces this shell with the
+# DAGAppMaster JVM. Arguments given to the container are passed on to DAGAppMaster.
+#
+# Optional inputs:
+#   TEZ_CUSTOM_CONF_DIR  directory whose files are symlinked into $TEZ_CONF_DIR
+#   JAVA_OPTS            JVM options (default: -Xmx1024m)
+#   /opt/tez/plugins     jars placed here are put on the classpath ahead of the Tez jars
+
+set -xeou pipefail
+
+################################################
+# 1. Mocking DAGAppMaster#main() env variables #
+################################################
+
+export CONTAINER_ID=${CONTAINER_ID:-"container_1700000000000_0001_01_000001"}
+export USER=${USER:-"tez"}
+export HADOOP_USER_NAME=${HADOOP_USER_NAME:-"tez"}
+
+export NM_HOST=${NM_HOST:-"localhost"}
+export NM_PORT=${NM_PORT:-"12345"}
+export NM_HTTP_PORT=${NM_HTTP_PORT:-"8042"}
+
+export LOCAL_DIRS=${LOCAL_DIRS:-"/tmp"}
+export LOG_DIRS=${LOG_DIRS:-"/opt/tez/logs"}
+# Defaults to the current time in milliseconds
+export APP_SUBMIT_TIME_ENV=${APP_SUBMIT_TIME_ENV:-$(($(date +%s) * 1000))}
+
+export TEZ_AM_EXTERNAL_ID=${TEZ_AM_EXTERNAL_ID:-"tez-session-$(hostname)"}
+
+# Created in the current working directory when absent.
+# NOTE(review): presumably this is the file DAGAppMaster loads its configuration
+# proto from - confirm against loadConfigurationProto().
+if [ ! -f "tez-conf.pb" ]; then
+  touch "tez-conf.pb"
+  echo "--> Created dummy tez-conf.pb"
+fi
+
+mkdir -p "$LOG_DIRS"
+
+##########################
+# CONFIGURATION HANDLING #
+##########################
+
+# Symlink hadoop conf in tez conf dir
+if [ -d "$HADOOP_HOME/etc/hadoop" ]; then
+  echo "--> Linking missing Hadoop configs to $TEZ_CONF_DIR..."
+  for f in "$HADOOP_HOME/etc/hadoop"/*; do
+    # An unmatched glob stays literal; skip it rather than create a dangling "*" symlink
+    [ -e "$f" ] || continue
+    name=$(basename "$f")
+    # Files already present in the Tez conf dir win, so a user can supply a custom
+    # hdfs-site.xml or any other configuration file
+    if [ ! -e "$TEZ_CONF_DIR/$name" ]; then
+      ln -s "$f" "$TEZ_CONF_DIR/$name"
+    fi
+  done
+fi
+
+###########################
+# Custom Config directory #
+###########################
+if [ -n "${TEZ_CUSTOM_CONF_DIR:-}" ] && [ -d "$TEZ_CUSTOM_CONF_DIR" ]; then
+  echo "--> Using custom configuration directory: $TEZ_CUSTOM_CONF_DIR"
+  # -f replaces entries that already exist in the Tez conf dir
+  find "${TEZ_CUSTOM_CONF_DIR}" -type f -exec \
+    ln -sf {} "${TEZ_CONF_DIR}"/ \;
+
+  # Render tez-site.xml from its template, if one exists, with environment variables
+  # substituted
+  if [ -f "$TEZ_CONF_DIR/tez-site.xml.template" ]; then
+    # tez-site.xml may by now be a symlink into the custom directory; remove it first
+    # so the redirect below does not write through the link into the user's file
+    rm -f "$TEZ_CONF_DIR/tez-site.xml"
+    envsubst < "$TEZ_CONF_DIR/tez-site.xml.template" > "$TEZ_CONF_DIR/tez-site.xml"
+  fi
+fi
+
+
+#############
+# CLASSPATH #
+#############
+
+export HADOOP_USER_CLASSPATH_FIRST=true
+# Order is: conf -> plugins -> tez jars -> hadoop jars
+CLASSPATH="${TEZ_CONF_DIR}"
+
+# Custom Plugins
+# This allows mounting a volume at /opt/tez/plugins containing aux jars
+PLUGIN_DIR="/opt/tez/plugins"
+if [ -d "$PLUGIN_DIR" ]; then
+  count=$(find "$PLUGIN_DIR" -maxdepth 1 -name "*.jar" 2>/dev/null | wc -l)
+  if [ "$count" != "0" ]; then
+    echo "--> Found $count plugin jars. Prepending to classpath."
+    CLASSPATH="${CLASSPATH}:${PLUGIN_DIR}/*"
+  fi
+fi
+
+# Tez Jars
+CLASSPATH="${CLASSPATH}:${TEZ_HOME}/*:${TEZ_HOME}/lib/*"
+
+# Hadoop Jars
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*"
+
+#############
+# Execution #
+#############
+# Sanity check only: the jar itself is picked up through the ${TEZ_HOME}/* classpath entry
+TEZ_DAG_JAR=$(find "$TEZ_HOME" -maxdepth 1 -name "tez-dag-*.jar" ! -name "*-tests.jar" | head -n 1)
+
+if [ -z "$TEZ_DAG_JAR" ]; then
+  echo "Error: Could not find tez-dag-*.jar in $TEZ_HOME"
+  exit 1
+fi
+
+echo "--> Starting DAGAppMaster..."
+echo "--> HADOOP_CONF_DIR: $HADOOP_CONF_DIR"
+
+# Check for Log4j2 Configuration
+JAVA_OPTS="${JAVA_OPTS:-"-Xmx1024m"}"
+LOG4J2_FILE="$TEZ_CONF_DIR/log4j2.properties"
+if [ -f "$LOG4J2_FILE" ]; then
+  echo "--> [TEZ-AM] Found Log4j2 configuration: $LOG4J2_FILE"
+  JAVA_OPTS="$JAVA_OPTS -Dlog4j.configurationFile=file:$LOG4J2_FILE"
+fi
+
+JAVA_ADD_OPENS="--add-opens java.base/java.lang=ALL-UNNAMED \
+  --add-opens java.base/java.util=ALL-UNNAMED \
+  --add-opens java.base/java.lang.reflect=ALL-UNNAMED \
+  --add-opens java.base/java.text=ALL-UNNAMED \
+  --add-opens java.base/java.nio=ALL-UNNAMED \
+  --add-opens java.base/sun.nio.ch=ALL-UNNAMED \
+  --add-opens java.base/java.util.concurrent=ALL-UNNAMED"
+
+# JAVA_OPTS and JAVA_ADD_OPENS are left unquoted on purpose: each holds several
+# options that must be split into separate arguments
+exec java $JAVA_OPTS $JAVA_ADD_OPENS \
+  -Duser.name="$HADOOP_USER_NAME" \
+  -Djava.library.path="$HADOOP_HOME/lib/native" \
+  -Dhadoop.home.dir="$HADOOP_HOME" \
+  -Dhadoop.log.dir="$LOG_DIRS" \
+  -Dtez.conf.dir="$TEZ_CONF_DIR" \
+  -cp "$CLASSPATH" \
+  org.apache.tez.dag.app.DAGAppMaster \
+  "$@"