Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-26025][k8s] Speed up docker image build on dev repo. #23019

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 85 additions & 37 deletions bin/docker-image-tool.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,20 @@ if [ -z "${SPARK_HOME}" ]; then
fi
. "${SPARK_HOME}/bin/load-spark-env.sh"

# Root of the temporary docker build contexts used for dev builds. Each image gets its own
# subdirectory below it, and the tree is removed again on exit (dev builds only).
CTX_DIR="$SPARK_HOME/target/tmp/docker"

# Tells whether the script runs from a source checkout rather than a distribution archive.
# Reads: SPARK_HOME.
# Returns: 0 when "$SPARK_HOME/RELEASE" is not a regular file (dev build), 1 when it is.
function is_dev_build {
  if [ -f "$SPARK_HOME/RELEASE" ]; then
    return 1
  fi
  return 0
}

# Deletes the temporary docker build context tree.
# Only dev builds create that tree, so nothing is touched for any other kind of build.
# Reads: CTX_DIR. Calls: is_dev_build.
function cleanup_ctx_dir {
  is_dev_build || return 0
  rm -rf "$CTX_DIR"
}

# Run the build-context cleanup whenever the script exits, whether the build succeeded or not.
trap cleanup_ctx_dir EXIT

function image_ref {
local image="$1"
local add_repo="${2:-1}"
Expand All @@ -53,80 +67,114 @@ function docker_push {
fi
}

# Create a smaller build context for docker in dev builds to make the build faster. Docker
# uploads all of the current directory to the daemon, and it can get pretty big with dev
# builds that contain test log files and other artifacts.
#
# Three build contexts are created under $CTX_DIR, one for each image: base, pyspark, and
# sparkr. For them to have the desired effect, the docker command needs to be executed inside
# the appropriate context directory.
#
# All source paths are relative, so the current directory must be the root of the Spark
# source tree when this is called.
#
# Reads:   CTX_DIR, SPARK_SCALA_VERSION.
# Returns: 0 on success; non-zero as soon as any step fails.
#
# Every step is checked explicitly instead of relying on `set -e`: bash ignores `set -e` for
# anything executed on the left-hand side of `||` (this subshell included), so a call such as
# `create_dev_build_context || error ...` would otherwise carry on after a failed copy.
#
# Note: docker does not support symlinks in the build context.
function create_dev_build_context {
  (
    local DOCKERFILES="resource-managers/kubernetes/docker/src/main/dockerfiles"
    local SPARK_JARS="assembly/target/scala-$SPARK_SCALA_VERSION/jars"
    local EXAMPLE_JARS="examples/target/scala-$SPARK_SCALA_VERSION/jars"
    local i
    local other

    # An empty CTX_DIR would make every path below resolve against the filesystem root.
    if [ -z "$CTX_DIR" ]; then
      echo "CTX_DIR is not set." >&2
      exit 1
    fi

    local BASE_CTX="$CTX_DIR/base"
    mkdir -p "$BASE_CTX/kubernetes" || exit 1
    cp -r "$DOCKERFILES" "$BASE_CTX/kubernetes/dockerfiles" || exit 1

    cp -r "$SPARK_JARS" "$BASE_CTX/jars" || exit 1
    cp -r "resource-managers/kubernetes/integration-tests/tests" \
      "$BASE_CTX/kubernetes/tests" || exit 1

    mkdir "$BASE_CTX/examples" || exit 1
    cp -r "examples/src" "$BASE_CTX/examples/src" || exit 1
    # Copy just needed examples jars instead of everything: anything already present in the
    # main jars directory is skipped.
    mkdir "$BASE_CTX/examples/jars" || exit 1
    for i in "$EXAMPLE_JARS"/*; do
      if [ ! -f "$BASE_CTX/jars/$(basename "$i")" ]; then
        cp "$i" "$BASE_CTX/examples/jars" || exit 1
      fi
    done

    for other in bin sbin data; do
      cp -r "$other" "$BASE_CTX/$other" || exit 1
    done

    local PYSPARK_CTX="$CTX_DIR/pyspark"
    mkdir -p "$PYSPARK_CTX/kubernetes" || exit 1
    cp -r "$DOCKERFILES" "$PYSPARK_CTX/kubernetes/dockerfiles" || exit 1
    mkdir "$PYSPARK_CTX/python" || exit 1
    cp -r "python/lib" "$PYSPARK_CTX/python/lib" || exit 1

    local R_CTX="$CTX_DIR/sparkr"
    mkdir -p "$R_CTX/kubernetes" || exit 1
    cp -r "$DOCKERFILES" "$R_CTX/kubernetes/dockerfiles" || exit 1
    cp -r "R" "$R_CTX/R" || exit 1
  )
}

# Prints the directory to use as the docker build context for an image.
# Arguments: $1 - name of the context subdirectory under $CTX_DIR
# Outputs:   "$CTX_DIR/$1" for dev builds, "$SPARK_HOME" otherwise.
# Reads:     CTX_DIR, SPARK_HOME. Calls: is_dev_build.
function img_ctx_dir {
  local ctx_name="$1"

  if ! is_dev_build; then
    echo "$SPARK_HOME"
    return
  fi

  echo "$CTX_DIR/$ctx_name"
}

function build {
local BUILD_ARGS
local IMG_PATH
local JARS

if [ ! -f "$SPARK_HOME/RELEASE" ]; then
# Set image build arguments accordingly if this is a source repo and not a distribution archive.
#
# Note that this will copy all of the example jars directory into the image, and that will
# contain a lot of duplicated jars with the main Spark directory. In a proper distribution,
# the examples directory is cleaned up before generating the distribution tarball, so this
# issue does not occur.
IMG_PATH=resource-managers/kubernetes/docker/src/main/dockerfiles
JARS=assembly/target/scala-$SPARK_SCALA_VERSION/jars
BUILD_ARGS=(
${BUILD_PARAMS}
--build-arg
img_path=$IMG_PATH
--build-arg
spark_jars=$JARS
--build-arg
example_jars=examples/target/scala-$SPARK_SCALA_VERSION/jars
--build-arg
k8s_tests=resource-managers/kubernetes/integration-tests/tests
)
else
# Not passed as arguments to docker, but used to validate the Spark directory.
IMG_PATH="kubernetes/dockerfiles"
JARS=jars
BUILD_ARGS=(${BUILD_PARAMS})
local SPARK_ROOT="$SPARK_HOME"

if is_dev_build; then
create_dev_build_context || error "Failed to create docker build context."
SPARK_ROOT="$CTX_DIR/base"
fi

# Verify that the Docker image content directory is present
if [ ! -d "$IMG_PATH" ]; then
if [ ! -d "$SPARK_ROOT/kubernetes/dockerfiles" ]; then
error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark."
fi

# Verify that Spark has actually been built/is a runnable distribution
# i.e. the Spark JARs that the Docker files will place into the image are present
local TOTAL_JARS=$(ls $JARS/spark-* | wc -l)
local TOTAL_JARS=$(ls $SPARK_ROOT/jars/spark-* | wc -l)
TOTAL_JARS=$(( $TOTAL_JARS ))
if [ "${TOTAL_JARS}" -eq 0 ]; then
error "Cannot find Spark JARs. This script assumes that Apache Spark has first been built locally or this is a runnable distribution."
fi

local BUILD_ARGS=(${BUILD_PARAMS})
local BINDING_BUILD_ARGS=(
${BUILD_PARAMS}
--build-arg
base_img=$(image_ref spark)
)
local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
local BASEDOCKERFILE=${BASEDOCKERFILE:-"kubernetes/dockerfiles/spark/Dockerfile"}
local PYDOCKERFILE=${PYDOCKERFILE:-false}
local RDOCKERFILE=${RDOCKERFILE:-false}

docker build $NOCACHEARG "${BUILD_ARGS[@]}" \
(cd $(img_ctx_dir base) && docker build $NOCACHEARG "${BUILD_ARGS[@]}" \
-t $(image_ref spark) \
-f "$BASEDOCKERFILE" .
-f "$BASEDOCKERFILE" .)
if [ $? -ne 0 ]; then
error "Failed to build Spark JVM Docker image, please refer to Docker build output for details."
fi

if [ "${PYDOCKERFILE}" != "false" ]; then
docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
(cd $(img_ctx_dir pyspark) && docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
-t $(image_ref spark-py) \
-f "$PYDOCKERFILE" .
-f "$PYDOCKERFILE" .)
if [ $? -ne 0 ]; then
error "Failed to build PySpark Docker image, please refer to Docker build output for details."
fi
fi

if [ "${RDOCKERFILE}" != "false" ]; then
docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
(cd $(img_ctx_dir sparkr) && docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
-t $(image_ref spark-r) \
-f "$RDOCKERFILE" .
-f "$RDOCKERFILE" .)
if [ $? -ne 0 ]; then
error "Failed to build SparkR Docker image, please refer to Docker build output for details."
fi
Expand Down
3 changes: 2 additions & 1 deletion project/SparkBuild.scala
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,8 @@ object KubernetesIntegrationTests {
s"-Dspark.kubernetes.test.unpackSparkDir=$sparkHome"
),
// Force packaging before building images, so that the latest code is tested.
dockerBuild := dockerBuild.dependsOn(packageBin in Compile in assembly).value
dockerBuild := dockerBuild.dependsOn(packageBin in Compile in assembly)
.dependsOn(packageBin in Compile in examples).value
)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,6 @@

FROM openjdk:8-alpine

ARG spark_jars=jars
ARG example_jars=examples/jars
ARG img_path=kubernetes/dockerfiles
ARG k8s_tests=kubernetes/tests

# Before building the docker image, first build and make a Spark distribution following
# the instructions in http://spark.apache.org/docs/latest/building-spark.html.
# If this docker file is being used in the context of building your images from a Spark
Expand All @@ -41,13 +36,12 @@ RUN set -ex && \
echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
chgrp root /etc/passwd && chmod ug+rw /etc/passwd

COPY ${spark_jars} /opt/spark/jars
COPY jars /opt/spark/jars
COPY bin /opt/spark/bin
COPY sbin /opt/spark/sbin
COPY ${img_path}/spark/entrypoint.sh /opt/
COPY ${example_jars} /opt/spark/examples/jars
COPY examples/src /opt/spark/examples/src
COPY ${k8s_tests} /opt/spark/tests
COPY kubernetes/dockerfiles/spark/entrypoint.sh /opt/
COPY examples /opt/spark/examples
COPY kubernetes/tests /opt/spark/tests
COPY data /opt/spark/data

ENV SPARK_HOME /opt/spark
Expand Down