This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Unify centos7 Dockerfiles and remove install scripts (#18115)
leezu committed Apr 22, 2020
1 parent 84400fb commit b85e446
Showing 26 changed files with 342 additions and 688 deletions.
4 changes: 2 additions & 2 deletions cd/mxnet_lib/mxnet_lib_pipeline.groovy
@@ -75,9 +75,9 @@ def get_stash(mxnet_variant) {
// The environment corresponds to the docker files in the 'docker' directory
def get_environment(mxnet_variant) {
if (mxnet_variant.startsWith("cu")) {
return "publish.centos7_gpu_${mxnet_variant}"
return "centos7_gpu_${mxnet_variant}"
}
return "publish.centos7_cpu"
return "centos7_cpu"
}

// Returns the variant appropriate jenkins node test in which
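For reference, the variant-to-environment mapping that this and the following two pipeline hunks switch over can be sketched in Python (a minimal illustration of the Groovy helpers above, not part of the commit; the example variant names are only illustrative):

```python
def get_environment(mxnet_variant):
    """Map an MXNet variant to its CI environment name.

    Mirrors the Groovy helper above: CUDA variants such as "cu102" map to
    "centos7_gpu_<variant>"; everything else falls back to "centos7_cpu".
    """
    if mxnet_variant.startswith("cu"):
        return "centos7_gpu_{}".format(mxnet_variant)
    return "centos7_cpu"


assert get_environment("cu102") == "centos7_gpu_cu102"
assert get_environment("native") == "centos7_cpu"
```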
4 changes: 2 additions & 2 deletions cd/python/docker/Jenkins_pipeline.groovy
@@ -32,9 +32,9 @@ def get_pipeline(mxnet_variant) {
// The environment corresponds to the docker files in the 'docker' directory
def get_environment(mxnet_variant) {
if (mxnet_variant.startsWith("cu")) {
return "publish.centos7_gpu_${mxnet_variant}"
return "centos7_gpu_${mxnet_variant}"
}
return "publish.centos7_cpu"
return "centos7_cpu"
}


6 changes: 3 additions & 3 deletions cd/python/pypi/Jenkins_pipeline.groovy
@@ -36,9 +36,9 @@ def get_pipeline(mxnet_variant) {

def get_environment(mxnet_variant) {
if (mxnet_variant.startsWith('cu')) {
return "publish.centos7_gpu_${mxnet_variant}"
return "centos7_gpu_${mxnet_variant}"
}
return "publish.centos7_cpu"
return "centos7_cpu"
}

def build(mxnet_variant) {
@@ -58,7 +58,7 @@ def test(mxnet_variant) {
// test wheel file
def environment = get_environment(mxnet_variant)
def nvidia_docker = mxnet_variant.startsWith('cu')
ci_utils.docker_run(environment, "cd_integration_test_pypi python3 ${nvidia_docker}", nvidia_docker)
ci_utils.docker_run(environment, "cd_integration_test_pypi ${nvidia_docker}", nvidia_docker)
}
}

6 changes: 4 additions & 2 deletions ci/Jenkinsfile_utils.groovy
@@ -159,9 +159,11 @@ def collect_test_results_windows(original_file_name, new_file_name) {
}


def docker_run(platform, function_name, use_nvidia, shared_mem = '500m', env_vars = "") {
def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
def docker_run(platform, function_name, use_nvidia = false, shared_mem = '500m', env_vars = "",
build_args = "") {
def command = "ci/build.py %ENV_VARS% %BUILD_ARGS% --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
command = command.replaceAll('%ENV_VARS%', env_vars.length() > 0 ? "-e ${env_vars}" : '')
command = command.replaceAll('%BUILD_ARGS%', env_vars.length() > 0 ? "${build_args}" : '')
command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '')
command = command.replaceAll('%PLATFORM%', platform)
command = command.replaceAll('%FUNCTION_NAME%', function_name)
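The docker_run helper above assembles the ci/build.py invocation by substituting %PLACEHOLDER% tokens into a command template. A rough Python analogue of that templating (purely illustrative; the registry value and the runtime function name below are placeholders, not the commit's actual configuration):

```python
def render_build_command(platform, function_name, use_nvidia=False,
                         shared_mem="500m", env_vars="", build_args=""):
    # Illustrative analogue of docker_run's %PLACEHOLDER% substitution.
    template = ("ci/build.py {env_vars} {build_args} --docker-registry <registry> "
                "{nvidia} --platform {platform} --docker-build-retries 3 "
                "--shm-size {shm} /work/runtime_functions.sh {function}")
    rendered = template.format(
        env_vars="-e {}".format(env_vars) if env_vars else "",
        build_args=build_args,
        nvidia="--nvidiadocker" if use_nvidia else "",
        platform=platform,
        shm=shared_mem,
        function=function_name,
    )
    # Collapse the double spaces left behind by empty placeholders.
    return " ".join(rendered.split())


print(render_build_command("centos7_gpu_cu102", "some_runtime_function", use_nvidia=True))
```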
109 changes: 64 additions & 45 deletions ci/build.py
@@ -29,16 +29,23 @@
import glob
import pprint
import re
import os
import shutil
import signal
import subprocess
from itertools import chain
from subprocess import check_call, check_output
from typing import *

import yaml

from safe_docker_run import SafeDockerClient
from util import *

# NOTE: Temporary whitelist used until all Dockerfiles are refactored for docker compose
DOCKER_COMPOSE_WHITELIST = ('centos7_cpu', 'centos7_gpu_cu92', 'centos7_gpu_cu100',
'centos7_gpu_cu101', 'centos7_gpu_cu102')


def get_dockerfiles_path():
return "docker"
@@ -55,6 +62,11 @@ def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:

def get_docker_tag(platform: str, registry: str) -> str:
""":return: docker tag to be used for the container"""
if platform in DOCKER_COMPOSE_WHITELIST:
with open("docker/docker-compose.yml", "r") as f:
compose_config = yaml.load(f.read(), yaml.SafeLoader)
return compose_config["services"][platform]["image"]

platform = platform if any(x in platform for x in ['build.', 'publish.']) else 'build.{}'.format(platform)
if not registry:
registry = "mxnet_local"
@@ -66,72 +78,80 @@ def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
return os.path.join(path, "Dockerfile.{0}".format(platform))


def get_docker_binary(use_nvidia_docker: bool) -> str:
return "nvidia-docker" if use_nvidia_docker else "docker"


def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool,
def build_docker(platform: str, registry: str, num_retries: int, no_cache: bool,
cache_intermediate: bool = False) -> str:
"""
Build a container for the given platform
:param platform: Platform
:param docker_binary: docker binary to use (docker/nvidia-docker)
:param registry: Dockerhub registry name
:param num_retries: Number of retries to build the docker image
:param no_cache: pass no-cache to docker to rebuild the images
:return: Id of the top level image
"""
tag = get_docker_tag(platform=platform, registry=registry)
logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
#
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
#
# These variables are used in the docker files to create user and group with these ids.
# see: docker/install/ubuntu_adduser.sh
#
# cache-from is needed so we use the cached images tagged from the remote via
# docker pull see: docker_cache.load_docker_cache
#
# This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002
# So to use local caching, we should omit the cache-from by using --no-dockerhub-cache argument to this
# script.
#
# This doesn't work with multi head docker files.
#
cmd = [docker_binary, "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if no_cache:
cmd.append("--no-cache")
if cache_intermediate:
cmd.append("--rm=false")
elif registry:
cmd.extend(["--cache-from", tag])
cmd.extend(["-t", tag, get_dockerfiles_path()])

# Case 1: docker-compose
if platform in DOCKER_COMPOSE_WHITELIST:
logging.info('Building docker container tagged \'%s\' based on ci/docker/docker-compose.yml', tag)
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
cmd = ['docker-compose', '-f', 'docker/docker-compose.yml', 'build',
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if cache_intermediate:
cmd.append('--no-rm')
cmd.append(platform)
else: # Case 2: Deprecated way, will be removed
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
#
# These variables are used in the docker files to create user and group with these ids.
# see: docker/install/ubuntu_adduser.sh
#
# cache-from is needed so we use the cached images tagged from the remote via
# docker pull see: docker_cache.load_docker_cache
#
# This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002
# So to use local caching, we should omit the cache-from by using --no-dockerhub-cache argument to this
# script.
#
# This doesn't work with multi head docker files.
logging.info("Building docker container tagged '%s'", tag)
cmd = ["docker", "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if no_cache:
cmd.append("--no-cache")
if cache_intermediate:
cmd.append("--rm=false")
elif registry:
cmd.extend(["--cache-from", tag])
cmd.extend(["-t", tag, get_dockerfiles_path()])


@retry(subprocess.CalledProcessError, tries=num_retries)
def run_cmd():
logging.info("Running command: '%s'", ' '.join(cmd))
check_call(cmd)

run_cmd()

# Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
# check_call would have failed
image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
image_id = _get_local_image_id(docker_tag=tag)
if not image_id:
raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
return image_id


def _get_local_image_id(docker_binary, docker_tag):
def _get_local_image_id(docker_tag):
"""
Get the image id of the local docker layer with the passed tag
:param docker_tag: docker tag
:return: Image id as string or None if tag does not exist
"""
cmd = [docker_binary, "images", "-q", docker_tag]
cmd = ["docker", "images", "-q", docker_tag]
image_id_b = check_output(cmd)
image_id = image_id_b.decode('utf-8').strip()
if not image_id:
@@ -196,7 +216,7 @@ def container_run(docker_client: SafeDockerClient,

# Equivalent command
docker_cmd_list = [
get_docker_binary(nvidia_runtime),
"nvidia-docker" if nvidia_runtime else "docker",
'run',
"--cap-add",
"SYS_PTRACE", # Required by ASAN
@@ -352,7 +372,6 @@ def main() -> int:
args = parser.parse_args()

command = list(chain(*args.command))
docker_binary = get_docker_binary(args.nvidiadocker)
docker_client = SafeDockerClient()

environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
@@ -363,12 +382,12 @@ def main() -> int:
elif args.platform:
platform = args.platform
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if args.docker_registry:
if args.docker_registry and platform not in DOCKER_COMPOSE_WHITELIST:
# Caching logic for Dockerfiles not yet refactored with compose
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
if not args.run_only:
build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
num_retries=args.docker_build_retries, no_cache=args.no_cache,
cache_intermediate=args.cache_intermediate)
build_docker(platform=platform, registry=args.docker_registry, num_retries=args.docker_build_retries,
no_cache=args.no_cache, cache_intermediate=args.cache_intermediate)
else:
logging.info("Skipping docker build step.")

@@ -410,8 +429,8 @@ def main() -> int:
for platform in platforms:
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary=docker_binary, registry=args.docker_registry,
num_retries=args.docker_build_retries, no_cache=args.no_cache)
build_docker(platform, registry=args.docker_registry, num_retries=args.docker_build_retries,
no_cache=args.no_cache)
if args.build_only:
continue
shutil.rmtree(buildir(), ignore_errors=True)
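In build_docker above, run_cmd wraps the subprocess call in a retry decorator imported from util, whose implementation is outside this diff. A minimal sketch of what such a decorator could look like — an assumption about its behavior, not the actual ci/util code:

```python
import functools
import logging
import subprocess
import time


def retry(exception, tries=3, delay=1.0):
    """Retry the wrapped callable when `exception` is raised (illustrative sketch)."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exception:
                    if attempt == tries:
                        raise
                    logging.warning("Attempt %d/%d failed, retrying in %.1fs",
                                    attempt, tries, delay)
                    time.sleep(delay)
        return wrapper
    return decorator


@retry(subprocess.CalledProcessError, tries=3)
def run_cmd(cmd):
    # Raises CalledProcessError on non-zero exit, which triggers a retry.
    subprocess.check_call(cmd)
```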
140 changes: 140 additions & 0 deletions ci/docker/Dockerfile.build.centos7
@@ -0,0 +1,140 @@
# -*- mode: dockerfile -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Dockerfile declaring CentOS 7 related images.
# Via the CentOS 7 Dockerfiles, we ensure MXNet continues to run fine on older systems.
#
# See docker-compose.yml for supported BASE_IMAGE ARGs and targets.

####################################################################################################
# The Dockerfile uses a dynamic BASE_IMAGE (for example centos:7, nvidia/cuda:10.2-devel-centos7, etc.)
# On top of BASE_IMAGE we install all dependencies shared by all MXNet build environments into a
# "base" target. At the end of this file, we specialize "base" for specific use cases.
# The target built by docker can be selected via "--target" option or docker-compose.yml
####################################################################################################
ARG BASE_IMAGE
FROM $BASE_IMAGE AS base

WORKDIR /work/deps

RUN yum -y check-update || true && \
yum -y install epel-release centos-release-scl && \
yum install -y \
# Utilities
wget \
unzip \
patchelf \
pandoc \
# Development tools
git \
make \
ninja-build \
automake \
autoconf \
libtool \
protobuf-compiler \
protobuf-devel \
# CentOS Software Collections https://www.softwarecollections.org
devtoolset-7 \
rh-python35 \
rh-maven35 \
# Libraries
# Provide cblas header files
atlas-devel \
openblas-devel \
lapack-devel \
opencv-devel \
openssl-devel \
zeromq-devel \
# Build-dependencies for ccache 3.7.9
gperf \
libb2-devel \
libzstd-devel && \
yum clean all && \
# Centos 7 only provides ninja-build
ln -s /usr/bin/ninja-build /usr/bin/ninja

# Make GCC7, Python 3.5 and Maven 3.3 Software Collections available by default
# during build and runtime of this container
SHELL [ "/usr/bin/scl", "enable", "devtoolset-7", "rh-python35", "rh-maven35" ]

# Install minimum required cmake version
RUN cd /usr/local/src && \
wget -nv https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh && \
sh cmake-3.13.5-Linux-x86_64.sh --prefix=/usr/local --skip-license && \
rm cmake-3.13.5-Linux-x86_64.sh

# ccache 3.7.9 has fixes for caching nvcc outputs
RUN cd /usr/local/src && \
git clone --recursive https://github.com/ccache/ccache.git && \
cd ccache && \
git checkout v3.7.9 && \
./autogen.sh && \
./configure --disable-man && \
make -j$(nproc) && \
make install && \
cd /usr/local/src && \
rm -rf ccache

# Python dependencies
RUN pip3 install --no-cache-dir --upgrade pip && \
pip3 install --no-cache-dir nose pylint cython numpy nose-timer requests h5py scipy==1.2.3 wheel


ARG USER_ID=0
# Add user in order to make sure the assumed user the container is running under
# actually exists inside the container to avoid problems like missing home dir
RUN if [[ "$USER_ID" -gt 0 ]]; then \
# -no-log-init required due to https://github.com/moby/moby/issues/5419
useradd -m --no-log-init --uid $USER_ID --system jenkins_slave; \
usermod -aG wheel jenkins_slave; \
# By default, docker creates all WORK_DIRs with root owner
mkdir /work/mxnet; \
mkdir /work/build; \
chown -R jenkins_slave /work/; \
fi

ENV PYTHONPATH=./python/
WORKDIR /work/mxnet

COPY runtime_functions.sh /work/

####################################################################################################
# Specialize base image to install more gpu specific dependencies.
# The target built by docker can be selected via "--target" option or docker-compose.yml
####################################################################################################
FROM base as gpu
# Different Cuda versions require different NCCL versions
# https://wiki.bash-hackers.org/syntax/pe#search_and_replace
RUN export SHORT_CUDA_VERSION=${CUDA_VERSION%.*} && \
if [[ ${SHORT_CUDA_VERSION} == 9.2 ]]; then \
export NCCL_VERSION=2.4.8; \
elif [[ ${SHORT_CUDA_VERSION} == 10.* ]]; then \
export NCCL_VERSION=2.6.4; \
else \
echo "ERROR: Cuda ${SHORT_CUDA_VERSION} not yet supported in Dockerfile.build.centos7"; \
exit 1; \
fi && \
curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm -O && \
rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
yum -y check-update || true && \
yum -y install \
libnccl-${NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} \
libnccl-devel-${NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} \
libnccl-static-${NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} && \
yum clean all
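The BASE_IMAGE and target wiring referenced in the header comment lives in ci/docker/docker-compose.yml, which is not expanded in this view. A small Python sketch of inspecting that file — the services/build/args keys are assumptions based on the get_docker_tag lookup added to ci/build.py above, not a confirmed schema:

```python
import yaml

# Path and structure assumed from the lookup added to ci/build.py in this commit.
with open("docker/docker-compose.yml") as f:
    compose = yaml.safe_load(f)

for name, service in compose.get("services", {}).items():
    build = service.get("build") if isinstance(service.get("build"), dict) else {}
    print("{:24s} image={:48s} target={} base={}".format(
        name,
        service.get("image", "?"),
        build.get("target", "?"),
        build.get("args", {}).get("BASE_IMAGE", "?"),
    ))
```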