Skip to content

Commit

Permalink
Separate refactoring from apache#12276 in a prior PR (apache#12296)
Browse files Browse the repository at this point in the history
* Separate minor refactoring from apache#12276 in a prior PR

* Address CR comments
  • Loading branch information
larroy authored and anirudh2290 committed Sep 19, 2018
1 parent 9de5c63 commit f44fc79
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 78 deletions.
164 changes: 88 additions & 76 deletions ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,33 @@
"""

__author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy'
__version__ = '0.1'
__version__ = '0.2'

import argparse
import glob
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile
import platform
from copy import deepcopy
from itertools import chain
from subprocess import call, check_call
from subprocess import call, check_call, check_output
from typing import *
from util import *
import pprint
import requests


CCACHE_MAXSIZE = '500G'

def under_ci() -> bool:
""":return: True if we run in Jenkins."""
return 'JOB_NAME' in os.environ

def get_platforms(path: Optional[str] = "docker"):
def get_dockerfiles_path():
return "docker"


def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:
"""Get a list of architectures given our dockerfiles"""
dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
Expand All @@ -57,29 +59,30 @@ def get_platforms(path: Optional[str] = "docker"):


def get_docker_tag(platform: str, registry: str) -> str:
""":return: docker tag to be used for the container"""
return "{0}/build.{1}".format(registry, platform)


def get_dockerfile(platform: str, path="docker") -> str:
def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
return os.path.join(path, "Dockerfile.build.{0}".format(platform))


def get_docker_binary(use_nvidia_docker: bool) -> str:
return "nvidia-docker" if use_nvidia_docker else "docker"


def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int) -> None:
def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, use_cache: bool) -> str:
"""
Build a container for the given platform
:param platform: Platform
:param docker_binary: docker binary to use (docker/nvidia-docker)
:param registry: Dockerhub registry name
:param num_retries: Number of retries to build the docker image
:param use_cache: will pass cache_from to docker to use the previously pulled tag
:return: Id of the top level image
"""

tag = get_docker_tag(platform=platform, registry=registry)
logging.info("Building container tagged '%s' with %s", tag, docker_binary)
logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
#
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
Expand All @@ -90,41 +93,29 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
# cache-from is needed so we use the cached images tagged from the remote via
# docker pull see: docker_cache.load_docker_cache
#
# This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002
# So to use local caching, we should omit the cache-from by using --no-dockerhub-cache argument to this
# script.
#
# This doesn't work with multi head docker files.
#

for i in range(num_retries):
logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries)

cmd = [docker_binary, "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid()),
"--cache-from", tag,
"-t", tag,
"docker"]
#
cmd = [docker_binary, "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if use_cache:
cmd.extend(["--cache-from", tag])
cmd.extend(["-t", tag, get_dockerfiles_path()])

@retry(subprocess.CalledProcessError, tries=num_retries)
def run_cmd():
logging.info("Running command: '%s'", ' '.join(cmd))
try:
check_call(cmd)
# Docker build was successful. Call break to break out of the retry mechanism
break
except subprocess.CalledProcessError as e:
saved_exception = e
logging.error('Failed to build docker image')
# Building the docker image failed. Call continue to trigger the retry mechanism
continue
else:
# Num retries exceeded
logging.exception('Exception during build of docker image', saved_exception)
logging.fatal('Failed to build the docker image, aborting...')
sys.exit(1)
check_call(cmd)

run_cmd()
# Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
# check_call would have failed
image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
if not image_id:
raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
return image_id
return _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)


def _get_local_image_id(docker_binary, docker_tag):
Expand All @@ -134,14 +125,17 @@ def _get_local_image_id(docker_binary, docker_tag):
:return: Image id as string or None if tag does not exist
"""
cmd = [docker_binary, "images", "-q", docker_tag]
image_id_b = subprocess.check_output(cmd)
image_id_b = check_output(cmd)
image_id = image_id_b.decode('utf-8').strip()
if not image_id:
raise RuntimeError('Unable to find docker image id matching with tag {}'.format(docker_tag))
return image_id


def buildir() -> str:
return os.path.join(get_mxnet_root(), "build")


def default_ccache_dir() -> str:
# Share ccache across containers
if 'CCACHE_DIR' in os.environ:
Expand All @@ -152,6 +146,7 @@ def default_ccache_dir() -> str:
except PermissionError:
logging.info('Unable to make dirs at %s, falling back to local temp dir', ccache_dir)
# In osx tmpdir is not mountable by default
import platform
if platform.system() == 'Darwin':
ccache_dir = "/tmp/_mxnet_ccache"
os.makedirs(ccache_dir, exist_ok=True)
Expand All @@ -166,7 +161,7 @@ def container_run(platform: str,
local_ccache_dir: str,
command: List[str],
dry_run: bool = False,
interactive: bool = False) -> str:
interactive: bool = False) -> int:
tag = get_docker_tag(platform=platform, registry=docker_registry)
mx_root = get_mxnet_root()
local_build_folder = buildir()
Expand All @@ -193,52 +188,61 @@ def container_run(platform: str,
logging.info("Executing:\n%s\n", cmd)
ret = call(runlist)

docker_run_cmd = ' '.join(runlist)
if not dry_run and interactive:
into_cmd = deepcopy(runlist)
# -ti can't be after the tag, as is interpreted as a command so hook it up after the -u argument
idx = into_cmd.index('-u') + 2
into_cmd[idx:idx] = ['-ti']
cmd = '\\\n\t'.join(into_cmd)
cmd = ' \\\n\t'.join(into_cmd)
logging.info("Executing:\n%s\n", cmd)
docker_run_cmd = ' '.join(into_cmd)
ret = call(into_cmd)

if not dry_run and not interactive and ret != 0:
logging.error("Running of command in container failed (%s):\n%s\n", ret, cmd)
logging.error("You can get into the container by adding the -i option")
raise subprocess.CalledProcessError(ret, cmd)

return docker_run_cmd
return ret


def list_platforms() -> str:
print("\nSupported platforms:\n{}".format('\n'.join(get_platforms())))
return "\nSupported platforms:\n{}".format('\n'.join(get_platforms()))


def load_docker_cache(tag, docker_registry) -> None:
if docker_registry:
try:
import docker_cache
logging.info('Docker cache download is enabled from registry %s', docker_registry)
docker_cache.load_docker_cache(registry=docker_registry, docker_tag=tag)
# noinspection PyBroadException
except Exception:
logging.exception('Unable to retrieve Docker cache. Continue without...')
else:
logging.info('Distributed docker cache disabled')

def main() -> int:
# We need to be in the same directory than the script so the commands in the dockerfiles work as
# expected. But the script can be invoked from a different path
base = os.path.split(os.path.realpath(__file__))[0]
os.chdir(base)

logging.getLogger().setLevel(logging.INFO)
def log_environment():
instance_id = ec2_instance_id_hostname()
if instance_id:
logging.info("EC2 Instance id: %s", instance_id)
pp = pprint.PrettyPrinter(indent=4)
logging.debug("Build environment: %s", pp.pformat(dict(os.environ)))


def script_name() -> str:
return os.path.split(sys.argv[0])[1]
def script_name() -> str:
return os.path.split(sys.argv[0])[1]


def main() -> int:
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name()))

logging.info("MXNet container based build tool.")
log_environment()
chdir_to_script_directory()

parser = argparse.ArgumentParser(description="""Utility for building and testing MXNet on docker
containers""", epilog="")
parser.add_argument("-p", "--platform",
Expand Down Expand Up @@ -284,8 +288,10 @@ def script_name() -> str:
default=1,
type=int)

parser.add_argument("-c", "--cache", action="store_true",
help="Enable docker registry cache")
parser.add_argument("-c", "--no-dockerhub-cache", action="store_true",
help="Disables use of --cache-from option on docker build, allowing docker"
" to use local layers for caching. If absent, we use the cache from dockerhub"
" which is the default.")

parser.add_argument("command",
help="command to run in the container",
Expand All @@ -297,35 +303,36 @@ def script_name() -> str:
type=str)

args = parser.parse_args()

def use_cache():
return args.cache or under_ci()
return not args.no_dockerhub_cache or under_ci()

command = list(chain(*args.command))
docker_binary = get_docker_binary(args.nvidiadocker)
shared_memory_size = args.shared_memory_size
num_docker_build_retires = args.docker_build_retries

if args.list:
list_platforms()
print(list_platforms())
elif args.platform:
platform = args.platform
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires)
build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
num_retries=args.docker_build_retries, use_cache=use_cache())
if args.build_only:
logging.warning("Container was just built. Exiting due to build-only.")
return 0

if command:
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, interactive=args.interactive)
elif args.print_docker_run:
print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
command=[], dry_run=True, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir))
print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=[], dry_run=True, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir))
elif args.interactive:
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, interactive=args.interactive)

Expand All @@ -334,7 +341,7 @@ def use_cache():
assert not args.interactive, "when running with -i must provide a command"
cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
logging.info("No command specified, trying default build: %s", ' '.join(cmd))
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=cmd, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir)

Expand All @@ -346,15 +353,20 @@ def use_cache():
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary, args.docker_registry, num_retries=num_docker_build_retires)
build_docker(platform, docker_binary, args.docker_registry, num_retries=args.docker_build_retries,
use_cache=use_cache())
if args.build_only:
continue
build_platform = "build_{}".format(platform)
cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
shutil.rmtree(buildir(), ignore_errors=True)
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
command=cmd, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
plat_buildir = os.path.join(get_mxnet_root(), build_platform)
build_platform = "build_{}".format(platform)
plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..',
"mxnet_{}".format(build_platform)))
if os.path.exists(plat_buildir):
logging.warning("{} already exists, skipping".format(plat_buildir))
continue
command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
shutil.move(buildir(), plat_buildir)
logging.info("Built files left in: %s", plat_buildir)

Expand Down Expand Up @@ -383,7 +395,7 @@ def use_cache():
./build.py -a
Builds for all platforms and leaves artifacts in build_<platform>
Builds for all platforms and leaves artifacts in build_<platform>.
""")

Expand Down
3 changes: 3 additions & 0 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,9 @@ build_ubuntu_gpu_cmake() {
ninja -v
}

build_ubuntu_blc() {
echo "pass"
}

# Testing

Expand Down
5 changes: 3 additions & 2 deletions ci/docker_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import sys
import subprocess
import json
from typing import *
import build as build_util


Expand Down Expand Up @@ -59,7 +60,7 @@ def build_save_containers(platforms, registry, load_cache) -> int:
return 1 if is_error else 0


def _build_save_container(platform, registry, load_cache) -> str:
def _build_save_container(platform, registry, load_cache) -> Optional[str]:
"""
Build image for passed platform and upload the cache to the specified S3 bucket
:param platform: Platform
Expand All @@ -77,7 +78,7 @@ def _build_save_container(platform, registry, load_cache) -> str:
logging.debug('Building %s as %s', platform, docker_tag)
try:
# Increase the number of retries for building the cache.
image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10)
image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10, use_cache=True)
logging.info('Built %s as %s', docker_tag, image_id)

# Push cache to registry
Expand Down
Loading

0 comments on commit f44fc79

Please sign in to comment.