-
Notifications
You must be signed in to change notification settings - Fork 6.8k
Separate refactoring from #12276 in a prior PR #12296
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,31 +23,33 @@ | |
""" | ||
|
||
__author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy' | ||
__version__ = '0.1' | ||
__version__ = '0.2' | ||
|
||
import argparse | ||
import glob | ||
import logging | ||
import os | ||
import re | ||
import shutil | ||
import subprocess | ||
import sys | ||
import tempfile | ||
import platform | ||
from copy import deepcopy | ||
from itertools import chain | ||
from subprocess import call, check_call | ||
from subprocess import call, check_call, check_output | ||
from typing import * | ||
from util import * | ||
import pprint | ||
import requests | ||
|
||
|
||
CCACHE_MAXSIZE = '500G' | ||
|
||
def under_ci() -> bool: | ||
""":return: True if we run in Jenkins.""" | ||
return 'JOB_NAME' in os.environ | ||
|
||
def get_platforms(path: Optional[str] = "docker"): | ||
def get_dockerfiles_path(): | ||
return "docker" | ||
|
||
|
||
def get_platforms(path: str = get_dockerfiles_path()) -> List[str]: | ||
"""Get a list of architectures given our dockerfiles""" | ||
dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*")) | ||
dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles)) | ||
|
@@ -57,29 +59,30 @@ def get_platforms(path: Optional[str] = "docker"): | |
|
||
|
||
def get_docker_tag(platform: str, registry: str) -> str: | ||
""":return: docker tag to be used for the container""" | ||
return "{0}/build.{1}".format(registry, platform) | ||
|
||
|
||
def get_dockerfile(platform: str, path="docker") -> str: | ||
def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str: | ||
return os.path.join(path, "Dockerfile.build.{0}".format(platform)) | ||
|
||
|
||
def get_docker_binary(use_nvidia_docker: bool) -> str: | ||
return "nvidia-docker" if use_nvidia_docker else "docker" | ||
|
||
|
||
def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int) -> None: | ||
def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, use_cache: bool) -> str: | ||
""" | ||
Build a container for the given platform | ||
:param platform: Platform | ||
:param docker_binary: docker binary to use (docker/nvidia-docker) | ||
:param registry: Dockerhub registry name | ||
:param num_retries: Number of retries to build the docker image | ||
:param use_cache: will pass cache_from to docker to use the previously pulled tag | ||
:return: Id of the top level image | ||
""" | ||
|
||
tag = get_docker_tag(platform=platform, registry=registry) | ||
logging.info("Building container tagged '%s' with %s", tag, docker_binary) | ||
logging.info("Building docker container tagged '%s' with %s", tag, docker_binary) | ||
# | ||
# We add a user with the same group as the executing non-root user so files created in the | ||
# container match permissions of the local user. Same for the group. | ||
|
@@ -90,41 +93,29 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries: | |
# cache-from is needed so we use the cached images tagged from the remote via | ||
# docker pull see: docker_cache.load_docker_cache | ||
# | ||
# This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002 | ||
# So to use local caching, we should omit the cache-from by using --no-dockerhub-cache argument to this | ||
# script. | ||
# | ||
# This doesn't work with multi head docker files. | ||
# | ||
|
||
for i in range(num_retries): | ||
logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries) | ||
|
||
cmd = [docker_binary, "build", | ||
"-f", get_dockerfile(platform), | ||
"--build-arg", "USER_ID={}".format(os.getuid()), | ||
"--build-arg", "GROUP_ID={}".format(os.getgid()), | ||
"--cache-from", tag, | ||
"-t", tag, | ||
"docker"] | ||
# | ||
cmd = [docker_binary, "build", | ||
"-f", get_dockerfile(platform), | ||
"--build-arg", "USER_ID={}".format(os.getuid()), | ||
"--build-arg", "GROUP_ID={}".format(os.getgid())] | ||
if use_cache: | ||
cmd.extend(["--cache-from", tag]) | ||
cmd.extend(["-t", tag, get_dockerfiles_path()]) | ||
|
||
@retry(subprocess.CalledProcessError, tries=num_retries) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Really like this. |
||
def run_cmd(): | ||
logging.info("Running command: '%s'", ' '.join(cmd)) | ||
try: | ||
check_call(cmd) | ||
# Docker build was successful. Call break to break out of the retry mechanism | ||
break | ||
except subprocess.CalledProcessError as e: | ||
saved_exception = e | ||
logging.error('Failed to build docker image') | ||
# Building the docker image failed. Call continue to trigger the retry mechanism | ||
continue | ||
else: | ||
# Num retries exceeded | ||
logging.exception('Exception during build of docker image', saved_exception) | ||
logging.fatal('Failed to build the docker image, aborting...') | ||
sys.exit(1) | ||
check_call(cmd) | ||
|
||
run_cmd() | ||
# Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the | ||
# check_call would have failed | ||
image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag) | ||
if not image_id: | ||
raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag)) | ||
return image_id | ||
return _get_local_image_id(docker_binary=docker_binary, docker_tag=tag) | ||
|
||
|
||
def _get_local_image_id(docker_binary, docker_tag): | ||
|
@@ -134,14 +125,17 @@ def _get_local_image_id(docker_binary, docker_tag): | |
:return: Image id as string or None if tag does not exist | ||
""" | ||
cmd = [docker_binary, "images", "-q", docker_tag] | ||
image_id_b = subprocess.check_output(cmd) | ||
image_id_b = check_output(cmd) | ||
image_id = image_id_b.decode('utf-8').strip() | ||
if not image_id: | ||
raise RuntimeError('Unable to find docker image id matching with tag {}'.format(docker_tag)) | ||
return image_id | ||
|
||
|
||
def buildir() -> str: | ||
return os.path.join(get_mxnet_root(), "build") | ||
|
||
|
||
def default_ccache_dir() -> str: | ||
# Share ccache across containers | ||
if 'CCACHE_DIR' in os.environ: | ||
|
@@ -152,6 +146,7 @@ def default_ccache_dir() -> str: | |
except PermissionError: | ||
logging.info('Unable to make dirs at %s, falling back to local temp dir', ccache_dir) | ||
# In osx tmpdir is not mountable by default | ||
import platform | ||
if platform.system() == 'Darwin': | ||
ccache_dir = "/tmp/_mxnet_ccache" | ||
os.makedirs(ccache_dir, exist_ok=True) | ||
|
@@ -166,7 +161,7 @@ def container_run(platform: str, | |
local_ccache_dir: str, | ||
command: List[str], | ||
dry_run: bool = False, | ||
interactive: bool = False) -> str: | ||
interactive: bool = False) -> int: | ||
tag = get_docker_tag(platform=platform, registry=docker_registry) | ||
mx_root = get_mxnet_root() | ||
local_build_folder = buildir() | ||
|
@@ -193,52 +188,61 @@ def container_run(platform: str, | |
logging.info("Executing:\n%s\n", cmd) | ||
ret = call(runlist) | ||
|
||
docker_run_cmd = ' '.join(runlist) | ||
if not dry_run and interactive: | ||
into_cmd = deepcopy(runlist) | ||
# -ti can't be after the tag, as is interpreted as a command so hook it up after the -u argument | ||
idx = into_cmd.index('-u') + 2 | ||
into_cmd[idx:idx] = ['-ti'] | ||
cmd = '\\\n\t'.join(into_cmd) | ||
cmd = ' \\\n\t'.join(into_cmd) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for fixing this. |
||
logging.info("Executing:\n%s\n", cmd) | ||
docker_run_cmd = ' '.join(into_cmd) | ||
ret = call(into_cmd) | ||
|
||
if not dry_run and not interactive and ret != 0: | ||
logging.error("Running of command in container failed (%s):\n%s\n", ret, cmd) | ||
logging.error("You can get into the container by adding the -i option") | ||
raise subprocess.CalledProcessError(ret, cmd) | ||
|
||
return docker_run_cmd | ||
return ret | ||
|
||
|
||
def list_platforms() -> str: | ||
print("\nSupported platforms:\n{}".format('\n'.join(get_platforms()))) | ||
return "\nSupported platforms:\n{}".format('\n'.join(get_platforms())) | ||
|
||
|
||
def load_docker_cache(tag, docker_registry) -> None: | ||
if docker_registry: | ||
try: | ||
import docker_cache | ||
logging.info('Docker cache download is enabled from registry %s', docker_registry) | ||
docker_cache.load_docker_cache(registry=docker_registry, docker_tag=tag) | ||
# noinspection PyBroadException | ||
except Exception: | ||
logging.exception('Unable to retrieve Docker cache. Continue without...') | ||
else: | ||
logging.info('Distributed docker cache disabled') | ||
|
||
def main() -> int: | ||
# We need to be in the same directory than the script so the commands in the dockerfiles work as | ||
# expected. But the script can be invoked from a different path | ||
base = os.path.split(os.path.realpath(__file__))[0] | ||
os.chdir(base) | ||
|
||
logging.getLogger().setLevel(logging.INFO) | ||
def log_environment(): | ||
instance_id = ec2_instance_id_hostname() | ||
if instance_id: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this always true? The function returns '?' if something goes wrong There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Empty string is false. In that case is not printed. For example outside of ci or locally. |
||
logging.info("EC2 Instance id: %s", instance_id) | ||
pp = pprint.PrettyPrinter(indent=4) | ||
logging.debug("Build environment: %s", pp.pformat(dict(os.environ))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be info? The global level is at that level. Or should we have separate levels for ci and local? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's too verbose. In general I don't think we need to view this environment. I had it to diagnose. I wouldn't bother right now. |
||
|
||
|
||
def script_name() -> str: | ||
return os.path.split(sys.argv[0])[1] | ||
def script_name() -> str: | ||
return os.path.split(sys.argv[0])[1] | ||
|
||
|
||
def main() -> int: | ||
logging.getLogger().setLevel(logging.INFO) | ||
logging.getLogger("requests").setLevel(logging.WARNING) | ||
logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name())) | ||
|
||
logging.info("MXNet container based build tool.") | ||
log_environment() | ||
chdir_to_script_directory() | ||
|
||
parser = argparse.ArgumentParser(description="""Utility for building and testing MXNet on docker | ||
containers""", epilog="") | ||
parser.add_argument("-p", "--platform", | ||
|
@@ -284,8 +288,10 @@ def script_name() -> str: | |
default=1, | ||
type=int) | ||
|
||
parser.add_argument("-c", "--cache", action="store_true", | ||
help="Enable docker registry cache") | ||
parser.add_argument("-c", "--no-dockerhub-cache", action="store_true", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this the same as not providing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would rather have it explicit. but we would have to code it if we want it the way you say. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem is that for "default" behaviour, which I assume is use local cache (as for normal docker builds) we need to provide --no-dockerhub-cache explicitly. I think it should be the other way around. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The default behavior remains unchanged if I'm not mistaken. Which is to use the dockerhub cache. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Kellen. One doesn't need to specify the registry. Hence this option to disable cache which is enabled by default. I'm not sure what's the cause of confusion here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like a work around for broken behaviour to me. Would you mind if we leave it out of this PR, and try to think about how to fix the root cause of the problem? To me it makes sense to not use remote caching if a user doesn't specify '--docker-registry'. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That’s bad because we want to use cache by default. This was discussed at length with Marco before and we came to agree on this behavior. I wouldn’t want to change that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That’s bad because we want to use cache by default. This was discussed at length with Marco before and we came to agree on this behavior. I wouldn’t want to change that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, that's the information I was missing here. In this case it makes sense to me. |
||
help="Disables use of --cache-from option on docker build, allowing docker" | ||
" to use local layers for caching. If absent, we use the cache from dockerhub" | ||
" which is the default.") | ||
|
||
parser.add_argument("command", | ||
help="command to run in the container", | ||
|
@@ -297,35 +303,36 @@ def script_name() -> str: | |
type=str) | ||
|
||
args = parser.parse_args() | ||
|
||
def use_cache(): | ||
return args.cache or under_ci() | ||
return not args.no_dockerhub_cache or under_ci() | ||
|
||
command = list(chain(*args.command)) | ||
docker_binary = get_docker_binary(args.nvidiadocker) | ||
shared_memory_size = args.shared_memory_size | ||
num_docker_build_retires = args.docker_build_retries | ||
|
||
if args.list: | ||
list_platforms() | ||
print(list_platforms()) | ||
elif args.platform: | ||
platform = args.platform | ||
tag = get_docker_tag(platform=platform, registry=args.docker_registry) | ||
if use_cache(): | ||
load_docker_cache(tag=tag, docker_registry=args.docker_registry) | ||
build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires) | ||
build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry, | ||
num_retries=args.docker_build_retries, use_cache=use_cache()) | ||
if args.build_only: | ||
logging.warning("Container was just built. Exiting due to build-only.") | ||
return 0 | ||
|
||
if command: | ||
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size, | ||
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size, | ||
command=command, docker_registry=args.docker_registry, | ||
local_ccache_dir=args.ccache_dir, interactive=args.interactive) | ||
elif args.print_docker_run: | ||
print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size, | ||
command=[], dry_run=True, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)) | ||
print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size, | ||
command=[], dry_run=True, docker_registry=args.docker_registry, | ||
local_ccache_dir=args.ccache_dir)) | ||
elif args.interactive: | ||
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size, | ||
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size, | ||
command=command, docker_registry=args.docker_registry, | ||
local_ccache_dir=args.ccache_dir, interactive=args.interactive) | ||
|
||
|
@@ -334,7 +341,7 @@ def use_cache(): | |
assert not args.interactive, "when running with -i must provide a command" | ||
cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)] | ||
logging.info("No command specified, trying default build: %s", ' '.join(cmd)) | ||
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size, | ||
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size, | ||
command=cmd, docker_registry=args.docker_registry, | ||
local_ccache_dir=args.ccache_dir) | ||
|
||
|
@@ -346,15 +353,20 @@ def use_cache(): | |
tag = get_docker_tag(platform=platform, registry=args.docker_registry) | ||
if use_cache(): | ||
load_docker_cache(tag=tag, docker_registry=args.docker_registry) | ||
build_docker(platform, docker_binary, args.docker_registry, num_retries=num_docker_build_retires) | ||
build_docker(platform, docker_binary, args.docker_registry, num_retries=args.docker_build_retries, | ||
use_cache=use_cache()) | ||
if args.build_only: | ||
continue | ||
build_platform = "build_{}".format(platform) | ||
cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform] | ||
shutil.rmtree(buildir(), ignore_errors=True) | ||
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size, | ||
command=cmd, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir) | ||
plat_buildir = os.path.join(get_mxnet_root(), build_platform) | ||
build_platform = "build_{}".format(platform) | ||
plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..', | ||
"mxnet_{}".format(build_platform))) | ||
if os.path.exists(plat_buildir): | ||
logging.warning("{} already exists, skipping".format(plat_buildir)) | ||
continue | ||
command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform] | ||
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size, | ||
command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir) | ||
shutil.move(buildir(), plat_buildir) | ||
logging.info("Built files left in: %s", plat_buildir) | ||
|
||
|
@@ -383,7 +395,7 @@ def use_cache(): | |
|
||
./build.py -a | ||
|
||
Builds for all platforms and leaves artifacts in build_<platform> | ||
Builds for all platforms and leaves artifacts in build_<platform>. | ||
|
||
""") | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -567,6 +567,9 @@ build_ubuntu_gpu_cmake() { | |
ninja -v | ||
} | ||
|
||
build_ubuntu_blc() { | ||
echo "pass" | ||
} | ||
|
||
# Testing | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: I find this a bit of a misnomer. It sounds like we're disabling docker-cache, which is possible, but it's not what's done here. Reading the code I would expect it to pass --no-cache to the docker cmd being run. Maybe a more specific name could help?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm open. Any suggestions?