Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Separate refactoring from #12276 in a prior PR #12296

Merged
merged 2 commits into from
Aug 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 88 additions & 76 deletions ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,33 @@
"""

__author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy'
__version__ = '0.1'
__version__ = '0.2'

import argparse
import glob
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile
import platform
from copy import deepcopy
from itertools import chain
from subprocess import call, check_call
from subprocess import call, check_call, check_output
from typing import *
from util import *
import pprint
import requests


CCACHE_MAXSIZE = '500G'

def under_ci() -> bool:
""":return: True if we run in Jenkins."""
return 'JOB_NAME' in os.environ

def get_platforms(path: Optional[str] = "docker"):
def get_dockerfiles_path():
return "docker"


def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:
"""Get a list of architectures given our dockerfiles"""
dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
Expand All @@ -57,29 +59,30 @@ def get_platforms(path: Optional[str] = "docker"):


def get_docker_tag(platform: str, registry: str) -> str:
""":return: docker tag to be used for the container"""
return "{0}/build.{1}".format(registry, platform)


def get_dockerfile(platform: str, path="docker") -> str:
def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
return os.path.join(path, "Dockerfile.build.{0}".format(platform))


def get_docker_binary(use_nvidia_docker: bool) -> str:
return "nvidia-docker" if use_nvidia_docker else "docker"


def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int) -> None:
def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, use_cache: bool) -> str:
"""
Build a container for the given platform
:param platform: Platform
:param docker_binary: docker binary to use (docker/nvidia-docker)
:param registry: Dockerhub registry name
:param num_retries: Number of retries to build the docker image
:param use_cache: will pass cache_from to docker to use the previously pulled tag
:return: Id of the top level image
"""

tag = get_docker_tag(platform=platform, registry=registry)
logging.info("Building container tagged '%s' with %s", tag, docker_binary)
logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
#
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
Expand All @@ -90,41 +93,29 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
# cache-from is needed so we use the cached images tagged from the remote via
# docker pull see: docker_cache.load_docker_cache
#
# This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002
# So to use local caching, we should omit the cache-from by using --no-dockerhub-cache argument to this
# script.
#
# This doesn't work with multi head docker files.
#

for i in range(num_retries):
logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries)

cmd = [docker_binary, "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid()),
"--cache-from", tag,
"-t", tag,
"docker"]
#
cmd = [docker_binary, "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if use_cache:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I find this a bit of a misnomer. It sounds like we're disabling docker-cache, which is possible, but it's not what's done here. Reading the code I would expect it to pass --no-cache to the docker cmd being run. Maybe a more specific name could help?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm open. Any suggestions?

cmd.extend(["--cache-from", tag])
cmd.extend(["-t", tag, get_dockerfiles_path()])

@retry(subprocess.CalledProcessError, tries=num_retries)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really like this.

def run_cmd():
logging.info("Running command: '%s'", ' '.join(cmd))
try:
check_call(cmd)
# Docker build was successful. Call break to break out of the retry mechanism
break
except subprocess.CalledProcessError as e:
saved_exception = e
logging.error('Failed to build docker image')
# Building the docker image failed. Call continue to trigger the retry mechanism
continue
else:
# Num retries exceeded
logging.exception('Exception during build of docker image', saved_exception)
logging.fatal('Failed to build the docker image, aborting...')
sys.exit(1)
check_call(cmd)

run_cmd()
# Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
# check_call would have failed
image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
if not image_id:
raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
return image_id
return _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)


def _get_local_image_id(docker_binary, docker_tag):
Expand All @@ -134,14 +125,17 @@ def _get_local_image_id(docker_binary, docker_tag):
:return: Image id as string or None if tag does not exist
"""
cmd = [docker_binary, "images", "-q", docker_tag]
image_id_b = subprocess.check_output(cmd)
image_id_b = check_output(cmd)
image_id = image_id_b.decode('utf-8').strip()
if not image_id:
raise RuntimeError('Unable to find docker image id matching with tag {}'.format(docker_tag))
return image_id


def buildir() -> str:
return os.path.join(get_mxnet_root(), "build")


def default_ccache_dir() -> str:
# Share ccache across containers
if 'CCACHE_DIR' in os.environ:
Expand All @@ -152,6 +146,7 @@ def default_ccache_dir() -> str:
except PermissionError:
logging.info('Unable to make dirs at %s, falling back to local temp dir', ccache_dir)
# In osx tmpdir is not mountable by default
import platform
if platform.system() == 'Darwin':
ccache_dir = "/tmp/_mxnet_ccache"
os.makedirs(ccache_dir, exist_ok=True)
Expand All @@ -166,7 +161,7 @@ def container_run(platform: str,
local_ccache_dir: str,
command: List[str],
dry_run: bool = False,
interactive: bool = False) -> str:
interactive: bool = False) -> int:
tag = get_docker_tag(platform=platform, registry=docker_registry)
mx_root = get_mxnet_root()
local_build_folder = buildir()
Expand All @@ -193,52 +188,61 @@ def container_run(platform: str,
logging.info("Executing:\n%s\n", cmd)
ret = call(runlist)

docker_run_cmd = ' '.join(runlist)
if not dry_run and interactive:
into_cmd = deepcopy(runlist)
# -ti can't be after the tag, as is interpreted as a command so hook it up after the -u argument
idx = into_cmd.index('-u') + 2
into_cmd[idx:idx] = ['-ti']
cmd = '\\\n\t'.join(into_cmd)
cmd = ' \\\n\t'.join(into_cmd)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for fixing this.

logging.info("Executing:\n%s\n", cmd)
docker_run_cmd = ' '.join(into_cmd)
ret = call(into_cmd)

if not dry_run and not interactive and ret != 0:
logging.error("Running of command in container failed (%s):\n%s\n", ret, cmd)
logging.error("You can get into the container by adding the -i option")
raise subprocess.CalledProcessError(ret, cmd)

return docker_run_cmd
return ret


def list_platforms() -> str:
print("\nSupported platforms:\n{}".format('\n'.join(get_platforms())))
return "\nSupported platforms:\n{}".format('\n'.join(get_platforms()))


def load_docker_cache(tag, docker_registry) -> None:
if docker_registry:
try:
import docker_cache
logging.info('Docker cache download is enabled from registry %s', docker_registry)
docker_cache.load_docker_cache(registry=docker_registry, docker_tag=tag)
# noinspection PyBroadException
except Exception:
logging.exception('Unable to retrieve Docker cache. Continue without...')
else:
logging.info('Distributed docker cache disabled')

def main() -> int:
# We need to be in the same directory than the script so the commands in the dockerfiles work as
# expected. But the script can be invoked from a different path
base = os.path.split(os.path.realpath(__file__))[0]
os.chdir(base)

logging.getLogger().setLevel(logging.INFO)
def log_environment():
instance_id = ec2_instance_id_hostname()
if instance_id:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this always true? The function returns '?' if something goes wrong

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Empty string is false. In that case is not printed. For example outside of ci or locally.

logging.info("EC2 Instance id: %s", instance_id)
pp = pprint.PrettyPrinter(indent=4)
logging.debug("Build environment: %s", pp.pformat(dict(os.environ)))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be info? The global level is at that level. Or should we have separate levels for ci and local?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's too verbose. In general I don't think we need to view this environment. I had it to diagnose. I wouldn't bother right now.



def script_name() -> str:
return os.path.split(sys.argv[0])[1]
def script_name() -> str:
return os.path.split(sys.argv[0])[1]


def main() -> int:
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name()))

logging.info("MXNet container based build tool.")
log_environment()
chdir_to_script_directory()

parser = argparse.ArgumentParser(description="""Utility for building and testing MXNet on docker
containers""", epilog="")
parser.add_argument("-p", "--platform",
Expand Down Expand Up @@ -284,8 +288,10 @@ def script_name() -> str:
default=1,
type=int)

parser.add_argument("-c", "--cache", action="store_true",
help="Enable docker registry cache")
parser.add_argument("-c", "--no-dockerhub-cache", action="store_true",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this the same as not providing --docker-registry?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather have it explicit. but we would have to code it if we want it the way you say.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that for "default" behaviour, which I assume is use local cache (as for normal docker builds) we need to provide --no-dockerhub-cache explicitly. I think it should be the other way around.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-c is not a good shortening anymore for the option

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default behavior remains unchanged if I'm not mistaken. Which is to use the dockerhub cache.

Copy link
Contributor Author

@larroy larroy Aug 23, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kellen. One doesn't need to specify the registry. Hence this option to disable cache which is enabled by default. I'm not sure what's the cause of confusion here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a work around for broken behaviour to me. Would you mind if we leave it out of this PR, and try to think about how to fix the root cause of the problem? To me it makes sense to not use remote caching if a user doesn't specify '--docker-registry'.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That’s bad because we want to use cache by default. This was discussed at length with Marco before and we came to agree on this behavior. I wouldn’t want to change that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That’s bad because we want to use cache by default. This was discussed at length with Marco before and we came to agree on this behavior. I wouldn’t want to change that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, that's the information I was missing here. In this case it makes sense to me.

help="Disables use of --cache-from option on docker build, allowing docker"
" to use local layers for caching. If absent, we use the cache from dockerhub"
" which is the default.")

parser.add_argument("command",
help="command to run in the container",
Expand All @@ -297,35 +303,36 @@ def script_name() -> str:
type=str)

args = parser.parse_args()

def use_cache():
return args.cache or under_ci()
return not args.no_dockerhub_cache or under_ci()

command = list(chain(*args.command))
docker_binary = get_docker_binary(args.nvidiadocker)
shared_memory_size = args.shared_memory_size
num_docker_build_retires = args.docker_build_retries

if args.list:
list_platforms()
print(list_platforms())
elif args.platform:
platform = args.platform
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires)
build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
num_retries=args.docker_build_retries, use_cache=use_cache())
if args.build_only:
logging.warning("Container was just built. Exiting due to build-only.")
return 0

if command:
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, interactive=args.interactive)
elif args.print_docker_run:
print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
command=[], dry_run=True, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir))
print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=[], dry_run=True, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir))
elif args.interactive:
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, interactive=args.interactive)

Expand All @@ -334,7 +341,7 @@ def use_cache():
assert not args.interactive, "when running with -i must provide a command"
cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
logging.info("No command specified, trying default build: %s", ' '.join(cmd))
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=cmd, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir)

Expand All @@ -346,15 +353,20 @@ def use_cache():
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary, args.docker_registry, num_retries=num_docker_build_retires)
build_docker(platform, docker_binary, args.docker_registry, num_retries=args.docker_build_retries,
use_cache=use_cache())
if args.build_only:
continue
build_platform = "build_{}".format(platform)
cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
shutil.rmtree(buildir(), ignore_errors=True)
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
command=cmd, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
plat_buildir = os.path.join(get_mxnet_root(), build_platform)
build_platform = "build_{}".format(platform)
plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..',
"mxnet_{}".format(build_platform)))
if os.path.exists(plat_buildir):
logging.warning("{} already exists, skipping".format(plat_buildir))
continue
command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
shutil.move(buildir(), plat_buildir)
logging.info("Built files left in: %s", plat_buildir)

Expand Down Expand Up @@ -383,7 +395,7 @@ def use_cache():

./build.py -a

Builds for all platforms and leaves artifacts in build_<platform>
Builds for all platforms and leaves artifacts in build_<platform>.

""")

Expand Down
3 changes: 3 additions & 0 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,9 @@ build_ubuntu_gpu_cmake() {
ninja -v
}

build_ubuntu_blc() {
echo "pass"
}

# Testing

Expand Down
5 changes: 3 additions & 2 deletions ci/docker_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import sys
import subprocess
import json
from typing import *
import build as build_util


Expand Down Expand Up @@ -59,7 +60,7 @@ def build_save_containers(platforms, registry, load_cache) -> int:
return 1 if is_error else 0


def _build_save_container(platform, registry, load_cache) -> str:
def _build_save_container(platform, registry, load_cache) -> Optional[str]:
"""
Build image for passed platform and upload the cache to the specified S3 bucket
:param platform: Platform
Expand All @@ -77,7 +78,7 @@ def _build_save_container(platform, registry, load_cache) -> str:
logging.debug('Building %s as %s', platform, docker_tag)
try:
# Increase the number of retries for building the cache.
image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10)
image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10, use_cache=True)
logging.info('Built %s as %s', docker_tag, image_id)

# Push cache to registry
Expand Down
Loading