Skip to content

Commit

Permalink
Separate minor refactoring from apache#12276 in a prior PR
Browse files Browse the repository at this point in the history
  • Loading branch information
Pedro Larroy authored and larroy committed Aug 22, 2018
1 parent 902c579 commit 0a253f7
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 57 deletions.
189 changes: 134 additions & 55 deletions ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"""

__author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy'
__version__ = '0.1'
__version__ = '0.2'

import argparse
import glob
Expand All @@ -37,17 +37,71 @@
import platform
from copy import deepcopy
from itertools import chain
from subprocess import call, check_call
from subprocess import call, check_call, check_output
from typing import *
from util import *
import pprint
import requests

def retry(ExceptionToCheck, tries=4, delay_s=1, backoff=2):
    """Retry calling the decorated function using an exponential backoff.

    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry

    :param ExceptionToCheck: the exception to check. may be a tuple of
        exceptions to check
    :type ExceptionToCheck: Exception or tuple
    :param tries: number of times to try (not retry) before giving up
    :type tries: int
    :param delay_s: initial delay between retries in seconds
    :type delay_s: int
    :param backoff: backoff multiplier e.g. value of 2 will double the delay
        each retry
    :type backoff: int
    """
    import time
    from functools import wraps

    def decorated_retry(func):
        @wraps(func)
        def func_with_retries(*args, **kwargs):
            attempts_left, pause_s = tries, delay_s
            # Every attempt except the last swallows the exception and backs off.
            while attempts_left > 1:
                try:
                    return func(*args, **kwargs)
                except ExceptionToCheck as exc:
                    logging.warning("Exception: %s, Retrying in %d seconds...", str(exc), pause_s)
                    time.sleep(pause_s)
                    attempts_left -= 1
                    pause_s *= backoff
            # Final attempt: any exception propagates to the caller.
            return func(*args, **kwargs)

        return func_with_retries  # true decorator

    return decorated_retry

CCACHE_MAXSIZE = '500G'

def under_ci() -> bool:
    """:return: True if we run in Jenkins."""
    # Jenkins always exports JOB_NAME into the build environment.
    return os.environ.get('JOB_NAME') is not None

def get_platforms(path: Optional[str] = "docker"):

def git_cleanup() -> None:
    """Clean repo and subrepos, update subrepos"""
    logging.info("cleaning up repository")
    with remember_cwd():
        os.chdir(get_mxnet_root())
        # Order matters: clean the superproject, then every submodule, then re-sync them.
        for git_cmd in (['git', 'clean', '-ffdx'],
                        ['git', 'submodule', 'foreach', '--recursive', 'git', 'clean', '-ffdx'],
                        ['git', 'submodule', 'update', '--recursive', '--init']):
            check_call(git_cmd)


def get_dockerfiles_path():
    """:return: path (relative to this script's directory) holding the Dockerfile.build.* files"""
    return "docker"


def get_platforms(path: str = get_dockerfiles_path()) -> list[str]:
"""Get a list of architectures given our dockerfiles"""
dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
Expand All @@ -57,18 +111,19 @@ def get_platforms(path: Optional[str] = "docker"):


def get_docker_tag(platform: str, registry: str) -> str:
    """:return: docker tag to be used for the container"""
    return "{}/build.{}".format(registry, platform)


def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
    """:return: path to the Dockerfile used to build the image for *platform*

    Removes the stale pre-refactor signature (path="docker") that the diff
    paste left duplicated above the current definition.
    NOTE(review): the default is evaluated once at import time — fine while
    get_dockerfiles_path() returns a constant.
    """
    return os.path.join(path, "Dockerfile.build.{0}".format(platform))


def get_docker_binary(use_nvidia_docker: bool) -> str:
    """:return: name of the docker executable; nvidia-docker when GPU passthrough is requested"""
    if use_nvidia_docker:
        return "nvidia-docker"
    return "docker"


def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int) -> None:
def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, use_cache: bool) -> str:
"""
Build a container for the given platform
:param platform: Platform
Expand All @@ -77,9 +132,8 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
:param num_retries: Number of retries to build the docker image
:return: Id of the top level image
"""

tag = get_docker_tag(platform=platform, registry=registry)
logging.info("Building container tagged '%s' with %s", tag, docker_binary)
logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
#
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
Expand All @@ -91,40 +145,24 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
# docker pull see: docker_cache.load_docker_cache
#
# This doesn't work with multi head docker files.
#

for i in range(num_retries):
logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries)

cmd = [docker_binary, "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid()),
"--cache-from", tag,
"-t", tag,
"docker"]
#
cmd = [docker_binary, "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if use_cache:
cmd.extend(["--cache-from", tag,])
cmd.extend(["-t", tag, get_dockerfiles_path()])

@retry(subprocess.CalledProcessError, tries=num_retries)
def run_cmd():
logging.info("Running command: '%s'", ' '.join(cmd))
try:
check_call(cmd)
# Docker build was successful. Call break to break out of the retry mechanism
break
except subprocess.CalledProcessError as e:
saved_exception = e
logging.error('Failed to build docker image')
# Building the docker image failed. Call continue to trigger the retry mechanism
continue
else:
# Num retries exceeded
logging.exception('Exception during build of docker image', saved_exception)
logging.fatal('Failed to build the docker image, aborting...')
sys.exit(1)
check_call(cmd)

run_cmd()
# Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
# check_call would have failed
image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
if not image_id:
raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
return image_id
return _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)


def _get_local_image_id(docker_binary, docker_tag):
Expand All @@ -136,12 +174,15 @@ def _get_local_image_id(docker_binary, docker_tag):
cmd = [docker_binary, "images", "-q", docker_tag]
image_id_b = subprocess.check_output(cmd)
image_id = image_id_b.decode('utf-8').strip()
if not image_id:
raise RuntimeError('Unable to find docker image id matching with tag {}'.format(tag))
return image_id


def buildir() -> str:
    """:return: path of the "build" output directory under the MXNet source root"""
    mxnet_root = get_mxnet_root()
    return os.path.join(mxnet_root, "build")


def default_ccache_dir() -> str:
# Share ccache across containers
if 'CCACHE_DIR' in os.environ:
Expand All @@ -166,7 +207,7 @@ def container_run(platform: str,
local_ccache_dir: str,
command: List[str],
dry_run: bool = False,
interactive: bool = False) -> str:
interactive: bool = False) -> int:
tag = get_docker_tag(platform=platform, registry=docker_registry)
mx_root = get_mxnet_root()
local_build_folder = buildir()
Expand Down Expand Up @@ -199,7 +240,7 @@ def container_run(platform: str,
# -ti can't be after the tag, as is interpreted as a command so hook it up after the -u argument
idx = into_cmd.index('-u') + 2
into_cmd[idx:idx] = ['-ti']
cmd = '\\\n\t'.join(into_cmd)
cmd = ' \\\n\t'.join(into_cmd)
logging.info("Executing:\n%s\n", cmd)
docker_run_cmd = ' '.join(into_cmd)
ret = call(into_cmd)
Expand All @@ -209,11 +250,12 @@ def container_run(platform: str,
logging.error("You can get into the container by adding the -i option")
raise subprocess.CalledProcessError(ret, cmd)

return docker_run_cmd
return ret



def list_platforms() -> str:
    """:return: human-readable listing of the supported build platforms

    Removes the stale pre-refactor body (which printed instead of returning)
    that the diff paste left duplicated above the current return statement.
    """
    return "\nSupported platforms:\n{}".format('\n'.join(get_platforms()))

def load_docker_cache(tag, docker_registry) -> None:
if docker_registry:
Expand All @@ -226,19 +268,49 @@ def load_docker_cache(tag, docker_registry) -> None:
else:
logging.info('Distributed docker cache disabled')

def main() -> int:
def ec2_instance_id_hostname() -> str:
    """Best-effort lookup of the EC2 instance id and public hostname (CI only).

    :return: 'instance-id hostname' when running under CI and the EC2 metadata
        endpoint answers, '?' when the lookup fails, '' when not under CI.
    """
    if not under_ci():
        return ''
    result = []
    try:
        for metadata_key in ('instance-id', 'public-hostname'):
            # Short timeout so a non-EC2 CI host doesn't hang the build.
            r = requests.get("http://instance-data/latest/meta-data/" + metadata_key, timeout=2)
            if r.status_code == 200:
                result.append(r.content.decode())
        return ' '.join(result)
    except requests.exceptions.RequestException:
        # Bug fix: this previously caught the BUILTIN ConnectionError, which
        # requests' connection failures do NOT subclass (requests raises
        # requests.exceptions.ConnectionError, rooted at RequestException),
        # so metadata failures propagated and killed the build.
        return '?'

def log_environment():
    """Log the EC2 instance identity (when available) and the full build environment."""
    instance = ec2_instance_id_hostname()
    if instance:
        logging.info("Instance id: %s", instance)
    printer = pprint.PrettyPrinter(indent=4)
    logging.info("Build environment: %s", printer.pformat(dict(os.environ)))

def chdir_to_script_directory():
    # We need to be in the same directory than the script so the commands in the dockerfiles work as
    # expected. But the script can be invoked from a different path
    script_dir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(script_dir)

logging.getLogger().setLevel(logging.INFO)
def script_name() -> str:
    """:return: the invoked script's filename without its directory path

    Removes the byte-identical duplicate definition that the diff paste
    left immediately below this one.
    """
    return os.path.split(sys.argv[0])[1]

def main() -> int:
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name()))

logging.info("MXNet container based build tool.")
log_environment()
chdir_to_script_directory()

parser = argparse.ArgumentParser(description="""Utility for building and testing MXNet on docker
containers""", epilog="")
parser.add_argument("-p", "--platform",
Expand Down Expand Up @@ -284,7 +356,7 @@ def script_name() -> str:
default=1,
type=int)

parser.add_argument("-c", "--cache", action="store_true",
parser.add_argument("--no-cache", action="store_true",
help="Enable docker registry cache")

parser.add_argument("command",
Expand All @@ -297,22 +369,23 @@ def script_name() -> str:
type=str)

args = parser.parse_args()

def use_cache():
return args.cache or under_ci()
return not args.no_cache or under_ci()

command = list(chain(*args.command))
docker_binary = get_docker_binary(args.nvidiadocker)
shared_memory_size = args.shared_memory_size
num_docker_build_retires = args.docker_build_retries

if args.list:
list_platforms()
print(list_platforms())
elif args.platform:
platform = args.platform
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires)
build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires, use_cache=use_cache())
if args.build_only:
logging.warning("Container was just built. Exiting due to build-only.")
return 0
Expand Down Expand Up @@ -346,15 +419,20 @@ def use_cache():
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary, args.docker_registry, num_retries=num_docker_build_retires)
build_docker(platform, docker_binary, args.docker_registry, num_retries=args.docker_build_retries, use_cache=use_cache())
if args.build_only:
continue
build_platform = "build_{}".format(platform)
cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
git_cleanup()
shutil.rmtree(buildir(), ignore_errors=True)
build_platform = "build_{}".format(platform)
plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..',
"mxnet_{}".format(build_platform)))
if os.path.exists(plat_buildir):
logging.warning("{} already exists, skipping".format(plat_buildir))
continue
command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
command=cmd, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
plat_buildir = os.path.join(get_mxnet_root(), build_platform)
command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
shutil.move(buildir(), plat_buildir)
logging.info("Built files left in: %s", plat_buildir)

Expand Down Expand Up @@ -383,7 +461,8 @@ def use_cache():
./build.py -a
Builds for all platforms and leaves artifacts in build_<platform>
Builds for all platforms and leaves artifacts in build_<platform>. **WARNING** it performs git
cleanup of the repo.
""")

Expand Down
5 changes: 3 additions & 2 deletions ci/docker_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import sys
import subprocess
import json
from typing import *
import build as build_util


Expand Down Expand Up @@ -59,7 +60,7 @@ def build_save_containers(platforms, registry, load_cache) -> int:
return 1 if is_error else 0


def _build_save_container(platform, registry, load_cache) -> str:
def _build_save_container(platform, registry, load_cache) -> Optional[str]:
"""
Build image for passed platform and upload the cache to the specified S3 bucket
:param platform: Platform
Expand All @@ -77,7 +78,7 @@ def _build_save_container(platform, registry, load_cache) -> str:
logging.debug('Building %s as %s', platform, docker_tag)
try:
# Increase the number of retries for building the cache.
image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10)
image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10, use_cache=True)
logging.info('Built %s as %s', docker_tag, image_id)

# Push cache to registry
Expand Down

0 comments on commit 0a253f7

Please sign in to comment.