From 91bb39898e6b12fb45451852019939f694e54f71 Mon Sep 17 00:00:00 2001 From: perdasilva Date: Mon, 21 Oct 2019 18:38:48 +0200 Subject: [PATCH] [CD] Adds python docker pipeline (#16547) * Adds python docker pipeline * Refactors docker login functionality out of docker_cache script tweak * Refactors docker run functionality out of build.py Test safe docker run tweaks Fix up * Adds CD docker utils * Updates logging configuration to file config * Adds restore static and dynamic methods, instead of making the library type a parameter * Removes CUDA 9.1 from base images script * Adds explanatory comment to signal masking in safe_docker_run * Adds push success message with pull command and link to repository * Fixes logging --- cd/Jenkinsfile_cd_pipeline | 14 +- cd/Jenkinsfile_release_job | 3 +- cd/Jenkinsfile_utils.groovy | 12 + cd/python/docker/Dockerfile | 40 +++ cd/python/docker/Dockerfile.test | 39 +++ cd/python/docker/Jenkins_pipeline.groovy | 74 ++++ cd/python/docker/python_images.sh | 128 +++++++ cd/python/docker/test_python_image.sh | 47 +++ cd/python/pypi/Jenkins_pipeline.groovy | 2 +- cd/utils/docker_tag.sh | 59 ++++ cd/utils/mxnet_base_image.sh | 49 +++ ci/build.py | 179 ++-------- ci/docker_cache.py | 87 +---- ci/docker_login.py | 137 ++++++++ ci/logging.conf | 41 +++ ci/safe_docker_run.py | 247 +++++++++++++ ci/test_docker_cache.py | 40 ++- ci/test_docker_login.py | 234 +++++++++++++ ci/test_safe_docker_run.py | 427 +++++++++++++++++++++++ ci/util.py | 17 + 20 files changed, 1631 insertions(+), 245 deletions(-) create mode 100644 cd/python/docker/Dockerfile create mode 100644 cd/python/docker/Dockerfile.test create mode 100644 cd/python/docker/Jenkins_pipeline.groovy create mode 100755 cd/python/docker/python_images.sh create mode 100755 cd/python/docker/test_python_image.sh create mode 100755 cd/utils/docker_tag.sh create mode 100755 cd/utils/mxnet_base_image.sh create mode 100755 ci/docker_login.py create mode 100644 ci/logging.conf create mode 100755 
ci/safe_docker_run.py create mode 100644 ci/test_docker_login.py create mode 100644 ci/test_safe_docker_run.py diff --git a/cd/Jenkinsfile_cd_pipeline b/cd/Jenkinsfile_cd_pipeline index e0e94770b682..afb7b9b6d27f 100644 --- a/cd/Jenkinsfile_cd_pipeline +++ b/cd/Jenkinsfile_cd_pipeline @@ -61,9 +61,17 @@ pipeline { stage("Build") { cd_utils.trigger_release_job("Build static libmxnet", "mxnet_lib/static", params.MXNET_VARIANTS) } - stage("PyPI Release") { - echo "Building PyPI Release" - cd_utils.trigger_release_job("Release PyPI Packages", "python/pypi", params.MXNET_VARIANTS) + stage("Releases") { + cd_utils.error_checked_parallel([ + "PyPI Release": { + echo "Building PyPI Release" + cd_utils.trigger_release_job("Release PyPI Packages", "python/pypi", params.MXNET_VARIANTS) + }, + "Python Docker Release": { + echo "Building Python Docker Release" + cd_utils.trigger_release_job("Release Python Docker Images", "python/docker", params.MXNET_VARIANTS) + } + ]) } }, diff --git a/cd/Jenkinsfile_release_job b/cd/Jenkinsfile_release_job index c2be26124029..4d6f3b5e9012 100644 --- a/cd/Jenkinsfile_release_job +++ b/cd/Jenkinsfile_release_job @@ -92,7 +92,8 @@ pipeline { def valid_job_types = [ "mxnet_lib/static", "mxnet_lib/dynamic", - "python/pypi" + "python/pypi", + "python/docker" ] // Convert mxnet variants to a list diff --git a/cd/Jenkinsfile_utils.groovy b/cd/Jenkinsfile_utils.groovy index 5182b04a3b5b..966f0a218057 100644 --- a/cd/Jenkinsfile_utils.groovy +++ b/cd/Jenkinsfile_utils.groovy @@ -160,6 +160,18 @@ def restore_artifact(variant, libtype) { } } + +// Restores the statically linked libmxnet for the given variant +def restore_static_libmxnet(variant) { + restore_artifact(variant, 'static') +} + + +// Restores the dynamically linked libmxnet for the given variant +def restore_dynamic_libmxnet(variant) { + restore_artifact(variant, 'dynamic') +} + // A generic pipeline that can be used by *most* CD jobs // It can be used when implementing the pipeline steps in 
the Jenkins_steps.groovy // script for a particular delivery channel. However, it should also implement the diff --git a/cd/python/docker/Dockerfile b/cd/python/docker/Dockerfile new file mode 100644 index 000000000000..dc70da188793 --- /dev/null +++ b/cd/python/docker/Dockerfile @@ -0,0 +1,40 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Python MXNet Dockerfile + +# NOTE: Assumes wheel_build directory is the context root when building + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ARG PYTHON_CMD=python +RUN apt-get update && \ + apt-get install -y wget ${PYTHON_CMD}-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + ${PYTHON_CMD} get-pip.py + +ARG MXNET_COMMIT_ID +ENV MXNET_COMMIT_ID=${MXNET_COMMIT_ID} + +RUN mkdir -p /mxnet +COPY dist/*.whl /mxnet/. 
+ +WORKDIR /mxnet +RUN WHEEL_FILE=$(ls -t /mxnet | head -n 1) && pip install ${WHEEL_FILE} && rm -f ${WHEEL_FILE} + diff --git a/cd/python/docker/Dockerfile.test b/cd/python/docker/Dockerfile.test new file mode 100644 index 000000000000..bed059d0fc73 --- /dev/null +++ b/cd/python/docker/Dockerfile.test @@ -0,0 +1,39 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Python MXNet Dockerfile + +# NOTE: Assumes 'ci' directory is root of the context when building + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# Install test dependencies +RUN pip install nose + +ARG USER_ID=1001 +ARG GROUP_ID=1001 + +COPY ./docker/install/ubuntu_adduser.sh /work/ubuntu_adduser.sh +COPY ./docker/install/requirements /work/requirements + +RUN mkdir -p /work +RUN /work/ubuntu_adduser.sh +RUN pip install -r /work/requirements + +WORKDIR /work/mxnet diff --git a/cd/python/docker/Jenkins_pipeline.groovy b/cd/python/docker/Jenkins_pipeline.groovy new file mode 100644 index 000000000000..0d4925e00576 --- /dev/null +++ b/cd/python/docker/Jenkins_pipeline.groovy @@ -0,0 +1,74 @@ +// -*- mode: groovy -*- + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Jenkins pipeline +// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ + +// NOTE: +// ci_utils and cd_utils are loaded by the originating Jenkins job, e.g. jenkins/Jenkinsfile_release_job + +def get_pipeline(mxnet_variant) { + def node_type = mxnet_variant.startsWith('cu') ? 
NODE_LINUX_GPU : NODE_LINUX_CPU + return cd_utils.generic_pipeline(mxnet_variant, this, node_type) +} + +// Returns the (Docker) environment for the given variant +// The environment corresponds to the docker files in the 'docker' directory +def get_environment(mxnet_variant) { + if (mxnet_variant.startsWith("cu")) { + // Remove 'mkl' suffix from variant to properly format test environment + return "ubuntu_gpu_${mxnet_variant.replace('mkl', '')}" + } + return "ubuntu_cpu" +} + + +def build(mxnet_variant) { + ws("workspace/python_docker/${mxnet_variant}/${env.BUILD_NUMBER}") { + ci_utils.init_git() + cd_utils.restore_static_libmxnet(mxnet_variant) + + // package wheel file + def nvidia_docker = mxnet_variant.startsWith('cu') + def environment = get_environment(mxnet_variant) + ci_utils.docker_run(environment, "cd_package_pypi ${mxnet_variant}", nvidia_docker) + + // build python docker images + sh "./cd/python/docker/python_images.sh build ${mxnet_variant} py3" + sh "./cd/python/docker/python_images.sh build ${mxnet_variant} py2" + } +} + +def test(mxnet_variant) { + ws("workspace/python_docker/${mxnet_variant}/${env.BUILD_NUMBER}") { + // test python docker images + sh "./cd/python/docker/python_images.sh test ${mxnet_variant} py3" + sh "./cd/python/docker/python_images.sh test ${mxnet_variant} py2" + } +} + +def push(mxnet_variant) { + ws("workspace/python_docker/${mxnet_variant}/${env.BUILD_NUMBER}") { + // push python docker images + sh "./cd/python/docker/python_images.sh push ${mxnet_variant} py3" + sh "./cd/python/docker/python_images.sh push ${mxnet_variant} py2" + } +} + +return this diff --git a/cd/python/docker/python_images.sh b/cd/python/docker/python_images.sh new file mode 100755 index 000000000000..305676d2c40a --- /dev/null +++ b/cd/python/docker/python_images.sh @@ -0,0 +1,128 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Executes mxnet python images pipeline functions: build, test, push +# Assumes script is run from the root of the mxnet repository +# Assumes script is being run within MXNet CD infrastructure + +set -xe + +usage="Usage: python_images.sh <build|test|push> MXNET-VARIANT <py2|py3>" + +command=${1:?$usage} +mxnet_variant=${2:?$usage} +python_version=${3:?$usage} + +cd_utils='cd/utils' +ci_utils='ci/' + +case ${python_version} in + py3) + python_cmd="python3" + ;; + py2) + python_cmd="python" + ;; + *) + echo "Error: specify python version with either 'py2' or 'py3'" + exit 1 + ;; +esac + +docker_tags=($(./${cd_utils}/docker_tag.sh ${mxnet_variant})) +main_tag="${docker_tags[0]}_${python_version}" +base_image=$(./${cd_utils}/mxnet_base_image.sh ${mxnet_variant}) +repository="python" +image_name="${repository}:${main_tag}" + +resources_path='cd/python/docker' + +if [ !
-z "${RELEASE_DOCKERHUB_REPOSITORY}" ]; then + image_name="${RELEASE_DOCKERHUB_REPOSITORY}/${image_name}" +fi + +build() { + # NOTE: Ensure the correct context root is passed in when building - Dockerfile expects ./wheel_build + docker build -t "${image_name}" --build-arg PYTHON_CMD=${python_cmd} --build-arg BASE_IMAGE="${base_image}" --build-arg MXNET_COMMIT_ID=${GIT_COMMIT} -f ${resources_path}/Dockerfile ./wheel_build +} + +test() { + local runtime_param="" + if [[ ${mxnet_variant} == cu* ]]; then + runtime_param="--runtime=nvidia" + fi + local test_image_name="${image_name}_test" + + # Ensure the correct context root is passed in when building - Dockerfile.test expects ci directory + docker build -t "${test_image_name}" --build-arg USER_ID=`id -u` --build-arg GROUP_ID=`id -g` --build-arg BASE_IMAGE="${image_name}" -f ${resources_path}/Dockerfile.test ./ci + ./ci/safe_docker_run.py ${runtime_param} --cap-add "SYS_PTRACE" -u `id -u`:`id -g` -v `pwd`:/work/mxnet "${test_image_name}" ${resources_path}/test_python_image.sh "${mxnet_variant}" "${python_cmd}" +} + +push() { + if [ -z "${RELEASE_DOCKERHUB_REPOSITORY}" ]; then + echo "Cannot publish image without RELEASE_DOCKERHUB_REPOSITORY environment variable being set." 
+ exit 1 + fi + + # The secret name env var is set in the Jenkins configuration + # Manage Jenkins -> Configure System + ./${ci_utils}/docker_login.py --secret-name "${RELEASE_DOCKERHUB_SECRET_NAME}" + + # Push image + docker push "${image_name}" + + # Iterate over remaining tags, if any + for ((i=1;i<${#docker_tags[@]};i++)); do + local docker_tag="${docker_tags[${i}]}" + local latest_image_name="${RELEASE_DOCKERHUB_REPOSITORY}/${repository}:${docker_tag}" + + # latest and latest_gpu should only be pushed for py3 + if [[ ${docker_tag} == "latest" || ${docker_tag} == "latest_gpu" ]]; then + if [[ ${python_version} == "py2" ]]; then + continue + fi + else + latest_image_name="${latest_image_name}_${python_version}" + fi + + docker tag "${image_name}" "${latest_image_name}" + docker push "${latest_image_name}" + echo "Successfully pushed ${latest_image_name}. Pull it with:" + echo "docker pull ${latest_image_name}" + echo "For a complete list of tags see https://hub.docker.com/r/${RELEASE_DOCKERHUB_REPOSITORY}/${repository}" + done +} + +case ${command} in + "build") + build + ;; + + "test") + test + ;; + + "push") + push + ;; + + *) + echo "$usage" + exit 1 +esac diff --git a/cd/python/docker/test_python_image.sh b/cd/python/docker/test_python_image.sh new file mode 100755 index 000000000000..88e03ea84a12 --- /dev/null +++ b/cd/python/docker/test_python_image.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# To be run _within_ a runtime image +# Tests the Runtime docker image +# Assumes the mxnet source directory is mounted on /work/mxnet and cwd is /work/mxnet + +set -ex + +# Variant parameter should be passed in +mxnet_variant=${1:?"Missing mxnet variant"} +python_cmd=${2:?"Missing python version (python or python3)"} + +if [ -z "${MXNET_COMMIT_ID}" ]; then + echo "MXNET_COMMIT_ID environment variable is empty. Please rebuild the image with MXNET_COMMIT_ID build-arg specified." + exit 1 +fi + +# Execute tests +if [[ $mxnet_variant == cu* ]]; then + mnist_params="--gpu 0" + test_conv_params="--gpu" +fi + +if [[ $mxnet_variant == *mkl ]]; then + ${python_cmd} tests/python/mkl/test_mkldnn.py +fi + +${python_cmd} tests/python/train/test_conv.py ${test_conv_params} +${python_cmd} example/image-classification/train_mnist.py ${mnist_params} + diff --git a/cd/python/pypi/Jenkins_pipeline.groovy b/cd/python/pypi/Jenkins_pipeline.groovy index bf8103270146..e9f172a570fe 100644 --- a/cd/python/pypi/Jenkins_pipeline.groovy +++ b/cd/python/pypi/Jenkins_pipeline.groovy @@ -45,7 +45,7 @@ def get_environment(mxnet_variant) { def build(mxnet_variant) { ws("workspace/python_pypi/${mxnet_variant}/${env.BUILD_NUMBER}") { ci_utils.init_git() - cd_utils.restore_artifact(mxnet_variant, 'static') + cd_utils.restore_static_libmxnet(mxnet_variant) // create wheel file def environment = get_environment(mxnet_variant) diff --git a/cd/utils/docker_tag.sh b/cd/utils/docker_tag.sh new file mode 100755 index 000000000000..d16da4930774 --- /dev/null +++ b/cd/utils/docker_tag.sh @@ -0,0 +1,59 @@
+#!/usr/bin/env bash +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +mxnet_variant=${1:?"Please specify the mxnet variant as the first parameter"} +is_release=${RELEASE_BUILD:-false} +version=${VERSION:-nightly} + +# The docker tags will be in the form <version>_<cpu|gpu_cuXX>(_mkl) +# Eg. nightly_cpu, 1.4.0_cpu_mkl, nightly_gpu_cu80_mkl, etc. + +if [[ ${mxnet_variant} == "cpu" ]]; then + tag_suffix="cpu" +elif [[ ${mxnet_variant} == "mkl" ]]; then + tag_suffix="cpu_mkl" +elif [[ ${mxnet_variant} == cu* ]]; then + tag_suffix="gpu_${mxnet_variant}" + + # *mkl => *_mkl + if [[ $tag_suffix == *mkl ]]; then + tag_suffix="${tag_suffix:0:${#tag_suffix}-3}_mkl" + fi +else + echo "Error: Unrecognized mxnet variant: '${mxnet_variant}'."
+ exit 1 +fi + +echo "${version}_${tag_suffix}" + +# Print out latest tags as well +if [[ ${is_release} == "true" ]]; then + if [[ ${mxnet_variant} == "cpu" ]]; then + echo "latest" + echo "latest_cpu" + elif [[ ${mxnet_variant} == "mkl" ]]; then + echo "latest_cpu_mkl" + elif [[ ${mxnet_variant} == "cu90" ]]; then + echo "latest_gpu" + elif [[ ${mxnet_variant} == "cu90mkl" ]]; then + echo "latest_gpu_mkl" + fi +fi diff --git a/cd/utils/mxnet_base_image.sh b/cd/utils/mxnet_base_image.sh new file mode 100755 index 000000000000..dcfe7216dcb4 --- /dev/null +++ b/cd/utils/mxnet_base_image.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +mxnet_variant=${1:?"Please specify the mxnet variant as the first parameter"} + +case ${mxnet_variant} in + cu80*) + echo "nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04" + ;; + cu90*) + echo "nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04" + ;; + cu92*) + echo "nvidia/cuda:9.2-cudnn7-runtime-ubuntu16.04" + ;; + cu100*) + echo "nvidia/cuda:10.0-cudnn7-runtime-ubuntu16.04" + ;; + cu101*) + echo "nvidia/cuda:10.1-cudnn7-runtime-ubuntu16.04" + ;; + cpu) + echo "ubuntu:16.04" + ;; + mkl) + echo "ubuntu:16.04" + ;; + *) + echo "Error: Unrecognized mxnet-variant: '${mxnet_variant}'" + exit 1 + ;; +esac diff --git a/ci/build.py b/ci/build.py index e6a183fefa34..8798c7ed2ef2 100755 --- a/ci/build.py +++ b/ci/build.py @@ -27,63 +27,17 @@ import argparse import glob -import logging -import os +import pprint import re import shutil +import signal import subprocess -import sys -import tempfile from itertools import chain from subprocess import check_call, check_output from typing import * -from util import * -import docker -import docker.models -import docker.errors -import signal -import atexit -import pprint - - -class Cleanup: - """A class to cleanup containers""" - def __init__(self): - self.containers = set() - self.docker_stop_timeout = 3 - def add_container(self, container: docker.models.containers.Container): - assert isinstance(container, docker.models.containers.Container) - self.containers.add(container) - - def remove_container(self, container: docker.models.containers.Container): - assert isinstance(container, docker.models.containers.Container) - self.containers.remove(container) - - def _cleanup_containers(self): - if self.containers: - logging.warning("Cleaning up containers") - else: - return - # noinspection PyBroadException - try: - stop_timeout = int(os.environ.get("DOCKER_STOP_TIMEOUT", self.docker_stop_timeout)) - except Exception: - stop_timeout = 3 - for container in self.containers: - try: - container.stop(timeout=stop_timeout) - logging.info("☠: stopped 
container %s", trim_container_id(container.id)) - container.remove() - logging.info("🚽: removed container %s", trim_container_id(container.id)) - except Exception as e: - logging.exception(e) - self.containers.clear() - logging.info("Cleaning up containers finished.") - - def __call__(self): - """Perform cleanup""" - self._cleanup_containers() +from safe_docker_run import SafeDockerClient +from util import * def get_dockerfiles_path(): @@ -205,18 +159,13 @@ def default_ccache_dir() -> str: return os.path.join(os.path.expanduser("~"), ".ccache") -def trim_container_id(cid): - """:return: trimmed container id""" - return cid[:12] - - -def container_run(platform: str, +def container_run(docker_client: SafeDockerClient, + platform: str, nvidia_runtime: bool, docker_registry: str, shared_memory_size: str, local_ccache_dir: str, command: List[str], - cleanup: Cleanup, environment: Dict[str, str], dry_run: bool = False) -> int: """Run command in a container""" @@ -232,13 +181,6 @@ def container_run(platform: str, 'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache # verification. 
}) - # These variables are passed to the container to the process tree killer can find runaway - # process inside the container - # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller - # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393 - # - jenkins_env_vars = ['BUILD_NUMBER', 'BUILD_ID', 'BUILD_TAG'] - environment.update({k: os.environ[k] for k in jenkins_env_vars if k in os.environ}) environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ}) tag = get_docker_tag(platform=platform, registry=docker_registry) @@ -248,7 +190,7 @@ def container_run(platform: str, os.makedirs(local_build_folder, exist_ok=True) os.makedirs(local_ccache_dir, exist_ok=True) logging.info("Using ccache directory: %s", local_ccache_dir) - docker_client = docker.from_env() + # Equivalent command docker_cmd_list = [ get_docker_binary(nvidia_runtime), @@ -276,8 +218,7 @@ def container_run(platform: str, docker_cmd = ' \\\n\t'.join(docker_cmd_list) logging.info("Running %s in container %s", command, tag) logging.info("Executing the equivalent of:\n%s\n", docker_cmd) - # return code of the command inside docker - ret = 0 + if not dry_run: ############################# # @@ -288,10 +229,10 @@ def container_run(platform: str, # noinspection PyShadowingNames # runc is default (docker info | grep -i runtime) runtime = 'nvidia' - container = docker_client.containers.run( + + return docker_client.run( tag, runtime=runtime, - detach=True, command=command, shm_size=shared_memory_size, user='{}:{}'.format(os.getuid(), os.getgid()), @@ -305,61 +246,7 @@ def container_run(platform: str, {'bind': '/work/ccache', 'mode': 'rw'}, }, environment=environment) - try: - logging.info("Started container: %s", trim_container_id(container.id)) - # Race condition: - # If the previous call is interrupted then it's possible that the container is not cleaned up - # We avoid by masking the signals temporarily - 
cleanup.add_container(container) - signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM}) - # - ############################# - - stream = container.logs(stream=True, stdout=True, stderr=True) - sys.stdout.flush() - for chunk in stream: - sys.stdout.buffer.write(chunk) - sys.stdout.buffer.flush() - sys.stdout.flush() - stream.close() - try: - logging.info("Waiting for status of container %s for %d s.", - trim_container_id(container.id), - container_wait_s) - wait_result = container.wait(timeout=container_wait_s) - logging.info("Container exit status: %s", wait_result) - ret = wait_result.get('StatusCode', 200) - if ret != 0: - logging.error("Container exited with an error 😞") - logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv)) - else: - logging.info("Container exited with success 👍") - except Exception as e: - logging.exception(e) - ret = 150 - - # Stop - try: - logging.info("Stopping container: %s", trim_container_id(container.id)) - container.stop() - except Exception as e: - logging.exception(e) - ret = 151 - - # Remove - try: - logging.info("Removing container: %s", trim_container_id(container.id)) - container.remove() - except Exception as e: - logging.exception(e) - ret = 152 - cleanup.remove_container(container) - containers = docker_client.containers.list() - if containers: - logging.info("Other running containers: %s", [trim_container_id(x.id) for x in containers]) - except docker.errors.NotFound as e: - logging.info("Container was stopped before cleanup started: %s", e) - return ret + return 0 def list_platforms() -> str: @@ -388,17 +275,6 @@ def log_environment(): logging.debug("Build environment: %s", pp.pformat(dict(os.environ))) -def script_name() -> str: - """:returns: script name with leading paths removed""" - return os.path.split(sys.argv[0])[1] - -def config_logging(): - import time - logging.getLogger().setLevel(logging.INFO) - logging.getLogger("requests").setLevel(logging.WARNING) - 
logging.basicConfig(format='{}: %(asctime)sZ %(levelname)s %(message)s'.format(script_name())) - logging.Formatter.converter = time.gmtime - def main() -> int: config_logging() @@ -471,20 +347,7 @@ def main() -> int: command = list(chain(*args.command)) docker_binary = get_docker_binary(args.nvidiadocker) - - # Cleanup on signals and exit - cleanup = Cleanup() - - def signal_handler(signum, _): - signal.pthread_sigmask(signal.SIG_BLOCK, {signum}) - logging.warning("Signal %d received, cleaning up...", signum) - cleanup() - logging.warning("done. Exiting with error.") - sys.exit(1) - - atexit.register(cleanup) - signal.signal(signal.SIGTERM, signal_handler) - signal.signal(signal.SIGINT, signal_handler) + docker_client = SafeDockerClient() environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e])) for e in args.environment]) @@ -498,7 +361,7 @@ def signal_handler(signum, _): load_docker_cache(tag=tag, docker_registry=args.docker_registry) if not args.run_only: build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry, - num_retries=args.docker_build_retries, no_cache=args.no_cache) + num_retries=args.docker_build_retries, no_cache=args.no_cache) else: logging.info("Skipping docker build step.") @@ -510,23 +373,23 @@ def signal_handler(signum, _): ret = 0 if command: ret = container_run( - platform=platform, nvidia_runtime=args.nvidiadocker, + docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, - local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment) + local_ccache_dir=args.ccache_dir, environment=environment) elif args.print_docker_run: command = [] ret = container_run( - platform=platform, nvidia_runtime=args.nvidiadocker, + docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, 
docker_registry=args.docker_registry, - local_ccache_dir=args.ccache_dir, dry_run=True, cleanup=cleanup, environment=environment) + local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment) else: # With no commands, execute a build function for the target platform command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)] logging.info("No command specified, trying default build: %s", ' '.join(command)) ret = container_run( - platform=platform, nvidia_runtime=args.nvidiadocker, + docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, - local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment) + local_ccache_dir=args.ccache_dir, environment=environment) if ret != 0: logging.critical("Execution of %s failed with status: %d", command, ret) @@ -553,9 +416,9 @@ def signal_handler(signum, _): continue command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform] container_run( - platform=platform, nvidia_runtime=args.nvidiadocker, + docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, - local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment) + local_ccache_dir=args.ccache_dir, environment=environment) shutil.move(buildir(), plat_buildir) logging.info("Built files left in: %s", plat_buildir) diff --git a/ci/docker_cache.py b/ci/docker_cache.py index 3a2a1fb415ee..254d6237d6e2 100755 --- a/ci/docker_cache.py +++ b/ci/docker_cache.py @@ -24,21 +24,21 @@ state as if the container would have been built locally already. 
""" -import os -import logging import argparse -import sys +import logging +import os import subprocess -import json +import sys from typing import * + import build as build_util +from docker_login import login_dockerhub, logout_dockerhub from util import retry -DOCKERHUB_LOGIN_NUM_RETRIES = 5 -DOCKERHUB_RETRY_SECONDS = 5 DOCKER_CACHE_NUM_RETRIES = 3 DOCKER_CACHE_TIMEOUT_MINS = 45 PARALLEL_BUILDS = 10 +DOCKER_CACHE_RETRY_SECONDS = 5 def build_save_containers(platforms, registry, load_cache) -> int: @@ -111,41 +111,8 @@ def _upload_image(registry, docker_tag, image_id) -> None: subprocess.check_call(push_cmd) -@retry(target_exception=subprocess.CalledProcessError, tries=DOCKERHUB_LOGIN_NUM_RETRIES, - delay_s=DOCKERHUB_RETRY_SECONDS) -def _login_dockerhub(): - """ - Login to the Docker Hub account - :return: None - """ - dockerhub_credentials = _get_dockerhub_credentials() - - logging.info('Logging in to DockerHub') - # We use password-stdin instead of --password to avoid leaking passwords in case of an error. - # This method will produce the following output: - # > WARNING! Your password will be stored unencrypted in /home/jenkins_slave/.docker/config.json. - # > Configure a credential helper to remove this warning. See - # > https://docs.docker.com/engine/reference/commandline/login/#credentials-store - # Since we consider the restricted slaves a secure environment, that's fine. Also, using this will require - # third party applications which would need a review first as well. 
- p = subprocess.run(['docker', 'login', '--username', dockerhub_credentials['username'], '--password-stdin'], - stdout=subprocess.PIPE, input=str.encode(dockerhub_credentials['password'])) - logging.info(p.stdout) - logging.info('Successfully logged in to DockerHub') - - -def _logout_dockerhub(): - """ - Log out of DockerHub to delete local credentials - :return: None - """ - logging.info('Logging out of DockerHub') - subprocess.call(['docker', 'logout']) - logging.info('Successfully logged out of DockerHub') - - @retry(target_exception=subprocess.TimeoutExpired, tries=DOCKER_CACHE_NUM_RETRIES, - delay_s=DOCKERHUB_RETRY_SECONDS) + delay_s=DOCKER_CACHE_RETRY_SECONDS) def load_docker_cache(registry, docker_tag) -> None: """ Load the precompiled docker cache from the registry @@ -187,37 +154,6 @@ def delete_local_docker_cache(docker_tag): logging.debug('Error during local cache deletion %s', error) -def _get_dockerhub_credentials(): # pragma: no cover - import boto3 - import botocore - secret_name = os.environ['DOCKERHUB_SECRET_NAME'] - endpoint_url = os.environ['DOCKERHUB_SECRET_ENDPOINT_URL'] - region_name = os.environ['DOCKERHUB_SECRET_ENDPOINT_REGION'] - - session = boto3.Session() - client = session.client( - service_name='secretsmanager', - region_name=region_name, - endpoint_url=endpoint_url - ) - try: - get_secret_value_response = client.get_secret_value( - SecretId=secret_name - ) - except botocore.exceptions.ClientError as client_error: - if client_error.response['Error']['Code'] == 'ResourceNotFoundException': - logging.exception("The requested secret %s was not found", secret_name) - elif client_error.response['Error']['Code'] == 'InvalidRequestException': - logging.exception("The request was invalid due to:") - elif client_error.response['Error']['Code'] == 'InvalidParameterException': - logging.exception("The request had invalid params:") - raise - else: - secret = get_secret_value_response['SecretString'] - secret_dict = json.loads(secret) - return 
secret_dict - - def main() -> int: """ Utility to create and publish the Docker cache to Docker Hub @@ -248,11 +184,16 @@ def script_name() -> str: args = parser.parse_args() platforms = build_util.get_platforms() + + secret_name = os.environ['DOCKERHUB_SECRET_NAME'] + endpoint_url = os.environ['DOCKERHUB_SECRET_ENDPOINT_URL'] + region_name = os.environ['DOCKERHUB_SECRET_ENDPOINT_REGION'] + try: - _login_dockerhub() + login_dockerhub(secret_name, endpoint_url, region_name) return build_save_containers(platforms=platforms, registry=args.docker_registry, load_cache=True) finally: - _logout_dockerhub() + logout_dockerhub() if __name__ == '__main__': diff --git a/ci/docker_login.py b/ci/docker_login.py new file mode 100755 index 000000000000..b3b4d46e17ce --- /dev/null +++ b/ci/docker_login.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import argparse +import json +import logging +import os +import subprocess +import sys + +from util import retry, config_logging + +DOCKERHUB_LOGIN_NUM_RETRIES = 5 +DOCKERHUB_RETRY_SECONDS = 5 + + +def _get_dockerhub_credentials(secret_name: str, secret_endpoint_url: str, secret_endpoint_region_name: str): + import boto3 + import botocore + + session = boto3.Session() + client = session.client( + service_name='secretsmanager', + region_name=secret_endpoint_region_name, + endpoint_url=secret_endpoint_url + ) + try: + get_secret_value_response = client.get_secret_value( + SecretId=secret_name + ) + except botocore.exceptions.ClientError as client_error: + if client_error.response['Error']['Code'] == 'ResourceNotFoundException': + logging.exception("The requested secret %s was not found", secret_name) + elif client_error.response['Error']['Code'] == 'InvalidRequestException': + logging.exception("The request was invalid due to:") + elif client_error.response['Error']['Code'] == 'InvalidParameterException': + logging.exception("The request had invalid params:") + raise + else: + secret = get_secret_value_response['SecretString'] + secret_dict = json.loads(secret) + return secret_dict + + +@retry(target_exception=subprocess.CalledProcessError, tries=DOCKERHUB_LOGIN_NUM_RETRIES, + delay_s=DOCKERHUB_RETRY_SECONDS) +def login_dockerhub(secret_name: str, secret_endpoint_url: str, secret_endpoint_region_name: str): + """ + Login to the Docker Hub account + :return: None + """ + dockerhub_credentials = _get_dockerhub_credentials(secret_name, secret_endpoint_url, secret_endpoint_region_name) + + logging.info('Logging in to DockerHub') + # We use password-stdin instead of --password to avoid leaking passwords in case of an error. + # This method will produce the following output: + # > WARNING! Your password will be stored unencrypted in /home/jenkins_slave/.docker/config.json. + # > Configure a credential helper to remove this warning. 
See + # > https://docs.docker.com/engine/reference/commandline/login/#credentials-store + # Since we consider the restricted slaves a secure environment, that's fine. Also, using this will require + # third party applications which would need a review first as well. + p = subprocess.run(['docker', 'login', '--username', dockerhub_credentials['username'], '--password-stdin'], + stdout=subprocess.PIPE, input=str.encode(dockerhub_credentials['password'])) + logging.info(p.stdout) + if p.returncode == 0: + logging.info('Successfully logged in to DockerHub') + return + + raise RuntimeError("Failed to login to DockerHub") + + +def logout_dockerhub(): + """ + Log out of DockerHub to delete local credentials + :return: None + """ + logging.info('Logging out of DockerHub') + subprocess.call(['docker', 'logout']) + logging.info('Successfully logged out of DockerHub') + + +def main(command_line_arguments): + config_logging() + + parser = argparse.ArgumentParser( + description="Safe docker login utility to avoid leaking passwords", + epilog="" + ) + parser.add_argument("--secret-name", + help="Secret name", + type=str, + required=True) + + parser.add_argument("--secret-endpoint-url", + help="Endpoint Url", + type=str, + default=os.environ.get("DOCKERHUB_SECRET_ENDPOINT_URL", None)) + + parser.add_argument("--secret-endpoint-region", + help="AWS Region", + type=str, + default=os.environ.get("DOCKERHUB_SECRET_ENDPOINT_REGION", None)) + + args = parser.parse_args(args=command_line_arguments) + + if args.secret_endpoint_url is None: + raise RuntimeError("Could not determine secret-endpoint-url, please specify with --secret-endpoint-url") + + if args.secret_endpoint_region is None: + raise RuntimeError("Could not determine secret-endpoint-region, please specify with --secret-endpoint-region") + + try: + login_dockerhub(args.secret_name, args.secret_endpoint_url, args.secret_endpoint_region) + except Exception as err: + logging.exception(err) + exit(1) + + +if __name__ == 
'__main__': + main(sys.argv[1:]) diff --git a/ci/logging.conf b/ci/logging.conf new file mode 100644 index 000000000000..d80365e27bf1 --- /dev/null +++ b/ci/logging.conf @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[loggers] +keys=root + +[handlers] +keys=consoleHandler + +[formatters] +keys=simpleFormatter + +[logger_root] +level=DEBUG +handlers=consoleHandler + +[handler_consoleHandler] +class=StreamHandler +level=DEBUG +formatter=simpleFormatter +args=(sys.stdout,) + +[formatter_simpleFormatter] +format=%(asctime)s - %(name)s - %(levelname)s - %(message)s +datefmt= \ No newline at end of file diff --git a/ci/safe_docker_run.py b/ci/safe_docker_run.py new file mode 100755 index 000000000000..e3b55bccdff8 --- /dev/null +++ b/ci/safe_docker_run.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Docker command wrapper to guard against Zombie containers +""" + +import argparse +import atexit +import logging +import os +import signal +import sys +from functools import reduce +from itertools import chain +from typing import Dict, Any + +import docker +from docker.errors import NotFound +from docker.models.containers import Container + +from util import config_logging + +DOCKER_STOP_TIMEOUT_SECONDS = 3 +CONTAINER_WAIT_SECONDS = 600 + + +class SafeDockerClient: + """ + A wrapper around the docker client to ensure that no zombie containers are left hanging around + in case the script is not allowed to finish normally + """ + + @staticmethod + def _trim_container_id(cid): + """:return: trimmed container id""" + return cid[:12] + + def __init__(self): + self._docker_client = docker.from_env() + self._containers = set() + self._docker_stop_timeout = DOCKER_STOP_TIMEOUT_SECONDS + self._container_wait_seconds = CONTAINER_WAIT_SECONDS + + def signal_handler(signum, _): + signal.pthread_sigmask(signal.SIG_BLOCK, {signum}) + logging.warning("Signal %d received, cleaning up...", signum) + self._clean_up() + logging.warning("done. 
Exiting with error.") + sys.exit(1) + + atexit.register(self._clean_up) + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) + + def _clean_up(self): + if self._containers: + logging.warning("Cleaning up containers") + else: + return + # noinspection PyBroadException + try: + stop_timeout = int(os.environ.get("DOCKER_STOP_TIMEOUT", self._docker_stop_timeout)) + except Exception: + stop_timeout = 3 + for container in self._containers: + try: + container.stop(timeout=stop_timeout) + logging.info("☠: stopped container %s", self._trim_container_id(container.id)) + container.remove() + logging.info("🚽: removed container %s", self._trim_container_id(container.id)) + except Exception as e: + logging.exception(e) + self._containers.clear() + logging.info("Cleaning up containers finished.") + + def _add_container(self, container: Container) -> Container: + self._containers.add(container) + return container + + def _remove_container(self, container: Container): + self._containers.remove(container) + + def run(self, *args, **kwargs) -> int: + if "detach" in kwargs and kwargs.get("detach") is False: + raise ValueError("Can only safe run with 'detach' set to True") + else: + kwargs["detach"] = True + + # These variables are passed to the container so the process tree killer can find runaway + # process inside the container + # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller + # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393 + if "environment" not in kwargs: + kwargs["environment"] = {} + + jenkins_env_vars = ["BUILD_NUMBER", "BUILD_ID", "BUILD_TAG"] + kwargs["environment"].update({k: os.environ[k] for k in jenkins_env_vars if k in os.environ}) + + ret = 0 + try: + # Race condition: + # If the call to docker_client.containers.run is interrupted, it is possible that + # the container won't be cleaned up. 
We avoid this by temporarily masking the signals. + signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM}) + container = self._add_container(self._docker_client.containers.run(*args, **kwargs)) + signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM}) + logging.info("Started container: %s", self._trim_container_id(container.id)) + stream = container.logs(stream=True, stdout=True, stderr=True) + sys.stdout.flush() + for chunk in stream: + sys.stdout.buffer.write(chunk) + sys.stdout.buffer.flush() + sys.stdout.flush() + stream.close() + + try: + logging.info("Waiting for status of container %s for %d s.", + self._trim_container_id(container.id), + self._container_wait_seconds) + wait_result = container.wait(timeout=self._container_wait_seconds) + logging.info("Container exit status: %s", wait_result) + ret = wait_result.get('StatusCode', 200) + if ret != 0: + logging.error("Container exited with an error 😞") + logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv)) + else: + logging.info("Container exited with success 👍") + except Exception as err: + logging.exception(err) + return 150 + + try: + logging.info("Stopping container: %s", self._trim_container_id(container.id)) + container.stop() + except Exception as e: + logging.exception(e) + ret = 151 + + try: + logging.info("Removing container: %s", self._trim_container_id(container.id)) + container.remove() + except Exception as e: + logging.exception(e) + ret = 152 + self._remove_container(container) + containers = self._docker_client.containers.list() + if containers: + logging.info("Other running containers: %s", [self._trim_container_id(x.id) for x in containers]) + except NotFound as e: + logging.info("Container was stopped before cleanup started: %s", e) + + return ret + + +def _volume_mount(volume_dfn: str) -> Dict[str, Any]: + """ + Converts docker volume mount format, e.g. 
docker run --volume /local/path:/container/path:ro + to an object understood by the python docker library, e.g. {"local/path": {"bind": "/container/path", "mode": "ro"}} + This is used by the argparser for automatic conversion and input validation. + If the mode is not specified, 'rw' is assumed. + :param volume_dfn: A string to convert to a volume mount object in the format <local path>:<container path>[:ro|rw] + :return: An object in the form {"<local path>" : {"bind": "<container path>", "mode": "rw|ro"}} + """ + if volume_dfn is None: + raise argparse.ArgumentTypeError("Missing value for volume definition") + + parts = volume_dfn.split(":") + + if len(parts) < 2 or len(parts) > 3: + raise argparse.ArgumentTypeError("Invalid volume definition {}".format(volume_dfn)) + + mode = "rw" + if len(parts) == 3: + mode = parts[2] + + if mode not in ["rw", "ro"]: + raise argparse.ArgumentTypeError("Invalid volume mount mode {} in volume definition {}".format(mode, volume_dfn)) + + return {parts[0]: {"bind": parts[1], "mode": mode}} + + +def main(command_line_arguments): + config_logging() + + parser = argparse.ArgumentParser( + description="""Wrapper around docker run that protects against Zombie containers""", epilog="") + + parser.add_argument("-u", "--user", + help="Username or UID (format: <name|uid>[:<group|gid>])", + default=None) + + parser.add_argument("-v", "--volume", + action='append', + type=_volume_mount, + help="Bind mount a volume", + default=[]) + + parser.add_argument("--cap-add", + help="Add Linux capabilities", + action="append", + type=str, + default=[]) + + parser.add_argument("--runtime", + help="Runtime to use for this container", + default=None) + + parser.add_argument("--name", + help="Assign a name to the container", + default=None) + + parser.add_argument("image", metavar="IMAGE") + parser.add_argument("command", metavar="COMMAND") + parser.add_argument("args", nargs='*', metavar="ARG") + + args = parser.parse_args(args=command_line_arguments) + docker_client = SafeDockerClient() + return docker_client.run(args.image,
**{ + "command": " ".join(list(chain([args.command] + args.args))), + "user": args.user, + "runtime": args.runtime, + "name": args.name, + "volumes": reduce(lambda dct, v: {**dct, **v}, args.volume, {}), + "cap_add": args.cap_add + }) + + +if __name__ == "__main__": + exit(main(sys.argv[1:])) diff --git a/ci/test_docker_cache.py b/ci/test_docker_cache.py index 0a3bc4640c05..aeb399ff6b45 100644 --- a/ci/test_docker_cache.py +++ b/ci/test_docker_cache.py @@ -88,7 +88,7 @@ def setUp(self): base = os.path.split(os.path.realpath(__file__))[0] os.chdir(base) - docker_cache._login_dockerhub = MagicMock() # Override login + docker_cache.login_dockerhub = MagicMock() # Override login # Stop in case previous execution was dirty try: @@ -135,7 +135,7 @@ def test_full_cache(self): """ platform = 'test_full_cache' docker_tag = build_util.get_docker_tag(platform=platform, registry=DOCKER_REGISTRY_PATH) - dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.' + platform) + dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.build.' 
+ platform) try: with open(dockerfile_path, 'w') as dockerfile_handle: dockerfile_handle.write(dockerfile_content) @@ -144,13 +144,25 @@ def test_full_cache(self): docker_cache.delete_local_docker_cache(docker_tag=docker_tag) def warm_up_lambda_func(): - build_util.build_docker(docker_binary='docker', platform=platform, registry=DOCKER_REGISTRY_PATH) + build_util.build_docker( + docker_binary='docker', + platform=platform, + registry=DOCKER_REGISTRY_PATH, + num_retries=3, + no_cache=False + ) _assert_docker_build(lambda_func=warm_up_lambda_func, expected_cache_hit_count=0, expected_cache_miss_count=4) # Assert local cache is properly primed def primed_cache_lambda_func(): - build_util.build_docker(docker_binary='docker', platform=platform, registry=DOCKER_REGISTRY_PATH) + build_util.build_docker( + docker_binary='docker', + platform=platform, + registry=DOCKER_REGISTRY_PATH, + num_retries=3, + no_cache=False + ) _assert_docker_build(lambda_func=primed_cache_lambda_func, expected_cache_hit_count=4, expected_cache_miss_count=0) @@ -169,8 +181,6 @@ def clean_cache_lambda_func(): os.remove(dockerfile_path) docker_cache.delete_local_docker_cache(docker_tag=docker_tag) - - def test_partial_cache(self): """ Test whether it's possible to restore cache and then pit it up partially by using a Dockerfile which shares @@ -196,7 +206,7 @@ def test_partial_cache(self): """ platform = 'test_partial_cache' docker_tag = build_util.get_docker_tag(platform=platform, registry=DOCKER_REGISTRY_PATH) - dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.' + platform) + dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.build.' 
+ platform) try: # Write initial Dockerfile with open(dockerfile_path, 'w') as dockerfile_handle: @@ -206,13 +216,25 @@ def test_partial_cache(self): docker_cache.delete_local_docker_cache(docker_tag=docker_tag) def warm_up_lambda_func(): - build_util.build_docker(docker_binary='docker', platform=platform, registry=DOCKER_REGISTRY_PATH) + build_util.build_docker( + docker_binary='docker', + platform=platform, + registry=DOCKER_REGISTRY_PATH, + num_retries=3, + no_cache=False + ) _assert_docker_build(lambda_func=warm_up_lambda_func, expected_cache_hit_count=0, expected_cache_miss_count=4) # Assert local cache is properly primed def primed_cache_lambda_func(): - build_util.build_docker(docker_binary='docker', platform=platform, registry=DOCKER_REGISTRY_PATH) + build_util.build_docker( + docker_binary='docker', + platform=platform, + registry=DOCKER_REGISTRY_PATH, + num_retries=3, + no_cache=False + ) _assert_docker_build(lambda_func=primed_cache_lambda_func, expected_cache_hit_count=4, expected_cache_miss_count=0) diff --git a/ci/test_docker_login.py b/ci/test_docker_login.py new file mode 100644 index 000000000000..6c989ade92ff --- /dev/null +++ b/ci/test_docker_login.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Docker login tests +""" +import os +import subprocess +import unittest +from unittest.mock import create_autospec, patch, call, MagicMock + +import boto3 +from boto3 import client +from botocore.stub import Stubber + +from docker_login import login_dockerhub, logout_dockerhub, main, DOCKERHUB_RETRY_SECONDS, DOCKERHUB_LOGIN_NUM_RETRIES + + +SECRET_NAME = "secret_name" +SECRET_ENDPOINT_URL = "https://endpoint.url" +SECRET_ENDPOINT_REGION = "us-east-2" + + +def mock_boto(num_calls: int = 1): + mock_client = client("secretsmanager", region_name="us-east-1") + mock_session = create_autospec(boto3.Session) + mock_session.client.return_value = mock_client + + # Stub get_secret_value response + stub = Stubber(mock_client) + for i in range(num_calls): + stub.add_response( + method="get_secret_value", + expected_params={ + "SecretId": "secret_name" # Matches os.environ['SECRET_NAME'] + }, service_response={ + "SecretString": """{"username": "myuser", "password": "mypass"}""" + }) + return mock_session, stub + + +class TestDockerLogin(unittest.TestCase): + + @patch("subprocess.run", name="mock_subprocess_run") + def test_docker_login_success(self, mock_run): + """ + Tests successful docker login returns True and calls docker appropriately + """ + mock_session, stub = mock_boto() + stub.activate() + with patch("boto3.Session", return_value=mock_session): + mock_process = MagicMock(auto_spec=subprocess.Popen, name="mock_process") + + # Simulate successful login + mock_process.returncode = 0 + mock_run.return_value = mock_process + + login_dockerhub(SECRET_NAME, SECRET_ENDPOINT_URL, SECRET_ENDPOINT_REGION) + + # Check boto client is properly created + print(mock_session.client.call_args_list) + assert mock_session.client.call_args_list == [ + call(service_name="secretsmanager", region_name="us-east-2", endpoint_url="https://endpoint.url") + ] + + # Check that login 
call passes in the password in the correct way + assert mock_run.call_args_list == [ + call( + ["docker", "login", "--username", "myuser", "--password-stdin"], + stdout=subprocess.PIPE, + input=str.encode("mypass") + ) + ] + stub.deactivate() + + @patch("subprocess.run", name="mock_subprocess_run") + @patch("time.sleep") + def test_docker_login_retry(self, mock_sleep, mock_run): + """ + Tests retry mechanism + """ + num_tries = 3 + mock_session, stub = mock_boto(num_calls=num_tries) + stub.activate() + with patch("boto3.Session", return_value=mock_session): + mock_process = MagicMock(auto_spec=subprocess.Popen, name="mock_process") + + # Simulate successful login + mock_process.returncode = 0 + + # Simulate (num_tries - 1) errors + 1 success + mock_run.side_effect = \ + [subprocess.CalledProcessError(1, "cmd", "some error")] * (num_tries - 1) + [mock_process] + + login_dockerhub(SECRET_NAME, SECRET_ENDPOINT_URL, SECRET_ENDPOINT_REGION) + + # Check boto client is properly created + print(mock_session.client.call_args_list) + assert mock_session.client.call_args_list == [ + call(service_name="secretsmanager", region_name="us-east-2", endpoint_url="https://endpoint.url") + ] * num_tries + + # Check that login call passes in the password in the correct way + cmd = ["docker", "login", "--username", "myuser", "--password-stdin"] + assert mock_run.call_args_list == [ + call(cmd, stdout=subprocess.PIPE, input=str.encode("mypass")) + ] * num_tries + + # Assert sleep was called appropriately + assert mock_sleep.call_args_list == [ + call(2 ** retry_num * DOCKERHUB_RETRY_SECONDS) for retry_num in range(0, num_tries - 1) + ] + stub.deactivate() + + @patch("subprocess.run", name="mock_subprocess_run") + @patch("time.sleep") + def test_docker_login_retry_exhausted(self, mock_sleep, mock_run): + """ + Tests retry mechanism + """ + num_tries = DOCKERHUB_LOGIN_NUM_RETRIES + mock_session, stub = mock_boto(num_calls=num_tries) + stub.activate() + with patch("boto3.Session", 
return_value=mock_session): + # Simulate num_tries errors + mock_run.side_effect = [subprocess.CalledProcessError(1, "cmd", "some error")] * num_tries + + with self.assertRaises(subprocess.CalledProcessError): + login_dockerhub(SECRET_NAME, SECRET_ENDPOINT_URL, SECRET_ENDPOINT_REGION) + + # Check boto client is properly created + assert mock_session.client.call_args_list == [ + call(service_name="secretsmanager", region_name="us-east-2", endpoint_url="https://endpoint.url") + ] * num_tries + + # Check that login call passes in the password in the correct way + cmd = ["docker", "login", "--username", "myuser", "--password-stdin"] + assert mock_run.call_args_list == [ + call(cmd, stdout=subprocess.PIPE, input=str.encode("mypass")) + ] * num_tries + + # Assert sleep was called appropriately + assert mock_sleep.call_args_list == [ + call(2 ** retry_num * DOCKERHUB_RETRY_SECONDS) for retry_num in range(0, num_tries-1) + ] + stub.deactivate() + + @patch("subprocess.run", name="mock_subprocess_run") + def test_docker_login_failed(self, mock_run): + """ + Tests failed docker login return false + """ + mock_session, stub = mock_boto() + stub.activate() + with patch("boto3.Session", return_value=mock_session): + + mock_process = MagicMock(auto_spec=subprocess.Popen, name="mock_process") + + # Simulate failed login + mock_process.returncode = 1 + mock_run.return_value = mock_process + + with self.assertRaises(RuntimeError): + login_dockerhub(SECRET_NAME, SECRET_ENDPOINT_URL, SECRET_ENDPOINT_REGION) + stub.deactivate() + + @patch("subprocess.call", name="mock_subprocess_call") + def test_logout(self, mock_call): + """ + Tests logout calls docker command appropriately + """ + logout_dockerhub() + assert mock_call.call_args_list == [ + call(["docker", "logout"]) + ] + + @patch("docker_login.login_dockerhub") + def test_main_exit(self, mock_login): + """ + Tests main exits with error on failed docker login + """ + mock_login.side_effect = RuntimeError("Didn't work") + with 
self.assertRaises(SystemExit): + main(["--secret-name", "name", "--secret-endpoint-url", "url", "--secret-endpoint-region", "r"]) + + @patch("docker_login.login_dockerhub") + def test_main_default_argument_values(self, mock_login): + """ + Tests default arguments + """ + + # Good env + env = { + "DOCKERHUB_SECRET_ENDPOINT_URL": "url", + "DOCKERHUB_SECRET_ENDPOINT_REGION": "region" + } + with patch.dict(os.environ, env): + main(["--secret-name", "name"]) + assert mock_login.call_args_list == [ + call("name", "url", "region") + ] + + # Bad envs - none or not all required vars defined + tests = [ + {}, + {"DOCKERHUB_SECRET_ENDPOINT_URL": "url"}, + {"DOCKERHUB_SECRET_ENDPOINT_REGION": "region"} + ] + for bad_env in tests: + with patch.dict(os.environ, bad_env): + with self.assertRaises(RuntimeError): + main(["--secret-name", "name"]) + + +if __name__ == '__main__': + import nose + nose.main() diff --git a/ci/test_safe_docker_run.py b/ci/test_safe_docker_run.py new file mode 100644 index 000000000000..433d42e8b2ea --- /dev/null +++ b/ci/test_safe_docker_run.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +Safe docker run tests +""" +import itertools +import os +import signal +import unittest +from typing import Optional +from unittest.mock import create_autospec, patch, call + +from docker import DockerClient +from docker.models.containers import Container, ContainerCollection + +from safe_docker_run import SafeDockerClient, main + + +def create_mock_container(status_code: int = 0): + """ + Creates a mock docker container that exits with the specified status code + """ + mock_container = create_autospec(Container, name="mock_container") + mock_container.wait.return_value = { + "StatusCode": status_code + } + return mock_container + + +def create_mock_container_collection(container: Container): + """ + Creates a mock ContainerCollection that return the supplied container when the 'run' method is called + """ + mock_container_collection = create_autospec(ContainerCollection, name="mock_collection") + mock_container_collection.run.return_value = container + return mock_container_collection + + +class MockDockerClient: + """ + A mock DockerClient when docker.from_env is called + The supplied container will be returned when the client.containers.run method is called + """ + def __init__(self, container: Container): + self._mock_client = create_autospec(DockerClient, name="mock_client") + self._mock_client.containers = create_mock_container_collection(container) + self._patch = patch("docker.from_env", return_value=self._mock_client) + + def __enter__(self): + self._patch.start() + return self._mock_client + + def __exit__(self, _, __, ___): + self._patch.stop() + + +class TestSafeDockerRun(unittest.TestCase): + + @patch("safe_docker_run.signal.pthread_sigmask") + @patch.dict(os.environ, { + "BUILD_NUMBER": "BUILD_NUMBER_5", + "BUILD_ID": "BUILD_ID_1", + "BUILD_TAG": "BUILD_TAG_7" + }) + def test_run_successful(self, mock_pthread_sigmask): + """ + Tests successful run + """ + mock_container = create_mock_container() + + with MockDockerClient(mock_container) as 
mock_client: + safe_docker = SafeDockerClient() + + # Check return code is 0 + assert safe_docker.run("image", "command") == 0 + + # Check call to container is correct + assert mock_client.containers.run.call_args_list == [ + call("image", "command", detach=True, environment={ + "BUILD_NUMBER": "BUILD_NUMBER_5", + "BUILD_ID": "BUILD_ID_1", + "BUILD_TAG": "BUILD_TAG_7" + }) + ] + + # Check correct signals are blocked then unblocked + assert mock_pthread_sigmask.call_args_list == [ + call(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM}), + call(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM}) + ] + + # Assert container is stopped and removed + assert mock_container.stop.call_count == 1 + assert mock_container.remove.call_count == 1 + assert len(safe_docker._containers) == 0 + + def test_run_detach(self): + """ + Tests detach=True is passed to the underlying call by default + """ + mock_container = create_mock_container() + + # Test detach=True is passed in even if not specified + with MockDockerClient(mock_container) as mock_client: + safe_docker = SafeDockerClient() + assert safe_docker.run("image", "command") == 0 + assert mock_client.containers.run.call_count == 1 + _, kwargs = mock_client.containers.run.call_args + assert kwargs["detach"] is True + + # Test passing in detach=True does not cause any issues + with MockDockerClient(mock_container) as mock_client: + safe_docker = SafeDockerClient() + assert safe_docker.run("image", "command", detach=True) == 0 + assert mock_client.containers.run.call_count == 1 + _, kwargs = mock_client.containers.run.call_args + assert kwargs["detach"] is True + + # Test detach=False fails + with MockDockerClient(mock_container) as mock_client: + safe_docker = SafeDockerClient() + with self.assertRaises(ValueError): + safe_docker.run("image", "command", detach=False) + assert mock_client.containers.run.call_args_list == [] + + def test_jenkins_vars(self): + """ + Tests jenkins environment variables are appropriately passed to 
the underlying docker run call + """ + # NOTE: It's important that these variables are passed to the underlying docker container + # These variables are passed to the container so the process tree killer can find runaway + # processes inside the container + # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller + # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393 + + jenkins_vars = { + "BUILD_NUMBER": "BUILD_NUMBER_5", + "BUILD_ID": "BUILD_ID_1", + "BUILD_TAG": "BUILD_TAG_7" + } + mock_container = create_mock_container() + + # Test environment is empty if the jenkins vars are not present + with MockDockerClient(mock_container) as mock_client: + safe_docker = SafeDockerClient() + assert safe_docker.run("image", "command") == 0 + assert mock_client.containers.run.call_count == 1 + _, kwargs = mock_client.containers.run.call_args + assert kwargs["environment"] == {} + + # Test environment contains jenkins env vars if they are present + with MockDockerClient(mock_container) as mock_client: + with patch.dict(os.environ, jenkins_vars): + safe_docker = SafeDockerClient() + assert safe_docker.run("image", "command") == 0 + assert mock_client.containers.run.call_count == 1 + _, kwargs = mock_client.containers.run.call_args + assert kwargs["environment"] == jenkins_vars + + # Test jenkins env vars are added to the caller's env vars + user_env = {"key1": "value1", "key2": "value2"} + with MockDockerClient(mock_container) as mock_client: + with patch.dict(os.environ, jenkins_vars): + safe_docker = SafeDockerClient() + assert safe_docker.run("image", "command", environment=user_env) == 0 + assert mock_client.containers.run.call_count == 1 + _, kwargs = mock_client.containers.run.call_args + assert kwargs["environment"] == {**jenkins_vars, **user_env} + + def test_run_args_kwargs_passed(self): + """ + Tests args and kwargs are passed to the container run call + """ + mock_container =
create_mock_container() + + # Test positional and keyword arguments are forwarded to the underlying run call + with MockDockerClient(mock_container) as mock_client: + safe_docker = SafeDockerClient() + assert safe_docker.run( + "image", + "command", + "another_arg", + str_param="value", + bool_param=True, + none_param=None, + int_param=5, + float_param=5.2, + list_param=["this", "is", "a", "list"], + map_param={ + "a": "5", + "b": True, + "c": 2 + }) == 0 + assert mock_client.containers.run.call_args_list == [ + call( + "image", + "command", + "another_arg", + detach=True, + environment={}, + str_param="value", + bool_param=True, + none_param=None, + int_param=5, + float_param=5.2, + list_param=["this", "is", "a", "list"], + map_param={ + "a": "5", + "b": True, + "c": 2 + } + ) + ] + + def test_container_returns_non_zero_status_code(self): + """ + Tests non-zero code from container is returned and the container + is cleaned up + """ + mock_container = create_mock_container(status_code=10) + with MockDockerClient(mock_container): + safe_docker = SafeDockerClient() + # check return code and that container gets cleaned up + assert safe_docker.run("image", "command") == 10 + assert mock_container.stop.call_count == 1 + assert mock_container.remove.call_count == 1 + assert len(safe_docker._containers) == 0 + + def test_container_wait_raises_returns_150(self): + """ + Tests 150 is returned if an error is raised when calling container.wait + """ + mock_container = create_mock_container() + mock_container.wait.side_effect = RuntimeError("Something bad happened") + with MockDockerClient(mock_container): + safe_docker = SafeDockerClient() + assert safe_docker.run("image", "command") == 150 + + def test_container_stop_raises_returns_151(self): + """ + Tests 151 is returned if an error is raised when calling container.stop + """ + mock_container = create_mock_container() + mock_container.stop.side_effect = RuntimeError("Something bad happened") + with MockDockerClient(mock_container): + safe_docker =
SafeDockerClient() + assert safe_docker.run("image", "command") == 151 + + def test_container_remove_raises_returns_152(self): + """ + Tests 152 is returned if an error is raised when calling container.remove + """ + mock_container = create_mock_container() + mock_container.remove.side_effect = RuntimeError("Something bad happened") + with MockDockerClient(mock_container): + safe_docker = SafeDockerClient() + assert safe_docker.run("image", "command") == 152 + + def test_main(self): + """ + Tests main function against different command line arguments + """ + tests = [ + # ( supplied command line arguments, expected call ) + ( + ["image", "command"], + call("image", command="command", runtime=None, user=None, name=None, volumes={}, cap_add=[]) + ), + ( + ["image", "command", "arg1", "arg2"], + call("image", command="command arg1 arg2", runtime=None, user=None, name=None, volumes={}, cap_add=[]) + ), + ( + ["--runtime", "nvidia", "image", "command"], + call("image", command="command", runtime="nvidia", user=None, name=None, volumes={}, cap_add=[]) + ), + ( + ["--user", "1001:1001", "image", "command"], + call("image", command="command", runtime=None, user="1001:1001", name=None, volumes={}, cap_add=[]) + ), + ([ + "--volume", "/local/path1:/container/path1", + "--volume", "/local/path2:/container/path2:ro", + "image", + "command" + ], call("image", command="command", runtime=None, user=None, name=None, volumes={ + "/local/path1": { + "bind": "/container/path1", + "mode": "rw" + }, + "/local/path2": { + "bind": "/container/path2", + "mode": "ro" + } + }, cap_add=[])), + ([ + "--runtime", "nvidia", + "-u", "1001:1001", + "-v", "/local/path1:/container/path1", + "-v", "/local/path2:/container/path2:ro", + "--cap-add", "bob", + "--cap-add", "jimmy", + "--name", + "container_name", + "image", + "command", + "arg1", + "arg2" + ], call( + "image", + command="command arg1 arg2", + runtime="nvidia", + user="1001:1001", + name="container_name", + volumes={ + "/local/path1": { 
+ "bind": "/container/path1", + "mode": "rw" + }, + "/local/path2": { + "bind": "/container/path2", + "mode": "ro" + } + }, cap_add=["bob", "jimmy"]) + ) + ] + + # Tests valid arguments + mock_docker = create_autospec(SafeDockerClient) + mock_docker.run.return_value = 0 + with patch("safe_docker_run.SafeDockerClient", return_value=mock_docker): + for test in tests: + arguments, expected_call = test + main(arguments) + assert mock_docker.run.call_args == expected_call + + # Tests invalid arguments + tests = [ + [], + None, + ["image"], + # Test some bad volume mounts + ["-v", "bob", "image", "args"], + ["-v", "/local/path", "image", "args"], + ["-v", "/local/path:/container/path:blah", "image", "args"], + ["-v", "", "image", "args"], + ["-v", "a:b:c:d", "image", "args"] + ] + + mock_docker = create_autospec(SafeDockerClient) + with patch("safe_docker_run.SafeDockerClient", return_value=mock_docker): + with self.assertRaises(SystemExit): + for test in tests: + main(test) + + def test_clean_up(self): + """ + Tests container clean up in case of SIGTERM and SIGINT + """ + import subprocess + import time + import docker.errors + + docker_client = docker.from_env() + container_name = "safedockertestcontainer1234" + + def get_container(name: str) -> Optional[Container]: + try: + return docker_client.containers.get(name) + except docker.errors.NotFound: + return None + + def remove_container_if_exists(name: str): + container = get_container(name) + if container: + container.stop() + container.remove() + + def wait_for_container(name: str) -> bool: + for _ in itertools.count(5): + if get_container(name): + return True + time.sleep(1) + return False + + # Clear any containers with container name + remove_container_if_exists(container_name) + + # None => no signal is emitted - we should still finish with no containers at the end due + # to the atexit + for sig in [None, signal.SIGTERM, signal.SIGINT]: + # Execute the safe docker run script in a different process + proc =
subprocess.Popen(['./safe_docker_run.py', "--name", container_name, "ubuntu:18.04", "sleep 10"]) + # NOTE: we need to wait for the container to come up as not all operating systems support blocking signals + if wait_for_container(container_name) is False: + raise RuntimeError("Test container did not come up") + + # Issue the signal and wait for the process to finish + if sig: + proc.send_signal(sig) + proc.wait() + + # The container should no longer exist + assert get_container(container_name) is None + + +if __name__ == '__main__': + import nose + nose.main() diff --git a/ci/util.py b/ci/util.py index 9a8d52eb1716..4b3a399184f9 100644 --- a/ci/util.py +++ b/ci/util.py @@ -18,6 +18,9 @@ import os import contextlib import logging +import logging.config +import sys + def get_mxnet_root() -> str: curpath = os.path.abspath(os.path.dirname(__file__)) @@ -32,6 +35,7 @@ def is_mxnet_root(path: str) -> bool: curpath = parent return curpath + @contextlib.contextmanager def remember_cwd(): ''' @@ -113,3 +117,16 @@ def chdir_to_script_directory(): os.chdir(base) +def script_name() -> str: + """:returns: script name with leading paths removed""" + return os.path.split(sys.argv[0])[1] + + +def config_logging(): + conf_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logging.conf") + logging.config.fileConfig(os.getenv('LOGGING_CONF', conf_path)) + + # Force the botocore and requests loggers to WARNING to avoid leaking any credentials + # or sensitive information + logging.getLogger("botocore").setLevel(logging.WARNING) + logging.getLogger("requests").setLevel(logging.WARNING)