From e203f034e4cf78dc9fe87907c8863c86ad9fee9b Mon Sep 17 00:00:00 2001 From: Zhaoqi Zhu Date: Mon, 1 Mar 2021 16:21:28 -0800 Subject: [PATCH] [V1.8.x] Attempt to fix cd for v1.8.x (#19947) * [v1.x] Migrate to use ECR as docker cache instead of dockerhub (#19654) * [v1.x] Update CI build scripts to install python 3.6 from deadsnakes repo (#19788) * Install python3.6 from deadsnakes repo, since 3.5 is EOL'd and get-pip.py no longer works with 3.5. * Set symlink for python3 to point to newly installed 3.6 version. * Setting symlink or using update-alternatives causes add-apt-repository to fail, so instead just set alias in environment to call the correct python version. * Setup symlinks in /usr/local/bin, since it comes first in the path. * Don't use absolute path for python3 executable, just use python3 from path. Co-authored-by: Joe Evans * Disable unix-gpu-cu110 pipeline for v1.x build since we now build with cuda 11.0 in windows pipelines. (#19828) Co-authored-by: Joe Evans * [v1.x] For ECR, ensure we sanitize region input from environment variable (#19882) * Set default for cache_intermediate. * Make sure we sanitize region extracted from registry, since we pass it to os.system. Co-authored-by: Joe Evans * [v1.x] Address CI failures with docker timeouts (v2) (#19890) * Add random sleep only, since retry attempts are already implemented. * Reduce random sleep to 2-10 sec. Co-authored-by: Joe Evans * [v1.x] CI fixes to make more stable and upgradable (#19895) * Test moving pipelines from p3 to g4. * Remove fallback codecov command - the existing (first) command works and the second always fails a few times before finally succeeding (and also doesn't support the -P parameter, which causes an error.) * Stop using docker python client, since it still doesn't support latest nvidia 'gpus' attribute. Switch to using subprocess calls using list parameter (to avoid shell injections). See https://github.com/docker/docker-py/issues/2395 * Remove old files. 
* Fix comment * Set default environment variables * Fix GPU syntax. * Use subprocess.run and redirect output to stdout, don't run docker in interactive mode. * Check if codecov works without providing parameters now. * Send docker stderr to sys.stderr * Support both nvidia-docker configurations, first try '--gpus all', and if that fails, then try '--runtime nvidia'. Co-authored-by: Joe Evans * fix cd * fix cudnn version for cu10.2 build * WAR the dataloader issue with forked processes holding stale references (#19924) * skip some tests * fix skip * [v1.x] Attempt to fix v1.x cd by installing new cuda compat package (#19959) * update cuda compat for cd * Update Dockerfile.build.ubuntu_gpu_cu102 * Update Dockerfile.build.ubuntu_gpu_cu102 * Update Dockerfile.build.ubuntu_gpu_cu110 * Update runtime_functions.sh * Update Dockerfile.build.ubuntu_gpu_cu110 * Update Dockerfile.build.ubuntu_gpu_cu102 * update command Co-authored-by: Joe Evans Co-authored-by: Joe Evans Co-authored-by: Joe Evans Co-authored-by: Przemyslaw Tredak --- cd/python/docker/Dockerfile | 12 +- ci/Jenkinsfile_docker_cache | 2 +- ci/Jenkinsfile_utils.groovy | 17 +- ci/build.py | 122 ++--- ci/docker/Dockerfile.build.ubuntu_gpu_cu102 | 4 + ci/docker/Dockerfile.build.ubuntu_gpu_cu110 | 4 + ci/docker/install/ubuntu_python.sh | 7 +- ci/docker/runtime_functions.sh | 18 +- ci/docker_cache.py | 46 +- ci/jenkins/Jenkins_steps.groovy | 6 +- ci/jenkins/Jenkinsfile_full | 1 - ci/safe_docker_run.py | 248 ---------- ci/test_safe_docker_run.py | 427 ------------------ python/mxnet/gluon/data/dataloader.py | 4 + .../unittest/test_numpy_interoperability.py | 2 + tests/python/unittest/test_numpy_op.py | 1 + tools/setup_gpu_build_tools.sh | 2 +- 17 files changed, 145 insertions(+), 778 deletions(-) delete mode 100755 ci/safe_docker_run.py delete mode 100644 ci/test_safe_docker_run.py diff --git a/cd/python/docker/Dockerfile b/cd/python/docker/Dockerfile index ed97bdc8316a..accbe9bd1d97 100644 --- 
a/cd/python/docker/Dockerfile +++ b/cd/python/docker/Dockerfile @@ -23,11 +23,15 @@ ARG BASE_IMAGE FROM ${BASE_IMAGE} -ARG PYTHON_CMD=python3 RUN apt-get update && \ - apt-get install -y wget ${PYTHON_CMD}-dev gcc && \ - wget https://bootstrap.pypa.io/get-pip.py && \ - ${PYTHON_CMD} get-pip.py + apt-get install -y software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y python3.7-dev python3.7-distutils virtualenv wget && \ + ln -sf /usr/bin/python3.7 /usr/local/bin/python3 && \ + wget -nv https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + ARG MXNET_COMMIT_ID ENV MXNET_COMMIT_ID=${MXNET_COMMIT_ID} diff --git a/ci/Jenkinsfile_docker_cache b/ci/Jenkinsfile_docker_cache index 35e6ff9e7d56..f90bf0459f03 100644 --- a/ci/Jenkinsfile_docker_cache +++ b/ci/Jenkinsfile_docker_cache @@ -37,7 +37,7 @@ core_logic: { ws('workspace/docker_cache') { timeout(time: total_timeout, unit: 'MINUTES') { utils.init_git() - sh "ci/docker_cache.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}" + sh "ci/docker_cache.py --docker-registry ${env.DOCKER_ECR_REGISTRY}" } } } diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index 8ecc7e193b97..523fad92cec2 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -112,20 +112,7 @@ def get_git_commit_hash() { } def publish_test_coverage() { - // CodeCovs auto detection has trouble with our CIs PR validation due the merging strategy - git_commit_hash = get_git_commit_hash() - - if (env.CHANGE_ID) { - // PR execution - codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}" - } else { - // Branch execution - codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}" - } - - // To make sure we never fail because test coverage reporting is not available - // Fall back to our own copy of the bash helper if it failed to download the public version - sh "(curl --retry 10 -s https://codecov.io/bash | bash -s 
- ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true" + sh "curl -s https://codecov.io/bash | bash" } def collect_test_results_unix(original_file_name, new_file_name) { @@ -160,7 +147,7 @@ def collect_test_results_windows(original_file_name, new_file_name) { def docker_run(platform, function_name, use_nvidia, shared_mem = '500m', env_vars = "") { - def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_ECR_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" command = command.replaceAll('%ENV_VARS%', env_vars.length() > 0 ? "-e ${env_vars}" : '') command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '') command = command.replaceAll('%PLATFORM%', platform) diff --git a/ci/build.py b/ci/build.py index 8c2a6e9ac20b..645eb96875e9 100755 --- a/ci/build.py +++ b/ci/build.py @@ -27,6 +27,8 @@ import argparse import glob +import hashlib +import os import pprint import re import shutil @@ -36,7 +38,6 @@ from subprocess import check_call, check_output from typing import * -from safe_docker_run import SafeDockerClient from util import * @@ -52,13 +53,41 @@ def get_platforms(path: str = get_dockerfiles_path()) -> List[str]: platforms = list(map(lambda x: os.path.split(x)[1], sorted(files))) return platforms +def _find_copied_files(dockerfile): + """ + Creates a list of files copied into given dockerfile. 
+ """ + copied_files = [] + basedir = os.path.dirname(dockerfile) + with open(dockerfile, "r") as f: + for line in f.readlines(): + if line.startswith("COPY "): + copied_files.append(os.path.join(basedir, line.split(" ")[1])) + return copied_files + +def _hash_file(ctx, filename): + """ + Add contents of passed file into passed hash context. + """ + bufsiz = 16384 + with open(filename,"rb") as f: + while True: + d = f.read(bufsiz) + if not d: + break + ctx.update(d) def get_docker_tag(platform: str, registry: str) -> str: """:return: docker tag to be used for the container""" platform = platform if any(x in platform for x in ['build.', 'publish.']) else 'build.{}'.format(platform) if not registry: registry = "mxnet_local" - return "{0}/{1}".format(registry, platform) + dockerfile = get_dockerfile(platform) + sha256 = hashlib.sha256() + _hash_file(sha256, dockerfile) + for f in _find_copied_files(dockerfile): + _hash_file(sha256, f) + return "{0}:{1}-{2}".format(registry, platform, sha256.hexdigest()[:12]) def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str: @@ -67,7 +96,7 @@ def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str: def build_docker(platform: str, registry: str, num_retries: int, no_cache: bool, - cache_intermediate: bool) -> str: + cache_intermediate: bool=False) -> str: """ Build a container for the given platform :param platform: Platform @@ -157,8 +186,7 @@ def default_ccache_dir() -> str: return os.path.join(os.path.expanduser("~"), ".ccache") -def container_run(docker_client: SafeDockerClient, - platform: str, +def container_run(platform: str, nvidia_runtime: bool, docker_registry: str, shared_memory_size: str, @@ -167,17 +195,12 @@ def container_run(docker_client: SafeDockerClient, environment: Dict[str, str], dry_run: bool = False) -> int: """Run command in a container""" - container_wait_s = 600 - # - # Environment setup - # + # set default environment variables environment.update({ 'CCACHE_MAXSIZE': '500G', 
'CCACHE_TEMPDIR': '/tmp/ccache', # temp dir should be local and not shared - 'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is - # mounted - 'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache - # verification. + 'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is mounted + 'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache verification. }) environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ}) @@ -189,13 +212,9 @@ def container_run(docker_client: SafeDockerClient, os.makedirs(local_ccache_dir, exist_ok=True) logging.info("Using ccache directory: %s", local_ccache_dir) - # Equivalent command - docker_cmd_list = [ - "docker", - 'run', - "--gpus all" if nvidia_runtime else "", - "--cap-add", - "SYS_PTRACE", # Required by ASAN + # Build docker command + docker_arg_list = [ + "--cap-add", "SYS_PTRACE", # Required by ASAN '--rm', '--shm-size={}'.format(shared_memory_size), # mount mxnet root @@ -211,40 +230,27 @@ def container_run(docker_client: SafeDockerClient, '-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']), # a container-scoped log, useful for ccache verification. 
'-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']), - '-ti', - tag] - docker_cmd_list.extend(command) - docker_cmd = ' \\\n\t'.join(docker_cmd_list) - logging.info("Running %s in container %s", command, tag) - logging.info("Executing the equivalent of:\n%s\n", docker_cmd) + ] + docker_arg_list += [tag] + docker_arg_list.extend(command) + + def docker_run_cmd(cmd): + logging.info("Running %s in container %s", command, tag) + logging.info("Executing command:\n%s\n", ' \\\n\t'.join(cmd)) + subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True) if not dry_run: - ############################# - # - signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM}) - # noinspection PyShadowingNames - runtime = None - if nvidia_runtime: - # noinspection PyShadowingNames - # runc is default (docker info | grep -i runtime) - runtime = 'nvidia' - - return docker_client.run( - tag, - runtime=runtime, - command=command, - shm_size=shared_memory_size, - user='{}:{}'.format(os.getuid(), os.getgid()), - cap_add='SYS_PTRACE', - volumes={ - mx_root: - {'bind': '/work/mxnet', 'mode': 'rw'}, - local_build_folder: - {'bind': '/work/build', 'mode': 'rw'}, - local_ccache_dir: - {'bind': '/work/ccache', 'mode': 'rw'}, - }, - environment=environment) + if not nvidia_runtime: + docker_run_cmd(['docker', 'run'] + docker_arg_list) + else: + try: + docker_run_cmd(['docker', 'run', '--gpus', 'all'] + docker_arg_list) + except subprocess.CalledProcessError as e: + if e.returncode == 125: + docker_run_cmd(['docker', 'run', '--runtime', 'nvidia'] + docker_arg_list) + else: + raise + return 0 @@ -348,7 +354,6 @@ def main() -> int: args = parser.parse_args() command = list(chain(*args.command)) - docker_client = SafeDockerClient() environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e])) for e in args.environment]) @@ -375,13 +380,13 @@ def main() -> int: ret = 0 if command: ret = container_run( - docker_client=docker_client, platform=platform, 
nvidia_runtime=args.nvidiadocker, + platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir, environment=environment) elif args.print_docker_run: command = [] ret = container_run( - docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, + platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment) else: @@ -389,7 +394,7 @@ def main() -> int: command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)] logging.info("No command specified, trying default build: %s", ' '.join(command)) ret = container_run( - docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, + platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir, environment=environment) @@ -406,7 +411,8 @@ def main() -> int: tag = get_docker_tag(platform=platform, registry=args.docker_registry) load_docker_cache(tag=tag, docker_registry=args.docker_registry) build_docker(platform, registry=args.docker_registry, - num_retries=args.docker_build_retries, no_cache=args.no_cache) + num_retries=args.docker_build_retries, no_cache=args.no_cache, + cache_intermediate=args.cache_intermediate) if args.build_only: continue shutil.rmtree(buildir(), ignore_errors=True) @@ -418,7 +424,7 @@ def main() -> int: continue command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform] container_run( - docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker, + platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size, command=command, 
docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir, environment=environment) shutil.move(buildir(), plat_buildir) diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu102 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu102 index 52d65aaba0f4..a929a486a2f3 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu102 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu102 @@ -69,6 +69,10 @@ ENV CUDNN_VERSION=7.6.5.32 COPY install/ubuntu_cudnn.sh /work/ RUN /work/ubuntu_cudnn.sh +# update the cuda compatibility package because cd host uses nvidia driver 460 +RUN apt-get update && apt-get install -y cuda-compat-11-2 +RUN ln -sfn /usr/local/cuda-11.2 /usr/local/cuda + # Always last ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu110 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu110 index 336b76e2a3e8..db1f606076b5 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu110 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu110 @@ -32,6 +32,10 @@ COPY install/ubuntu_python.sh /work/ COPY install/requirements /work/ RUN /work/ubuntu_python.sh +# update the cuda compatibility package because cd host uses nvidia driver 460 +RUN apt-get update && apt-get install -y cuda-compat-11-2 +RUN ln -sfn /usr/local/cuda-11.2 /usr/local/cuda + # Always last ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python.sh index b6792a286fad..d31a18d9c303 100755 --- a/ci/docker/install/ubuntu_python.sh +++ b/ci/docker/install/ubuntu_python.sh @@ -23,7 +23,12 @@ set -ex # install libraries for mxnet's python package on ubuntu apt-get update || true -apt-get install -y python-dev python3-dev virtualenv wget +apt-get install -y software-properties-common +add-apt-repository -y ppa:deadsnakes/ppa +apt-get update || true +apt-get install -y python3.6-dev virtualenv wget +# setup symlink in /usr/local/bin to override python3 version +ln -sf /usr/bin/python3.6 /usr/local/bin/python3 # the version of the pip shipped with 
ubuntu may be too lower, install a recent version here wget -nv https://bootstrap.pypa.io/get-pip.py diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 4d5a94df9c6a..c230ccdccd44 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -517,7 +517,7 @@ build_ubuntu_cpu_cmake_debug() { cmake \ -DUSE_CUDA=OFF \ -DUSE_TVM_OP=ON \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ + -DPython3_EXECUTABLE=python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_OPENMP=OFF \ -DUSE_OPENCV=ON \ @@ -538,7 +538,7 @@ build_ubuntu_cpu_cmake_no_tvm_op() { cmake \ -DUSE_CUDA=OFF \ -DUSE_TVM_OP=OFF \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ + -DPython3_EXECUTABLE=python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_OPENMP=OFF \ -DUSE_OPENCV=ON \ @@ -874,7 +874,7 @@ build_ubuntu_gpu_cmake_mkldnn() { -DUSE_CUDA=1 \ -DUSE_CUDNN=1 \ -DUSE_TVM_OP=0 \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ + -DPython3_EXECUTABLE=python3 \ -DUSE_MKLML_MKL=1 \ -DCMAKE_BUILD_TYPE=Release \ -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ @@ -892,8 +892,8 @@ build_ubuntu_gpu_cmake() { -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ - -DUSE_TVM_OP=OFF \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ + -DUSE_TVM_OP=OFF \ + -DPython3_EXECUTABLE=python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=OFF \ @@ -915,8 +915,8 @@ build_ubuntu_gpu_cmake_no_rtc() { -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ - -DUSE_TVM_OP=OFF \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ + -DUSE_TVM_OP=OFF \ + -DPython3_EXECUTABLE=python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=ON \ @@ -956,8 +956,8 @@ build_ubuntu_gpu_large_tensor() { -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ - -DUSE_TVM_OP=OFF \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ + -DUSE_TVM_OP=OFF \ + -DPython3_EXECUTABLE=python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=OFF \ diff --git a/ci/docker_cache.py b/ci/docker_cache.py index 
254d6237d6e2..da01314f5f8d 100755 --- a/ci/docker_cache.py +++ b/ci/docker_cache.py @@ -28,6 +28,7 @@ import logging import os import subprocess +import re import sys from typing import * @@ -84,7 +85,7 @@ def _build_save_container(platform, registry, load_cache) -> Optional[str]: logging.debug('Building %s as %s', platform, docker_tag) try: # Increase the number of retries for building the cache. - image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10, no_cache=False) + image_id = build_util.build_docker(platform=platform, registry=registry, num_retries=10, no_cache=False) logging.info('Built %s as %s', docker_tag, image_id) # Push cache to registry @@ -96,15 +97,33 @@ def _build_save_container(platform, registry, load_cache) -> Optional[str]: # Error handling is done by returning the errorous platform name. This is necessary due to # Parallel being unable to handle exceptions +ECR_LOGGED_IN = False +def _ecr_login(registry): + """ + Use the AWS CLI to get credentials to login to ECR. + """ + # extract region from registry + global ECR_LOGGED_IN + if ECR_LOGGED_IN: + return + regionMatch = re.match(r'.*?\.dkr\.ecr\.([a-z]+\-[a-z]+\-\d+)\.amazonaws\.com', registry) + assert(regionMatch) + region = regionMatch.group(1) + logging.info("Logging into ECR region %s using aws-cli..", region) + os.system("$(aws ecr get-login --region "+region+" --no-include-email)") + ECR_LOGGED_IN = True def _upload_image(registry, docker_tag, image_id) -> None: """ - Upload the passed image by id, tag it with docker tag and upload to S3 bucket + Upload the passed image by id, tag it with docker tag and upload to docker registry. 
:param registry: Docker registry name :param docker_tag: Docker tag :param image_id: Image id :return: None """ + if "dkr.ecr" in registry: + _ecr_login(registry) + # We don't have to retag the image since it is already in the right format logging.info('Uploading %s (%s) to %s', docker_tag, image_id, registry) push_cmd = ['docker', 'push', docker_tag] @@ -125,6 +144,9 @@ def load_docker_cache(registry, docker_tag) -> None: return assert docker_tag + if "dkr.ecr" in registry: + _ecr_login(registry) + logging.info('Loading Docker cache for %s from %s', docker_tag, registry) pull_cmd = ['docker', 'pull', docker_tag] @@ -185,15 +207,19 @@ def script_name() -> str: platforms = build_util.get_platforms() - secret_name = os.environ['DOCKERHUB_SECRET_NAME'] - endpoint_url = os.environ['DOCKERHUB_SECRET_ENDPOINT_URL'] - region_name = os.environ['DOCKERHUB_SECRET_ENDPOINT_REGION'] - - try: - login_dockerhub(secret_name, endpoint_url, region_name) + if "dkr.ecr" in args.docker_registry: + _ecr_login(args.docker_registry) return build_save_containers(platforms=platforms, registry=args.docker_registry, load_cache=True) - finally: - logout_dockerhub() + else: + secret_name = os.environ['DOCKERHUB_SECRET_NAME'] + endpoint_url = os.environ['DOCKERHUB_SECRET_ENDPOINT_URL'] + region_name = os.environ['DOCKERHUB_SECRET_ENDPOINT_REGION'] + + try: + login_dockerhub(secret_name, endpoint_url, region_name) + return build_save_containers(platforms=platforms, registry=args.docker_registry, load_cache=True) + finally: + logout_dockerhub() if __name__ == '__main__': diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 499cc8db79b6..1c5ebaace976 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -769,7 +769,7 @@ def test_unix_python3_gpu_cu110() { def test_unix_python3_quantize_gpu() { return ['Python3: Quantize GPU': { - node(NODE_LINUX_GPU_P3) { + node(NODE_LINUX_GPU_G4) { ws('workspace/ut-python3-quantize-gpu') { 
timeout(time: max_time, unit: 'MINUTES') { try { @@ -787,7 +787,7 @@ def test_unix_python3_quantize_gpu() { def test_unix_python3_quantize_gpu_cu110() { return ['Python3+CUDA11.0: Quantize GPU': { - node(NODE_LINUX_GPU_P3) { + node(NODE_LINUX_GPU_G4) { ws('workspace/ut-python3-quantize-gpu') { timeout(time: max_time, unit: 'MINUTES') { try { @@ -903,7 +903,7 @@ def test_unix_python3_mkldnn_nocudnn_gpu() { def test_unix_python3_tensorrt_gpu() { return ['Python3: TensorRT GPU': { - node(NODE_LINUX_GPU_P3) { + node(NODE_LINUX_GPU_G4) { ws('workspace/build-tensorrt') { timeout(time: max_time, unit: 'MINUTES') { try { diff --git a/ci/jenkins/Jenkinsfile_full b/ci/jenkins/Jenkinsfile_full index fcdb4c2a5eeb..415bd7b8dde0 100644 --- a/ci/jenkins/Jenkinsfile_full +++ b/ci/jenkins/Jenkinsfile_full @@ -31,7 +31,6 @@ def buildJobs = [ 'miscellaneous', 'unix-cpu', 'unix-gpu', - 'unix-gpu-cu110', 'website', 'windows-cpu', 'windows-gpu' diff --git a/ci/safe_docker_run.py b/ci/safe_docker_run.py deleted file mode 100755 index 97ece4aecd2f..000000000000 --- a/ci/safe_docker_run.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -""" -Docker command wrapper to guard against Zombie containers -""" - -import argparse -import atexit -import logging -import os -import signal -import sys -from functools import reduce -from itertools import chain -from typing import Dict, Any - -import docker -from docker.errors import NotFound -from docker.models.containers import Container - -from util import config_logging - -DOCKER_STOP_TIMEOUT_SECONDS = 3 -CONTAINER_WAIT_SECONDS = 600 - - -class SafeDockerClient: - """ - A wrapper around the docker client to ensure that no zombie containers are left hanging around - in case the script is not allowed to finish normally - """ - - @staticmethod - def _trim_container_id(cid): - """:return: trimmed container id""" - return cid[:12] - - def __init__(self): - self._docker_client = docker.from_env() - self._containers = set() - self._docker_stop_timeout = DOCKER_STOP_TIMEOUT_SECONDS - self._container_wait_seconds = CONTAINER_WAIT_SECONDS - - def signal_handler(signum, _): - signal.pthread_sigmask(signal.SIG_BLOCK, {signum}) - logging.warning("Signal %d received, cleaning up...", signum) - self._clean_up() - logging.warning("done. 
Exiting with error.") - sys.exit(1) - - atexit.register(self._clean_up) - signal.signal(signal.SIGTERM, signal_handler) - signal.signal(signal.SIGINT, signal_handler) - - def _clean_up(self): - if self._containers: - logging.warning("Cleaning up containers") - else: - return - # noinspection PyBroadException - try: - stop_timeout = int(os.environ.get("DOCKER_STOP_TIMEOUT", self._docker_stop_timeout)) - except Exception: - stop_timeout = 3 - for container in self._containers: - try: - container.stop(timeout=stop_timeout) - logging.info("☠: stopped container %s", self._trim_container_id(container.id)) - container.remove() - logging.info("🚽: removed container %s", self._trim_container_id(container.id)) - except Exception as e: - logging.exception(e) - self._containers.clear() - logging.info("Cleaning up containers finished.") - - def _add_container(self, container: Container) -> Container: - self._containers.add(container) - return container - - def _remove_container(self, container: Container): - self._containers.remove(container) - - def run(self, *args, **kwargs) -> int: - if "detach" in kwargs and kwargs.get("detach") is False: - raise ValueError("Can only safe run with 'detach' set to True") - else: - kwargs["detach"] = True - - # These variables are passed to the container so the process tree killer can find runaway - # process inside the container - # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller - # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393 - if "environment" not in kwargs: - kwargs["environment"] = {} - - jenkins_env_vars = ["BUILD_NUMBER", "BUILD_ID", "BUILD_TAG"] - kwargs["environment"].update({k: os.environ[k] for k in jenkins_env_vars if k in os.environ}) - - ret = 0 - try: - # Race condition: - # If the call to docker_client.containers.run is interrupted, it is possible that - # the container won't be cleaned up. 
We avoid this by temporarily masking the signals. - signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM}) - container = self._add_container(self._docker_client.containers.run(*args, **kwargs)) - signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM}) - logging.info("Started container: %s", self._trim_container_id(container.id)) - stream = container.logs(stream=True, stdout=True, stderr=True) - sys.stdout.flush() - for chunk in stream: - sys.stdout.buffer.write(chunk) - sys.stdout.buffer.flush() - sys.stdout.flush() - stream.close() - - try: - logging.info("Waiting for status of container %s for %d s.", - self._trim_container_id(container.id), - self._container_wait_seconds) - wait_result = container.wait(timeout=self._container_wait_seconds) - logging.info("Container exit status: %s", wait_result) - ret = wait_result.get('StatusCode', 200) - if ret != 0: - logging.error("Container exited with an error 😞") - logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv)) - else: - logging.info("Container exited with success 👍") - logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv)) - except Exception as err: - logging.exception(err) - return 150 - - try: - logging.info("Stopping container: %s", self._trim_container_id(container.id)) - container.stop() - except Exception as e: - logging.exception(e) - ret = 151 - - try: - logging.info("Removing container: %s", self._trim_container_id(container.id)) - container.remove() - except Exception as e: - logging.exception(e) - ret = 152 - self._remove_container(container) - containers = self._docker_client.containers.list() - if containers: - logging.info("Other running containers: %s", [self._trim_container_id(x.id) for x in containers]) - except NotFound as e: - logging.info("Container was stopped before cleanup started: %s", e) - - return ret - - -def _volume_mount(volume_dfn: str) -> Dict[str, Any]: - """ - Converts docker volume mount 
format, e.g. docker run --volume /local/path:/container/path:ro - to an object understood by the python docker library, e.g. {"local/path": {"bind": "/container/path", "mode": "ro"}} - This is used by the argparser for automatic conversion and input validation. - If the mode is not specified, 'rw' is assumed. - :param volume_dfn: A string to convert to a volume mount object in the format :[:ro|rw] - :return: An object in the form {"" : {"bind": "", "mode": "rw|ro"}} - """ - if volume_dfn is None: - raise argparse.ArgumentTypeError("Missing value for volume definition") - - parts = volume_dfn.split(":") - - if len(parts) < 2 or len(parts) > 3: - raise argparse.ArgumentTypeError("Invalid volume definition {}".format(volume_dfn)) - - mode = "rw" - if len(parts) == 3: - mode = parts[2] - - if mode not in ["rw", "ro"]: - raise argparse.ArgumentTypeError("Invalid volume mount mode {} in volume definition {}".format(mode, volume_dfn)) - - return {parts[0]: {"bind": parts[1], "mode": mode}} - - -def main(command_line_arguments): - config_logging() - - parser = argparse.ArgumentParser( - description="""Wrapper around docker run that protects against Zombie containers""", epilog="") - - parser.add_argument("-u", "--user", - help="Username or UID (format: [:])", - default=None) - - parser.add_argument("-v", "--volume", - action='append', - type=_volume_mount, - help="Bind mount a volume", - default=[]) - - parser.add_argument("--cap-add", - help="Add Linux capabilities", - action="append", - type=str, - default=[]) - - parser.add_argument("--runtime", - help="Runtime to use for this container", - default=None) - - parser.add_argument("--name", - help="Assign a name to the container", - default=None) - - parser.add_argument("image", metavar="IMAGE") - parser.add_argument("command", metavar="COMMAND") - parser.add_argument("args", nargs='*', metavar="ARG") - - args = parser.parse_args(args=command_line_arguments) - docker_client = SafeDockerClient() - return 
docker_client.run(args.image, **{ - "command": " ".join(list(chain([args.command] + args.args))), - "user": args.user, - "runtime": args.runtime, - "name": args.name, - "volumes": reduce(lambda dct, v: {**dct, **v}, args.volume, {}), - "cap_add": args.cap_add - }) - - -if __name__ == "__main__": - exit(main(sys.argv[1:])) diff --git a/ci/test_safe_docker_run.py b/ci/test_safe_docker_run.py deleted file mode 100644 index 433d42e8b2ea..000000000000 --- a/ci/test_safe_docker_run.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -""" -Safe docker run tests -""" -import itertools -import os -import signal -import unittest -from typing import Optional -from unittest.mock import create_autospec, patch, call - -from docker import DockerClient -from docker.models.containers import Container, ContainerCollection - -from safe_docker_run import SafeDockerClient, main - - -def create_mock_container(status_code: int = 0): - """ - Creates a mock docker container that exits with the specified status code - """ - mock_container = create_autospec(Container, name="mock_container") - mock_container.wait.return_value = { - "StatusCode": status_code - } - return mock_container - - -def create_mock_container_collection(container: Container): - """ - Creates a mock ContainerCollection that return the supplied container when the 'run' method is called - """ - mock_container_collection = create_autospec(ContainerCollection, name="mock_collection") - mock_container_collection.run.return_value = container - return mock_container_collection - - -class MockDockerClient: - """ - A mock DockerClient when docker.from_env is called - The supplied container will be returned when the client.containers.run method is called - """ - def __init__(self, container: Container): - self._mock_client = create_autospec(DockerClient, name="mock_client") - self._mock_client.containers = create_mock_container_collection(container) - self._patch = patch("docker.from_env", return_value=self._mock_client) - - def __enter__(self): - self._patch.start() - return self._mock_client - - def __exit__(self, _, __, ___): - self._patch.stop() - - -class TestSafeDockerRun(unittest.TestCase): - - @patch("safe_docker_run.signal.pthread_sigmask") - @patch.dict(os.environ, { - "BUILD_NUMBER": "BUILD_NUMBER_5", - "BUILD_ID": "BUILD_ID_1", - "BUILD_TAG": "BUILD_TAG_7" - }) - def test_run_successful(self, mock_pthread_sigmask): - """ - Tests successful run - """ - mock_container = create_mock_container() - - with MockDockerClient(mock_container) as 
mock_client: - safe_docker = SafeDockerClient() - - # Check return code is 0 - assert safe_docker.run("image", "command") == 0 - - # Check call to container is correct - assert mock_client.containers.run.call_args_list == [ - call("image", "command", detach=True, environment={ - "BUILD_NUMBER": "BUILD_NUMBER_5", - "BUILD_ID": "BUILD_ID_1", - "BUILD_TAG": "BUILD_TAG_7" - }) - ] - - # Check correct signals are blocked then unblocked - assert mock_pthread_sigmask.call_args_list == [ - call(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM}), - call(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM}) - ] - - # Assert container is stopped and removed - assert mock_container.stop.call_count == 1 - assert mock_container.remove.call_count == 1 - assert len(safe_docker._containers) == 0 - - def test_run_detach(self): - """ - Tests detach=True is passed to the underlying call by default - """ - mock_container = create_mock_container() - - # Test detach=True is passed in even if not specified - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["detach"] is True - - # Test passing in detach=True does not cause any issues - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command", detach=True) == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["detach"] is True - - # Test detach=False fails - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - with self.assertRaises(ValueError): - safe_docker.run("image", "command", detach=False) - assert mock_client.containers.run.call_args_list == [] - - def test_jenkins_vars(self): - """ - Tests jenkins environment variables are appropriately passed to 
the underlying docker run call - """ - # NOTE: It's important that these variables are passed to the underlying docker container - # These variables are passed to the container so the process tree killer can find runaway - # process inside the container - # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller - # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393 - - jenkins_vars = { - "BUILD_NUMBER": "BUILD_NUMBER_5", - "BUILD_ID": "BUILD_ID_1", - "BUILD_TAG": "BUILD_TAG_7" - } - mock_container = create_mock_container() - - # Test environment is empty if the jenkins vars are not present - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["environment"] == {} - - # Test environment contains jenkins env vars if they are present - with MockDockerClient(mock_container) as mock_client: - with patch.dict(os.environ, jenkins_vars): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["environment"] == jenkins_vars - - # Test jenkins env vars are added to callers env vars - user_env = {"key1": "value1", "key2": "value2"} - with MockDockerClient(mock_container) as mock_client: - with patch.dict(os.environ, jenkins_vars): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command", environment=user_env) == 0 - assert mock_client.containers.run.call_count == 1 - _, kwargs = mock_client.containers.run.call_args - assert kwargs["environment"] == {**jenkins_vars, **user_env} - - def test_run_args_kwargs_passed(self): - """ - Tests args and kwargs are passed to the container run call - """ - mock_container = 
create_mock_container() - - # Test detach=True is passed in even if not specified - with MockDockerClient(mock_container) as mock_client: - safe_docker = SafeDockerClient() - assert safe_docker.run( - "image", - "command", - "another_arg", - str_param="value", - bool_param=True, - none_param=None, - int_param=5, - float_param=5.2, - list_param=["this", "is", "a", "list"], - map_param={ - "a": "5", - "b": True, - "c": 2 - }) == 0 - assert mock_client.containers.run.call_args_list == [ - call( - "image", - "command", - "another_arg", - detach=True, - environment={}, - str_param="value", - bool_param=True, - none_param=None, - int_param=5, - float_param=5.2, - list_param=["this", "is", "a", "list"], - map_param={ - "a": "5", - "b": True, - "c": 2 - } - ) - ] - - def test_container_returns_non_zero_status_code(self): - """ - Tests non-zero code from container is returned and the container - is cleaned up - """ - mock_container = create_mock_container(status_code=10) - with MockDockerClient(mock_container): - safe_docker = SafeDockerClient() - # check return code and that container gets cleaned up - assert safe_docker.run("image", "command") == 10 - assert mock_container.stop.call_count == 1 - assert mock_container.remove.call_count == 1 - assert len(safe_docker._containers) == 0 - - def test_container_wait_raises_returns_150(self): - """ - Tests 150 is returned if an error is raised when calling container.wait - """ - mock_container = create_mock_container() - mock_container.wait.side_effect = RuntimeError("Something bad happened") - with MockDockerClient(mock_container): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 150 - - def test_container_stop_raises_returns_151(self): - """ - Tests 151 is returned if an error is raised when calling container.stop - """ - mock_container = create_mock_container() - mock_container.stop.side_effect = RuntimeError("Something bad happened") - with MockDockerClient(mock_container): - safe_docker = 
SafeDockerClient() - assert safe_docker.run("image", "command") == 151 - - def test_container_remove_raises_returns_152(self): - """ - Tests 152 is returned if an error is raised when calling container.remove - """ - mock_container = create_mock_container() - mock_container.remove.side_effect = RuntimeError("Something bad happened") - with MockDockerClient(mock_container): - safe_docker = SafeDockerClient() - assert safe_docker.run("image", "command") == 152 - - def test_main(self): - """ - Tests main function against different command line arguments - """ - tests = [ - # ( supplied command line arguments, expected call ) - ( - ["image", "command"], - call("image", command="command", runtime=None, user=None, name=None, volumes={}, cap_add=[]) - ), - ( - ["image", "command", "arg1", "arg2"], - call("image", command="command arg1 arg2", runtime=None, user=None, name=None, volumes={}, cap_add=[]) - ), - ( - ["--runtime", "nvidia", "image", "command"], - call("image", command="command", runtime="nvidia", user=None, name=None, volumes={}, cap_add=[]) - ), - ( - ["--user", "1001:1001", "image", "command"], - call("image", command="command", runtime=None, user="1001:1001", name=None, volumes={}, cap_add=[]) - ), - ([ - "--volume", "/local/path1:/container/path1", - "--volume", "/local/path2:/container/path2:ro", - "image", - "command" - ], call("image", command="command", runtime=None, user=None, name=None, volumes={ - "/local/path1": { - "bind": "/container/path1", - "mode": "rw" - }, - "/local/path2": { - "bind": "/container/path2", - "mode": "ro" - } - }, cap_add=[])), - ([ - "--runtime", "nvidia", - "-u", "1001:1001", - "-v", "/local/path1:/container/path1", - "-v", "/local/path2:/container/path2:ro", - "--cap-add", "bob", - "--cap-add", "jimmy", - "--name", - "container_name", - "image", - "command", - "arg1", - "arg2" - ], call( - "image", - command="command arg1 arg2", - runtime="nvidia", - user="1001:1001", - name="container_name", - volumes={ - "/local/path1": { 
- "bind": "/container/path1", - "mode": "rw" - }, - "/local/path2": { - "bind": "/container/path2", - "mode": "ro" - } - }, cap_add=["bob", "jimmy"]) - ) - ] - - # Tests valid arguments - mock_docker = create_autospec(SafeDockerClient) - mock_docker.run.return_value = 0 - with patch("safe_docker_run.SafeDockerClient", return_value=mock_docker): - for test in tests: - arguments, expected_call = test - main(arguments) - assert mock_docker.run.call_args == expected_call - - # Tests invalid arguments - tests = [ - [], - None, - ["image"], - # Test some bad volume mounts - ["-v", "bob", "image", "args"], - ["-v", "/local/path", "image", "args"], - ["-v", "/local/path:/container/path:blah", "image", "args"], - ["-v", "", "image", "args"], - ["-v", "a:b:c:d", "image", "args"] - ] - - mock_docker = create_autospec(SafeDockerClient) - with patch("safe_docker_run.SafeDockerClient", return_value=mock_docker): - with self.assertRaises(SystemExit): - for test in tests: - main(test) - - def test_clean_up(self): - """ - Tests container clean up in case of SIGTERM and SIGINT - """ - import subprocess - import time - import docker.errors - - docker_client = docker.from_env() - container_name = "safedockertestcontainer1234" - - def get_container(name: str) -> Optional[Container]: - try: - return docker_client.containers.get(name) - except docker.errors.NotFound: - return None - - def remove_container_if_exists(name: str): - container = get_container(name) - if container: - container.stop() - container.remove() - - def wait_for_container(name: str) -> bool: - for _ in itertools.count(5): - if get_container(name): - return True - time.sleep(1) - return False - - # Clear any containers with container name - remove_container_if_exists(container_name) - - # None => not signal is emitted - we should still finish with no containers at the end due - # to the atexit - for sig in [None, signal.SIGTERM, signal.SIGINT]: - # Execute the safe docker run script in a different process - proc = 
subprocess.Popen(['./safe_docker_run.py', "--name", container_name, "ubuntu:18.04", "sleep 10"]) - # NOTE: we need to wait for the container to come up as not all operating systems support blocking signals - if wait_for_container(container_name) is False: - raise RuntimeError("Test container did not come up") - - # Issue the signal and wait for the process to finish - if sig: - proc.send_signal(sig) - proc.wait() - - # The container should no longer exist - assert get_container(container_name) is None - - -if __name__ == '__main__': - import nose - nose.main() diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index d34148417355..c4d319fcf58b 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -607,6 +607,10 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None, self._num_workers = num_workers if num_workers >= 0 else 0 self._worker_pool = None self._prefetch = max(0, int(prefetch) if prefetch is not None else 2 * self._num_workers) + nd.waitall() + import gc + gc.collect() + nd.waitall() if self._num_workers > 0: if self._thread_pool: self._worker_pool = ThreadPool(self._num_workers, diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index fd8abf1849be..b675aab344ec 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -3035,6 +3035,8 @@ def check_interoperability(op_list): continue if name in ['shares_memory', 'may_share_memory', 'empty_like']: # skip list continue + if name in ['delete']: # https://github.com/apache/incubator-mxnet/issues/18600 + continue if name in ['full_like', 'zeros_like', 'ones_like'] and \ StrictVersion(platform.python_version()) < StrictVersion('3.0.0'): continue diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 4bdaf5203ef1..ff4c0ac2490c 
100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -3573,6 +3573,7 @@ def hybrid_forward(self, F, x): @with_seed() @use_np +@unittest.skip('https://github.com/apache/incubator-mxnet/issues/18600') def test_np_delete(): class TestDelete(HybridBlock): def __init__(self, obj, axis=None): diff --git a/tools/setup_gpu_build_tools.sh b/tools/setup_gpu_build_tools.sh index 05323f95a504..754022a9b516 100755 --- a/tools/setup_gpu_build_tools.sh +++ b/tools/setup_gpu_build_tools.sh @@ -42,7 +42,7 @@ elif [[ $VARIANT == cu102* ]]; then CUDA_VERSION='10.2.89-1' CUDA_PATCH_VERSION='10.2.2.89-1' LIBCUDA_VERSION='440.33.01-0ubuntu1' - LIBCUDNN_VERSION='7.6.5.32-1+cuda10.2' + LIBCUDNN_VERSION='8.0.4.30-1+cuda10.2' LIBNCCL_VERSION='2.5.6-1+cuda10.2' elif [[ $VARIANT == cu101* ]]; then CUDA_VERSION='10.1.105-1'