From 382cc0b98e799e74592eaf5cd79d097f05146021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joonhyung=20Lee/=EC=9D=B4=EC=A4=80=ED=98=95?= <33523965+veritas9872@users.noreply.github.com> Date: Fri, 20 Sep 2024 18:07:47 +0900 Subject: [PATCH] Fix/docs (#215) * Change the `zsh` binary to the one actually being used. * Remove incorrect content from the README. * Update to CUDA 12.4 but lower Python back to 3.10 for the build. * Found that the pip and apt URLs were not being provided to all images from base. * Reformat. --------- Co-authored-by: joonhyung.lee --- README.md | 16 +++++------ docker-compose.yaml | 52 +++++++++++++++++------------------ dockerfiles/ngc.Dockerfile | 8 ++++-- dockerfiles/simple.Dockerfile | 5 +++- dockerfiles/train.Dockerfile | 2 +- tests/test_run.py | 1 + 6 files changed, 45 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index c856493..47007ca 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ If this is your first time using this project, follow these steps: [repo](https://github.com/docker/docker-install). 3. Install the NVIDIA Container Toolkit as specified in this -[link](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). + [link](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). 4. Run `make install-compose` to install Docker Compose V2 for Linux hosts. Installation does _**not**_ require `root` permissions. Visit the @@ -91,9 +91,6 @@ Different Docker Compose services are organized to serve different needs. their projects on the NGC images provided by NVIDIA. Note that the NGC images change between different releases and that configurations for one release may not work for another one. -- `hub` is derived from the official PyTorch Docker Hub image and serves a - similar function as the `ngc` service described above. However, - the PyTorch Docker images have a more stable interface than the NGC images. 
- `simple` is derived from the Official Ubuntu Linux image by default as some corporations restrict the use of Docker images not officially verified by Docker. It installs all packages via `conda` by default and can optionally @@ -169,15 +166,19 @@ can either be downloaded or installed via `apt`, `conda`, or `pip`. ```text # Generated automatically by `make env`. +# When using the `root` user with `UID=0`/`USR=root`, set `ADD_USER=exclude`. GID=1000 UID=1000 GRP=GROUPNAME USR=USERNAME -PROJECT=train-username # `PROJECT` must be in lowercase. +HOST_ROOT=. SERVICE=train -COMMAND=/bin/zsh # Command to execute on starting the container. -IMAGE_NAME=cresset:train-username # `IMAGE_NAME` is also converted to lowercase. +# Do not use the same `PROJECT` name for different projects on the same host! +PROJECT=train-username # `PROJECT` must be in lowercase. PROJECT_ROOT=/opt/project +IMAGE_NAME=cresset:train-username # `IMAGE_NAME` is also converted to lowercase. +COMMAND=/usr/bin/zsh --login # Command to execute on starting the container. +TZ=Asia/Seoul # Set the container timezone. # [[Optional]]: Fill in these configurations manually if the defaults do not suffice. @@ -199,7 +200,6 @@ CUDA_VERSION=11.8.0 # Must be compatible with hardware and CUDA driver. CUDNN_VERSION=8 # Only major version specifications are available. PYTHON_VERSION=3.10 # Specify the Python version. MKL_MODE=include # Enable MKL for Intel CPUs. -TZ=Asia/Seoul # Set the container timezone. # Advanced Usage. TARGET_STAGE=train # Target Dockerfile stage. The `*.whl` files are available in `train-builds`. diff --git a/docker-compose.yaml b/docker-compose.yaml index c377312..fc75e65 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -45,9 +45,9 @@ services: # `ipc: host` removes the shared memory cap but is a known security vulnerability. ipc: host # Equivalent to `--ipc=host` in `docker run`. **Disable this on WSL.** # shm_size: 1GB # Explicit shared memory limit. No security issues this way. 
-# hostname: ${SERVICE} # Makes `pure` terminals easier to tell apart. -# extra_hosts: # Prevents "unknown host" issue when using `sudo`. -# - "${SERVICE}:127.0.0.1" + # hostname: ${SERVICE} # Makes `pure` terminals easier to tell apart. + # extra_hosts: # Prevents "unknown host" issue when using `sudo`. + # - "${SERVICE}:127.0.0.1" # Common environment variables for the container runtime. No effect on build. environment: # Equivalent to `--env` @@ -91,7 +91,7 @@ services: GRP: ${GRP:-user} USR: ${USR:-user} TZ: ${TZ:-UTC} - TMUX_HIST_LIMIT: 50000 # Size of `tmux` scrolling history. + TMUX_HIST_LIMIT: 50000 # Size of `tmux` scrolling history. # Change the `CONDA_URL` for different hardware architectures. # URLs from https://github.com/conda-forge/miniforge are recommended over # Miniconda URLs from https://docs.conda.io/en/latest/miniconda.html. @@ -101,6 +101,13 @@ services: # Installing `mamba` via mini-forge is strongly recommended. CONDA_URL: ${CONDA_URL:-https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh} CONDA_MANAGER: ${CONDA_MANAGER:-mamba} + # URLs for faster `apt` and `pip` installs. + # Use URLs optimized for location and security requirements. + # DEB_OLD: ${DEB_OLD:-http://archive.ubuntu.com} + # DEB_NEW: ${DEB_NEW:-http://kr.archive.ubuntu.com} + INDEX_URL: ${INDEX_URL:-https://pypi.org/simple} + EXTRA_INDEX_URL: ${EXTRA_INDEX_URL:-https://pypi.ngc.nvidia.com} + TRUSTED_HOST: ${TRUSTED_HOST:-pypi.ngc.nvidia.com} deploy: # API dependent on compose version. resources: reservations: @@ -117,38 +124,31 @@ services: dockerfile: dockerfiles/train.Dockerfile args: # Equivalent to `--build-arg`. BUILD_MODE: ${BUILD_MODE:-exclude} -# BUILD_TEST: 0 -# USE_NNPACK: 0 # Disabling NNPack and QNNPack by default as they are -# USE_QNNPACK: 0 # legacy libraries and most users do not need them. -# BUILD_CAFFE2: 0 # Most users do not need Caffe2. 
-# BUILD_CAFFE2_OPS: 0 -# USE_PRECOMPILED_HEADERS: 1 + # BUILD_TEST: 0 + # USE_NNPACK: 0 # Disabling NNPack and QNNPack by default as they are + # USE_QNNPACK: 0 # legacy libraries and most users do not need them. + # BUILD_CAFFE2: 0 # Most users do not need Caffe2. + # BUILD_CAFFE2_OPS: 0 + # USE_PRECOMPILED_HEADERS: 1 LINUX_DISTRO: ${LINUX_DISTRO:-ubuntu} DISTRO_VERSION: ${DISTRO_VERSION:-22.04} - CUDA_VERSION: ${CUDA_VERSION:-12.1.1} - CUDNN_VERSION: ${CUDNN_VERSION:-8} + CUDA_VERSION: ${CUDA_VERSION:-12.4.1} + CUDNN_VERSION: ${CUDNN_VERSION} # Leave empty for CUDA 12.4+. IMAGE_FLAVOR: ${IMAGE_FLAVOR:-devel} - PYTHON_VERSION: ${PYTHON_VERSION:-3.12} + PYTHON_VERSION: ${PYTHON_VERSION:-3.10} MKL_MODE: ${MKL_MODE:-include} # MKL_MODE can be `include` or `exclude`. # Fails if `BUILD_MODE=include` but `CCC` is not set explicitly. TORCH_CUDA_ARCH_LIST: ${CCC} # Ignore the missing CCC warning otherwise. # Variables for building PyTorch. Must be valid git tags or commits. - PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v2.4.0} - TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.19.0} + PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v2.4.1} + TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.19.1} # Variables for downloading PyTorch instead of building. - PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu121} + PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu124} # Set `PYTORCH_FETCH_NIGHTLY` to any value to fetch the nightly binaries. # Also remember to change the index url to the nightly version. PYTORCH_FETCH_NIGHTLY: ${PYTORCH_FETCH_NIGHTLY:+--pre} - PYTORCH_VERSION: ${PYTORCH_VERSION:-2.4.0} - TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.19.0} - # URLs for faster `apt` and `pip` installs. - # Use URLs optimized for location and security requirements. 
-# DEB_OLD: ${DEB_OLD:-http://archive.ubuntu.com} -# DEB_NEW: ${DEB_NEW:-http://kr.archive.ubuntu.com} - INDEX_URL: ${INDEX_URL:-https://pypi.org/simple} - EXTRA_INDEX_URL: ${INDEX_URL:-https://pypi.ngc.nvidia.com} - TRUSTED_HOST: ${TRUSTED_HOST:-pypi.ngc.nvidia.com} + PYTORCH_VERSION: ${PYTORCH_VERSION:-2.4.1} + TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.19.1} devel: # Skeleton service for development and debugging. extends: # This service may be useful for PyTorch CUDA/C++ contributors. @@ -167,7 +167,7 @@ services: dockerfile: dockerfiles/ngc.Dockerfile args: NGC_YEAR: ${NGC_YEAR:-24} - NGC_MONTH: ${NGC_MONTH:-07} + NGC_MONTH: ${NGC_MONTH:-08} simple: # Service installed purely from official/verified Docker images and `conda`. extends: diff --git a/dockerfiles/ngc.Dockerfile b/dockerfiles/ngc.Dockerfile index f6e5801..be23712 100644 --- a/dockerfiles/ngc.Dockerfile +++ b/dockerfiles/ngc.Dockerfile @@ -90,11 +90,13 @@ ENV SHELL='' # Install `apt` requirements. # `tzdata` requires noninteractive mode. 
ARG TZ +ARG DEB_OLD +ARG DEB_NEW ARG DEBIAN_FRONTEND=noninteractive RUN --mount=type=bind,from=stash,source=/tmp/apt,target=/tmp/apt \ ln -snf /usr/share/zoneinfo/${TZ} /etc/localtime && echo ${TZ} > /etc/timezone && \ - apt-get update && \ - sed -e 's/#.*//g' -e 's/\r//g' /tmp/apt/requirements.txt | \ + if [ ${DEB_NEW} ]; then sed -i "s%${DEB_OLD}%${DEB_NEW}%g" /etc/apt/sources.list; fi && \ + apt-get update && sed -e 's/#.*//g' -e 's/\r//g' /tmp/apt/requirements.txt | \ xargs -r apt-get install -y --no-install-recommends && \ rm -rf /var/lib/apt/lists/* @@ -207,4 +209,4 @@ ENV PYTHONPATH=${PYTHONPATH}:/usr/local/lib/python3/dist-packages ENV PYTHONPATH=${PYTHONPATH}:/opt/conda/lib/python3/site-packages WORKDIR ${PROJECT_ROOT} -CMD ["/bin/zsh"] +CMD ["/usr/bin/zsh"] diff --git a/dockerfiles/simple.Dockerfile b/dockerfiles/simple.Dockerfile index d02952b..24e4252 100644 --- a/dockerfiles/simple.Dockerfile +++ b/dockerfiles/simple.Dockerfile @@ -165,9 +165,12 @@ ARG PYTHONUNBUFFERED=1 # The `--mount=type=bind` temporarily mounts a directory from another stage. # `tzdata` requires noninteractive mode. 
ARG TZ +ARG DEB_OLD +ARG DEB_NEW ARG DEBIAN_FRONTEND=noninteractive RUN --mount=type=bind,from=stash,source=/tmp/apt,target=/tmp/apt \ ln -snf /usr/share/zoneinfo/${TZ} /etc/localtime && echo ${TZ} > /etc/timezone && \ + if [ ${DEB_NEW} ]; then sed -i "s%${DEB_OLD}%${DEB_NEW}%g" /etc/apt/sources.list; fi && \ apt-get update && sed -e 's/#.*//g' -e 's/\r//g' /tmp/apt/requirements.txt | \ xargs apt-get install -y --no-install-recommends && \ rm -rf /var/lib/apt/lists/* @@ -260,4 +263,4 @@ ENV PATH=/opt/conda/bin:${PATH} ARG PROJECT_ROOT=/opt/project ENV PYTHONPATH=${PROJECT_ROOT}${PYTHONPATH:+:${PYTHONPATH}} WORKDIR ${PROJECT_ROOT} -CMD ["/bin/zsh"] +CMD ["/usr/bin/zsh"] diff --git a/dockerfiles/train.Dockerfile b/dockerfiles/train.Dockerfile index 77ba5c4..e84eb61 100644 --- a/dockerfiles/train.Dockerfile +++ b/dockerfiles/train.Dockerfile @@ -559,4 +559,4 @@ ENV PATH=/opt/conda/bin:${PATH} ARG PROJECT_ROOT=/opt/project ENV PYTHONPATH=${PROJECT_ROOT}${PYTHONPATH:+:${PYTHONPATH}} WORKDIR ${PROJECT_ROOT} -CMD ["/bin/zsh"] +CMD ["/usr/bin/zsh"] diff --git a/tests/test_run.py b/tests/test_run.py index 3afa235..d0af9f7 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -183,5 +183,6 @@ def _get_cuda_info(device): # Using as a fixture to get device info. ], capture_output=True, text=True, + check=False, ).stdout.strip() logger.info(f"NVIDIA Driver Version: {dv}")