diff --git a/.dockerignore b/.dockerignore index a5aa48cb04..8e4e560ff5 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,8 @@ # Adding to .gitignore helps reduce the size of your working_dir -.git +# Note: removing .git from .dockerignore since it is valuable to have the git history to +# know where this container was built +# .git *.out *.log *.tar diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index f112c9ca26..2e2d178dc5 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -162,13 +162,15 @@ jobs: build-container: if: ${{ needs.pre-flight.outputs.test_level != 'none' }} needs: [pre-flight] - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.30.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.52.0 with: build-ref: ${{ github.sha }} image-name: nemo_rl_container dockerfile: docker/Dockerfile image-label: nemo-rl target: hermetic + build-contexts: | + nemo-rl=. build-args: | MAX_JOBS=32 NEMO_RL_COMMIT=${{ github.sha }} diff --git a/docker/Dockerfile b/docker/Dockerfile index 828156d039..b12e1b929f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,14 @@ +# Usage: +# Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push . +# Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push . +# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git +# Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push . 
+ ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04 +FROM scratch AS nemo-rl +ARG NRL_GIT_REF=main +ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} / + FROM ${BASE_IMAGE} AS base # It is more convenient for users to run as root @@ -65,8 +75,8 @@ VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install --link-mode symlink flash-att EOF # First copy only the dependency files -COPY pyproject.toml uv.lock ./ -COPY --link 3rdparty/ ./3rdparty/ +COPY --from=nemo-rl pyproject.toml uv.lock ./ +COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/ RUN <<"EOF" bash -exu # uv sync has a more reliable resolver than simple uv pip install which can fail @@ -100,7 +110,11 @@ LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}" ENV NEMO_RL_VENV_DIR=/opt/ray_venvs -# Copy in source and prefetch all virtual environments -COPY . /opt/nemo-rl +# Copy in source from build context (defaults to cloned repo, can be overridden) +COPY --from=nemo-rl . /opt/nemo-rl +# Unshallow the repo to get the full history (in the case it was from the scratch layer). +# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history), +# so do a quick check before trying to unshallow. +RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py diff --git a/docker/README.md b/docker/README.md index b21c3e7401..66b1da6855 100644 --- a/docker/README.md +++ b/docker/README.md @@ -3,8 +3,8 @@ NOTE: *We use `docker buildx` instead of `docker build` for these containers* This directory contains the `Dockerfile` for NeMo-RL Docker images. You can build two types of images: -- A **base image**: A minimal image where Python dependencies can be specified at runtime. -- A **hermetic image**: An image that includes default dependencies for offline use. 
+- A **release image** (recommended): Contains everything from the hermetic image, plus the nemo-rl source code and pre-fetched virtual environments for isolated workers. +- A **hermetic image**: Includes the base image plus pre-fetched NeMo RL python packages in the `uv` cache. For detailed instructions on building these images, please see [docs/docker.md](../docs/docker.md). \ No newline at end of file diff --git a/docs/docker.md b/docs/docker.md index 1157e92ebc..f6f93fc1b8 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -1,39 +1,50 @@ # Build Docker Images -This guide provides three methods for building Docker images: +This guide provides two methods for building Docker images: * **release**: Contains everything from the hermetic image, plus the nemo-rl source code and pre-fetched virtual environments for isolated workers. * **hermetic**: Includes the base image plus pre-fetched NeMo RL python packages in the `uv` cache. -* **base**: A minimal image with CUDA, `ray`, and `uv` installed, ideal for specifying Python dependencies at runtime. Use the: * **release** (recommended): if you want to pre-fetch the NeMo RL [worker virtual environments](./design-docs/uv.md#worker-configuration) and copy in the project source code. * **hermetic**: if you want to pre-fetch NeMo RL python packages into the `uv` cache to eliminate the initial overhead of program start. -* **base**: if you just need a minimal image with CUDA, `ray`, and `uv` installed and are okay with dynamically downloading your requirements at runtime. This option trades off fast container download/startup with slower initial overhead to download python packages. ## Release Image The release image is our recommended option as it provides the most complete environment. It includes everything from the hermetic image, plus the nemo-rl source code and pre-fetched virtual environments for isolated workers. This is the ideal choice for production deployments. 
```sh -cd docker/ -docker buildx build --target release -t nemo_rl -f Dockerfile .. +# Self-contained build (default: builds from main): +docker buildx build --target release -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push . + +# Self-contained build (specific git ref): +docker buildx build --target release -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push . + +# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): +docker buildx build --target release -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git + +# Local NeMo RL source override: +docker buildx build --target release --build-context nemo-rl=. -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push . ``` +**Note:** The `--tag <registry>/nemo-rl:latest --push` flags are not necessary if you just want to build locally. + ## Hermetic Image The hermetic image includes all Python dependencies pre-downloaded in the `uv` cache, eliminating the initial overhead of downloading packages at runtime. This is useful when you need a more predictable environment or have limited network connectivity. ```sh -cd docker/ -docker buildx build --target hermetic -t nemo_rl -f Dockerfile .. -``` +# Self-contained build (default: builds from main): +docker buildx build --target hermetic -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push . -## Base Image +# Self-contained build (specific git ref): +docker buildx build --target hermetic -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push . -The base image provides a minimal environment with CUDA, `ray`, and `uv` installed. While it's the smallest image, it requires downloading Python dependencies at runtime, which may not be ideal for all use cases. 
+# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): +docker buildx build --target hermetic -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git -```sh -cd docker/ -docker buildx build --target base -t nemo_rl -f Dockerfile .. +# Local NeMo RL source override: +docker buildx build --target hermetic --build-context nemo-rl=. -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push . ``` + +**Note:** The `--tag <registry>/nemo-rl:latest --push` flags are not necessary if you just want to build locally.