Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
8494039
Optionally build RL with ngc torch container
chtruong814 Jul 27, 2025
29a9732
Do not install TE when using pytorch image
chtruong814 Jul 27, 2025
c5a07b3
Exclude transformer-engine-cu-12 when using pytorch container
chtruong814 Jul 28, 2025
9ff3a6b
Do not install custom python in ngc torch container
chtruong814 Jul 29, 2025
ced97c4
Add no-install-pytorch-deps
chtruong814 Jul 29, 2025
fc49eb3
Use system executable
chtruong814 Jul 29, 2025
d711bc3
Update mcore ref to use 0.13.0 fork with sahil cherry-picks
chtruong814 Jul 29, 2025
51fe21c
Revert original Dockerfile
chtruong814 Aug 8, 2025
bbee575
Revert "Revert original Dockerfile"
chtruong814 Aug 8, 2025
32e09ff
Build vllm with uv
chtruong814 Aug 8, 2025
b452cc0
Fix vllm build in uv
chtruong814 Aug 8, 2025
9dba4a7
Fix vllm output directory
chtruong814 Aug 8, 2025
ddf4d3c
Revert docker container
chtruong814 Aug 9, 2025
0c73b13
Add ngc pytorch Dockerfile
chtruong814 Aug 9, 2025
26b42a0
Use system executable given NEMO_RL_PY_EXECUTABLES_SYSTEM
chtruong814 Aug 9, 2025
45c9f9a
Ensure uv is installed
chtruong814 Aug 9, 2025
a15ac1c
Fix vllm build
chtruong814 Aug 9, 2025
bed0c3d
Fix vllm install
chtruong814 Aug 9, 2025
534e426
Fix no install env var
chtruong814 Aug 9, 2025
f0ad336
Fix uv install
chtruong814 Aug 9, 2025
d5517cf
Ensure numpy is not upgraded during install
chtruong814 Aug 11, 2025
150b17f
Fix ngc override location
chtruong814 Aug 11, 2025
653a39b
Do not use uv to build vllm
chtruong814 Aug 11, 2025
b17c3c6
Fix vllm build
chtruong814 Aug 11, 2025
aed522f
Build vllm with pip
chtruong814 Aug 12, 2025
4a2eec4
Ensure numpy is not upgraded during vllm install
chtruong814 Aug 12, 2025
2b05167
Do not install triton
chtruong814 Aug 12, 2025
3a1ddd0
Attempt to override numpy and vllm install
chtruong814 Aug 12, 2025
54f69b5
Install vllm with no-deps initially
chtruong814 Aug 12, 2025
1006b12
Use uv to install vllm with no-deps
chtruong814 Aug 12, 2025
0616be6
Prevent uv sync from removing vllm
chtruong814 Aug 12, 2025
92df91f
Add UV_CACHE_DIR
chtruong814 Aug 12, 2025
1c8bfb5
Remove ngc overrides file
chtruong814 Aug 12, 2025
eef6b59
Remove ngc overrides file from .gitignore
chtruong814 Aug 12, 2025
8995d0c
Remove prefetch in original Dockerfile
chtruong814 Aug 12, 2025
8675d2f
Fix lint error in ray actor registry
chtruong814 Aug 12, 2025
ea9803c
Revert "Update mcore ref to use 0.13.0 fork with sahil cherry-picks"
chtruong814 Aug 13, 2025
4e07ced
Update Dockerfile based on feedback
chtruong814 Aug 14, 2025
0096d65
Remove unused file from .gitignore
chtruong814 Aug 14, 2025
b734d24
Revert Dockerfile
chtruong814 Aug 14, 2025
aba3b85
Merge remote-tracking branch 'origin/main' into chtruong/build-ngc-torch
chtruong814 Aug 14, 2025
bfe5610
Add comment around installing new dependencies
chtruong814 Aug 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ hf_datasets_cache/
datasets/
docker/*
!docker/Dockerfile
!docker/Dockerfile.ngc_pytorch
!docker/README.md
wandb/
checkpoints/
Expand Down
128 changes: 128 additions & 0 deletions docker/Dockerfile.ngc_pytorch
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# This Dockerfile is used to build a Docker image for NeMo RL with the NGC PyTorch base image.
# However, it is still a work in progress and is not yet ready for production use.
#
# Usage:
# Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push .
# Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push .
# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git
# Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push .
#
# If installing new dependencies in the container, then use "uv pip install new-dependency"
# Default base image: the NGC PyTorch container that this Dockerfile targets.
# Override with "--build-arg BASE_IMAGE=<image>" to build on a different base.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.06-py3

# Source stage: contains a checkout of NeMo RL at NRL_GIT_REF (git metadata kept).
# It is referenced below via "COPY --from=nemo-rl" and can be replaced with a local
# tree by passing "--build-context nemo-rl=." (see Usage above).
FROM scratch AS nemo-rl
ARG NRL_GIT_REF=main
ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /

# Common base stage: the selected base image plus basic CLI tooling and uv.
FROM ${BASE_IMAGE} AS base

# It is more convenient for users to run as root
USER root

# Install command-line utilities, then remove the apt caches and package lists.
RUN <<"EOF" bash -exu -o pipefail
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

apt-get update
apt-get install -y --no-install-recommends \
    jq \
    curl \
    git \
    rsync \
    wget \
    less \
    vim

apt-get clean
rm -rf /var/lib/apt/lists/*
EOF

# Install uv at /usr/local/bin in case the root home directory is bind mounted.
# The installer is run under bash with pipefail so that a failed download fails the
# build; with a plain "curl | sh" a failed curl hands sh an empty script, which exits 0.
ARG UV_VERSION=0.7.2
RUN <<"EOF" bash -exu -o pipefail
curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | XDG_BIN_HOME=/usr/local/bin sh
EOF

# Disable usage stats by default for users who are sensitive to sharing usage.
# Users are encouraged to enable if they wish.
ENV RAY_USAGE_STATS_ENABLED=0
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Build vLLM from source to use with the NVIDIA PyTorch base image.
# The resulting wheel is left in /opt/vllm and is consumed by the hermetic stage
# through a bind mount of this stage's /opt directory.
FROM base AS build_vllm

ARG MAX_JOBS=32
WORKDIR /opt
COPY --from=nemo-rl uv.lock /tmp/uv.lock

RUN <<"EOF" bash -exu -o pipefail
echo "Building vLLM from source for PyTorch base image"

# Build the same vLLM version that uv.lock pins. Fail here, with a clear message, if
# the version cannot be extracted instead of failing later on "git checkout v".
if ! VLLM_VERSION=$(grep -A 1 'name = "vllm"' /tmp/uv.lock | grep 'version =' | sed 's/version = "\(.*\)"/\1/') \
    || [ -z "$VLLM_VERSION" ]; then
    echo "ERROR: could not determine the vLLM version from /tmp/uv.lock" >&2
    exit 1
fi
echo "Building vLLM version: $VLLM_VERSION"

git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout "v$VLLM_VERSION"

# Per the vLLM install docs, this script adjusts the requirements so the build uses
# the PyTorch that is already installed in the image.
python use_existing_torch.py
pip install -r requirements/build.txt

# --no-deps / --no-build-isolation: build only the vLLM wheel, against the packages
# already present in the image.
pip wheel --no-deps --no-build-isolation -v .
EOF

# Dependency stage: creates the project virtual environment and installs the locked
# dependencies (plus the vLLM wheel built in build_vllm). Only pyproject.toml, uv.lock
# and 3rdparty/ are copied in here; the project itself is not installed in this stage
# (--no-install-project).
FROM base AS hermetic

WORKDIR /opt/nemo-rl

# Variables to control the build of TE. If there are issues with parallelization, consider
# setting these to 1.
ARG MAX_JOBS
ARG NVTE_BUILD_THREADS_PER_JOB

# uv configuration: location of the project venv, location of the uv cache, and the
# default link mode used when installing packages.
ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
ENV UV_CACHE_DIR=/opt/uv_cache
ENV UV_LINK_MODE=copy

# Define the no-install-package arguments for PyTorch base images
# BASE_IMAGE is declared before the first FROM, so it has to be re-declared to be
# visible inside this stage.
ARG BASE_IMAGE
# Flags passed to "uv sync" below so that it does not install these packages into the
# venv. The value is also exported as an environment variable so that the release
# stage's "uv sync" can reuse it.
ARG UV_NO_INSTALL_PACKAGES="--no-install-package torch --no-install-package torchvision --no-install-package triton --no-install-package nvidia-cublas-cu12 --no-install-package nvidia-cuda-cupti-cu12 --no-install-package nvidia-cuda-nvrtc-cu12 --no-install-package nvidia-cuda-runtime-cu12 --no-install-package nvidia-cudnn-cu12 --no-install-package nvidia-cufft-cu12 --no-install-package nvidia-cufile-cu12 --no-install-package nvidia-curand-cu12 --no-install-package nvidia-cusolver-cu12 --no-install-package nvidia-cusparse-cu12 --no-install-package nvidia-cusparselt-cu12 --no-install-package nvidia-nccl-cu12 --no-install-package vllm --no-install-package flash-attn --no-install-package transformer-engine --no-install-package transformer-engine-cu12 --no-install-package transformer-engine-torch --no-install-package numpy"
ENV UV_NO_INSTALL_PACKAGES=${UV_NO_INSTALL_PACKAGES}
# Put the venv's executables first on PATH (same directory as UV_PROJECT_ENVIRONMENT).
ENV PATH="/opt/nemo_rl_venv/bin:$PATH"

# First copy only the dependency files
COPY --from=nemo-rl pyproject.toml uv.lock ./
COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/


# The build_vllm stage's /opt is bind-mounted at /tmp/build_vllm for this step only, so
# the vLLM wheel can be installed without copying it into a layer.
# Steps performed by the script:
#   1. Create the venv with access to the base image's site-packages.
#   2. Install the locally built vLLM wheel without its dependencies.
#   3. Sync the locked dependencies; --inexact keeps packages that are not part of the
#      sync (such as the vLLM wheel installed in step 2) instead of removing them.
RUN --mount=type=bind,from=build_vllm,source=/opt/,target=/tmp/build_vllm/ <<"EOF" bash -exu

# uv sync has a more reliable resolver than simple uv pip install which can fail
# The venv is symlinked to avoid bloating the layer size
uv venv --system-site-packages ${UV_PROJECT_ENVIRONMENT}
uv pip install --no-cache-dir --no-deps /tmp/build_vllm/vllm/vllm*.whl
uv sync --link-mode symlink --locked --inexact --extra vllm --extra mcore --extra automodel --all-groups --no-install-project $UV_NO_INSTALL_PACKAGES
EOF

# NOTE(review): NEMO_RL_VENV_DIR and WORKDIR below repeat values already set earlier
# (base stage and the top of this stage).
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

WORKDIR /opt/nemo-rl

# Final stage: adds build metadata, runtime environment defaults and the NeMo RL source
# on top of the dependency environment built in the hermetic stage.
FROM hermetic AS release

# Optional build metadata supplied with --build-arg.
ARG NEMO_RL_COMMIT
ARG NVIDIA_BUILD_ID
ARG NVIDIA_BUILD_REF
# NOTE(review): UV_NO_SYNC is a uv setting; presumably it stops "uv run" from re-syncing
# the environment inside the container — confirm against the uv documentation.
ENV UV_NO_SYNC=1
# Metadata values fall back to the literal "<unknown>" when the build arg is unset or empty.
ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-<unknown>}
ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-<unknown>}
ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-<unknown>}
# Read by nemo_rl/distributed/ray_actor_environment_registry.py: when set to "1" the
# vLLM and MCore workers use the system Python executable.
ENV NEMO_RL_PY_EXECUTABLES_SYSTEM=1
# The 25.06 Pytorch container is not compatible with vllm standalone compile so we disable it
ENV VLLM_USE_STANDALONE_COMPILE=0
LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"

# NOTE(review): same value as already set in the base and hermetic stages.
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Copy in source from build context (defaults to cloned repo, can be overridden)
COPY --from=nemo-rl . /opt/nemo-rl
# Unshallow the repo to get the full history (in the case it was from the scratch layer).
# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history),
# so do a quick check before trying to unshallow.
# The trailing "|| true" makes this step best-effort: it never fails the build.
RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true
# Sync again now that the source is present. Unlike the hermetic stage this call does not
# pass --no-install-project; it reuses the same --no-install-package exclusions and
# --inexact so packages already in the venv are kept.
RUN UV_LINK_MODE=symlink uv sync --locked --inexact $UV_NO_INSTALL_PACKAGES
18 changes: 14 additions & 4 deletions nemo_rl/distributed/ray_actor_environment_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES

# Opt-in switch: when the NEMO_RL_PY_EXECUTABLES_SYSTEM environment variable is exactly
# "1", the vLLM and MCore executables below resolve to the system executable instead of
# their dedicated ones. Any other value (or an unset variable) keeps the defaults.
USE_SYSTEM_EXECUTABLE = os.getenv("NEMO_RL_PY_EXECUTABLES_SYSTEM", "0") == "1"

if USE_SYSTEM_EXECUTABLE:
    VLLM_EXECUTABLE = PY_EXECUTABLES.SYSTEM
    MCORE_EXECUTABLE = PY_EXECUTABLES.SYSTEM
else:
    VLLM_EXECUTABLE = PY_EXECUTABLES.VLLM
    MCORE_EXECUTABLE = PY_EXECUTABLES.MCORE

ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = {
"nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": PY_EXECUTABLES.VLLM,
"nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": PY_EXECUTABLES.VLLM,
"nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": VLLM_EXECUTABLE,
"nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": VLLM_EXECUTABLE,
# Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM.
# This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved.
"nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": PY_EXECUTABLES.VLLM,
"nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": PY_EXECUTABLES.MCORE,
"nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": VLLM_EXECUTABLE,
"nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": MCORE_EXECUTABLE,
"nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM,
"nemo_rl.environments.code_environment.CodeEnvironment": PY_EXECUTABLES.SYSTEM,
"nemo_rl.environments.games.sliding_puzzle.SlidingPuzzleEnv": PY_EXECUTABLES.SYSTEM,
Expand Down