diff --git a/.gitignore b/.gitignore
index 55a992fece..926b8a11de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@ hf_datasets_cache/
 datasets/
 docker/*
 !docker/Dockerfile
+!docker/Dockerfile.ngc_pytorch
 !docker/README.md
 wandb/
 checkpoints/
diff --git a/docker/Dockerfile.ngc_pytorch b/docker/Dockerfile.ngc_pytorch
new file mode 100644
index 0000000000..dc874b2189
--- /dev/null
+++ b/docker/Dockerfile.ngc_pytorch
@@ -0,0 +1,128 @@
+# This Dockerfile is used to build a Docker image for NeMo RL with the NGC PyTorch base image.
+# However, it is still a work in progress and is not yet ready for production use.
+#
+# Usage:
+# Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push .
+# Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push .
+# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git
+# Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push .
+#
+# If installing new dependencies in the container, then use "uv pip install new-dependency"
+ARG BASE_IMAGE=nvcr.io/nvidia/nemo:25.06-py3
+FROM scratch AS nemo-rl
+ARG NRL_GIT_REF=main
+ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /
+
+FROM ${BASE_IMAGE} AS base
+
+# It is more convenient for users to run as root
+USER root
+
+RUN <<"EOF" bash -exu -o pipefail
+export DEBIAN_FRONTEND=noninteractive
+export TZ=America/Los_Angeles
+
+apt-get update
+apt-get install -y --no-install-recommends \
+    jq \
+    curl \
+    git \
+    rsync \
+    wget \
+    less \
+    vim
+
+
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+EOF
+
+# Install uv at /usr/local/bin in case the root home directory is bind mounted
+ARG UV_VERSION=0.7.2
+RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | XDG_BIN_HOME=/usr/local/bin sh
+
+# Disable usage stats by default for users who are sensitive to sharing usage.
+# Users are encouraged to enable if they wish.
+ENV RAY_USAGE_STATS_ENABLED=0
+ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
+
+# Build vLLM from source to use with the NVIDIA PyTorch base image
+FROM base AS build_vllm
+
+ARG MAX_JOBS=32
+WORKDIR /opt
+COPY --from=nemo-rl uv.lock /tmp/uv.lock
+
+RUN <<"EOF" bash -exu
+echo "Building vLLM from source for PyTorch base image"
+VLLM_VERSION=$(grep -A 1 'name = "vllm"' /tmp/uv.lock | grep 'version =' | sed 's/version = "\(.*\)"/\1/') && \
+echo "Building vLLM version: $VLLM_VERSION"
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+git checkout v$VLLM_VERSION
+python use_existing_torch.py
+pip install -r requirements/build.txt
+pip wheel --no-deps --no-build-isolation -v .
+EOF
+
+FROM base AS hermetic
+
+WORKDIR /opt/nemo-rl
+
+# Variables to control the build of TE. If there are issues with parallelization, consider
+# setting these to 1.
+ARG MAX_JOBS +ARG NVTE_BUILD_THREADS_PER_JOB + +ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv +ENV UV_CACHE_DIR=/opt/uv_cache +ENV UV_LINK_MODE=copy + +# Define the no-install-package arguments for PyTorch base images +ARG BASE_IMAGE +ARG UV_NO_INSTALL_PACKAGES="--no-install-package torch --no-install-package torchvision --no-install-package triton --no-install-package nvidia-cublas-cu12 --no-install-package nvidia-cuda-cupti-cu12 --no-install-package nvidia-cuda-nvrtc-cu12 --no-install-package nvidia-cuda-runtime-cu12 --no-install-package nvidia-cudnn-cu12 --no-install-package nvidia-cufft-cu12 --no-install-package nvidia-cufile-cu12 --no-install-package nvidia-curand-cu12 --no-install-package nvidia-cusolver-cu12 --no-install-package nvidia-cusparse-cu12 --no-install-package nvidia-cusparselt-cu12 --no-install-package nvidia-nccl-cu12 --no-install-package vllm --no-install-package flash-attn --no-install-package transformer-engine --no-install-package transformer-engine-cu12 --no-install-package transformer-engine-torch --no-install-package numpy" +ENV UV_NO_INSTALL_PACKAGES=${UV_NO_INSTALL_PACKAGES} +ENV PATH="/opt/nemo_rl_venv/bin:$PATH" + +# First copy only the dependency files +COPY --from=nemo-rl pyproject.toml uv.lock ./ +COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/ + + +RUN --mount=type=bind,from=build_vllm,source=/opt/,target=/tmp/build_vllm/ <<"EOF" bash -exu + +# uv sync has a more reliable resolver than simple uv pip install which can fail +# The venv is symlinked to avoid bloating the layer size +uv venv --system-site-packages ${UV_PROJECT_ENVIRONMENT} +uv pip install --no-cache-dir --no-deps /tmp/build_vllm/vllm/vllm*.whl +uv sync --link-mode symlink --locked --inexact --extra vllm --extra mcore --extra automodel --all-groups --no-install-project $UV_NO_INSTALL_PACKAGES +EOF + +ENV NEMO_RL_VENV_DIR=/opt/ray_venvs + +WORKDIR /opt/nemo-rl + +FROM hermetic AS release + +ARG NEMO_RL_COMMIT +ARG NVIDIA_BUILD_ID +ARG NVIDIA_BUILD_REF +ENV UV_NO_SYNC=1 
+ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-}
+ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-}
+ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-}
+ENV NEMO_RL_PY_EXECUTABLES_SYSTEM=1
+# The 25.06 PyTorch container is not compatible with vllm standalone compile so we disable it
+ENV VLLM_USE_STANDALONE_COMPILE=0
+LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
+LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"
+
+ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
+
+# Copy in source from build context (defaults to cloned repo, can be overridden)
+COPY --from=nemo-rl . /opt/nemo-rl
+# Unshallow the repo to get the full history (in the case it was from the scratch layer).
+# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history),
+# so do a quick check before trying to unshallow.
+RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true
+RUN UV_LINK_MODE=symlink uv sync --locked --inexact $UV_NO_INSTALL_PACKAGES
diff --git a/nemo_rl/distributed/ray_actor_environment_registry.py b/nemo_rl/distributed/ray_actor_environment_registry.py
index 2ea3dfa64b..4a7649c09d 100644
--- a/nemo_rl/distributed/ray_actor_environment_registry.py
+++ b/nemo_rl/distributed/ray_actor_environment_registry.py
@@ -12,15 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os + from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES +USE_SYSTEM_EXECUTABLE = os.environ.get("NEMO_RL_PY_EXECUTABLES_SYSTEM", "0") == "1" +VLLM_EXECUTABLE = ( + PY_EXECUTABLES.SYSTEM if USE_SYSTEM_EXECUTABLE else PY_EXECUTABLES.VLLM +) +MCORE_EXECUTABLE = ( + PY_EXECUTABLES.SYSTEM if USE_SYSTEM_EXECUTABLE else PY_EXECUTABLES.MCORE +) + ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = { - "nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": PY_EXECUTABLES.VLLM, - "nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": PY_EXECUTABLES.VLLM, + "nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": VLLM_EXECUTABLE, + "nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": VLLM_EXECUTABLE, # Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM. # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved. - "nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": PY_EXECUTABLES.VLLM, - "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": PY_EXECUTABLES.MCORE, + "nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": VLLM_EXECUTABLE, + "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": MCORE_EXECUTABLE, "nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM, "nemo_rl.environments.code_environment.CodeEnvironment": PY_EXECUTABLES.SYSTEM, "nemo_rl.environments.games.sliding_puzzle.SlidingPuzzleEnv": PY_EXECUTABLES.SYSTEM,