diff --git a/docker/Dockerfile b/docker/Dockerfile index 72fa56f28c..ff147aee1c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -72,6 +72,7 @@ RUN GITHUB_ARTIFACTORY=github.com \ ARG UV_VERSION=0.9.7 ARG PYTHON_VERSION=3.12 ENV PATH="/root/.local/bin:$PATH" +ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \ uv python install ${PYTHON_VERSION} @@ -109,8 +110,8 @@ ARG SKIP_SGLANG_BUILD ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv ENV UV_LINK_MODE=copy -# Ensure DeepEP is built for H100 and B200 (also mcore inference unified memory API now invokes a torch API that requires these to be set) -ENV TORCH_CUDA_ARCH_LIST="9.0 10.0" +# Ensure DeepEP is built for hopper and (grace)blackwell (also mcore inference unified memory API now invokes a torch API that requires these to be set) +ENV TORCH_CUDA_ARCH_LIST="9.0 10.0 10.3" # First copy only the dependency files COPY --from=nemo-rl pyproject.toml uv.lock ./ @@ -197,6 +198,10 @@ else UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py fi EOF +RUN for d in /opt/ray_venvs /opt/nemo_rl_venv; do \ + [ -d "$d" ] || continue; \ + find "$d" -name "ptxas-blackwell" -exec ln -sf /usr/local/cuda/bin/ptxas {} \; ; \ + done # Generate container fingerprint for frozen environment support # Store outside /opt/nemo-rl to avoid being overwritten by user mounts diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index 8545fa2bc4..93c90e3eaf 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -100,7 +100,7 @@ def __init__( if "TORCH_CUDA_ARCH_LIST" not in os.environ: raise RuntimeError( "TORCH_CUDA_ARCH_LIST is not set. This is required in Megatron backend. This variable is set in our container, but " - "if you are running a custom container or baremetal, you may need to set this variable manually. 
Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0'" + "if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0 10.3'" ) else: @@ -118,7 +118,7 @@ def __init__( if "TORCH_CUDA_ARCH_LIST" not in os.environ: warnings.warn( "TORCH_CUDA_ARCH_LIST is not set. This is needed if using DeepEP in DTensorPolicyWorker V2. This variable is set in our container, but " - "if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0'" + "if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0 10.3'" ) else: assert (