chore: Bump vllm to 0.11.2, torch to 2.9, transformers to 4.57.1 #1563
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Changes from all commits: 82b6f95, 39a9b03, fa2ccf4, baf37d6, fae9fc4, f61238c, eab6019, b671719, c5c3eca, ee4a84c
@@ -4,7 +4,7 @@
 # Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git
 # Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push .

-ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
+ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-devel-ubuntu24.04
 FROM scratch AS nemo-rl
 ARG NRL_GIT_REF=main
 ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /

@@ -129,4 +129,4 @@ RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallo
 RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py
 # NOTICES.txt file points to where the OSS source code is archived
 RUN echo "This distribution includes open source which is archived at the following URL: https://opensource.nvidia.com/oss/teams/nvidia/nemo-rl/${RC_DATE}:linux-${TARGETARCH}/index.html" > NOTICES.txt && \
-    echo "For further inquiries or assistance, contact us at [email protected]" >> NOTICES.txt
+    echo "For further inquiries or assistance, contact us at [email protected]" >> NOTICES.txt
@@ -16,6 +16,7 @@
 import gc
 import os
 import sys
+from importlib.util import find_spec
 from typing import Any, Optional, cast

 import ray
@@ -156,63 +157,134 @@ def __init__(
         self.rank = 0
         self.world_size = 1

-        # Monkey patch for vLLM to ensure RAY_ADDRESS is set in Ray actors.
-        try:
-            from vllm.logger import init_logger
-
-            logger = init_logger("vllm_patch")
-
-            def _patch_vllm_init_workers_ray():
-                """Patch the vLLM ray_distributed_executor.py file.
-
-                1. Pass custom runtime_env in _init_workers_ray call.
-                   - This allows passing custom py_executable to worker initialization.
-                2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
-                   - This is a workaround to fix async vllm in some scenarios.
-                   - See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
-                """
-                try:
-                    import vllm.executor.ray_distributed_executor as ray_executor_module
-
-                    file_to_patch = ray_executor_module.__file__
-
-                    with open(file_to_patch, "r") as f:
-                        content = f.read()
-
-                    old_lines = [
-                        "self._init_workers_ray(placement_group)",
-                        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
-                    ]
-
-                    new_lines = [
-                        f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
-                        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
-                    ]
-
-                    need_replace = False
-                    for old_line, new_line in zip(old_lines, new_lines):
-                        if new_line in content or old_line not in content:
-                            continue
-                        content = content.replace(old_line, new_line)
-                        need_replace = True
-
-                    if not need_replace:
-                        return
-
-                    # Write back the patched content
-                    with open(file_to_patch, "w") as f:
-                        f.write(content)
-
-                except (ImportError, FileNotFoundError, PermissionError):
-                    # Allow failures gracefully
-                    pass
+        # Monkey patches for vLLM behavior. We avoid importing vllm modules
+        # here to prevent side effects during initialization and instead
+        # locate the files via importlib metadata.
+
+        from vllm.logger import init_logger
+
+        logger = init_logger("vllm_patch")
+
+        def _get_vllm_file(relative_path: str) -> str:
+            """Return absolute path to a vLLM file or raise if it cannot be found.
+
+            The relative_path should be a POSIX-style path under the vllm
+            package root, e.g. "v1/executor/ray_executor.py" or
+            "attention/layer.py".
+            """
+            spec = find_spec("vllm")
+            if spec is None or not spec.submodule_search_locations:
+                raise RuntimeError(
+                    "vLLM package not found while attempting to patch "
+                    f"'{relative_path}'. Ensure vLLM is installed and "
+                    "available in this environment."
+                )
+
+            base_dir = next(iter(spec.submodule_search_locations))
+            file_path = os.path.join(base_dir, *relative_path.split("/"))
+
+            if not os.path.exists(file_path):
+                raise RuntimeError(
+                    "Failed to locate expected vLLM file to patch. "
+                    f"Looked for '{relative_path}' at '{file_path}'. "
+                    "This likely indicates an unexpected vLLM installation "
+                    "layout or version mismatch."
+                )
+
+            return file_path
+
+        def _patch_vllm_init_workers_ray():
+            """Patch the vLLM ray_distributed_executor.py file.
+
+            1. Pass custom runtime_env in _init_workers_ray call.
+               - This allows passing custom py_executable to worker initialization.
+            2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
+               - This is a workaround to fix async vllm in some scenarios.
+               - See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
+            """
+            file_to_patch = _get_vllm_file("v1/executor/ray_executor.py")
+
+            with open(file_to_patch, "r") as f:
+                content = f.read()
+
+            old_lines = [
+                "self._init_workers_ray(placement_group)",
+                'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
+            ]
+
+            new_lines = [
+                f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
+                'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
+            ]
+
+            need_replace = False
+            for old_line, new_line in zip(old_lines, new_lines):
+                if new_line in content or old_line not in content:
+                    continue
+                content = content.replace(old_line, new_line)
+                need_replace = True
+
+            if not need_replace:
+                return
+
+            # Write back the patched content
+            with open(file_to_patch, "w") as f:
+                f.write(content)
+
+        def _patch_vllm_vit_flash_attn_backend():
+            """Patch vLLM vision attention backend selection logic.
+
+            Modify the CUDA branch of maybe_get_vit_flash_attn_backend in
+            vllm.attention.layer to avoid overriding the backend when it
+            is already set to XFORMERS. This avoids flash attention related
+            errors when the ViT head dimension is not a multiple of 32.
+
+            Related issues:
+            - https://github.com/vllm-project/vllm/issues/27562
+            - https://github.com/vllm-project/vllm/issues/26989
+
+            This is properly fixed in https://github.com/vllm-project/vllm/pull/28763.
+            We can remove this patch once we upgrade to a version of vllm that
+            contains this fix.
+            """
+            file_to_patch = _get_vllm_file("attention/layer.py")
+            with open(file_to_patch, "r") as f:
+                content = f.read()
+
+            old_snippet = (
+                "    elif current_platform.is_cuda():\n"
+                "        if (\n"
+                "            attn_backend != AttentionBackendEnum.FLASH_ATTN\n"
+                "            and check_upstream_fa_availability(torch.get_default_dtype())\n"
+                "        ):\n"
+                "            attn_backend = AttentionBackendEnum.FLASH_ATTN\n"
+                "            use_upstream_fa = True\n"
+            )
+
+            new_snippet = (
+                "    elif current_platform.is_cuda():\n"
+                "        if (\n"
+                "            attn_backend != AttentionBackendEnum.FLASH_ATTN\n"
+                "            and attn_backend != AttentionBackendEnum.XFORMERS\n"
+                "            and check_upstream_fa_availability(torch.get_default_dtype())\n"
+                "        ):\n"
+                "            attn_backend = AttentionBackendEnum.FLASH_ATTN\n"
+                "            use_upstream_fa = True\n"
+            )
+
+            # Only patch if the file still has the old snippet and
+            # hasn't been patched already.
+            if new_snippet in content or old_snippet not in content:
+                return
+
+            content = content.replace(old_snippet, new_snippet)
+
+            with open(file_to_patch, "w") as f:
+                f.write(content)
Comment on lines +234 to +281

Contributor

Same file modification concerns apply. This patch function carries the same operational risks noted for the earlier patch function. Since the docstring indicates this is fixed in vLLM PR #28763, consider adding a version check to skip this patch when using vLLM versions that include the fix.

 def _patch_vllm_vit_flash_attn_backend():
     """Patch vLLM vision attention backend selection logic.

     Modify the CUDA branch of maybe_get_vit_flash_attn_backend in
     vllm.attention.layer to avoid overriding the backend when it
     is already set to XFORMERS. This avoids flash attention related
     errors when the ViT head dimension is not a multiple of 32.

     Related issues:
     - https://github.com/vllm-project/vllm/issues/27562
     - https://github.com/vllm-project/vllm/issues/26989

     This is properly fixed in https://github.com/vllm-project/vllm/pull/28763. We can remove this patch once we upgrade to a version of vllm that contains this fix.
     """
+    # TODO: Add version check to skip patching when using vLLM versions >= X.Y.Z that include PR #28763
     file_to_patch = _get_vllm_file("attention/layer.py")
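A minimal sketch of the version gate suggested above. The first vLLM release that ships vllm-project/vllm#28763 is not stated anywhere in this PR, so the "0.12.0" threshold below is a placeholder assumption, and `packaging` is assumed to be importable in the worker environment:

```python
from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version

# Placeholder: the first vLLM release that includes PR #28763 must be confirmed.
VIT_FA_FIX_VERSION = Version("0.12.0")


def _vit_patch_needed() -> bool:
    """Return True if the installed vLLM still needs the ViT flash-attn patch."""
    try:
        installed = Version(version("vllm"))
    except PackageNotFoundError:
        return False  # vLLM is not installed, so there is nothing to patch
    return installed < VIT_FA_FIX_VERSION
```

Calling `_vit_patch_needed()` at the top of `_patch_vllm_vit_flash_attn_backend()` and returning early would make the patch a no-op once the pinned vLLM picks up the upstream fix.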
-            _patch_vllm_init_workers_ray()
-            logger.info("Successfully patched vllm _init_workers_ray.")
-
-        except (ImportError, AttributeError):
-            # vllm not installed or has a different structure, skipping patch.
-            pass
+        _patch_vllm_init_workers_ray()
+        logger.info("Successfully patched vllm _init_workers_ray.")
+
+        _patch_vllm_vit_flash_attn_backend()
+        logger.info("Successfully patched vllm vit flash attention backend.")
Comment on lines +283 to +287

Contributor

Add error handling for the patch invocations. The patch functions can now raise (for example, the RuntimeError from _get_vllm_file). Based on learnings from previous reviews, Ray actor methods should handle errors gracefully rather than propagating exceptions. Apply this diff:

-        _patch_vllm_init_workers_ray()
-        logger.info("Successfully patched vllm _init_workers_ray.")
-
-        _patch_vllm_vit_flash_attn_backend()
-        logger.info("Successfully patched vllm vit flash attention backend.")
+        try:
+            _patch_vllm_init_workers_ray()
+            logger.info("Successfully patched vllm _init_workers_ray.")
+        except Exception:
+            import traceback
+            logger.error("Failed to patch vllm _init_workers_ray:")
+            traceback.print_exc()
+            # Consider whether to continue or return based on criticality
+
+        try:
+            _patch_vllm_vit_flash_attn_backend()
+            logger.info("Successfully patched vllm vit flash attention backend.")
+        except Exception:
+            import traceback
+            logger.error("Failed to patch vllm vit flash attention backend:")
+            traceback.print_exc()
+            # This patch may not be needed for all vLLM versions, so continue

Based on learnings, Ray actor methods benefit from explicit error handling with print/traceback patterns.
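An alternative shape for the same idea, sketched here as an assumption rather than as the reviewer's diff: a small hypothetical `_apply_patch` helper that both invocations could share, so the try/except pattern is written once:

```python
import traceback
from typing import Callable


def _apply_patch(patch_fn: Callable[[], None], description: str, logger) -> None:
    """Run a best-effort monkey patch; log and swallow failures instead of raising."""
    try:
        patch_fn()
        logger.info("Successfully patched %s.", description)
    except Exception:
        logger.error("Failed to patch %s:", description)
        traceback.print_exc()


# Hypothetical usage inside __init__:
#     _apply_patch(_patch_vllm_init_workers_ray, "vllm _init_workers_ray", logger)
#     _apply_patch(_patch_vllm_vit_flash_attn_backend, "vllm vit flash attention backend", logger)
```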

         try:
             import vllm
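For reference, the file-location technique the new `_get_vllm_file` helper relies on can be exercised on its own: `find_spec` resolves an installed package's location without executing the package, and the relative path is joined under it. This standalone sketch mirrors the diff above; `locate_package_file` is an illustrative name, not part of the codebase:

```python
import os
from importlib.util import find_spec
from typing import Optional


def locate_package_file(package: str, relative_path: str) -> Optional[str]:
    """Resolve a file under an installed package without importing the package."""
    spec = find_spec(package)
    if spec is None or not spec.submodule_search_locations:
        return None  # package is not installed or is not a regular package
    base_dir = next(iter(spec.submodule_search_locations))
    candidate = os.path.join(base_dir, *relative_path.split("/"))
    return candidate if os.path.exists(candidate) else None


if __name__ == "__main__":
    # Paths taken from the diff above; prints None when vLLM is not installed.
    print(locate_package_file("vllm", "v1/executor/ray_executor.py"))
    print(locate_package_file("vllm", "attention/layer.py"))
```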
@@ -18,11 +18,11 @@ license = { text = "Apache 2.0" }
 dependencies = [
     "setuptools",
     "ninja", # for flash-attn parallel build
-    "torch==2.8.0",
+    "torch==2.9.0",
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🌐 Web query:
💡 Result: Short answer: Yes — vLLM v0.11.2 is compatible with PyTorch 2.9.0, but you must use a matching torch CUDA build (e.g. torch==2.9.0+cu129) or build vLLM from source if your CUDA/PyTorch build differs. [1][2] Sources:
References: 🌐 Web query:
💡 Result: Short answer: usually yes — transformers 4.57.1 can run with PyTorch 2.9.0, but it’s not guaranteed to be fully supported and several incompatibility bugs have been reported. Use a supported PyTorch release recommended by the Transformers docs (Transformers expects PyTorch 2.x; current docs note PyTorch 2.2+/2.0+ depending on doc version) for best results. [1][2] Details / evidence:
Recommendation:
Sources:
References: Verify torch 2.9.0 compatibility—known issues with transformers 4.57.1. vLLM 0.11.2 is compatible with torch 2.9.0, but ensure the CUDA build matches (e.g., torch==2.9.0+cu129). Transformers 4.57.1 with torch 2.9.0 has reported incompatibility issues (torchcodec loading, tensor-parallel errors) and is not in the officially tested matrix. Test this combination thoroughly before merging, or consider pinning to a tested PyTorch version (2.2+) if issues arise during integration testing. 🤖 Prompt for AI Agents |
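None of the checks above are part of this PR; a quick sanity script along these lines (assuming torch, transformers, and vLLM are importable in the image built from this branch) can confirm that the resolved versions match the new pins:

```python
from importlib.metadata import version

import torch

print("torch:", torch.__version__, "| cuda:", torch.version.cuda)
print("transformers:", version("transformers"))
print("vllm:", version("vllm"))

# Thresholds mirror the pins in this PR; adjust if the pins change.
assert torch.__version__.startswith("2.9"), "expected the torch 2.9.0 pin"
assert torch.cuda.is_available(), "a CUDA build of torch is expected in the GPU image"
```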
     "triton; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')",
     "colored==2.2.3",
     "ray[default]==2.49.2",
-    "transformers>=4.55.4",
+    "transformers>=4.57.1",
     "wandb",
     "numpy",
     "datasets>=4.0.0",

@@ -57,7 +57,7 @@ automodel = [
     # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular)
     # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108
     # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76
-    "vllm==0.11.0", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved
+    "vllm==0.11.2", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved
     "flash-attn==2.8.1",
     "mamba-ssm",
     "causal-conv1d",

@@ -69,7 +69,7 @@ vllm = [
     # sudo apt-get update
     # sudo apt-get install libibverbs-dev
     "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@e3908bf5bd0cc6265bcb225d15cd8c996d4759ef",
-    "vllm==0.11.0",
+    "vllm==0.11.2",
     "num2words>=0.5.14",
     # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved
     "flash-attn==2.8.1",

@@ -92,7 +92,7 @@ mcore = [
     "megatron-core",
     "megatron-bridge",
     # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved
-    "vllm==0.11.0",
+    "vllm==0.11.2",
     # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular)
     # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108
     # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76

@@ -105,7 +105,7 @@ penguin = ["penguin"]
 # This is a default group so that we install these even with bare `uv sync`
 build = [
     # Build requirement for TE
-    "torch==2.8.0",
+    "torch==2.9.0",
     # Build requirement for TE
     "setuptools",
     "packaging",

@@ -153,15 +153,15 @@ penguin = { workspace = true }
 nemo_run = { git = "https://github.com/NVIDIA-NeMo/Run", rev = "414f0077c648fde2c71bb1186e97ccbf96d6844c" }
 # torch/torchvision/triton all come from the torch index in order to pick up aarch64 wheels
 torch = [
-    { index = "pytorch-cu129", marker = "sys_platform != 'darwin'" },
+    { index = "pytorch-cu130", marker = "sys_platform != 'darwin'" },
     { index = "pypi", marker = "sys_platform == 'darwin'" },
 ]
 torchvision = [
-    { index = "pytorch-cu129", marker = "sys_platform != 'darwin'" },
+    { index = "pytorch-cu130", marker = "sys_platform != 'darwin'" },
     { index = "pypi", marker = "sys_platform == 'darwin'" },
 ]
 triton = [
-    { index = "pytorch-cu129", marker = "sys_platform != 'darwin'" },
+    { index = "pytorch-cu130", marker = "sys_platform != 'darwin'" },
     { index = "pypi", marker = "sys_platform == 'darwin'" },
 ]
 causal-conv1d = { git = "https://github.com/Dao-AILab/causal-conv1d", tag = "v1.5.0.post8" }

@@ -187,8 +187,8 @@ url = "https://pypi.org/simple"
 explicit = true

 [[tool.uv.index]]
-name = "pytorch-cu129"
-url = "https://download.pytorch.org/whl/cu129"
+name = "pytorch-cu130"
+url = "https://download.pytorch.org/whl/cu130"
 explicit = true

 [tool.uv]
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Critical: Code injection risk and concurrency hazard.
This function has several critical issues:
Code injection vulnerability (Line 216):
self.py_executableis directly embedded into an f-string that becomes Python source code. If the path contains quotes or special characters, it could break syntax or enable code injection.File modification race condition: Multiple workers or processes may attempt to patch the same installed vLLM files concurrently, risking corruption or incomplete patches.
Persistent modifications: Changes to installed package files persist across runs and affect all projects using the same vLLM installation, potentially causing unexpected behavior.
Missing strict parameter (Line 221): The
zip()call should includestrict=Truefor Python 3.10+.Apply these fixes:
Additional recommendations:
🧰 Tools
🪛 Ruff (0.14.6)
221-221:
zip()without an explicitstrict=parameterAdd explicit value for parameter
strict=(B905)
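A sketch of how the injection and `strict=` points could be addressed; this is not the reviewer's suggested diff (which is not shown above), and the helper names are hypothetical. `repr()` is used so that whatever path `py_executable` holds is emitted as a valid Python string literal:

```python
def _build_new_lines(py_executable: str) -> list[str]:
    # repr() yields a syntactically valid Python string literal, so quotes or
    # backslashes in the path cannot break out of the generated source line.
    quoted = repr(py_executable)
    return [
        "self._init_workers_ray(placement_group, "
        f'runtime_env={{"py_executable": {quoted}}})',
        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", '
        '"NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
    ]


def _replace_lines(content: str, old_lines: list[str], new_lines: list[str]) -> tuple[str, bool]:
    """Apply the line substitutions; strict=True surfaces mismatched lists early (B905)."""
    replaced = False
    for old_line, new_line in zip(old_lines, new_lines, strict=True):
        if new_line in content or old_line not in content:
            continue
        content = content.replace(old_line, new_line)
        replaced = True
    return content, replaced
```

The concurrency and persistence concerns are separate: writing the patched content to a temporary file in the same directory and swapping it in with os.replace() would at least make each write atomic, though patched files would still outlive the run.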