2 changes: 1 addition & 1 deletion .gitmodules
@@ -11,5 +11,5 @@
[submodule "3rdparty/Automodel-workspace/Automodel"]
path = 3rdparty/Automodel-workspace/Automodel
url = https://github.com/NVIDIA-NeMo/Automodel.git
branch = nemo-rl-submodule
branch = yifu/bump-torch-and-hf
shallow = true
2 changes: 1 addition & 1 deletion 3rdparty/Automodel-workspace/Automodel
Submodule Automodel updated 1 file
+6 −6 pyproject.toml
4 changes: 2 additions & 2 deletions docker/Dockerfile
@@ -4,7 +4,7 @@
# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git
# Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push .

ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-devel-ubuntu24.04
FROM scratch AS nemo-rl
ARG NRL_GIT_REF=main
ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /
@@ -129,4 +129,4 @@ RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallo
RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py
# NOTICES.txt file points to where the OSS source code is archived
RUN echo "This distribution includes open source which is archived at the following URL: https://opensource.nvidia.com/oss/teams/nvidia/nemo-rl/${RC_DATE}:linux-${TARGETARCH}/index.html" > NOTICES.txt && \
echo "For further inquiries or assistance, contact us at [email protected]" >> NOTICES.txt
echo "For further inquiries or assistance, contact us at [email protected]" >> NOTICES.txt
@@ -6,7 +6,7 @@ grpo:
loss_fn:
use_importance_sampling_correction: true
checkpointing:
checkpoint_dir: results/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
policy:
model_name: meta-llama/Llama-3.1-8B-Instruct
tokenizer:
@@ -48,11 +48,12 @@ policy:
data:
max_input_seq_length: 4096
logger:
log_dir: logs/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
wandb_enabled: true
tensorboard_enabled: true
wandb:
project: nemo-rl
name: grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
name: grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
cluster:
num_nodes: 2
gpus_per_node: 8
36 changes: 34 additions & 2 deletions nemo_rl/models/generation/fp8.py
@@ -423,14 +423,46 @@ def cast_tensor_to_fp8_blockwise(
return fp_data, descale_fp


# Ref: https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/vllm/model_executor/layers/quantization/utils/fp8_utils.py#L1175
# Patches this method so that it does not create new torch.nn.Parameter objects
# for layer weights, preserving their weight_loader attributes.
def maybe_post_process_fp8_weight_block(layer: torch.nn.Module):
assert layer.weight_block_size is not None

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
deepgemm_post_process_fp8_weight_block,
)
from vllm.utils.deep_gemm import (
is_deep_gemm_e8m0_used,
should_use_deepgemm_for_fp8_linear,
)

# On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
# requantize the weight and input to the specific scale
# at the same time.
should_use_deepgemm = should_use_deepgemm_for_fp8_linear(
layer.orig_dtype, layer.weight
)
if should_use_deepgemm:
dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block(
wq=layer.weight.data,
ws=layer.weight_scale.data,
quant_block_shape=tuple(layer.weight_block_size),
use_e8m0=is_deep_gemm_e8m0_used(),
)
# This is the only part we change from the original function (https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/vllm/model_executor/layers/quantization/utils/fp8_utils.py#L1196-L1197)
# Instead of creating new torch.nn.Parameter, we update the data in place.
layer.weight.data.copy_(dg_weight)
layer.weight_scale.data.copy_(dg_weight_scale)


def process_weights_after_loading(self, layer) -> None:
"""This function is used to process the weights after loading for a Linear layer.

Compared to the original process_weights_after_loading in vllm, we just avoid creation of
new torch.nn.Parameter objects, because that removes the weight_loader attribute which we need for refit.
"""
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
maybe_post_process_fp8_weight_block,
process_fp8_weight_block_strategy,
)

@@ -448,7 +480,7 @@ def process_weights_after_loading(self, layer) -> None:
layer.weight_scale = torch.nn.Parameter(weight_scale.data, requires_grad=False)
layer.update_param_tp_status()

maybe_post_process_fp8_weight_block(layer, self.cutlass_block_fp8_supported)
maybe_post_process_fp8_weight_block(layer)


def process_weights_after_loading_moe(self, layer) -> None:
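The in-place copy_ in the patch above is the point of the change: vLLM attaches metadata such as weight_loader directly to the original torch.nn.Parameter object, and rebinding the attribute to a freshly constructed Parameter silently drops that metadata. A minimal standalone sketch of the difference (hypothetical layer setup, not vLLM code):

import torch

# Stand-in for a quantized linear layer; the attribute names mirror the vLLM
# convention, but the setup here is purely illustrative.
layer = torch.nn.Module()
layer.weight = torch.nn.Parameter(torch.zeros(4, 4), requires_grad=False)
layer.weight.weight_loader = lambda param, loaded: param.data.copy_(loaded)

new_weight = torch.ones(4, 4)

# In-place update: the Parameter object (and its weight_loader) survives.
layer.weight.data.copy_(new_weight)
assert hasattr(layer.weight, "weight_loader")

# Rebinding to a new Parameter: the attached loader is gone, which is what
# breaks refit if the original vLLM code path is used.
layer.weight = torch.nn.Parameter(new_weight, requires_grad=False)
assert not hasattr(layer.weight, "weight_loader")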
160 changes: 116 additions & 44 deletions nemo_rl/models/generation/vllm/vllm_worker.py
@@ -16,6 +16,7 @@
import gc
import os
import sys
from importlib.util import find_spec
from typing import Any, Optional, cast

import ray
@@ -156,63 +157,134 @@ def __init__(
self.rank = 0
self.world_size = 1

# Monkey patch for vLLM to ensure RAY_ADDRESS is set in Ray actors.
try:
from vllm.logger import init_logger
# Monkey patches for vLLM behavior. We avoid importing vllm modules
# here to prevent side effects during initialization and instead
# locate the files via importlib metadata.

logger = init_logger("vllm_patch")
from vllm.logger import init_logger

def _patch_vllm_init_workers_ray():
"""Patch the vLLM ray_distributed_executor.py file.
logger = init_logger("vllm_patch")

1. Pass custom runtime_env in _init_workers_ray call.
- This allows passing custom py_executable to worker initialization.
2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
- This is a workaround to fix async vllm in some scenarios.
- See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
"""
try:
import vllm.executor.ray_distributed_executor as ray_executor_module
def _get_vllm_file(relative_path: str) -> str:
"""Return absolute path to a vLLM file or raise if it cannot be found.

The relative_path should be a POSIX-style path under the vllm
package root, e.g. "v1/executor/ray_executor.py" or
"attention/layer.py".
"""
spec = find_spec("vllm")
if spec is None or not spec.submodule_search_locations:
raise RuntimeError(
"vLLM package not found while attempting to patch "
f"'{relative_path}'. Ensure vLLM is installed and "
"available in this environment."
)

file_to_patch = ray_executor_module.__file__
base_dir = next(iter(spec.submodule_search_locations))
file_path = os.path.join(base_dir, *relative_path.split("/"))

with open(file_to_patch, "r") as f:
content = f.read()
if not os.path.exists(file_path):
raise RuntimeError(
"Failed to locate expected vLLM file to patch. "
f"Looked for '{relative_path}' at '{file_path}'. "
"This likely indicates an unexpected vLLM installation "
"layout or version mismatch."
)

return file_path

def _patch_vllm_init_workers_ray():
"""Patch the vLLM ray_distributed_executor.py file.

1. Pass custom runtime_env in _init_workers_ray call.
- This allows passing custom py_executable to worker initialization.
2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
- This is a workaround to fix async vllm in some scenarios.
- See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
"""
file_to_patch = _get_vllm_file("v1/executor/ray_executor.py")

old_lines = [
"self._init_workers_ray(placement_group)",
'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
]
with open(file_to_patch, "r") as f:
content = f.read()

new_lines = [
f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
]
old_lines = [
"self._init_workers_ray(placement_group)",
'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
]

need_replace = False
for old_line, new_line in zip(old_lines, new_lines):
if new_line in content or old_line not in content:
continue
content = content.replace(old_line, new_line)
need_replace = True
new_lines = [
f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
]

need_replace = False
for old_line, new_line in zip(old_lines, new_lines):
if new_line in content or old_line not in content:
continue
content = content.replace(old_line, new_line)
need_replace = True

if not need_replace:
return

# Write back the patched content
with open(file_to_patch, "w") as f:
f.write(content)
Comment on lines +196 to +232
⚠️ Potential issue | 🔴 Critical

Critical: Code injection risk and concurrency hazard.

This function has several critical issues:

  1. Code injection vulnerability (Line 216): self.py_executable is directly embedded into an f-string that becomes Python source code. If the path contains quotes or special characters, it could break syntax or enable code injection.

  2. File modification race condition: Multiple workers or processes may attempt to patch the same installed vLLM files concurrently, risking corruption or incomplete patches.

  3. Persistent modifications: Changes to installed package files persist across runs and affect all projects using the same vLLM installation, potentially causing unexpected behavior.

  4. Missing strict parameter (Line 221): The zip() call should include strict=True for Python 3.10+.

Apply these fixes:

         new_lines = [
-            f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
+            f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": {self.py_executable!r}}})',
             'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
         ]
 
         need_replace = False
-        for old_line, new_line in zip(old_lines, new_lines):
+        for old_line, new_line in zip(old_lines, new_lines, strict=True):
             if new_line in content or old_line not in content:
                 continue

Additional recommendations:

  • Consider using vLLM's official extension or plugin mechanisms if available, rather than patching installed files.
  • Add file locking if the patching approach must be retained.
  • Document that this approach requires write permissions to the vLLM installation directory.
🧰 Tools
🪛 Ruff (0.14.6)

221-221: zip() without an explicit strict= parameter

Add explicit value for parameter strict=

(B905)
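A minimal sketch of the file-locking recommendation above, assuming a POSIX filesystem and write access to the vLLM installation (fcntl is Unix-only; the helper name and lock-file suffix are hypothetical and not part of this PR):

import fcntl
import os

def replace_in_file_locked(path: str, old: str, new: str) -> None:
    """Replace `old` with `new` in `path` under an exclusive advisory lock."""
    lock_path = path + ".nemo_rl.lock"
    with open(lock_path, "w") as lock_file:
        # Block until no other worker holds the lock for this file.
        fcntl.flock(lock_file, fcntl.LOCK_EX)
        try:
            with open(path, "r") as f:
                content = f.read()
            # Idempotent: skip if already patched or the layout is unexpected.
            if new in content or old not in content:
                return
            tmp_path = path + ".tmp"
            with open(tmp_path, "w") as f:
                f.write(content.replace(old, new))
            os.replace(tmp_path, path)  # atomic rename on POSIX
        finally:
            fcntl.flock(lock_file, fcntl.LOCK_UN)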


def _patch_vllm_vit_flash_attn_backend():
"""Patch vLLM vision attention backend selection logic.

Modify the CUDA branch of maybe_get_vit_flash_attn_backend in
vllm.attention.layer to avoid overriding the backend when it
is already set to XFORMERS. This avoids flash attention related
errors when the ViT head dimension is not a multiple of 32.

Related issues:
- https://github.com/vllm-project/vllm/issues/27562
- https://github.com/vllm-project/vllm/issues/26989

This is properly fixed in https://github.com/vllm-project/vllm/pull/28763. We can remove this patch once we upgrade to a version of vllm that contains this fix.
"""
file_to_patch = _get_vllm_file("attention/layer.py")
with open(file_to_patch, "r") as f:
content = f.read()

old_snippet = (
" elif current_platform.is_cuda():\n"
" if (\n"
" attn_backend != AttentionBackendEnum.FLASH_ATTN\n"
" and check_upstream_fa_availability(torch.get_default_dtype())\n"
" ):\n"
" attn_backend = AttentionBackendEnum.FLASH_ATTN\n"
" use_upstream_fa = True\n"
)

new_snippet = (
" elif current_platform.is_cuda():\n"
" if (\n"
" attn_backend != AttentionBackendEnum.FLASH_ATTN\n"
" and attn_backend != AttentionBackendEnum.XFORMERS\n"
" and check_upstream_fa_availability(torch.get_default_dtype())\n"
" ):\n"
" attn_backend = AttentionBackendEnum.FLASH_ATTN\n"
" use_upstream_fa = True\n"
)

if not need_replace:
return
# Only patch if the file still has the old snippet and
# hasn't been patched already.
if new_snippet in content or old_snippet not in content:
return

# Write back the patched content
with open(file_to_patch, "w") as f:
f.write(content)
content = content.replace(old_snippet, new_snippet)

except (ImportError, FileNotFoundError, PermissionError):
# Allow failures gracefully
pass
with open(file_to_patch, "w") as f:
f.write(content)
Comment on lines +234 to +281
⚠️ Potential issue | 🟠 Major

Same file modification concerns apply.

This patch function has the same operational risks as _patch_vllm_init_workers_ray: concurrent modification hazards and persistent changes to installed packages.

Since the comment indicates this is fixed in vllm PR #28763, consider adding a version check to skip this patch when using vLLM versions that include the fix.

     def _patch_vllm_vit_flash_attn_backend():
         """Patch vLLM vision attention backend selection logic.
 
         Modify the CUDA branch of maybe_get_vit_flash_attn_backend in
         vllm.attention.layer to avoid overriding the backend when it
         is already set to XFORMERS. This avoids flash attention related
         errors when the ViT head dimension is not a multiple of 32.
 
         Related issues:
         - https://github.com/vllm-project/vllm/issues/27562
         - https://github.com/vllm-project/vllm/issues/26989
 
         This is properly fixed in https://github.com/vllm-project/vllm/pull/28763. We can remove this patch once we upgrade to a version of vllm that contains this fix.
         """
+        # TODO: Add version check to skip patching when using vLLM versions >= X.Y.Z that include PR #28763
         file_to_patch = _get_vllm_file("attention/layer.py")

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In nemo_rl/models/generation/vllm/vllm_worker.py around lines 234 to 281, add a
guard that checks the installed vLLM package version and skips applying this
patch when the vLLM version already includes PR #28763; implement by importing
importlib.metadata (or importlib_metadata for py<3.8) and
packaging.version.Version, retrieving the installed vllm version in a
try/except, comparing it to a MIN_VLLM_VERSION constant (set to the first
release that contains PR #28763), and if installed_version >= MIN_VLLM_VERSION
log an informational message and return without modifying files; ensure
PackageNotFoundError is handled (proceed with patch if vllm not installed) and
keep existing behavior otherwise.
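A hedged sketch of that guard; the cutoff below is a placeholder, since the first vLLM release that actually ships PR #28763 still needs to be confirmed:

from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version

# Placeholder: set to the first vLLM release that includes PR #28763 once known.
MIN_VLLM_VERSION_WITH_VIT_FIX = Version("0.12.0")

def vit_backend_patch_needed() -> bool:
    """Return True if the XFORMERS/ViT backend patch should still be applied."""
    try:
        installed = Version(version("vllm"))
    except PackageNotFoundError:
        # vLLM metadata not found; fall back to attempting the patch.
        return True
    return installed < MIN_VLLM_VERSION_WITH_VIT_FIX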


_patch_vllm_init_workers_ray()
logger.info("Successfully patched vllm _init_workers_ray.")
_patch_vllm_init_workers_ray()
logger.info("Successfully patched vllm _init_workers_ray.")

except (ImportError, AttributeError):
# vllm not installed or has a different structure, skipping patch.
pass
_patch_vllm_vit_flash_attn_backend()
logger.info("Successfully patched vllm vit flash attention backend.")
Comment on lines +283 to +287
⚠️ Potential issue | 🟠 Major

Add error handling for patch invocations.

The patch functions can raise RuntimeError (from _get_vllm_file), but there's no error handling here. If patching fails, the entire worker initialization fails.

Based on learnings from previous reviews, Ray actor methods should handle errors gracefully rather than propagating exceptions.

Apply this diff:

-        _patch_vllm_init_workers_ray()
-        logger.info("Successfully patched vllm _init_workers_ray.")
-
-        _patch_vllm_vit_flash_attn_backend()
-        logger.info("Successfully patched vllm vit flash attention backend.")
+        try:
+            _patch_vllm_init_workers_ray()
+            logger.info("Successfully patched vllm _init_workers_ray.")
+        except Exception:
+            import traceback
+            logger.error("Failed to patch vllm _init_workers_ray:")
+            traceback.print_exc()
+            # Consider whether to continue or return based on criticality
+
+        try:
+            _patch_vllm_vit_flash_attn_backend()
+            logger.info("Successfully patched vllm vit flash attention backend.")
+        except Exception:
+            import traceback
+            logger.error("Failed to patch vllm vit flash attention backend:")
+            traceback.print_exc()
+            # This patch may not be needed for all vLLM versions, so continue

Based on learnings, Ray actor methods benefit from explicit error handling with print/traceback patterns.

🤖 Prompt for AI Agents
In nemo_rl/models/generation/vllm/vllm_worker.py around lines 283 to 287, the
two patch calls (_patch_vllm_init_workers_ray() and
_patch_vllm_vit_flash_attn_backend()) must be wrapped in explicit error handling
so exceptions from _get_vllm_file (RuntimeError or others) do not abort worker
init; add try/except around each patch (or a single try that calls both) that
catches Exception, calls traceback.print_exc(), and logs the failure with
logger.error including the exception information (or use logger.exception), then
continue initialization without re-raising so the Ray actor method does not
propagate the exception.


try:
import vllm
22 changes: 11 additions & 11 deletions pyproject.toml
@@ -18,11 +18,11 @@ license = { text = "Apache 2.0" }
dependencies = [
"setuptools",
"ninja", # for flash-attn parallel build
"torch==2.8.0",
"torch==2.9.0",
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🌐 Web query:

torch 2.9.0 vllm 0.11.2 compatibility

💡 Result:

Short answer: Yes — vLLM v0.11.2 is compatible with PyTorch 2.9.0, but you must use a matching torch CUDA build (e.g. torch==2.9.0+cu129) or build vLLM from source if your CUDA/PyTorch build differs. [1][2]

Sources:

  • vLLM GitHub release notes (v0.11.1 → v0.11.2 is a patch; v0.11.x notes state default build uses PyTorch 2.9.0 + CUDA 12.9.1). [1]
  • vLLM installation / GPU requirements (binary CUDA/PyTorch build compatibility and recommendation to build from source if versions differ). [2]

References:
[1] https://github.com/vllm-project/vllm/releases (v0.11.2 / v0.11.1 notes).
[2] https://docs.vllm.ai/en/stable/getting_started/installation/gpu/


🌐 Web query:

transformers 4.57.1 torch 2.9.0 compatibility

💡 Result:

Short answer: usually yes — transformers 4.57.1 can run with PyTorch 2.9.0, but it’s not guaranteed to be fully supported and several incompatibility bugs have been reported. Use a supported PyTorch release recommended by the Transformers docs (Transformers expects PyTorch 2.x; current docs note PyTorch 2.2+/2.0+ depending on doc version) for best results. [1][2]

Details / evidence:

  • Transformers installation docs state the library is tested on PyTorch 2.x (docs show 2.2+ on main and earlier pages mention 2.0+). [1]
  • Multiple GitHub issues show users running transformers==4.57.1 with torch 2.9.0 (and 2.9.x builds) but encountering problems (torchcodec loading, tensor-parallel loading errors, and other incompatibilities or warnings from downstream packages). These indicate practical issues may arise even when import succeeds. [2][3][4]

Recommendation:

  • If you need maximum stability, use a PyTorch version explicitly listed in the Transformers documentation / tested matrix (avoid bleeding-edge torch builds). If you must use torch 2.9.0, test your model/workflow thoroughly and be prepared to fall back to a supported torch minor version if you hit the issues reported in the linked issues. [1][2][3]

Sources:

  • Transformers installation / compatibility notes. [1]
  • Hugging Face transformers GitHub issues noting usage / problems with torch 2.9.0 and transformers 4.57.1. [2][3][4]

References:
[1] Transformers installation docs (main / v4.xx).
[2] GH issue: “Cannot load LLaMA3 with transformers==4.57.1 …” (mentions PyTorch 2.9).
[3] GH issue: “Compatability with torchcodec” (user with torch 2.9.0+cu128).
[4] GH issue: “Transformers 4.57.1 not working with torch < 2.3” (discussion of torch version constraints).


Verify torch 2.9.0 compatibility—known issues with transformers 4.57.1.

vLLM 0.11.2 is compatible with torch 2.9.0, but ensure the CUDA build matches (e.g., torch==2.9.0+cu129). Transformers 4.57.1 with torch 2.9.0 has reported incompatibility issues (torchcodec loading, tensor-parallel errors) and is not in the officially tested matrix. Test this combination thoroughly before merging, or consider pinning to a tested PyTorch version (2.2+) if issues arise during integration testing.

🤖 Prompt for AI Agents
In pyproject.toml around line 21, the pinned "torch==2.9.0" may be incompatible
with transformers 4.57.1 (and requires a matching CUDA build), so update the
dependency to a compatible specification or test accordingly: either pin to a
tested PyTorch release (e.g., "torch>=2.2,<3.0" or a specific CUDA build like
"torch==2.9.0+cu129" matching your environment) and add a comment noting
required CUDA tag, or run integration tests with transformers 4.57.1 and vLLM
0.11.2 to confirm compatibility before merging; if tests fail, change the torch
pin to a known-good version and document the decision in the file.
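If the torch==2.9.0 pin is kept, a lightweight startup check along these lines can surface mismatches early. The expected versions simply mirror this PR's pins (transformers is only a lower bound in pyproject.toml and is pinned exactly here for illustration); the helper is a sketch, not part of the change:

import warnings
from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version

# Mirrors the pins in this PR; adjust if the tested matrix changes.
EXPECTED_VERSIONS = {"torch": "2.9.0", "transformers": "4.57.1", "vllm": "0.11.2"}

def warn_on_unexpected_versions() -> None:
    for package, expected in EXPECTED_VERSIONS.items():
        try:
            installed = version(package)
        except PackageNotFoundError:
            warnings.warn(f"{package} is not installed; expected {expected}.")
            continue
        if Version(installed) != Version(expected):
            warnings.warn(
                f"{package} {installed} differs from the tested pin {expected}; "
                "run the integration tests before relying on this combination."
            )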

"triton; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')",
"colored==2.2.3",
"ray[default]==2.49.2",
"transformers>=4.55.4",
"transformers>=4.57.1",
"wandb",
"numpy",
"datasets>=4.0.0",
@@ -57,7 +57,7 @@ automodel = [
# Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular)
# https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108
# https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76
"vllm==0.11.0", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved
"vllm==0.11.2", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved
"flash-attn==2.8.1",
"mamba-ssm",
"causal-conv1d",
@@ -69,7 +69,7 @@ vllm = [
# sudo apt-get update
# sudo apt-get install libibverbs-dev
"deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@e3908bf5bd0cc6265bcb225d15cd8c996d4759ef",
"vllm==0.11.0",
"vllm==0.11.2",
"num2words>=0.5.14",
# Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved
"flash-attn==2.8.1",
@@ -92,7 +92,7 @@ mcore = [
"megatron-core",
"megatron-bridge",
# Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved
"vllm==0.11.0",
"vllm==0.11.2",
# Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular)
# https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108
# https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76
@@ -105,7 +105,7 @@ penguin = ["penguin"]
# This is a default group so that we install these even with bare `uv sync`
build = [
# Build requirement for TE
"torch==2.8.0",
"torch==2.9.0",
# Build requirement for TE
"setuptools",
"packaging",
@@ -153,15 +153,15 @@ penguin = { workspace = true }
nemo_run = { git = "https://github.com/NVIDIA-NeMo/Run", rev = "414f0077c648fde2c71bb1186e97ccbf96d6844c" }
# torch/torchvision/triton all come from the torch index in order to pick up aarch64 wheels
torch = [
{ index = "pytorch-cu129", marker = "sys_platform != 'darwin'" },
{ index = "pytorch-cu130", marker = "sys_platform != 'darwin'" },
{ index = "pypi", marker = "sys_platform == 'darwin'" },
]
torchvision = [
{ index = "pytorch-cu129", marker = "sys_platform != 'darwin'" },
{ index = "pytorch-cu130", marker = "sys_platform != 'darwin'" },
{ index = "pypi", marker = "sys_platform == 'darwin'" },
]
triton = [
{ index = "pytorch-cu129", marker = "sys_platform != 'darwin'" },
{ index = "pytorch-cu130", marker = "sys_platform != 'darwin'" },
{ index = "pypi", marker = "sys_platform == 'darwin'" },
]
causal-conv1d = { git = "https://github.com/Dao-AILab/causal-conv1d", tag = "v1.5.0.post8" }
@@ -187,8 +187,8 @@ url = "https://pypi.org/simple"
explicit = true

[[tool.uv.index]]
name = "pytorch-cu129"
url = "https://download.pytorch.org/whl/cu129"
name = "pytorch-cu130"
url = "https://download.pytorch.org/whl/cu130"
explicit = true

[tool.uv]
@@ -3,7 +3,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source $SCRIPT_DIR/common.env

# ===== BEGIN CONFIG =====
NUM_NODES=1
NUM_NODES=2
STEPS_PER_RUN=100
MAX_STEPS=100
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up