Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
[submodule "3rdparty/NeMo"]
path = 3rdparty/NeMo-workspace/NeMo
url = https://github.com/NVIDIA/NeMo.git
branch = pjin/ashors/rl-qwen3-export
shallow = true
[submodule "3rdparty/Megatron-LM"]
path = 3rdparty/Megatron-LM-workspace/Megatron-LM
url = https://github.com/terrykong/Megatron-LM.git
branch = sahilj/megatron-external-loss-norm
branch = yuya/nemo-rl-use
shallow = true
[submodule "3rdparty/Megatron-Bridge"]
path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
branch = yuya/nemo-rl-use-chunkpatch
shallow = true
[submodule "3rdparty/Automodel-workspace/Automodel"]
path = 3rdparty/Automodel-workspace/Automodel
Expand Down
1 change: 1 addition & 0 deletions 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Submodule Megatron-Bridge added at a1bbfc
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
try:
from megatron.bridge import AutoBridge # noqa: F401

from .common import (
get_global_expert_num,
get_global_layer_num,
get_local_expert_num,
get_local_layer_num,
)
INSTALLED = True
except Exception:
INSTALLED = False

__all__ = [
"get_global_expert_num",
"get_global_layer_num",
"get_local_expert_num",
"get_local_layer_num",
]
print(f"Megatron Bridge {INSTALLED=}")
14 changes: 14 additions & 0 deletions 3rdparty/Megatron-Bridge-workspace/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[build-system]
requires = [
"setuptools>=61.0",
"wheel",
]
build-backend = "setuptools.build_meta"

[project]
name = "megatron-bridge"
dynamic = ["dependencies", "version"]
authors = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }]
description = "Standalone packaging for the Megatron Bridge sub-module."
requires-python = ">=3.10"

115 changes: 115 additions & 0 deletions 3rdparty/Megatron-Bridge-workspace/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
# NOTE(review): tomllib is stdlib only on Python 3.11+, but the workspace
# pyproject declares requires-python >= 3.10 — confirm the build environment
# is 3.11+, or bump requires-python / add a tomli fallback.
import tomllib

import setuptools

# Conditional packaging mirroring the NeMo and Megatron-LM workspaces: the
# `megatron.bridge` package is only exposed when the git submodule checkout
# is present on disk.
final_packages = []
final_package_dir = {}

# Location of the submodule's package source, and the dotted package name we
# publish it under.
bridge_src_dir = "Megatron-Bridge/src/megatron/bridge"
bridge_package_name = "megatron.bridge"

# Pinned copy of the submodule's runtime dependencies. Cached here so the
# wheel metadata is available even without the submodule checked out; when
# the checkout IS present, the list is verified against the submodule's
# pyproject.toml below and the build aborts on any divergence.
CACHED_DEPENDENCIES = [
    "accelerate>=1.6.0",
    "datasets",
    "numpy<2",
    "omegaconf>=2.3.0",
    "packaging",
    "tensorboard>=2.19.0",
    "torch",
    "transformers>=4.51.3",
    "typing-extensions",
    "rich",
    "wandb>=0.19.10",
    "six>=1.17.0",
    "regex>=2024.11.6",
    "pyyaml>=6.0.2",
    "einops>=0.8.1",
    "sentencepiece>=0.2.0",
    "tiktoken>=0.9.0",
    "tqdm>=4.67.1",
    "hydra-core>1.3,<=1.3.2",
    "megatron-core>=0.14.0a0,<0.15.0",
    "nvidia-modelopt[torch,onnx]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
    "nvidia-resiliency-ext>=0.4.0a0,<0.5.0; sys_platform != 'darwin'",
    "transformer-engine[pytorch]>=2.5.0a0,<2.6.0; sys_platform != 'darwin'",
]


def _report_dependency_drift(missing_in_cached, extra_in_cached):
    """Print a human-readable diff of the two dependency sets to stderr."""
    print(
        "[megatron-bridge][setup] Dependency mismatch between Megatron-Bridge-workspace/Megatron-Bridge/pyproject.toml vs Megatron-Bridge-workspace/setup.py::CACHED_DEPENDENCIES.",
        file=sys.stderr,
    )
    if missing_in_cached:
        print(
            "  - Present in Megatron-Bridge/pyproject.toml but missing from CACHED_DEPENDENCIES:",
            file=sys.stderr,
        )
        for dep in sorted(missing_in_cached):
            print(f"    * {dep}", file=sys.stderr)
    if extra_in_cached:
        print(
            "  - Present in CACHED_DEPENDENCIES but not in Megatron-Bridge/pyproject.toml:",
            file=sys.stderr,
        )
        for dep in sorted(extra_in_cached):
            print(f"    * {dep}", file=sys.stderr)
    print(
        "  Please update CACHED_DEPENDENCIES or the submodule pyproject to keep them in sync.",
        file=sys.stderr,
    )


def _verify_cached_dependencies(pyproject_path):
    """Abort the build if CACHED_DEPENDENCIES diverges from the submodule pyproject.

    Raises FileNotFoundError when the submodule pyproject is absent, and exits
    with status 1 when the two dependency sets differ.
    """
    if not os.path.exists(pyproject_path):
        # Bug fix: the original message claimed the check was being skipped,
        # but the code raises and aborts the build — the message now matches.
        raise FileNotFoundError(
            f"[megatron-bridge][setup] {pyproject_path} not found; cannot perform the dependency consistency check."
        )

    with open(pyproject_path, "rb") as f:
        data = tomllib.load(f)
    submodule_deps = set(str(d).strip() for d in data["project"]["dependencies"])

    missing_in_cached = submodule_deps - set(CACHED_DEPENDENCIES)
    extra_in_cached = set(CACHED_DEPENDENCIES) - submodule_deps

    if missing_in_cached or extra_in_cached:
        _report_dependency_drift(missing_in_cached, extra_in_cached)
        sys.exit(1)
    print(
        "[megatron-bridge][setup] Dependency sets are consistent with the submodule pyproject.",
        file=sys.stderr,
    )


# Single existence check (the original tested bridge_src_dir twice): when the
# submodule checkout is present, verify the cached dependency list against it
# and expose the package from the checkout.
if os.path.exists(bridge_src_dir):
    _verify_cached_dependencies(os.path.join("Megatron-Bridge", "pyproject.toml"))
    final_packages.append(bridge_package_name)
    final_package_dir[bridge_package_name] = bridge_src_dir

setuptools.setup(
    name="megatron-bridge",
    version="0.0.0",
    description="Standalone packaging for the Megatron Bridge sub-module.",
    author="NVIDIA",
    author_email="nemo-toolkit@nvidia.com",
    packages=final_packages,
    package_dir=final_package_dir,
    py_modules=["is_megatron_bridge_installed"],
    install_requires=CACHED_DEPENDENCIES,
)
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM-workspace/Megatron-LM
1 change: 0 additions & 1 deletion 3rdparty/NeMo-workspace/NeMo
Submodule NeMo deleted from 5c4264
27 changes: 0 additions & 27 deletions 3rdparty/NeMo-workspace/is_nemo_installed.py

This file was deleted.

10 changes: 0 additions & 10 deletions 3rdparty/NeMo-workspace/pyproject.toml

This file was deleted.

55 changes: 0 additions & 55 deletions 3rdparty/NeMo-workspace/setup.py

This file was deleted.

2 changes: 1 addition & 1 deletion nemo_rl/models/megatron/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import torch
import torch.distributed as dist
from megatron.bridge.training.state import GlobalState
from megatron.core.models.gpt import GPTModel
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.parallel_state import (
Expand All @@ -26,7 +27,6 @@
get_tensor_model_parallel_rank,
)
from megatron.training.utils import get_ltor_masks_and_position_ids
from nemo.tron.state import GlobalState

from nemo_rl.algorithms.loss_functions import LossFunction, SequencePackingLossWrapper
from nemo_rl.distributed.batched_data_dict import BatchedDataDict
Expand Down
Loading
Loading