Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
bbfc9eb
deepseek w/ megatron-bridge
yfw Sep 3, 2025
9bf6481
Print fix
yfw Sep 3, 2025
e6f98bc
Update megatron-bridge
yfw Sep 3, 2025
e9c5390
mcore tot
yfw Sep 3, 2025
379b013
Fixes for latest mcore
yfw Sep 3, 2025
5645f51
Correct strict loading megatron bridge config
yfw Sep 3, 2025
28d193d
Add sequence+context parallel to import
yfw Sep 4, 2025
8caa938
Add moe_permute_fusion config
yfw Sep 4, 2025
3ddefa2
Update Megatron-Bridge
yfw Sep 4, 2025
cbae53c
Update .gitmodules
yfw Sep 4, 2025
32e0720
Update Megatron-Bridge with expert_tensor_parallel_size fix
yfw Sep 4, 2025
50da32c
Include both commits for expert tp fix
yfw Sep 4, 2025
89aaca8
Reset parallelism configs to default after initial import
yfw Sep 5, 2025
9b2b5d8
Formatting
yfw Sep 5, 2025
31b0d7f
Update Megatron-Bridge
yfw Sep 5, 2025
62bae7d
Remove prev fix
yfw Sep 5, 2025
8ce7a95
Fix get_ltor_masks_and_position_ids regression
yfw Sep 5, 2025
a5c3210
Another fix
yfw Sep 6, 2025
a24dc44
general config fixes
ahmadki Sep 2, 2025
92d2840
fix gemma and fix common.env
terrykong Sep 6, 2025
3ed7e81
Merge remote-tracking branch 'origin' into yifu/mbridge_dsv3_mcoretot
yfw Sep 6, 2025
d83ba06
Merge commit 'refs/pull/1085/head' of https://github.com/NVIDIA-NeMo/…
yfw Sep 6, 2025
07eb8a5
ruff
yfw Sep 8, 2025
fb25dfd
Merge remote-tracking branch 'origin' into yifu/mbridge_dsv3_mcoretot
yfw Sep 8, 2025
f2e21fc
Merge remote-tracking branch 'origin' into yifu/mbridge_dsv3_mcoretot
yfw Sep 9, 2025
66bd252
Update Megatron-Bridge
yfw Sep 10, 2025
32eaed7
Update Megatron-Bridge
yfw Sep 10, 2025
26f4f4d
Update Megatron-Bridge
yfw Sep 10, 2025
b1847d1
Update cached dependencies
yfw Sep 10, 2025
2999f2b
Use correct branch
yfw Sep 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
[submodule "3rdparty/Megatron-LM"]
path = 3rdparty/Megatron-LM-workspace/Megatron-LM
url = https://github.com/terrykong/Megatron-LM.git
branch = yuya/nemo-rl-use
branch = yuya/nemo-rl-use-2
shallow = true
[submodule "3rdparty/Megatron-Bridge"]
path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
branch = yuya/nemo-rl-use-chunkpatch
branch = yifu/nemo-rl-use-chunkpatch-ds
shallow = true
[submodule "3rdparty/Automodel-workspace/Automodel"]
path = 3rdparty/Automodel-workspace/Automodel
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Submodule Megatron-Bridge updated 25 files
+2 −2 examples/models/generate_from_hf.py
+1 −1 examples/models/multi_gpu_hf.py
+2 −1 pyproject.toml
+10 −0 src/megatron/bridge/models/__init__.py
+57 −39 src/megatron/bridge/models/conversion/auto_bridge.py
+52 −32 src/megatron/bridge/models/conversion/model_bridge.py
+9 −1 src/megatron/bridge/models/conversion/param_mapping.py
+49 −1 src/megatron/bridge/models/conversion/utils.py
+58 −13 src/megatron/bridge/models/decorators/dispatch.py
+30 −0 src/megatron/bridge/models/deepseek/__init__.py
+143 −0 src/megatron/bridge/models/deepseek/common.py
+222 −0 src/megatron/bridge/models/deepseek/deepseek_provider.py
+54 −0 src/megatron/bridge/models/deepseek/deepseek_v2_bridge.py
+70 −0 src/megatron/bridge/models/deepseek/deepseek_v3_bridge.py
+3 −2 src/megatron/bridge/models/hf_pretrained/causal_lm.py
+147 −0 src/megatron/bridge/models/hf_pretrained/safe_config_loader.py
+3 −2 src/megatron/bridge/models/hf_pretrained/vlm.py
+17 −17 src/megatron/bridge/models/model_provider.py
+163 −0 tests/functional_tests/models/test_deepseek_conversion.py
+46 −0 tests/functional_tests/models/test_deepseek_provider_mapping.py
+134 −0 tests/unit_tests/models/decorators/test_dispatch.py
+220 −0 tests/unit_tests/models/deepseek/test_deepseek_bridges.py
+107 −0 tests/unit_tests/models/deepseek/test_deepseek_provider.py
+15 −8 tests/unit_tests/models/test_auto_bridge.py
+489 −0 tests/unit_tests/utils/test_safe_config_loader.py
1 change: 1 addition & 0 deletions 3rdparty/Megatron-Bridge-workspace/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"nvidia-modelopt[torch,onnx]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
"nvidia-resiliency-ext>=0.4.0a0,<0.5.0; sys_platform != 'darwin'",
"transformer-engine[pytorch]>=2.5.0a0,<2.6.0; sys_platform != 'darwin'",
"filelock",
]

# If the bridge source exists, compare cached dependencies with the submodule's pyproject
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM-workspace/Megatron-LM
Submodule Megatron-LM updated 1265 files
1 change: 1 addition & 0 deletions examples/configs/dpo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
1 change: 1 addition & 0 deletions examples/configs/grpo_math_1B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
defer_fp32_logits: null
Expand Down
1 change: 1 addition & 0 deletions examples/configs/grpo_math_1B_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
apply_rope_fusion: True
activation_checkpointing: True
defer_fp32_logits: True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
apply_rope_fusion: True
activation_checkpointing: True
defer_fp32_logits: True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
# Causes logprob error divergence for moonlight
apply_rope_fusion: False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ policy:
moe_router_dtype: null
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ policy:
moe_router_dtype: null
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
1 change: 1 addition & 0 deletions examples/configs/sft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ policy:
moe_router_dtype: null
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
1 change: 1 addition & 0 deletions examples/configs/sft_openmathinstruct2_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ policy:
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

Expand Down
8 changes: 7 additions & 1 deletion nemo_rl/models/megatron/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,13 @@ def forward_step_arbitrary_loss(
else:
input_ids_cp_sharded = input_ids
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
input_ids, 0, False, False, False
data=input_ids,
eod_token=0, # used for loss_mask, which we don't use
pad_token=0, # used for loss_mask, which we don't use
reset_position_ids=False,
reset_attention_mask=False,
eod_mask_loss=False,
pad_mask_loss=False,
)

with straggler_timer:
Expand Down
13 changes: 11 additions & 2 deletions nemo_rl/models/policy/megatron_policy_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
from megatron.bridge.utils.instantiate_utils import InstantiationMode
from megatron.core import parallel_state
from megatron.core.distributed import DistributedDataParallel
from megatron.core.distributed.custom_fsdp import (
from megatron.core.distributed.fsdp.mcore_fsdp_adapter import (
FullyShardedDataParallel as custom_FSDP,
)
from megatron.core.inference.engines import (
Expand Down Expand Up @@ -234,6 +234,7 @@ def setup_megatron_model(
make_vocab_size_divisible_by=cfg.model.make_vocab_size_divisible_by
// cfg.model.tensor_model_parallel_size,
tensor_model_parallel_size=cfg.model.tensor_model_parallel_size,
trust_remote_code=True,
)
if not cfg.model.vocab_size:
cfg.model.vocab_size = cfg.tokenizer.padded_vocab_size
Expand Down Expand Up @@ -562,6 +563,7 @@ def __init__(
"moe_router_bias_update_rate"
]

model_cfg.moe_permute_fusion = self.cfg["megatron_cfg"]["moe_permute_fusion"]
if "layernorm_epsilon" in self.cfg["megatron_cfg"]:
model_cfg.layernorm_epsilon = self.cfg["megatron_cfg"]["layernorm_epsilon"]

Expand Down Expand Up @@ -767,6 +769,7 @@ def __init__(
tensor_model_parallel_size=self.cfg["megatron_cfg"][
"tensor_model_parallel_size"
],
trust_remote_code=True,
)
self.final_padded_vocab_size = tokenizer_config.padded_vocab_size
self.dp_size = worker_sharding_annotations.get_axis_size("data_parallel")
Expand Down Expand Up @@ -1164,7 +1167,13 @@ def forward_step_fn(
input_ids = data_dict["input_ids"]
input_ids_cp_sharded = input_ids
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
input_ids, 0, False, False, False
data=input_ids,
eod_token=0, # used for loss_mask, which we don't use
pad_token=0, # used for loss_mask, which we don't use
reset_position_ids=False,
reset_attention_mask=False,
eod_mask_loss=False,
pad_mask_loss=False,
)
packed_seq_params = None
unpacked_input_ids = input_ids
Expand Down
1 change: 1 addition & 0 deletions tests/unit/models/generation/test_vllm_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def get_basic_megatron_test_config(
"moe_router_dtype": "fp64",
"moe_router_load_balancing_type": "none",
"moe_router_bias_update_rate": 0.0,
"moe_permute_fusion": False,
"apply_rope_fusion": True,
"train_iters": 100, # Required for Megatron training
"optimizer": {
Expand Down
1 change: 1 addition & 0 deletions tests/unit/models/policy/test_megatron_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def create_megatron_test_config(
"moe_router_dtype": "fp64",
"moe_router_load_balancing_type": "none",
"moe_router_bias_update_rate": 0.0,
"moe_permute_fusion": False,
"apply_rope_fusion": True,
"defer_fp32_logits": defer_fp32_logits,
"train_iters": 100, # Required for Megatron training
Expand Down
1 change: 1 addition & 0 deletions tools/refit_verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ def setup_configs(args, tokenizer):
"moe_router_dtype": "fp64",
"moe_router_load_balancing_type": "none",
"moe_router_bias_update_rate": 0.0,
"moe_permute_fusion": False,
"pipeline_dtype": "bfloat16",
"freeze_moe_router": False,
"apply_rope_fusion": False,
Expand Down
2 changes: 2 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.