2 changes: 1 addition & 1 deletion examples/configs/distillation_math.yaml
@@ -109,7 +109,7 @@ policy: &POLICY_BASE
 defer_fp32_logits: False
 moe_per_layer_logging: False
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false

 optimizer:
2 changes: 1 addition & 1 deletion examples/configs/distillation_math_megatron.yaml
@@ -60,7 +60,7 @@ policy: &POLICY_BASE
 moe_per_layer_logging: False
 defer_fp32_logits: False
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false
 peft:
 enabled: false
2 changes: 1 addition & 1 deletion examples/configs/dpo.yaml
@@ -133,7 +133,7 @@ policy:
 defer_fp32_logits: False
 moe_per_layer_logging: False
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false

 optimizer:
2 changes: 1 addition & 1 deletion examples/configs/grpo_math_1B.yaml
@@ -144,7 +144,7 @@ policy:
 defer_fp32_logits: False
 moe_per_layer_logging: False
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false

 peft:
2 changes: 1 addition & 1 deletion examples/configs/grpo_math_1B_megatron.yaml
@@ -96,7 +96,7 @@ policy:
 moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
 moe_permute_fusion: false
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false
 #gives ~20% training perf speedup with sequence packing
 apply_rope_fusion: True
2 changes: 1 addition & 1 deletion examples/configs/sft.yaml
@@ -115,7 +115,7 @@ policy:
 defer_fp32_logits: False
 moe_per_layer_logging: False
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false

 peft:
2 changes: 1 addition & 1 deletion examples/configs/sft_openmathinstruct2_megatron.yaml
@@ -94,7 +94,7 @@ policy:
 bias_activation_fusion: True
 moe_per_layer_logging: False
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false
 peft:
 enabled: false
2 changes: 1 addition & 1 deletion examples/configs/vlm_grpo_3B.yaml
@@ -114,7 +114,7 @@ policy:
 defer_fp32_logits: False
 moe_per_layer_logging: False
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false

 optimizer:
2 changes: 1 addition & 1 deletion examples/configs/vlm_grpo_3B_megatron.yaml
@@ -156,7 +156,7 @@ policy:
 defer_fp32_logits: False
 moe_per_layer_logging: False
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false
 peft:
 enabled: false
2 changes: 1 addition & 1 deletion (file path not shown in the capture)
@@ -118,7 +118,7 @@ policy:
 defer_fp32_logits: false
 moe_permute_fusion: false
 moe_enable_deepep: false
-moe_token_dispatcher_type: "allgather"
+moe_token_dispatcher_type: "alltoall"
 moe_shared_expert_overlap: false

 optimizer:
2 changes: 1 addition & 1 deletion nemo_rl/models/policy/__init__.py
@@ -205,7 +205,7 @@ class MegatronConfig(TypedDict):
 # Must set moe_token_dispatcher_type to 'flex'
 # Must set moe_shared_expert_overlap to False
 moe_enable_deepep: bool
-# The type of token dispatcher to use. The default is 'allgather'.
+# The type of token dispatcher to use. The default is 'alltoall'.
 # Options are 'allgather','alltoall' and 'flex'
 # Use 'flex' when using DeepEP
 moe_token_dispatcher_type: str
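
Note (not part of the diff): the comments in MegatronConfig above encode two constraints, i.e. the dispatcher must be one of 'allgather', 'alltoall', or 'flex', and enabling DeepEP requires the 'flex' dispatcher with moe_shared_expert_overlap set to False. A minimal Python sketch of how such a config dict could be validated follows; the helper name and error messages are illustrative only and do not exist in the NeMo RL codebase.

# Hypothetical validation sketch for the MoE dispatcher settings described
# in the MegatronConfig comments above. Not part of NeMo RL.
from typing import Any

VALID_DISPATCHERS = {"allgather", "alltoall", "flex"}

def check_moe_dispatcher_config(cfg: dict[str, Any]) -> None:
    dispatcher = cfg.get("moe_token_dispatcher_type", "alltoall")
    if dispatcher not in VALID_DISPATCHERS:
        raise ValueError(
            f"moe_token_dispatcher_type must be one of {sorted(VALID_DISPATCHERS)}, "
            f"got {dispatcher!r}"
        )
    if cfg.get("moe_enable_deepep", False):
        # Per the comments above: DeepEP requires the 'flex' dispatcher
        # and moe_shared_expert_overlap=False.
        if dispatcher != "flex":
            raise ValueError(
                "moe_enable_deepep=True requires moe_token_dispatcher_type='flex'"
            )
        if cfg.get("moe_shared_expert_overlap", False):
            raise ValueError(
                "moe_enable_deepep=True requires moe_shared_expert_overlap=False"
            )

# Example: the settings used throughout this PR pass the check.
check_moe_dispatcher_config(
    {
        "moe_enable_deepep": False,
        "moe_token_dispatcher_type": "alltoall",
        "moe_shared_expert_overlap": False,
    }
)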
2 changes: 1 addition & 1 deletion tests/unit/models/generation/test_vllm_generation.py
@@ -194,7 +194,7 @@ def get_basic_megatron_test_config(
 "moe_router_bias_update_rate": 0.0,
 "moe_permute_fusion": False,
 "moe_enable_deepep": False,
-"moe_token_dispatcher_type": "allgather",
+"moe_token_dispatcher_type": "alltoall",
 "moe_shared_expert_overlap": False,
 "apply_rope_fusion": True,
 "bias_activation_fusion": True,
4 changes: 2 additions & 2 deletions tests/unit/models/megatron/test_megatron_setup.py
@@ -187,7 +187,7 @@ def test_moe_configuration(self):
 "moe_router_bias_update_rate": 0.0,
 "moe_permute_fusion": True,
 "moe_enable_deepep": False,
-"moe_token_dispatcher_type": "allgather",
+"moe_token_dispatcher_type": "alltoall",
 "moe_shared_expert_overlap": True,
 }
 }
@@ -201,7 +201,7 @@ def test_moe_configuration(self):
 assert model_cfg.moe_router_bias_update_rate == 0.0
 assert model_cfg.moe_permute_fusion is True
 assert model_cfg.moe_enable_deepep is False
-assert model_cfg.moe_token_dispatcher_type == "allgather"
+assert model_cfg.moe_token_dispatcher_type == "alltoall"
 assert model_cfg.moe_shared_expert_overlap is True

2 changes: 1 addition & 1 deletion tests/unit/models/policy/test_megatron_worker.py
@@ -135,7 +135,7 @@ def create_megatron_test_config(
 "bias_activation_fusion": True,
 "moe_per_layer_logging": False,
 "moe_enable_deepep": False,
-"moe_token_dispatcher_type": "allgather",
+"moe_token_dispatcher_type": "alltoall",
 "moe_shared_expert_overlap": False,
 "defer_fp32_logits": defer_fp32_logits,
 "train_iters": 100, # Required for Megatron training