diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml index bb3743a4fc..b32664fa63 100644 --- a/examples/configs/distillation_math.yaml +++ b/examples/configs/distillation_math.yaml @@ -109,7 +109,7 @@ policy: &POLICY_BASE defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index 8a6f89f4b1..1c9edb17eb 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -60,7 +60,7 @@ policy: &POLICY_BASE moe_per_layer_logging: False defer_fp32_logits: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false peft: enabled: false diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index fec83a6199..ddeb0fa7b8 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -133,7 +133,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 1cde24af65..98d720f92b 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -144,7 +144,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false peft: diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 1826901b6c..8947ee63e4 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -96,7 +96,7 @@ policy: moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo moe_permute_fusion: false moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 7b90a90c38..821da4e530 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -115,7 +115,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false peft: diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml index 6abbea1c32..3f4289511a 100644 --- a/examples/configs/sft_openmathinstruct2_megatron.yaml +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -94,7 +94,7 @@ policy: bias_activation_fusion: True moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false peft: enabled: false diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml index 81d4bf8dce..03f184afef 100644 --- a/examples/configs/vlm_grpo_3B.yaml +++ b/examples/configs/vlm_grpo_3B.yaml @@ -114,7 +114,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml index c7aa9913c8..94a9185f78 100644 --- a/examples/configs/vlm_grpo_3B_megatron.yaml +++ b/examples/configs/vlm_grpo_3B_megatron.yaml @@ -156,7 +156,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false peft: enabled: false diff --git a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml index 9f7b96d619..60d40b7ca4 100644 --- a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml +++ b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml @@ -118,7 +118,7 @@ policy: defer_fp32_logits: false moe_permute_fusion: false moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 6151928519..4099543bde 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -205,7 +205,7 @@ class MegatronConfig(TypedDict): # Must set moe_token_dispatcher_type to 'flex' # Must set moe_shared_expert_overlap to False moe_enable_deepep: bool - # The type of token dispatcher to use. The default is 'allgather'. + # The type of token dispatcher to use. The default is 'alltoall'. # Options are 'allgather','alltoall' and 'flex' # Use 'flex' when using DeepEP moe_token_dispatcher_type: str diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index ac5d2484ab..2d27abad27 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -194,7 +194,7 @@ def get_basic_megatron_test_config( "moe_router_bias_update_rate": 0.0, "moe_permute_fusion": False, "moe_enable_deepep": False, - "moe_token_dispatcher_type": "allgather", + "moe_token_dispatcher_type": "alltoall", "moe_shared_expert_overlap": False, "apply_rope_fusion": True, "bias_activation_fusion": True, diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py index 16d77389a6..7b2a5d3622 100644 --- a/tests/unit/models/megatron/test_megatron_setup.py +++ b/tests/unit/models/megatron/test_megatron_setup.py @@ -187,7 +187,7 @@ def test_moe_configuration(self): "moe_router_bias_update_rate": 0.0, "moe_permute_fusion": True, "moe_enable_deepep": False, - "moe_token_dispatcher_type": "allgather", + "moe_token_dispatcher_type": "alltoall", "moe_shared_expert_overlap": True, } } @@ -201,7 +201,7 @@ def test_moe_configuration(self): assert model_cfg.moe_router_bias_update_rate == 0.0 assert model_cfg.moe_permute_fusion is True assert model_cfg.moe_enable_deepep is False - assert model_cfg.moe_token_dispatcher_type == "allgather" + assert model_cfg.moe_token_dispatcher_type == "alltoall" assert model_cfg.moe_shared_expert_overlap is True diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 3c4625ba23..1835c9908b 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -135,7 +135,7 @@ def create_megatron_test_config( "bias_activation_fusion": True, "moe_per_layer_logging": False, "moe_enable_deepep": False, - "moe_token_dispatcher_type": "allgather", + "moe_token_dispatcher_type": "alltoall", "moe_shared_expert_overlap": False, "defer_fp32_logits": defer_fp32_logits, "train_iters": 100, # Required for Megatron training