diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml
index bb3743a4fc..b32664fa63 100644
--- a/examples/configs/distillation_math.yaml
+++ b/examples/configs/distillation_math.yaml
@@ -109,7 +109,7 @@ policy: &POLICY_BASE
         defer_fp32_logits: False
         moe_per_layer_logging: False
         moe_enable_deepep: false
-        moe_token_dispatcher_type: "allgather"
+        moe_token_dispatcher_type: "alltoall"
         moe_shared_expert_overlap: false
         
         optimizer:
diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml
index 8a6f89f4b1..1c9edb17eb 100644
--- a/examples/configs/distillation_math_megatron.yaml
+++ b/examples/configs/distillation_math_megatron.yaml
@@ -60,7 +60,7 @@ policy: &POLICY_BASE
         moe_per_layer_logging: False
         defer_fp32_logits: False
         moe_enable_deepep: false
-        moe_token_dispatcher_type: "allgather"
+        moe_token_dispatcher_type: "alltoall"
         moe_shared_expert_overlap: false
         peft:
             enabled: false
diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml
index fec83a6199..ddeb0fa7b8 100755
--- a/examples/configs/dpo.yaml
+++ b/examples/configs/dpo.yaml
@@ -133,7 +133,7 @@ policy:
     defer_fp32_logits: False
     moe_per_layer_logging: False
     moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
+    moe_token_dispatcher_type: "alltoall"
     moe_shared_expert_overlap: false
     
     optimizer:
diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 1cde24af65..98d720f92b 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -144,7 +144,7 @@ policy:
     defer_fp32_logits: False
     moe_per_layer_logging: False
     moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
+    moe_token_dispatcher_type: "alltoall"
     moe_shared_expert_overlap: false
 
     peft:
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index 1826901b6c..8947ee63e4 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -96,7 +96,7 @@ policy:
     moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
     moe_permute_fusion: false
     moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
+    moe_token_dispatcher_type: "alltoall"
     moe_shared_expert_overlap: false
     #gives ~20% training perf speedup with sequence packing 
     apply_rope_fusion: True
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
index 7b90a90c38..821da4e530 100644
--- a/examples/configs/sft.yaml
+++ b/examples/configs/sft.yaml
@@ -115,7 +115,7 @@ policy:
     defer_fp32_logits: False
     moe_per_layer_logging: False
     moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
+    moe_token_dispatcher_type: "alltoall"
     moe_shared_expert_overlap: false
 
     peft:
diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml
index 6abbea1c32..3f4289511a 100644
--- a/examples/configs/sft_openmathinstruct2_megatron.yaml
+++ b/examples/configs/sft_openmathinstruct2_megatron.yaml
@@ -94,7 +94,7 @@ policy:
     bias_activation_fusion: True
     moe_per_layer_logging: False
     moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
+    moe_token_dispatcher_type: "alltoall"
     moe_shared_expert_overlap: false
     peft:
       enabled: false
diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml
index 81d4bf8dce..03f184afef 100644
--- a/examples/configs/vlm_grpo_3B.yaml
+++ b/examples/configs/vlm_grpo_3B.yaml
@@ -114,7 +114,7 @@ policy:
     defer_fp32_logits: False
     moe_per_layer_logging: False
     moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
+    moe_token_dispatcher_type: "alltoall"
     moe_shared_expert_overlap: false
 
     optimizer:
diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml
index c7aa9913c8..94a9185f78 100644
--- a/examples/configs/vlm_grpo_3B_megatron.yaml
+++ b/examples/configs/vlm_grpo_3B_megatron.yaml
@@ -156,7 +156,7 @@ policy:
     defer_fp32_logits: False
     moe_per_layer_logging: False
     moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
+    moe_token_dispatcher_type: "alltoall"
     moe_shared_expert_overlap: false
     peft:
       enabled: false
diff --git a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
index 9f7b96d619..60d40b7ca4 100644
--- a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
+++ b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
@@ -118,7 +118,7 @@ policy:
     defer_fp32_logits: false
     moe_permute_fusion: false
     moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
+    moe_token_dispatcher_type: "alltoall"
     moe_shared_expert_overlap: false
 
     optimizer:
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index 6151928519..4099543bde 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -205,7 +205,7 @@ class MegatronConfig(TypedDict):
     # Must set moe_token_dispatcher_type to 'flex'
     # Must set moe_shared_expert_overlap to False
     moe_enable_deepep: bool
-    # The type of token dispatcher to use. The default is 'allgather'.
+    # The type of token dispatcher to use. The shipped example configs set this to 'alltoall'.
     # Options are 'allgather','alltoall' and 'flex'
     # Use 'flex' when using DeepEP
     moe_token_dispatcher_type: str
diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py
index ac5d2484ab..2d27abad27 100644
--- a/tests/unit/models/generation/test_vllm_generation.py
+++ b/tests/unit/models/generation/test_vllm_generation.py
@@ -194,7 +194,7 @@ def get_basic_megatron_test_config(
             "moe_router_bias_update_rate": 0.0,
             "moe_permute_fusion": False,
             "moe_enable_deepep": False,
-            "moe_token_dispatcher_type": "allgather",
+            "moe_token_dispatcher_type": "alltoall",
             "moe_shared_expert_overlap": False,
             "apply_rope_fusion": True,
             "bias_activation_fusion": True,
diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py
index 16d77389a6..7b2a5d3622 100644
--- a/tests/unit/models/megatron/test_megatron_setup.py
+++ b/tests/unit/models/megatron/test_megatron_setup.py
@@ -187,7 +187,7 @@ def test_moe_configuration(self):
                 "moe_router_bias_update_rate": 0.0,
                 "moe_permute_fusion": True,
                 "moe_enable_deepep": False,
-                "moe_token_dispatcher_type": "allgather",
+                "moe_token_dispatcher_type": "alltoall",
                 "moe_shared_expert_overlap": True,
             }
         }
@@ -201,7 +201,7 @@ def test_moe_configuration(self):
         assert model_cfg.moe_router_bias_update_rate == 0.0
         assert model_cfg.moe_permute_fusion is True
         assert model_cfg.moe_enable_deepep is False
-        assert model_cfg.moe_token_dispatcher_type == "allgather"
+        assert model_cfg.moe_token_dispatcher_type == "alltoall"
         assert model_cfg.moe_shared_expert_overlap is True
 
 
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 3c4625ba23..1835c9908b 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -135,7 +135,7 @@ def create_megatron_test_config(
             "bias_activation_fusion": True,
             "moe_per_layer_logging": False,
             "moe_enable_deepep": False,
-            "moe_token_dispatcher_type": "allgather",
+            "moe_token_dispatcher_type": "alltoall",
             "moe_shared_expert_overlap": False,
             "defer_fp32_logits": defer_fp32_logits,
             "train_iters": 100,  # Required for Megatron training