Commit e1f7f31

remove scatter_add in MoE implementation (#1974)
PR for removing `scatter_add` in the MoE implementation. `scatter_add` is somewhat problematic as it is non-deterministic, due to the [atomic adds](https://discuss.pytorch.org/t/why-does-index-add-and-scatter-add-induce-non-deterministic-behavior-on-the-cuda-backend/45544/2) it needs for correctness.

Determinism, correctness, and performance were tested using scripts under `torchtitan/moe_bench_and_test`:

```
# Determinism: run the same forward 100x and compute standard deviations
pytest -rsfP torchtitan/moe_bench_and_test/test_moe.py -k test_determinism

out_old_std=tensor(0.0297, device='cuda:0', dtype=torch.bfloat16)
out_std=tensor(0., device='cuda:0', dtype=torch.bfloat16)
out_old_std/out_moe_old.abs().mean()=tensor(0.0006, device='cuda:0', dtype=torch.bfloat16)
out_std/out_moe.abs().mean()=tensor(0., device='cuda:0', dtype=torch.bfloat16)
```

```
# Accuracy: compare MoE outputs to FFN outputs, with weights set such that outputs should be the same
# Relative error decreased by ~3.9x
pytest -rsfP torchtitan/moe_bench_and_test/test_moe.py -k test_moe_ffn_equivalence

moe_old_rel_err=0.009754068047048696
moe_rel_err=0.002507858727736454
moe_old_rel_err/moe_rel_err=3.8894009216589858
```

```
# Timing: triton do_bench for a DSv3 16B layer fwd + bwd. ~3% faster runtime
python torchtitan/moe_bench_and_test/moe_timing.py moe_old && python torchtitan/moe_bench_and_test/moe_timing.py moe

args=Namespace(cls='moe_old', perf_reps=1000, perf_warmups=100, seqlen=4096, bsz=4)
moe_time_ms=19.712812881469727
args=Namespace(cls='moe', perf_reps=1000, perf_warmups=100, seqlen=4096, bsz=4)
moe_time_ms=19.03301840562087
```

```
# Memory: for a DSv3 16B layer fwd + bwd. ~15% reduction in active mem, ~18% in reserved mem
python torchtitan/moe_bench_and_test/moe_memory.py moe_old && python torchtitan/moe_bench_and_test/moe_memory.py moe

args=Namespace(cls='moe_old', iters=1, seqlen=4096, bsz=4)
peak_stats.max_active_gib=5.926029682159424
peak_stats.max_reserved_gib=7.224609375
args=Namespace(cls='moe', iters=1, seqlen=4096, bsz=4)
peak_stats.max_active_gib=5.051033020019531
peak_stats.max_reserved_gib=5.91015625
```

Testing fwd + bwd correctness for `tp_degree=ep_degree=world_size=8` and `etp=1`:

```
# Similar relative errors
torchrun --nproc-per-node 8 torchtitan/moe_bench_and_test/test_tp.py

args=Namespace(seqlen=256, bsz=4, tol=0.01), world_size=8, tp=8, ep=8, etp=1
err_ratio_fsdp_ep_old=0.0028211805268959435
err_ratio_fsdp_ep=0.002805679534989922
err_ratio_ep_ep_old=0.0022941468020912068
kl_fsdp_ep_old=tensor(2.4915e-05, device='cuda:0', dtype=torch.bfloat16)
kl_fsdp_ep=tensor(2.0981e-05, device='cuda:0', dtype=torch.bfloat16)
kl_ep_ep_old=tensor(2.1458e-05, device='cuda:0', dtype=torch.bfloat16)
```

Everything under `torchtitan/moe_bench_and_test` consists of temporary testing utilities and will be deleted prior to merging.
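For context on where the non-determinism comes from: with `top_k > 1`, several expert outputs map back to the same token row, so the old combine's `scatter_add` hits duplicate indices and relies on CUDA atomic adds, whose accumulation order can vary between runs. The new combine writes each (token, choice) output into a unique slot and then reduces over the `top_k` dimension in a fixed order. Below is a minimal sketch of the two combine styles with toy shapes (scores omitted; the tensor names echo the diff, but the sizes and setup are illustrative rather than the torchtitan code path):

```python
import torch

num_tokens, top_k, dim = 4, 2, 8  # toy sizes, not the DSv3 config

# Expert outputs already sorted by expert: one row per (token, choice) pair.
routed_output = torch.randn(num_tokens * top_k, dim)
# For each sorted row, the flat (token, choice) slot it came from.
token_indices_experts_sorted = torch.randperm(num_tokens * top_k)

# Old-style combine: destination rows (index // top_k) repeat top_k times per
# token, so scatter_add must use atomic adds on CUDA and the summation order
# is not fixed across runs.
out_old = torch.zeros(num_tokens, dim).scatter_add(
    0,
    (token_indices_experts_sorted // top_k).unsqueeze(1).expand(-1, dim),
    routed_output,
)

# New-style combine: every destination slot is unique, so plain indexed
# assignment is deterministic, and the top_k reduction runs in a fixed order.
unsorted = torch.zeros(num_tokens * top_k, dim)
unsorted[token_indices_experts_sorted] = routed_output
out_new = unsorted.view(num_tokens, top_k, dim).sum(dim=1)

print(torch.allclose(out_old, out_new, atol=1e-6))  # same math, different determinism
```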
1 parent f8fa21e commit e1f7f31

File tree

2 files changed (+30, -28 lines)

torchtitan/distributed/expert_parallel.py

Lines changed: 3 additions & 6 deletions
```diff
@@ -264,12 +264,9 @@ def _prepare_output_fn(self, mod, outputs, device_mesh):
         # NOTE: As we shard routed tokens along bs*slen dim across the TP ranks,
         # the MoE gather and scatter still require global token indices.
         local_rank = device_mesh.get_local_rank()
-        # fact: top_scores.shape[0] // mod.top_k = batch_size * seq_len // ep_degree
-        if not hasattr(mod, "top_k"):
-            raise ValueError(
-                "TokenReorderer class in MoE should always have top_k attribute."
-            )
-        token_indices_experts_sorted += top_scores.shape[0] // mod.top_k * local_rank
+        token_indices_experts_sorted = (
+            token_indices_experts_sorted + top_scores.shape[0] * local_rank
+        )
 
         return top_scores, token_indices_experts_sorted, num_tokens_per_expert
```
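The offset change here pairs with the reorderer change in `moe.py` below: since `token_indices_experts_sorted` is no longer divided by `top_k` inside the reorderer, the indices now live in the per-rank `(bs*slen*top_k,)` slot space rather than the `(bs*slen,)` token space, so the per-rank shift grows by a factor of `top_k`. A tiny worked example of the new offset, with hypothetical sizes (2 TP ranks, 3 local tokens, `top_k=2`), assuming `top_scores.shape[0]` is the per-rank number of (token, choice) pairs as the removed comment implied:

```python
import torch

top_k, local_tokens, tp_degree = 2, 3, 2  # hypothetical sizes for illustration
local_slots = local_tokens * top_k        # plays the role of top_scores.shape[0]

# Each rank's sorted indices address its own (bs*slen*top_k,) slot space...
local_indices = torch.arange(local_slots)

# ...and the per-rank offset shifts them into the global slot space, mirroring
# `token_indices_experts_sorted + top_scores.shape[0] * local_rank` above.
for local_rank in range(tp_degree):
    global_indices = local_indices + local_slots * local_rank
    print(f"rank {local_rank}: {global_indices.tolist()}")
# rank 0: [0, 1, 2, 3, 4, 5]
# rank 1: [6, 7, 8, 9, 10, 11]
```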

torchtitan/models/moe/moe.py

Lines changed: 27 additions & 22 deletions
```diff
@@ -345,7 +345,6 @@ def forward(
         )
 
         top_scores_experts_sorted = top_scores.view(-1)[token_indices_experts_sorted]
-        token_indices_experts_sorted = token_indices_experts_sorted // self.top_k
 
         return (
             top_scores_experts_sorted,
@@ -414,7 +413,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         bs, slen, dim = x.shape
         x = x.view(-1, dim)
 
-        # top_scores and selected_experts_indices shape (bs*slen*top_k,)
+        # top_scores and selected_experts_indices shape (bs*slen, top_k)
         # num_tokens_per_expert shape (num_experts,)
         (
             top_scores,
@@ -430,7 +429,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         with torch.no_grad():
             self.tokens_per_expert.add_(num_tokens_per_expert)
 
-        # top_scores and token_indices_experts_sorted shape (bs*slen*top_k,)
+        # top_scores_experts_sorted and token_indices_experts_sorted shape (bs*slen*top_k,)
         # num_tokens_per_expert shape (num_experts,)
         # NOTE: the reason we need to compute num_tokens_per_expert again is:
         # 1st computation in router is to update self.tokens_per_expert
@@ -445,12 +444,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         ) = self.reorderer(top_scores, selected_experts_indices)
 
         # shape (bs*slen*top_k, dim)
-        token_indices_experts_sorted = token_indices_experts_sorted.reshape(
-            -1, 1
-        ).expand(-1, dim)
-
-        # shape (bs*slen*top_k, dim)
-        routed_input = torch.gather(x, dim=0, index=token_indices_experts_sorted)
+        routed_input = x[token_indices_experts_sorted // self.router.top_k]
 
         if self.score_before_experts:
             routed_input = (
@@ -464,22 +458,33 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # shared expert
         # Note: we execute the shared expert before scoring the output of the routed expert
         # to "implicitly" overlap the shared expert compute with token combine communication
-        if self.shared_experts is not None:
-            out = self.shared_experts(x)
-        else:
-            out = torch.zeros_like(x)
+        out = self.shared_experts(x) if self.shared_experts is not None else None
 
+        # Unsort routed outputs
+        routed_output_unsorted = torch.zeros(
+            (bs * slen * self.router.top_k, dim),
+            dtype=routed_output.dtype,
+            device=routed_output.device,
+        )
+        routed_output_unsorted[token_indices_experts_sorted] = routed_output
+        routed_output_unsorted = routed_output_unsorted.reshape(
+            -1, self.router.top_k, dim
+        )
         if not self.score_before_experts:
-            routed_output = (
-                routed_output.to(torch.float32)
-                * top_scores_experts_sorted.reshape(-1, 1)
-            ).to(x.dtype)
+            out_experts = (
+                torch.bmm(
+                    top_scores.reshape(-1, 1, self.router.top_k),
+                    routed_output_unsorted.float(),
+                )
+                .to(x.dtype)
+                .squeeze(1)
+            )
+        else:
+            out_experts = routed_output_unsorted.sum(dim=1)
 
-        out = out.scatter_add(
-            dim=0, index=token_indices_experts_sorted, src=routed_output
-        )
-        out = out.reshape(bs, slen, dim)
-        return out
+        if out is None:
+            return out_experts.reshape(bs, slen, dim)
+        return (out + out_experts).reshape(bs, slen, dim)
 
     def init_weights(
         self,
```
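In the new `score_before_experts=False` branch, the score multiply and the `top_k` reduction are folded into a single `bmm`. A short sketch, with toy shapes and assumed names, showing that this bmm is just a per-token, score-weighted sum over the unsorted expert outputs (the same quantity the old element-wise multiply followed by `scatter_add` computed, here accumulated in fp32 in a fixed order):

```python
import torch

num_tokens, top_k, dim = 4, 2, 8  # toy sizes
top_scores = torch.rand(num_tokens, top_k)                    # (bs*slen, top_k)
routed_output_unsorted = torch.randn(num_tokens, top_k, dim)  # after the unsort + reshape

# bmm of (num_tokens, 1, top_k) with (num_tokens, top_k, dim) -> (num_tokens, 1, dim):
# for each token, a score-weighted sum over its top_k expert outputs, in fp32.
out_bmm = torch.bmm(
    top_scores.reshape(-1, 1, top_k), routed_output_unsorted.float()
).squeeze(1)

# Reference: scale each (token, choice) row by its score, then sum over top_k.
out_ref = (top_scores.unsqueeze(-1) * routed_output_unsorted.float()).sum(dim=1)

print(torch.allclose(out_bmm, out_ref, atol=1e-5))
```

When `score_before_experts` is true, the scores have already been applied to the routed input, so the branch reduces to a plain `sum(dim=1)` over the same unsorted tensor, as in the diff above.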
