pytorch
diff --git a/test/prototype/moe_training/mxfp8/test_mxfp8_a2a.py
Lines changed: 79 additions & 0 deletions b/test/prototype/moe_training/mxfp8/test_mxfp8_a2a.py
Lines changed: 79 additions & 0 deletions
diff --git a/torchao/prototype/moe_training/kernels/mxfp8/__init__.py
Lines changed: 11 additions & 0 deletions b/torchao/prototype/moe_training/kernels/mxfp8/__init__.py
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,79 @@
+import torch
+import torch.distributed as dist
+import torch.distributed._symmetric_memory as symm_mem
+from torch.testing._internal.common_distributed import (
+    MultiProcessTestCase,
+    skip_if_lt_x_gpu,
+)
+from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    run_tests,
+)
+from torchao.prototype.moe_training.kernels.mxfp8.comms import mxfp8_on_device_all_to_all_v
+
+
+@instantiate_parametrized_tests
+class TritonAllReduceTest(MultiProcessTestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self._spawn_processes()
+
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    @property
+    def device(self) -> torch.device:
+        return torch.device(f"cuda:{self.rank}")
+
+    def _init_process(self):
+        torch.cuda.set_device(self.device)
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            backend="nccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        torch.manual_seed(42 + self.rank)
+
+    def _init_device(self):
+        symm_mem.set_backend("NVSHMEM")
+
+    @skip_if_lt_x_gpu(4)
+    def test_a2a_fwd_bwd(self):
+        self._init_process()
+        try:
+            torch.manual_seed(42 + self.rank)
+            self._init_device()
+
+            group_name = dist.group.WORLD.group_name
+            symm_mem.enable_symm_mem_for_group(group_name)
+
+            experts_per_rank = 2
+            num_splits = experts_per_rank * self.world_size
+
+            # Number of elements for an expert is random between [0, k)
+            tokens_per_ep_rank = 1024
+            dim = 2048
+            input_tensor = torch.randn(tokens_per_ep_rank, dim, device=self.device, dtype=torch.bfloat16)
+            input_splits = torch.randint(
+                tokens_per_ep_rank, (num_splits,), dtype=torch.int64, device=self.device
+            )
+
+            max_output_len_per_rank = tokens_per_ep_rank # Alias for clarity
+
+            # Test forward
+            output, output_splits = mxfp8_on_device_all_to_all_v(
+                input_tensor,
+                input_splits,
+                max_output_len_per_rank,
+                group_name,
+            )
+
+        finally:
+            dist.destroy_process_group()
+
+
+# Script entry point: hand control to the PyTorch test runner when this
+# file is executed directly rather than imported.
+if __name__ == "__main__":
+    run_tests()
@@ -0,0 +1,11 @@
+from torchao.prototype.moe_training.kernels.mxfp8.quant import (
+    compute_blocked_scale_offsets_for_K_groups,  # noqa: F401
+    compute_blocked_scale_offsets_for_M_groups,  # noqa: F401
+    mxfp8_quantize_cuda_3d,  # noqa: F401
+    torch_to_blocked_2d_K_groups,  # noqa: F401
+    torch_to_blocked_2d_M_groups,  # noqa: F401
+    torch_to_blocked_per_group_3d,  # noqa: F401
+    triton_mx_block_rearrange_2d_K_groups,  # noqa: F401
+    triton_mx_block_rearrange_2d_M_groups,  # noqa: F401
+    triton_mx_block_rearrange_per_group_3d,  # noqa: F401
+)