
Commit a2670e8

Incorporate 2056; Add test for legacy APIs

1 parent 6344177 · commit a2670e8

3 files changed: +229 −36 lines

flashinfer/comm/mnnvl.py
Lines changed: 8 additions & 1 deletion

@@ -1133,6 +1133,7 @@ def __init__(
         group_rank: int,
         device: torch.device,
         mn_nvlink: bool = True,
+        comm_backend_for_handle_transfer: Optional[CommBackend] = None,
     ):
         """
         Constructor for McastGpuBuffer.
@@ -1143,9 +1144,15 @@ def __init__(
             group_rank: The rank of the local process within the group
             device: The CUDA device for buffer allocation
             mn_nvlink: Flag indicating if multi-node NVLink is used
+            comm_backend_for_handle_transfer: The communicator to use for handle transfer
         """
         self.mcast_device_memory = McastDeviceMemory(
-            buf_size, group_size, group_rank, device.index, mn_nvlink
+            buf_size,
+            group_size,
+            group_rank,
+            device.index,
+            mn_nvlink,
+            comm_backend_for_handle_transfer,
         )
         self.buf_size = buf_size
         self.local_device = device
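
The new optional argument threads a handle-transfer communicator through to McastDeviceMemory. A minimal usage sketch, assuming the import path shown in trtllm_mnnvl_ar.py below; the size and rank values are illustrative, not from this commit:

# Sketch only: argument values are illustrative; omitting the last argument
# (or passing None) keeps the prior behavior per the Optional[...] default.
import torch
from flashinfer.comm.mnnvl import McastGPUBuffer, MPIBackend

buf = McastGPUBuffer(
    8 * 1024 * 1024,          # buf_size in bytes
    8,                        # group_size: number of ranks in the group
    0,                        # group_rank: rank of this process
    torch.device("cuda", 0),  # device for buffer allocation
    True,                     # mn_nvlink: multi-node NVLink in use
    MPIBackend(),             # comm_backend_for_handle_transfer (new)
)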

flashinfer/comm/trtllm_mnnvl_ar.py
Lines changed: 19 additions & 9 deletions

@@ -17,7 +17,7 @@

 from ..jit import gen_trtllm_mnnvl_comm_module
 from ..utils import register_custom_op
-from .mnnvl import McastGPUBuffer
+from .mnnvl import McastGPUBuffer, CommBackend, MPIBackend


 def mpi_barrier():
@@ -41,14 +41,18 @@ def is_one_shot(


 # Empirical result calculated from num_tokens * hidden_dim * tp_size * elem_size
-# TODO(Refactor): Consider moving this to a configuration class or file
 MNNVL_ONE_SHOT_THRESHOLD = 64 * 1024 * 8 * 2


 class MNNVLAllreduceFusionWorkspace:
     NUM_LAMPORT_BUFFERS = 3

-    def __init__(self, mapping: Mapping, buffer_size_in_bytes: Optional[int] = None):
+    def __init__(
+        self,
+        mapping: Mapping,
+        buffer_size_in_bytes: Optional[int] = None,
+        comm_backend: Optional[CommBackend] = None,
+    ):
         """
         Initialize the MNNVL Allreduce Fusion Workspace. COMM_WORLD will be used for creating the workspace and synchronization. The process might hang if the intended communication group in mapping is not COMM_WORLD.

@@ -64,7 +68,8 @@ def __init__(self, mapping: Mapping, buffer_size_in_bytes: Optional[int] = None)
         buffer_size_in_bytes = math.ceil(buffer_size_in_bytes / (8 * (1024**2))) * (
             8 * (1024**2)
         )
-
+        if comm_backend is None:
+            comm_backend = MPIBackend()
         if buffer_size_in_bytes > (2**32 - 1):
             raise ValueError(
                 f"The buffer size in bytes {buffer_size_in_bytes} is greater than the maximum supported size (UINT32_MAX)."
@@ -83,14 +88,14 @@ def __init__(self, mapping: Mapping, buffer_size_in_bytes: Optional[int] = None)
             mapping.tp_rank,
             torch.device("cuda", mapping.local_rank),
             mapping.is_multi_node(),
+            comm_backend,
         )

         # We use FP32 for sentinel value regardless of the real dtype
         self.mcast_buffer_handle.lamport_initialize(mapping.tp_rank, torch.float32)
         # Wait until the initialization is done
         torch.cuda.synchronize()
-        # FIXME: We are assuming using the COMM_WORLD.
-        mpi_barrier()
+        comm_backend.barrier()

         # This is a buffer to maintain the state of this allreduce Op
         # Should have the same lifetime with self._buffer
@@ -391,7 +396,10 @@ def trtllm_mnnvl_fused_allreduce_add_rmsnorm(
     "get_allreduce_mnnvl_workspace is deprecated, use MNNVLAllreduceFusionWorkspace class to manage the workspace instead"
 )
 def get_allreduce_mnnvl_workspace(
-    mapping: Mapping, dtype: torch.dtype, buffer_size_in_bytes: Optional[int] = None
+    mapping: Mapping,
+    dtype: torch.dtype,
+    comm_backend_for_handle_transfer: Optional[CommBackend] = None,
+    buffer_size_in_bytes: Optional[int] = None,
 ) -> Tuple[McastGPUBuffer, torch.Tensor, int]:
     """Get workspace buffers needed for multi-node NVLink all-reduce operation.

@@ -428,7 +436,9 @@ def get_allreduce_mnnvl_workspace(
     ) * (lcm_hidden_dim * stride)

     # Redirect to the new workspace allocation logic. The new kernel needs the new flag buffer layout.
-    workspace = MNNVLAllreduceFusionWorkspace(mapping, buffer_size_in_bytes)
+    workspace = MNNVLAllreduceFusionWorkspace(
+        mapping, buffer_size_in_bytes, comm_backend_for_handle_transfer
+    )

     mcast_buffer = workspace.mcast_buffer_handle
     buffer_flags = workspace.buffer_flags
@@ -497,7 +507,7 @@ def trtllm_mnnvl_all_reduce(
     )
     module = get_trtllm_mnnvl_comm_module()
     module.trtllm_mnnvl_allreduce_fusion(
-        input,
+        inp,
         multicast_buffer_ptr,
         buffer_ptrs_dev,
         0,  # Allreduce kernel itself does not use this local pointer; still this could be risky but it is only used for legacy code compatibility.
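
The net effect of this file's changes: both the workspace class and the deprecated wrapper now accept an injectable communicator, and the post-init barrier goes through it instead of hard-coding MPI COMM_WORLD. A minimal sketch of the new constructor, assuming `mapping` is already built as elsewhere in this module:

# Sketch only: `mapping` construction is assumed; passing comm_backend=None
# falls back to MPIBackend() per the diff above.
from flashinfer.comm.mnnvl import MPIBackend
from flashinfer.comm.trtllm_mnnvl_ar import MNNVLAllreduceFusionWorkspace

workspace = MNNVLAllreduceFusionWorkspace(
    mapping,                    # tensor-parallel topology (assumed built)
    buffer_size_in_bytes=None,  # None: module-chosen default (not shown here)
    comm_backend=MPIBackend(),  # used for handle transfer and barrier()
)

Note that in the deprecated get_allreduce_mnnvl_workspace, the new comm_backend_for_handle_transfer parameter is inserted before buffer_size_in_bytes, so callers that passed the buffer size positionally must switch to the keyword form, as the updated test below does.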

tests/comm/test_trtllm_mnnvl_allreduce.py
Lines changed: 202 additions & 26 deletions

@@ -101,6 +101,131 @@ def func(
     )


+@torch.inference_mode()
+def row_linear_residual_norm_fusion_forward_legacy(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    norm_weight: torch.Tensor,
+    eps: float,
+    hidden_size: int,
+    dtype: torch.dtype,
+    mapping: Mapping,
+    fusion: bool,
+    reference_output: tuple[torch.Tensor, ...],
+    multicast_ptr: int,
+    buffer_ptrs_dev: int,
+    unicast_ptr: int,
+    max_num_elements_mnnvl: int,
+    buffer_flags_mnnvl: torch.Tensor,
+):
+    tensor_parallel_size = mapping.tp_size
+    tensor_parallel_rank = mapping.tp_rank
+    MPI.COMM_WORLD.barrier()
+
+    def func(
+        input,
+        residual,
+        norm_weight,
+        eps,
+        enable_fusion,
+        multicast_ptr,
+        buffer_ptrs_dev,
+        unicast_ptr,
+        max_num_elements_mnnvl,
+    ):
+        # For both fused and unfused cases:
+        shape = input.shape
+        input = input.view(-1, shape[-1])
+        buffer_M = max_num_elements_mnnvl // hidden_size
+
+        if enable_fusion:
+            use_pdl = True
+
+            prenorm_output = torch.empty_like(residual)
+            normed_output = torch.empty_like(residual)
+
+            trtllm_mnnvl_ar.mpi_barrier()
+
+            trtllm_mnnvl_ar.trtllm_mnnvl_fused_allreduce_rmsnorm(
+                prenorm_output,
+                normed_output,
+                input,
+                multicast_ptr,
+                buffer_ptrs_dev,
+                unicast_ptr,
+                buffer_M,
+                buffer_flags_mnnvl,
+                tensor_parallel_size,
+                tensor_parallel_rank,
+                norm_weight,
+                eps,
+                residual,
+                use_pdl,
+            )
+
+            return normed_output.view(shape), prenorm_output.view(shape)
+
+        else:
+            output = torch.empty_like(input)
+
+            trtllm_mnnvl_ar.trtllm_mnnvl_all_reduce(
+                input,
+                multicast_ptr,
+                buffer_ptrs_dev,
+                buffer_M,
+                buffer_flags_mnnvl,
+                tensor_parallel_size,
+                tensor_parallel_rank,
+                True,  # wait_for_results
+                False,  # launch_with_pdl
+                output,  # Need to provide output tensor since we are writing them out.
+            )
+            return (output.view(shape),)
+
+    output = func(
+        x.clone(),
+        residual.clone(),
+        norm_weight,
+        eps,
+        fusion,
+        multicast_ptr,
+        buffer_ptrs_dev,
+        unicast_ptr,
+        max_num_elements_mnnvl,
+    )
+
+    assert output[0].shape == reference_output[0].shape
+
+    if tensor_parallel_rank == 0:
+        print("output[0] (first 10 values):", output[0].flatten()[:10])
+        print(
+            "reference_output[0] (first 10 values):",
+            reference_output[0].flatten()[:10],
+        )
+
+        if fusion:
+            print("output[1] (first 10 values):", output[1].flatten()[:10])
+            print(
+                "reference_output[1] (first 10 values):",
+                reference_output[1].flatten()[:10],
+            )
+
+    torch.testing.assert_close(
+        output[0],
+        reference_output[0],
+        rtol=0.05,
+        atol=0.15,
+    )
+
+    if fusion:
+        torch.testing.assert_close(
+            output[1],
+            reference_output[1],
+            rtol=0.05,
+            atol=0.15,
+        )
+
+
 """Helper function to run the core MNNVL AllReduce test logic"""


@@ -146,7 +271,13 @@ def prepare_test_data(seq_len: int, hidden_size: int, dtype: torch.dtype, fusion


 def run_mnnvl_ar_full(
-    monkeypatch, seq_lens: list[int], fusion: bool, dtype: torch.dtype, hidden_size: int
+    monkeypatch,
+    seq_lens: list[int],
+    fusion: bool,
+    dtype: torch.dtype,
+    hidden_size: int,
+    legacy_explicit_workspace_bytes: int = None,
+    legacy_api: bool = False,
 ):
     """Core test logic for MNNVL AllReduce operations.

@@ -195,16 +326,30 @@ def run_mnnvl_ar_full(
     failure_message = ""

     try:
-        required_workspace_bytes = trtllm_mnnvl_ar.MNNVLAllreduceFusionWorkspace.get_required_buffer_size_bytes(
-            mapping.tp_size,
-            max(seq_lens),
-            hidden_size,
-            dtype,
-            trtllm_mnnvl_ar.MNNVLAllreduceFusionStrategy.AUTO,
-        )
-        workspace = trtllm_mnnvl_ar.MNNVLAllreduceFusionWorkspace(
-            mapping, required_workspace_bytes
-        )
+        if legacy_api:
+            mcast_buffer_mnnvl, buffer_flags_mnnvl, max_num_elements_mnnvl = (
+                trtllm_mnnvl_ar.get_allreduce_mnnvl_workspace(
+                    mapping, dtype, buffer_size_in_bytes=legacy_explicit_workspace_bytes
+                )
+            )
+
+            multicast_ptr = mcast_buffer_mnnvl.get_multicast_ptr()
+            buffer_ptrs_dev = mcast_buffer_mnnvl.get_buffer_ptrs_dev()
+            unicast_ptr = mcast_buffer_mnnvl.mcast_device_memory.get_unicast_ptr(
+                mapping.tp_rank
+            )
+
+        else:
+            required_workspace_bytes = trtllm_mnnvl_ar.MNNVLAllreduceFusionWorkspace.get_required_buffer_size_bytes(
+                mapping.tp_size,
+                max(seq_lens),
+                hidden_size,
+                dtype,
+                trtllm_mnnvl_ar.MNNVLAllreduceFusionStrategy.AUTO,
+            )
+            workspace = trtllm_mnnvl_ar.MNNVLAllreduceFusionWorkspace(
+                mapping, required_workspace_bytes
+            )

        test_data = []
        for seq_len in seq_lens:
@@ -221,18 +366,34 @@ def run_mnnvl_ar_full(
            print(
                f"Testing seq_len={seq_len}, hidden_size={hidden_size}, fusion={fusion}, dtype={dtype}"
            )
-
-            # Run the test with the same workspace
-            row_linear_residual_norm_fusion_forward(
-                x,
-                residual,
-                norm_weight,
-                eps,
-                mapping,
-                fusion,
-                reference_output,
-                workspace,
-            )
+            if legacy_api:
+                row_linear_residual_norm_fusion_forward_legacy(
+                    x,
+                    residual,
+                    norm_weight,
+                    eps,
+                    hidden_size,
+                    dtype,
+                    mapping,
+                    fusion,
+                    reference_output,
+                    multicast_ptr,
+                    buffer_ptrs_dev,
+                    unicast_ptr,
+                    max_num_elements_mnnvl,
+                    buffer_flags_mnnvl,
+                )
+            else:
+                row_linear_residual_norm_fusion_forward(
+                    x,
+                    residual,
+                    norm_weight,
+                    eps,
+                    mapping,
+                    fusion,
+                    reference_output,
+                    workspace,
+                )

            # Synchronize before next test
            trtllm_mnnvl_ar.mpi_barrier()
@@ -283,8 +444,23 @@ def run_mnnvl_ar_full(
 @pytest.mark.parametrize("fusion", [False, True])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("hidden_size", [2880, 5120, 7168, 8192])
-def test_mnnvl_allreduce_default_workspace(
+def test_mnnvl_allreduce_refactored(
+    monkeypatch, seq_lens: list[int], fusion: bool, dtype: torch.dtype, hidden_size: int
+):
+    """Test MNNVL AllReduce with refactored API."""
+    run_mnnvl_ar_full(
+        monkeypatch, seq_lens, fusion, dtype, hidden_size, legacy_api=False
+    )
+
+
+@pytest.mark.parametrize("seq_lens", [[1], [4], [15], [27, 11, 24], [127]])
+@pytest.mark.parametrize("fusion", [False, True])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("hidden_size", [2048, 4096, 5120, 7168, 8192])
+def test_mnnvl_allreduce_legacy(
     monkeypatch, seq_lens: list[int], fusion: bool, dtype: torch.dtype, hidden_size: int
 ):
-    """Test MNNVL AllReduce with default workspace size."""
-    run_mnnvl_ar_full(monkeypatch, seq_lens, fusion, dtype, hidden_size)
+    """Test MNNVL AllReduce with legacy API."""
+    run_mnnvl_ar_full(
+        monkeypatch, seq_lens, fusion, dtype, hidden_size, legacy_api=True
+    )
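
Within this commit, the only method exercised on the injected backend is barrier() (handle transfer happens inside McastDeviceMemory, which is not part of this diff). As a purely hypothetical illustration of a non-MPI backend, assuming CommBackend can be subclassed and that barrier() is the relevant hook:

# Hypothetical sketch: the real CommBackend interface may require more
# methods than barrier(); only barrier() is visible in this diff.
import torch.distributed as dist

from flashinfer.comm.mnnvl import CommBackend


class TorchDistBackend(CommBackend):
    """Synchronize via an existing torch.distributed process group."""

    def __init__(self, group=None):
        self.group = group  # None means the default (WORLD) process group

    def barrier(self) -> None:
        dist.barrier(group=self.group)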
