# MNNVL All Reduce for large number of tokens (#2074)
## 📌 Description

This PR does two things:

* Adds a check for the number of tokens and raises an exception if the maximum token count is exceeded.
* Adds an optional parameter that lets users dial in an arbitrary workspace size.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Summary by CodeRabbit

* **New Features**
  * Added an optional configurable workspace buffer size for all-reduce operations, with a sensible default to preserve backwards compatibility.
  * Runtime input validation now enforces 2D inputs and token-count limits, with clearer error messages guiding corrective actions.
* **Tests**
  * Expanded test coverage for workspace behavior: default sizing, explicit sizing, and negative tests for insufficient workspace.
  * Tests can now supply an explicit workspace size to validate allocation and reuse scenarios.
Commit `ba8f3ed` (1 parent: `37434ed`)

**File tree:** 2 files changed, +128 −24 lines
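
Before the diffs, a minimal sketch of how a caller might use the new parameter, following the sizing rule from the updated docstring (`3 * 2 * dtype.itemsize * hidden_dim * max_tokens`). This assumes the module import path used by the tests and an already-constructed tensor-parallel `Mapping`; it is an illustration, not part of the commit:

```python
import torch
from flashinfer.comm import trtllm_mnnvl_ar  # import path as used in the tests

hidden_dim = 4096
max_tokens = 256  # the largest number of tokens we ever expect to reduce
dtype = torch.bfloat16

# Workspace holds a [3, 2, tokens, hidden_dim] staging buffer, hence the 3 * 2 factor.
workspace_bytes = 3 * 2 * dtype.itemsize * hidden_dim * max_tokens

# `mapping` is the caller's tensor-parallel Mapping; construction omitted here.
mcast_buffer, buffer_flags, max_num_elements = (
    trtllm_mnnvl_ar.get_allreduce_mnnvl_workspace(
        mapping, dtype, buffer_size_in_bytes=workspace_bytes
    )
)
```

Omitting `buffer_size_in_bytes` (or passing `None`) keeps the previous 12 MB default, so existing callers are unaffected.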

### `flashinfer/comm/trtllm_mnnvl_ar.py` (16 additions, 2 deletions)
```diff
@@ -122,7 +122,7 @@ def trtllm_mnnvl_rmsnorm(
 
 
 def get_allreduce_mnnvl_workspace(
-    mapping: Mapping, dtype: torch.dtype
+    mapping: Mapping, dtype: torch.dtype, buffer_size_in_bytes: Optional[int] = None
 ) -> Tuple[McastGPUBuffer, torch.Tensor, int]:
     """Get workspace buffers needed for multi-node NVLink all-reduce operation.
 
@@ -138,6 +138,7 @@ def get_allreduce_mnnvl_workspace(
     Args:
         mapping: Tensor parallel mapping configuration containing rank info
         dtype: Data type of the tensors being reduced
+        buffer_size_in_bytes: Optional buffer size. Practically, assign this to 3 * 2 * dtype.itemsize * hidden_dim * max_tokens
 
     Returns:
         Tuple containing:
@@ -152,7 +153,9 @@ def get_allreduce_mnnvl_workspace(
     # LCM for hidden_dim: 2048, 4096, 5120, 7168, 8192 = 286720
     # max_num_elements must be a multiple of 286720
     lcm_hidden_dim = 286720
-    TARGET_WORKSPACE_SIZE_BYTES = 12_000_000
+    TARGET_WORKSPACE_SIZE_BYTES = (
+        buffer_size_in_bytes if buffer_size_in_bytes is not None else 12_000_000
+    )
     buffer_size_in_bytes = math.ceil(
         TARGET_WORKSPACE_SIZE_BYTES / (lcm_hidden_dim * stride)
     ) * (lcm_hidden_dim * stride)
@@ -223,6 +226,17 @@ def trtllm_mnnvl_all_reduce(
         [Optional] out: Output tensor to store the result (required if wait_for_results is True)
 
     """
+
+    if len(inp.shape) != 2:
+        raise ValueError(
+            f"The input tensor must be 2D, got {len(inp.shape)}D. The shape is {inp.shape}."
+        )
+
+    if inp.shape[0] > buffer_M:
+        raise ValueError(
+            f"The number of tokens in the input tensor {inp.shape[0]} is greater than the buffer_M {buffer_M}. This is not supported. Please increase the workspace size, or decrease the amount of tokens to at most {buffer_M}."
+        )
+
     module = get_trtllm_mnnvl_comm_module()
     module.trtllm_mnnvl_all_reduce(
         inp,
```
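One subtlety in the hunk above: the requested size is not used verbatim; it is rounded up to a multiple of `lcm_hidden_dim * stride` so the buffer divides evenly for every supported hidden size. A quick arithmetic sketch of that rounding, assuming `stride` is the element size in bytes (2 for fp16/bf16), since `stride` is defined outside the excerpt:

```python
import math

lcm_hidden_dim = 286720  # LCM of the supported hidden sizes (2048, 4096, 5120, 7168, 8192)
stride = 2               # assumed: bytes per element for fp16/bf16
requested = 12_000_000   # the default TARGET_WORKSPACE_SIZE_BYTES

granule = lcm_hidden_dim * stride                   # 573,440 bytes
rounded = math.ceil(requested / granule) * granule  # 21 * 573,440
print(rounded)  # 12042240 -> the default request actually allocates ~12.04 MB
```

The same rounding applies to a user-supplied `buffer_size_in_bytes`, so the usable token capacity (`buffer_M`) can come out slightly larger than what the caller asked for.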

### `tests/comm/test_trtllm_mnnvl_allreduce.py` (112 additions, 22 deletions)
```diff
@@ -147,25 +147,27 @@ def func(
     )
 
 
-"""Main test function that runs on each MPI rank"""
+"""Helper function to run the core MNNVL AllReduce test logic"""
 
 
-@pytest.mark.parametrize(
-    "seq_lens",
-    [
-        [1],
-        [4],
-        [15],
-        [27, 11, 24],
-        [127],
-    ],
-)  # Test with different sequence length lists
-@pytest.mark.parametrize("fusion", [False, True])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("hidden_size", [2048, 4096, 5120, 7168, 8192])
-def test_mnnvl_allreduce_full(
-    monkeypatch, seq_lens: list[int], fusion: bool, dtype: torch.dtype, hidden_size: int
+def run_mnnvl_ar_full(
+    monkeypatch,
+    seq_lens: list[int],
+    fusion: bool,
+    dtype: torch.dtype,
+    hidden_size: int,
+    explicit_workspace_bytes: int | None = None,
 ):
+    """Core test logic for MNNVL AllReduce operations.
+
+    Args:
+        monkeypatch: pytest monkeypatch fixture
+        seq_lens: List of sequence lengths to test
+        fusion: Whether to test fused allreduce+rmsnorm or just allreduce
+        dtype: Data type for tensors
+        hidden_size: Hidden dimension size
+        explicit_workspace_bytes: If provided, use this workspace size instead of default
+    """
     monkeypatch.setenv("TRTLLM_FORCE_MNNVL_AR", "1")  # force multi-node allreduce.
 
     # Get MPI info
@@ -211,7 +213,9 @@ def test_mnnvl_allreduce_full(
     # This workspace is sized for the maximum expected sequence length and can be reused within each list
     # Each parameterized list gets its own fresh workspace allocation
     mcast_buffer_mnnvl, buffer_flags_mnnvl, max_num_elements_mnnvl = (
-        trtllm_mnnvl_ar.get_allreduce_mnnvl_workspace(mapping, dtype)
+        trtllm_mnnvl_ar.get_allreduce_mnnvl_workspace(
+            mapping, dtype, buffer_size_in_bytes=explicit_workspace_bytes
+        )
     )
 
     multicast_ptr = mcast_buffer_mnnvl.get_multicast_ptr()
@@ -291,18 +295,21 @@ def test_mnnvl_allreduce_full(
             rank_failed = True
             failure_message = f"FAILED[rank={rank}]: seq_lens={seq_lens}, fusion={fusion}, dtype={dtype} failed: {e}"
             print(failure_message)
-            # Gather failure status from all ranks
+
+            # Gather failure status from all ranks for logging
             all_failures = MPI.COMM_WORLD.allgather(rank_failed)
 
-            # If any rank failed, fail the test
             if any(all_failures):
                 failed_ranks = [i for i, failed in enumerate(all_failures) if failed]
                 if rank == 0:
                     print(f"Test failed on ranks: {failed_ranks}")
 
-                # Fail the test on all ranks
-                pytest.fail(f"Test failed on ranks {failed_ranks}")
-        trtllm_mnnvl_ar.mpi_barrier()
+            # Cleanup before re-raising
+            if "mcast_buffer_mnnvl" in locals():
+                del mcast_buffer_mnnvl
+
+            # Re-raise the original exception so it can be caught by pytest.raises in negative tests
+            raise
 
         finally:
             # Ensure cleanup happens for this list's workspace
@@ -311,3 +318,86 @@ def test_mnnvl_allreduce_full(
 
     # Final synchronization and check for failures across all ranks
     trtllm_mnnvl_ar.mpi_barrier()
+
+
+"""Test with default workspace size"""
+
+
+@pytest.mark.parametrize(
+    "seq_lens",
+    [
+        [1],
+        [4],
+        [15],
+        [27, 11, 24],
+        [127],
+    ],
+)
+@pytest.mark.parametrize("fusion", [False, True])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("hidden_size", [2048, 4096, 5120, 7168, 8192])
+def test_mnnvl_allreduce_default_workspace(
+    monkeypatch, seq_lens: list[int], fusion: bool, dtype: torch.dtype, hidden_size: int
+):
+    """Test MNNVL AllReduce with default workspace size."""
+    run_mnnvl_ar_full(monkeypatch, seq_lens, fusion, dtype, hidden_size)
+
+
+"""Test with explicit workspace size"""
+
+
+@pytest.mark.parametrize(
+    "seq_lens",
+    [
+        [1, 4, 180],
+    ],
+)
+@pytest.mark.parametrize("fusion", [False, True])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("hidden_size", [2048, 4096, 5120, 7168, 8192])
+def test_mnnvl_allreduce_explicit_workspace(
+    monkeypatch, seq_lens: list[int], fusion: bool, dtype: torch.dtype, hidden_size: int
+):
+    """Test MNNVL AllReduce with explicitly calculated workspace size."""
+    # Calculate workspace to fit the maximum sequence length
+    # buffer shape: [3, 2, buffer_tokens, hidden_dim]
+    explicit_workspace_bytes = 3 * 2 * dtype.itemsize * hidden_size * max(seq_lens)
+    run_mnnvl_ar_full(
+        monkeypatch,
+        seq_lens,
+        fusion,
+        dtype,
+        hidden_size,
+        explicit_workspace_bytes=explicit_workspace_bytes,
+    )
+
+
+"""Negative test: workspace too small"""
+
+
+@pytest.mark.parametrize("fusion", [False, True])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("hidden_size", [2048, 4096])
+def test_mnnvl_allreduce_workspace_too_small(
+    monkeypatch, fusion: bool, dtype: torch.dtype, hidden_size: int
+):
+    """Test that MNNVL AllReduce fails gracefully when workspace is too small."""
+    # Use a large sequence length that won't fit in a small workspace
+    seq_len = 180
+
+    # Create a workspace that's too small (only enough for 10 tokens)
+    small_workspace_bytes = 3 * 2 * dtype.itemsize * hidden_size * 10
+
+    # Expect a ValueError with a message about buffer_M being too small
+    with pytest.raises((ValueError, RuntimeError)) as exc_info:
+        run_mnnvl_ar_full(
+            monkeypatch,
+            [seq_len],
+            fusion,
+            dtype,
+            hidden_size,
+            explicit_workspace_bytes=small_workspace_bytes,
+        )
+
+    # Verify the error message contains the expected text
+    assert "greater than the buffer_M" in str(exc_info.value)
```
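
Both new workspace tests lean on the same sizing formula as the docstring: `3 * 2 * dtype.itemsize * hidden_size * tokens`. A worked instance for one parameter combination, just evaluating that formula (not additional test output):

```python
import torch

dtype = torch.bfloat16  # itemsize == 2 bytes
hidden_size = 4096
max_tokens = 180        # largest entry in seq_lens = [1, 4, 180]

# Buffer shape is [3, 2, tokens, hidden_dim], hence the 3 * 2 factor.
explicit_workspace_bytes = 3 * 2 * dtype.itemsize * hidden_size * max_tokens
print(explicit_workspace_bytes)  # 8847360 bytes (~8.4 MiB)

# The negative test sizes the workspace for only 10 tokens, so a 180-token
# input trips the new buffer_M check in trtllm_mnnvl_all_reduce.
small_workspace_bytes = 3 * 2 * dtype.itemsize * hidden_size * 10
print(small_workspace_bytes)     # 491520 bytes
```

Note that `pytest.raises` can catch the failure only because the helper now re-raises the original exception instead of calling `pytest.fail`, which is exactly what the error-handling change in the hunk above enables.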
