Commit c6ed147

Address review comments.

1 parent 5be2697 commit c6ed147

File tree

  csrc/trtllm_mnnvl_allreduce.cu
  flashinfer/comm/trtllm_mnnvl_ar.py

2 files changed: +57 -28 lines changed

csrc/trtllm_mnnvl_allreduce.cu

Lines changed: 0 additions & 2 deletions
@@ -26,8 +26,6 @@ using tvm::ffi::Optional;
     } \
   }()
 
-// FIXME: is bool flag for oneshot a good idea? Trying to avoid defining a new type/enum at this
-// level
 void trtllm_mnnvl_allreduce_fusion(TensorView input, int64_t multicast_buffer_ptr,
                                    int64_t buffer_ptrs_dev, int64_t buffer_ptr_local,
                                    TensorView buffer_flags_mnnvl, int64_t nranks, int64_t rank,
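
Editor's note: the removed FIXME is settled one level up rather than here. The C++ FFI entry point keeps its plain bool one-shot flag, and the new strategy enum lives only in the Python wrapper below. A minimal sketch of that mapping, using names from the Python diff (the helper itself is illustrative, not part of the API):

    from flashinfer.comm.trtllm_mnnvl_ar import MNNVLAllreduceFusionStrategy

    def to_oneshot_flag(strategy: MNNVLAllreduceFusionStrategy) -> bool:
        # AUTO is resolved to ONESHOT/TWOSHOT via select_strategy() before
        # launch, so only a concrete strategy should reach this point.
        return strategy == MNNVLAllreduceFusionStrategy.ONESHOT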

flashinfer/comm/trtllm_mnnvl_ar.py

Lines changed: 57 additions & 26 deletions
@@ -33,11 +33,14 @@ class MNNVLAllreduceFusionStrategy(Enum):
     AUTO = 99
 
     @staticmethod
-    def is_one_shot(
+    def select_strategy(
         tp_size: int, num_tokens: int, hidden_dim: int, dtype: torch.dtype
-    ) -> bool:
+    ) -> "MNNVLAllreduceFusionStrategy":
         elem_size = torch.tensor([], dtype=dtype).element_size()
-        return num_tokens * hidden_dim * tp_size * elem_size <= MNNVL_ONE_SHOT_THRESHOLD
+        if num_tokens * hidden_dim * tp_size * elem_size <= MNNVL_ONE_SHOT_THRESHOLD:
+            return MNNVLAllreduceFusionStrategy.ONESHOT
+        else:
+            return MNNVLAllreduceFusionStrategy.TWOSHOT
 
 
 # Empirical result calculated from num_tokens * hidden_dim * tp_size * elem_size
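
Editor's note: select_strategy replaces the old is_one_shot predicate, so AUTO now resolves to a concrete enum member by comparing the total reduced payload (num_tokens * hidden_dim * tp_size * elem_size) against MNNVL_ONE_SHOT_THRESHOLD. A standalone sketch of the heuristic; the threshold value below is an assumed stand-in, not the module's actual constant:

    import torch

    MNNVL_ONE_SHOT_THRESHOLD = 64 * (1024**2)  # assumed value for illustration

    def select_strategy_sketch(tp_size: int, num_tokens: int, hidden_dim: int,
                               dtype: torch.dtype) -> str:
        # Payload is the total bytes gathered across all ranks.
        elem_size = torch.tensor([], dtype=dtype).element_size()
        payload = num_tokens * hidden_dim * tp_size * elem_size
        return "ONESHOT" if payload <= MNNVL_ONE_SHOT_THRESHOLD else "TWOSHOT"

    # 16 tokens x 4096 hidden x tp_size 8 in bf16 is 1 MiB -> one-shot
    assert select_strategy_sketch(8, 16, 4096, torch.bfloat16) == "ONESHOT"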
@@ -54,15 +57,15 @@ def __init__(
         comm_backend: Optional[CommBackend] = None,
     ):
         """
-        Initialize the MNNVL Allreduce Fusion Workspace. COMM_WORLD will be used for creating the workspace and synchronization. The process might hang if the intended communication group in mapping is not COMM_WORLD.
+        Initialize the MNNVL Allreduce Fusion Workspace. comm_backend will be used for creating the workspace and synchronization. If not provided, MPIBackend will be used, which uses COMM_WORLD for synchronization.
 
         Args:
             mapping: Mapping configuration containing rank info
             buffer_size_in_bytes: The size in bytes for each lamport buffer. The actual allocation size will be NUM_LAMPORT_BUFFERS * buffer_size_in_bytes.
         """
         if buffer_size_in_bytes is None:
-            # Default to 16MB workspace size if not provided
-            buffer_size_in_bytes = 16 * (1024**2)
+            # Default to 512MB workspace size if not provided
+            buffer_size_in_bytes = 512 * (1024**2)
         else:
             # Round up to the nearest multiple of 8MB
             buffer_size_in_bytes = math.ceil(buffer_size_in_bytes / (8 * (1024**2))) * (
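
Editor's note: two sizing changes land here: the default workspace grows from 16 MB to 512 MB, and explicit sizes are still rounded up to the nearest 8 MB multiple. The rounding arithmetic, as a self-contained sketch:

    import math

    EIGHT_MB = 8 * (1024**2)

    def round_up_to_8mb(buffer_size_in_bytes: int) -> int:
        # Same ceil-to-multiple arithmetic as in __init__ above.
        return math.ceil(buffer_size_in_bytes / EIGHT_MB) * EIGHT_MB

    assert round_up_to_8mb(1) == EIGHT_MB                   # tiny request -> 8 MB
    assert round_up_to_8mb(EIGHT_MB) == EIGHT_MB            # exact multiple kept
    assert round_up_to_8mb(9 * (1024**2)) == 2 * EIGHT_MB   # 9 MB -> 16 MB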
@@ -112,7 +115,28 @@ def __init__(
         self.uc_ptr_local = self.mcast_buffer_handle.get_unicast_ptr(self.rank)
         self.mc_ptr = self.mcast_buffer_handle.get_multicast_ptr()
 
+    @functools.cache
+    def is_buffer_size_sufficient(
+        self,
+        tp_size: int,
+        num_tokens: int,
+        hidden_dim: int,
+        dtype: torch.dtype,
+        strategy: MNNVLAllreduceFusionStrategy = MNNVLAllreduceFusionStrategy.AUTO,
+    ) -> bool:
+        """
+        Check whether the workspace buffer is large enough for a given problem size.
+        """
+        required_buffer_size = self.get_required_buffer_size_bytes(
+            tp_size, num_tokens, hidden_dim, dtype, strategy
+        )
+        if required_buffer_size > self.buffer_size_bytes:
+            return False
+        else:
+            return True
+
     @staticmethod
+    @functools.cache
     def get_required_buffer_size_bytes(
         tp_size: int,
         num_tokens: int,
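
Editor's note: is_buffer_size_sufficient is a thin comparison over get_required_buffer_size_bytes, and both are now memoized with functools.cache; on an instance method the cache is keyed on (self, args) and holds a strong reference to the workspace for the life of the process. A hypothetical pre-flight helper showing how a caller might combine the two new methods (the helper name and flow are illustrative, not part of the API):

    import torch
    from flashinfer.comm.trtllm_mnnvl_ar import MNNVLAllreduceFusionStrategy

    def preflight_check(workspace, num_tokens: int, hidden_dim: int,
                        dtype: torch.dtype) -> MNNVLAllreduceFusionStrategy:
        # Resolve AUTO to a concrete strategy, then verify the workspace fits.
        strategy = MNNVLAllreduceFusionStrategy.select_strategy(
            workspace.tp_size, num_tokens, hidden_dim, dtype
        )
        if not workspace.is_buffer_size_sufficient(
            workspace.tp_size, num_tokens, hidden_dim, dtype, strategy
        ):
            raise ValueError("workspace buffer too small for this problem size")
        return strategy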
@@ -124,12 +148,12 @@ def get_required_buffer_size_bytes(
         Calculate the required buffer size for a given problem size.
         """
         elem_size = torch.tensor([], dtype=dtype).element_size()
-        is_one_shot = MNNVLAllreduceFusionStrategy.is_one_shot(
-            tp_size, num_tokens, hidden_dim, dtype
-        )
-        if strategy == MNNVLAllreduceFusionStrategy.ONESHOT or (
-            strategy == MNNVLAllreduceFusionStrategy.AUTO and is_one_shot
-        ):
+        if strategy == MNNVLAllreduceFusionStrategy.AUTO:
+            strategy = MNNVLAllreduceFusionStrategy.select_strategy(
+                tp_size, num_tokens, hidden_dim, dtype
+            )
+
+        if strategy == MNNVLAllreduceFusionStrategy.ONESHOT:
             # For one-shot, each rank needs to store num_tokens * tp_size tokens
             buffer_size = num_tokens * hidden_dim * tp_size * elem_size
         else:
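
Editor's note: for one-shot, every rank stages the tokens of all tp_size ranks, so the per-buffer requirement is num_tokens * hidden_dim * tp_size * elem_size. A worked example in bf16 (2-byte elements):

    # 128 tokens x 8192 hidden x tp_size 8 x 2 bytes = 16,777,216 bytes = 16 MiB,
    # comfortably inside the new 512 MB default lamport buffer.
    num_tokens, hidden_dim, tp_size, elem_size = 128, 8192, 8, 2
    assert num_tokens * hidden_dim * tp_size * elem_size == 16 * (1024**2)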
@@ -268,12 +292,18 @@ def trtllm_mnnvl_allreduce(
 
     module = get_trtllm_mnnvl_comm_module()
 
-    use_oneshot = strategy == MNNVLAllreduceFusionStrategy.ONESHOT or (
-        strategy == MNNVLAllreduceFusionStrategy.AUTO
-        and MNNVLAllreduceFusionStrategy.is_one_shot(
+    if strategy == MNNVLAllreduceFusionStrategy.AUTO:
+        strategy = MNNVLAllreduceFusionStrategy.select_strategy(
             workspace.tp_size, input.shape[0], input.shape[1], input.dtype
         )
-    )
+
+    if not workspace.is_buffer_size_sufficient(
+        workspace.tp_size, input.shape[0], input.shape[1], input.dtype, strategy
+    ):
+        raise ValueError(
+            f"The buffer size in the given workspace is insufficient for the given problem size. Buffer: {workspace.buffer_size_bytes} bytes, Required: {workspace.get_required_buffer_size_bytes(workspace.tp_size, input.shape[0], input.shape[1], input.dtype, strategy)} bytes."
+        )
+
     module.trtllm_mnnvl_allreduce_fusion(
         input,
         workspace.mc_ptr,
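
Editor's note: with the sufficiency check inlined, trtllm_mnnvl_allreduce now fails fast with a ValueError reporting both the available and required byte counts, instead of silently overrunning the lamport buffers. A hedged caller-side sketch; keyword names beyond those visible in this diff are assumptions, and x, out, and workspace are assumed to be set up elsewhere:

    from flashinfer.comm.trtllm_mnnvl_ar import (
        MNNVLAllreduceFusionStrategy,
        trtllm_mnnvl_allreduce,
    )

    try:
        trtllm_mnnvl_allreduce(
            input=x,
            workspace=workspace,
            output=out,
            strategy=MNNVLAllreduceFusionStrategy.ONESHOT,  # skip AUTO heuristic
            launch_with_pdl=False,
        )
    except ValueError as err:
        # Raised when workspace.buffer_size_bytes < the required byte count.
        print(f"allreduce workspace too small: {err}")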
@@ -284,7 +314,7 @@ def trtllm_mnnvl_allreduce(
         workspace.rank,
         False,  # No RMSNorm Fusion
         launch_with_pdl,
-        use_oneshot,
+        strategy == MNNVLAllreduceFusionStrategy.ONESHOT,
         output,
         None,
         None,
@@ -358,15 +388,16 @@ def trtllm_mnnvl_fused_allreduce_add_rmsnorm(
 
     module = get_trtllm_mnnvl_comm_module()
 
-    use_oneshot = strategy == MNNVLAllreduceFusionStrategy.ONESHOT or (
-        strategy == MNNVLAllreduceFusionStrategy.AUTO
-        and MNNVLAllreduceFusionStrategy.is_one_shot(
-            workspace.tp_size,
-            input.shape[0],
-            input.shape[1],
-            input.dtype,
+    if strategy == MNNVLAllreduceFusionStrategy.AUTO:
+        strategy = MNNVLAllreduceFusionStrategy.select_strategy(
+            workspace.tp_size, input.shape[0], input.shape[1], input.dtype
+        )
+    if not workspace.is_buffer_size_sufficient(
+        workspace.tp_size, input.shape[0], input.shape[1], input.dtype, strategy
+    ):
+        raise ValueError(
+            f"The buffer size in the given workspace is insufficient for the given problem size. Buffer: {workspace.buffer_size_bytes} bytes, Required: {workspace.get_required_buffer_size_bytes(workspace.tp_size, input.shape[0], input.shape[1], input.dtype, strategy)} bytes."
         )
-    )
 
     module.trtllm_mnnvl_allreduce_fusion(
         input,
@@ -378,7 +409,7 @@ def trtllm_mnnvl_fused_allreduce_add_rmsnorm(
         workspace.rank,
         True,  # RMSNorm Fusion
         launch_with_pdl,
-        use_oneshot,
+        strategy == MNNVLAllreduceFusionStrategy.ONESHOT,
         output,
         residual_out,
         residual_in,
