Commit 018b551
feat: Add fp8-qkv, fp16/bf16 output MHA (#1540)
## 🚀 Pull Request Checklist

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
1 parent 27f9c8f commit 018b551
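
At a glance, this change lets the TRT-LLM attention paths take fp8 (`torch.float8_e4m3fn`) query/key/value while writing fp16 or bf16 output: `out_dtype` now defaults to `query.dtype` and is rejected unless it is the query dtype, `torch.float16`, or `torch.bfloat16`. A minimal standalone sketch of that guard (not the library code itself; `validate_out_dtype` is a hypothetical helper used only for illustration):

```python
from typing import Optional

import torch


def validate_out_dtype(
    query_dtype: torch.dtype, out_dtype: Optional[torch.dtype]
) -> torch.dtype:
    # Mirrors the guard added to trtllm_batch_decode_with_kv_cache and
    # trtllm_batch_context_with_kv_cache: default to the query dtype and only
    # accept fp16/bf16 as alternative output dtypes.
    out_dtype = out_dtype or query_dtype
    if out_dtype not in (query_dtype, torch.float16, torch.bfloat16):
        raise ValueError(f"Unsupported out_dtype: {out_dtype}")
    return out_dtype


# fp8 q/k/v with bf16 output is now an accepted combination:
assert validate_out_dtype(torch.float8_e4m3fn, torch.bfloat16) == torch.bfloat16
```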

7 files changed: +53 −41 lines

flashinfer/decode.py

Lines changed: 11 additions & 9 deletions
@@ -48,7 +48,7 @@
     _check_cached_qkv_data_type,
     _check_kv_layout,
     _check_pos_encoding_mode,
-    _check_shape_dtype_device,
+    check_shape_dtype_device,
     _get_cache_alibi_slopes_buf,
     _get_cache_buf,
     _get_range_buf,
@@ -1229,14 +1229,14 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
 
         if out is None:
             out = torch.empty_like(q)
         else:
-            _check_shape_dtype_device(out, q.shape, q.dtype, q.device, "out")
+            check_shape_dtype_device(out, q.shape, q.dtype, q.device, "out")
 
         if self.use_tensor_cores:
             run_args = [
@@ -1747,7 +1747,7 @@ def run(
         if out is None:
             out = torch.empty_like(q_nope, device=device)
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
             )
 
@@ -1759,7 +1759,7 @@
                 device=device,
             )
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 lse,
                 (q_nope.size(0), q_nope.size(1)),
                 q_nope.dtype,
@@ -2107,9 +2107,9 @@ def trtllm_batch_decode_with_kv_cache(
         assert isinstance(out, torch.Tensor)
 
         # Use uint8 as the container dtype to compliant with next fp4 gemm.
-        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+        check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
 
-        _check_shape_dtype_device(
+        check_shape_dtype_device(
             out_scale_factor,
             fp4_out_scale_shape,
             torch.float8_e4m3fn,
@@ -2135,7 +2135,9 @@ def trtllm_batch_decode_with_kv_cache(
         o_sf_start_index = 0
         out_dtype = out_dtype or query.dtype
         out = out if out is not None else torch.empty_like(query, dtype=out_dtype)
-        _check_shape_dtype_device(out, query.shape, query.dtype, query.device, "out")
+        if out_dtype not in (query.dtype, torch.float16, torch.bfloat16):
+            raise ValueError(f"Unsupported out_dtype: {out_dtype}")
+        check_shape_dtype_device(out, query.shape, out_dtype, query.device, "out")
     else:
         raise ValueError(f"Invalid out_dtype: {out_dtype}")
 
@@ -2288,7 +2290,7 @@ def trtllm_batch_decode_with_kv_cache_mla(
         out = torch.empty(out_shape, dtype=torch.bfloat16, device=query.device)
     else:
         batch_size, _, num_q_heads, _ = query.shape
-        _check_shape_dtype_device(
+        check_shape_dtype_device(
             out,
             [batch_size, num_q_heads, kv_lora_rank],
             torch.bfloat16,

flashinfer/fused_moe/core.py

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@
 from ..jit.cubin_loader import get_cubin
 from ..jit.cutlass_gemm.generate_kernels import generate_gemm_operations
 from ..utils import (
-    _check_shape_dtype_device,
+    check_shape_dtype_device,
     device_support_pdl,
     get_shuffle_matrix_a_row_indices,
     get_shuffle_matrix_sf_a_row_indices,
@@ -868,7 +868,7 @@ def cutlass_fused_moe(
     if output is None:
         output = torch.empty(output_shape, dtype=output_dtype, device=input.device)
     else:
-        _check_shape_dtype_device(
+        check_shape_dtype_device(
             output, output_shape, output_dtype, input.device, "output"
         )

flashinfer/mla.py

Lines changed: 4 additions & 4 deletions
@@ -22,7 +22,7 @@
 from .jit import JitSpec
 from .jit import env as jit_env
 from .jit import gen_batch_mla_module, gen_jit_spec, sm100a_nvcc_flags
-from .utils import MaskMode, _check_shape_dtype_device, determine_mla_backend
+from .utils import MaskMode, check_shape_dtype_device, determine_mla_backend
 
 
 def _check_cutlass_shape(q_nope_pe, ckv_kpe_cache, kv_len, page_table):
@@ -394,7 +394,7 @@ def run(
         if out is None:
             out = torch.empty_like(q_nope)
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
             )
         q_nope_pe = torch.cat([q_nope, q_pe], dim=-1)
@@ -426,15 +426,15 @@ def run(
         if out is None:
             out = torch.empty_like(q_nope)
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
            )
 
         if return_lse:
             if lse is None:
                 lse = torch.empty(q_nope.shape[:2], dtype=torch.float32, device=device)
             else:
-                _check_shape_dtype_device(
+                check_shape_dtype_device(
                     lse, q_nope.shape[:2], torch.float32, q_nope.device, "lse"
                 )
         profiler_args = (profiler_buffer,) if self._use_profiler else ()

flashinfer/prefill.py

Lines changed: 10 additions & 8 deletions
@@ -43,7 +43,7 @@
     _check_cached_qkv_data_type,
     _check_kv_layout,
     _check_pos_encoding_mode,
-    _check_shape_dtype_device,
+    check_shape_dtype_device,
     _get_cache_alibi_slopes_buf,
     _get_cache_buf,
     _unpack_paged_kv_cache,
@@ -2032,7 +2032,7 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
 
@@ -2041,7 +2041,7 @@
                 q.shape[:-1] + v_cache.shape[-1:], dtype=q.dtype, device=q.device
             )
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 out, q.shape[:-1] + v_cache.shape[-1:], q.dtype, q.device, "out"
             )
 
@@ -2831,15 +2831,15 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
         if out is None:
             out = torch.empty(
                 q.shape[:-1] + v.shape[-1:], dtype=q.dtype, device=q.device
             )
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 out, q.shape[:-1] + v.shape[-1:], q.dtype, q.device, "out"
             )
         if self._backend == "cutlass":
@@ -3365,9 +3365,9 @@ def trtllm_batch_context_with_kv_cache(
         assert isinstance(out, torch.Tensor)
 
         # Use uint8 as the container dtype to compliant with next fp4 gemm.
-        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+        check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
 
-        _check_shape_dtype_device(
+        check_shape_dtype_device(
             out_scale_factor,
             fp4_out_scale_shape,
             torch.float8_e4m3fn,
@@ -3392,8 +3392,10 @@ def trtllm_batch_context_with_kv_cache(
         out_scale_factor = None
         o_sf_start_index = 0
         out_dtype = out_dtype or query.dtype
+        if out_dtype not in (query.dtype, torch.float16, torch.bfloat16):
+            raise ValueError(f"Unsupported out_dtype: {out_dtype}")
         out = out if out is not None else torch.empty_like(query, dtype=out_dtype)
-        _check_shape_dtype_device(out, query.shape, query.dtype, query.device, "out")
+        check_shape_dtype_device(out, query.shape, out_dtype, query.device, "out")
     else:
         raise ValueError(f"Invalid out_dtype: {out_dtype}")

flashinfer/sparse.py

Lines changed: 5 additions & 5 deletions
@@ -28,7 +28,7 @@
     PosEncodingMode,
     TensorLayout,
     _check_pos_encoding_mode,
-    _check_shape_dtype_device,
+    check_shape_dtype_device,
     _get_cache_alibi_slopes_buf,
     canonicalize_torch_dtype,
     determine_attention_backend,
@@ -577,14 +577,14 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
 
         if out is None:
             out = torch.empty_like(q, dtype=self._o_dtype)
         else:
-            _check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")
+            check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")
 
         if is_float8(q):
             assert q.dtype == k.dtype == v.dtype
@@ -1157,14 +1157,14 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            _check_shape_dtype_device(
+            check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
 
         if out is None:
             out = torch.empty_like(q, dtype=self._o_dtype)
         else:
-            _check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")
+            check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")
 
         if self._backend == "fa3":
             if (

flashinfer/utils.py

Lines changed: 7 additions & 7 deletions
@@ -443,22 +443,22 @@ def determine_mla_backend(device: torch.device) -> str:
     return "fa3" if is_sm90a_supported(device) else "fa2"
 
 
-def _check_shape_dtype_device(
+def check_shape_dtype_device(
     x: torch.Tensor,
-    expected_shape: Sequence[int],
-    expected_dtype: torch.dtype,
-    expected_device: torch.device,
+    expected_shape: Optional[Sequence[int]],
+    expected_dtype: Optional[torch.dtype],
+    expected_device: Optional[torch.device],
     name: str,
 ) -> None:
-    if x.shape != torch.Size(expected_shape):
+    if expected_shape and x.shape != torch.Size(expected_shape):
         raise ValueError(
             f"Invalid shape of {name}: expected {expected_shape}, got {x.shape}"
         )
-    if x.dtype != expected_dtype:
+    if expected_dtype and x.dtype != expected_dtype:
         raise ValueError(
             f"Invalid dtype of {name}: expected {expected_dtype}, got {x.dtype}"
         )
-    if x.device != expected_device:
+    if expected_device and x.device != expected_device:
         raise ValueError(
             f"Invalid device of {name}: expected {expected_device}, got {x.device}"
         )
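
The helper is renamed to the public `check_shape_dtype_device`, and each expected value is now optional: passing `None` skips that particular check. A small usage sketch, assuming the helper stays importable from `flashinfer.utils`:

```python
import torch

from flashinfer.utils import check_shape_dtype_device  # public name after this change

out = torch.empty(4, 8, dtype=torch.bfloat16)

# Check shape, dtype, and device against expectations.
check_shape_dtype_device(out, (4, 8), torch.bfloat16, out.device, "out")

# Passing None for a field skips that check, e.g. accept any dtype here.
check_shape_dtype_device(out, (4, 8), None, out.device, "out")
```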

tests/test_trtllm_gen_attention.py

Lines changed: 14 additions & 6 deletions
@@ -8,7 +8,7 @@
 from flashinfer.utils import FP4Tensor, ceil_div, round_up
 
 DTYPE_MAP = {
-    "half": torch.float16,
+    "fp16": torch.float16,
     "bf16": torch.bfloat16,
     "fp8": torch.float8_e4m3fn,
     "nvfp4": "nvfp4",
@@ -237,8 +237,10 @@ def unpack_compare_nvfp4(
 @pytest.mark.parametrize(
     "q_dtype,kv_dtype,o_dtype",
     [
-        ("half", "half", "half"),
         ("bf16", "bf16", "bf16"),
+        ("fp16", "fp16", "fp16"),
+        ("fp8", "fp8", "bf16"),
+        ("fp8", "fp8", "fp16"),
         ("fp8", "fp8", "fp8"),
         ("fp8", "fp8", "nvfp4"),
     ],
@@ -355,8 +357,10 @@ def test_trtllm_batch_prefill(
         )
         assert o_scale == 1.0
         rtol, atol = 4e-1, 1e0
-    elif o_dtype == "fp8":
+    elif q_dtype == "fp8" and o_dtype == "fp8":
         rtol, atol = 5e-2, 7e-2
+    elif q_dtype == "fp8" and o_dtype in ["bf16", "fp16"]:
+        rtol, atol = 4e-2, 6e-2
     else:
         rtol, atol = 1e-2, 1e-2
@@ -399,10 +403,12 @@ def test_trtllm_batch_prefill(
 @pytest.mark.parametrize(
     "q_dtype,kv_dtype,o_dtype",
     [
-        ("half", "half", "half"),
-        ("half", "fp8", "half"),
         ("bf16", "bf16", "bf16"),
+        ("fp16", "fp16", "fp16"),
         ("bf16", "fp8", "bf16"),
+        ("fp16", "fp8", "fp16"),
+        ("fp8", "fp8", "bf16"),
+        ("fp8", "fp8", "fp16"),
         ("fp8", "fp8", "fp8"),
         ("fp8", "fp8", "nvfp4"),
     ],
@@ -512,8 +518,10 @@ def test_trtllm_batch_decode(
         )
         assert o_scale == 1.0
         rtol, atol = 3e-1, 1e0
-    elif o_dtype == "fp8":
+    elif q_dtype == "fp8" and o_dtype == "fp8":
         rtol, atol = 5e-2, 7e-2
+    elif q_dtype == "fp8" and o_dtype in ["bf16", "fp16"]:
+        rtol, atol = 4e-2, 6e-2
     else:
         rtol, atol = 1e-2, 1e-2
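
The tests now key fp16 as "fp16" (instead of "half") in `DTYPE_MAP` and cover fp8 q/kv with fp16/bf16 output, using looser tolerances whenever the inputs are fp8. A simplified restatement of the tolerance selection above (not the full test logic; the nvfp4 branches are omitted and `pick_tolerances` is only an illustrative helper):

```python
import torch

DTYPE_MAP = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}


def pick_tolerances(q_dtype: str, o_dtype: str) -> tuple:
    # Simplified from test_trtllm_batch_prefill / test_trtllm_batch_decode.
    if q_dtype == "fp8" and o_dtype == "fp8":
        return 5e-2, 7e-2  # fp8 in, fp8 out
    if q_dtype == "fp8" and o_dtype in ("bf16", "fp16"):
        return 4e-2, 6e-2  # fp8 in, fp16/bf16 out (the new path)
    return 1e-2, 1e-2  # fp16/bf16 in and out


rtol, atol = pick_tolerances("fp8", "bf16")
```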
