test: Enable testing for trtllm-gen decode bs1 (#2103)

bkryu · yzh119 · web-flow · commit b9964cc30edd · 2025-11-18T21:10:28.000-08:00
## 📌 Description

In #1898, it was reported that trtllm-gen's attention kernels fail for batch size 1. The prefill kernel was fixed in #1912, and the prefill tests have been enabled. Further updates to the trtllm-gen kernels have also fixed the decode batch size 1 issue. This PR re-enables the decode batch size 1 tests.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

## Summary by CodeRabbit

* **Tests**
  * Expanded batch_decode test scenarios to cover additional small-batch and page-size combinations.
  * Increased coverage for max_in_kv_len by testing multiple length options instead of a single value.
  * Restored previously marked-as-expected-failure case to run normally, improving overall test pass coverage.

---------

Co-authored-by: Zihao Ye <expye@outlook.com>
diff --git a/tests/attention/test_trtllm_gen_attention.py b/tests/attention/test_trtllm_gen_attention.py
@@ -1041,6 +1041,7 @@ def test_trtllm_batch_decode(
     "batch_size,q_len_per_req,page_size,num_kv_heads,head_grp_size",
     [
         (1, 1, 16, 8, 8),
+        (1, 1, 32, 8, 8),
     ],
 )
 @pytest.mark.parametrize("window_left", [-1])
@@ -1052,7 +1053,7 @@ def test_trtllm_batch_decode(
 )
 @pytest.mark.parametrize("enable_pdl", [None])
 @pytest.mark.parametrize("enable_sink", [False])
-@pytest.mark.parametrize("max_in_kv_len", [8192])
+@pytest.mark.parametrize("max_in_kv_len", [4096, 8192])
 @pytest.mark.parametrize("head_dim", [128])
 @pytest.mark.parametrize("device_scale", [True, False])
 def test_trtllm_batch_decode_bs1(
@@ -1073,7 +1074,6 @@ def test_trtllm_batch_decode_bs1(
     device_scale,
 ):
     # Small number of test cases for batch size 1
-    pytest.xfail("trtllm-gen decode gets incorrect output with bs1")
     _test_trtllm_batch_decode(
         "trtllm-gen",
         kv_layout,