Skip to content

Commit 9ef83d1

Browse files
committed
fix ut for vllm kv layout
Signed-off-by: Qidi Sang <[email protected]>
1 parent 67bdec3 commit 9ef83d1

File tree

2 files changed

+2
-1
lines changed

2 files changed

+2
-1
lines changed

flashinfer/jit/xqa.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,4 +97,5 @@ def gen_xqa_module(
9797
+ flag_head_grp_size
9898
+ flag_sliding_window,
9999
extra_ldflags=["-lcuda"], # Add CUDA Driver API library
100+
extra_cflags=["-DPAGED_KV_CACHE_LAYOUT=1"],
100101
)

tests/attention/test_xqa.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ def ref_attention(
160160
@pytest.mark.parametrize("use_attention_sinks", [True, False])
161161
@pytest.mark.parametrize("seq_len", [2, 15, 256, 514])
162162
@pytest.mark.parametrize("batch_size", [1, 4])
163-
@pytest.mark.parametrize("nb_k_heads", [1, 4, 8])
163+
@pytest.mark.parametrize("nb_k_heads", [2, 4])
164164
@pytest.mark.parametrize("tokens_per_page", [16, 64])
165165
@pytest.mark.parametrize("valid_elems_per_head", [32, 128])
166166
@pytest.mark.parametrize("head_grp_size", [8, 16])

0 commit comments

Comments (0)