
Commit 67bdec3

use vLLM KV layout, but some unit tests fail
Signed-off-by: Qidi Sang <[email protected]>
1 parent 82957fc

8 files changed: +149 −82 lines

csrc/flashinfer_xqa_binding.cu

Lines changed: 6 additions & 1 deletion

@@ -21,7 +21,12 @@ void xqa_wrapper(bool run_fp8_mha, int64_t multiProcessorCount, int64_t nbKHeads
 #if LOW_PREC_OUTPUT
     TensorView rcpOutScale,
 #endif
-    TensorView q, tvm::ffi::Optional<TensorView> attentionSinks, TensorView pool,
+    TensorView q, tvm::ffi::Optional<TensorView> attentionSinks,
+#if PAGED_KV_CACHE_LAYOUT == 1
+    TensorView kCacheVLLM, TensorView vCacheVLLM,
+#else
+    TensorView pool,
+#endif
     TensorView kvCachePageList, int64_t maxSeqLen, TensorView seqLen,
     int64_t batchSize, TensorView kvCacheScale,
 #if SPEC_DEC

csrc/xqa/mha.cu

Lines changed: 11 additions & 1 deletion

@@ -2659,7 +2659,12 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
 #if LOW_PREC_OUTPUT
     float const* rcpOutScale,
 #endif
-    InputHead const* q, float const* attentionSinks, GMemCacheHead* pool,
+    InputHead const* q, float const* attentionSinks,
+#if PAGED_KV_CACHE_LAYOUT == 1
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+#else
+    GMemCacheHead* pool,
+#endif
     KVCachePageIndex const* kvCachePageList, uint32_t maxSeqLen,
     uint32_t const* seqLen, uint32_t batchSize,
     float const* __restrict__ kvCacheScale,
@@ -2691,7 +2696,12 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
   auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
 #if USE_PAGED_KV_CACHE
   uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
+#if PAGED_KV_CACHE_LAYOUT == 1
+  KVCacheList<true> const cacheList{kCacheVLLM, vCacheVLLM, kvCachePageList, seqLen,
+                                    maxNbPagesPerSeq};
+#else
   KVCacheList<true> const cacheList{pool, kvCachePageList, seqLen, maxNbPagesPerSeq};
+#endif
   cudaLaunchKernelEx(&launchCfg, kernel_mha,
 #if SPEC_DEC
       qSeqLen, nbKHeads, headGrpSize, qCuSeqLens,
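
With layout 1, the cache list carries two base pointers (one per K and V pool) in place of the single pooled pointer. The sketch below shows how such a split list could resolve the K head for one token. It is an illustration only: the struct fields, the helper name kHeadAt, and the [page][token][head] ordering inside each pool are assumptions, not the actual KVCacheList definition from this commit.

// Illustration only (not from this commit). Assumes GMemCacheHead and
// KVCachePageIndex from csrc/xqa/mha.h, and a [page][token][head] ordering
// inside each pool.
#include <cstddef>
#include <cstdint>
#include "mha.h"

struct SplitKVCacheList {
  GMemCacheHead* kCacheVLLM;                // base of the K pool
  GMemCacheHead* vCacheVLLM;                // base of the V pool
  KVCachePageIndex const* kvCachePageList;  // [batchSize][maxNbPagesPerSeq]
  uint32_t const* seqLen;
  uint32_t maxNbPagesPerSeq;
};

// Resolve the K head for (batch b, token t, head h): look up the page that
// holds token t, then offset by the token's slot within that page.
__host__ __device__ inline GMemCacheHead* kHeadAt(
    SplitKVCacheList const& list, uint32_t b, uint32_t t, uint32_t h,
    uint32_t tokensPerPage, uint32_t nbKHeads) {
  KVCachePageIndex const page =
      list.kvCachePageList[b * list.maxNbPagesPerSeq + t / tokensPerPage];
  size_t const tokInPage = t % tokensPerPage;
  return list.kCacheVLLM +
         (static_cast<size_t>(page) * tokensPerPage + tokInPage) * nbKHeads + h;
}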

csrc/xqa/mha.h

Lines changed: 13 additions & 3 deletions

@@ -135,7 +135,12 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
 #if LOW_PREC_OUTPUT
     float const* rcpOutScale,
 #endif
-    InputHead const* q, float const* attentionSinks, GMemCacheHead* pool,
+    InputHead const* q, float const* attentionSinks,
+#if PAGED_KV_CACHE_LAYOUT == 1
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+#else
+    GMemCacheHead* pool,
+#endif
     KVCachePageIndex const* kvCachePageList, uint32_t maxSeqLen,
     uint32_t const* seqLen, uint32_t batchSize,
     float const* __restrict__ kvCacheScale,
@@ -192,8 +197,13 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
     float const* rcpOutScale,
 #endif
     InputHead const* q, float const* attentionSinks,
-    GMemCacheHead* pool, KVCachePageIndex const* kvCachePageList,
-    uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
+#if PAGED_KV_CACHE_LAYOUT == 1
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+#else
+    GMemCacheHead* pool,
+#endif
+    KVCachePageIndex const* kvCachePageList, uint32_t maxSeqLen,
+    uint32_t const* seqLen, uint32_t batchSize,
     float const* __restrict__ kvCacheScale,
 #if SPEC_DEC
     uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,

csrc/xqa/mha_sm90.cu

Lines changed: 7 additions & 2 deletions

@@ -3171,8 +3171,13 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
     float const* rcpOutScale,
 #endif
     InputHead const* q, float const* attentionSinks,
-    GMemCacheHead* pool, KVCachePageIndex const* kvCachePageList,
-    uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
+#if PAGED_KV_CACHE_LAYOUT == 1
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+#else
+    GMemCacheHead* pool,
+#endif
+    KVCachePageIndex const* kvCachePageList, uint32_t maxSeqLen,
+    uint32_t const* seqLen, uint32_t batchSize,
     float const* __restrict__ kvCacheScale,
 #if SPEC_DEC
     uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,

csrc/xqa/xqa_wrapper.cu

Lines changed: 11 additions & 1 deletion

@@ -24,7 +24,12 @@ void xqa_wrapper(bool run_fp8_mha, int64_t multiProcessorCount, int64_t nbKHeads
 #if LOW_PREC_OUTPUT
     TensorView rcpOutScale,
 #endif
-    TensorView q, Optional<TensorView> attentionSinks, TensorView pool,
+    TensorView q, Optional<TensorView> attentionSinks,
+#if PAGED_KV_CACHE_LAYOUT == 1
+    TensorView kCacheVLLM, TensorView vCacheVLLM,
+#else
+    TensorView pool,
+#endif
     TensorView kvCachePageList, int64_t maxSeqLen, TensorView seqLen,
     int64_t batchSize, TensorView kvCacheScale,
 #if SPEC_DEC
@@ -43,7 +48,12 @@ void xqa_wrapper(bool run_fp8_mha, int64_t multiProcessorCount, int64_t nbKHeads
     reinterpret_cast<float const*>(rcpOutScale->data),
 #endif
     reinterpret_cast<InputHead const*>(q->data), attentionSinksPtr,
+#if PAGED_KV_CACHE_LAYOUT == 1
+    reinterpret_cast<GMemCacheHead*>(kCacheVLLM->data),
+    reinterpret_cast<GMemCacheHead*>(vCacheVLLM->data),
+#else
     reinterpret_cast<GMemCacheHead*>(pool->data),
+#endif
     reinterpret_cast<KVCachePageIndex const*>(kvCachePageList->data), maxSeqLen,
     reinterpret_cast<uint32_t const*>(seqLen->data), batchSize,
     reinterpret_cast<float const*>(kvCacheScale->data),
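
The wrapper reinterprets raw TensorView data pointers without shape checks, so the host caller is responsible for handing over caches in the expected geometry. As a reference point, a plausible vLLM-style geometry is sketched below; the dimension order is an assumption for illustration and is not specified by this commit.

// Assumed geometry (illustration only; dim order NOT confirmed by this commit):
//   kCacheVLLM, vCacheVLLM : [numPages, tokensPerPage, nbKHeads, headDim]
//   kvCachePageList        : [batchSize, maxNbPagesPerSeq] page indices
// Each GMemCacheHead is one (token, head) vector, so one page index advances
// both pools by the same number of heads:
#include <cstddef>
constexpr size_t headsPerPage(size_t tokensPerPage, size_t nbKHeads) {
  return tokensPerPage * nbKHeads;
}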

flashinfer/jit/xqa.py

Lines changed: 2 additions & 0 deletions

@@ -25,6 +25,8 @@
 
 xqa_nvcc_flags = [
     "-DNDEBUG=1",
+    "-DUSE_PAGED_KV_CACHE=1",
+    "-DPAGED_KV_CACHE_LAYOUT=1",
     "-DBEAM_WIDTH=1",
     "-DUSE_INPUT_KV=0",
     "-DUSE_CUSTOM_BARRIER=1",

flashinfer/xqa.py

Lines changed: 10 additions & 5 deletions

@@ -60,7 +60,8 @@ def xqa(
     output: torch.Tensor,
     q: torch.Tensor,
     attentionSinks: Optional[torch.Tensor],
-    pool: torch.Tensor,
+    kCacheVLLM: torch.Tensor,
+    vCacheVLLM: torch.Tensor,
     kvCachePageList: torch.Tensor,
     maxSeqLen: int,
     seqLen: torch.Tensor,
@@ -78,7 +79,8 @@ def xqa(
         output,
         q,
         attentionSinks,
-        pool,
+        kCacheVLLM,
+        vCacheVLLM,
         kvCachePageList,
         maxSeqLen,
         seqLen,
@@ -100,7 +102,8 @@ def _fake_xqa(
     output: torch.Tensor,
     q: torch.Tensor,
     attentionSinks: Optional[torch.Tensor],
-    pool: torch.Tensor,
+    kCacheVLLM: torch.Tensor,
+    vCacheVLLM: torch.Tensor,
     kvCachePageList: torch.Tensor,
     maxSeqLen: int,
     seqLen: torch.Tensor,
@@ -131,7 +134,8 @@ def xqa(
     output: torch.Tensor,
     q: torch.Tensor,
     attentionSinks: Optional[torch.Tensor],
-    pool: torch.Tensor,
+    kCacheVLLM: torch.Tensor,
+    vCacheVLLM: torch.Tensor,
     kvCachePageList: torch.Tensor,
     maxSeqLen: int,
     seqLen: torch.Tensor,
@@ -161,7 +165,8 @@ def xqa(
         output,
         q,
         attentionSinks,
-        pool,
+        kCacheVLLM,
+        vCacheVLLM,
         kvCachePageList,
         maxSeqLen,
         seqLen,
