Commit 33db49e

add kv scale in test parameter
Signed-off-by: Qidi Sang <[email protected]>
1 parent b94cb61 commit 33db49e

File tree

5 files changed: +87 -87 lines

csrc/xqa/mha.cu
csrc/xqa/mha.h
csrc/xqa/mha_sm90.cu
csrc/xqa/mla_sm120.cu
tests/attention/test_xqa.py

csrc/xqa/mha.cu

Lines changed: 23 additions & 25 deletions

@@ -1301,8 +1301,7 @@ CUBIN_EXPORT __global__
 #endif
 #endif
     uint32_t const batchSize,
-    float kvCacheScale, // Device memory scalar. Same scale for K and V
-                        // cache. Used only for int8/fp8 KV cache.
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
     uint32_t kv_stride_page, uint32_t kv_stride_token, uint32_t kv_stride_head,
     uint32_t* __restrict__ semaphores = nullptr, void* __restrict__ scratch = nullptr) {
   assert(allowMultiBlockMode || gridDim.x == 1);

@@ -2410,8 +2409,7 @@ CUBIN_EXPORT __global__ __launch_bounds__(256, nbCtaPerSM) void kernel_mha(
     BeamSearchParams const beamSearchParams,
 #endif
     uint32_t const batchSize,
-    float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                        // Used only for int8/fp8 KV cache.
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
     uint32_t kv_stride_page, uint32_t kv_stride_token, uint32_t kv_stride_head,
     uint32_t* __restrict__ semaphores = nullptr, void* __restrict__ scratch = nullptr) {
 #if SPEC_DEC

@@ -2442,39 +2440,39 @@ static constexpr auto kernel_mha = kernel_mha_impl;
 #endif

 #ifndef GENERATE_CUBIN
-void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
+void launchMHA(
+    cudaDeviceProp const& prop, uint32_t nbKHeads,
 #if SLIDING_WINDOW
-               uint32_t slidingWinSize,
+    uint32_t slidingWinSize,
 #endif
-               float qScale, OutputHead* output,
+    float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-               float const* rcpOutScale,
+    float const* rcpOutScale,
 #endif
 #if USE_INPUT_KV
-               InputHead const* qkv,
+    InputHead const* qkv,
 #if ROPE_STYLE != 0
-               Vec<float, validElemsPerHead> const* ropeCosSin,
+    Vec<float, validElemsPerHead> const* ropeCosSin,
 #endif
 #else
-               InputHead const* q,
-#endif
-               float const* attentionSinks, // [headGrpSize]
-               GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
-               KVCachePageIndex const*
-                   kvCachePageList, // device pointer. shape:
-                                    // KVCachePageIndex[batchSize][beamWidth][2][maxNbPagesPerSeq].
-               uint32_t maxSeqLen, uint32_t const* seqLen,
+    InputHead const* q,
+#endif
+    float const* attentionSinks, // [headGrpSize]
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+    KVCachePageIndex const*
+        kvCachePageList, // device pointer. shape:
+                         // KVCachePageIndex[batchSize][beamWidth][2][maxNbPagesPerSeq].
+    uint32_t maxSeqLen, uint32_t const* seqLen,
 #if BEAM_WIDTH > 1
-               BeamSearchParams const& beamSearchParams,
+    BeamSearchParams const& beamSearchParams,
 #endif
-               uint32_t batchSize,
-               float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                                   // Used only for int8/fp8 KV cache.
+    uint32_t batchSize,
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
 #if SPEC_DEC
-               SpecDecParams const& specDecParams,
+    SpecDecParams const& specDecParams,
 #endif
-               uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
-               uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream) {
+    uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
+    uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream) {
 #if SPEC_DEC
   auto const qSeqLen = specDecParams.qSeqLen;
   auto const qCuSeqLens = specDecParams.qCuSeqLens;

csrc/xqa/mha.h

Lines changed: 35 additions & 37 deletions

@@ -88,39 +88,39 @@ struct BeamSearchParams {
   // but we have to match trt-llm API.
 };

-void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads,
+void launchMHA(
+    cudaDeviceProp const& prop, uint32_t const nbKHeads,
 #if SLIDING_WINDOW
-               uint32_t slidingWinSize,
+    uint32_t slidingWinSize,
 #endif
-               float qScale, OutputHead* output,
+    float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-               float const* rcpOutScale,
+    float const* rcpOutScale,
 #endif
 #if USE_INPUT_KV
-               InputHead const* qkv,
+    InputHead const* qkv,
 #if ROPE_STYLE != 0
-               Vec<float, validElemsPerHead> const* ropeCosSin,
+    Vec<float, validElemsPerHead> const* ropeCosSin,
 #endif
 #else
-               InputHead const* q,
-#endif
-               float const* attentionSinks, // [headGrpSize]
-               GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
-               KVCachePageIndex const*
-                   kvCachePageList, // device pointer. shape:
-                                    // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
-               uint32_t maxSeqLen, uint32_t const* seqLen,
+    InputHead const* q,
+#endif
+    float const* attentionSinks, // [headGrpSize]
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+    KVCachePageIndex const*
+        kvCachePageList, // device pointer. shape:
+                         // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
+    uint32_t maxSeqLen, uint32_t const* seqLen,
 #if BEAM_WIDTH > 1
-               BeamSearchParams const& beamSearchParams,
+    BeamSearchParams const& beamSearchParams,
 #endif
-               uint32_t batchSize,
-               float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                                   // Used only for int8/fp8 KV cache.
+    uint32_t batchSize,
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
 #if SPEC_DEC
-               SpecDecParams const& specDecParams,
+    SpecDecParams const& specDecParams,
 #endif
-               uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
-               uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream);
+    uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
+    uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream);

 void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32_t slidingWinSize,
                          float qScale, OutputHead* output,

@@ -165,8 +165,7 @@ void launchHopperF8MHA(
     BeamSearchParams const& beamSearchParams,
 #endif
     uint32_t batchSize,
-    float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                        // Used only for int8/fp8 KV cache.
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
 #if SPEC_DEC
     SpecDecParams const& specDecParams,
 #endif

@@ -188,18 +187,18 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
                                  uint64_t kv_stride_page, uint64_t kv_stride_token,
                                  uint64_t kv_stride_head, cudaStream_t stream);

-void launchMLA(cudaDeviceProp const& prop,
-               uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed
-               float qScale, OutputHead* output, InputHead const* q, GMemCacheHead* kCacheVLLM,
-               GMemCacheHead* vCacheVLLM,
-               KVCachePageIndex const*
-                   kvCachePageList, // device pointer. shape:
-                                    // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
-                                    // (Layout 0) or [batchSize][maxNbPagesPerSeq] (Layout 1)
-               uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
-               float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                                   // Used only for int8/fp8 KV cache.
-               uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);
+void launchMLA(
+    cudaDeviceProp const& prop,
+    uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed
+    float qScale, OutputHead* output, InputHead const* q, GMemCacheHead* kCacheVLLM,
+    GMemCacheHead* vCacheVLLM,
+    KVCachePageIndex const*
+        kvCachePageList, // device pointer. shape:
+                         // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
+                         // (Layout 0) or [batchSize][maxNbPagesPerSeq] (Layout 1)
+    uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);

 void launchMLAFlashInfer(
     uint32_t multiProcessorCount,

@@ -211,8 +210,7 @@ void launchMLAFlashInfer(
                          // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] (Layout 0) or
                          // [batchSize][maxNbPagesPerSeq] (Layout 1)
     uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
-    float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                        // Used only for int8/fp8 KV cache.
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
     uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
     uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream);
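The reworded comment describes a single scale shared by the K and V caches, used only when the cache is stored as int8/fp8; the parameter itself is a plain float. As a rough illustration of where such a scale (together with qScale) enters an attention computation, here is a small NumPy sketch; the names, shapes, and the qScale normalization convention are illustrative assumptions, not the kernels' actual implementation.

import numpy as np

def dequantize_kv(k_quant, v_quant, kv_cache_scale):
    # One scalar scale is shared by K and V, matching the "same scale for K and V cache" comment.
    return (k_quant.astype(np.float32) * kv_cache_scale,
            v_quant.astype(np.float32) * kv_cache_scale)

def ref_attention_with_scales(q, k_quant, v_quant, q_scale, kv_cache_scale):
    # Reference-style attention over a dequantized KV cache; only meant to show where the
    # scales enter, not to mirror the kernels' exact math.
    k, v = dequantize_kv(k_quant, v_quant, kv_cache_scale)
    logits = q_scale * (q @ k.T) / np.sqrt(q.shape[-1])  # qScale convention is illustrative
    logits -= logits.max(axis=-1, keepdims=True)         # numerically stable softmax
    probs = np.exp(logits)
    probs /= probs.sum(axis=-1, keepdims=True)
    return probs @ v

# Example: one query head against a 16-token cache stored in a low-precision dtype.
q = np.random.default_rng(0).standard_normal((1, 64)).astype(np.float32)
k8 = np.random.default_rng(1).standard_normal((16, 64)).astype(np.float16)  # stand-in for fp8
v8 = np.random.default_rng(2).standard_normal((16, 64)).astype(np.float16)
out = ref_attention_with_scales(q, k8, v8, q_scale=1.0, kv_cache_scale=0.5)
assert out.shape == (1, 64)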

csrc/xqa/mha_sm90.cu

Lines changed: 2 additions & 4 deletions

@@ -626,8 +626,7 @@ __launch_bounds__(128 * 3)
     BeamSearchParams const beamSearchParams,
 #endif
     uint32_t const batchSize,
-    float kvCacheScale, // Device memory scalar. Same scale for K and
-                        // V cache. Used only for int8/fp8 KV cache.
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
     __grid_constant__ CUtensorMap const tensorMapVLLMK,
     __grid_constant__ CUtensorMap const tensorMapVLLMV,
 #if SPEC_DEC

@@ -2931,8 +2930,7 @@ void launchHopperF8MHA(
     BeamSearchParams const& beamSearchParams,
 #endif
     uint32_t batchSize,
-    float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                        // Used only for int8/fp8 KV cache.
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
 #if SPEC_DEC
     SpecDecParams const& specDecParams,
 #endif

csrc/xqa/mla_sm120.cu

Lines changed: 15 additions & 18 deletions

@@ -395,8 +395,7 @@ struct KernelArgs {
   OutputHead* __restrict__ const& output; // [totalNbIntputTokens][nbQHeads]
   KVCacheList<usePagedKVCache> const& cacheList;
   uint32_t const& batchSize;
-  float kvCacheScale; // Device memory scalar. Same scale for K and V
-                      // cache. Used only for int8/fp8 KV cache.
+  float kvCacheScale; // Same scale for K and V cache. Used only for int8/fp8 KV cache.
   Vec<CgaXBuffer, nbProducerCtasPerCga>* __restrict__ const&
       cgaXBuf; // [totalNbInputTokens][maxNbSubSeq]
   uint32_t* __restrict__ const& semaphores; // [totalNbInputTokens]

@@ -1553,8 +1552,7 @@ __launch_bounds__(32 * 4 * 3, 1) __cluster_dims__(cgaSize, 1, 1) void kernel_mha
     float const qScale,
     OutputHead* __restrict__ const output, // [totalNbIntputTokens][nbQHeads]
     KVCacheList<usePagedKVCache> const cacheList, uint32_t const batchSize,
-    float kvCacheScale, // Device memory scalar. Same scale for K and V
-                        // cache. Used only for int8/fp8 KV cache.
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
     Vec<CgaXBuffer,
         nbProducerCtasPerCga>* __restrict__ const cgaXBuf, // [totalNbInputTokens][maxNbSubSeq]
     uint32_t* __restrict__ const semaphores = nullptr, // [totalNbInputTokens]

@@ -1648,18 +1646,18 @@ CUtensorMap makeTensorMapForQ(void const* addr, CUtensorMapDataType_enum dataTyp
 }
 #endif // IS_MLA

-void launchMLA(cudaDeviceProp const& prop,
-               uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed
-               float qScale, OutputHead* output, InputHead const* q,
-               GMemCacheHead* kCacheVLLM, // K cache pool for VLLM layout
-               GMemCacheHead* vCacheVLLM, // V cache pool for VLLM layout
-               KVCachePageIndex const* kvCachePageList, // device pointer. shape:
-                                                        // [batchSize][maxNbPagesPerSeq] (Layout 1)
-               uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
-               float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                                   // Used only for int8/fp8 KV cache.
-               uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
-               uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream) {
+void launchMLA(
+    cudaDeviceProp const& prop,
+    uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed
+    float qScale, OutputHead* output, InputHead const* q,
+    GMemCacheHead* kCacheVLLM, // K cache pool for VLLM layout
+    GMemCacheHead* vCacheVLLM, // V cache pool for VLLM layout
+    KVCachePageIndex const* kvCachePageList, // device pointer. shape:
+                                             // [batchSize][maxNbPagesPerSeq] (Layout 1)
+    uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
+    uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
+    uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream) {
 #if IS_MLA
   static_assert(
       SLIDING_WINDOW == 0 && LOW_PREC_OUTPUT == 0 && USE_INPUT_KV == 0 && USE_BEAM_SEARCH == 0,

@@ -1778,8 +1776,7 @@ void launchMLAFlashInfer(
     KVCachePageIndex const* kvCachePageList, // device pointer. shape:
                                              // [batchSize][maxNbPagesPerSeq] (Layout 1)
     uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
-    float kvCacheScale, // Device memory scalar. Same scale for K and V cache.
-                        // Used only for int8/fp8 KV cache.
+    float kvCacheScale, // Same scale for K and V cache. Used only for int8/fp8 KV cache.
     uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
     uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream) {
 #if IS_MLA

tests/attention/test_xqa.py

Lines changed: 12 additions & 3 deletions

@@ -29,7 +29,6 @@ def div_up(a, b):
 sm_count = props.multi_processor_count

 beam_width = 1
-q_scale = 1.0


 class CacheSeq:

@@ -181,6 +180,8 @@ def ref_attention(
 @pytest.mark.parametrize("valid_elems_per_head", [32, 128])
 @pytest.mark.parametrize("head_grp_size", [8, 16])
 @pytest.mark.parametrize("kv_layout", ["NHD", "HND"])
+@pytest.mark.parametrize("kv_scale", [1.0, 0.5])
+@pytest.mark.parametrize("q_scale", [1.0, 0.5])
 def test_xqa(
     batch_size,
     nb_k_heads,

@@ -194,7 +195,11 @@ def test_xqa(
     use_sliding_window,
     enable_pdl,
     kv_layout,
+    kv_scale,
+    q_scale,
 ):
+    if kv_scale != 1.0 and fp8_kv_cache is False:
+        pytest.skip("kv cache scale works only for fp8 kv cache")
     set_random_seed(42)

     nb_q_heads = nb_k_heads * head_grp_size

@@ -347,7 +352,7 @@ def cache_head_at(
     )
     seq_len_list.fill_(seq_len)

-    kv_cache_scale = 1.0
+    kv_cache_scale = kv_scale

     nb_seq = nb_k_heads * batch_size
     nb_semaphores = round_up(nb_seq, 2) + 2 + nb_seq + 2

@@ -443,6 +448,8 @@ def cache_head_at(
     get_compute_capability(torch.device(device="cuda"))[0] not in [12],
     reason="XQA mla is only supported on SM120 GPUs",
 )
+@pytest.mark.parametrize("kv_scale", [1.0, 0.5])
+@pytest.mark.parametrize("q_scale", [1.0, 0.5])
 @pytest.mark.parametrize("enable_pdl", [True, False])
 @pytest.mark.parametrize("seq_len", [2, 15, 256, 514, 2048])
 @pytest.mark.parametrize("batch_size", [1, 2])

@@ -451,6 +458,8 @@ def test_xqa_mla(
     batch_size,
     seq_len,
     tokens_per_page,
+    kv_scale,
+    q_scale,
     enable_pdl,
 ):
     set_random_seed(42)

@@ -570,7 +579,7 @@ def cache_head_at(
     )
     seq_len_list.fill_(seq_len)

-    kv_cache_scale = 1.0
+    kv_cache_scale = kv_scale

     nb_seq = nb_k_heads * batch_size
     nb_semaphores = round_up(nb_seq, 2) + 2 + nb_seq + 2
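The test changes above follow the standard pytest parametrization pattern: kv_scale and q_scale become test parameters, unsupported combinations are skipped early, and kv_cache_scale is now taken from the kv_scale parameter instead of being hard-coded to 1.0. A stripped-down, self-contained sketch of the same pattern (the real test_xqa additionally builds the KV cache, launches the kernel, and checks against ref_attention, all omitted here):

import pytest

@pytest.mark.parametrize("fp8_kv_cache", [True, False])
@pytest.mark.parametrize("kv_scale", [1.0, 0.5])
@pytest.mark.parametrize("q_scale", [1.0, 0.5])
def test_kv_scale_parametrization(fp8_kv_cache, kv_scale, q_scale):
    # Same guard as in test_xqa: a non-unit KV-cache scale is only meaningful for fp8 KV cache.
    if kv_scale != 1.0 and fp8_kv_cache is False:
        pytest.skip("kv cache scale works only for fp8 kv cache")

    # In the real test this value feeds the kernel launch; here we only check the bookkeeping
    # so the snippet stays runnable on its own.
    kv_cache_scale = kv_scale
    assert kv_cache_scale in (1.0, 0.5)
    assert q_scale in (1.0, 0.5)

Running this with pytest executes eight parameter combinations and skips the two where kv_scale is 0.5 but the KV cache is not fp8.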
