flashinfer-ai
diff --git a/‎csrc/flashinfer_xqa_binding.cu‎
Lines changed: 3 additions & 4 deletions b/‎csrc/flashinfer_xqa_binding.cu‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎csrc/xqa/mha.cu‎
Lines changed: 28 additions & 29 deletions b/‎csrc/xqa/mha.cu‎
Lines changed: 28 additions & 29 deletions
diff --git a/‎csrc/xqa/mha.h‎
Lines changed: 39 additions & 42 deletions b/‎csrc/xqa/mha.h‎
Lines changed: 39 additions & 42 deletions
diff --git a/‎csrc/xqa/mha_sm90.cu‎
Lines changed: 9 additions & 10 deletions b/‎csrc/xqa/mha_sm90.cu‎
Lines changed: 9 additions & 10 deletions
@@ -19,9 +19,8 @@
 #if MLA_WRAPPER
 void xqa_wrapper_mla(int64_t multiProcessorCount, double qScale, TensorView output, TensorView q,
                      TensorView kCacheVLLM, TensorView vCacheVLLM, TensorView kvCachePageList,
-                     int64_t maxSeqLen, TensorView seqLen, int64_t batchSize,
-                     TensorView kvCacheScale, TensorView semaphores, TensorView scratch,
-                     bool enable_pdl);
+                     int64_t maxSeqLen, TensorView seqLen, int64_t batchSize, double kvCacheScale,
+                     TensorView semaphores, TensorView scratch, bool enable_pdl);
 
 TVM_FFI_DLL_EXPORT_TYPED_FUNC(xqa_wrapper_mla, xqa_wrapper_mla);
 
@@ -34,7 +33,7 @@ void xqa_wrapper(bool run_sm90_fp8_mha, int64_t multiProcessorCount, int64_t nbK
 #endif
                  TensorView q, tvm::ffi::Optional<TensorView> attentionSinks, TensorView kCacheVLLM,
                  TensorView vCacheVLLM, TensorView kvCachePageList, int64_t maxSeqLen,
-                 TensorView seqLen, int64_t batchSize, TensorView kvCacheScale,
+                 TensorView seqLen, int64_t batchSize, double kvCacheScale,
 #if SPEC_DEC
                  int64_t qSeqLen, TensorView qCuSeqLens, TensorView mask,
 #endif
 
@@ -1301,8 +1301,8 @@ CUBIN_EXPORT __global__
 #endif
 #endif
         uint32_t const batchSize,
-        float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V
-                                                 // cache. Used only for int8/fp8 KV cache.
+        float kvCacheScale,  // Scalar passed by value. Same scale for K and V
+                             // cache. Used only for int8/fp8 KV cache.
         uint32_t kv_stride_page, uint32_t kv_stride_token, uint32_t kv_stride_head,
         uint32_t* __restrict__ semaphores = nullptr, void* __restrict__ scratch = nullptr) {
   assert(allowMultiBlockMode || gridDim.x == 1);
@@ -1503,7 +1503,7 @@ CUBIN_EXPORT __global__
   };
   if (warpIdx.z == 0) {
     float const qkScale =
-        qScale * (isKVCacheQuantized ? kvCacheScale[0] : 1.f) *
+        qScale * (isKVCacheQuantized ? kvCacheScale : 1.f) *
         rsqrtf(validElemsPerHead);  // qkScale is applied onto Q*K.T before softmax.
     CircIdx<nbKBuffers> idxCurrSMemKBuf{nbKBuffers - 1};
     auto const getSMemKTile = [&](uint32_t idx) -> SharedMem::KSmemBuffer& {
@@ -2156,7 +2156,7 @@ CUBIN_EXPORT __global__
       }
     }
 
-    float voScale = (isKVCacheQuantized ? kvCacheScale[0] : 1.F);
+    float voScale = (isKVCacheQuantized ? kvCacheScale : 1.F);
     if (seqIterInit < nbSeqIters) {  // otherwise rcpRowSum will be NAN.
       // The attention sinks are moved to the multi-block reduction part if the multi-block is
       // enabled.
@@ -2410,8 +2410,8 @@ CUBIN_EXPORT __global__ __launch_bounds__(256, nbCtaPerSM) void kernel_mha(
     BeamSearchParams const beamSearchParams,
 #endif
     uint32_t const batchSize,
-    float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
-                                             // Used only for int8/fp8 KV cache.
+    float kvCacheScale,  // Scalar passed by value. Same scale for K and V cache.
+                         // Used only for int8/fp8 KV cache.
     uint32_t kv_stride_page, uint32_t kv_stride_token, uint32_t kv_stride_head,
     uint32_t* __restrict__ semaphores = nullptr, void* __restrict__ scratch = nullptr) {
 #if SPEC_DEC
@@ -2442,40 +2442,39 @@ static constexpr auto kernel_mha = kernel_mha_impl;
 #endif
 
 #ifndef GENERATE_CUBIN
-void launchMHA(
-    cudaDeviceProp const& prop, uint32_t nbKHeads,
+void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
 #if SLIDING_WINDOW
-    uint32_t slidingWinSize,
+               uint32_t slidingWinSize,
 #endif
-    float qScale, OutputHead* output,
+               float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-    float const* rcpOutScale,
+               float const* rcpOutScale,
 #endif
 #if USE_INPUT_KV
-    InputHead const* qkv,
+               InputHead const* qkv,
 #if ROPE_STYLE != 0
-    Vec<float, validElemsPerHead> const* ropeCosSin,
+               Vec<float, validElemsPerHead> const* ropeCosSin,
 #endif
 #else
-    InputHead const* q,
-#endif
-    float const* attentionSinks,  // [headGrpSize]
-    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
-    KVCachePageIndex const*
-        kvCachePageList,  // device pointer. shape:
-                          // KVCachePageIndex[batchSize][beamWidth][2][maxNbPagesPerSeq].
-    uint32_t maxSeqLen, uint32_t const* seqLen,
+               InputHead const* q,
+#endif
+               float const* attentionSinks,  // [headGrpSize]
+               GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+               KVCachePageIndex const*
+                   kvCachePageList,  // device pointer. shape:
+                                     // KVCachePageIndex[batchSize][beamWidth][2][maxNbPagesPerSeq].
+               uint32_t maxSeqLen, uint32_t const* seqLen,
 #if BEAM_WIDTH > 1
-    BeamSearchParams const& beamSearchParams,
+               BeamSearchParams const& beamSearchParams,
 #endif
-    uint32_t batchSize,
-    float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
-                                             // Used only for int8/fp8 KV cache.
+               uint32_t batchSize,
+               float kvCacheScale,  // Scalar passed by value. Same scale for K and V cache.
+                                    // Used only for int8/fp8 KV cache.
 #if SPEC_DEC
-    SpecDecParams const& specDecParams,
+               SpecDecParams const& specDecParams,
 #endif
-    uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
-    uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream) {
+               uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
+               uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream) {
 #if SPEC_DEC
   auto const qSeqLen = specDecParams.qSeqLen;
   auto const qCuSeqLens = specDecParams.qCuSeqLens;
@@ -2571,7 +2570,7 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
                          InputHead const* q, float const* attentionSinks, GMemCacheHead* kCacheVLLM,
                          GMemCacheHead* vCacheVLLM, KVCachePageIndex const* kvCachePageList,
                          uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
-                         float const* __restrict__ kvCacheScale,
+                         float kvCacheScale,
 #if SPEC_DEC
                          uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
 
@@ -88,40 +88,39 @@ struct BeamSearchParams {
                                             // but we have to match trt-llm API.
 };
 
-void launchMHA(
-    cudaDeviceProp const& prop, uint32_t const nbKHeads,
+void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads,
 #if SLIDING_WINDOW
-    uint32_t slidingWinSize,
+               uint32_t slidingWinSize,
 #endif
-    float qScale, OutputHead* output,
+               float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-    float const* rcpOutScale,
+               float const* rcpOutScale,
 #endif
 #if USE_INPUT_KV
-    InputHead const* qkv,
+               InputHead const* qkv,
 #if ROPE_STYLE != 0
-    Vec<float, validElemsPerHead> const* ropeCosSin,
+               Vec<float, validElemsPerHead> const* ropeCosSin,
 #endif
 #else
-    InputHead const* q,
-#endif
-    float const* attentionSinks,  // [headGrpSize]
-    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
-    KVCachePageIndex const*
-        kvCachePageList,  // device pointer. shape:
-                          // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
-    uint32_t maxSeqLen, uint32_t const* seqLen,
+               InputHead const* q,
+#endif
+               float const* attentionSinks,  // [headGrpSize]
+               GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+               KVCachePageIndex const*
+                   kvCachePageList,  // device pointer. shape:
+                                     // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
+               uint32_t maxSeqLen, uint32_t const* seqLen,
 #if BEAM_WIDTH > 1
-    BeamSearchParams const& beamSearchParams,
+               BeamSearchParams const& beamSearchParams,
 #endif
-    uint32_t batchSize,
-    float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
-                                             // Used only for int8/fp8 KV cache.
+               uint32_t batchSize,
+               float kvCacheScale,  // Scalar passed by value. Same scale for K and V cache.
+                                    // Used only for int8/fp8 KV cache.
 #if SPEC_DEC
-    SpecDecParams const& specDecParams,
+               SpecDecParams const& specDecParams,
 #endif
-    uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
-    uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream);
+               uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
+               uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream);
 
 void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32_t slidingWinSize,
                          float qScale, OutputHead* output,
@@ -131,7 +130,7 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
                          InputHead const* q, float const* attentionSinks, GMemCacheHead* kCacheVLLM,
                          GMemCacheHead* vCacheVLLM, KVCachePageIndex const* kvCachePageList,
                          uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
-                         float const* __restrict__ kvCacheScale,
+                         float kvCacheScale,
 #if SPEC_DEC
                          uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
@@ -166,8 +165,8 @@ void launchHopperF8MHA(
     BeamSearchParams const& beamSearchParams,
 #endif
     uint32_t batchSize,
-    float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
-                                             // Used only for int8/fp8 KV cache.
+    float kvCacheScale,  // Scalar passed by value. Same scale for K and V cache.
+                         // Used only for int8/fp8 KV cache.
 #if SPEC_DEC
     SpecDecParams const& specDecParams,
 #endif
@@ -181,28 +180,26 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
                                  InputHead const* q, float const* attentionSinks,
                                  GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
                                  KVCachePageIndex const* kvCachePageList, uint32_t maxSeqLen,
-                                 uint32_t const* seqLen, uint32_t batchSize,
-                                 float const* __restrict__ kvCacheScale,
+                                 uint32_t const* seqLen, uint32_t batchSize, float kvCacheScale,
 #if SPEC_DEC
                                  uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
                                  uint32_t* semaphores, void* scratch, bool enable_pdl,
                                  uint64_t kv_stride_page, uint64_t kv_stride_token,
                                  uint64_t kv_stride_head, cudaStream_t stream);
 
-void launchMLA(
-    cudaDeviceProp const& prop,
-    uint32_t inputSeqLen,  // uniform for all requests and causal mask is assumed
-    float qScale, OutputHead* output, InputHead const* q, GMemCacheHead* kCacheVLLM,
-    GMemCacheHead* vCacheVLLM,
-    KVCachePageIndex const*
-        kvCachePageList,  // device pointer. shape:
-                          // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] (Layout 0) or
-                          // [batchSize][maxNbPagesPerSeq] (Layout 1)
-    uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
-    float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
-                                             // Used only for int8/fp8 KV cache.
-    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);
+void launchMLA(cudaDeviceProp const& prop,
+               uint32_t inputSeqLen,  // uniform for all requests and causal mask is assumed
+               float qScale, OutputHead* output, InputHead const* q, GMemCacheHead* kCacheVLLM,
+               GMemCacheHead* vCacheVLLM,
+               KVCachePageIndex const*
+                   kvCachePageList,  // device pointer. shape:
+                                     // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
+                                     // (Layout 0) or [batchSize][maxNbPagesPerSeq] (Layout 1)
+               uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
+               float kvCacheScale,  // Scalar passed by value. Same scale for K and V cache.
+                                    // Used only for int8/fp8 KV cache.
+               uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);
 
 void launchMLAFlashInfer(
     uint32_t multiProcessorCount,
@@ -214,8 +211,8 @@ void launchMLAFlashInfer(
                           // KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] (Layout 0) or
                           // [batchSize][maxNbPagesPerSeq] (Layout 1)
     uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
-    float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
-                                             // Used only for int8/fp8 KV cache.
+    float kvCacheScale,  // Scalar passed by value. Same scale for K and V cache.
+                         // Used only for int8/fp8 KV cache.
     uint32_t* semaphores, void* scratch, bool enable_pdl, uint64_t kv_stride_page,
     uint64_t kv_stride_token, uint64_t kv_stride_head, cudaStream_t stream);
 
 
@@ -626,8 +626,8 @@ __launch_bounds__(128 * 3)
         BeamSearchParams const beamSearchParams,
 #endif
         uint32_t const batchSize,
-        float const* __restrict__ const kvCacheScale,  // Device memory scalar. Same scale for K and
-                                                       // V cache. Used only for int8/fp8 KV cache.
+        float kvCacheScale,  // Scalar passed by value. Same scale for K and
+                             // V cache. Used only for int8/fp8 KV cache.
         __grid_constant__ CUtensorMap const tensorMapVLLMK,
         __grid_constant__ CUtensorMap const tensorMapVLLMV,
 #if SPEC_DEC
@@ -773,7 +773,7 @@ __launch_bounds__(128 * 3)
     }
 
     float const qkScale =
-        qScale * (isKVCacheQuantized ? kvCacheScale[0] : 1.f) *
+        qScale * (isKVCacheQuantized ? kvCacheScale : 1.f) *
         rsqrtf(validElemsPerHead);  // qkScale is applied onto Q*K.T before softmax.
     uint32_t const warpRank = warpIdx.x;
 
@@ -962,7 +962,7 @@ __launch_bounds__(128 * 3)
 #else
     constexpr float oScale = 1.F;
 #endif
-    float const xvoScale = xScale * (isKVCacheQuantized ? kvCacheScale[0] : 1.f) * oScale;
+    float const xvoScale = xScale * (isKVCacheQuantized ? kvCacheScale : 1.f) * oScale;
 
     Gemm1Acc acc{};  // init to zeros to avoid runtime checking for first gmma instruction.
     gmma::fence();
@@ -1316,7 +1316,7 @@ __launch_bounds__(128 * 3)
               headGrpSize * nbKHeads + idxHeadGrp + (headGrpSize + 2) * nbKHeads * idxReq;
           IOHead const& inKHead = qkv[inputKHeadOffset];
           uint32_t const lane = laneId();
-          float const rcpKScale = 1.F / kvCacheScale[0];
+          float const rcpKScale = 1.F / kvCacheScale;
 #if ROPE_STYLE == 0
           constexpr bool isNeox = false;
           auto const pairs =
@@ -1375,7 +1375,7 @@ __launch_bounds__(128 * 3)
               (headGrpSize + 1) * nbKHeads + idxHeadGrp + (headGrpSize + 2) * nbKHeads * idxReq;
           IOHead const& inVHead = qkv[inputVHeadOffset];
           uint32_t const lane = laneId();
-          float const rcpVScale = 1.F / kvCacheScale[0];
+          float const rcpVScale = 1.F / kvCacheScale;
           constexpr bool isNeox = false;
           auto const pairs =
               loadHead<InputElem, isNeox, warp_size, float>(inVHead, lane) * rcpVScale;
@@ -2931,8 +2931,8 @@ void launchHopperF8MHA(
     BeamSearchParams const& beamSearchParams,
 #endif
     uint32_t batchSize,
-    float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
-                                             // Used only for int8/fp8 KV cache.
+    float kvCacheScale,  // Scalar passed by value. Same scale for K and V cache.
+                         // Used only for int8/fp8 KV cache.
 #if SPEC_DEC
     SpecDecParams const& specDecParams,
 #endif
@@ -3044,8 +3044,7 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
                                  InputHead const* q, float const* attentionSinks,
                                  GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
                                  KVCachePageIndex const* kvCachePageList, uint32_t maxSeqLen,
-                                 uint32_t const* seqLen, uint32_t batchSize,
-                                 float const* __restrict__ kvCacheScale,
+                                 uint32_t const* seqLen, uint32_t batchSize, float kvCacheScale,
 #if SPEC_DEC
                                  uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif