Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 103 additions & 33 deletions csrc/xqa/mla_sm120.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,23 @@ __constant__ constexpr XQAKernelType kernelType = XQAKernelType::kSM120_MLA;

inline constexpr bool allowMultipleInputTokens = true;

using MathElem = CacheElem;
inline constexpr uint32_t mathElemBytes = sizeof(MathElem);
// Element-type flags derived from the math element width:
// 1 byte -> FP8 (e.g. __nv_fp8_e4m3), 2 bytes -> BF16.
inline constexpr bool is_fp8 = (mathElemBytes == 1);
inline constexpr bool is_bf16 = (mathElemBytes == 2);
// Guard against an unsupported CacheElem width silently taking default paths.
static_assert(is_fp8 || is_bf16, "MathElem must be 1 byte (FP8) or 2 bytes (BF16)");
// Per-part K extent. Currently 64 for both FP8 and BF16 (the previous
// three-way conditional had identical branches).
// BF16 note: partElemsK=64 with nbKBufs=2 keeps shared memory (~100,096 B)
// under the 99 KiB (101,376 B) opt-in limit — see kSmemLimitBytes below.
// @fixme: consider 128 to save L2 traffic.
inline constexpr uint32_t partElemsK = 64;
Comment on lines +44 to +52
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The conditional assignment for partElemsK based on is_fp8 and is_bf16 is a good approach to optimize shared memory usage based on the data type. However, consider adding a static assertion to ensure that the chosen values of partElemsK and nbKBufs (defined later) result in a shared memory footprint within the 99KB limit. This will provide a compile-time check against exceeding the limit.

inline constexpr uint32_t nbKParts = exactDiv(validElemsPerKHead, partElemsK);
inline constexpr uint32_t nbQParts = nbKParts;

// Tokens processed per tile: BF16 halves the tile depth (its 2-byte elements
// double the buffer footprint); FP8 and any other width keep 64.
inline constexpr uint32_t tokensPerTile = is_bf16 ? 32 : 64;

inline constexpr uint32_t partElemsV = 128;
inline constexpr uint32_t nbVSplit = 2;
inline constexpr uint32_t gemm1V = exactDiv(validElemsPerVHead, nbVSplit);
Expand All @@ -54,12 +66,12 @@ inline constexpr uint32_t nbProducerCtasPerCga = nbVSplit;
inline constexpr uint32_t multiBlockMinNbTilesPerCta = 2;
inline constexpr uint32_t multiBlockMinNbTiles = multiBlockMinNbTilesPerCta * 2;

using MathElem = CacheElem;
inline constexpr uint32_t mathElemBytes = sizeof(MathElem);
inline constexpr uint32_t grainsPerPartK = exactDiv(partElemsK * mathElemBytes, grainBytes);

inline constexpr uint32_t grainElems = exactDiv(grainBytes, mathElemBytes);

inline constexpr mmaShape kernelQmmaShape = is_fp8 ? mmaShape{16, 8, 32} : mmaShape{16, 8, 16};
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The kernelQmmaShape definition is dependent on is_fp8. It would be beneficial to add a comment explaining why these specific shapes are chosen for FP8 and BF16, referencing any relevant documentation or performance considerations.


inline constexpr float xScale = 1.f / kE4M3_MAX;
__constant__ constexpr float rcpXScale = kE4M3_MAX;

Expand Down Expand Up @@ -162,16 +174,16 @@ class Mat16x32Loader {
__device__ inline Mat16x32Loader(Src const& src, uint32_t baseRow, uint32_t idxInstK,
uint32_t r = laneId() % 16, uint32_t c = laneId() / 16)
: src{src}, baseRow{baseRow}, idxInstK{idxInstK}, r{r}, c{c}, basePtr{getPtrRef(0)} {
static_assert((grainBytes * srcCols * qmmaShape.m) % 1024 == 0);
static_assert((grainBytes * srcCols * kernelQmmaShape.m) % 1024 == 0);
}

__device__ inline Mat16x32 load(uint32_t idxInstM) const {
return ldmatrix<false, 4>(getPtr(idxInstM));
}

template <uint32_t tileM>
__device__ inline Vec<Mat16x32, exactDiv(tileM, qmmaShape.m)> loadWholeCol() const {
uint32_t const nbInstM = exactDiv(tileM, qmmaShape.m);
__device__ inline Vec<Mat16x32, exactDiv(tileM, kernelQmmaShape.m)> loadWholeCol() const {
uint32_t const nbInstM = exactDiv(tileM, kernelQmmaShape.m);
Vec<Mat16x32, nbInstM> ret;
#pragma unroll
for (uint32_t i = 0; i < nbInstM; i++) {
Expand All @@ -181,13 +193,13 @@ class Mat16x32Loader {
}

__device__ inline LdGrain const* getPtr(uint32_t idxInstM) const {
return checkedVal(basePtr + idxInstM * qmmaShape.m * srcCols, getPtrRef(idxInstM));
return checkedVal(basePtr + idxInstM * kernelQmmaShape.m * srcCols, getPtrRef(idxInstM));
}

private:
__device__ inline LdGrain const* getPtrRef(uint32_t idxInstM) const {
return &src.template at<true>(baseRow + idxInstM * qmmaShape.m + r,
idxInstK * exactDiv(qmmaShape.k, grainElems) + c);
return &src.template at<true>(baseRow + idxInstM * kernelQmmaShape.m + r,
idxInstK * exactDiv(kernelQmmaShape.k, grainElems) + c);
}

Src const& src;
Expand Down Expand Up @@ -263,7 +275,9 @@ constexpr uint32_t multiBlockMathWarps = 8;
constexpr bool useRegQ = USE_REG_Q;

struct SharedMemA {
static inline constexpr uint32_t nbKBufs = 12;
// BF16: 2 K-buffers to fit ≀99KB opt-in (~100096 bytes); 3 buffers would need ~104KB (128KB arch).
static inline constexpr uint32_t nbKBufs =
is_fp8 ? 12 : (is_bf16 ? 2 : 12);

static inline constexpr uint32_t regQParts = (useRegQ ? 4 : 0);
static inline constexpr uint32_t shmQParts = nbQParts - regQParts;
Expand Down Expand Up @@ -587,12 +601,12 @@ struct Producer {
uint32_t const tileBaseRow = warpTile.y * warpIdx.x;
PingPongMutex tensorCoreMutex{smem.tensorCoreMutex, grpIdx};

constexpr uint32_t partNbInstK = exactDiv(partElemsK, qmmaShape.k);
constexpr uint32_t partNbInstK = exactDiv(partElemsK, kernelQmmaShape.k);
using AtomA = Vec<uint32_t, 4>; // for 16x32 data, working as mat A of QMMA.16832
using RegQPartCol = Vec<AtomA, exactDiv(warpTile.y, qmmaShape.m)>;
using RegQPartCol = Vec<AtomA, exactDiv(warpTile.y, kernelQmmaShape.m)>;
using RegQPart = Vec<RegQPartCol, partNbInstK>;
using RegQ = Vec<RegQPart, SharedMemA::regQParts>;
constexpr uint32_t tileNbAtomBx2 = exactDiv(tokensPerTile, qmmaShape.n * 2);
constexpr uint32_t tileNbAtomBx2 = exactDiv(tokensPerTile, kernelQmmaShape.n * 2);
using AtomBx2 = Vec<uint32_t, 4>; // one AtomB is 8x32 and AtomBx2 is 16x32
using RegKPartCol = Vec<AtomBx2, tileNbAtomBx2>;
using RegKPart = Vec<RegKPartCol, partNbInstK>;
Expand Down Expand Up @@ -656,7 +670,8 @@ struct Producer {
RegKPart regKBuf;
regKBuf[0] = loadRegKCol(smem.k[kBarWaiter.idxBuf], 0);

auto shouldTestWait = [](uint32_t idxInstK, uint32_t idxAtomBx2) {
auto shouldTestWait = [partNbInstK, tileNbAtomBx2](uint32_t idxInstK,
uint32_t idxAtomBx2) {
return idxInstK == partNbInstK - 1 && idxAtomBx2 == tileNbAtomBx2 - 2;
};
BarWaiter kBarWaiterNext = kBarWaiter.next();
Expand Down Expand Up @@ -698,7 +713,7 @@ struct Producer {
for (uint32_t i = 0; i < WarpAcc::rows; i++) {
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
mma<__nv_fp8_e4m3>(reinterpret_cast<float(&)[2][2]>(acc(i, 2 * idxAtomBx2 + j)),
mma<MathElem>(reinterpret_cast<float(&)[2][2]>(acc(i, 2 * idxAtomBx2 + j)),
reinterpret_cast<uint32_t const(&)[2][2]>(regQBuf[idxInstK][i]),
reinterpret_cast<uint32_t const(&)[2][1]>(atomBx2[2 * j]));
Comment on lines +716 to 718
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The mma template is being called with MathElem which is a good abstraction. However, it's important to ensure that MathElem is correctly deduced or explicitly specified to match the expected type by the mma instruction. If MathElem is not correctly deduced, it could lead to unexpected behavior or performance degradation.

}
Expand Down Expand Up @@ -749,7 +764,7 @@ struct Producer {
for (uint32_t i = 0; i < WarpAcc::rows; i++) {
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
mma<__nv_fp8_e4m3>(reinterpret_cast<float(&)[2][2]>(acc(i, 2 * idxAtomBx2 + j)),
mma<MathElem>(reinterpret_cast<float(&)[2][2]>(acc(i, 2 * idxAtomBx2 + j)),
reinterpret_cast<uint32_t const(&)[2][2]>(regQBuf[idxInstK][i]),
reinterpret_cast<uint32_t const(&)[2][1]>(atomBx2[2 * j]));
}
Expand All @@ -776,14 +791,14 @@ struct Producer {

auto& xBar = smem.xBars[grpIdx];
bool const skipXBarWait = xBar.consumed.test_wait_parity(toParity<1>(grpIter));
// convert to fp8
ThrdRegRowMax rowSum;
if constexpr (is_fp8) {
WarpAcc const xF32Quant = xF32 * rcpXScale;
// 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
Array2D<Array2D<uint32_t, 2, 1>, WarpAcc::rows, exactDiv(WarpAcc::cols, 2)> xF8;
#pragma unroll
for (uint32_t i = 0; i < WarpAcc::rows; i++) {
#pragma unroll
for (uint32_t m = 0; m < exactDiv(qmmaShape.m, 8); m++) {
for (uint32_t m = 0; m < exactDiv(kernelQmmaShape.m, 8); m++) {
#pragma unroll
for (uint32_t j = 0; j < WarpAcc::cols; j += 2) {
auto& dst = reinterpret_cast<__nv_fp8x2_e4m3(&)[2]>(xF8(i, j / 2)(m, 0));
Expand All @@ -792,18 +807,24 @@ struct Producer {
}
}
}
// use tensor core to compute rowSum
ThrdRegRowMax const rowSum =
rowSum =
computeRowSumFromF8 ? computeRowSumF8<warpTile.y, warpTile.x>(this_warp(), xF8)
: computeRowSumF32<warpTile.y, warpTile.x>(this_warp(), xF32);

// store xF8 and rowSum into L2 scratch buffer
if (!skipXBarWait) {
xBar.consumed.wait_parity(toParity<1>(grpIter));
}
storeRowMax<warpTile.y>(smem.x.rowMaxLog2e, rowMaxLog2e, tileBaseRow, lane);
storeRowMax<warpTile.y>(smem.x.rowSum, rowSum, tileBaseRow, lane);
storeOrderedXToShm(smem.x.x, xF8, tileBaseRow, lane);
} else {
rowSum = computeRowSumF32<warpTile.y, warpTile.x>(this_warp(), xF32);
if (!skipXBarWait) {
xBar.consumed.wait_parity(toParity<1>(grpIter));
}
storeRowMax<warpTile.y>(smem.x.rowMaxLog2e, rowMaxLog2e, tileBaseRow, lane);
storeRowMax<warpTile.y>(smem.x.rowSum, rowSum, tileBaseRow, lane);
storeOrderedXToShmBf16(smem.x.x, xF32, tileBaseRow, lane);
}
xBar.produced.arrive();
}
}
Expand All @@ -816,6 +837,9 @@ struct Producer {
XBuffer& dst,
Array2D<Array2D<uint32_t, 2, 1>, WarpAcc::rows, exactDiv(WarpAcc::cols, 2)> const& src,
uint32_t const tileBaseRow, uint32_t const lane = laneId());
__device__ inline void storeOrderedXToShmBf16(XBuffer& dst, WarpAcc const& src,
uint32_t const tileBaseRow,
uint32_t const lane = laneId());
};

__device__ inline void Producer::loadK() {
Expand Down Expand Up @@ -966,6 +990,29 @@ __device__ inline void Producer::storeOrderedXToShm(
}
}

// Converts this warp's FP32 accumulator tile to BF16 and stores it into the
// shared-memory X buffer, one 16-byte grain per (lane, iteration) pair.
// dst:         destination X buffer in shared memory.
// src:         per-thread fragment of the warp-level accumulator (WarpAcc).
// tileBaseRow: row offset of this warp's tile within the buffer.
// lane:        lane id within the warp (defaults to laneId()).
__device__ inline void Producer::storeOrderedXToShmBf16(XBuffer& dst, WarpAcc const& src,
                                                        uint32_t const tileBaseRow,
                                                        uint32_t const lane) {
  // Grains per output row after converting warpTile.x floats to bf16.
  constexpr uint32_t grainsPerRow = exactDiv(warpTile.x * sizeof(__nv_bfloat16), grainBytes);
  constexpr uint32_t totalGrains = warpTile.y * grainsPerRow;
  // Grains handled by each of the 32 lanes; exactDiv asserts even division.
  constexpr uint32_t grainsPerThread = exactDiv(totalGrains, 32);
#pragma unroll
  for (uint32_t i = 0; i < grainsPerThread; i++) {
    // Linear grain index striped across the warp: lane, lane+32, lane+64, ...
    uint32_t const idx = lane + i * 32;
    uint32_t const row = idx / grainsPerRow;
    uint32_t const g = idx % grainsPerRow;
    // Defensive bound; with grainsPerThread * 32 == totalGrains this is
    // always true — presumably kept as a safety net. TODO confirm.
    if (row < warpTile.y) {
      __nv_bfloat16* p =
          reinterpret_cast<__nv_bfloat16*>(&dst.template at<true>(tileBaseRow + row, g));
#pragma unroll
      // 8 bf16 elements per grain — assumes grainBytes == 16; TODO confirm.
      for (uint32_t j = 0; j < 8; j++) {
        uint32_t const col = g * 8 + j;
        // NOTE(review): this reads the calling lane's own register fragment at
        // (row/2, col/2)(row%2, col%2). Verify that this per-lane mapping
        // matches the MMA accumulator fragment layout for arbitrary (row, col)
        // — each lane only holds specific accumulator elements.
        p[j] = __float2bfloat16(src(row / 2, col / 2)(row % 2, col % 2));
      }
    }
  }
}

struct Consumer {
static inline constexpr uint32_t nbMathWarps = nbMathWarpsB;
static inline constexpr uint32_t nbMathThrds = warp_size * nbMathWarps;
Expand Down Expand Up @@ -1115,8 +1162,8 @@ __device__ inline void Consumer::compute() {
uint2 const tileIdx = {warpIdx.y, warpIdx.x};
uint2 const tileBase = {tileIdx.x * warpTile.x, tileIdx.y * warpTile.y};

constexpr uint32_t tileNbInstK = exactDiv(tokensPerTile, qmmaShape.k);
constexpr uint32_t warpTileNbAtomBx2 = exactDiv(warpTile.x, qmmaShape.n * 2);
constexpr uint32_t tileNbInstK = exactDiv(tokensPerTile, kernelQmmaShape.k);
constexpr uint32_t warpTileNbAtomBx2 = exactDiv(warpTile.x, kernelQmmaShape.n * 2);

uint32_t const lane = laneId();
uint32_t const idxHalf = lane / 16;
Expand Down Expand Up @@ -1195,19 +1242,19 @@ __device__ inline void Consumer::compute() {
#pragma unroll
for (uint32_t idxInstK = 0; idxInstK < tileNbInstK; idxInstK++) {
Mat16x32Loader const loaderX(xBuf, tileBase.y, idxInstK, rA, cA);
Vec<Mat16x32, exactDiv(warpTile.y, qmmaShape.m)> const x = loaderX.loadWholeCol<warpTile.y>();
Vec<Mat16x32, exactDiv(warpTile.y, kernelQmmaShape.m)> const x = loaderX.loadWholeCol<warpTile.y>();
using AtomB = Vec<uint32_t, 2>;
#pragma unroll
for (uint32_t idxAtomBx2 = 0; idxAtomBx2 < warpTileNbAtomBx2; idxAtomBx2++) {
auto const data = ldmatrix_16x16_trans<2>(
&vBuf.template at<true>(qmmaShape.k * idxInstK + rB, idxAtomBx2 + cB));
&vBuf.template at<true>(kernelQmmaShape.k * idxInstK + rB, idxAtomBx2 + cB));
AtomB const v[2] = {data[0], data[2], data[1], data[3]};
#pragma unroll
for (uint32_t i = 0; i < WarpAcc::rows; i++) {
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
#if 1
mma<__nv_fp8_e4m3>(
mma<MathElem>(
#else
mmaF8_k32_2inst(
#endif
Expand Down Expand Up @@ -1630,7 +1677,9 @@ __launch_bounds__(32 * 4 * 3, 1) __cluster_dims__(cgaSize, 1, 1) void kernel_mha
}

__constant__ constexpr uint32_t smemSize = mha::max(sizeof(SharedMemA), sizeof(SharedMemB));
// Per-block opt-in shared-memory budget. Note: 99 * 1024 == 101376, so the
// two branches below are currently the same value; the branch is kept only so
// the BF16 budget can diverge from FP8 later without touching the assert.
// BF16 with nbKBufs=2 uses ~100,096 bytes, which fits this 99 KiB limit.
static constexpr uint32_t kSmemLimitBytes = is_bf16 ? 101376u : 99u * 1024u;
static_assert(smemSize <= kSmemLimitBytes, "Shared memory size exceeded");
Comment on lines +1680 to +1682
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The comment indicates that BF16 uses ~100KB, but the limit is set to 99KB. This discrepancy should be resolved. Either the comment should be updated to reflect the actual usage, or the kSmemLimitBytes value should be adjusted if the hardware allows for the slightly larger size without performance degradation. If the 100KB usage is an estimate, it should be clarified as such.

#endif // is_MLA

#ifndef GENERATE_CUBIN
Expand Down Expand Up @@ -1674,9 +1723,19 @@ void launchMLA(
throw std::runtime_error("not implemented");
}
static uint32_t const hostSmemSize = [&]() {
// printf("smemSize = %u\n", smemSize);
uint32_t size;
checkCuda(cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize)));
int devMaxShmem = 0;
checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, 0));
Comment on lines +1728 to +1730
Copy link
Copy Markdown
Contributor

@coderabbitai coderabbitai bot Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, let's check the file size and then examine the specific lines
wc -l csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 94


🏁 Script executed:

# Examine the code around lines 1728-1730
sed -n '1720,1740p' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 1167


🏁 Script executed:

# Examine the code around lines 1831-1833
sed -n '1823,1843p' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 962


🏁 Script executed:

# Run the verification command from the review comment
rg -n -C2 'cudaDeviceGetAttribute\(&devMaxShmem,\s*cudaDevAttrMaxSharedMemoryPerBlockOptin,\s*0\)' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 50


🏁 Script executed:

# Also check for the pattern more broadly to understand context
rg -n 'devMaxShmem' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 547


🏁 Script executed:

# Get more context around the first occurrence (lambda function)
sed -n '1710,1750p' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 2243


🏁 Script executed:

# Get function signature and more context for configureKernel
sed -n '1825,1855p' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 1539


🏁 Script executed:

# Search for where configureKernel and the lambda are called
rg -n 'configureKernel\|hostSmemSize' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 50


🏁 Script executed:

# Check if cudaGetDevice is already used elsewhere in the file
rg -n 'cudaGetDevice' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 50


🏁 Script executed:

# Look for cudaSetDevice to understand multi-GPU context
rg -n 'cudaSetDevice\|device.*=' csrc/xqa/mla_sm120.cu | head -20

Repository: flashinfer-ai/flashinfer

Length of output: 50


🏁 Script executed:

# Search for the kernel launch function that contains the first occurrence
rg -n 'void.*launchMLAFlashInfer\|void.*launchXQA' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 50


🏁 Script executed:

# Check what function contains the lambda at lines 1728-1730
sed -n '1700,1715p' csrc/xqa/mla_sm120.cu

Repository: flashinfer-ai/flashinfer

Length of output: 967


🏁 Script executed:

# Look for how this file is used in other places
fd -e cu -e h | head -20 | xargs rg -l 'launchMLAFlashInfer\|configureKernel' 2>/dev/null

Repository: flashinfer-ai/flashinfer

Length of output: 52


🏁 Script executed:

# Check if there's a pattern for device management in the codebase
rg -n 'cudaSetDevice|getActiveDevice' csrc/

Repository: flashinfer-ai/flashinfer

Length of output: 675


🏁 Script executed:

# Look at the actual function signature containing the lambda
sed -n '1655,1730p' csrc/xqa/mla_sm120.cu | tail -30

Repository: flashinfer-ai/flashinfer

Length of output: 1578


Use the currently active CUDA device instead of hardcoded device 0 for SMEM capability checks.

Lines 1728-1730 and 1831-1833 query cudaDevAttrMaxSharedMemoryPerBlockOptin on device 0. On multi-GPU runs, these static initializers may execute with a different active device than intended, causing validation against the wrong GPU's capabilities and leading to incorrect pass/fail behavior.

The first check runs in a static lambda inside launchMLA() on first invocation, and the second runs at module initialization in configureKernel(), both potentially with undefined device context.

πŸ”§ Proposed fix
-    int devMaxShmem = 0;
-    checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
-                                      cudaDevAttrMaxSharedMemoryPerBlockOptin, 0));
+    int devMaxShmem = 0;
+    int device = -1;
+    checkCuda(cudaGetDevice(&device));
+    checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
+                                     cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                                     device));
-  int devMaxShmem = 0;
-  checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
-                                    cudaDevAttrMaxSharedMemoryPerBlockOptin, 0));
+  int devMaxShmem = 0;
+  int device = -1;
+  checkCuda(cudaGetDevice(&device));
+  checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
+                                   cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                                   device));
πŸ“ Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
int devMaxShmem = 0;
checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, 0));
int devMaxShmem = 0;
int device = -1;
checkCuda(cudaGetDevice(&device));
checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
cudaDevAttrMaxSharedMemoryPerBlockOptin,
device));
πŸ€– Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@csrc/xqa/mla_sm120.cu` around lines 1728 - 1730, The checks for
cudaDevAttrMaxSharedMemoryPerBlockOptin use a hardcoded device 0; change them to
query the currently active device first and use that device id for the attribute
call. Concretely, before calling cudaDeviceGetAttribute in launchMLA() and
configureKernel(), call cudaGetDevice(&dev) (or an equivalent helper), then pass
dev into cudaDeviceGetAttribute(&devMaxShmem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); preserve existing error handling
around checkCuda and ensure the variable names (devMaxShmem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, launchMLA, configureKernel) remain
consistent.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@saltyminty, thanks for confirming! Here's the fix applied to both locations:

In launchMLA() (static lambda, ~line 1728):

int devMaxShmem = 0;
int device = -1;
checkCuda(cudaGetDevice(&device));
checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
                                 cudaDevAttrMaxSharedMemoryPerBlockOptin,
                                 device));

In configureKernel() (~line 1831):

int devMaxShmem = 0;
int device = -1;
checkCuda(cudaGetDevice(&device));
checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
                                 cudaDevAttrMaxSharedMemoryPerBlockOptin,
                                 device));

Note that configureKernel() runs at static initialization time (static uint32_t const hostSmemSize = configureKernel();), so the active device context there may be particularly unpredictable. You may also want to ensure that hostSmemSize is not cached as a file-scope static if multi-GPU support with different GPU models is needed β€” or document the assumption that all devices in a multi-GPU setup are homogeneous.


🧠 Learnings used
Learnt from: CR
Repo: flashinfer-ai/flashinfer PR: 0
File: CLAUDE.md:0-0
Timestamp: 2026-03-04T05:20:26.963Z
Learning: Applies to csrc/*.cu : Implement kernel launchers in `csrc/` that handle framework-specific tensor operations

Learnt from: depaulmillz
Repo: flashinfer-ai/flashinfer PR: 2738
File: csrc/group_gemm_nvfp4_groupwise_sm120.cu:101-102
Timestamp: 2026-03-11T16:24:12.197Z
Learning: In CUDA source files under csrc (e.g., csrc/group_gemm_nvfp4_groupwise_sm120.cu and similar), it is valid and intentional that int_workspace_buffer and float_workspace_buffer are allocated on the same device as input tensor a via _get_cache_buf(..., a.device), and that CUDADeviceGuard is sourced from float_workspace_buffer.device() with the stream from A.device(). Do not flag these as device inconsistencies; instead, verify actual inconsistencies elsewhere and rely on this established pattern.

Learnt from: ishovkun
Repo: flashinfer-ai/flashinfer PR: 2709
File: include/flashinfer/mamba/seq_chunk_cumsum.cuh:0-0
Timestamp: 2026-03-06T20:52:57.849Z
Learning: In `include/flashinfer/mamba/seq_chunk_cumsum.cuh` and `csrc/seq_chunk_cumsum.cu`, the maintainer explicitly does not want runtime validation of metadata (chunk_indices, chunk_offsets, seq_idx bounds, monotonicity) in the kernel launcher or device code because this is a high-throughput kernel. Do not suggest adding such checks. Debug-mode assertions may be acceptable but should not be pushed.

Learnt from: raayandhar
Repo: flashinfer-ai/flashinfer PR: 2070
File: include/flashinfer/gemm/bf16_gemm_cutlass_template.h:145-160
Timestamp: 2025-11-12T03:35:17.583Z
Learning: In flashinfer GEMM implementations (e.g., include/flashinfer/gemm/bf16_gemm_cutlass_template.h, fp8_gemm_cutlass_template.h), it is acceptable to catch and silently ignore std::runtime_error exceptions in getWorkspaceSizeImpl when probing multiple GEMM configurations, as some configurations may legitimately fail due to SMEM constraints. This pattern should include a comment like "// Swallow errors when SMEM exceeds maximum allowed" to document the rationale.

Learnt from: xrq-phys
Repo: flashinfer-ai/flashinfer PR: 2711
File: csrc/trtllm_fmha_kernel_launcher.cu:552-563
Timestamp: 2026-03-07T06:34:53.719Z
Learning: In `csrc/trtllm_fmha_kernel_launcher.cu` (flashinfer-ai/flashinfer), dtype validation for SageAttention scaling-factor tensors (`sage_attn_sfs_q/k/p/v`) is intentionally absent. This file is a TVM FFI path (not a PyTorch extension path), and dtype validation is expected to be handled at a different layer/entry point. Do not flag missing `TVM_FFI_ICHECK_EQ(...dtype(), dl_float32)` checks for these tensors in this file.

Learnt from: ishovkun
Repo: flashinfer-ai/flashinfer PR: 2591
File: flashinfer/aot.py:588-599
Timestamp: 2026-02-19T21:59:36.542Z
Learning: When reviewing changes to conditional blocks (e.g., `if has_sm90:` β†’ `if has_sm90 or has_sm100:`), distinguish between code the PR author wrote versus pre-existing code that happens to be in the modified block. Do not ask the PR author to fix potential issues in pre-existing code unless it's directly related to their changes.

Learnt from: danisereb
Repo: flashinfer-ai/flashinfer PR: 2464
File: include/flashinfer/gemm/mxfp8_gemm_template_sm100.h:148-163
Timestamp: 2026-02-04T10:08:47.455Z
Learning: In flashinfer GEMM template implementations (e.g., include/flashinfer/gemm/fp4_gemm_template_sm100.h, mxfp8_gemm_template_sm100.h), the Sm10x11xOnly architecture check wrapper uses a pattern where only thread0() prints an error message and calls __trap() when running on unsupported architectures. This pattern is intentional and working in production code, so consistency should be maintained across similar implementations.

if (size > (uint32_t)devMaxShmem) {
throw std::runtime_error(
"XQA MLA kernel requires " + std::to_string(size) + " bytes shared memory per block, but "
"device opt-in max is " + std::to_string(devMaxShmem) + ". BF16 MLA needs 128 KB (e.g. SM12x).");
Comment on lines +1731 to +1734
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The error message is very helpful for debugging. However, consider adding the required shared memory size for FP8 as well, to provide a complete picture to the user.

throw std::runtime_error(
          "XQA MLA kernel requires " + std::to_string(size) + " bytes shared memory per block, but "
          "device opt-in max is " + std::to_string(devMaxShmem) + ". BF16 MLA needs 128 KB (e.g. SM12x), FP8 needs [size] KB.");

}
checkCuda(cudaFuncSetAttribute(kernel_mha,
cudaFuncAttributePreferredSharedMemoryCarveout,
cudaSharedmemCarveoutMaxShared));
checkCuda(cudaFuncSetAttribute(kernel_mha, cudaFuncAttributeMaxDynamicSharedMemorySize, size));
return size;
}();
Expand Down Expand Up @@ -1768,8 +1827,19 @@ void launchMLA(

// Reads the kernel's shared-memory requirement from device-constant smemSize,
// validates it against the *active* device's opt-in per-block limit, and
// configures kernel_mha (carveout + max dynamic shared memory) accordingly.
// Returns the dynamic shared-memory size in bytes.
// Throws std::runtime_error if the device cannot provide enough shared memory.
static uint32_t configureKernel() {
  uint32_t size;
  checkCuda(cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize)));
  // Query the currently active device, not hardcoded device 0: in multi-GPU
  // processes this may run with any device current, and validating against
  // the wrong GPU's capability gives incorrect pass/fail behavior.
  int device = -1;
  checkCuda(cudaGetDevice(&device));
  int devMaxShmem = 0;
  checkCuda(cudaDeviceGetAttribute(&devMaxShmem,
                                   cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
  if (size > (uint32_t)devMaxShmem) {
    throw std::runtime_error(
        "XQA MLA kernel requires " + std::to_string(size) + " bytes shared memory per block, but "
        "device opt-in max is " + std::to_string(devMaxShmem) + ". BF16 MLA needs 128 KB (e.g. SM12x).");
  }
  checkCuda(cudaFuncSetAttribute(kernel_mha,
                                 cudaFuncAttributePreferredSharedMemoryCarveout,
                                 cudaSharedmemCarveoutMaxShared));
  checkCuda(cudaFuncSetAttribute(kernel_mha, cudaFuncAttributeMaxDynamicSharedMemorySize, size));
  return size;
}

Expand Down
24 changes: 16 additions & 8 deletions flashinfer/mla.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,13 +599,17 @@ def trtllm_batch_decode_with_kv_cache_mla(
if isinstance(bmm2_scale, torch.Tensor):
assert bmm2_scale.dtype == torch.float32
if backend == "xqa":
if (
get_compute_capability(query.device)[0] != 12
or query.dtype != torch.float8_e4m3fn
or kv_cache.dtype != torch.float8_e4m3fn
):
if get_compute_capability(query.device)[0] != 12:
raise ValueError("XQA MLA is only supported on SM120/SM121 GPUs")
fp8_ok = (
query.dtype == torch.float8_e4m3fn and kv_cache.dtype == torch.float8_e4m3fn
)
bf16_ok = (
query.dtype == torch.bfloat16 and kv_cache.dtype == torch.bfloat16
)
if not (fp8_ok or bf16_ok):
raise ValueError(
f"XQA MLA only supports fp8 operation on SM120/SM121 GPUs, got {query.dtype} and {kv_cache.dtype}"
f"XQA MLA on SM120/SM121 supports (fp8, fp8) or (bfloat16, bfloat16) only, got {query.dtype} and {kv_cache.dtype}"
)
Comment on lines 611 to 613
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The error message could be improved by explicitly stating the expected data types for query and kv_cache when BF16 is enabled. This will help users quickly identify the correct data type configuration.

raise ValueError(
                f

if sinks is not None:
raise ValueError("XQA MLA does not support sinks")
Expand Down Expand Up @@ -767,9 +771,13 @@ def xqa_batch_decode_with_kv_cache_mla(
raise ValueError(
f"XQA MLA only supports q_len_per_request == 1, got {q_len_per_request}"
)
if query.dtype != torch.float8_e4m3fn or kv_cache.dtype != torch.float8_e4m3fn:
fp8_ok = (
query.dtype == torch.float8_e4m3fn and kv_cache.dtype == torch.float8_e4m3fn
)
bf16_ok = query.dtype == torch.bfloat16 and kv_cache.dtype == torch.bfloat16
if not (fp8_ok or bf16_ok):
raise ValueError(
f"XQA MLA only supports fp8 tensor core operation, got {query.dtype} and {kv_cache.dtype}"
f"XQA MLA supports (fp8, fp8) or (bfloat16, bfloat16) only, got {query.dtype} and {kv_cache.dtype}"
)
if sinks is not None:
raise ValueError("XQA MLA does not support sinks")
Expand Down