"""
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import argparse

import numpy as np
import torch

from flashinfer.gdn_prefill import chunk_gated_delta_rule
from flashinfer.testing.utils import bench_gpu_time


def gdn_flops(
    total_seq_len: int,
    num_q_heads: int,
    num_k_heads: int,
    num_v_heads: int,
    head_size: int,
    num_seqs: int,
) -> int:
    """Calculate FLOPs for Gated Delta Rule (GDN) attention.

    Delta Rule formula:
        state_t  = alpha_t * state_{t-1} + beta_t * (k_t @ v_t^T)
        output_t = q_t @ state_t

    Matrix multiplications per token per head:
        1. k @ v^T (outer product): 2 * d^2 FLOPs
        2. q @ state:               2 * d^2 FLOPs

    Note: alpha/beta gating are element-wise scalar multiplications and are
    not counted in TFLOPS.  ``num_k_heads`` and ``num_seqs`` are accepted for
    signature symmetry with :func:`gdn_bytes` but do not affect the count.
    """
    num_o_heads = max(num_q_heads, num_v_heads)

    # k @ v^T (outer product): 2 * d^2 per token per head.
    outer_product_flops = 2 * total_seq_len * num_o_heads * head_size * head_size

    # q @ state: 2 * d^2 per token per head.
    output_flops = 2 * total_seq_len * num_o_heads * head_size * head_size

    return outer_product_flops + output_flops


def gdn_bytes(
    total_seq_len: int,
    num_q_heads: int,
    num_k_heads: int,
    num_v_heads: int,
    head_size: int,
    num_seqs: int,
    dtype: torch.dtype,
) -> int:
    """Calculate memory bytes moved by GDN attention.

    Includes:
    - Q, K, V tensors (input, ``dtype``-sized elements)
    - Output tensor
    - State tensor (float32, one d x d matrix per sequence per head)
    - Alpha, Beta gate tensors (float32)
    """
    num_o_heads = max(num_q_heads, num_v_heads)
    num_sab_heads = num_o_heads
    elem_size = dtype.itemsize

    # Input tensors.
    q_bytes = total_seq_len * num_q_heads * head_size * elem_size
    k_bytes = total_seq_len * num_k_heads * head_size * elem_size
    v_bytes = total_seq_len * num_v_heads * head_size * elem_size

    # Output tensor.
    o_bytes = total_seq_len * num_o_heads * head_size * elem_size

    # State tensor (float32, 4 bytes per element).
    state_bytes = num_seqs * num_sab_heads * head_size * head_size * 4

    # Alpha and Beta gates (float32).
    alpha_bytes = total_seq_len * num_sab_heads * 4
    beta_bytes = total_seq_len * num_sab_heads * 4

    return (
        q_bytes + k_bytes + v_bytes + o_bytes + state_bytes + alpha_bytes + beta_bytes
    )


def bench_gdn_prefill(
    batch_size: int,
    seq_len: int,
    num_q_heads: int,
    num_k_heads: int,
    num_v_heads: int,
    head_size: int,
    dtype: torch.dtype,
    use_alpha: bool = True,
    use_beta: bool = True,
) -> dict:
    """Benchmark the GDN prefill kernel for one (batch_size, seq_len) config.

    Returns a dict with the configuration plus ``median_ms``, ``tflops`` and
    ``tb_per_sec``.
    """
    total_seq_len = batch_size * seq_len
    num_o_heads = max(num_q_heads, num_v_heads)
    num_sab_heads = num_o_heads

    # Create inputs.
    q = torch.randn(total_seq_len, num_q_heads, head_size, dtype=dtype, device="cuda")
    k = torch.randn(total_seq_len, num_k_heads, head_size, dtype=dtype, device="cuda")
    # L2 normalize k for numerical stability.
    k = torch.nn.functional.normalize(k, p=2.0, dim=-1)
    v = torch.randn(total_seq_len, num_v_heads, head_size, dtype=dtype, device="cuda")

    # Equal-length sequences: cu_seqlens = [0, seq_len, 2*seq_len, ...].
    cu_seqlens = torch.arange(
        0, batch_size * seq_len + 1, seq_len, dtype=torch.int64, device="cuda"
    )

    alpha = (
        torch.rand(total_seq_len, num_sab_heads, dtype=torch.float32, device="cuda")
        if use_alpha
        else None
    )
    beta = (
        torch.rand(total_seq_len, num_sab_heads, dtype=torch.float32, device="cuda")
        if use_beta
        else None
    )

    # Pre-allocate outputs so allocation cost is not benchmarked.
    output = torch.empty(
        total_seq_len, num_o_heads, head_size, dtype=dtype, device="cuda"
    )
    output_state = torch.empty(
        batch_size,
        num_sab_heads,
        head_size,
        head_size,
        dtype=torch.float32,
        device="cuda",
    )

    # Warmup (JIT/compile paths) before timing.
    chunk_gated_delta_rule(
        q, k, v, alpha, beta, None, None, True, cu_seqlens, False, output, output_state
    )
    torch.cuda.synchronize()

    # Benchmark.
    times = bench_gpu_time(
        lambda: chunk_gated_delta_rule(
            q,
            k,
            v,
            alpha,
            beta,
            None,
            None,
            True,
            cu_seqlens,
            False,
            output,
            output_state,
        ),
        dry_run_time_ms=100,
        repeat_time_ms=1000,
        enable_cupti=True,
    )

    median_ms = np.median(times)

    # Derived metrics.
    flops = gdn_flops(
        total_seq_len, num_q_heads, num_k_heads, num_v_heads, head_size, batch_size
    )
    bytes_accessed = gdn_bytes(
        total_seq_len,
        num_q_heads,
        num_k_heads,
        num_v_heads,
        head_size,
        batch_size,
        dtype,
    )

    # flops / (median_ms * 1e-3 s) / 1e12 == flops / median_ms / 1e9.
    tflops = flops / median_ms / 1e9
    tb_per_sec = bytes_accessed / median_ms / 1e9

    return {
        "batch_size": batch_size,
        "seq_len": seq_len,
        "num_q_heads": num_q_heads,
        "num_k_heads": num_k_heads,
        "num_v_heads": num_v_heads,
        "head_size": head_size,
        "dtype": str(dtype).replace("torch.", ""),
        "median_ms": median_ms,
        "tflops": tflops,
        "tb_per_sec": tb_per_sec,
    }


def main():
    """Parse CLI arguments and sweep (batch_size, seq_len) configurations."""
    parser = argparse.ArgumentParser(description="Benchmark GDN Prefill Kernel")
    parser.add_argument("--batch-size", type=int, nargs="+", default=[1, 4, 16, 64])
    parser.add_argument("--seq-len", type=int, nargs="+", default=[128, 256, 512, 1024])
    parser.add_argument("--num-q-heads", type=int, default=16)
    parser.add_argument("--num-k-heads", type=int, default=16)
    parser.add_argument("--num-v-heads", type=int, default=32)
    parser.add_argument("--head-size", type=int, default=128)
    parser.add_argument(
        "--dtype", type=str, choices=["float16", "bfloat16"], default="bfloat16"
    )
    parser.add_argument(
        "--preset",
        type=str,
        choices=["qwen3-next", "custom"],
        default="custom",
        help="Use preset config. qwen3-next: q=k=16, v=32, d=128",
    )
    args = parser.parse_args()

    # Apply preset configurations (overrides the individual head/size flags).
    if args.preset == "qwen3-next":
        # Qwen3-Next-80B-A3B linear attention config (GVA).
        args.num_q_heads = 16
        args.num_k_heads = 16
        args.num_v_heads = 32
        args.head_size = 128

    # The kernel requires SM90 (Hopper) or later.
    device_capability = torch.cuda.get_device_capability()
    if device_capability[0] < 9:
        print(f"Current device capability: {device_capability}")
        print("GDN requires SM90 (Hopper) or later. Exiting...")
        return

    dtype = getattr(torch, args.dtype)

    print(
        f"GDN Prefill Benchmark (heads: q={args.num_q_heads}, k={args.num_k_heads}, v={args.num_v_heads}, d={args.head_size}, dtype={args.dtype})"
    )
    print("-" * 100)
    print(f"{'batch':>6} {'seq_len':>8} {'time(ms)':>10} {'TFLOPS':>10} {'TB/s':>10}")
    print("-" * 100)

    for batch_size in args.batch_size:
        for seq_len in args.seq_len:
            result = bench_gdn_prefill(
                batch_size=batch_size,
                seq_len=seq_len,
                num_q_heads=args.num_q_heads,
                num_k_heads=args.num_k_heads,
                num_v_heads=args.num_v_heads,
                head_size=args.head_size,
                dtype=dtype,
            )
            print(
                f"{result['batch_size']:>6} {result['seq_len']:>8} "
                f"{result['median_ms']:>10.3f} {result['tflops']:>10.2f} "
                f"{result['tb_per_sec']:>10.2f}"
            )

    print("-" * 100)


if __name__ == "__main__":
    main()
+ */ +#pragma once + +#include "cute/tensor.hpp" +#include "cutlass/arch/barrier.h" +#include "cutlass/cutlass.h" +#include "flat/cute_ext.hpp" + +namespace flat::collective { + +using namespace cute; + +template +constexpr bool is_contiguous(Layout&& layout) { + auto dim_layout = get(layout); + if constexpr (rank(dim_layout) == 0) { + return stride(dim_layout) == 1; + } else { + return stride<0>(dim_layout) == 1; + } +} + +namespace detail::SM80 { + +// SM80 version of make_acc_into_op in "flat/hopper/collective/flat_common.hpp" +template +CUTE_DEVICE constexpr auto convert_c_layout_to_a_layout(CLayout const& c, + TiledMMA const& tiled_mma) { + constexpr auto c_frag_atom_size = size<0>(CLayout{}); + constexpr auto a_frag_atom_size = size<1>(typename TiledMMA::AtomLayoutA_TV{}); + static_assert(a_frag_atom_size % c_frag_atom_size == 0); + constexpr auto ratio = a_frag_atom_size / c_frag_atom_size; + if constexpr (ratio == 1) { + return CLayout{}; + } else { + // e.g. the mma instruction shape is 16x8x16, we need to convert from ((2,2), MMA_M, MMA_N) to + // ((2,2,2), MMA_M, MMA_N/2) + + constexpr auto tiler = + make_shape(_, _, Int{}); // keep the first mode (FragAtom) and second mode (MMA_M) + constexpr auto divided = + logical_divide(CLayout{}, tiler); // (FragAtom, MMA_M, (ratio, MMA_N/ratio)) + + return make_layout(flatten(make_layout(get<0>(divided), get<2, 0>(divided))), get<1>(divided), + get<2, 1>(divided)); + } +} + +template +CUTE_DEVICE auto make_acc_into_op(Accumulator const& acc, TiledMMA const& tiled_mma) { + Tensor operand = + make_fragment_like(convert_c_layout_to_a_layout(acc.layout(), tiled_mma)); + Tensor operand_as_acc = make_tensor(operand.data(), acc.layout()); + cute::copy(acc, operand_as_acc); + return operand; +} + +} // namespace detail::SM80 + +template +struct CollectiveInverse { + // FIXME: precision is not good due to half + static_assert(std::is_same_v || std::is_same_v, + "only half is implemented"); + + CUTE_DEVICE + 
CollectiveInverse(int wg_sync_named_barrier_id) + : wg_sync_named_barrier_id_(wg_sync_named_barrier_id) {} + + template + CUTE_DEVICE void compute(TensorT&& sT) { + constexpr auto L = + typename std::remove_const_t>::layout_type{}; + static_assert(rank(L) == 2); + static_assert(size<0>(L) == 64); + static_assert(size<1>(L) == 64); + + int thread_idx = threadIdx.x % cutlass::NumThreadsPerWarpGroup; + + if (thread_idx < 64) { // compute 8x8 inverse on diagnal directly + auto t8X8sT = flat_divide(sT, Shape<_8, _8>{}); + compute_diagonal_inverse_NxN<8>(t8X8sT(_, _, thread_idx / 8, thread_idx / 8), thread_idx % 8); + } + + cutlass::arch::NamedBarrier::arrive_and_wait(cutlass::NumThreadsPerWarpGroup, + wg_sync_named_barrier_id_); + + auto t16X16sT = flat_divide(sT, Shape<_16, _16>{}); + blockwise_diagonal_inversed_8x8_to_16x16(t16X16sT(_, _, thread_idx / 32, thread_idx / 32)); + + cutlass::arch::NamedBarrier::arrive_and_wait(cutlass::NumThreadsPerWarpGroup, + wg_sync_named_barrier_id_); + + if (thread_idx < 64) { + auto t32X32sT = flat_divide(sT, Shape<_32, _32>{}); + blockwise_diagonal_inversed_16x16_to_32x32(t32X32sT(_, _, thread_idx / 32, thread_idx / 32)); + } + cutlass::arch::NamedBarrier::arrive_and_wait(cutlass::NumThreadsPerWarpGroup, + wg_sync_named_barrier_id_); + blockwise_diagonal_inversed_32x32_to_64x64(sT); + } + + private: + template + CUTE_DEVICE void compute_diagonal_inverse_NxN(TensorT&& mat, + int tid_in_group) { // group_size = N + constexpr auto L = + typename std::remove_const_t>::layout_type{}; + static_assert(rank(L) == 2); + static_assert(size<0>(L) == N); + static_assert(size<1>(L) == N); + + using ElementCompute = float; + + using CopyOp = + Copy_Atom, Element>; + + auto load_row = [&](int y) { + auto row = make_tensor(Shape>{}); + copy(CopyOp{}, std::forward(mat)(y, _), row); + + auto row_cvt = make_tensor_like(row); + copy(row, row_cvt); + + if constexpr (GarbageFilledDiagonal || GarbageFilledUpperTriangular) { + CUTE_UNROLL + for (int i = 0; 
i < N; ++i) { + row_cvt(i) = i == y ? 1.0f : (i > y ? 0.0f : row_cvt(i)); + } + } + return row_cvt; + }; + + auto store_row = [&](int y, auto row) { + auto row_cvt = make_tensor_like(row); + copy(row, row_cvt); + copy(CopyOp{}, row_cvt, std::forward(mat)(y, _)); + }; + + auto row = load_row(tid_in_group); +#define LOAD(y, x) __shfl_sync(0xffffffff, row(x), y, N) + + CUTE_UNROLL + for (int src_row = 0; src_row < N - 1; ++src_row) { // idx of src row to eliminate + auto row_scale = -row(src_row); // scale the src row + CUTE_UNROLL + for (int i = 0; i < src_row; ++i) { + auto src_row_value = LOAD(src_row, i); + row(i) = tid_in_group > src_row ? row_scale * src_row_value + row(i) : row(i); + } + row(src_row) = tid_in_group > src_row ? row_scale : row(src_row); + } + +#undef LOAD + + store_row(tid_in_group, row); + } + + /* + blockwise inverse has relation as follows + inv(| A 0 |) = | inv(A) 0 | + | C D | | -inv(D)C inv(A) inv(D) | + */ + + template + CUTE_DEVICE void blockwise_diagonal_inversed_4x4_to_8x8(TensorT&& mat) { + constexpr auto L = + typename std::remove_const_t>::layout_type{}; + static_assert(rank(L) == 2); + static_assert(size<0>(L) == 8); + static_assert(size<1>(L) == 8); + auto mat_NxN_2x2 = flat_divide(std::forward(mat), Shape<_4, _4>{}); + + // FIXME: implement + } + + template + CUTE_DEVICE void blockwise_diagonal_inversed_8x8_to_16x16(TensorT&& mat) { + constexpr auto L = + typename std::remove_const_t>::layout_type{}; + static_assert(rank(L) == 2); + static_assert(size<0>(L) == 16); + static_assert(size<1>(L) == 16); + + static_assert(is_contiguous<0>(L) == 1 || is_contiguous<1>(L) == 1); + constexpr bool is_col_major = is_contiguous<0>(L); + + auto mat_8x8_2x2 = flat_divide(std::forward(mat), Shape<_8, _8>{}); + using MMA = SM80_16x8x8_F32F16F16F32_TN; + using TiledMMA = decltype(make_tiled_mma(MMA{}, Layout>{}, Shape<_16, _8, _8>{})); + + using CopyOpD_S2R = std::conditional_t; + using CopyOpC_S2R = std::conditional_t; + using CopyOpA_S2R = 
std::conditional_t; +#ifdef CUTE_ARCH_STSM_SM90_ENABLED + using CopyOpO_R2S = std::conditional_t; +#else + using CopyOpO_R2S = UniversalCopy; +#endif + + int lane_id = cutlass::canonical_lane_idx(); + auto tiled_mma = TiledMMA{}; + auto thr_mma = tiled_mma.get_thread_slice(lane_id); + + auto D_tiled_copy = make_tiled_copy_A(Copy_Atom{}, tiled_mma); + auto C_tiled_copy = make_tiled_copy_B(Copy_Atom{}, tiled_mma); + auto A_tiled_copy = make_tiled_copy_B(Copy_Atom{}, tiled_mma); + auto O_tiled_copy = make_tiled_copy_C(Copy_Atom{}, tiled_mma); + + auto D_thr_copy = D_tiled_copy.get_thread_slice(lane_id); + auto C_thr_copy = C_tiled_copy.get_thread_slice(lane_id); + auto A_thr_copy = A_tiled_copy.get_thread_slice(lane_id); + auto O_thr_copy = O_tiled_copy.get_thread_slice(lane_id); + + Tensor sDinv = mat_8x8_2x2(_, _, _1{}, _1{}); + Tensor sC = select_tensor<1, 0>(mat_8x8_2x2(_, _, _1{}, _0{})); + Tensor sAinv = select_tensor<1, 0>(mat_8x8_2x2(_, _, _0{}, _0{})); + Tensor sO = mat_8x8_2x2(_, _, _1{}, _0{}); + + Tensor sDinv_m_bcast = + make_tensor(sDinv.data(), logical_product(sDinv.layout(), Tile>{})); + Tensor sO_m_bcast = + make_tensor(sO.data(), logical_product(sO.layout(), Tile>{})); + + Tensor tOrDinv = make_fragment_like(partition_shape_A(tiled_mma, Shape<_16, _8>{})); + Tensor tOrC = thr_mma.partition_fragment_B(sC); + Tensor tOrAinv = thr_mma.partition_fragment_B(sAinv); + + Tensor tDCrDC = partition_fragment_C(tiled_mma, Shape<_16, _8>{}); // output of -inv(D)C + Tensor tOrO = partition_fragment_C(tiled_mma, Shape<_16, _8>{}); // output of -inv(D)C inv(A) + + Tensor tOsDinv = D_thr_copy.partition_S(sDinv_m_bcast); + Tensor tOrDinv_cv = D_thr_copy.retile_D(tOrDinv); + Tensor tOsC = C_thr_copy.partition_S(sC); + Tensor tOrC_cv = C_thr_copy.retile_D(tOrC); + Tensor tOsAinv = A_thr_copy.partition_S(sAinv); + Tensor tOrAinv_cv = A_thr_copy.retile_D(tOrAinv); + Tensor tOsO = O_thr_copy.partition_D(sO_m_bcast); + Tensor tOrO_cv = O_thr_copy.retile_S(tOrO); + + 
///////////////////////////////////////////////////////////////////////////// + // -inv(D)C + copy(D_tiled_copy, tOsDinv(make_coord(_, _0{}), _, _), tOrDinv_cv(make_coord(_, _0{}), _, _)); + copy(C_tiled_copy, tOsC, tOrC_cv); + + clear(tDCrDC); + gemm(tiled_mma, tOrDinv, tOrC, tDCrDC); + transform(tDCrDC(make_coord(_, _0{}), _, _), [](auto v) { return -v; }); + + ///////////////////////////////////////////////////////////////////////////// + // -inv(D)C inv(A) + Tensor tOrDC = detail::SM80::make_acc_into_op(tDCrDC, tiled_mma); + + copy(A_tiled_copy, tOsAinv, tOrAinv_cv); + clear(tOrO); + gemm(tiled_mma, tOrDC, tOrAinv, tOrO); + + auto tOrO_cv_cvt = make_tensor_like(tOrO_cv(make_coord(_, _0{}), _, _)); + transform(tOrO_cv(make_coord(_, _0{}), _, _), tOrO_cv_cvt, [](auto v) { return Element(v); }); + copy(O_tiled_copy, tOrO_cv_cvt, tOsO(make_coord(_, _0{}), _, _)); + } + + template + CUTE_DEVICE void blockwise_diagonal_inversed_16x16_to_32x32(TensorT&& mat) { + constexpr auto L = + typename std::remove_const_t>::layout_type{}; + static_assert(rank(L) == 2); + static_assert(size<0>(L) == 32); + static_assert(size<1>(L) == 32); + + static_assert(is_contiguous<0>(L) == 1 || is_contiguous<1>(L) == 1); + constexpr bool is_col_major = is_contiguous<0>(L); + + using TileShape = Shape<_16, _16, _16>; + auto mat_16x16_2x2 = flat_divide(std::forward(mat), select<0, 1>(TileShape{})); + + using MMA = SM80_16x8x16_F32F16F16F32_TN; + using TiledMMA = decltype(make_tiled_mma(MMA{}, Layout>{}, TileShape{})); + + using CopyOpD_S2R = std::conditional_t; + using CopyOpC_S2R = std::conditional_t; + using CopyOpA_S2R = std::conditional_t; +#ifdef CUTE_ARCH_STSM_SM90_ENABLED + using CopyOpO_R2S = std::conditional_t; +#else + using CopyOpO_R2S = UniversalCopy; +#endif + + int lane_id = cutlass::canonical_lane_idx(); + auto tiled_mma = TiledMMA{}; + auto thr_mma = tiled_mma.get_thread_slice(lane_id); + + auto D_tiled_copy = make_tiled_copy_A(Copy_Atom{}, tiled_mma); + auto C_tiled_copy = 
make_tiled_copy_B(Copy_Atom{}, tiled_mma); + auto A_tiled_copy = make_tiled_copy_B(Copy_Atom{}, tiled_mma); + auto O_tiled_copy = make_tiled_copy_C(Copy_Atom{}, tiled_mma); + + auto D_thr_copy = D_tiled_copy.get_thread_slice(lane_id); + auto C_thr_copy = C_tiled_copy.get_thread_slice(lane_id); + auto A_thr_copy = A_tiled_copy.get_thread_slice(lane_id); + auto O_thr_copy = O_tiled_copy.get_thread_slice(lane_id); + + Tensor sDinv = mat_16x16_2x2(_, _, _1{}, _1{}); + Tensor sC = select_tensor<1, 0>(mat_16x16_2x2(_, _, _1{}, _0{})); + Tensor sAinv = select_tensor<1, 0>(mat_16x16_2x2(_, _, _0{}, _0{})); + Tensor sO = mat_16x16_2x2(_, _, _1{}, _0{}); + + Tensor tOrDinv = thr_mma.partition_fragment_A(sDinv); + Tensor tOrC = thr_mma.partition_fragment_B(sC); + Tensor tOrAinv = thr_mma.partition_fragment_B(sAinv); + + Tensor tDCrDC = + partition_fragment_C(tiled_mma, select<0, 1>(TileShape{})); // output of -inv(D)C + Tensor tOrO = + partition_fragment_C(tiled_mma, select<0, 1>(TileShape{})); // output of -inv(D)C inv(A) + + Tensor tOsDinv = D_thr_copy.partition_S(sDinv); + Tensor tOrDinv_cv = D_thr_copy.retile_D(tOrDinv); + Tensor tOsC = C_thr_copy.partition_S(sC); + Tensor tOrC_cv = C_thr_copy.retile_D(tOrC); + Tensor tOsAinv = A_thr_copy.partition_S(sAinv); + Tensor tOrAinv_cv = A_thr_copy.retile_D(tOrAinv); + Tensor tOsO = O_thr_copy.partition_D(sO); + Tensor tOrO_cv = O_thr_copy.retile_S(tOrO); + + ///////////////////////////////////////////////////////////////////////////// + // -inv(D)C + copy(D_tiled_copy, tOsDinv, tOrDinv_cv); + copy(C_tiled_copy, tOsC, tOrC_cv); + + clear(tDCrDC); + gemm(tiled_mma, tOrDinv, tOrC, tDCrDC); + transform(tDCrDC, [](auto v) { return -v; }); + + ///////////////////////////////////////////////////////////////////////////// + // -inv(D)C inv(A) + Tensor tOrDC = detail::SM80::make_acc_into_op(tDCrDC, tiled_mma); + + copy(A_tiled_copy, tOsAinv, tOrAinv_cv); + clear(tOrO); + gemm(tiled_mma, tOrDC, tOrAinv, tOrO); + + auto tOrO_cv_cvt = 
make_tensor_like(tOrO_cv); + transform(tOrO_cv, tOrO_cv_cvt, [](auto v) { return Element(v); }); + copy(O_tiled_copy, tOrO_cv_cvt, tOsO); + } + + template + CUTE_DEVICE void blockwise_diagonal_inversed_32x32_to_64x64(TensorT&& mat) { + constexpr auto L = + typename std::remove_const_t>::layout_type{}; + static_assert(rank(L) == 2); + static_assert(size<0>(L) == 64); + static_assert(size<1>(L) == 64); + + static_assert(is_contiguous<0>(L) == 1 || is_contiguous<1>(L) == 1); + constexpr bool is_col_major = is_contiguous<0>(L); + + auto mat_32x32_2x2 = flat_divide(std::forward(mat), select<0, 1>(Shape<_32, _32>{})); + auto mat_16x2X16x2_2x2 = logical_divide(mat_32x32_2x2, Shape<_16, _16>{}); + + using MMA = SM80_16x8x16_F32F16F16F32_TN; + using TiledMMA1 = + decltype(make_tiled_mma(MMA{}, Layout>{}, Shape<_16, _16, _32>{})); + using TiledMMA2 = + decltype(make_tiled_mma(MMA{}, Layout>{}, Shape<_16, _32, _16>{})); + + using CopyOpD_S2R = std::conditional_t; + using CopyOpC_S2R = std::conditional_t; + using CopyOpA_S2R = std::conditional_t; + using CopyOpO_S2R = std::conditional_t; + using CopyOpO_S2R = std::conditional_t; +#ifdef CUTE_ARCH_STSM_SM90_ENABLED + using CopyOpO_R2S = std::conditional_t; +#else + using CopyOpO_R2S = UniversalCopy; +#endif + + int warp_id_in_wg = cutlass::canonical_warp_idx() - + cutlass::NumWarpsPerWarpGroup * cutlass::canonical_warp_group_idx(); + int x = warp_id_in_wg / 2; + int y = warp_id_in_wg % 2; + + int lane_id = cutlass::canonical_lane_idx(); + auto tiled_mma1 = TiledMMA1{}; + auto thr_mma1 = tiled_mma1.get_thread_slice(lane_id); + + auto tiled_mma2 = TiledMMA2{}; + auto thr_mma2 = tiled_mma2.get_thread_slice(lane_id); + + auto D_tiled_copy = make_tiled_copy_A(Copy_Atom{}, tiled_mma1); + auto C_tiled_copy = make_tiled_copy_B(Copy_Atom{}, tiled_mma1); + auto A_tiled_copy = make_tiled_copy_B(Copy_Atom{}, tiled_mma2); + auto O_tiled_s2r = make_tiled_copy_C(Copy_Atom{}, tiled_mma2); + auto O_tiled_r2s = make_tiled_copy_C(Copy_Atom{}, 
tiled_mma2); + + auto D_thr_copy = D_tiled_copy.get_thread_slice(lane_id); + auto C_thr_copy = C_tiled_copy.get_thread_slice(lane_id); + auto A_thr_copy = A_tiled_copy.get_thread_slice(lane_id); + auto O_thr_s2r = O_tiled_s2r.get_thread_slice(lane_id); + auto O_thr_r2s = O_tiled_r2s.get_thread_slice(lane_id); + + Tensor sDinv = mat_16x2X16x2_2x2(make_coord(_, y), _, _1{}, _1{}); + Tensor sC = select_tensor<1, 0>(mat_16x2X16x2_2x2(_, make_coord(_, x), _1{}, _0{})); + Tensor sAinv = + select_tensor<1, 0>(mat_16x2X16x2_2x2(make_coord(_, x), _, _0{}, _0{})); // NOTE: not y! + Tensor sO = mat_16x2X16x2_2x2(make_coord(_, y), _, _1{}, _0{}); // needs cross-warp reduction + + Tensor tOrDinv = thr_mma1.partition_fragment_A(sDinv); + Tensor tOrC = thr_mma1.partition_fragment_B(sC); + Tensor tOrAinv = thr_mma2.partition_fragment_B(sAinv); + + Tensor tDCrDC = partition_fragment_C(tiled_mma1, Shape<_16, _16>{}); // output of -inv(D)C + Tensor tOrO = partition_fragment_C(tiled_mma2, Shape<_16, _32>{}); // output of -inv(D)C inv(A) + + Tensor tOsDinv = D_thr_copy.partition_S(sDinv); + Tensor tOrDinv_cv = D_thr_copy.retile_D(tOrDinv); + Tensor tOsC = C_thr_copy.partition_S(sC); + Tensor tOrC_cv = C_thr_copy.retile_D(tOrC); + Tensor tOsAinv = A_thr_copy.partition_S(sAinv); + Tensor tOrAinv_cv = A_thr_copy.retile_D(tOrAinv); + + ///////////////////////////////////////////////////////////////////////////// + // -inv(D)C + copy(D_tiled_copy, tOsDinv, tOrDinv_cv); + copy(C_tiled_copy, tOsC, tOrC_cv); + + clear(tDCrDC); + gemm(tiled_mma1, tOrDinv, tOrC, tDCrDC); + transform(tDCrDC, [](auto v) { return -v; }); + + ///////////////////////////////////////////////////////////////////////////// + // -inv(D)C inv(A) + Tensor tOrDC = detail::SM80::make_acc_into_op(tDCrDC, tiled_mma2); + + copy(A_tiled_copy, tOsAinv, tOrAinv_cv); + clear(tOrO); + gemm(tiled_mma2, tOrDC, tOrAinv, tOrO); + + auto tOrO_cvt = make_tensor_like(tOrO); + transform(tOrO, tOrO_cvt, [](auto v) { return Element(v); }); + 
+ // ensure tOsC consumed, tOsC and tOsO are the same buffer + cutlass::arch::NamedBarrier::arrive_and_wait(cutlass::NumThreadsPerWarpGroup, + wg_sync_named_barrier_id_); + + Tensor tOsO = O_thr_r2s.partition_D(sO); + Tensor tOrO_cvt_cv = O_thr_r2s.retile_S(tOrO_cvt); + if (x == 0) { + copy(O_tiled_r2s, tOrO_cvt_cv, tOsO); + } + cutlass::arch::NamedBarrier::arrive_and_wait(cutlass::NumThreadsPerWarpGroup, + wg_sync_named_barrier_id_); + if (x == 1) { + Tensor tOrO_red = make_tensor_like(tOrO_cvt); + Tensor tOsO_s = O_thr_s2r.partition_S(sO); + Tensor tOrO_red_cv = O_thr_s2r.retile_D(tOrO_red); + copy(O_tiled_s2r, tOsO_s, tOrO_red_cv); + transform(tOrO_cvt, tOrO_red, tOrO_cvt, [](auto a, auto b) { return a + b; }); + copy(O_tiled_r2s, tOrO_cvt_cv, tOsO); + } + } + + private: + int wg_sync_named_barrier_id_; +}; + +} // namespace flat::collective diff --git a/csrc/flat/ampere/collective/flat_collective_load.hpp b/csrc/flat/ampere/collective/flat_collective_load.hpp new file mode 100644 index 0000000000..3a7f517eff --- /dev/null +++ b/csrc/flat/ampere/collective/flat_collective_load.hpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cute/tensor.hpp" +#include "cutlass/cutlass.h" +#include "cutlass/pipeline/sm90_pipeline.hpp" +#include "flat/unused.hpp" + +namespace flat::collective { + +using namespace cute; + +enum class LoadKindVector { + kAlpha, + kBeta, +}; + +CUTE_HOST_DEVICE constexpr char const* to_string(LoadKindVector kind) { + if (kind == LoadKindVector::kAlpha) { + return "alpha"; + } else if (kind == LoadKindVector::kBeta) { + return "beta"; + } else { + return "unknown loadkind"; + } +} + +template +struct CollectiveLoadVector { + using SharedStorage = cute::array_aligned>; + using PipelineState = typename cutlass::PipelineState; + + using VectorProcessor = VectorProcessor_; + + static_assert(rank_v == 2 || rank_v == 3); + + static constexpr LoadKindVector kind = kKind; + static constexpr int VectorSize = size<0>(SmemLayout{}); + + CUTE_DEVICE + CollectiveLoadVector(ElementSrc const* src, GmemLayout layout, ElementSrc oob_value, + Pipeline& pipeline, SharedStorage& storage) + : src_(src), + src_layout_(layout), + src_oob_value_(oob_value), + pipeline_(pipeline), + storage_(storage) {} + + template + CUTE_DEVICE auto partition_SD(ProblemSize const& problem_size, TileShape const& tile_shape, + WorkDesc const& work_desc) { + constexpr auto BlkSeqQ = decltype(get<0>(tile_shape))::value; + + Tensor g = [&] { + auto head_idx = work_desc.o_head_idx(); // num_o_heads == num_sab_heads + DPRINTF0_W("slice view GMEM %s: seq_idx:%d head_idx:%d tok_offset:%lld\n", to_string(kind), + work_desc.seq_idx, head_idx, work_desc.tok_offset); + Tensor m_varlen_head = make_tensor(make_gmem_ptr(src_), src_layout_); + + Tensor m_varlen = m_varlen_head(_, head_idx); // slice into current head_idx + Tensor m_offset = domain_offset(make_coord(work_desc.tok_offset), + m_varlen); // offset to start of the current sequence + Tensor g_full = flat_divide(m_offset, BlkSeqQ); // (blk, iter_blk) + return g_full; + }(); + // (blk, pipe) or (blk, pipe, N), N for feature rich preprocess, 
data will be stored at 0 + Tensor s = make_tensor(make_smem_ptr(storage_.data()), SmemLayout{}); + + auto thr_layout = Layout<_32>{}; + auto val_layout = Layout<_1>{}; + auto tiled_copy = + make_tiled_copy(Copy_Atom, ElementDst>{}, thr_layout, val_layout); + auto thr_copy = tiled_copy.get_thread_slice(cutlass::canonical_lane_idx()); + + auto coord = thr_copy.partition_S(make_identity_tensor(Shape, _1>{})); + auto len_of_last_blk = work_desc.seq_len - (ceil_div(work_desc.seq_len, BlkSeqQ) - 1) * BlkSeqQ; + + // auto mask = FunctionPredTensor([coord, len_of_last_blk](auto frag_coord) { + // auto coord_in_blk = get<0>(coord(frag_coord)); + // return coord_in_blk < len_of_last_blk; + // }); + // NOTE: old FunctionPredTensor is easier to understand, cute::lazy::transform means + // coord(runtime_input) and then transfrom with the given lambda + auto mask = cute::lazy::transform(coord, [len_of_last_blk](auto const& c) { + auto coord_in_blk = get<0>(c); + return coord_in_blk < len_of_last_blk; + }); + + auto src = thr_copy.partition_S(g); // (cpy, iter_cpy, iter_blk) + auto dst = thr_copy.partition_D(s); // (cpy, iter_cpy, pipe) + + return make_tuple(src, dst, mask); + } + + template + CUTE_DEVICE void step(SrcDst const& src_dst, int src_iter, PipelineState& dst_pipe, int num_iters, + VectorProcessor processor = {}) { + auto src = get<0>(src_dst); + auto dst = get<1>(src_dst); + + auto regs = make_fragment_like(take<0, 2>(shape(dst))); + if constexpr (!IsTail) { + copy(src(_, _, src_iter), regs); + } else { + auto mask = get<2>(src_dst); + fill(regs, src_oob_value_); + copy_if(mask, src(_, _, src_iter), regs); + } + + int dst_pipe_idx = dst_pipe.index(); + + DPRINTF0_WG("%s pipeline.producer_acquire smem_pipe_write:%d\n", to_string(kind), dst_pipe_idx); + pipeline_.producer_acquire(dst_pipe); + cutlass::arch::fence_view_async_shared(); + + if constexpr (rank_v == 3) { + copy(regs, dst(_, _, _0{}, dst_pipe_idx)); + } else { + copy(regs, dst(_, _, dst_pipe_idx)); + } + + 
Tensor s = make_tensor(make_smem_ptr(storage_.data()), SmemLayout{}); + if constexpr (!std::is_same_v) { + if constexpr (rank_v == 3) { + processor(s(_, _, dst_pipe_idx)); + } else { + processor(s(_, dst_pipe_idx)); + } + } + + cutlass::arch::fence_view_async_shared(); + pipeline_.producer_commit(dst_pipe); + ++dst_pipe; + } + + private: + ElementSrc const* src_; + GmemLayout src_layout_; // in (packed_seq, H) coordinate + ElementSrc src_oob_value_; + Pipeline& pipeline_; + SharedStorage& storage_; +}; + +} // namespace flat::collective diff --git a/csrc/flat/common.hpp b/csrc/flat/common.hpp new file mode 100644 index 0000000000..91939e6085 --- /dev/null +++ b/csrc/flat/common.hpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include "debug.hpp" + +#define FLAT_UNUSED_PARAMETER(x) (void)x + +#define CHECK(expr, msg) \ + do { \ + if (!(expr)) { \ + std::string buffer(1024, '\0'); \ + sprintf(buffer.data(), "Failed to check %s, %s at %s:%d\n", ##expr, msg __FILE__, __LINE__); \ + throw std::runtime_error(buffer.c_str()); \ + } \ + } while (0) + +#define CUDA_CHECK(expr) \ + do { \ + cudaError_t err = (expr); \ + if (err != cudaSuccess) { \ + std::string buffer(1024, '\0'); \ + sprintf(buffer.data(), "CUDA Error: %s, Code: %d at %s:%d\n", cudaGetErrorName(err), err, \ + __FILE__, __LINE__); \ + throw std::runtime_error(buffer.c_str()); \ + } \ + } while (0) diff --git a/csrc/flat/cute_ext.hpp b/csrc/flat/cute_ext.hpp new file mode 100644 index 0000000000..91dd0dc2ab --- /dev/null +++ b/csrc/flat/cute_ext.hpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cute/tensor.hpp" +#include "cutlass/detail/layout.hpp" + +namespace flat { + +using namespace cute; + +template +__forceinline__ __host__ __device__ constexpr auto select_layout(Layout&& l) { + if constexpr (is_composed_layout::value) { + return make_composed_layout(l.layout_a(), l.offset(), select(l.layout_b())); + } else { + return select(l); + } +} + +template +__forceinline__ __host__ __device__ constexpr auto select_tensor(Tensor&& t) { + if constexpr (is_composed_layout::value) { + return make_tensor( + std::forward(t).data(), + make_composed_layout(std::forward(t).layout().layout_a(), + std::forward(t).layout().offset(), + select(std::forward(t).layout().layout_b()))); + } else { + return make_tensor(std::forward(t).data(), select(t.layout())); + } +} + +template +CUTE_DEVICE constexpr size_t alignment_for_swizzle(Layout&& layout) { + return cutlass::detail::alignment_for_swizzle(std::forward(layout)); +} + +} // namespace flat diff --git a/csrc/flat/debug.hpp b/csrc/flat/debug.hpp new file mode 100644 index 0000000000..b3f27b2c5c --- /dev/null +++ b/csrc/flat/debug.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cute/config.hpp" + +#if DEBUG_PIPE +#define PIPE_DEBUG_PRINTF(fmt, ...) \ + if (threadIdx.x == 0) printf("%s:%d " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#else +#define PIPE_DEBUG_PRINTF(...) 
+#endif + +#ifndef FLAT_DEBUG_PRINT +#define FLAT_DEBUG_PRINT 0 +#endif + +#if FLAT_DEBUG_PRINT +#define IS_PRINT_BLOCK cute::block(1) +#define DPRINTF(fmt, ...) \ + if (IS_PRINT_BLOCK) printf("%s:%d " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define DPRINTF0(fmt, ...) \ + if (IS_PRINT_BLOCK && threadIdx.x == 0) printf("%s:%d " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define DPRINTF_W(fmt, ...) \ + if (IS_PRINT_BLOCK) \ + printf("%s:%d [WG%d][W%d][T%-3d] " fmt, __FILE__, __LINE__, threadIdx.x / 128, threadIdx.x / 32, \ + threadIdx.x, ##__VA_ARGS__) +#define DPRINTF0_W(fmt, ...) \ + if (IS_PRINT_BLOCK && threadIdx.x % 32 == 0) \ + printf("%s:%d [WG%d][W%d][T%-3d] " fmt, __FILE__, __LINE__, threadIdx.x / 128, threadIdx.x / 32, \ + threadIdx.x, ##__VA_ARGS__) +#define DPRINTF_WG(fmt, ...) \ + if (IS_PRINT_BLOCK) \ + printf("%s:%d [WG%d][W%d][T%-3d] " fmt, __FILE__, __LINE__, threadIdx.x / 128, threadIdx.x / 32, \ + threadIdx.x, ##__VA_ARGS__) +#define DPRINTF0_WG(fmt, ...) \ + if (IS_PRINT_BLOCK && threadIdx.x % 128 == 0) \ + printf("%s:%d [WG%d][W%d][T%-3d] " fmt, __FILE__, __LINE__, threadIdx.x / 128, threadIdx.x / 32, \ + threadIdx.x, ##__VA_ARGS__) +#else +#define DPRINTF(...) +#define DPRINTF0(...) +#define DPRINTF_W(...) +#define DPRINTF0_W(...) +#define DPRINTF_WG(...) +#define DPRINTF0_WG(...) 
+#endif + +#if FLAT_DEBUG_PRINT +#define DPRINT_TMA_DESC(tma_dess_addr) \ + do { \ + auto p = reinterpret_cast(tma_dess_addr); \ + DPRINTF( \ + "\n" \ + "%08X%08X %08X%08X %08X%08X %08X%08X\n" \ + "%08X%08X %08X%08X %08X%08X %08X%08X\n" \ + "%08X%08X %08X%08X %08X%08X %08X%08X\n" \ + "%08X%08X %08X%08X %08X%08X %08X%08X\n", \ + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], \ + p[14], p[15], p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25], p[26], \ + p[27], p[28], p[29], p[30], p[31]); \ + } while (0) +#else +#define DPRINT_TMA_DESC(tma_dess_addr) +#endif diff --git a/csrc/flat/hopper/collective/flat_collective_load.hpp b/csrc/flat/hopper/collective/flat_collective_load.hpp new file mode 100644 index 0000000000..b587a648be --- /dev/null +++ b/csrc/flat/hopper/collective/flat_collective_load.hpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cute/tensor.hpp" +#include "cutlass/cutlass.h" +#include "cutlass/pipeline/sm90_pipeline.hpp" + +namespace flat::collective { + +using namespace cute; + +enum class LoadKind { + kQ, + kK, + kV, +}; + +CUTE_HOST_DEVICE constexpr char const* to_string(LoadKind kind) { + if (kind == LoadKind::kQ) { + return "Q"; + } else if (kind == LoadKind::kK) { + return "K"; + } else if (kind == LoadKind::kV) { + return "V"; + } else { + return "unknown loadkind"; + } +} + +template +struct CollectiveLoadTma { + using SharedStorage = cute::array_aligned>; + using PipelineState = typename cutlass::PipelineState; + + static constexpr LoadKind kind = kKind; + TMA const& tma_load; + Pipeline& pipeline; + SharedStorage& storage; + + CUTE_DEVICE + CollectiveLoadTma(TMA const& tma_load, Pipeline& pipeline, SharedStorage& storage) + : tma_load(tma_load), pipeline(pipeline), storage(storage) {} + + template + CUTE_DEVICE auto partition_SD(ProblemSize const& problem_size, TileShape const& tile_shape, + WorkDesc const& work_desc) { + constexpr auto BlkSeqQ = decltype(get<0>(tile_shape))::value; + constexpr auto BlkSeqKV = decltype(get<1>(tile_shape))::value; + constexpr auto HeadSize = decltype(get<2>(tile_shape))::value; + + Tensor g = [&] { + if constexpr (kind == LoadKind::kQ) { + DPRINTF0_W("slice view GMEM %s: seq_idx:%d head_idx:%d tok_offset:%lld\n", to_string(kind), + work_desc.seq_idx, work_desc.q_head_idx(), work_desc.tok_offset); + Tensor m_varlen_head = tma_load.get_tma_tensor( + make_shape(problem_size.total_seqlen, problem_size.head_size, + problem_size.num_q_heads)); // global view to the packed varlen sequence + Tensor m_varlen = + m_varlen_head(_, _, work_desc.q_head_idx()); // slice into current head_idx + Tensor m_offset = domain_offset(make_coord(work_desc.tok_offset, _0{}), + m_varlen); // offset to start of the current sequence + Tensor g_full = local_tile(m_offset, make_tile(BlkSeqQ, HeadSize), + make_coord(_, _0{})); // (blk, d, 
iter_blk) + return g_full; + } else { + auto num_heads = + (kind == LoadKind::kK ? problem_size.num_k_heads : problem_size.num_v_heads); + auto head_idx = (kind == LoadKind::kK ? work_desc.k_head_idx() : work_desc.v_head_idx()); + DPRINTF0_W("slice view GMEM %s: seq_idx:%d head_idx:%d tok_offset:%lld\n", to_string(kind), + work_desc.seq_idx, head_idx, work_desc.tok_offset); + Tensor m_varlen_head = tma_load.get_tma_tensor( + make_shape(problem_size.head_size, problem_size.total_seqlen, + num_heads)); // global view to the packed varlen sequence + Tensor m_varlen = m_varlen_head(_, _, head_idx); // slice into current head_idx + Tensor m_offset = domain_offset(make_coord(_0{}, work_desc.tok_offset), + m_varlen); // offset to start of the current sequence + Tensor g_full = local_tile(m_offset, make_tile(HeadSize, BlkSeqKV), + make_coord(_0{}, _)); // (d, blk, iter_blk) + return g_full; + } + }(); + Tensor s = make_tensor(make_smem_ptr(storage.data()), SmemLayout{}); + + auto block_tma = tma_load.get_slice(_0{}); // do not support cluster + return make_tuple(block_tma.partition_S(g), block_tma.partition_D(s)); + } + + template + CUTE_DEVICE void step(SrcDst const& src_dst, int src_iter, PipelineState& dst_pipe, + uint32_t lane_predicate) { + if (lane_predicate == 1) { + DPRINTF_WG("%s pipeline.producer_acquire smem_pipe_write:%d\n", to_string(kind), + dst_pipe.index()); + if constexpr (kAcquireBarrier) { + pipeline.producer_acquire(dst_pipe); + } + using BarrierType = typename Pipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(dst_pipe); + + auto src = get<0>(src_dst); + auto dst = get<1>(src_dst); + + copy(tma_load.with(*tma_barrier), src(_, _, _, src_iter), dst(_, _, _, dst_pipe.index())); + ++dst_pipe; + } + } +}; + +} // namespace flat::collective diff --git a/csrc/flat/hopper/collective/flat_collective_store.hpp b/csrc/flat/hopper/collective/flat_collective_store.hpp new file mode 100644 index 0000000000..8cca5b4fba --- 
/dev/null +++ b/csrc/flat/hopper/collective/flat_collective_store.hpp @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include "cute/tensor.hpp" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "flat/cute_ext.hpp" + +namespace flat::collective { + +using namespace cute; + +/* +NOTE: what we need is as follows + + using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; using CollectiveStoreO = typename +cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm90, +cutlass::arch::OpClassTensorOp, TileShapeO1, ClusterShape, +cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulatorO, ElementAccumulatorO, void, +LayoutO, Alignment, // C, not exists ElementO, +decltype(select<1,0,2>(LayoutO{})), Alignment, // D + cutlass::epilogue::TmaWarpSpecializedCooperative, DefaultOperation>::CollectiveOp; + +but unfortunately the required type alias is only useful for our purpose is private so we roll out +our own. 
+*/ + +CUTE_DEVICE uint32_t smid() { +#ifdef __CUDA_ARCH__ + uint32_t virtual_smid; + asm("mov.u32 %0, %%smid;" : "=r"(virtual_smid)); + return virtual_smid; +#else + return 0; +#endif +} + +template +struct CollectiveStoreTma { + static_assert(size_v == 1); + using TileShape_MNK = TileShape_MNK_; + using TileShape_MN = decltype(select<0, 1>( + TileShape_MNK{})); // Collective work on TileShape_MN, it is also the OutputTile + using SizeM = decltype(get<0>(TileShape_MNK{})); // head_size + using SizeN = decltype(get<1>(TileShape_MNK{})); // seqlen + + constexpr static bool is_m_major_O = cutlass::epilogue::collective::detail::is_m_major(); + +#if 0 + // NOTE: the following derived layout is a bit slower than the manual one, will evaluate it later + using SmemLayoutAtom = decltype(cutlass::epilogue::collective::detail::sm90_get_epilogue_smem_swizzle_layout_atom< + StrideO, ElementO, TileShape_MN>()); +#else + static_assert(sizeof(SmemElementO) == 2); + using SmemLayoutAtom = GMMA::Layout_MN_SW32_Atom; +#endif + + using SmemLayoutO = decltype(tile_to_shape( + SmemLayoutAtom{}, make_shape(SizeM{}, SizeN{}, Int{}), + cute::conditional_t, Step<_1, _2, _3>>{})); + + constexpr static uint32_t TmaTransactionBytes = + (size(take<0, 2>(SmemLayoutO{})) * static_cast(sizeof_bits::value)) / + 8; + + using CopyOpR2S = + decltype(cutlass::epilogue::collective::detail::sm90_get_smem_store_op_for_accumulator< + StrideO, ElementO, TileShape_MN>()); + using CopyAtomR2S = Copy_Atom; + + using CopyOpS2G = SM90_TMA_STORE; + + using SharedStorage = cute::array_aligned, + alignment_for_swizzle(SmemLayoutO{})>; + using Pipeline = cutlass::PipelineAsync; // NOT PipelineTmaStore! 
+ using PipelineState = cutlass::PipelineState; + + struct Arguments { + ElementO* ptr_O; + StrideO dO; + }; + + struct Params { + using TMA_O = decltype(make_tma_copy(CopyOpS2G{}, + make_tensor(make_gmem_ptr(nullptr), + repeat_like(StrideO{}, int32_t(0)), StrideO{}), + take<0, 2>(SmemLayoutO{}), TileShape_MN{}, _1{})); + + TMA_O tma_store_o; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + void* tensormaps; + }; + using TMA = typename Params::TMA_O; + + CUTE_DEVICE + CollectiveStoreTma(TMA const& tma_store, Pipeline& pipeline, SharedStorage& storage, + void* tensormaps) + : tma_store_(tma_store), pipeline_(pipeline), storage_(storage), tensormaps_(tensormaps) {} + + template + static Params to_underlying_arguments(ProblemSize const& problem_size, Arguments const& args, + void* workspace) { + auto problem_size_MNKL = append<4>(problem_size, 1); + auto [M, N, K, L] = problem_size_MNKL; + + Tensor tensor_o = + make_tensor(make_gmem_ptr(args.ptr_O), make_layout(make_shape(M, N, L), args.dO)); + TMA tma_store_o = + make_tma_copy_C_sm90(CopyOpS2G{}, tensor_o, take<0, 2>(SmemLayoutO{}), TileShape_MN{}); + + return { + .tma_store_o = tma_store_o, + .tma_transaction_bytes = TmaTransactionBytes, + .tensormaps = workspace, + }; + } + + static size_t get_workspace_size(/*Arguments const& args,*/ int sm_count) { + // only use additional TMA desc for output tail tiles + size_t num_bytes = sizeof(cute::TmaDescriptor) * sm_count; + DPRINTF("workspace num_bytes:%zu\n", num_bytes); + return num_bytes; + } + + template + static cutlass::Status initialize_workspace(ProblemShape const& problem_shape, + /*Arguments const& args,*/ void* workspace, + cudaStream_t stream) { + return cutlass::Status::kSuccess; + } + + CUTE_DEVICE static void prefetch_tma_descriptors(Params const& params) { + cute::prefetch_tma_descriptor(params.tma_store_o.get_tma_descriptor()); + } + + template + CUTE_DEVICE auto partition_SD(ProblemSize const& problem_size, TileShape const& tile_shape, + 
WorkDesc const& work_desc) { + constexpr auto BlkSeqQ = decltype(get<0>(tile_shape))::value; + constexpr auto HeadSize = decltype(get<2>(tile_shape))::value; + + Tensor g = [&] { + DPRINTF0_W("slice view GMEM O: seq_idx:%d head_idx:%d tok_offset:%lld\n", work_desc.seq_idx, + work_desc.o_head_idx(), work_desc.tok_offset); + Tensor m_varlen_head = tma_store_.get_tma_tensor( + make_shape(problem_size.head_size, problem_size.total_seqlen, + problem_size.num_o_heads)); // global view to the packed varlen sequence + Tensor m_varlen = m_varlen_head(_, _, work_desc.o_head_idx()); // slice into current head_idx + Tensor m_offset = domain_offset(make_coord(_0{}, work_desc.tok_offset), + m_varlen); // offset to start of the current sequence + Tensor g_full = local_tile(m_offset, make_tile(HeadSize, BlkSeqQ), + make_coord(_0{}, _)); // (d, blk, iter_blk) + return g_full; + }(); + Tensor s = make_tensor(make_smem_ptr(storage_.data()), SmemLayoutO{}); + + auto block_tma = tma_store_.get_slice(_0{}); // do not support cluster + return make_tuple(block_tma.partition_S(s), block_tma.partition_D(g)); + } + + template + CUTE_DEVICE static bool can_process(ProblemSize const& problem_size, WorkDesc const& work_desc, + int blk, int num_blocks) { + if (blk < num_blocks - 1) { + // intermediate full tiles, always use TMA + return true; + } else if (work_desc.seq_len % SizeN{} == 0 || work_desc.seq_idx == problem_size.num_seqs - 1) { + // 1. last tile but full, also use TMA + // 2. 
last tile but last seq, oob can be handled by TMA + return true; + } else { + return false; + } + } + + template + CUTE_DEVICE void step(ProblemSize const& problem_size, WorkDesc const& work_desc, + SrcDst const& src_dst, PipelineState& src_pipe, int dst_iter, int num_iters, + uint32_t lane_predicate) { + auto src = get<0>(src_dst); + auto dst = get<1>(src_dst); + + if (dst_iter == 0) { + bool can_process_tail = can_process(problem_size, work_desc, num_iters - 1, num_iters); + if (!can_process_tail) { + create_tensormap_for_tail(work_desc, lane_predicate); + } + } + + DPRINTF0_WG("pipeline.producer_acquire smem_pipe_read:%d\n", src_pipe.index()); + if constexpr (kAcquireBarrier) { + pipeline_.consumer_wait(src_pipe); + } + + if (can_process(problem_size, work_desc, dst_iter, num_iters)) { + DPRINTF0_W("store src_pipe:%d -> blk:%d\n", src_pipe.index(), dst_iter); + if (lane_predicate == 1) { + copy(tma_store_, src(_, _, _, src_pipe.index()), dst(_, _, _, dst_iter)); + } + } else { + cute::TmaDescriptor* tensormap = acquire_tensormap_for_tail(); + DPRINTF0_W("store tail with tensormap:%p src_pipe:%d -> blk:%d\n", tensormap, + src_pipe.index(), dst_iter); + if (lane_predicate == 1) { + copy(tma_store_.with(tensormap), src(_, _, _, src_pipe.index()), dst(_, _, _, dst_iter)); + } + } + + if constexpr (kAcquireBarrier) { + pipeline_.consumer_release(src_pipe); + } + ++src_pipe; + } + + template + CUTE_DEVICE void create_tensormap_for_tail(WorkDesc const& work_desc, uint32_t lane_predicate) { + namespace ptx = cuda::ptx; + constexpr int num_of_16B = sizeof(cute::TmaDescriptor) / sizeof(uint128_t); + + cute::TmaDescriptor* tensormap = static_cast(tensormaps_) + smid(); + + auto lane_idx = cutlass::canonical_lane_idx(); + if (lane_idx < num_of_16B) { + auto src = reinterpret_cast(tma_store_.get_tma_descriptor()); + auto dst = reinterpret_cast(tensormap); + + dst[lane_idx] = src[lane_idx]; + } + __syncwarp(); + + if (lane_predicate == 1) { + uint32_t new_total_seqlen = 
work_desc.tok_offset + work_desc.seq_len; + ptx::tensormap_replace_global_dim(ptx::space_global, tensormap, /*ord=*/ptx::n32_t<1>{}, + new_total_seqlen); + } + __syncwarp(); + + ptx::fence_proxy_tensormap_generic(ptx::sem_release, ptx::scope_cta); + } + + CUTE_DEVICE cute::TmaDescriptor* acquire_tensormap_for_tail() { + namespace ptx = cuda::ptx; + cute::TmaDescriptor* tensormap = static_cast(tensormaps_) + smid(); + ptx::fence_proxy_tensormap_generic(ptx::sem_acquire, ptx::scope_cta, tensormap, + /*size=*/ptx::n32_t<128>{}); + return tensormap; + } + + private: + TMA const& tma_store_; + Pipeline& pipeline_; + SharedStorage& storage_; + void* tensormaps_; +}; + +} // namespace flat::collective diff --git a/csrc/flat/hopper/collective/flat_collective_tma_warpspecialized_delta_rule.hpp b/csrc/flat/hopper/collective/flat_collective_tma_warpspecialized_delta_rule.hpp new file mode 100644 index 0000000000..49f499511a --- /dev/null +++ b/csrc/flat/hopper/collective/flat_collective_tma_warpspecialized_delta_rule.hpp @@ -0,0 +1,1239 @@ +/* + * Copyright (c) 2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "../../cute_ext.hpp" +#include "../../math_order_barrier.hpp" +#include "../../unused.hpp" +#include "../collective/flat_collective_load.hpp" +#include "../collective/flat_collective_store.hpp" +#include "../collective/flat_common.hpp" +#include "../collective/flat_named_barriers.hpp" +#include "../kernel/flat_options.hpp" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "flat/ampere/collective/flat_collective_inverse.hpp" +#include "flat/ampere/collective/flat_collective_load.hpp" + +// #define INLINE_LAMBDA [[gnu::always_inline]] +#define INLINE_LAMBDA __attribute__((always_inline)) +// #define INLINE_LAMBDA [[msvc::forceinline]] + +#define WORKAROUND_WGMMA_PERFORMANCE_LOSS() \ + if (thread_idx > 8192) { \ + __syncwarp(); \ + } + +namespace flat::collective { + +struct DeltaRuleNamedBarriers : FlatSharedNamedBarriers { + static constexpr int KKLaunched = FlatSharedNamedBarriers::NumBarriersUsed + 0; + static constexpr int AuxMath = FlatSharedNamedBarriers::NumBarriersUsed + 1; +}; + +using namespace cute; +using flat::kernel::find_option_t; +using flat::kernel::Tag; + +template +struct FlatMainloopTmaWarpSpecializedDeltaRule { + using Element = Element_; + using ElementAccumulatorQK = ElementAccumulatorQK_; + using ElementAccumulatorO = ElementAccumulatorQK; + using ElementAccumulatorKV = ElementAccumulatorKV_; + using ElementO = Element; + + using TileShape = TileShape_; + + using LayoutQ = LayoutQ_; // (seqlen_q, d, h) + using LayoutK = LayoutK_; // (seqlen_k, d, h) + using LayoutV = LayoutV_; // (seqlen_k, d, h) + using LayoutO = LayoutO_; // (seqlen_k, d, h) + + // Options + static constexpr bool kIsPersistent = + find_option_t::value; + + static constexpr bool kInitStateFromInput = + find_option_t::value; + + static constexpr int NumLoadWarpGroups = 1; + static constexpr int NumStateMmaWarpGroups = 2; + static constexpr int NumAuxMmaWarpGroups = 1; + + static constexpr int 
StageCountQ = find_option_t, Options>::value; + static constexpr int StageCountK = find_option_t, Options>::value; + static constexpr int StageCountV = find_option_t, Options>::value; + + static constexpr int NeedsAlpha = + find_option_t::value; + static constexpr int NeedsBeta = find_option_t::value; + + static constexpr int NeedsDecay = + find_option_t::value; + static_assert(!NeedsDecay, "DeltaRule does not supports decay"); + + static constexpr int NumLoadThreads = NumLoadWarpGroups * 128; + static constexpr int NumStateMmaThreads = NumStateMmaWarpGroups * 128; + static constexpr int NumAuxMmaThreads = NumAuxMmaWarpGroups * 128; + + static constexpr uint32_t OrderedBarrierId0 = + uint32_t(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0); + static constexpr uint32_t OrderedBarrierId1 = + uint32_t(cutlass::arch::ReservedNamedBarriers::StreamkBarrier1); + + using OrderedMathBarriers = std::conditional_t< + NumStateMmaWarpGroups == 2, + OrderedNamedBarriers, + OrderedNamedBarriers>; + + using StagesQ = cutlass::gemm::collective::StageCount; + using StagesK = cutlass::gemm::collective::StageCount; + using StagesV = cutlass::gemm::collective::StageCount; + using StagesO = cutlass::gemm::collective::StageCount<2>; + using ClusterShape = Shape<_1, _1, _1>; + + using StagesQK = cutlass::gemm::collective::StageCount<2>; + using StagesKK = cutlass::gemm::collective::StageCount<2>; + + using StagesAlphaBeta = cutlass::gemm::collective::StageCount<5>; + + static constexpr int Alignment = 16 / sizeof(Element); + + static constexpr auto BlkSeqQ = get<0>(TileShape{}); // Blk_Q + static constexpr auto BlkSeqKV = get<1>(TileShape{}); // Blk_K/V + static constexpr auto HeadSize = get<2>(TileShape{}); // D (Dq, Dk, Dv all equal) + static constexpr auto HeadSizeQK = HeadSize; + static constexpr auto HeadSizeV = HeadSize; + + using TileShapeQK = decltype(make_shape(BlkSeqQ, BlkSeqKV, HeadSizeQK)); + using TileShapeKK = decltype(make_shape(BlkSeqKV, BlkSeqKV, HeadSizeQK)); + 
using TileShapeKV = decltype(make_shape(HeadSizeV, HeadSizeQK, BlkSeqKV)); + static_assert(std::is_same_v); + + using TileShapeO2 = decltype(make_shape(HeadSizeV, BlkSeqQ, BlkSeqKV)); + using TileShapeO1 = decltype(make_shape(HeadSizeV, BlkSeqQ, HeadSizeQK)); + + static_assert(BlkSeqQ % 64 == 0); + static_assert(BlkSeqQ == 64 || BlkSeqQ == 128); + static constexpr bool IsQKCooperative = BlkSeqQ == 128; + static constexpr bool IsKKCooperative = IsQKCooperative; + + using DummyStages = cutlass::gemm::collective::StageCount<2>; + ; + using CollectiveMmaQK = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, Element, LayoutQ, Alignment, Element, + LayoutK, Alignment, ElementAccumulatorQK, TileShapeQK, ClusterShape, DummyStages, + std::conditional_t>::CollectiveOp; + + using CollectiveMmaKV_G2S = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, Element, + decltype(select<1, 0, 2>(LayoutV{})), Alignment, // direct TMA copy for GMEM -> SMEM + Element, decltype(select<1, 0, 2>(LayoutK{})), Alignment, ElementAccumulatorKV, TileShapeKV, + ClusterShape, DummyStages, cutlass::gemm::KernelTmaWarpSpecializedCooperative>::CollectiveOp; + + // raw layout for copy + using SmemLayoutQ_SD = + decltype(unstage_smem_layout(typename CollectiveMmaQK::SmemLayoutA{}, Int{})); + using SmemLayoutK_DS = decltype(unstage_smem_layout(typename CollectiveMmaKV_G2S::SmemLayoutB{}, + Int{})); + using SmemLayoutV_DS = decltype(unstage_smem_layout(typename CollectiveMmaKV_G2S::SmemLayoutA{}, + Int{})); + + using RefLayoutV = decltype(make_layout(select<0, 2>(TileShapeKV{}), LayoutRight{})); + using CollectiveMmaKV = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, Element, RefLayoutV, + Alignment, // needs a S2R transposition for MMA + Element, decltype(select<1, 0, 2>(LayoutK{})), Alignment, ElementAccumulatorKV, 
TileShapeKV, + ClusterShape, DummyStages, cutlass::gemm::KernelTmaWarpSpecializedCooperative>::CollectiveOp; + + using RefLayoutKV = + decltype(make_layout(select<0, 1>(TileShapeKV{}), LayoutRight{})); // (dv, dk) + using CollectiveMmaO1 = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, Element, RefLayoutKV, Alignment, Element, + LayoutQ, Alignment, ElementAccumulatorO, TileShapeO1, ClusterShape, DummyStages, + cutlass::gemm::KernelTmaWarpSpecializedCooperative>::CollectiveOp; + + // (blk_q,blk_k) to align with O2 mma, LayoutRight to align with QK mma output + using DesiredLayoutQK = decltype(make_layout(select<0, 1>(TileShapeQK{}), LayoutRight{})); + using CollectiveMmaO2 = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, Element, RefLayoutV, Alignment, Element, + DesiredLayoutQK, Alignment, ElementAccumulatorO, TileShapeO2, ClusterShape, DummyStages, + cutlass::gemm::KernelTmaWarpSpecializedCooperative>::CollectiveOp; + + using TiledMmaQK = typename CollectiveMmaQK::TiledMma; // Q@K^t + using TiledMmaKV = decltype(convert_to_gmma_rs(typename CollectiveMmaKV::TiledMma{})); + using TiledMmaO1 = decltype(convert_to_gmma_rs(typename CollectiveMmaO1::TiledMma{})); + using TiledMmaO2 = decltype(convert_to_gmma_rs(typename CollectiveMmaO2::TiledMma{})); + + static constexpr int TiledMmaQKNumThreads = size(TiledMmaQK{}); + static_assert(size(TiledMmaQK{}) == NumAuxMmaThreads); + + static_assert(size(TiledMmaKV{}) == NumStateMmaThreads); + static_assert(size(TiledMmaO1{}) == NumStateMmaThreads); + static_assert(size(TiledMmaO2{}) == NumStateMmaThreads); + + using CollectiveStoreO = + CollectiveStoreTma(LayoutO{})), StagesO::value>; + + // layout for compute + using QKSmemLayoutQ = SmemLayoutQ_SD; + using QKSmemLayoutK = decltype(select_layout<1, 0, 2>(SmemLayoutK_DS{})); + + using KVSmemLayoutK = SmemLayoutK_DS; + using KVSmemLayoutV = SmemLayoutV_DS; 
+ + // layout for compute output + using SmemLayoutQK = decltype(tile_to_shape( + GMMA::Layout_K_INTER_Atom{}, + flatten(make_shape(select<0, 1>(TileShapeQK{}), Int{})), + Step<_1, _2, _3>{})); + using SmemLayoutO = typename CollectiveStoreO::SmemLayoutO; + + using SmemLayoutKK = decltype(tile_to_shape( + GMMA::Layout_K_INTER_Atom{}, + flatten(make_shape(select<0, 1>(TileShapeQK{}), Int{})), + Step<_1, _2, _3>{})); + + using InverseType = cutlass::half_t; + using CollectiveInverse = flat::collective::CollectiveInverse; + + using ElementAccumulatorSK = float; + using TileShapeSK = decltype(make_shape(HeadSizeV, BlkSeqKV, HeadSizeQK)); + using CollectiveMmaSK = + typename cutlass::gemm::collective::CollectiveBuilder< // basically the same as O1 + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, Element, RefLayoutKV, Alignment, + Element, LayoutK, Alignment, ElementAccumulatorSK, TileShapeSK, ClusterShape, DummyStages, + cutlass::gemm::KernelTmaWarpSpecializedCooperative>::CollectiveOp; + + using ElementAccumulatorNewV = float; + using TileShapeNewV = decltype(make_shape(HeadSizeV, BlkSeqKV, BlkSeqKV)); + using RefLayoutSK = + decltype(make_layout(select<0, 2>(TileShapeNewV{}), LayoutRight{})); // (dv, Blk) + using DesiredLayoutKK = decltype(make_layout(select<1, 2>(TileShapeNewV{}), LayoutRight{})); // + using CollectiveMmaNewV = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, Element, RefLayoutSK, Alignment, Element, + DesiredLayoutKK, Alignment, ElementAccumulatorKV, TileShapeNewV, ClusterShape, DummyStages, + cutlass::gemm::KernelTmaWarpSpecializedCooperative>::CollectiveOp; + + // FIXME: K@K^t are not exactly the same as Q@K^t, but similar enough + using TiledMmaKK = + typename CollectiveMmaQK::TiledMma; // T = inv(I + strict_lower_triangular(K@K^t)) + using TiledMmaSK = + decltype(convert_to_gmma_rs(typename CollectiveMmaSK::TiledMma{})); // ?? 
= -S@K^t + V^t + using TiledMmaNewV = + decltype(convert_to_gmma_rs(typename CollectiveMmaNewV::TiledMma{})); // NewV = ??@T^t + + static constexpr int TiledMmaKKNumThreads = size(TiledMmaKK{}); + static_assert(size(TiledMmaKK{}) == NumAuxMmaThreads); + + using GmemStrideAlphaBeta = Stride; + using GmemLayoutAlphaBeta = Layout, GmemStrideAlphaBeta>; // (seq, head) + + // (blk, pipe, cumsum_log/cumprod), + // 0 for cumsum(log(alpha)) aka log(cumprod(alpha)) + // 1 for cumprod(alpha) + // 2 for cumprod(alpha) * scale + using AlphaCumSumLogIdx = _0; + using AlphaCumProdIdx = _1; + using AlphaCumProdScaleIdx = _2; + + using SmemLayoutAlpha = + decltype(make_layout(make_shape(BlkSeqQ, Int<3>{}, Int{}))); + using SmemLayoutBeta = decltype(make_layout(make_shape(BlkSeqQ, Int{}))); + + using MainloopQPipeline = cutlass::PipelineTmaAsync; + using MainloopKPipeline = cutlass::PipelineTmaAsync; + using MainloopVPipeline = cutlass::PipelineTmaAsync; + using MainloopOPipeline = typename CollectiveStoreO::Pipeline; + + using MainloopQKPipeline = cutlass::PipelineAsync; + using MainloopKKPipeline = cutlass::PipelineAsync; + + using MainloopAlphaPipeline = + std::conditional_t, Unused>; + using MainloopBetaPipeline = + std::conditional_t, Unused>; + + using QPipelineState = typename cutlass::PipelineState; + using KPipelineState = typename cutlass::PipelineState; + using VPipelineState = typename cutlass::PipelineState; + using OPipelineState = typename CollectiveStoreO::PipelineState; + + using QKPipelineState = cutlass::PipelineState; + using KKPipelineState = cutlass::PipelineState; + + using AlphaPipelineState = + std::conditional_t, Unused>; + using BetaPipelineState = + std::conditional_t, Unused>; + + struct AlphaProcessor { + CUTE_DEVICE + AlphaProcessor(float scale) : scale_(scale) {} + + template + CUTE_DEVICE void operator()(T&& vecs) { + constexpr int WarpSize = cutlass::NumThreadsPerWarp; + int lane_id = cutlass::canonical_lane_idx(); + + Tensor vecs_32 = flat_divide( 
+ std::forward(vecs), + make_tile(Int{})); // ((32), iter, cumsum_log/cumprod/cumprod_scale) + Tensor vec_cumsum_log = vecs_32(make_coord(_), _, AlphaCumSumLogIdx{}); + Tensor vec_cumprod = vecs_32(make_coord(_), _, AlphaCumProdIdx{}); + Tensor vec_cumprod_s = vecs_32(make_coord(_), _, AlphaCumProdScaleIdx{}); // cumprod * scale + Tensor frag = make_tensor(size<1>(vec_cumprod)); + + CUTE_UNROLL + for (int iter = 0; iter < size(frag); ++iter) { + frag(iter) = log2f(vec_cumsum_log(lane_id, iter) + 1e-10f); + } + + CUTE_UNROLL + for (int offset = 1; offset < WarpSize; offset *= 2) { + CUTE_UNROLL + for (int iter = 0; iter < size(frag); ++iter) { + auto v = __shfl_up_sync(0xFFFFFFFF, frag(iter), offset); + if (lane_id >= offset) { + frag(iter) += v; + } + } + } + + float sum = 0.0f; + CUTE_UNROLL + for (int iter = 1; iter < size(frag); ++iter) { + sum += __shfl_sync(0xFFFFFFFF, frag(iter - 1), 31); + frag(iter) += sum; + } + + CUTE_UNROLL + for (int iter = 0; iter < size(frag); ++iter) { + vec_cumsum_log(lane_id, iter) = frag(iter); + float cumprod = exp2f(frag(iter)); + vec_cumprod(lane_id, iter) = cumprod; + vec_cumprod_s(lane_id, iter) = cumprod * scale_; + } + } + + float scale_ = 1.0f; + }; + + using BetaProcessor = Unused; + // struct BetaProcessor { + // template + // CUTE_DEVICE + // void operator()(T&& vec) { + // int lane_id = cutlass::canonical_lane_idx(); + // int warp_size = cutlass::NumThreadsPerWarp; + // for (int i = lane_id; i < size(vec); i += warp_size) { + // auto val = vec(i); + // val = max(val, 1e-10f); // clamp due to fusion with IKK before matrix inverse + // vec(i) = 1.0f / val; + // } + // } + // }; + + static constexpr int LoadQBytes = size(QKSmemLayoutQ{}(_, _, _0{})) * sizeof(Element); + static constexpr int LoadKBytes = size(KVSmemLayoutK{}(_, _, _0{})) * sizeof(Element); + static constexpr int LoadVBytes = size(KVSmemLayoutV{}(_, _, _0{})) * sizeof(Element); + static constexpr int StoreOBytes = CollectiveStoreO::TmaTransactionBytes; + + 
using SharedStorageO = typename CollectiveStoreO::SharedStorage; + + struct SharedStorage { + alignas(alignment_for_swizzle( + QKSmemLayoutQ{})) cute::array_aligned> smem_q; + alignas(alignment_for_swizzle( + KVSmemLayoutK{})) cute::array_aligned> smem_k; + alignas(alignment_for_swizzle( + KVSmemLayoutV{})) cute::array_aligned> smem_v; + alignas(alignment_for_swizzle( + SmemLayoutQK{})) cute::array_aligned> smem_qk; + alignas(alignment_for_swizzle( + SmemLayoutKK{})) cute::array_aligned> smem_kk; + + SharedStorageO smem_o; + // TODO: make optional + cute::array_aligned> smem_beta; + cute::array_aligned> smem_alpha; + }; + + using TMA_Q = typename CollectiveMmaQK::Params::TMA_A; + using TMA_K = typename CollectiveMmaKV_G2S::Params::TMA_B; + using TMA_V = typename CollectiveMmaKV_G2S::Params::TMA_A; + using TMA_O = typename CollectiveStoreO::Params::TMA_O; + + using LoadQ = CollectiveLoadTma; + using LoadK = CollectiveLoadTma; + using LoadV = CollectiveLoadTma; + + using LoadAlpha = + CollectiveLoadVector; + using LoadBeta = CollectiveLoadVector; + + struct Arguments { // clang-format off + Element const* ptr_Q; LayoutQ dQ; + Element const* ptr_K; LayoutK dK; + Element const* ptr_V; LayoutV dV; + Element* ptr_O; LayoutO dO; + float* ptr_output_state; // layout fixed (kdim, vdim, num_heads, num_seqs):LayoutLeft{} + float const* ptr_input_state; + float scale; + float const* alpha_ptr; GmemStrideAlphaBeta alpha_stride; + float const* beta_ptr; GmemStrideAlphaBeta beta_stride; + }; // clang-format on + + struct Params { + TMA_Q tma_load_q; + TMA_K tma_load_k; + TMA_V tma_load_v; + TMA_O tma_store_o; + void* tensormaps; + float scale; + + float* ptr_output_state; + float const* ptr_input_state; + + float const* alpha_ptr; + GmemLayoutAlphaBeta alpha_layout; + float const* beta_ptr; + GmemLayoutAlphaBeta beta_layout; + }; + + template + static bool can_implement(ProblemShape const& problem_size, Arguments const& args) { + auto ratio = problem_size.num_q_heads > 
problem_size.num_v_heads + ? problem_size.num_q_heads / problem_size.num_v_heads + : problem_size.num_v_heads / problem_size.num_q_heads; + + constexpr bool IsGVAEnabled = find_option_t::value; + + bool is_gqa_like = (problem_size.num_k_heads == problem_size.num_v_heads) && + (problem_size.num_q_heads == ratio * problem_size.num_k_heads) && + (problem_size.num_q_heads == ratio * problem_size.num_v_heads); + + bool is_gva_like = (problem_size.num_q_heads == problem_size.num_k_heads) && + (problem_size.num_v_heads == ratio * problem_size.num_q_heads) && + (problem_size.num_v_heads == ratio * problem_size.num_k_heads); + return true && ((!IsGVAEnabled && is_gqa_like) || (IsGVAEnabled && is_gva_like)) && + (problem_size.head_size <= get<2>(TileShape{})) && + ((problem_size.head_size % Alignment) == 0); + } + + template + static Params to_underlying_arguments(ProblemShape const& problem_size, Arguments const& args, + void* workspace) { + int64_t s = problem_size.total_seqlen; + int64_t t = problem_size.total_seqlen; + int32_t d = problem_size.head_size; + + auto params_qk = CollectiveMmaQK::to_underlying_arguments( + make_shape(s, t, d, problem_size.num_q_heads), + typename CollectiveMmaQK::Arguments{ + args.ptr_Q, args.dQ, args.ptr_K, args.dK, // never used, dummy + }, + /*workspace=*/nullptr); + + auto params_kv_k = CollectiveMmaKV_G2S::to_underlying_arguments( + make_shape(d, d, s, problem_size.num_k_heads), + typename CollectiveMmaKV_G2S::Arguments{ + args.ptr_V, select<1, 0, 2>(args.dV), // not used + args.ptr_K, select<1, 0, 2>(args.dK), // used as G2S for K + }, + /*workspace=*/nullptr); + + auto params_kv_v = CollectiveMmaKV_G2S::to_underlying_arguments( + make_shape(d, d, s, problem_size.num_v_heads), + typename CollectiveMmaKV_G2S::Arguments{ + args.ptr_V, select<1, 0, 2>(args.dV), // used as G2S for V + args.ptr_K, select<1, 0, 2>(args.dK), // not used + }, + /*workspace=*/nullptr); + + auto params_o = CollectiveStoreO::to_underlying_arguments( + 
make_shape(d, s, d, problem_size.num_o_heads), // in O1 + // make_shape(d, s, s, problem_size.num_o_heads), // in O2 + typename CollectiveStoreO::Arguments{args.ptr_O, select<1, 0, 2>(args.dO)}, workspace); + + return Params{ + .tma_load_q = params_qk.tma_load_a, + .tma_load_k = params_kv_k.tma_load_b, + .tma_load_v = params_kv_v.tma_load_a, + .tma_store_o = params_o.tma_store_o, + .tensormaps = params_o.tensormaps, + .scale = args.scale, + + .ptr_output_state = args.ptr_output_state, + .ptr_input_state = args.ptr_input_state, + + // TODO: refactor all name to varname_vartype + .alpha_ptr = args.alpha_ptr, + .alpha_layout = make_layout(make_shape(s, problem_size.num_sab_heads), args.alpha_stride), + .beta_ptr = args.beta_ptr, + .beta_layout = make_layout(make_shape(s, problem_size.num_sab_heads), args.beta_stride), + }; + } + + static size_t get_workspace_size(Arguments const& args, int sm_count) { + return CollectiveStoreO::get_workspace_size(sm_count); + } + + template + static cutlass::Status initialize_workspace(ProblemShape const& problem_shape, + Arguments const& args, void* workspace, + cudaStream_t stream) { + return CollectiveStoreO::initialize_workspace(problem_shape, workspace, stream); + } + + CUTE_DEVICE static void prefetch_tma_descriptors(Params const& params) { + cute::prefetch_tma_descriptor(params.tma_load_q.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_k.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_v.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_store_o.get_tma_descriptor()); + } + + template + CUTE_DEVICE void load_qkv(Params const& params, ProblemShape const& problem_size, + LoadTileShape const& load_tile_shape, WorkDesc const& work_desc, + MainloopQPipeline& q_pipeline, QPipelineState& q_smem_pipe_write, + MainloopKPipeline& k_pipeline, KPipelineState& k_smem_pipe_write, + MainloopVPipeline& v_pipeline, VPipelineState& v_smem_pipe_write, + SharedStorage& storage) { + 
int32_t num_blocks = ceil_div(work_desc.seq_len, get<0>(TileShape{})); + uint32_t lane_predicate = cute::elect_one_sync(); + + auto q_collective_load = LoadQ(params.tma_load_q, q_pipeline, storage.smem_q); + auto k_collective_load = LoadK(params.tma_load_k, k_pipeline, storage.smem_k); + auto v_collective_load = LoadV(params.tma_load_v, v_pipeline, storage.smem_v); + + auto q_src_dst = q_collective_load.partition_SD(problem_size, load_tile_shape, work_desc); + auto k_src_dst = k_collective_load.partition_SD(problem_size, load_tile_shape, work_desc); + auto v_src_dst = v_collective_load.partition_SD(problem_size, load_tile_shape, work_desc); + + CUTE_NO_UNROLL + for (int blk = 0; blk < num_blocks; ++blk) { + k_collective_load.step(k_src_dst, blk, k_smem_pipe_write, lane_predicate); + q_collective_load.step(q_src_dst, blk, q_smem_pipe_write, lane_predicate); + v_collective_load.step(v_src_dst, blk, v_smem_pipe_write, lane_predicate); + } + } + + template + CUTE_DEVICE void load_beta(Params const& params, ProblemShape const& problem_size, + TileShape const& tile_shape, WorkDesc const& work_desc, + MainloopBetaPipeline& pipeline, BetaPipelineState& smem_pipe_write, + SharedStorage& storage) { + int32_t num_blocks = ceil_div(work_desc.seq_len, get<0>(TileShape{})); + + // fuse post inverse diag(beta) into diagonal of IKK + // auto collective_load = LoadBeta{params.beta_ptr, params.beta_layout, /*oob_value=*/1.0f, + // pipeline, storage.smem_beta}; + auto collective_load = LoadBeta{params.beta_ptr, params.beta_layout, /*oob_value=*/0.0f, + pipeline, storage.smem_beta}; + auto src_dst = collective_load.partition_SD(problem_size, tile_shape, work_desc); + + CUTE_NO_UNROLL + for (int blk = 0; blk < num_blocks - 1; ++blk) { + collective_load.step(src_dst, blk, smem_pipe_write, num_blocks); + } + collective_load.step(src_dst, num_blocks - 1, smem_pipe_write, num_blocks); + } + + template + CUTE_DEVICE void load_alpha(Params const& params, ProblemShape const& problem_size, + 
TileShape const& tile_shape, WorkDesc const& work_desc, + MainloopAlphaPipeline& pipeline, AlphaPipelineState& smem_pipe_write, + SharedStorage& storage) { + int32_t num_blocks = ceil_div(work_desc.seq_len, get<0>(TileShape{})); + + auto collective_load = LoadAlpha{params.alpha_ptr, params.alpha_layout, /*oob_value=*/1.0f, + pipeline, storage.smem_alpha}; + auto src_dst = collective_load.partition_SD(problem_size, tile_shape, work_desc); + + typename LoadAlpha::VectorProcessor processor{params.scale}; + + CUTE_NO_UNROLL + for (int blk = 0; blk < num_blocks - 1; ++blk) { + collective_load.step(src_dst, blk, smem_pipe_write, num_blocks, processor); + } + collective_load.step(src_dst, num_blocks - 1, smem_pipe_write, num_blocks, + processor); + } + + template + CUTE_DEVICE void store(TMA_O const& tma_store, void* tensormaps, ProblemSize const& problem_size, + StoreTileShape const& store_tile_shape, WorkDesc const& work_desc, + MainloopOPipeline& pipeline, PipelineState& smem_pipe_read, + SharedStorageO& storage) { + int32_t num_blocks = ceil_div(work_desc.seq_len, get<0>(TileShape{})); + uint32_t lane_predicate = cute::elect_one_sync(); + + auto collective_store = CollectiveStoreO{tma_store, pipeline, storage, tensormaps}; + auto src_dst = collective_store.partition_SD(problem_size, store_tile_shape, work_desc); + + CUTE_NO_UNROLL + for (int blk = 0; blk < num_blocks; ++blk) { + DPRINTF0_W("O collective_store.step smem_pipe_read:%d -> blk_idx:%d, num_blocks:%d\n", + smem_pipe_read.index(), blk, num_blocks); + collective_store.step(problem_size, work_desc, src_dst, smem_pipe_read, blk, num_blocks, + lane_predicate); + } + } + + template + CUTE_DEVICE void compute( + Params const& params, ProblemShape const& problem_size, WorkDesc const& work_desc, + MainloopQPipeline& q_pipeline, QPipelineState& q_smem_pipe_read, + MainloopKPipeline& k_pipeline, KPipelineState& k_smem_pipe_read, + MainloopVPipeline& v_pipeline, VPipelineState& v_smem_pipe_read, + MainloopOPipeline& 
o_pipeline, OPipelineState& o_smem_pipe_write, + MainloopQKPipeline& qk_pipeline, QKPipelineState& qk_smem_pipe_read, + MainloopKKPipeline& kk_pipeline, KKPipelineState& kk_smem_pipe_read, + MainloopAlphaPipeline& alpha_pipeline, AlphaPipelineState& alpha_smem_pipe_read, + // MainloopBetaPipeline& beta_pipeline, BetaPipelineState& beta_smem_pipe_read, + OrderedMathBarriers& math_barriers, SharedStorage& storage) { + // MAKE NVCC HAPPY! + constexpr auto zero = Element{}; + + int32_t num_blocks = ceil_div(work_desc.seq_len, get<0>(TileShape{})); + DPRINTF0_WG("num_blocks: %d\n", num_blocks); + + int thread_idx = int(threadIdx.x) - NumLoadThreads; + int warpgroup_idx = thread_idx / cutlass::NumThreadsPerWarpGroup; + + float scale = params.scale; + + // Tensor Beta = make_tensor(make_smem_ptr(storage.smem_beta.data()), SmemLayoutBeta{}); + Tensor Alpha = make_tensor(make_smem_ptr(storage.smem_alpha.data()), SmemLayoutAlpha{}); + + Tensor sQqk = make_tensor(make_smem_ptr(storage.smem_q.data()), QKSmemLayoutQ{}); + Tensor sKqk = make_tensor(make_smem_ptr(storage.smem_k.data()), QKSmemLayoutK{}); + Tensor sKkv = make_tensor(make_smem_ptr(storage.smem_k.data()), KVSmemLayoutK{}); + Tensor sVkv = make_tensor(make_smem_ptr(storage.smem_v.data()), KVSmemLayoutV{}); + Tensor sQK = make_tensor(make_smem_ptr(storage.smem_qk.data()), SmemLayoutQK{}); + Tensor sO = make_tensor(make_smem_ptr(storage.smem_o.data()), SmemLayoutO{}); + + static_assert(sizeof(InverseType) == sizeof(Element)); + Tensor sKK_inv = make_tensor(make_smem_ptr(storage.smem_kk.data()), SmemLayoutKK{}); + Tensor sKK_opd = make_tensor(make_smem_ptr(reinterpret_cast(storage.smem_kk.data())), + SmemLayoutKK{}); + + /////////////////////////////////////////////////////////////////////////// + // S@K (-S K^T + V^T) + auto sk_tiled_mma = TiledMmaSK{}; + auto sk_thr_mma = sk_tiled_mma.get_thread_slice(thread_idx); + + auto layout_SKAlpha = flatten(make_layout( // broadcast Alpha vector to SK size + 
make_layout(select<0, 1>(TileShapeSK{}), Stride<_0, _1>{}), // (D, Blk_KV) + select<1, 2>(SmemLayoutAlpha{}) // (Idx, pipe) + )); // (D, Blk_KV, Idx, pipe) + + auto tSKrAlpha = sk_thr_mma.partition_C(Alpha.compose(layout_SKAlpha))( + _, _, _, AlphaCumProdIdx{}, _); // (frag, iter_D, iter_Blk_Q, pipe) + + // tSKrV adds to tSKrSK (acc) + using SK_V_S2R = Copy_Atom; + auto tSKrV_tiled_copy = make_tiled_copy_C(SK_V_S2R{}, sk_tiled_mma); + auto tSKrV_thr_copy = tSKrV_tiled_copy.get_thread_slice(thread_idx); + + Tensor tSKsK = sk_thr_mma.partition_B(sKqk); + Tensor tSKrK = sk_thr_mma.make_fragment_B(tSKsK); + + /////////////////////////////////////////////////////////////////////////// + // NewV = (S@K result) @ T^t + auto newv_tiled_mma = TiledMmaNewV{}; + auto newv_thr_mma = newv_tiled_mma.get_thread_slice(thread_idx); + + Tensor tNewVsB = newv_thr_mma.partition_B(sKK_opd); + Tensor tNewVrB = newv_thr_mma.make_fragment_B(tNewVsB); + + /////////////////////////////////////////////////////////////////////////// + // K@V + auto kv_tiled_mma = TiledMmaKV{}; + auto kv_thr_mma = kv_tiled_mma.get_thread_slice(thread_idx); + + Tensor tKVrKV = partition_fragment_C(kv_thr_mma, select<0, 1>(TileShapeKV{})); + + // Tensor tKVrV = kv_thr_mma.partition_fragment_A(sVkv(_, _, _0{})); // mma src + // Tensor tKVrV_cv = tKVrV_thr_copy.retile_D(tKVrV); // copy view dst + // Tensor tKVsV = tKVrV_thr_copy.partition_S(sVkv); // copy view src + + Tensor tKVsK = kv_thr_mma.partition_B(sKkv); + Tensor tKVrK = kv_thr_mma.make_fragment_B(tKVsK); + + auto const cV = make_identity_tensor(Shape, Int>{}); + Tensor tKVcV = kv_thr_mma.partition_A(cV); + + /////////////////////////////////////////////////////////////////////////// + // Q@K@V + auto o1_tiled_mma = TiledMmaO1{}; + auto o1_thr_mma = o1_tiled_mma.get_thread_slice(thread_idx); + auto o2_tiled_mma = TiledMmaO2{}; + auto o2_thr_mma = o2_tiled_mma.get_thread_slice(thread_idx); + + // A1 for Q@(KV) + // Tensor tOrKV = make_acc_into_op(tKVrKV, 
typename TiledMmaO1::LayoutA_TV{}); + // B1 for Q@(KV) + Tensor tOsQ = o1_thr_mma.partition_B(sQqk); + Tensor tOrQ = o1_thr_mma.make_fragment_B(tOsQ); + + // A2 for QK@V + // Tensor tOsV = o2_thr_mma.partition_A(sVkv); + // Tensor tOrV = o2_thr_mma.make_fragment_A(tOsV); + // B2 for QK@V + Tensor tOsQK = o2_thr_mma.partition_B(sQK); + Tensor tOrQK = o2_thr_mma.make_fragment_B(tOsQK); + + using O_R2S = typename CollectiveStoreO::CopyAtomR2S; + auto tiled_copy_o = make_tiled_copy_C(O_R2S{}, o1_tiled_mma); + auto thr_copy_o = tiled_copy_o.get_thread_slice(thread_idx); + auto tOsO = thr_copy_o.partition_D(sO); + + auto const cO = make_identity_tensor(Shape, Int>{}); + Tensor tOcO = o1_thr_mma.partition_C(cO); + + auto layout_OAlpha = flatten(make_layout( // broadcast Alpha vector to O size + make_layout(select<0, 1>(TileShapeO1{}), Stride<_0, _1>{}), // (D, Blk_Q) + select<1, 2>(SmemLayoutAlpha{}) // (Idx, pipe) + )); // (D, Blk_Q, Idx, pipe) + + auto tOrAlphaScale = o1_thr_mma.partition_C(Alpha.compose(layout_OAlpha))( + _, _, _, AlphaCumProdScaleIdx{}, _); // (frag, iter_D, iter_Blk_Q, pipe) + + auto const seq_idx = work_desc.seq_idx; + auto const q_head_idx = work_desc.q_head_idx(); + auto const k_head_idx = work_desc.k_head_idx(); + auto const v_head_idx = work_desc.v_head_idx(); + + auto sk_epi = [&](auto& tSKrSK, auto const& alpha_smem_pipe_read) INLINE_LAMBDA { + if constexpr (NeedsAlpha) { + transform(tSKrSK, tSKrAlpha(_, _, _, alpha_smem_pipe_read.index()), tSKrSK, + [&](auto sk, auto coeff) { return sk * coeff; }); + } + }; + + auto sk_load_v = [&](int pipe_idx) INLINE_LAMBDA { + Tensor tSKrV = make_fragment_like( + partition_fragment_C(sk_thr_mma, sVkv(_, _, _0{}))); // mma acc + Tensor tSKrV_cv = tSKrV_thr_copy.retile_D(tSKrV); // copy view dst + Tensor tSKsV = tSKrV_thr_copy.partition_S(sVkv); // copy view src + copy(tSKrV_tiled_copy, tSKsV(_, _, _, pipe_idx), tSKrV_cv); + return tSKrV; + }; + + auto kv_decay_v = [&](auto& tKVrV, auto const& 
alpha_smem_pipe_read, auto is_final_block_, + auto B) INLINE_LAMBDA { + constexpr bool is_final_block = decltype(is_final_block_)::value; + if constexpr (NeedsAlpha) { + Tensor Alpha_cumsum_log = Alpha(_, AlphaCumSumLogIdx{}, alpha_smem_pipe_read.index()); + float block_coeff_log = Alpha_cumsum_log(B - 1); + cute::transform(tKVrV, tKVcV, tKVrV, [&](auto val, auto coord) { + auto tok = get<1>(coord); + float coeff = [&] { + if constexpr (!is_final_block) { + return exp2f(block_coeff_log - Alpha_cumsum_log(tok)); + } else { + return tok < B ? exp2f(block_coeff_log - Alpha_cumsum_log(tok)) : 0.0f; + } + }(); + return decltype(val)(val * coeff); + }); + } + if constexpr (is_final_block) { + if constexpr (!NeedsAlpha) { + cute::transform(tKVrV, tKVcV, tKVrV, [&](auto val, auto coord) { + auto tok = get<1>(coord); + return tok < B ? val : zero; // mask v of tail oob values + }); + } + } + }; + + auto kv_load = [&](auto& tKVrKV) INLINE_LAMBDA { + DPRINTF0_WG("[%d,%d,%d,%d]>> load tKVgKV -> tKVrKV\n", seq_idx, q_head_idx, k_head_idx, + v_head_idx); + int num_state_heads = problem_size.num_sab_heads; + int state_head_idx = work_desc.o_head_idx(); + auto gKV = make_tensor(make_gmem_ptr(params.ptr_input_state), + make_layout(make_shape(Int{}, Int{}, + num_state_heads, problem_size.num_seqs)))( + _, _, state_head_idx, seq_idx); // (KDim, VDim), K-contiguous + + auto tiled_copy_kv = + make_tiled_copy_C(Copy_Atom{}, kv_tiled_mma); + auto thr_copy_kv = tiled_copy_kv.get_thread_slice(thread_idx); + + auto tKVgKV = thr_copy_kv.partition_S(select_tensor<1, 0>(gKV)); + copy(tiled_copy_kv, tKVgKV, tKVrKV); + }; + + auto kv_store = [&]() INLINE_LAMBDA { // tKVrKV is carried over whole mainloop + DPRINTF0_WG("[%d,%d,%d,%d]>> save tKVrKV -> tKVgKV\n", seq_idx, q_head_idx, k_head_idx, + v_head_idx); + int num_state_heads = problem_size.num_sab_heads; + int state_head_idx = work_desc.o_head_idx(); // num_o_heads == num_sab_heads + auto gKV = 
make_tensor(make_gmem_ptr(params.ptr_output_state), + make_layout(make_shape(Int{}, Int{}, + num_state_heads, problem_size.num_seqs)))( + _, _, state_head_idx, seq_idx); // (KDim, VDim), K-contiguous + + auto tiled_copy_kv = + make_tiled_copy_C(Copy_Atom{}, kv_tiled_mma); + auto thr_copy_kv = tiled_copy_kv.get_thread_slice(thread_idx); + + auto tKVgKV = thr_copy_kv.partition_D(select_tensor<1, 0>(gKV)); + copy(tiled_copy_kv, tKVrKV, tKVgKV); + }; + + auto o1_epi = [&](auto& tOrO1, auto const& alpha_smem_pipe_read) INLINE_LAMBDA { + if constexpr (NeedsAlpha) { + auto tOrAlphaScale_ = tOrAlphaScale(_, _, _, alpha_smem_pipe_read.index()); + CUTE_UNROLL + for (int i = 0; i < size(tOrO1); ++i) { + tOrO1(i) = tOrAlphaScale_(i) * tOrO1(i); + } + } else { + CUTE_UNROLL + for (int i = 0; i < size(tOrO1); ++i) { + tOrO1(i) = scale * tOrO1(i); + } + } + }; + + auto o_store = [&](auto tOrO) INLINE_LAMBDA { + auto tOrO_cvt = make_fragment_like(tOrO); + copy(tOrO, tOrO_cvt); + + DPRINTF0_WG("compute: o_pipeline.producer_wait: smem_pipe_write:%d\n", + o_smem_pipe_write.index()); + o_pipeline.producer_acquire(o_smem_pipe_write); + Tensor tOrO_cvt_cv = thr_copy_o.retile_S(tOrO_cvt); + cutlass::arch::fence_view_async_shared(); + copy(tiled_copy_o, tOrO_cvt_cv, tOsO(_, _, _, o_smem_pipe_write.index())); + cutlass::arch::fence_view_async_shared(); + o_pipeline.producer_commit(o_smem_pipe_write); + ++o_smem_pipe_write; + }; + + auto compute_loop_body = [&](int blk, auto is_first_block_, + auto is_final_block_) INLINE_LAMBDA { + constexpr bool is_first_block = decltype(is_first_block_)::value; + constexpr bool is_final_block = decltype(is_final_block_)::value; + int B = is_final_block ? 
valid_seq_len(work_desc, blk) : BlkSeqKV; + + // 2.1 Q @ KV, NOTE: use old KV here + DPRINTF0_WG("compute: q_pipeline.consumer_wait: smem_pipe_read:%d\n", + q_smem_pipe_read.index()); + q_pipeline.consumer_wait(q_smem_pipe_read); + if constexpr (NeedsAlpha) { + alpha_pipeline.consumer_wait(alpha_smem_pipe_read); + } + + DPRINTF0_WG("[%d,%d,%d,%d]** dispatch O WGMMA\n", seq_idx, q_head_idx, k_head_idx, + v_head_idx); + auto tOrO = partition_fragment_C(o1_thr_mma, select<0, 1>(TileShapeO1{})); + if constexpr (is_first_block) { + DPRINTF0_WG("compute: q_pipeline.consumer_release: smem_pipe_read:%d\n", + q_smem_pipe_read.index()); + q_pipeline.consumer_release(q_smem_pipe_read); + ++q_smem_pipe_read; + } else { + Tensor tOrKV = make_acc_into_op(tKVrKV, typename TiledMmaO1::LayoutA_TV{}); + warpgroup_fence_operand(tOrKV); + warpgroup_fence_operand(tOrO); + math_barriers.ordered_or_wait(warpgroup_idx); + warpgroup_arrive(); + gemm_zero_acc(o1_thr_mma, tOrKV, tOrQ(_, _, _, q_smem_pipe_read.index()), tOrO); + warpgroup_commit_batch(); // q@kv batch + math_barriers.notify_next_blocked(warpgroup_idx); + } + if constexpr (!is_first_block) { + warpgroup_wait<0>(); // q@kv batch + DPRINTF0_WG("compute: q_pipeline.consumer_release: smem_pipe_read:%d\n", + q_smem_pipe_read.index()); + q_pipeline.consumer_release(q_smem_pipe_read); + ++q_smem_pipe_read; + o1_epi(tOrO, alpha_smem_pipe_read); + } + + DPRINTF0_WG("compute: k_pipeline.consumer_wait: smem_pipe_read:%d\n", + k_smem_pipe_read.index()); + k_pipeline.consumer_wait(k_smem_pipe_read); + + auto tSKrSK = partition_fragment_C(sk_thr_mma, sVkv(_, _, _0{})); + if constexpr (!is_first_block) { + auto tSKrS = make_acc_into_op(tKVrKV, typename TiledMmaSK::LayoutA_TV{}); + warpgroup_fence_operand(tSKrSK); + warpgroup_fence_operand(tSKrS); + math_barriers.ordered_or_wait(warpgroup_idx); + warpgroup_arrive(); + gemm_zero_acc(sk_tiled_mma, tSKrS, tSKrK(_, _, _, k_smem_pipe_read.index()), tSKrSK); + warpgroup_commit_batch(); + 
math_barriers.notify_next_blocked(warpgroup_idx); + warpgroup_wait<0>(); + } + + DPRINTF0_WG("compute: v_pipeline.consumer_wait: smem_pipe_read:%d\n", + v_smem_pipe_read.index()); + v_pipeline.consumer_wait(v_smem_pipe_read); + auto tSKrV = sk_load_v(v_smem_pipe_read.index()); + if constexpr (!is_first_block) { + sk_epi(tSKrSK, alpha_smem_pipe_read); + transform(tSKrV, tSKrSK, tSKrV, [](auto v, auto sk) { return v - Element(sk); }); + } + + kk_pipeline.consumer_wait(kk_smem_pipe_read); + auto tNewVrA = make_acc_into_op(tSKrV, typename TiledMmaNewV::LayoutA_TV{}); + auto tNewVrC = partition_fragment_C(newv_thr_mma, select<0, 1>(TileShapeNewV{})); + warpgroup_fence_operand(tNewVrA); + warpgroup_fence_operand(tNewVrC); + math_barriers.ordered_or_wait(warpgroup_idx); + warpgroup_arrive(); + gemm_zero_acc(o1_thr_mma, tNewVrA, tNewVrB(_, _, _, kk_smem_pipe_read.index()), tNewVrC); + warpgroup_commit_batch(); // new_v batch + math_barriers.notify_next_blocked(warpgroup_idx); + warpgroup_wait<0>(); // new_v batch + DPRINTF0_WG("compute: v_pipeline.consumer_release: smem_pipe_read:%d\n", + v_smem_pipe_read.index()); + ++v_smem_pipe_read; // NOTE: if we delay this increment after consumer_release, race + // condition happens, why? + v_pipeline.consumer_release(v_smem_pipe_read); + + kk_pipeline.consumer_release(kk_smem_pipe_read); + ++kk_smem_pipe_read; + + ///////////////////////////////////////////////////////////////////////// + // 2. 
compute qkv + // 2.2 QK @ V, NOTE: use old KV here and QK is scaled + qk_pipeline.consumer_wait(qk_smem_pipe_read); + auto tOrV_or_tKVrV = make_acc_into_op(tNewVrC, typename TiledMmaKV::LayoutA_TV{}); + warpgroup_fence_operand(tOrV_or_tKVrV); + warpgroup_fence_operand(tOrO); + math_barriers.ordered_or_wait(warpgroup_idx); + warpgroup_arrive(); + if constexpr (is_first_block) { + gemm_zero_acc(o2_tiled_mma, tOrV_or_tKVrV, tOrQK(_, _, _, qk_smem_pipe_read.index()), tOrO); + } else { + gemm(o2_tiled_mma, tOrV_or_tKVrV, tOrQK(_, _, _, qk_smem_pipe_read.index()), tOrO); + } + warpgroup_commit_batch(); // qk@v batch + math_barriers.notify_next_blocked(warpgroup_idx); + warpgroup_wait<0>(); // qk@v batch + qk_pipeline.consumer_release(qk_smem_pipe_read); + ++qk_smem_pipe_read; + o_store(tOrO); + + ///////////////////////////////////////////////////////////////////////// + // 3. update KV + float block_coeff = 1.0f; + if constexpr (NeedsAlpha) { + block_coeff = Alpha(B - 1, AlphaCumProdIdx{}, alpha_smem_pipe_read.index()); + } + + cute::transform(tKVrKV, [&](auto kv) { return block_coeff * kv; }); + kv_decay_v(tOrV_or_tKVrV, alpha_smem_pipe_read, is_final_block_, B); + + DPRINTF0_WG("[%d,%d,%d,%d]** dispatch KV WGMMA\n", seq_idx, q_head_idx, k_head_idx, + v_head_idx); + warpgroup_fence_operand(tOrV_or_tKVrV); + warpgroup_fence_operand(tKVrKV); + math_barriers.ordered_or_wait(warpgroup_idx); + warpgroup_arrive(); + gemm(kv_tiled_mma, tOrV_or_tKVrV, tKVrK(_, _, _, k_smem_pipe_read.index()), tKVrKV); + warpgroup_commit_batch(); // k@v batch + math_barriers.notify_next_blocked(warpgroup_idx); + warpgroup_wait<0>(); + + DPRINTF0_WG("compute: k_pipeline.consumer_release: smem_pipe_read:%d\n", + k_smem_pipe_read.index()); + k_pipeline.consumer_release(k_smem_pipe_read); + ++k_smem_pipe_read; + + if constexpr (NeedsAlpha) { + alpha_pipeline.consumer_release(alpha_smem_pipe_read); + ++alpha_smem_pipe_read; + } + }; + + if constexpr (!kInitStateFromInput) { + clear(tKVrKV); + 
compute_loop_body(0, /*is_first_block_=*/cute::true_type{}, + /*is_final_block_=*/cute::true_type{}); + } else { + kv_load(tKVrKV); + compute_loop_body(0, /*is_first_block_=*/cute::false_type{}, + /*is_final_block_=*/cute::true_type{}); + } + CUTE_NO_UNROLL + for (int blk = 1; blk < num_blocks - 1; ++blk) { + compute_loop_body(blk, /*is_first_block_=*/cute::false_type{}, + /*is_final_block_=*/cute::false_type{}); + } + if (num_blocks != 1) { + compute_loop_body(num_blocks - 1, /*is_first_block_=*/cute::false_type{}, + /*is_final_block_=*/cute::true_type{}); + } + kv_store(); + } + + template + CUTE_DEVICE void compute_aux(Params const& params, ProblemShape const& problem_size, + WorkDesc const& work_desc, MainloopQPipeline& q_pipeline, + QPipelineState& q_smem_pipe_read, MainloopKPipeline& k_pipeline, + KPipelineState& k_smem_pipe_read, MainloopQKPipeline& qk_pipeline, + QKPipelineState& qk_smem_pipe_write, MainloopKKPipeline& kk_pipeline, + KKPipelineState& kk_smem_pipe_write, + MainloopAlphaPipeline& alpha_pipeline, + AlphaPipelineState& alpha_smem_pipe_read, + MainloopBetaPipeline& beta_pipeline, + BetaPipelineState& beta_smem_pipe_read, SharedStorage& storage) { + int thread_idx = threadIdx.x % cutlass::NumThreadsPerWarpGroup; + + float scale = params.scale; + + Tensor Beta = make_tensor(make_smem_ptr(storage.smem_beta.data()), SmemLayoutBeta{}); + Tensor Alpha = make_tensor(make_smem_ptr(storage.smem_alpha.data()), SmemLayoutAlpha{}); + + Tensor sQqk = make_tensor(make_smem_ptr(storage.smem_q.data()), QKSmemLayoutQ{}); + Tensor sKqk = make_tensor(make_smem_ptr(storage.smem_k.data()), QKSmemLayoutK{}); + Tensor sKkv = make_tensor(make_smem_ptr(storage.smem_k.data()), KVSmemLayoutK{}); + Tensor sVkv = make_tensor(make_smem_ptr(storage.smem_v.data()), KVSmemLayoutV{}); + Tensor sQK = make_tensor(make_smem_ptr(storage.smem_qk.data()), SmemLayoutQK{}); + Tensor sO = make_tensor(make_smem_ptr(storage.smem_o.data()), SmemLayoutO{}); + + 
static_assert(sizeof(InverseType) == sizeof(Element)); + Tensor sKK_inv = make_tensor(make_smem_ptr(storage.smem_kk.data()), SmemLayoutKK{}); + Tensor sKK_opd = make_tensor(make_smem_ptr(reinterpret_cast(storage.smem_kk.data())), + SmemLayoutKK{}); + + /////////////////////////////////////////////////////////////////////////// + // Q@K + auto qk_tiled_mma = TiledMmaQK{}; + auto qk_thr_mma = qk_tiled_mma.get_thread_slice(thread_idx); + + Tensor tQKsQ = qk_thr_mma.partition_A(sQqk); + Tensor tQKsK = qk_thr_mma.partition_B(sKqk); + Tensor tQKrQ = qk_thr_mma.make_fragment_A(tQKsQ); + Tensor tQKrK = qk_thr_mma.make_fragment_B(tQKsK); + + auto cMqk = make_identity_tensor(select<0, 1>(TileShapeQK{})); // (QTok, KTok) + auto tQKcMqk = qk_thr_mma.partition_C(cMqk); // (idx) -> (tok_q, tok_k) + + /////////////////////////////////////////////////////////////////////////// + // K@K (basically I + strict_lower_triangular(K K^T) + auto kk_tiled_mma = TiledMmaKK{}; + auto kk_thr_mma = kk_tiled_mma.get_thread_slice(thread_idx); + Tensor tKKsK = kk_thr_mma.partition_B(sKqk); + Tensor tKKrA = kk_thr_mma.make_fragment_A(tKKsK); + Tensor tKKrB = kk_thr_mma.make_fragment_B(tKKsK); + + auto const& cMkk = cMqk; + auto tKKcMkk = kk_thr_mma.partition_C(cMkk); + + auto const seq_idx = work_desc.seq_idx; + auto const q_head_idx = work_desc.q_head_idx(); + auto const k_head_idx = work_desc.k_head_idx(); + auto const v_head_idx = work_desc.v_head_idx(); + + auto qk_and_kk_epi = [&](auto& tQKrQK, auto& tKKrKK, auto const& alpha_smem_pipe_read, + auto const& beta_smem_pipe_read, auto is_final_block_, + auto B /*valid seqlen*/) { + if constexpr (NeedsAlpha) { + Tensor Alpha_cumsum_log = Alpha(_, AlphaCumSumLogIdx{}, alpha_smem_pipe_read.index()); + for_each(make_int_sequence{}, [&](auto i) { + auto coord = tQKcMqk(i); + auto [s, t] = coord; + float alpha = exp2f(Alpha_cumsum_log(s) - Alpha_cumsum_log(t)); + tQKrQK(i) *= alpha * scale; + tKKrKK(i) *= alpha; + }); + } else { + transform(tQKrQK, 
[scale](auto v) { return v * scale; }); + } + + if constexpr (NeedsBeta) { + Tensor Beta_ = Beta(_, beta_smem_pipe_read.index()); + for_each(make_int_sequence{}, [&](auto i) { + auto coord = tQKcMqk(i); + auto [s, t] = coord; + tKKrKK(i) *= Beta_(s); + }); + } + + constexpr bool is_final_block = decltype(is_final_block_)::value; + for_each(make_int_sequence{}, [&](auto i) { + auto coord = tQKcMqk(i); + auto [s, t] = coord; + bool pred = s >= t; + tQKrQK(i) = pred ? tQKrQK(i) : 0.0f; + tKKrKK(i) = + pred ? tKKrKK(i) : 0.0f; // diagonal is garbage filled, will process during inversion + if constexpr (is_final_block) { + bool pred = s < B || t < B; + tQKrQK(i) = pred ? tQKrQK(i) : 0.0f; + tKKrKK(i) = pred ? tKKrKK(i) : 0.0f; + } + }); + }; + + auto qk_store = [&](auto tQKrQK, auto const& qk_smem_pipe_write) { + auto sQK_pipe_slice = sQK(_, _, qk_smem_pipe_write.index()); + + static_assert(sizeof(Element) == 2); + using CopyOpR2S = SM90_U32x4_STSM_N; + auto tiled_copy_qk = make_tiled_copy_C(Copy_Atom{}, qk_tiled_mma); + auto thr_copy_qk = tiled_copy_qk.get_thread_slice(thread_idx); + auto tQKsQK = thr_copy_qk.partition_D(sQK_pipe_slice); + auto tQKrQK_cv = thr_copy_qk.retile_S(tQKrQK); + auto tQKrQK_cvt_cv = make_fragment_like(tQKrQK_cv); + cute::transform(tQKrQK_cv, tQKrQK_cvt_cv, [](auto v) { return Element(v); }); + copy(tiled_copy_qk, tQKrQK_cvt_cv, tQKsQK); + }; + + auto kk_store_and_inv = [&](auto tKKrKK, auto const& kk_smem_pipe_write) INLINE_LAMBDA { + auto sKK_inv_pipe_slice = sKK_inv(_, _, kk_smem_pipe_write.index()); + + static_assert(sizeof(Element) == 2); + using CopyOpR2S = SM90_U32x4_STSM_N; + auto tiled_store_kk = make_tiled_copy_C(Copy_Atom{}, kk_tiled_mma); + auto thr_store_kk = tiled_store_kk.get_thread_slice(thread_idx); + auto tKKsKK = thr_store_kk.partition_D(sKK_inv_pipe_slice); + auto tKKrKK_cv = thr_store_kk.retile_S(tKKrKK); + auto tKKrKK_cvt_cv = make_fragment_like(tKKrKK_cv); + cute::transform(tKKrKK_cv, tKKrKK_cvt_cv, [](auto v) { return 
InverseType(v); }); + copy(tiled_store_kk, tKKrKK_cvt_cv, tKKsKK); + + cutlass::arch::NamedBarrier::arrive_and_wait(cutlass::NumThreadsPerWarpGroup, + DeltaRuleNamedBarriers::AuxMath); + + auto collective_inverse = CollectiveInverse(DeltaRuleNamedBarriers::AuxMath); + collective_inverse.compute(sKK_inv_pipe_slice); + + // FIXME: we can ignore core matrices above diagonal + if constexpr (NeedsBeta || !std::is_same_v) { + cutlass::arch::NamedBarrier::arrive_and_wait(cutlass::NumThreadsPerWarpGroup, + DeltaRuleNamedBarriers::AuxMath); + using CopyOpS2R = SM75_U32x4_LDSM_N; + auto tiled_load_kk = make_tiled_copy_C(Copy_Atom{}, kk_tiled_mma); + auto thr_load_kk = tiled_load_kk.get_thread_slice(thread_idx); + auto tKKrKK_cpy = make_fragment_like(tKKrKK_cvt_cv); + auto tKKrKK_cvt = make_fragment_like(tKKrKK_cvt_cv); + auto tKKcMkk_cv = thr_load_kk.retile_D(tKKcMkk); + copy(tiled_load_kk, thr_load_kk.partition_S(sKK_inv_pipe_slice), tKKrKK_cpy); + cute::transform(tKKrKK_cpy, tKKcMkk_cv, tKKrKK_cvt, [&](auto val, auto coord) { + auto [_, t] = coord; + if constexpr (NeedsBeta) { + return Element(float(val) * Beta(t, beta_smem_pipe_read.index())); + } else { + return Element(val); + } + }); + copy(tiled_store_kk, tKKrKK_cvt, recast(tKKsKK)); + } + }; + + auto compute_aux_loop_body = [&](int blk, auto is_final_block_) INLINE_LAMBDA { + constexpr bool is_final_block = decltype(is_final_block_)::value; + + int B = is_final_block ? 
valid_seq_len(work_desc, blk) : BlkSeqKV; + + Tensor tKKrKK = partition_fragment_C(TiledMmaKK{}, select<0, 1>(TileShapeKK{})); + Tensor tQKrQK = partition_fragment_C(TiledMmaQK{}, select<0, 1>(TileShapeQK{})); + + k_pipeline.consumer_wait(k_smem_pipe_read); + DPRINTF0_WG("[%d,%d,%d,%d]** dispatch KK WGMMA\n", seq_idx, q_head_idx, k_head_idx, + v_head_idx); + warpgroup_fence_operand(tKKrKK); + warpgroup_arrive(); + gemm_zero_acc(kk_tiled_mma, tKKrA(_, _, _, k_smem_pipe_read.index()), + tKKrB(_, _, _, k_smem_pipe_read.index()), tKKrKK); + warpgroup_commit_batch(); // K@Kt batch + + q_pipeline.consumer_wait(q_smem_pipe_read); + DPRINTF0_WG("[%d,%d,%d,%d]** dispatch QK WGMMA\n", seq_idx, q_head_idx, k_head_idx, + v_head_idx); + warpgroup_fence_operand(tQKrQK); + warpgroup_arrive(); + gemm_zero_acc(qk_tiled_mma, tQKrQ(_, _, _, q_smem_pipe_read.index()), + tQKrK(_, _, _, k_smem_pipe_read.index()), tQKrQK); + warpgroup_commit_batch(); // Q@Kt batch + + // K@Kt and Q@Kt batch finished, we fused masking logic for qk and kk so wait for all of them + warpgroup_wait<0>(); + + k_pipeline.consumer_release(k_smem_pipe_read); + ++k_smem_pipe_read; + q_pipeline.consumer_release(q_smem_pipe_read); + ++q_smem_pipe_read; + + if constexpr (NeedsAlpha) { + alpha_pipeline.consumer_wait(alpha_smem_pipe_read); + } + if constexpr (NeedsBeta) { + beta_pipeline.consumer_wait(beta_smem_pipe_read); + } + cutlass::arch::fence_view_async_shared(); + + qk_and_kk_epi(tQKrQK, tKKrKK, alpha_smem_pipe_read, beta_smem_pipe_read, is_final_block_, B); + + kk_pipeline.producer_acquire(kk_smem_pipe_write); + kk_store_and_inv(tKKrKK, kk_smem_pipe_write); + cutlass::arch::fence_view_async_shared(); + kk_pipeline.producer_commit(kk_smem_pipe_write); + ++kk_smem_pipe_write; + + qk_pipeline.producer_acquire(qk_smem_pipe_write); + qk_store(tQKrQK, qk_smem_pipe_write); + cutlass::arch::fence_view_async_shared(); + qk_pipeline.producer_commit(qk_smem_pipe_write); + ++qk_smem_pipe_write; + + if constexpr 
(NeedsAlpha) {
+      alpha_pipeline.consumer_release(alpha_smem_pipe_read);
+      ++alpha_smem_pipe_read;
+    }
+    if constexpr (NeedsBeta) {
+      beta_pipeline.consumer_release(beta_smem_pipe_read);
+      ++beta_smem_pipe_read;
+    }
+  };
+
+  // Main block loop: all blocks except the last run with compile-time
+  // cute::false_type (no tail handling); the final block runs with
+  // cute::true_type so the epilogue can mask positions past the valid
+  // sequence length (see the is_final_block branch in qk_and_kk_epi).
+  int32_t num_blocks = ceil_div(work_desc.seq_len, get<0>(TileShape{}));
+  CUTE_NO_UNROLL
+  for (int blk = 0; blk < num_blocks - 1; ++blk) {
+    compute_aux_loop_body(blk, /*is_final_block_=*/cute::false_type{});
+  }
+  compute_aux_loop_body(num_blocks - 1, /*is_final_block_=*/cute::true_type{});
+  }
+
+  // Number of valid tokens in KV block `blk_idx`: full blocks contribute
+  // BlkSeqKV tokens; the tail block contributes only the remainder.
+  // Equivalent to min(work_desc.seq_len - BlkSeqKV * blk_idx, BlkSeqKV).
+  // NOTE(review): the template parameter list appears to have been lost in
+  // extraction (presumably the WorkDesc type) -- confirm against upstream.
+  template
+  CUTE_DEVICE int valid_seq_len(WorkDesc work_desc, int blk_idx) {
+    int remain_len = work_desc.seq_len - BlkSeqKV * blk_idx;
+    return remain_len <= BlkSeqKV ? remain_len : BlkSeqKV;
+  }
+};
+
+} // namespace flat::collective
diff --git a/csrc/flat/hopper/collective/flat_common.hpp b/csrc/flat/hopper/collective/flat_common.hpp
new file mode 100644
index 0000000000..df3f66ce54
--- /dev/null
+++ b/csrc/flat/hopper/collective/flat_common.hpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2025 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cute/tensor.hpp"
+#include "cutlass/kernel_hardware_info.h"
+
+namespace flat::collective {
+
+using namespace cute;
+
+// Run a K-blocked GEMM with the given MMA atom, accumulating every k-block
+// into tC under whatever accumulation mode `atom` currently carries. After
+// each cute::gemm call the atom is switched to GMMA::ScaleOut::One so the
+// remaining k-blocks add onto the accumulator rather than overwrite it.
+// Two operand shapes are supported, selected at compile time by rank:
+//   - rank(tA)==rank(tB)==2, rank(tC)==1: atom-level fragments, k at mode 1;
+//   - rank(tA)==rank(tB)==rank(tC)==3: thread-partitioned fragments,
+//     k at mode 2.
+// NOTE(review): the template parameter list appears to have been lost in
+// extraction (presumably the Atom/TA/TB/TC types) -- confirm against
+// upstream.
+template
+CUTE_DEVICE void gemm_reset_zero_acc(Atom& atom, TA const& tA, TB const& tB, TC&& tC) {
+  constexpr int rA = decltype(rank(tA))::value;
+  constexpr int rB = decltype(rank(tB))::value;
+  constexpr int rC = decltype(rank(tC))::value;
+  if constexpr (rA == 2 && rB == 2 && rC == 1) {
+    CUTE_UNROLL
+    for (int k_block = 0; k_block < size<1>(tA); k_block++) {
+      cute::gemm(atom, tA(_, k_block), tB(_, k_block), tC);
+      // Subsequent k-blocks must accumulate, not overwrite.
+      atom.accumulate_ = GMMA::ScaleOut::One;
+    }
+  } else {
+    static_assert(rA == 3 && rB == 3 && rC == 3);
+    CUTE_UNROLL
+    for (int k_block = 0; k_block < size<2>(tA); k_block++) {
+      cute::gemm(atom, tA(_, _, k_block), tB(_, _, k_block), tC);
+      // Subsequent k-blocks must accumulate, not overwrite.
+      atom.accumulate_ = GMMA::ScaleOut::One;
+    }
+  }
+}
+
+// Same K-blocked GEMM, but with a zero-initialized accumulator: the atom is
+// set to GMMA::ScaleOut::Zero for the first k-block, after which
+// gemm_reset_zero_acc flips it to ScaleOut::One for the remaining blocks.
+// NOTE(review): template parameter list stripped in extraction -- confirm
+// against upstream.
+template
+CUTE_DEVICE void gemm_zero_acc(Atom& atom, TA const& tA, TB const& tB, TC&& tC) {
+  atom.accumulate_ = GMMA::ScaleOut::Zero;
+  gemm_reset_zero_acc(atom, tA, tB, tC);
+}
+
+template