flashinfer-ai · djmmoss · Nov 13, 2025 · Nov 13, 2025 · Nov 14, 2025 · Nov 17, 2025
@@ -51,6 +51,7 @@
     _check_pos_encoding_mode,
     check_shape_dtype_device,
     _get_cache_alibi_slopes_buf,
+    _get_sink_buf,
     _get_cache_buf,
     _get_range_buf,
     _unpack_paged_kv_cache,
@@ -242,6 +243,7 @@ def run_batch_decode(
         window_left: int,
         enable_pdl: bool,
         alibi_slopes: Optional[torch.Tensor],
+        maybe_s_aux: Optional[torch.Tensor],
         logits_soft_cap: float,
         sm_scale: float,
         rope_scale: float,
@@ -263,6 +265,7 @@ def run_batch_decode(
             window_left,
             enable_pdl,
             alibi_slopes,
+            maybe_s_aux,
             logits_soft_cap,
             sm_scale,
             1.0 / rope_scale,  # rope_rcp_scale
@@ -286,6 +289,7 @@ def _fake_run_batch_decode(
         window_left: int,
         enable_pdl: bool,
         alibi_slopes: Optional[torch.Tensor],
+        maybe_s_aux: Optional[torch.Tensor],
         logits_soft_cap: float,
         sm_scale: float,
         rope_scale: float,
@@ -384,6 +388,7 @@ def single_decode_with_kv_cache(
     rope_scale: Optional[float] = None,
     rope_theta: Optional[float] = None,
     return_lse: Literal[True] = True,
+    sinks: Optional[torch.Tensor] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
 
@@ -403,6 +408,7 @@ def single_decode_with_kv_cache(
     rope_scale: Optional[float] = None,
     rope_theta: Optional[float] = None,
     return_lse: bool = False,
+    sinks: Optional[torch.Tensor] = None,
 ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
     r"""Decode attention with KV Cache for single request, return attention output.
 
@@ -529,6 +535,7 @@ def single_decode_with_kv_cache(
             window_left,
             None,  # packed_custom_mask
             _get_cache_alibi_slopes_buf(num_qo_heads, q.device),
+            sinks,  # maybe_s_aux
             logits_soft_cap,
             sm_scale,
             None,  # scale_q, not supported yet
@@ -1330,7 +1337,7 @@ def run(
                     self._kv_lens_buffer,
                     page_size,
                     self._max_kv_len,
-                    sinks,
+                    _get_sink_buf(sinks),
                 ]
 
             self._cached_module.paged_run(*run_args)
@@ -1364,6 +1371,7 @@ def run(
             else:
                 run_args += [
                     _get_cache_alibi_slopes_buf(q.shape[1], q.device),
+                    _get_sink_buf(sinks),
                     logits_soft_cap,
                     sm_scale,
                     rope_scale,

@@ -467,8 +467,8 @@ def gen_single_decode_module(
         dtype_o,
         head_dim_qk,
         head_dim_vo,
-        ["maybe_alibi_slopes"],  # additional_tensor_names
-        ["float"],  # additional_tensor_dtypes
+        ["maybe_alibi_slopes", "maybe_s_aux"],  # additional_tensor_names
+        ["float", "float"],  # additional_tensor_dtypes
         [
             "logits_soft_cap",
             "sm_scale",
@@ -516,8 +516,12 @@ def gen_single_prefill_module(
 
     if backend == "fa2":
         assert not fp8_enabled, "fp8 tensor core is not supported in fa2 backend"
-        additional_tensor_names = ["maybe_custom_mask", "maybe_alibi_slopes"]
-        additional_tensor_dtypes = ["uint8_t", "float"]
+        additional_tensor_names = [
+            "maybe_custom_mask",
+            "maybe_alibi_slopes",
+            "maybe_s_aux",
+        ]
+        additional_tensor_dtypes = ["uint8_t", "float", "float"]
         additional_scalar_names = [
             "logits_soft_cap",
             "sm_scale",
@@ -760,8 +764,8 @@ def gen_batch_decode_module(
         dtype_idx,
         head_dim_qk,
         head_dim_vo,
-        ["maybe_alibi_slopes"],  # additional_tensor_names
-        ["float"],  # additional_tensor_dtypes
+        ["maybe_alibi_slopes", "maybe_s_aux"],  # additional_tensor_names
+        ["float", "float"],  # additional_tensor_dtypes
         [
             "logits_soft_cap",
             "sm_scale",

@@ -277,6 +277,7 @@ def run_single_prefill(
         window_left: int,
         maybe_packed_custom_mask: Optional[torch.Tensor],
         maybe_alibi_slopes: Optional[torch.Tensor],
+        maybe_s_aux: Optional[torch.Tensor],
         logits_soft_cap: float,
         sm_scale: float,
         scale_q: Optional[torch.Tensor],
@@ -330,6 +331,7 @@ def run_single_prefill(
                 window_left,
                 maybe_packed_custom_mask,
                 maybe_alibi_slopes,
+                maybe_s_aux,
                 logits_soft_cap,
                 sm_scale,
                 1.0 / rope_scale,  # rope_rcp_scale
@@ -350,6 +352,7 @@ def _fake_run_single_prefill(
         window_left: int,
         maybe_packed_custom_mask: Optional[torch.Tensor],
         maybe_alibi_slopes: Optional[torch.Tensor],
+        maybe_s_aux: Optional[torch.Tensor],
         logits_soft_cap: float,
         sm_scale: float,
         rope_scale: float,

@@ -237,6 +237,23 @@ def _get_cache_alibi_slopes_buf(
     return buf
 
 
+def _get_sink_buf(
+    sinks: Optional[torch.Tensor],
+) -> Optional[torch.Tensor]:
+    """Normalize the optional attention-sink tensor to contiguous float32.
+
+    Args:
+        sinks: Optional sink values, presumably one per qo head ([num_qo_heads]).
+
+    Returns:
+        None if sinks is None, else a contiguous float32 tensor (device unchanged).
+    """
+    if sinks is None:
+        return None
+    # Consumers read this through a raw `float*` (e.g. maybe_s_aux): force float32 + dense layout.
+    return sinks.to(torch.float32).contiguous()
+
+
 def canonicalize_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
     if isinstance(dtype, str):
         return getattr(torch, dtype)

@@ -355,6 +355,14 @@ __global__ void SingleDecodeWithKVCacheKernel(const __grid_constant__ Params par
   // sync local state of all warps inside a threadblock
   sync_state<vec_size, bdx, bdy, bdz>(variant, st_local, reinterpret_cast<float*>(smem), smem_md,
                                       tx, ty, tz);
+  // Add s_aux (learnable sink) contribution to softmax denominator after all tiles processed
+  if constexpr (variant.use_softmax) {
+    if (params.maybe_s_aux != nullptr) {
+      constexpr float LOG2_E = 1.4426950408889634f;  // log2(e)
+      float s_aux_val = params.maybe_s_aux[qo_head_idx];
+      st_local.d += math::ptx_exp2((s_aux_val - st_local.m) * LOG2_E);
+    }
+  }
-  // Add s_aux (learnable sink) contribution to softmax denominator after all tiles processed
-  if constexpr (variant.use_softmax) {
-    if (params.maybe_s_aux != nullptr) {
-      constexpr float LOG2_E = 1.4426950408889634f;  // log2(e)
-      float s_aux_val = params.maybe_s_aux[qo_head_idx];
-      st_local.d += math::ptx_exp2((s_aux_val - st_local.m) * LOG2_E);
-    }
-  }
+  // Add s_aux (learnable sink) contribution to softmax denominator after all tiles processed
+  if constexpr (variant.use_softmax) {
+    if (params.maybe_s_aux != nullptr) {
+      float s_aux_scaled = params.maybe_s_aux[qo_head_idx] * variant.sm_scale_log2;
+      st_local.d += math::ptx_exp2(s_aux_scaled - st_local.m);
+    }
+  }
-  // Add s_aux (learnable sink) contribution to softmax denominator after all tiles processed
-  if constexpr (variant.use_softmax) {
-    if (params.maybe_s_aux != nullptr) {
-      constexpr float LOG2_E = 1.4426950408889634f;  // log2(e)
-      float s_aux_val = params.maybe_s_aux[qo_head_idx];
-      st_local.d += math::ptx_exp2((s_aux_val - st_local.m) * LOG2_E);
-    }
-  }
+  // Add s_aux (learnable sink) contribution to softmax denominator after all tiles processed
+  if constexpr (variant.use_softmax) {
+    if (params.maybe_s_aux != nullptr) {
+      float s_aux_scaled = params.maybe_s_aux[qo_head_idx] * variant.sm_scale_log2;
+      st_local.d += math::ptx_exp2(s_aux_scaled - st_local.m);
+    }
+  }
 #pragma unroll
   for (size_t i = 0; i < vec_size; ++i) {
     st_local.o[i] = variant.OutputTransform(params, st_local.o[i], /*batch_idx=*/0, /*qo_idx=*/0,
@@ -589,6 +597,14 @@ __device__ __inline__ void BatchDecodeWithPagedKVCacheDevice(const Params& param
   // sync local state of all warps inside a threadblock
   sync_state<vec_size, bdx, bdy, bdz>(variant, st, reinterpret_cast<float*>(smem), smem_md, tx, ty,
                                       tz);
+  // Add s_aux (learnable sink) contribution to softmax denominator after all tiles processed
+  if constexpr (variant.use_softmax) {
+    if (params.maybe_s_aux != nullptr) {
+      constexpr float LOG2_E = 1.4426950408889634f;  // log2(e)
+      float s_aux_val = params.maybe_s_aux[qo_head_idx];
+      st.d += math::ptx_exp2((s_aux_val - st.m) * LOG2_E);
+    }
+  }
 #pragma unroll
   for (size_t i = 0; i < vec_size; ++i) {
     st.o[i] = variant.OutputTransform(params, st.o[i], bx, /*qo_idx=*/0, qo_head_idx, st.m, st.d,

@@ -37,6 +37,7 @@ struct SingleDecodeParams {
   DTypeO* o;
   float* lse;
   float* maybe_alibi_slopes;
+  float* maybe_s_aux;
   uint32_t kv_len;
   uint32_t num_qo_heads;
   uint32_t num_kv_heads;
@@ -58,6 +59,7 @@ struct SingleDecodeParams {
         o(nullptr),
         lse(nullptr),
         maybe_alibi_slopes(nullptr),
+        maybe_s_aux(nullptr),
         kv_len(0),
         num_qo_heads(0),
         num_kv_heads(0),
@@ -84,6 +86,7 @@ struct SingleDecodeParams {
         o(o),
         lse(nullptr),
         maybe_alibi_slopes(maybe_alibi_slopes),
+        maybe_s_aux(nullptr),
         kv_len(seq_len),
         num_qo_heads(num_qo_heads),
         num_kv_heads(num_kv_heads),
@@ -118,6 +121,7 @@ struct BatchDecodeParams {
   DTypeO* o;
   float* lse;
   float* maybe_alibi_slopes;
+  float* maybe_s_aux;
   uint32_t padded_batch_size;
   uint32_t num_qo_heads;
   IdType q_stride_n;
@@ -142,6 +146,7 @@ struct BatchDecodeParams {
         o(nullptr),
         lse(nullptr),
         maybe_alibi_slopes(nullptr),
+        maybe_s_aux(nullptr),
         padded_batch_size(0),
         num_qo_heads(0),
         q_stride_n(0),
@@ -170,6 +175,7 @@ struct BatchDecodeParams {
         o(o),
         lse(lse),
         maybe_alibi_slopes(maybe_alibi_slopes),
+        maybe_s_aux(nullptr),
         padded_batch_size(0),
         num_qo_heads(num_qo_heads),
         q_stride_n(q_stride_n),

@@ -38,6 +38,7 @@ struct SinglePrefillParams {
   DTypeO* o;
   float* lse;
   float* maybe_alibi_slopes;
+  float* maybe_s_aux;
   uint_fastdiv group_size;
   uint32_t qo_len;
   uint32_t kv_len;
@@ -66,6 +67,7 @@ struct SinglePrefillParams {
         o(nullptr),
         lse(nullptr),
         maybe_alibi_slopes(nullptr),
+        maybe_s_aux(nullptr),
         group_size(),
         qo_len(0),
         kv_len(0),
@@ -86,7 +88,7 @@ struct SinglePrefillParams {
         partition_kv(false) {}
 
   __host__ SinglePrefillParams(DTypeQ* q, DTypeKV* k, DTypeKV* v, uint8_t* maybe_custom_mask,
-                               DTypeO* o, float* lse, float* maybe_alibi_slopes,
+                               DTypeO* o, float* lse, float* maybe_alibi_slopes, float* maybe_s_aux,
                                uint32_t num_qo_heads, uint32_t num_kv_heads, uint32_t qo_len,
                                uint32_t kv_len, uint32_t q_stride_n, uint32_t q_stride_h,
                                uint32_t kv_stride_n, uint32_t kv_stride_h, uint32_t head_dim,
@@ -99,6 +101,7 @@ struct SinglePrefillParams {
         o(o),
         lse(lse),
         maybe_alibi_slopes(maybe_alibi_slopes),
+        maybe_s_aux(maybe_s_aux),
         group_size(num_qo_heads / num_kv_heads),
         num_qo_heads(num_qo_heads),
         num_kv_heads(num_kv_heads),
@@ -146,6 +149,7 @@ struct BatchPrefillRaggedParams {
   DTypeO* o;
   float* lse;
   float* maybe_alibi_slopes;
+  float* maybe_s_aux;
   uint_fastdiv group_size;
   uint32_t num_qo_heads;
   uint32_t num_kv_heads;
@@ -190,6 +194,7 @@ struct BatchPrefillRaggedParams {
         o(nullptr),
         lse(nullptr),
         maybe_alibi_slopes(nullptr),
+        maybe_s_aux(nullptr),
         group_size(),
         num_qo_heads(0),
         num_kv_heads(0),
@@ -224,9 +229,9 @@ struct BatchPrefillRaggedParams {
                                     IdType* q_indptr, IdType* kv_indptr, IdType* maybe_mask_indptr,
                                     IdType* maybe_q_rope_offset, IdType* maybe_k_rope_offset,
                                     DTypeO* o, float* lse, float* maybe_alibi_slopes,
-                                    uint32_t num_qo_heads, uint32_t num_kv_heads,
-                                    uint32_t q_stride_n, uint32_t q_stride_h, uint32_t kv_stride_n,
-                                    uint32_t kv_stride_h, int32_t window_left,
+                                    float* maybe_s_aux, uint32_t num_qo_heads,
+                                    uint32_t num_kv_heads, uint32_t q_stride_n, uint32_t q_stride_h,
+                                    uint32_t kv_stride_n, uint32_t kv_stride_h, int32_t window_left,
                                     float logits_soft_cap, float sm_scale, float rope_scale,
                                     float rope_theta)
       : q(q),
@@ -241,6 +246,7 @@ struct BatchPrefillRaggedParams {
         o(o),
         lse(lse),
         maybe_alibi_slopes(maybe_alibi_slopes),
+        maybe_s_aux(maybe_s_aux),
         group_size(num_qo_heads / num_kv_heads),
         num_qo_heads(num_qo_heads),
         num_kv_heads(num_kv_heads),
@@ -296,6 +302,7 @@ struct BatchPrefillPagedParams {
   DTypeO* o;
   float* lse;
   float* maybe_alibi_slopes;
+  float* maybe_s_aux;
   uint_fastdiv group_size;
   uint32_t num_qo_heads;
   IdType q_stride_n;
@@ -332,6 +339,7 @@ struct BatchPrefillPagedParams {
         o(nullptr),
         lse(nullptr),
         maybe_alibi_slopes(nullptr),
+        maybe_s_aux(nullptr),
         group_size(),
         num_qo_heads(0),
         q_stride_n(0),
@@ -361,9 +369,9 @@ struct BatchPrefillPagedParams {
                                    uint8_t* maybe_custom_mask, IdType* q_indptr,
                                    IdType* maybe_mask_indptr, IdType* maybe_q_rope_offset,
                                    DTypeO* o, float* lse, float* maybe_alibi_slopes,
-                                   uint32_t num_qo_heads, IdType q_stride_n, IdType q_stride_h,
-                                   int32_t window_left, float logits_soft_cap, float sm_scale,
-                                   float rope_scale, float rope_theta)
+                                   float* maybe_s_aux, uint32_t num_qo_heads, IdType q_stride_n,
+                                   IdType q_stride_h, int32_t window_left, float logits_soft_cap,
+                                   float sm_scale, float rope_scale, float rope_theta)
       : q(q),
         paged_kv(paged_kv),
         maybe_custom_mask(maybe_custom_mask),
@@ -373,6 +381,7 @@ struct BatchPrefillPagedParams {
         o(o),
         lse(lse),
         maybe_alibi_slopes(maybe_alibi_slopes),
+        maybe_s_aux(maybe_s_aux),
         group_size(num_qo_heads / paged_kv.num_heads),
         num_qo_heads(num_qo_heads),
         q_stride_n(q_stride_n),

@@ -90,6 +90,16 @@ struct DefaultAttention : AttentionVariantBase {
     }
     return mask;
   })
+
+  REGISTER_M_D_UPDATE(params, kv_tile_idx, qo_head_idx, m, d, scale, {
+    if constexpr (use_softmax) {
+      if (params.maybe_s_aux != nullptr) {  // sink is optional; kv_tile_idx/scale are unused
+        constexpr float LOG2_E = 1.4426950408889634f;  // log2(e): exp2(x * LOG2_E) == exp(x)
+        float s_aux_val = params.maybe_s_aux[qo_head_idx];  // per-head sink value
+        d += math::ptx_exp2((s_aux_val - m) * LOG2_E);  // NOTE(review): is m natural-log here?
+      }
+    }
+  })
 };
 
 };  // namespace flashinfer