Merged
Commits
69 commits
d0a53b5
adopt reference implementation from sglang
ishovkun Jan 29, 2026
320a72c
Extract create_test_inputs to shared test_utils module
ishovkun Jan 29, 2026
4022f10
Rename test to reflect that it's a single-token test file
ishovkun Jan 29, 2026
a8bc286
Add multi-token support to the interface of selective_state_update
ishovkun Jan 29, 2026
2e70ea4
Refactor selective_state_update: add validation helpers and update param
ishovkun Jan 29, 2026
295ae56
Non-contiguous state
ishovkun Jan 29, 2026
5541624
Simplify code for template dispatching
ishovkun Jan 29, 2026
ab33cc1
Refactor dispatch logic in selective_state_update.cuh
ishovkun Jan 29, 2026
26271a9
Refactor pointer alignment checking away from the logic.
ishovkun Jan 29, 2026
f3f02f5
Support int32 and int64 state_batch_indices in selective_state_update
ishovkun Jan 29, 2026
1cb4ac7
Refactor Mamba selective state update kernel dispatch and add dtype
ishovkun Jan 30, 2026
3265bd5
Merge branch 'flashinfer-ai:main' into main
ishovkun Jan 30, 2026
9d6d35c
Fix simple stp kernel to only write state if a flag is provided
ishovkun Jan 30, 2026
5b5756d
Fix Triton kernel intermediate state caching to match CUDA behavior
ishovkun Jan 30, 2026
e3f751e
Merge branch 'main' of github.com:ishovkun/flashinfer-dev
ishovkun Jan 31, 2026
fb693d0
Add Mamba2 SSD chunk scan test and reorganize Triton refs
ishovkun Feb 3, 2026
0ce5d47
Merge branch 'main' of github.com:ishovkun/flashinfer-dev
ishovkun Feb 17, 2026
304fd59
Enable .jinja templates for mamba
ishovkun Feb 17, 2026
329bfd0
Remove SM100 module, unify SM90+ selective state update handling
ishovkun Feb 17, 2026
f464097
Add algorithm selection to selective_state_update kernels
ishovkun Feb 18, 2026
c65670c
Fix include order: config.inc before header in selective_state_update…
ishovkun Feb 18, 2026
44b6c25
Parallelize consumer warp loads in vertical SSU kernel
ishovkun Feb 18, 2026
eff403c
Reduce test combinations in SSU tests to base + independent deviations
ishovkun Feb 18, 2026
afc7c6a
Add algorithm parameter to selective_state_update tests
ishovkun Feb 19, 2026
74accb0
Merge branch 'flashinfer-ai:main' into main
ishovkun Feb 19, 2026
1d42007
Update selective_state_update instantiations to include SSUAlgorithm
ishovkun Feb 19, 2026
61d88bd
Clarify algorithm selection docstring in selective_state_update
ishovkun Feb 19, 2026
ead4943
Merge branch 'main' of github.com:ishovkun/flashinfer-dev
ishovkun Feb 19, 2026
6f6a3d7
Remove chunk scan combined kernels as they are irrelevant to this PR
ishovkun Feb 19, 2026
de96dd5
Remove ssd_chunk_state.py Triton reference implementation (irrelevant to
ishovkun Feb 19, 2026
4c30f07
Delete test_utils.py
ishovkun Feb 19, 2026
1f1c2f4
Suppress mypy false positive for gen_selective_state_update calls
ishovkun Feb 19, 2026
157ecb5
Move Triton reference kernel to triton_reference subdir and update
ishovkun Feb 19, 2026
f32b63b
mark an unused variable with "_" in a test
ishovkun Feb 19, 2026
2656202
rename an unused test variable to _state_ref
ishovkun Feb 19, 2026
5580d28
Refactor Triton reference import for selective_state_update
ishovkun Feb 19, 2026
8738964
Add int16 state quantization with block scaling to
ishovkun Feb 19, 2026
02db096
Add int16 quantized state support to selective_state_update
ishovkun Feb 20, 2026
58f56cd
Fixes aot compilation of the gdn_prefill_sm90 module
ishovkun Feb 20, 2026
d4e33de
Merge branch 'main' into ssu_int16
ishovkun Feb 20, 2026
5d8184e
Substantially reduce the number of SSU aot compilation units. Limited to
ishovkun Feb 20, 2026
9775391
Merge branch 'main' into ssu_int16
ishovkun Feb 20, 2026
7f1173f
Add int16 support for block scaling in selective_state_update kernel
ishovkun Feb 20, 2026
35cc7ba
Add int16 block scaling support to selective_state_update MTP
ishovkun Feb 20, 2026
6cf61b7
Fix rNewState array size calculation for scaleState flag
ishovkun Feb 20, 2026
e9ab619
Refactor selective_state_update to use state_scale dtype
ishovkun Feb 23, 2026
60b627e
Add Philox-4x32 PRNG matching Triton tl.randint and tests
ishovkun Feb 24, 2026
b873d10
Refactor philox_randint to template and add rounding tests
ishovkun Feb 24, 2026
3292662
Stochastic rounding support for fp16 state update (plumbing)
ishovkun Feb 25, 2026
b206a5f
Implement stochastic rounding for fp16 state in selective_state_update
ishovkun Feb 25, 2026
c70efbd
Optimize Philox PRNG usage in selective_state_update kernel
ishovkun Feb 26, 2026
181d80d
Fix Philox random offset calculation for state updates
ishovkun Feb 26, 2026
fd1af7c
Remove .plans directory from .gitignore
ishovkun Feb 26, 2026
ff8dfde
Merge branch 'ssu_int16': int16 block-scaled state and stochastic rou…
ishovkun Feb 26, 2026
60bbb5d
Merge remote-tracking branch 'upstream/main'
ishovkun Feb 26, 2026
c01eced
Replace asserts with if checks in the python wrapper
ishovkun Feb 27, 2026
0bf77aa
Remove redundant dtype check for state_batch_indices and
ishovkun Feb 27, 2026
deb48a8
Use tuples instead of lists for parameter sets in tests
ishovkun Feb 27, 2026
e1d9dc3
Fix selective_state_update argument order for state_scale_dtype
ishovkun Feb 27, 2026
b30db63
Replace float pointer casts with __float_as_uint in conversion kernels
ishovkun Feb 27, 2026
317f6bb
Handle zero max value in state scaling calculations
ishovkun Feb 27, 2026
852b9a2
Add static_assert for fp16 state in SR branch to check that an edge case
ishovkun Feb 27, 2026
2ca355e
`if not philox_rounds > 0:` → `if philox_rounds <= 0:` — same semantics,
ishovkun Feb 27, 2026
fa95f9d
Fix SM gencode flag to match current device compute capability (fixed
ishovkun Mar 2, 2026
3b20f6e
Fix Triton reference to skip stochastic rounding on pre-SM100a GPUs
ishovkun Mar 2, 2026
f93b325
Rename unused state_dtype and kwargs parameters to _state_dtype and
ishovkun Mar 2, 2026
6b65cff
Change _SR_PARAMS from list to tuple in test_selective_state_update_stp
ishovkun Mar 2, 2026
4cb8c40
Restore issue-claim.yml accidentally deleted during rebase
ishovkun Mar 3, 2026
10a36c4
Refactor selective_state_update to use device-side rand_seed tensor
ishovkun Mar 3, 2026
4 changes: 4 additions & 0 deletions .gitignore
@@ -24,6 +24,10 @@ flashinfer/cute_dsl/benchmark_gated_delta_rule.py
# vscode
.vscode/

# zed text editor
.zed/
yzh119 marked this conversation as resolved.
.rules

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
5 changes: 4 additions & 1 deletion csrc/flashinfer_mamba_binding.cu
@@ -38,10 +38,13 @@ void selective_state_update(
bool dt_softplus,
Optional<TensorView> state_batch_indices, // (batch,)
int64_t pad_slot_id,
TensorView output, // same as x
Optional<TensorView> state_scale, // float32: (state_cache_size, nheads, dim)
TensorView output, // same as x
bool disable_state_update,
Optional<TensorView> intermediate_states_buffer, // (batch, cache_steps, nheads, dim, dstate)
Optional<TensorView> intermediate_state_indices, // (batch,)
Optional<TensorView> intermediate_state_scales, // float32: (batch, cache_steps, nheads, dim)
Optional<TensorView> rand_seed, // device-side int64 tensor for Philox rounding
int64_t cache_steps,
int64_t algorithm); // SSUAlgorithm: 0=auto, 1=simple, 2=vertical, 3=horizontal

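The binding passes the algorithm selector as a bare `int64_t` whose values are documented only in the trailing comment. A hypothetical Python-side mirror of that mapping (the enum name and its location are assumptions for illustration, not part of this PR) could look like:

```python
from enum import IntEnum


class SSUAlgorithm(IntEnum):
    """Mirrors the int64 `algorithm` argument of selective_state_update:
    0 lets the dispatcher choose; 1-3 force a specific kernel layout."""

    AUTO = 0
    SIMPLE = 1
    VERTICAL = 2
    HORIZONTAL = 3
```

A caller would then pass e.g. `algorithm=int(SSUAlgorithm.VERTICAL)` instead of a magic `2`.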
106 changes: 83 additions & 23 deletions csrc/selective_state_update.cu
@@ -16,7 +16,7 @@
// clang-format off
// config.inc MUST come before the header: it defines DIM, DSTATE, NTOKENS_MTP
// constexprs that the header's function templates rely on. Reordering breaks compilation.
// NOTE: the .inc file is generated from the jinja templates
// NOTE: the .inc file is generated from the jinja template csrc/selective_state_update_customize_config.jinja
#include "selective_state_update_config.inc"
#include <flashinfer/mamba/selective_state_update.cuh>
// clang-format on
@@ -99,6 +99,22 @@ inline void validate_intermediate_states_buffer(
CHECK_CONTIGUOUS(intermediate_states_buffer.value());
}

inline void validate_state_scale(Optional<TensorView> const& state_scale, int64_t state_cache_size,
int64_t nheads, int64_t dim) {
if (!state_scale.has_value()) return;
auto const& scale = state_scale.value();
CHECK_CUDA(scale);
CHECK_DIM(3, scale); // state_scale: {state_cache_size, nheads, dim}
FLASHINFER_CHECK(scale.size(0) == state_cache_size,
"state_scale.size(0) must equal state_cache_size");
FLASHINFER_CHECK(scale.size(1) == nheads, "state_scale.size(1) must equal nheads");
FLASHINFER_CHECK(scale.size(2) == dim, "state_scale.size(2) must equal dim");
// Inner dims (nheads, dim) must be contiguous
FLASHINFER_CHECK(scale.stride(2) == 1, "state_scale.stride(2) must be 1, got ", scale.stride(2));
FLASHINFER_CHECK(scale.stride(1) == dim, "state_scale.stride(1) must equal dim, got ",
scale.stride(1));
}
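The layout contract enforced above — 3-D, exact sizes, contiguous `(nheads, dim)` inner dims, with the outer batch stride left free so a larger state cache can be sliced — can be sketched host-side. This standalone checker over plain `(shape, strides)` tuples is illustrative, not code from the PR:

```python
def check_state_scale_layout(shape, strides, state_cache_size, nheads, dim):
    """Replicates validate_state_scale's contract: 3-D tensor of shape
    (state_cache_size, nheads, dim) whose inner two dims are contiguous.
    strides[0] may exceed nheads * dim (e.g. a row-sliced cache)."""
    if len(shape) != 3:
        raise ValueError("state_scale must be 3-D")
    if tuple(shape) != (state_cache_size, nheads, dim):
        raise ValueError(
            f"expected shape {(state_cache_size, nheads, dim)}, got {tuple(shape)}")
    if strides[2] != 1:
        raise ValueError(f"state_scale.stride(2) must be 1, got {strides[2]}")
    if strides[1] != dim:
        raise ValueError(f"state_scale.stride(1) must equal dim, got {strides[1]}")
    return True
```

Note the deliberate asymmetry: only the inner strides are pinned, so a non-contiguous batch stride (as in the "Non-contiguous state" commit) still passes.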

// Validates dtype consistency across tensors
inline void validate_dtype_consistency(
TensorView const& state, TensorView const& dt, TensorView const& D, TensorView const& x,
@@ -133,8 +149,9 @@ void run_selective_state_update_stp(TensorView const& state, TensorView const& x
TensorView const& C, TensorView const& D,
Optional<TensorView> z, Optional<TensorView> dt_bias,
bool dt_softplus, Optional<TensorView> state_batch_indices,
int64_t pad_slot_id, Optional<TensorView> out,
bool disable_state_update, int64_t algorithm) {
Optional<TensorView> state_scale, int64_t pad_slot_id,
Optional<TensorView> out, bool disable_state_update,
Optional<TensorView> rand_seed, int64_t algorithm) {
// Extract dimensions from input tensors
auto const batch = x.size(0);
auto const state_cache_size = state.size(0);
@@ -219,6 +236,7 @@ void run_selective_state_update_stp(TensorView const& state, TensorView const& x

// Validate dtype consistency
validate_dtype_consistency(state, dt, D, x, B, C, dt_bias, z, out);
validate_state_scale(state_scale, state_cache_size, nheads, dim);

// Initialize params struct
SelectiveStateUpdateParams p;
@@ -248,6 +266,18 @@ void run_selective_state_update_stp(TensorView const& state, TensorView const& x
if (state_batch_indices.has_value()) {
p.state_batch_indices = const_cast<void*>(state_batch_indices.value().data_ptr());
}
if (state_scale.has_value()) {
p.state_scale = state_scale.value().data_ptr();
p.state_scale_stride_batch = state_scale.value().stride(0);
}
if (rand_seed.has_value()) {
auto const& rs = rand_seed.value();
CHECK_CUDA(rs);
FLASHINFER_CHECK(rs.numel() == 1,
"rand_seed must be a single-element tensor, got numel=", rs.numel());
FLASHINFER_CHECK(rs.dtype().code == kDLInt && rs.dtype().bits == 64, "rand_seed must be int64");
p.rand_seed = static_cast<const int64_t*>(rs.data_ptr());
}

// Copy pointers
p.state = const_cast<void*>(state.data_ptr());
@@ -275,16 +305,18 @@ void run_selective_state_update_stp(TensorView const& state, TensorView const& x
const cudaStream_t stream = get_stream(state.device());

auto algo = static_cast<SSUAlgorithm>(algorithm);
invokeSelectiveStateUpdate<input_t, weight_t, matrixA_t, state_t, stateIndex_t>(p, algo, stream);
invokeSelectiveStateUpdate<input_t, weight_t, matrixA_t, state_t, stateIndex_t, state_scale_t>(
p, algo, stream);
}

void run_selective_state_update_mtp(
TensorView const& state, TensorView const& x, TensorView const& dt, TensorView const& A,
TensorView const& B, TensorView const& C, TensorView const& D, Optional<TensorView> z,
Optional<TensorView> dt_bias, bool dt_softplus, Optional<TensorView> state_batch_indices,
int64_t pad_slot_id, Optional<TensorView> out, bool disable_state_update,
Optional<TensorView> intermediate_states_buffer,
Optional<TensorView> intermediate_state_indices, int64_t cache_steps, int64_t algorithm) {
Optional<TensorView> state_scale, int64_t pad_slot_id, Optional<TensorView> out,
bool disable_state_update, Optional<TensorView> intermediate_states_buffer,
Optional<TensorView> intermediate_state_indices, Optional<TensorView> intermediate_state_scales,
Optional<TensorView> rand_seed, int64_t cache_steps, int64_t algorithm) {
// Extract dimensions from input tensors
auto const batch = x.size(0);
auto const ntokens_mtp = x.size(1);
@@ -378,6 +410,7 @@ void run_selective_state_update_mtp(
validate_dtype_consistency(state, dt, D, x, B, C, dt_bias, z, out, intermediate_states_buffer);
validate_intermediate_state_indices(intermediate_state_indices, batch);
validate_intermediate_states_buffer(intermediate_states_buffer);
validate_state_scale(state_scale, state_cache_size, nheads, dim);

// Validate that state_batch_indices and intermediate_state_indices have the same dtype
if (state_batch_indices.has_value() && intermediate_state_indices.has_value()) {
@@ -435,6 +468,10 @@ void run_selective_state_update_mtp(
if (state_batch_indices.has_value()) {
p.state_batch_indices = const_cast<void*>(state_batch_indices.value().data_ptr());
}
if (state_scale.has_value()) {
p.state_scale = state_scale.value().data_ptr();
p.state_scale_stride_batch = state_scale.value().stride(0);
}

if (intermediate_states_buffer.has_value()) {
p.intermediate_states = const_cast<void*>(intermediate_states_buffer.value().data_ptr());
@@ -445,6 +482,30 @@
p.intermediate_state_indices = const_cast<void*>(intermediate_state_indices.value().data_ptr());
}

if (intermediate_state_scales.has_value()) {
auto const& iscales = intermediate_state_scales.value();
CHECK_CUDA(iscales);
CHECK_CONTIGUOUS(iscales);
CHECK_DIM(4, iscales); // (batch, cache_steps, nheads, dim)
FLASHINFER_CHECK(iscales.size(0) == batch,
"intermediate_state_scales.size(0) must equal batch");
FLASHINFER_CHECK(iscales.size(1) == cache_steps,
"intermediate_state_scales.size(1) must equal cache_steps");
FLASHINFER_CHECK(iscales.size(2) == nheads,
"intermediate_state_scales.size(2) must equal nheads");
FLASHINFER_CHECK(iscales.size(3) == dim, "intermediate_state_scales.size(3) must equal dim");
p.intermediate_state_scales = iscales.data_ptr();
p.intermediate_state_scales_stride_batch = iscales.stride(0);
}
if (rand_seed.has_value()) {
auto const& rs = rand_seed.value();
CHECK_CUDA(rs);
FLASHINFER_CHECK(rs.numel() == 1,
"rand_seed must be a single-element tensor, got numel=", rs.numel());
FLASHINFER_CHECK(rs.dtype().code == kDLInt && rs.dtype().bits == 64, "rand_seed must be int64");
p.rand_seed = static_cast<const int64_t*>(rs.data_ptr());
}

// Copy pointers
p.state = const_cast<void*>(state.data_ptr());
p.x = const_cast<void*>(x.data_ptr());
@@ -472,30 +533,29 @@
const cudaStream_t stream = get_stream(state.device());

auto algo = static_cast<SSUAlgorithm>(algorithm);
mtp::invokeSelectiveStateUpdateMTP<input_t, weight_t, matrixA_t, state_t, stateIndex_t>(p, algo,
stream);
mtp::invokeSelectiveStateUpdateMTP<input_t, weight_t, matrixA_t, state_t, stateIndex_t,
state_scale_t>(p, algo, stream);
}

// =============================================================================
// Generic dispatcher - routes to single-token or multi-token based on x.dim()
// =============================================================================
void selective_state_update(TensorView state, TensorView x, TensorView dt, TensorView A,
TensorView B, TensorView C, TensorView D, Optional<TensorView> z,
Optional<TensorView> dt_bias, bool dt_softplus,
Optional<TensorView> state_batch_indices, int64_t pad_slot_id,
TensorView output, bool disable_state_update,
Optional<TensorView> intermediate_states_buffer,
Optional<TensorView> intermediate_state_indices, int64_t cache_steps,
int64_t algorithm) {
void selective_state_update(
TensorView state, TensorView x, TensorView dt, TensorView A, TensorView B, TensorView C,
TensorView D, Optional<TensorView> z, Optional<TensorView> dt_bias, bool dt_softplus,
Optional<TensorView> state_batch_indices, int64_t pad_slot_id, Optional<TensorView> state_scale,
TensorView output, bool disable_state_update, Optional<TensorView> intermediate_states_buffer,
Optional<TensorView> intermediate_state_indices, Optional<TensorView> intermediate_state_scales,
Optional<TensorView> rand_seed, int64_t cache_steps, int64_t algorithm) {
if (x.dim() == 3) {
run_selective_state_update_stp(state, x, dt, A, B, C, D, z, dt_bias, dt_softplus,
state_batch_indices, pad_slot_id, output, disable_state_update,
algorithm);
state_batch_indices, state_scale, pad_slot_id, output,
disable_state_update, rand_seed, algorithm);
} else if (x.dim() == 4) {
run_selective_state_update_mtp(state, x, dt, A, B, C, D, z, dt_bias, dt_softplus,
state_batch_indices, pad_slot_id, output, disable_state_update,
intermediate_states_buffer, intermediate_state_indices,
cache_steps, algorithm);
run_selective_state_update_mtp(
state, x, dt, A, B, C, D, z, dt_bias, dt_softplus, state_batch_indices, state_scale,
pad_slot_id, output, disable_state_update, intermediate_states_buffer,
intermediate_state_indices, intermediate_state_scales, rand_seed, cache_steps, algorithm);
} else {
FLASHINFER_CHECK(false,
"x must have 3 dimensions (single-token) or 4 dimensions (multi-token), got ",
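The generic dispatcher's shape-based routing — 3-D `x` goes to the single-token (STP) path, 4-D to the multi-token (MTP) path, anything else is an error — can be sketched as (names here are illustrative):

```python
def route_selective_state_update(x_ndim: int) -> str:
    """Mirrors the C++ dispatcher's routing on x.dim()."""
    if x_ndim == 3:
        return "stp"  # single-token: run_selective_state_update_stp
    if x_ndim == 4:
        return "mtp"  # multi-token: run_selective_state_update_mtp
    raise ValueError(
        f"x must have 3 (single-token) or 4 (multi-token) dimensions, got {x_ndim}")
```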
6 changes: 6 additions & 0 deletions csrc/selective_state_update_customize_config.jinja
@@ -8,7 +8,13 @@ using input_t = {{ input_dtype }};
using weight_t = {{ weight_dtype }};
using matrixA_t = {{ matrixA_dtype }};
using stateIndex_t = {{ stateIndex_dtype }};
// Type for block-scale decode factors (e.g. float, __half).
// void = no scaling (state_t is used as-is).
using state_scale_t = {{ state_scale_type }};

constexpr int DIM = {{ dim }};
constexpr int DSTATE = {{ dstate }};
constexpr int NTOKENS_MTP = {{ ntokens_mtp }};
// Philox PRNG rounds for stochastic rounding of fp16 state stores.
// 0 = no stochastic rounding; typical value = 10.
constexpr int PHILOX_ROUNDS = {{ philox_rounds }};
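`PHILOX_ROUNDS` configures the counter-based PRNG that drives stochastic rounding of fp16 state stores. Below is a pure-Python sketch of the standard Philox-4x32 round function (the scheme the "Philox-4x32 PRNG matching Triton tl.randint" commit refers to) together with a simplified grid-based stochastic round; the kernel's exact bit manipulation of the fp16 mantissa may differ:

```python
import math

_M0, _M1 = 0xD2511F53, 0xCD9E8D57  # Philox round multipliers
_W0, _W1 = 0x9E3779B9, 0xBB67AE85  # Weyl key increments
_MASK = 0xFFFFFFFF


def philox4x32(counter, key, rounds=10):
    """Standard Philox-4x32: 4-word counter + 2-word key -> 4 random u32."""
    c, k = list(counter), list(key)
    for _ in range(rounds):
        hi0, lo0 = divmod(_M0 * c[0], 1 << 32)  # 32x32 -> 64-bit mul, split
        hi1, lo1 = divmod(_M1 * c[2], 1 << 32)
        c = [(hi1 ^ c[1] ^ k[0]) & _MASK, lo1,
             (hi0 ^ c[3] ^ k[1]) & _MASK, lo0]
        k = [(k[0] + _W0) & _MASK, (k[1] + _W1) & _MASK]
    return c


def stochastic_round(x, step, rand_u32):
    """Round x to a multiple of `step`, rounding up with probability equal
    to the fractional position between the two neighboring grid points."""
    base = math.floor(x / step)
    frac = x / step - base
    if rand_u32 / 2.0**32 < frac:
        base += 1
    return base * step
```

The key property is that the result is unbiased in expectation, so repeated quantized state updates do not accumulate a systematic drift the way round-to-nearest can; exactly representable values are returned unchanged regardless of the random word.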
10 changes: 6 additions & 4 deletions csrc/selective_state_update_kernel_inst.cu
@@ -7,12 +7,14 @@

namespace flashinfer::mamba {

template void invokeSelectiveStateUpdate<input_t, weight_t, matrixA_t, state_t, stateIndex_t>(
SelectiveStateUpdateParams&, SSUAlgorithm, cudaStream_t);
template void invokeSelectiveStateUpdate<input_t, weight_t, matrixA_t, state_t, stateIndex_t,
state_scale_t>(SelectiveStateUpdateParams&, SSUAlgorithm,
cudaStream_t);

namespace mtp {
template void invokeSelectiveStateUpdateMTP<input_t, weight_t, matrixA_t, state_t, stateIndex_t>(
SelectiveStateMTPParams&, SSUAlgorithm, cudaStream_t);
template void invokeSelectiveStateUpdateMTP<input_t, weight_t, matrixA_t, state_t, stateIndex_t,
state_scale_t>(SelectiveStateMTPParams&, SSUAlgorithm,
cudaStream_t);
} // namespace mtp

} // namespace flashinfer::mamba
21 changes: 19 additions & 2 deletions flashinfer/aot.py
@@ -548,15 +548,32 @@ def gen_all_modules(
]
# selective_state_update: one module per dtype combo per GPU arch
_ssu_dtype_combos = [
# (state, input, weight, matrixA, stateIndex)
# (state, input, weight, matrixA, stateIndex, state_scale_dtype)
(
torch.bfloat16,
torch.bfloat16,
torch.bfloat16,
torch.float32,
torch.int64,
None,
),
# int16 state (block-scaled quantization, scale stored as float32)
(
torch.int16,
torch.bfloat16,
torch.bfloat16,
torch.float32,
torch.int64,
torch.float32,
),
(
torch.float32,
torch.bfloat16,
torch.bfloat16,
torch.float32,
torch.int64,
None,
),
(torch.float32, torch.bfloat16, torch.bfloat16, torch.float32, torch.int64),
]
_ssu_dims = [64]
_ssu_dstates = [128]
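The new AOT combo pairs an `int16` state dtype with a `float32` per-`(head, dim)` decode scale. A pure-Python sketch of symmetric block quantization with the zero-max guard (cf. the "Handle zero max value in state scaling calculations" commit); the kernel-side scheme may differ in detail:

```python
def quantize_block(values):
    """Symmetric int16 quantization of one scale block.
    Returns (int16 codes, float32 decode scale)."""
    m = max(abs(v) for v in values)
    if m == 0.0:
        return [0] * len(values), 1.0  # zero-max guard: avoid divide-by-zero
    scale = m / 32767.0
    codes = [max(-32767, min(32767, round(v / scale))) for v in values]
    return codes, scale


def dequantize_block(codes, scale):
    """Decode: state_fp = state_int16 * scale."""
    return [c * scale for c in codes]
```

With one scale per `(head, dim)` row of the state, the relative quantization error of each block is bounded by roughly `1/32767` of its largest element, while halving state-cache memory versus fp32.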