Open
143 commits
d0a53b5
adopt reference implementation from sglang
ishovkun Jan 29, 2026
320a72c
Extract create_test_inputs to shared test_utils module
ishovkun Jan 29, 2026
4022f10
Rename test to reflect that it's a single-token test file
ishovkun Jan 29, 2026
a8bc286
Add multi-token support to the interface of selective_state_update
ishovkun Jan 29, 2026
2e70ea4
Refactor selective_state_update: add validation helpers and update param
ishovkun Jan 29, 2026
295ae56
Non-contiguous state
ishovkun Jan 29, 2026
5541624
Simplify code for template dispatching
ishovkun Jan 29, 2026
ab33cc1
Refactor dispatch logic in selective_state_update.cuh
ishovkun Jan 29, 2026
26271a9
Refactor pointer alignment checking away from the logic.
ishovkun Jan 29, 2026
f3f02f5
Support int32 and int64 state_batch_indices in selective_state_update
ishovkun Jan 29, 2026
1cb4ac7
Refactor Mamba selective state update kernel dispatch and add dtype
ishovkun Jan 30, 2026
3265bd5
Merge branch 'flashinfer-ai:main' into main
ishovkun Jan 30, 2026
9d6d35c
Fix simple stp kernel to only write state if a flag is provided
ishovkun Jan 30, 2026
5b5756d
Fix Triton kernel intermediate state caching to match CUDA behavior
ishovkun Jan 30, 2026
e3f751e
Merge branch 'main' of github.com:ishovkun/flashinfer-dev
ishovkun Jan 31, 2026
fb693d0
Add Mamba2 SSD chunk scan test and reorganize Triton refs
ishovkun Feb 3, 2026
0ce5d47
Merge branch 'main' of github.com:ishovkun/flashinfer-dev
ishovkun Feb 17, 2026
304fd59
Enable .jinja templates for mamba
ishovkun Feb 17, 2026
329bfd0
Remove SM100 module, unify SM90+ selective state update handling
ishovkun Feb 17, 2026
f464097
Add algorithm selection to selective_state_update kernels
ishovkun Feb 18, 2026
c65670c
Fix include order: config.inc before header in selective_state_update…
ishovkun Feb 18, 2026
44b6c25
Parallelize consumer warp loads in vertical SSU kernel
ishovkun Feb 18, 2026
eff403c
Reduce test combinations in SSU tests to base + independent deviations
ishovkun Feb 18, 2026
afc7c6a
Add algorithm parameter to selective_state_update tests
ishovkun Feb 19, 2026
74accb0
Merge branch 'flashinfer-ai:main' into main
ishovkun Feb 19, 2026
1d42007
Update selective_state_update instantiations to include SSUAlgorithm
ishovkun Feb 19, 2026
61d88bd
Clarify algorithm selection docstring in selective_state_update
ishovkun Feb 19, 2026
ead4943
Merge branch 'main' of github.com:ishovkun/flashinfer-dev
ishovkun Feb 19, 2026
6f6a3d7
Remove chunk scan combined kernels as they are irrelevant to this PR
ishovkun Feb 19, 2026
de96dd5
Remove ssd_chunk_state.py Triton reference implementation (irrelevant to
ishovkun Feb 19, 2026
4c30f07
Delete test_utils.py
ishovkun Feb 19, 2026
1f1c2f4
Suppress mypy false positive for gen_selective_state_update calls
ishovkun Feb 19, 2026
157ecb5
Move Triton reference kernel to triton_reference subdir and update
ishovkun Feb 19, 2026
f32b63b
mark an unused variable with "_" in a test
ishovkun Feb 19, 2026
2656202
rename an unused test variable to _state_ref
ishovkun Feb 19, 2026
5580d28
Refactor Triton reference import for selective_state_update
ishovkun Feb 19, 2026
8738964
Add int16 state quantization with block scaling to
ishovkun Feb 19, 2026
02db096
Add int16 quantized state support to selective_state_update
ishovkun Feb 20, 2026
58f56cd
Fixes aot compilation of the gdn_prefill_sm90 module
ishovkun Feb 20, 2026
d4e33de
Merge branch 'main' into ssu_int16
ishovkun Feb 20, 2026
5d8184e
Substantially reduce the number of SSU aot compilation units. Limited to
ishovkun Feb 20, 2026
9775391
Merge branch 'main' into ssu_int16
ishovkun Feb 20, 2026
7f1173f
Add int16 support for block scaling in selective_state_update kernel
ishovkun Feb 20, 2026
35cc7ba
Add int16 block scaling support to selective_state_update MTP
ishovkun Feb 20, 2026
6cf61b7
Fix rNewState array size calculation for scaleState flag
ishovkun Feb 20, 2026
e9ab619
Refactor selective_state_update to use state_scale dtype
ishovkun Feb 23, 2026
60b627e
Add Philox-4x32 PRNG matching Triton tl.randint and tests
ishovkun Feb 24, 2026
b873d10
Refactor philox_randint to template and add rounding tests
ishovkun Feb 24, 2026
3292662
Stochastic rounding support for fp16 state update (plumbing)
ishovkun Feb 25, 2026
b206a5f
Implement stochastic rounding for fp16 state in selective_state_update
ishovkun Feb 25, 2026
c70efbd
Optimize Philox PRNG usage in selective_state_update kernel
ishovkun Feb 26, 2026
181d80d
Fix Philox random offset calculation for state updates
ishovkun Feb 26, 2026
fd1af7c
Remove .plans directory from .gitignore
ishovkun Feb 26, 2026
ff8dfde
Merge branch 'ssu_int16': int16 block-scaled state and stochastic rou…
ishovkun Feb 26, 2026
60bbb5d
Merge remote-tracking branch 'upstream/main'
ishovkun Feb 26, 2026
c01eced
Replace asserts with if checks in the python wrapper
ishovkun Feb 27, 2026
0bf77aa
Remove redundant dtype check for state_batch_indices and
ishovkun Feb 27, 2026
deb48a8
Use tuples instead of lists for parameter sets in tests
ishovkun Feb 27, 2026
e1d9dc3
Fix selective_state_update argument order for state_scale_dtype
ishovkun Feb 27, 2026
b30db63
Replace float pointer casts with __float_as_uint in conversion kernels
ishovkun Feb 27, 2026
317f6bb
Handle zero max value in state scaling calculations
ishovkun Feb 27, 2026
852b9a2
Add static_assert for fp16 state in SR branch to check that an edge case
ishovkun Feb 27, 2026
2ca355e
`if not philox_rounds > 0:` → `if philox_rounds <= 0:` — same semantics,
ishovkun Feb 27, 2026
8bd1779
Dummy algorithm support to MTP selective_state_update -- only
ishovkun Feb 27, 2026
69d4a3c
Sloppy first prototype without real TMA
ishovkun Feb 27, 2026
42b2d81
Refactor selective_state_update_mtp to use TMA tensor descriptors (1.99x
ishovkun Feb 28, 2026
647160a
Add vertical algorithm tests for intermediate states and update param
ishovkun Feb 28, 2026
c18548f
Refactor vertical MTP kernel to process full DIM×DSTATE tiles per head
ishovkun Feb 28, 2026
04e1ebc
Rename role_compute to role_update_state in vertical MTP kernel
ishovkun Mar 2, 2026
1a69b43
Add vertical kernel dtype check and fix state indexing
ishovkun Mar 9, 2026
a7960c4
Eliminate store warp in vertical MTP kernel and write states directly to
ishovkun Mar 9, 2026
d15d909
Refactor vertical MTP kernel to process 2 groups per CTA
ishovkun Mar 10, 2026
1b2b306
Add benchmark script for selective_state_update MTP mode
ishovkun Mar 10, 2026
a3a56f0
Refactor vertical SSU kernel to use 3 compute groups per CTA
ishovkun Mar 10, 2026
aff33ac
Overlap scalar gmem loads with barrier waits in role_update_state
ishovkun Mar 11, 2026
2c8209b
Refactor MTP selective_state_update: split kernel launcher
ishovkun Mar 11, 2026
4cb4978
Remove DIM <= 64 skip for vertical kernel tests
ishovkun Mar 11, 2026
f7f4dde
Add checks for unsupported features in vertical SSU kernel
ishovkun Mar 11, 2026
9d6e129
Specify the sm_100 for the "vertical" kernel
ishovkun Mar 11, 2026
73b1829
Merge remote-tracking branch 'upstream/main' into ssu_mtp_persistent
ishovkun Mar 11, 2026
329be73
Enable stochastic rounding in vertical kernel and update tests
ishovkun Mar 12, 2026
d1fdfed
Skip vertical MTP tests if SM100+ is not available
ishovkun Mar 12, 2026
226c494
Remove -lineinfo flag from selective_state_update kernels
ishovkun Mar 12, 2026
1a4d24a
Require DIM divisible by 16 for vertical SSU kernel
ishovkun Mar 12, 2026
8ba2ef7
Refactor test to parametrize and clarify scaleState rejection
ishovkun Mar 12, 2026
2040a3c
Improve error message for vertical kernel DIM alignment check
ishovkun Mar 12, 2026
a10a843
Restored issue-claim.yml
ishovkun Mar 13, 2026
2e527cb
WIP commit: the horizontal path is not working yet
ishovkun Mar 16, 2026
94e5456
Add horizontal MTP kernel support for selective_state_update
ishovkun Mar 18, 2026
056385f
Refactor MTP kernel dispatch to split vertical and horizontal paths
ishovkun Mar 18, 2026
083b2e6
Add horizontal MTP kernel for selective_state_update
ishovkun Mar 18, 2026
6bde9be
Optimize state update kernel with f32x2 packed SIMD ops
ishovkun Mar 18, 2026
b1a5421
Add horizontal_v2 algorithm support for selective_state_update MTP
ishovkun Mar 19, 2026
56a0bd7
Add horizontal v2 MTP kernel for selective_state_update
ishovkun Mar 19, 2026
b95a5c0
Support multiple heads per CTA in horizontal_v2 kernel
ishovkun Mar 19, 2026
2f1f0ac
Add tight-spin parity barrier helpers to horizontal_v2 kernel
ishovkun Mar 19, 2026
549e960
Refactor horizontal_v2 kernel to use TMA-level pipelining
ishovkun Mar 20, 2026
ecdff0b
Rename horizontal_v2 MTP to `horizontal` and remove the old `horizontal`
ishovkun Mar 20, 2026
7e6644f
Parametrize selective_state_update MTP tests for all algorithms
ishovkun Mar 20, 2026
c63ef73
Fix alignment checks and extend ngroups ratio coverage in tests
ishovkun Mar 20, 2026
2ff43a6
Support non-power-of-2 dstate in SSU MTP horizontal kernel
ishovkun Mar 20, 2026
0d55e5b
Parametrize TMA_STATE_ROWS in horizontal MTP kernel
ishovkun Mar 20, 2026
df0c2ed
Merge remote-tracking branch 'upstream/main' into ssu_mtp_persistent
ishovkun Mar 20, 2026
fa08e3b
Fix algorithm selection to use Horizontal kernel for batch >= 32
ishovkun Mar 20, 2026
75c91f8
Refactor state conversion with stochastic rounding helpers
ishovkun Mar 23, 2026
e54f3d8
Merge remote-tracking branch 'upstream/main' into ssu_mtp_persistent
ishovkun Mar 23, 2026
389f8a3
Fix auto algorithm selection to use Simple when varlen is set
ishovkun Mar 23, 2026
41433e6
Import re at top level and remove redundant import
ishovkun Mar 23, 2026
aaa28b7
Update vertical kernel DIM alignment check to use warpSize (32)
ishovkun Mar 23, 2026
e7133a0
Pass rand_ints array to convertAndStoreSRHorizontal
ishovkun Mar 23, 2026
0e132ef
Refactor selective_state_update kernels to use IS_PAD template param
ishovkun Mar 23, 2026
e62aa6e
Add checks for varlen support in selective state update MTP kernels
ishovkun Mar 23, 2026
2882a97
Fix z_ptr offset calculation in selective state update kernels
ishovkun Mar 23, 2026
60bc48a
Remove inline comments from role_update_state_horizontal constants
ishovkun Mar 23, 2026
b087ff6
Merge remote-tracking branch 'upstream/main' into ssu_mtp_horizontal
ishovkun Mar 24, 2026
e3c46d1
Remove bar_input_full barrier and merge B/C/X loads with state_in
ishovkun Mar 24, 2026
8dcc4d4
Add toFloat2 overloads for packed and pointer types
ishovkun Mar 24, 2026
61fe779
Strength-reduce per-step indexing in state update kernel
ishovkun Mar 24, 2026
21d0fd7
Add async horizontal SSU MTP kernel (cp.async, SM80+)
ishovkun Mar 24, 2026
4e93292
Support CTAS_PER_HEAD=4 and occupancy-based CTA scaling
ishovkun Mar 24, 2026
86edbab
Fix bank conflict at dstate=96
ishovkun Mar 25, 2026
08e0569
Merge branch 'ssu_mtp_horizontal_async' into ssu_mtp_horizontal
ishovkun Mar 25, 2026
25187a6
Restore cutlass submodule pointer to match upstream/main
ishovkun Mar 25, 2026
4ca73db
Vectorize loads in selective state update kernel
ishovkun Mar 25, 2026
72daf79
Add varlen and scaled-state support to async_horizontal kernel
ishovkun Mar 30, 2026
c6e9996
Deduplicate encode scale computation in state update kernel
ishovkun Mar 30, 2026
e6d8699
Refactor async-horizontal SSU kernel state write path
ishovkun Mar 31, 2026
0f10861
Add state_in smem buffer with cp.async prefetch for MTP kernel
ishovkun Mar 31, 2026
5876429
Refactor async state load into reusable helper function
ishovkun Mar 31, 2026
bcc62ac
Use mul_f32x2 for state decode scale in async horizontal kernel
ishovkun Apr 1, 2026
0f42548
Move dst_slot prefetch earlier to hide LDS latency
ishovkun Apr 1, 2026
2050593
Remove smem zero-fill padding; zero OOB in registers instead
ishovkun Apr 1, 2026
1c09bc5
Remove async_horizontal kernel and merge into simple
ishovkun Apr 2, 2026
6e30877
Add SOL benchmark script for selective_state_update MTP mode
ishovkun Apr 2, 2026
2fbb60a
Merge remote-tracking branch 'upstream/main' into ssu_mtp_horizontal_…
ishovkun Apr 2, 2026
5cf114b
Refactor CTAS dispatch into reusable helper function
ishovkun Apr 2, 2026
5151ddb
Add static_assert to PackedAligned for N > 0 and add missing headers
ishovkun Apr 2, 2026
41c9e36
Fix horizontal kernel DIM alignment check to use TMA_STATE_ROWS
ishovkun Apr 2, 2026
ba6fc7c
Add pad slot support for B/C/x loads in MTP kernels
ishovkun Apr 2, 2026
a614a5a
Add check for DSTATE divisibility in vertical kernel
ishovkun Apr 2, 2026
0e69e21
Fix `max` ambiguity by using `std::max`
ishovkun Apr 2, 2026
b0b4ee9
Remove unused iostream include
ishovkun Apr 2, 2026
fa5f441
Add missing `#pragma once` to MTP vertical kernel header
ishovkun Apr 2, 2026
524 changes: 524 additions & 0 deletions benchmarks/bench_ssu_sweep_mtp.py

Large diffs are not rendered by default.

682 changes: 682 additions & 0 deletions benchmarks/bench_ssu_sweep_sol.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion flashinfer/jit/core.py
@@ -456,7 +456,7 @@ def gen_jit_spec(
cuda_cflags += ["-DNDEBUG", "-O3"]
cflags += ["-O3"]

# useful for ncu
# useful for ncu source correlation
if os.environ.get("FLASHINFER_JIT_LINEINFO", "0") == "1":
cuda_cflags += ["-lineinfo"]

2 changes: 2 additions & 0 deletions flashinfer/jit/mamba/__init__.py
@@ -16,12 +16,14 @@

from .selective_state_update import (
gen_selective_state_update_module,
gen_selective_state_update_sm100_module,
gen_selective_state_update_sm90_module,
)
from .seq_chunk_cumsum import gen_seq_chunk_cumsum_module

__all__ = [
"gen_selective_state_update_module",
"gen_selective_state_update_sm90_module",
"gen_selective_state_update_sm100_module",
"gen_seq_chunk_cumsum_module",
]
55 changes: 54 additions & 1 deletion flashinfer/jit/mamba/selective_state_update.py
@@ -182,7 +182,6 @@ def gen_selective_state_update_module(
cu_seqlens_dtype,
num_accepted_tokens_dtype,
philox_rounds=philox_rounds,
extra_cuda_cflags=["-lineinfo"],
)


@@ -238,3 +237,57 @@ def gen_selective_state_update_sm90_module(
philox_rounds=philox_rounds,
extra_cuda_cflags=nvcc_flags,
)


def gen_selective_state_update_sm100_module(
state_dtype: torch.dtype,
input_dtype: torch.dtype,
weight_dtype: torch.dtype,
matrixA_dtype: torch.dtype,
stateIndex_dtype: torch.dtype,
state_scale_dtype: Optional[torch.dtype],
dim: int,
dstate: int,
ntokens_mtp: int,
cu_seqlens_dtype: torch.dtype,
num_accepted_tokens_dtype: torch.dtype,
philox_rounds: int = 0,
) -> JitSpec:
uri = (
get_selective_state_update_uri(
state_dtype,
input_dtype,
weight_dtype,
matrixA_dtype,
stateIndex_dtype,
state_scale_dtype,
dim,
dstate,
ntokens_mtp,
cu_seqlens_dtype,
num_accepted_tokens_dtype,
philox_rounds,
)
+ "_sm100"
)
compilation_context = CompilationContext()
nvcc_flags = compilation_context.get_nvcc_flags_list(
supported_major_versions=[10, 11, 12]
)
nvcc_flags += ["-DFLASHINFER_MAMBA_ENABLE_SM90", "-DFLASHINFER_MAMBA_ENABLE_SM100"]
return _gen_module(
uri,
state_dtype,
input_dtype,
weight_dtype,
matrixA_dtype,
stateIndex_dtype,
state_scale_dtype,
dim,
dstate,
ntokens_mtp,
cu_seqlens_dtype,
num_accepted_tokens_dtype,
philox_rounds=philox_rounds,
extra_cuda_cflags=nvcc_flags,
)
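For context, the wrapper selects among these three generators by SM major version (see the `_get_module` hunk in the next file). A minimal Python sketch of that dispatch; the returned strings are illustrative labels for the module generators, not real identifiers from the codebase:

```python
def select_module_variant(sm_major: int) -> str:
    """Mirror of the capability dispatch in _get_module: SM100+ gets the
    dedicated sm100 build, SM90 its own, anything older the generic module."""
    if sm_major >= 10:
        return "selective_state_update_sm100"
    elif sm_major >= 9:
        return "selective_state_update_sm90"
    else:
        return "selective_state_update"
```

Note the SM100 URI also appends a `_sm100` suffix to the base URI, so its cached JIT artifacts never collide with the generic build.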
13 changes: 12 additions & 1 deletion flashinfer/mamba/selective_state_update.py
@@ -22,6 +22,7 @@
from ..api_logging import flashinfer_api
from ..jit.mamba import (
gen_selective_state_update_module,
gen_selective_state_update_sm100_module,
gen_selective_state_update_sm90_module,
)
from ..utils import get_compute_capability, register_custom_op, register_fake_op
@@ -57,7 +58,9 @@ def _get_module(
num_accepted_tokens_dtype,
philox_rounds,
)
if sm_major >= 9:
if sm_major >= 10:
return gen_selective_state_update_sm100_module(*args).build_and_load()
elif sm_major >= 9:
return gen_selective_state_update_sm90_module(*args).build_and_load()
else:
return gen_selective_state_update_module(*args).build_and_load()
@@ -266,6 +269,11 @@ def selective_state_update(
# No stochastic rounding when rand_seed is None
philox_rounds = 0

if intermediate_states_buffer is not None and dst_state_batch_indices is not None:
raise ValueError(
"intermediate_states_buffer and dst_state_batch_indices are mutually exclusive"
)

if out is None:
output = torch.empty_like(x)
else:
@@ -298,6 +306,9 @@
algorithm_int = 2
elif algorithm == "horizontal":
algorithm_int = 3
elif algorithm == "async_horizontal":
# Backward compat: async_horizontal is now merged into simple
algorithm_int = 1
else:
raise ValueError(f"Unknown algorithm: {algorithm}")

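The visible branches of the name-to-code mapping can be collected into a lookup table. A sketch under stated assumptions: `horizontal` maps to 3 and the `async_horizontal` alias to 1 per the diff; `simple` mapping to 1 is inferred from the alias comment; `vertical` mapping to 2 is a guess, since that branch's condition is truncated above.

```python
# Illustrative mapping of algorithm names to kernel codes.
_ALGORITHM_CODES = {
    "simple": 1,            # inferred: the async_horizontal alias maps here
    "vertical": 2,          # assumed; branch condition not visible in the diff
    "horizontal": 3,
    "async_horizontal": 1,  # backward compat: merged into simple
}

def algorithm_to_int(name: str) -> int:
    try:
        return _ALGORITHM_CODES[name]
    except KeyError:
        raise ValueError(f"Unknown algorithm: {name}") from None
```

A table-driven mapping keeps the alias explicit and makes the error path a single lookup failure instead of a trailing `else`.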
12 changes: 10 additions & 2 deletions include/flashinfer/mamba/common.cuh
@@ -32,9 +32,17 @@ constexpr unsigned warpSize = 32;
// Common types and utilities
// =============================================================================

// Simple packed vector type for loading N elements of type T
// Largest power of 2 that divides v (i.e. v & -v). Returns 1 when v == 0.
inline constexpr unsigned largestPow2Divisor(unsigned v) { return v ? (v & (~v + 1)) : 1; }

// Simple packed vector type for loading N elements of type T.
// Alignment is the largest power-of-2 factor of the total byte size,
// so it is always valid even when N * sizeof(T) is not a power of 2 (e.g. 3 × 2 = 6).
template <typename T, int N = sizeof(float4) / sizeof(T)>
struct alignas(N * sizeof(T)) PackedAligned {
struct alignas(largestPow2Divisor(N * sizeof(T))) PackedAligned {
static_assert(N > 0,
"PackedAligned instantiated with N == 0; "
"ensure getVectorLoadSizeForFullUtilization() returns > 0");
T val[N];
static constexpr int count = N;
using dtype = T;
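The identity behind `largestPow2Divisor` is plain two's-complement arithmetic and can be sanity-checked outside CUDA. A Python sketch of the same computation (`v & -v` isolates the lowest set bit, which is exactly the largest power-of-2 divisor):

```python
def largest_pow2_divisor(v: int) -> int:
    """Largest power of 2 dividing v; mirrors the CUDA helper above.

    v & -v isolates the lowest set bit of v. Returns 1 for v == 0,
    matching the kernel's convention so alignas() never receives 0."""
    return (v & -v) if v else 1

# The PackedAligned case from the comment: N=3 halves -> 6 bytes.
# 6 is not a power of 2, but alignas(2) is still valid for it.
assert largest_pow2_divisor(6) == 2
assert largest_pow2_divisor(16) == 16
```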
33 changes: 33 additions & 0 deletions include/flashinfer/mamba/conversion.cuh
@@ -22,6 +22,39 @@ inline __device__ float toFloat(__nv_bfloat16 val) { return __bfloat162float(val
// (24-bit mantissa represents all integers up to 2^24 = 16M exactly).
inline __device__ float toFloat(int16_t val) { return static_cast<float>(val); }

// Packed 2-element conversion: convert a packed pair to float2.
// Uses native packed intrinsics for bf16/fp16 (fewer PRMT/SHF instructions).
inline __device__ float2 toFloat2(float2 packed) { return packed; }

inline __device__ float2 toFloat2(__half2 packed) { return __half22float2(packed); }

// Pointer-based overloads: read two consecutive elements and convert to float2.
// Dispatches to the packed intrinsic for bf16/fp16 via the overloads above.
inline __device__ float2 toFloat2(float const* ptr) { return {ptr[0], ptr[1]}; }

inline __device__ float2 toFloat2(__half const* ptr) {
return toFloat2(*reinterpret_cast<__half2 const*>(ptr));
}

#ifdef FLASHINFER_ENABLE_BF16
// inline __device__ float2 toFloat2(__nv_bfloat162 packed) { return __bfloat1622float2(packed); }
inline __device__ float2 toFloat2(__nv_bfloat162 packed) {
// bf16 is the upper 16 bits of f32 — shift/mask is cheaper than PRMT byte permutation.
// NOTE: this ignores denormals
uint32_t bits = reinterpret_cast<uint32_t const&>(packed);
float2 out;
out.x = __uint_as_float(bits << 16); // low bf16 → upper 16 bits of f32
out.y = __uint_as_float(bits & 0xFFFF0000u); // high bf16 already in upper 16 bits
return out;
}

inline __device__ float2 toFloat2(__nv_bfloat16 const* ptr) {
return toFloat2(*reinterpret_cast<__nv_bfloat162 const*>(ptr));
}
#endif

inline __device__ float2 toFloat2(int16_t const* ptr) { return {toFloat(ptr[0]), toFloat(ptr[1])}; }

inline __device__ void convertAndStore(float* output, float input) { *output = input; }

inline __device__ void convertAndStore(__half* output, float input) {
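The bf16 shift/mask trick can be verified on the host without CUDA. A Python sketch that reinterprets bits with `struct` (mimicking `__uint_as_float`) and unpacks two bf16 values from a 32-bit word exactly as the kernel does; truncating f32 to its upper 16 bits serves as the bf16 encoder here, which ignores rounding:

```python
import struct

def uint_as_float(bits: int) -> float:
    # Reinterpret a 32-bit pattern as an IEEE-754 float (__uint_as_float).
    return struct.unpack("<f", struct.pack("<I", bits & 0xFFFFFFFF))[0]

def float_to_bf16_bits(x: float) -> int:
    # bf16 is simply the upper 16 bits of the f32 encoding (truncated).
    return struct.unpack("<I", struct.pack("<f", x))[0] >> 16

def to_float2_bf16(packed: int) -> tuple:
    # Mirrors the kernel: the low bf16 shifts into the upper 16 bits of
    # an f32; the high bf16 is already in the upper 16 bits after masking.
    x = uint_as_float((packed << 16) & 0xFFFFFFFF)
    y = uint_as_float(packed & 0xFFFF0000)
    return (x, y)

# Pack bf16(1.5) in the low half and bf16(-2.0) in the high half.
packed = float_to_bf16_bits(1.5) | (float_to_bf16_bits(-2.0) << 16)
assert to_float2_bf16(packed) == (1.5, -2.0)
```

Both 1.5 and -2.0 are exactly representable in bf16, so the round trip is bit-exact; values needing more than 8 mantissa bits would differ, which is fine for a conversion that only decodes existing bf16 storage.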
10 changes: 5 additions & 5 deletions include/flashinfer/mamba/create_tensor_map.cuh
@@ -12,10 +12,10 @@

namespace flashinfer::mamba::tma {

inline CUtensorMap buildNdDescriptor(std::type_info const& dtype,
std::vector<uint64_t> const& shapes,
std::vector<uint64_t> const& strides,
std::vector<int32_t> const& tileShapes, void* gmemAddr) {
inline CUtensorMap buildNdDescriptor(
std::type_info const& dtype, std::vector<uint64_t> const& shapes,
std::vector<uint64_t> const& strides, std::vector<int32_t> const& tileShapes, void* gmemAddr,
CUtensorMapFloatOOBfill oobFill = CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE) {
// The multiplication factor of the data padding in SMEM.
CUtensorMap desc{};
CUtensorMapDataType tmaDataFormat;
@@ -85,7 +85,7 @@ inline CUtensorMap buildNdDescriptor(std::type_info const& dtype,
boxDim.data(), tileStrides.data(),
/*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType,
/*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
/*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
/*oobFill=*/oobFill);

if (result != CUDA_SUCCESS) {
char const* errorString;
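The new trailing `oobFill` parameter defaults to the previous hard-coded value, so every existing call site compiles unchanged while new callers can opt into NaN fill for out-of-bounds reads. A toy Python sketch of the same backward-compatible pattern; `build_nd_descriptor` and its dict return value are illustrative stand-ins, not the real driver API, though the enum names mirror `CUtensorMapFloatOOBfill`:

```python
from enum import Enum

class OobFill(Enum):
    # Mirrors CUtensorMapFloatOOBfill from the CUDA driver API.
    NONE = 0                  # CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
    NAN_REQUEST_ZERO_FMA = 1  # CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA

def build_nd_descriptor(shapes, strides, tile_shapes, oob_fill=OobFill.NONE):
    """Toy stand-in for buildNdDescriptor: the new trailing parameter
    defaults to NONE, so pre-existing callers keep their old behavior."""
    return {"shapes": tuple(shapes), "tiles": tuple(tile_shapes),
            "oob_fill": oob_fill}

# Existing callers are untouched:
d = build_nd_descriptor([64, 128], [128, 1], [16, 32])
# New callers can request NaN fill for out-of-bounds tile elements:
d_nan = build_nd_descriptor([64, 128], [128, 1], [16, 32],
                            oob_fill=OobFill.NAN_REQUEST_ZERO_FMA)
```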