Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
f032fdf
init wip
jimmyzho Jan 7, 2026
e7b0da6
Merge branch 'flashinfer-ai:main' into fmhav2
jimmyzho Jan 8, 2026
17f8eef
move files
jimmyzho Jan 8, 2026
00bec06
Merge branch 'fmhav2' of github.com:jimmyzho/flashinfer into fmhav2
jimmyzho Jan 8, 2026
ef742fd
refactor module gen and compile
jimmyzho Jan 9, 2026
b6a3eaf
Merge branch 'flashinfer-ai:main' into fmhav2
jimmyzho Jan 9, 2026
b0227ef
(wip) compile flow and basic launcher
jimmyzho Jan 23, 2026
50de0e2
fp16 paged
jimmyzho Jan 26, 2026
8168009
fix paged tests
jimmyzho Jan 27, 2026
eaf1739
packed-qkv and separate-qkv
jimmyzho Jan 29, 2026
6dde167
sliding window off by one
jimmyzho Jan 30, 2026
3b2cc1e
attention sinks and refactor tests
jimmyzho Jan 30, 2026
b1f3fb7
kernel level done, next to enable it in python
zhou-yuxin Feb 5, 2026
b40f2fa
Merge branch 'flashinfer-ai:main' into fmhav2
jimmyzho Feb 5, 2026
39c911d
fp8 paged
jimmyzho Feb 5, 2026
da8852e
Merge branch 'fmhav2' of github.com:jimmyzho/flashinfer into fmhav2
jimmyzho Feb 5, 2026
65f2d56
pre-commit format
zhou-yuxin Feb 5, 2026
9a433ee
1. enum skip-softmax kernels in fmha_library.py;
zhou-yuxin Feb 5, 2026
2ea7730
fp8 unit tests, test refactor
jimmyzho Feb 6, 2026
0e5974f
Merge branch 'fmhav2' into add-skip-softmax
jimmyzho Feb 6, 2026
847af74
Merge pull request #1 from zhou-yuxin/add-skip-softmax
jimmyzho Feb 6, 2026
bb1739d
save-softmax, cleanup
jimmyzho Feb 6, 2026
a2bf401
save-softmax, cleanup
jimmyzho Feb 6, 2026
d0bd900
Merge branch 'fmhav2' of github.com:jimmyzho/flashinfer into fmhav2
jimmyzho Feb 6, 2026
03415b0
1. add skip_softmax_threshold_scale_factor;
zhou-yuxin Feb 11, 2026
7db51d1
Add bsz=1 and max_seqlen=16384 and skip_softmax_threshold_scale_facto…
bobboli Feb 13, 2026
063afc8
max seq len and fixes
jimmyzho Feb 13, 2026
ad4653b
Merge branch 'fmhav2' into add-skip-softmax
jimmyzho Feb 13, 2026
e8ebf7c
Merge pull request #2 from zhou-yuxin/add-skip-softmax
jimmyzho Feb 13, 2026
d56e2d3
style
jimmyzho Feb 14, 2026
6d86e4a
Merge branch 'fmhav2' of github.com:jimmyzho/flashinfer into fmhav2
jimmyzho Feb 14, 2026
3e13201
Split skip softmax test out of the base tests to reduce combos.
bobboli Feb 17, 2026
c05a4b9
Merge pull request #3 from bobboli/pr-2446
jimmyzho Feb 17, 2026
56cd4d0
adjust tol
jimmyzho Feb 17, 2026
d049a41
feat: add SM120 support for fmha_v2 flash attention kernels
blake-snc Feb 20, 2026
b5a13e1
cleanup, rm non_blocking, rm overhead
jimmyzho Feb 24, 2026
ae4aaed
Merge pull request #4 from blake-snc/fmhav2-sm120
jimmyzho Feb 24, 2026
c2b39fa
Merge branch 'flashinfer-ai:main' into fmhav2
jimmyzho Feb 25, 2026
d4b014a
cleanup, add chunked prefill chunked attention tests
jimmyzho Mar 4, 2026
53f5789
Merge remote-tracking branch 'origin/main' into fmhav2
jimmyzho Mar 4, 2026
bb1a87a
Only JIT the kernels that belong to the corresponding CUDA arch.
bobboli Mar 4, 2026
36db6d8
Refactor input_layout.
bobboli Mar 4, 2026
0e8fd66
Fix error.
bobboli Mar 4, 2026
c668515
fix function params, fix sm120
jimmyzho Mar 4, 2026
e8365c7
Merge branch 'main' into fmhav2
jimmyzho Mar 4, 2026
0d83dfc
Merge branch 'fmhav2' into pr-2446
jimmyzho Mar 4, 2026
a86f591
Refactor target SM version checks
jimmyzho Mar 4, 2026
ff1c5a7
kernel inclusion
jimmyzho Mar 4, 2026
f2936d3
delete target
jimmyzho Mar 4, 2026
feccb1a
Merge pull request #5 from bobboli/pr-2446
jimmyzho Mar 4, 2026
c778991
docstring, sm120 unit test fix
jimmyzho Mar 5, 2026
4cde977
skip hanging config, global workspace alloc testing
jimmyzho Mar 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 0 additions & 196 deletions csrc/fmha_v2/convert.cu

This file was deleted.

6 changes: 3 additions & 3 deletions csrc/fmha_v2/fmha/gmem_tile_o_packed.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ struct Hmma_gmem_tile_o {
//
// row_offset += binfo.bidx * VALID_BYTES_PER_ROW;
//
row_offset += binfo.bidx * valid_bytes_per_row;
row_offset += (int64_t)binfo.bidx * valid_bytes_per_row;

// Assemble the final pointer.
o_ptr_ += row_offset + col_in_bytes_;
Expand Down Expand Up @@ -753,7 +753,7 @@ struct Gmem_tile_o_8bit {
// The amount of bytes per row without padding (runtime).
int const valid_bytes_per_row = params.dv * BYTES_PER_ELEMENT;
// Take the batch/head offset into account.
row_offset += block_info.bidx * valid_bytes_per_row;
row_offset += (int64_t)block_info.bidx * valid_bytes_per_row;
// Assemble the final pointer.
o_ptr_ += row_offset + col_in_bytes_;

Expand Down Expand Up @@ -1088,7 +1088,7 @@ struct Gmem_tile_o_16bit {
// The amount of bytes per row without padding (runtime).
int const valid_bytes_per_row = params.dv * BYTES_PER_ELEMENT;
// Take the batch/head offset into account.
row_offset += block_info.bidx * valid_bytes_per_row;
row_offset += (int64_t)block_info.bidx * valid_bytes_per_row;
// Assemble the final pointer.
o_ptr_ += row_offset + col_in_bytes_;

Expand Down
6 changes: 3 additions & 3 deletions csrc/fmha_v2/fmha/gmem_tile_ps.h
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ struct Gmem_tile_ps {
int col = warp / Cta_tile::WARPS_M * Mma_tile::N_PER_MMA + lane % 4 * ELEMENTS_PER_STG;

// The offset of the 1st row written by the thread. We store the P matrix interleaved.
int64_t row_offset = (int64_t)row * params_stride_in_bytes_ + bidx * BYTES_PER_ROW;
int64_t row_offset = (int64_t)row * params_stride_in_bytes_ + (int64_t)bidx * BYTES_PER_ROW;
// Finalize the pointer.
ptr_ += row_offset + col * BYTES_PER_ELEMENT;
}
Expand Down Expand Up @@ -654,7 +654,7 @@ struct Gmem_tile_ps<Volta_hmma_fp16_traits, Cta_tile, 16> {

// The offset of the 1st row written by the thread. We store the P matrix interleaved.
int64_t row_offset =
(int64_t)row * params_stride_in_bytes_ + bidx * BYTES_PER_ROW + cta_row_offset;
(int64_t)row * params_stride_in_bytes_ + (int64_t)bidx * BYTES_PER_ROW + cta_row_offset;

// Finalize the pointer.
ptr_ += row_offset + col * BYTES_PER_ELEMENT;
Expand Down Expand Up @@ -760,7 +760,7 @@ struct Gmem_tile_ps_hopper {
int col = warpgroup_idx * Mma_tile::N_PER_MMA + lane % 4 * ELEMENTS_PER_STG;

// The offset of the 1st row written by the thread. We store the P matrix interleaved.
int64_t row_offset = (int64_t)row * params_stride_in_bytes_ + bidx * bytes_per_row;
int64_t row_offset = (int64_t)row * params_stride_in_bytes_ + (int64_t)bidx * bytes_per_row;
// Finalize the pointer.
ptr_ += row_offset + col * BYTES_PER_ELEMENT;
}
Expand Down
6 changes: 3 additions & 3 deletions csrc/fmha_v2/fmha/hopper/gmem_tile_o_packed.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ struct Gmem_tile_o_hopper_16bits {

// The offset of the 1st row written by the thread. We store the P matrix interleaved.
int64_t row_offset =
(int64_t)row_ * params_o_stride_in_bytes_ + block_info.bidx * BYTES_PER_ROW;
(int64_t)row_ * params_o_stride_in_bytes_ + (int64_t)block_info.bidx * BYTES_PER_ROW;
// Finalize the pointer.
o_ptr_ += row_offset + col * BYTES_PER_ELEMENT;
}
Expand Down Expand Up @@ -599,7 +599,7 @@ struct Gmem_tile_o_gmma_32bit_8bit {

// The offset of the 1st row written by the thread. We store the P matrix interleaved.
int64_t row_offset =
(int64_t)row_ * params_o_stride_in_bytes_ + block_info.bidx * BYTES_PER_ROW;
(int64_t)row_ * params_o_stride_in_bytes_ + (int64_t)block_info.bidx * BYTES_PER_ROW;
// Finalize the pointer.
o_ptr_ += row_offset + col_ * BYTES_PER_ELEMENT;
}
Expand Down Expand Up @@ -1065,7 +1065,7 @@ struct Gmem_tile_o_qgmma_fp32_16bits {

// The offset of the 1st row written by the thread. We store the P matrix interleaved.
int64_t row_offset =
(int64_t)row_ * params_o_stride_in_bytes_ + block_info.bidx * BYTES_PER_ROW;
(int64_t)row_ * params_o_stride_in_bytes_ + (int64_t)block_info.bidx * BYTES_PER_ROW;
// Finalize the pointer.
o_ptr_ += row_offset + col * BYTES_PER_ELEMENT;
}
Expand Down
7 changes: 4 additions & 3 deletions csrc/fmha_v2/fmha/kernel_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,14 @@ struct Kernel_traits_ {

// Compute the total BMM2_MMAS_K (might not the same as Mma_tile_o::MMAS_K if the granular tiling
// is used).
static_assert(S % CTA_O_TILE_K == 0, "");
// S=0 for flash attention (variable sequence length): tile counts are determined at runtime.
static_assert(S == 0 || S % CTA_O_TILE_K == 0, "");

enum { TOTAL_BMM2_MMAS_K = Mma_tile_o::MMAS_K * (S / CTA_O_TILE_K) };
enum { TOTAL_BMM2_MMAS_K = S == 0 ? 0 : Mma_tile_o::MMAS_K * (S / CTA_O_TILE_K) };

// Constraints on the K dimension.
static_assert(Mma_tile_p::K_PER_MMA <= static_cast<int>(D));
static_assert(Mma_tile_o::K_PER_MMA <= S);
static_assert(S == 0 || Mma_tile_o::K_PER_MMA <= S);

// The version.
enum { VERSION = VERSION_ };
Expand Down
50 changes: 46 additions & 4 deletions csrc/fmha_v2/fmha/warpspec/compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ struct Compute {
USE_CUSTOM_MASK ? (head_info.mask_sum_s + q_step_idx * STEP_Q + local_q_tile_offset) \
: (q_step_idx * STEP_Q + head_info.q_tile_offset), \
kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, \
kv_step_idx == kv_idx_end - 1);
&shared->skip_softmax_votes[kv_step_idx & 1][warpgroup_id], kv_step_idx == kv_idx_end - 1);

////////////////////////////////////////////////////////////////////////////////////////////////

Expand Down Expand Up @@ -277,6 +277,12 @@ struct Compute {
int const actual_kv_seqlen =
SEPARATE_Q_KV_BUFFER ? head_info.actual_kv_seqlen : actual_q_seqlen;

// Update threshold of Skip-Softmax
if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX) {
softmax.skip_softmax_threshold =
params.skip_softmax_threshold_scale_factor / actual_kv_seqlen;
}
Comment on lines +280 to +284
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟑 Minor

Guard skip-softmax threshold against zero-length KV.
If actual_kv_seqlen is 0, the division yields inf/NaN and can taint skip decisions.

πŸ›‘οΈ Proposed fix
     if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX) {
-      softmax.skip_softmax_threshold =
-          params.skip_softmax_threshold_scale_factor / actual_kv_seqlen;
+      int denom = actual_kv_seqlen > 0 ? actual_kv_seqlen : 1;
+      softmax.skip_softmax_threshold =
+          params.skip_softmax_threshold_scale_factor / denom;
     }
πŸ€– Prompt for AI Agents
In `@csrc/fmha_v2/fmha/warpspec/compute.h` around lines 280 - 284, Guard the
division by zero when computing softmax.skip_softmax_threshold: inside the
Kernel_traits::ENABLE_SKIP_SOFTMAX block check actual_kv_seqlen > 0 before doing
the division (using params.skip_softmax_threshold_scale_factor /
actual_kv_seqlen) and, if actual_kv_seqlen == 0, assign a safe non-NaN value
(e.g. std::numeric_limits<float>::max()) to softmax.skip_softmax_threshold;
include <limits> if needed and keep the change localized around the current
block with the same symbols (Kernel_traits::ENABLE_SKIP_SOFTMAX,
softmax.skip_softmax_threshold, params.skip_softmax_threshold_scale_factor,
actual_kv_seqlen).


// Calculate the alibi head_scaling_factor.
float alibi_head_scale = APPLY_ALIBI ? get_alibi_head_scaling_factor<AlibiParams>(
head_info.bidh, params.alibi_params)
Expand Down Expand Up @@ -411,6 +417,12 @@ struct Compute {
}
}
}
#ifdef SKIP_SOFTMAX_STAT
if (tidx == 0) {
atomicAdd(params.skip_softmax_total_blocks, softmax.total_blocks);
atomicAdd(params.skip_softmax_skipped_blocks, softmax.skipped_blocks);
}
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////
Expand All @@ -421,7 +433,14 @@ struct Compute {
float (&p_max)[Mma_tile_p::CORES_M], float (&p_sum)[Mma_tile_p::CORES_M], int const tidx,
int const actual_kv_seqlen, float const alibi_head_scale, int const row_offset,
int const col_offset, int const sage_scale_row, Circular_buffer_q_reader& cbr,
Circular_buffer_kv_reader& cbr_v, OrderedMutexAccessor& mutex, bool complete = false) {
Circular_buffer_kv_reader& cbr_v, OrderedMutexAccessor& mutex, uint32_t* skip_softmax_vote,
bool complete = false) {
// Skip-softmax vote initialization
if (tidx == 0) {
// Note that we need a named_barrier_wait in compute_single_tile to make sure init is before
// voting.
*skip_softmax_vote = 1;
}
Comment on lines +438 to +443
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Initialize skip-softmax votes per warpgroup, not only thread 0.
tidx == 0 only resets the vote for warpgroup 0; warpgroup 1 reads stale values and can skip incorrectly.

πŸ”§ Proposed fix
-    if (tidx == 0) {
+    if ((tidx % 128) == 0) {
       // Note that we need a named_barrier_wait in compute_single_tile to make sure init is before
       // voting.
       *skip_softmax_vote = 1;
     }
πŸ“ Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Skip-softmax vote initialization
if (tidx == 0) {
// Note that we need a named_barrier_wait in compute_single_tile to make sure init is before
// voting.
*skip_softmax_vote = 1;
}
// Skip-softmax vote initialization
if ((tidx % 128) == 0) {
// Note that we need a named_barrier_wait in compute_single_tile to make sure init is before
// voting.
*skip_softmax_vote = 1;
}
πŸ€– Prompt for AI Agents
In `@csrc/fmha_v2/fmha/warpspec/compute.h` around lines 438 - 443, The code
currently sets *skip_softmax_vote = 1 only when tidx == 0 which initializes the
vote for warpgroup 0 only; change the initialization to run once per warpgroup
by having each warp's leader initialize the vote (e.g., use the lane/warp check
instead of tidx == 0 β€” for example check laneId == 0 or (tidx & (WARP_SIZE-1))
== 0) so every warpgroup writes *skip_softmax_vote = 1 before voting; keep the
existing comment about needing a named_barrier_wait in compute_single_tile to
ensure ordering.

// load the scales of K/V from global memory
#define LOAD_SCALES_KV(dst, which, blocks_per_step, block_size) \
if constexpr (block_size > 0) { \
Expand Down Expand Up @@ -453,6 +472,10 @@ struct Compute {
// Ctile_p is only used once by each n step.
ctile_p.clear();

// If skip_softmax is enabled, make sure there is no racing between the initialization and
// writing of skip_softmax_vote.
named_barrier_wait(Kernel_traits::SKIP_SOFTMAX_BARRIER_ID + threadIdx.x / 128, 128);

// BMM1 (Q x K').
warpgroup_arrive();

Expand Down Expand Up @@ -513,8 +536,27 @@ struct Compute {
softmax.apply_alibi_and_mask<APPLY_MASK>(ctile_p, params.alibi_params, alibi_head_scale,
actual_kv_seqlen, row_offset, col_offset);

// Softmax Exp, max/sum, and update scales.
softmax.compute_and_update_scale<IS_FIRST_COL>(p_max, p_sum);
// Softmax Exp, max/sum, and update scales. If returns false we skip the rest.
if (!softmax.compute_and_update_scale<IS_FIRST_COL>(p_max, p_sum, skip_softmax_vote)) {
if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 1) {
// Notify another warpgroup to execute QGMMA.
mutex.named_bar_arrive();
}
// Need to wait V, otherwise compute-sanitizer synccheck will fail.
int ready2 = cbr_v.peek();
if (!ready2) {
cbr_v.wait();
}

#pragma unroll
// Advance V descriptor by the same amount as the BMM2 loop would,
// so that the descriptor stays in sync for subsequent KV steps.
for (int kbi = 0; kbi < BMM2_MMAS_K_GROUPS - 1; kbi++) {
ctile_o.increment_gmma_desc_group();
}

return;
}

// experiments show that here is the best place to load scales of V
float scales_v[SAGE_BLOCKS_PER_STEP_V];
Expand Down
Loading
Loading