Dao-AILab · tridao · Oct 24, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/flash_attn/cute/barrier.py b/flash_attn/cute/barrier.py
@@ -4,8 +4,9 @@
 from cutlass.cutlass_dsl import T, dsl_user_op
 from cutlass._mlir.dialects import llvm
 
+
 @dsl_user_op
-def ld_acquire(lock_ptr : cute.Pointer, *, loc=None, ip=None) -> cutlass.Int32:
+def ld_acquire(lock_ptr: cute.Pointer, *, loc=None, ip=None) -> cutlass.Int32:
     lock_ptr_i64 = lock_ptr.toint(loc=loc, ip=ip).ir_value()
     state = llvm.inline_asm(
         T.i32(),
@@ -18,8 +19,11 @@ def ld_acquire(lock_ptr : cute.Pointer, *, loc=None, ip=None) -> cutlass.Int32:
     )
     return cutlass.Int32(state)
 
+
 @dsl_user_op
-def red_relaxed(lock_ptr : cute.Pointer, val: cutlass.Constexpr[Int32], *, loc=None, ip=None) -> None:
+def red_relaxed(
+    lock_ptr: cute.Pointer, val: cutlass.Constexpr[Int32], *, loc=None, ip=None
+) -> None:
     lock_ptr_i64 = lock_ptr.toint(loc=loc, ip=ip).ir_value()
     llvm.inline_asm(
         None,
@@ -31,8 +35,11 @@ def red_relaxed(lock_ptr : cute.Pointer, val: cutlass.Constexpr[Int32], *, loc=N
         asm_dialect=llvm.AsmDialect.AD_ATT,
     )
 
+
 @dsl_user_op
-def red_release(lock_ptr : cute.Pointer, val: cutlass.Constexpr[Int32], *, loc=None, ip=None) -> None:
+def red_release(
+    lock_ptr: cute.Pointer, val: cutlass.Constexpr[Int32], *, loc=None, ip=None
+) -> None:
     lock_ptr_i64 = lock_ptr.toint(loc=loc, ip=ip).ir_value()
     llvm.inline_asm(
         None,
@@ -43,28 +50,22 @@ def red_release(lock_ptr : cute.Pointer, val: cutlass.Constexpr[Int32], *, loc=N
         is_align_stack=False,
         asm_dialect=llvm.AsmDialect.AD_ATT,
     )
-
+
+
 @cute.jit
-def wait_eq(
-    lock_ptr : cute.Pointer,
-    thread_idx : int | Int32,
-    flag_offset : int,
-    val : Int32
-) -> None:
+def wait_eq(lock_ptr: cute.Pointer, thread_idx: int | Int32, flag_offset: int, val: Int32) -> None:
     flag_ptr = lock_ptr + flag_offset
     if thread_idx == 0:
         read_val = Int32(0)
         while read_val != val:
             read_val = ld_acquire(flag_ptr)
 
+
 @cute.jit
 def arrive_inc(
-    lock_ptr : cute.Pointer,
-    thread_idx : int | Int32,
-    flag_offset : int,
-    val : cutlass.Constexpr[Int32]
+    lock_ptr: cute.Pointer, thread_idx: int | Int32, flag_offset: int, val: cutlass.Constexpr[Int32]
 ) -> None:
     flag_ptr = lock_ptr + flag_offset
     if thread_idx == 0:
         red_release(flag_ptr, val)
-        # red_relaxed(flag_ptr, val)
+        # red_relaxed(flag_ptr, val)
diff --git a/flash_attn/cute/benchmark_mask_mod.py b/flash_attn/cute/benchmark_mask_mod.py
@@ -5,7 +5,6 @@
 
 from dataclasses import dataclass
 import math
-from pickle import FALSE
 from typing import Any, Dict, Optional, Tuple
 
 import cuda.bindings.driver as cuda
@@ -51,7 +50,7 @@ class BenchmarkConfig:
     # Mask parameters
     use_mask_mod: bool = True
     mask_mod_name: str = "causal"
-    has_buffers: bool = mask_mod_name == "document"
+    has_aux_tensors: bool = mask_mod_name == "document"
 
     # Sliding window parameter (used when mask_mod_name == "sliding_window")
     window_size: int = 128
@@ -235,7 +234,6 @@ def _create_tensors(self) -> Dict[str, torch.Tensor]:
                 dtype=torch.float32,
                 device=device,
             )
-
 
             tensors = {
                 "q": q.contiguous(),
@@ -244,10 +242,10 @@ def _create_tensors(self) -> Dict[str, torch.Tensor]:
                 "out": out.contiguous(),
                 "lse": lse.contiguous(),
             }
-        
+
         if config.use_learnable_sink:
             learnable_sink = torch.rand(config.nheads, dtype=torch.bfloat16, device=device)
-            
+
             tensors["learnable_sink"] = learnable_sink.contiguous()
 
         # Compute block sparsity when using mask_mod
@@ -256,14 +254,14 @@ def _create_tensors(self) -> Dict[str, torch.Tensor]:
                 doc_id = random_doc_id_tensor(
                     config.batch_size, config.nheads, config.seqlen_q, device=device
                 )
-                tensors["buffers"] = [doc_id.contiguous()]
+                tensors["aux_tensors"] = [doc_id.contiguous()]
             full_cnt, full_idx, mask_cnt, mask_idx = compute_block_sparsity(
                 config=self.config,
                 mask_mod_flex=self.mask_mod_flex,
                 device=device,
                 cu_seqlens_q=tensors.get("cu_seqlens_q"),
                 cu_seqlens_k=tensors.get("cu_seqlens_k"),
-                buffers=tensors.get("buffers"),
+                aux_tensors=tensors.get("aux_tensors"),
             )
 
             if all(t is not None for t in [full_cnt, full_idx, mask_cnt, mask_idx]):
@@ -329,7 +327,7 @@ def _compile_kernel(self, tensors: Dict[str, torch.Tensor]) -> Tuple[Any, tuple]
             mma_pv_is_rs=config.mma_pv_is_rs,
             mask_mod=self.mask_mod_cute,
             Q_in_regs=False,
-            has_buffers=config.has_buffers,
+            has_aux_tensors=config.has_aux_tensors,
         )
 
         softmax_scale = 1.0 / math.sqrt(config.headdim)
@@ -405,14 +403,14 @@ def _compile_kernel(self, tensors: Dict[str, torch.Tensor]) -> Tuple[Any, tuple]
             else None
         )
 
-        if "buffers" in tensors:
-            buffers_cute = []
-            for i in range(len(tensors["buffers"])):
-                buf = from_dlpack(tensors["buffers"][i].detach(), assumed_align=4)
-                buffers_cute.append(buf.mark_layout_dynamic(leading_dim=2))
+        if "aux_tensors" in tensors:
+            aux_tensors_cute = []
+            for i in range(len(tensors["aux_tensors"])):
+                buf = from_dlpack(tensors["aux_tensors"][i].detach(), assumed_align=4)
+                aux_tensors_cute.append(buf.mark_layout_dynamic(leading_dim=2))
 
         else:
-            buffers_cute = None
+            aux_tensors_cute = None
 
         # Window parameters for is_local
         window_left_cute = (
@@ -443,7 +441,7 @@ def _compile_kernel(self, tensors: Dict[str, torch.Tensor]) -> Tuple[Any, tuple]
             full_block_idx_cute,
             mask_block_cnt_cute,
             mask_block_idx_cute,
-            buffers_cute,
+            aux_tensors_cute,
             # None,
         )
 
@@ -467,7 +465,7 @@ def _compile_kernel(self, tensors: Dict[str, torch.Tensor]) -> Tuple[Any, tuple]
             full_block_idx_cute,
             mask_block_cnt_cute,
             mask_block_idx_cute,
-            buffers_cute,
+            aux_tensors_cute,
             # None,
         )
 
@@ -496,7 +494,7 @@ def _calculate_flops(self, tensors: Dict[str, torch.Tensor]) -> float:
                 num_blocks = (config.seqlen_k + block_size - 1) // block_size
                 sparsity_ratio = 1.0 / num_blocks if num_blocks > 1 else 1.0
             elif config.mask_mod_name == "document":
-                vals = tensors["buffers"][0]
+                vals = tensors["aux_tensors"][0]
                 val_mask = torch.ones_like(vals, dtype=torch.bool)
                 val_mask[..., 1:] = vals[..., 1:] != vals[..., :-1]
                 total = torch.where(val_mask, vals.square(), 0).sum()
@@ -573,7 +571,7 @@ def benchmark(self) -> Dict[str, Any]:
             torch.cuda.synchronize()
 
             times.append(start.elapsed_time(end))
-        
+
         times_tensor = torch.tensor(times)
         mean_time = times_tensor.mean().item()
         std_time = times_tensor.std().item() if len(times) > 1 else 0.0
@@ -683,7 +681,7 @@ def _print_results(self, results: Dict[str, Any]):
         # seqlen_k=192,
         use_varlen=False,
         use_mask_mod=True,
-        mask_mod_name="identity",
+        mask_mod_name="causal",
         window_size=128,  # Configurable window size for mask_mod
         use_learnable_sink=False,
         causal=False,