fla-org · yzhangcs · Jun 26, 2025 · Jun 21, 2025 · Jun 21, 2025 · Jun 22, 2025
diff --git a/fla/ops/attn/parallel.py b/fla/ops/attn/parallel.py
@@ -22,7 +22,7 @@
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4] + ([8] if check_shared_mem('hopper') else [])
+        for num_warps in [2, 4] + ([8] if check_shared_mem('hopper') else [])
         for num_stages in [2, 3, 4, 5]
     ],
     key=['B', 'H', 'HQ', 'G', 'K', 'V', 'BK', 'BV', 'USE_G', 'IS_VARLEN'],
@@ -177,7 +177,7 @@ def parallel_attn_bwd_kernel_preprocess(
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4] + ([8] if check_shared_mem('hopper') else [])
+        for num_warps in [2, 4] + ([8] if check_shared_mem('hopper') else [])
         for num_stages in [2, 3, 4, 5]
     ],
     key=['B', 'H', 'HQ', 'G', 'K', 'V', 'BK', 'BV', 'USE_G', 'IS_VARLEN'],
@@ -319,7 +319,7 @@ def parallel_attn_bwd_kernel_dq(
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4] + ([8] if check_shared_mem('hopper') else [])
+        for num_warps in [2, 4] + ([8] if check_shared_mem('hopper') else [])
         for num_stages in [2, 3, 4, 5]
     ],
     key=['B', 'H', 'HQ', 'G', 'K', 'V', 'BK', 'BV', 'USE_G', 'IS_VARLEN'],

diff --git a/fla/ops/common/chunk_o.py b/fla/ops/common/chunk_o.py
@@ -22,11 +22,9 @@
 })
 @triton.autotune(
     configs=[
-        triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
-        for BK in BKV_LIST
-        for BV in BKV_LIST
-        for num_warps in NUM_WARPS
-        for num_stages in [2, 3, 4]
+        triton.Config({'BK': 128, 'BV': 128}, num_warps=8, num_stages=3),
+        triton.Config({'BK': 64, 'BV': 64}, num_warps=4, num_stages=3),
+        triton.Config({'BK': 32, 'BV': 32}, num_warps=2, num_stages=3),
     ],
     key=['H', 'K', 'V', 'BT'],
 )

diff --git a/fla/ops/common/fused_recurrent.py b/fla/ops/common/fused_recurrent.py
@@ -24,7 +24,7 @@
     ],
     key=['BK', 'BV', 'USE_G', 'USE_G_GAMMA', 'USE_GK', 'USE_GV'],
 )
-@triton.jit(do_not_specialize=['T'])
+@triton.jit(do_not_specialize=['B', 'T'])
 def fused_recurrent_fwd_kernel(
     q,
     k,
@@ -38,8 +38,8 @@ def fused_recurrent_fwd_kernel(
     ht,
     cu_seqlens,
     scale,
+    B,
     T,
-    B: tl.constexpr,
     H: tl.constexpr,
     K: tl.constexpr,
     V: tl.constexpr,
@@ -137,7 +137,7 @@ def fused_recurrent_fwd_kernel(
     ],
     key=['BK', 'BV', 'USE_G', 'USE_G_GAMMA', 'USE_GK', 'USE_GV'],
 )
-@triton.jit(do_not_specialize=['T'])
+@triton.jit(do_not_specialize=['B', 'T'])
 def fused_recurrent_bwd_kernel(
     q,
     k,
@@ -156,8 +156,8 @@ def fused_recurrent_bwd_kernel(
     dh0,
     cu_seqlens,
     scale,
+    B,
     T,
-    B: tl.constexpr,
     H: tl.constexpr,
     K: tl.constexpr,
     V: tl.constexpr,
@@ -320,7 +320,7 @@ def fused_recurrent_fwd(
 ):
     B, T, H, K, V = *k.shape, v.shape[-1]
     N = B if cu_seqlens is None else len(cu_seqlens) - 1
-    BK, BV = min(K, 64), min(V, 64)
+    BK, BV = min(triton.next_power_of_2(K), 64), min(triton.next_power_of_2(V), 64)
     NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
 
     h0 = initial_state
@@ -377,7 +377,7 @@ def fused_recurrent_bwd(
     B, T, H, K, V = *k.shape, v.shape[-1]
     N = B if cu_seqlens is None else len(cu_seqlens) - 1
 
-    BK, BV = min(K, 64), min(V, 64)
+    BK, BV = min(triton.next_power_of_2(K), 64), min(triton.next_power_of_2(V), 64)
     NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
 
     h0 = initial_state

diff --git a/fla/ops/utils/cumsum.py b/fla/ops/utils/cumsum.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
-import warnings
 from typing import Optional
 
 import torch
@@ -165,7 +164,7 @@ def chunk_global_cumsum_scalar_kernel(
     b_z = tl.zeros([], dtype=tl.float32)
     NT = tl.cdiv(T, BT)
     for i_c in range(NT):
-        i_t = NT-1-i_c if REVERSE else i_c
+        i_t = NT - 1 - i_c if REVERSE else i_c
         if HEAD_FIRST:
             p_s = tl.make_block_ptr(s + bos*H + i_h*T, (T,), (1,), (i_t * BT,), (BT,), (0,))
             p_o = tl.make_block_ptr(o + bos*H + i_h*T, (T,), (1,), (i_t * BT,), (BT,), (0,))
@@ -232,7 +231,7 @@ def chunk_global_cumsum_vector_kernel(
     b_z = tl.zeros([BS], dtype=tl.float32)
     NT = tl.cdiv(T, BT)
     for i_c in range(NT):
-        i_t = NT-1-i_c if REVERSE else i_c
+        i_t = NT - 1 - i_c if REVERSE else i_c
         if HEAD_FIRST:
             p_s = tl.make_block_ptr(s + (bos * H + i_h*T)*S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0))
             p_o = tl.make_block_ptr(o + (bos * H + i_h*T)*S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0))
@@ -245,8 +244,7 @@ def chunk_global_cumsum_vector_kernel(
         if HAS_SCALE:
             b_c *= scale
         tl.store(p_o, b_c.to(p_o.dtype.element_ty), boundary_check=(0, 1))
-        if i_c >= 0:
-            b_z += tl.sum(b_s, 0)
+        b_z += tl.sum(b_s, 0)
 
 
 def chunk_local_cumsum_scalar(
@@ -437,13 +435,6 @@ def chunk_local_cumsum(
     output_dtype: Optional[torch.dtype] = torch.float,
     **kwargs
 ) -> torch.Tensor:
-    if not head_first and g.shape[1] < g.shape[2]:
-        warnings.warn(
-            f"Input tensor shape suggests potential format mismatch: seq_len ({g.shape[1]}) < num_heads ({g.shape[2]}). "
-            "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
-            "when `head_first=False` was specified. "
-            "Please verify your input tensor format matches the expected shape [B, T, H, ...]."
-        )
     if cu_seqlens is not None:
         assert g.shape[0] == 1, "Only batch size 1 is supported when cu_seqlens are provided"
     if len(g.shape) == 3:

diff --git a/tests/models/test_modeling_abc.py b/tests/models/test_modeling_abc.py
@@ -11,25 +11,47 @@
 # ===================================================================================
 # Test for Modeling (Forward/Backward Pass)
 # ===================================================================================
-@pytest.mark.parametrize("L", [4])
-@pytest.mark.parametrize("B", [4])
-@pytest.mark.parametrize("T", [1024])
-@pytest.mark.parametrize("H", [4])
-@pytest.mark.parametrize("D", [64, 128])
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.parametrize("use_l2warp", [True, False])
-def test_modeling(L, B, T, H, D, dtype, use_l2warp):
+@pytest.mark.parametrize(
+    ['L', 'B', 'T', 'H', 'D', 'use_l2warp', 'dtype'],  # same order as the tuples below
+    [
+        pytest.param(*test, id="L{}-B{}-T{}-H{}-D{}-use_l2warp{}-{}".format(*test))
+        for test in [
+            (4, 4, 1024, 4, 64, True, torch.bfloat16),
+            (4, 4, 1024, 4, 64, False, torch.bfloat16),
+            (4, 4, 1024, 4, 128, False, torch.bfloat16),
+        ]
+    ]
+)
+def test_modeling(
+    L: int,
+    B: int,
+    T: int,
+    H: int,
+    D: int,
+    dtype: torch.dtype,
+    use_l2warp: bool,
+):
     run_test_model_forward_backward(L, B, T, H, D, ABCConfig, dtype, use_l2warp)
 
 
 # ===================================================================================
 # Test for Generation
 # ===================================================================================
-@pytest.mark.parametrize("L", [2])
-@pytest.mark.parametrize("B", [4])
-@pytest.mark.parametrize("T", [4000])
-@pytest.mark.parametrize("H", [8])
-@pytest.mark.parametrize("D", [64])
-@pytest.mark.parametrize("dtype", [torch.float16])
-def test_generation(L, B, T, H, D, dtype):
+@pytest.mark.parametrize(
+    ['L', 'B', 'T', 'H', 'D', 'dtype'],
+    [
+        pytest.param(*test, id="L{}-B{}-T{}-H{}-D{}-{}".format(*test))
+        for test in [
+            (2, 4, 2000, 8, 64, torch.float16),
+        ]
+    ]
+)
+def test_generation(
+    L: int,
+    B: int,
+    T: int,
+    H: int,
+    D: int,
+    dtype: torch.dtype,
+):
     run_test_generation(L, B, T, H, D, ABCConfig, dtype)
diff --git a/tests/models/test_modeling_bitnet.py b/tests/models/test_modeling_bitnet.py
@@ -11,25 +11,47 @@
 # ===================================================================================
 # Test for Modeling (Forward/Backward Pass)
 # ===================================================================================
-@pytest.mark.parametrize("L", [4])
-@pytest.mark.parametrize("B", [4])
-@pytest.mark.parametrize("T", [1024])
-@pytest.mark.parametrize("H", [4])
-@pytest.mark.parametrize("D", [64, 128])
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.parametrize("use_l2warp", [True, False])
-def test_modeling(L, B, T, H, D, dtype, use_l2warp):
+@pytest.mark.parametrize(
+    ['L', 'B', 'T', 'H', 'D', 'use_l2warp', 'dtype'],
+    [
+        pytest.param(*test, id="L{}-B{}-T{}-H{}-D{}-use_l2warp{}-{}".format(*test))
+        for test in [
+            (4, 4, 1024, 4, 64, True, torch.bfloat16),
+            (4, 4, 1024, 4, 64, False, torch.bfloat16),
+            (4, 4, 1024, 4, 128, False, torch.bfloat16),
+        ]
+    ]
+)
+def test_modeling(
+    L: int,
+    B: int,
+    T: int,
+    H: int,
+    D: int,
+    dtype: torch.dtype,
+    use_l2warp: bool,
+):
     run_test_model_forward_backward(L, B, T, H, D, BitNetConfig, dtype, use_l2warp)
 
 
 # ===================================================================================
 # Test for Generation
 # ===================================================================================
-@pytest.mark.parametrize("L", [2])
-@pytest.mark.parametrize("B", [4])
-@pytest.mark.parametrize("T", [4000])
-@pytest.mark.parametrize("H", [8])
-@pytest.mark.parametrize("D", [64])
-@pytest.mark.parametrize("dtype", [torch.float16])
-def test_generation(L, B, T, H, D, dtype):
+@pytest.mark.parametrize(
+    ['L', 'B', 'T', 'H', 'D', 'dtype'],
+    [
+        pytest.param(*test, id="L{}-B{}-T{}-H{}-D{}-{}".format(*test))
+        for test in [
+            (2, 4, 2000, 8, 64, torch.float16),
+        ]
+    ]
+)
+def test_generation(
+    L: int,
+    B: int,
+    T: int,
+    H: int,
+    D: int,
+    dtype: torch.dtype,
+):
     run_test_generation(L, B, T, H, D, BitNetConfig, dtype)
diff --git a/tests/models/test_modeling_comba.py b/tests/models/test_modeling_comba.py
@@ -11,25 +11,47 @@
 # ===================================================================================
 # Test for Modeling (Forward/Backward Pass)
 # ===================================================================================
-@pytest.mark.parametrize("L", [4])
-@pytest.mark.parametrize("B", [4])
-@pytest.mark.parametrize("T", [1024])
-@pytest.mark.parametrize("H", [4])
-@pytest.mark.parametrize("D", [64, 128])
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.parametrize("use_l2warp", [True, False])
-def test_modeling(L, B, T, H, D, dtype, use_l2warp):
+@pytest.mark.parametrize(
+    ['L', 'B', 'T', 'H', 'D', 'use_l2warp', 'dtype'],
+    [
+        pytest.param(*test, id="L{}-B{}-T{}-H{}-D{}-use_l2warp{}-{}".format(*test))
+        for test in [
+            (4, 4, 1024, 4, 64, True, torch.bfloat16),
+            (4, 4, 1024, 4, 64, False, torch.bfloat16),
+            (4, 4, 1024, 4, 128, False, torch.bfloat16),
+        ]
+    ]
+)
+def test_modeling(
+    L: int,
+    B: int,
+    T: int,
+    H: int,
+    D: int,
+    dtype: torch.dtype,
+    use_l2warp: bool,
+):
     run_test_model_forward_backward(L, B, T, H, D, CombaConfig, dtype, use_l2warp)
 
 
 # ===================================================================================
 # Test for Generation
 # ===================================================================================
-@pytest.mark.parametrize("L", [2])
-@pytest.mark.parametrize("B", [4])
-@pytest.mark.parametrize("T", [4000])
-@pytest.mark.parametrize("H", [8])
-@pytest.mark.parametrize("D", [64])
-@pytest.mark.parametrize("dtype", [torch.float16])
-def test_generation(L, B, T, H, D, dtype):
+@pytest.mark.parametrize(
+    ['L', 'B', 'T', 'H', 'D', 'dtype'],
+    [
+        pytest.param(*test, id="L{}-B{}-T{}-H{}-D{}-{}".format(*test))
+        for test in [
+            (2, 4, 2000, 8, 64, torch.float16),
+        ]
+    ]
+)
+def test_generation(
+    L: int,
+    B: int,
+    T: int,
+    H: int,
+    D: int,
+    dtype: torch.dtype,
+):
     run_test_generation(L, B, T, H, D, CombaConfig, dtype)
diff --git a/tests/models/test_modeling_deltanet.py b/tests/models/test_modeling_deltanet.py
@@ -11,25 +11,47 @@
 # ===================================================================================
 # Test for Modeling (Forward/Backward Pass)
 # ===================================================================================
-@pytest.mark.parametrize("L", [4])
-@pytest.mark.parametrize("B", [4])
-@pytest.mark.parametrize("T", [1024])
-@pytest.mark.parametrize("H", [4])
-@pytest.mark.parametrize("D", [64, 128])
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.parametrize("use_l2warp", [True, False])
-def test_modeling(L, B, T, H, D, dtype, use_l2warp):
+@pytest.mark.parametrize(
+    ['L', 'B', 'T', 'H', 'D', 'use_l2warp', 'dtype'],
+    [
+        pytest.param(*test, id="L{}-B{}-T{}-H{}-D{}-use_l2warp{}-{}".format(*test))
+        for test in [
+            (4, 4, 1024, 4, 64, True, torch.bfloat16),
+            (4, 4, 1024, 4, 64, False, torch.bfloat16),
+            (4, 4, 1024, 4, 128, False, torch.bfloat16),
+        ]
+    ]
+)
+def test_modeling(
+    L: int,
+    B: int,
+    T: int,
+    H: int,
+    D: int,
+    dtype: torch.dtype,
+    use_l2warp: bool,
+):
     run_test_model_forward_backward(L, B, T, H, D, DeltaNetConfig, dtype, use_l2warp)
 
 
 # ===================================================================================
 # Test for Generation
 # ===================================================================================
-@pytest.mark.parametrize("L", [2])
-@pytest.mark.parametrize("B", [4])
-@pytest.mark.parametrize("T", [4000])
-@pytest.mark.parametrize("H", [8])
-@pytest.mark.parametrize("D", [64])
-@pytest.mark.parametrize("dtype", [torch.float16])
-def test_generation(L, B, T, H, D, dtype):
+@pytest.mark.parametrize(
+    ['L', 'B', 'T', 'H', 'D', 'dtype'],
+    [
+        pytest.param(*test, id="L{}-B{}-T{}-H{}-D{}-{}".format(*test))
+        for test in [
+            (2, 4, 2000, 8, 64, torch.float16),
+        ]
+    ]
+)
+def test_generation(
+    L: int,
+    B: int,
+    T: int,
+    H: int,
+    D: int,
+    dtype: torch.dtype,
+):
     run_test_generation(L, B, T, H, D, DeltaNetConfig, dtype)