fla-org · yzhangcs · Apr 7, 2025 · Apr 7, 2025 · Apr 7, 2025 · Apr 7, 2025
diff --git a/fla/ops/abc/chunk.py b/fla/ops/abc/chunk.py
@@ -1082,16 +1082,16 @@ def chunk_abc(
     s: torch.Tensor,
     initial_state: Optional[Tuple[torch.Tensor]] = None,
     output_final_state: bool = False,
-    head_first: bool = True
+    head_first: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     r"""
     Args:
         q (torch.Tensor):
-            queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`
+            queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`
         k (torch.Tensor):
-            keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`
+            keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`
         v (torch.Tensor):
-            values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`
+            values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`
         s (torch.Tensor):
-            slot representations of shape `[B, H, T, M]` if `head_first=True` else `[B, T, H, M]`
+            slot representations of shape `[B, T, H, M]` if `head_first=False` else `[B, H, T, M]`
         initial_state (Optional[Tuple[torch.Tensor, torch.Tensor]]):
@@ -1100,11 +1100,11 @@ def chunk_abc(
             Whether to output the final state of shape `[B, H, K, M]` and `[B, H, M, V]`. Default: `False`.
         head_first (Optional[bool]):
             Whether the inputs are in the head-first format.
-            Default: `True`.
+            Default: `False`.
 
     Returns:
         o (torch.Tensor):
-            Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
+            Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
         final_state (torch.Tensor):
             Final state of shape `[B, H, K, M]` and `[B, H, M, V]` if `output_final_state=True` else `None`.
     """

diff --git a/fla/ops/based/fused_chunk.py b/fla/ops/based/fused_chunk.py
@@ -359,7 +359,7 @@ def fused_chunk_based(
     v: torch.Tensor,
     scale: Optional[float] = None,
     use_norm: bool = True,
-    head_first: bool = True
+    head_first: bool = False
 ):
     assert q.shape[-1] <= 16, 'only support feature dimension up to 16.'
     if scale is None:

diff --git a/fla/ops/based/parallel.py b/fla/ops/based/parallel.py
@@ -395,7 +395,7 @@ def parallel_based(
     v: torch.Tensor,
     scale: Optional[float] = None,
     use_norm: bool = True,
-    head_first: bool = True
+    head_first: bool = False
 ):
     assert q.shape[-1] <= 128, "only support feature dim up to 128"
     if scale is None:

diff --git a/fla/ops/common/chunk_delta_h.py b/fla/ops/common/chunk_delta_h.py
@@ -424,9 +424,9 @@ def chunk_gated_delta_rule_fwd_h(
     output_final_state: bool = False,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
     chunk_size: int = 64,  # SY: remove this argument and force chunk size 64?
-    save_new_value: bool = True
+    save_new_value: bool = True,
+    head_first: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     if head_first:
         B, H, T, K, V = *k.shape, u.shape[-1]
@@ -490,8 +490,8 @@ def chunk_gated_delta_rule_bwd_dhu(
     scale: float,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
-    chunk_size: int = 64  # SY: remove this argument and force chunk size 64?
+    chunk_size: int = 64,  # SY: remove this argument and force chunk size 64?
+    head_first: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     if head_first:
         B, H, T, K, V = *q.shape, do.shape[-1]

diff --git a/fla/ops/common/chunk_h.py b/fla/ops/common/chunk_h.py
@@ -302,7 +302,7 @@ def chunk_fwd_h(
     h0: torch.Tensor,
     output_final_state: bool,
     offsets: Optional[torch.Tensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64,
     split_size: Optional[int] = None,
     states_in_fp32: bool = False
@@ -364,7 +364,7 @@ def chunk_bwd_dh(
     dht: torch.Tensor,
     scale: float,
     offsets: Optional[torch.Tensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64,
     split_size: Optional[int] = None,
     states_in_fp32: bool = False

diff --git a/fla/ops/common/chunk_h_parallel.py b/fla/ops/common/chunk_h_parallel.py
@@ -488,7 +488,7 @@ def chunk_fwd_h(
     states_in_fp32: bool = False,
     offsets: Optional[torch.Tensor] = None,
     indices: Optional[torch.Tensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     if head_first:
@@ -569,7 +569,7 @@ def chunk_bwd_dh(
     states_in_fp32: bool = False,
     offsets: Optional[torch.Tensor] = None,
     indices: Optional[torch.Tensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     if head_first:

diff --git a/fla/ops/common/chunk_h_split.py b/fla/ops/common/chunk_h_split.py
@@ -498,7 +498,7 @@ def chunk_fwd_h(
     offsets: Optional[torch.LongTensor] = None,
     split_offsets: Optional[torch.LongTensor] = None,
     split_indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64,
     split_size: int = 256,
     states_in_fp32: bool = True
@@ -590,7 +590,7 @@ def chunk_bwd_dh(
     offsets: Optional[torch.Tensor] = None,
     split_offsets: Optional[torch.Tensor] = None,
     split_indices: Optional[torch.Tensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64,
     split_size: int = 256,
     states_in_fp32: bool = True

diff --git a/fla/ops/common/chunk_o.py b/fla/ops/common/chunk_o.py
@@ -462,7 +462,7 @@ def chunk_fwd_o(
     scale: Optional[float] = None,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ) -> torch.Tensor:
     if head_first:
@@ -506,7 +506,7 @@ def chunk_bwd_dv(
     scale: float,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ) -> torch.Tensor:
     if head_first:
@@ -559,7 +559,7 @@ def chunk_bwd_dv_local(
     scale: float,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ) -> torch.Tensor:
     if head_first:
@@ -615,7 +615,7 @@ def chunk_bwd_dqkwg(
     indices: Optional[torch.LongTensor] = None,
     chunk_size: int = 64,
     scale: float = 1.0,
-    head_first: bool = True,
+    head_first: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 
     if head_first:

diff --git a/fla/ops/common/fused_recurrent.py b/fla/ops/common/fused_recurrent.py
@@ -332,7 +332,7 @@ def fused_recurrent_fwd(
     output_final_state: bool = False,
     reverse: bool = False,
     offsets: Optional[torch.LongTensor] = None,
-    head_first: bool = True
+    head_first: bool = False
 ):
     if head_first:
         B, H, T, K, V = *k.shape, v.shape[-1]
@@ -393,7 +393,7 @@ def fused_recurrent_bwd(
     initial_state: Optional[torch.Tensor] = None,
     reverse: bool = False,
     offsets: Optional[torch.LongTensor] = None,
-    head_first: bool = True
+    head_first: bool = False
 ):
     if head_first:
         B, H, T, K, V = *k.shape, v.shape[-1]
@@ -487,7 +487,7 @@ def forward(
         output_final_state: bool = False,
         reverse: bool = False,
         offsets: Optional[torch.LongTensor] = None,
-        head_first: bool = True
+        head_first: bool = False
     ):
         o, ht = fused_recurrent_fwd(
             q=q,
@@ -555,7 +555,7 @@ def fused_recurrent(
     output_final_state: bool = False,
     reverse: bool = False,
     cu_seqlens: Optional[torch.LongTensor] = None,
-    head_first: bool = True
+    head_first: bool = False
 ):
     if scale is None:
         scale = k.shape[-1] ** -0.5

diff --git a/fla/ops/delta_rule/chunk.py b/fla/ops/delta_rule/chunk.py
@@ -25,7 +25,7 @@ def chunk_delta_rule_fwd(
     output_final_state: bool,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ):
     T = q.shape[2] if head_first else q.shape[1]
@@ -80,7 +80,7 @@ def chunk_delta_rule_bwd(
     dht: torch.Tensor,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ):
     T = q.shape[2] if head_first else q.shape[1]
@@ -181,7 +181,7 @@ def forward(
         initial_state: torch.Tensor,
         output_final_state: bool,
         offsets: Optional[torch.LongTensor] = None,
-        head_first: bool = True,
+        head_first: bool = False,
         use_qk_l2norm_in_kernel: bool = True
     ):
         T = q.shape[2] if head_first else q.shape[1]

diff --git a/fla/ops/delta_rule/fused_recurrent.py b/fla/ops/delta_rule/fused_recurrent.py
@@ -301,7 +301,7 @@ def fused_recurrent_delta_rule_fwd(
     initial_state: torch.Tensor,
     output_final_state: bool,
     offsets: Optional[torch.LongTensor] = None,
-    head_first: bool = True
+    head_first: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     if head_first:
         B, H, T, K, V = *k.shape, v.shape[-1]
@@ -359,7 +359,7 @@ def fused_recurrent_delta_rule_bwd(
     scale: float,
     initial_state: torch.Tensor,
     offsets: Optional[torch.LongTensor] = None,
-    head_first: bool = True
+    head_first: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     if head_first:
         B, H, T, K, V = *k.shape, v.shape[-1]
@@ -438,7 +438,7 @@ def forward(
         initial_state: torch.Tensor,
         output_final_state: bool,
         offsets: Optional[torch.LongTensor] = None,
-        head_first: bool = True,
+        head_first: bool = False,
         use_qk_l2norm_in_kernel: bool = False
     ):
         q_orig = q
@@ -501,7 +501,7 @@ def fused_recurrent_delta_rule(
     initial_state: torch.Tensor = None,
     output_final_state: bool = False,
     cu_seqlens: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     use_qk_l2norm_in_kernel: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     r"""

diff --git a/fla/ops/delta_rule/parallel.py b/fla/ops/delta_rule/parallel.py
@@ -305,18 +305,18 @@ def parallel_delta_rule(
     beta: torch.Tensor,
     scale: float = None,
     output_attentions: bool = False,
-    head_first: bool = True
+    head_first: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     r"""
     Args:
         q (torch.Tensor):
-            queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
+            queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
         k (torch.Tensor):
-            keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
+            keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
         v (torch.Tensor):
-            values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
+            values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
         beta (torch.Tensor):
-            betas of shape `[B, H, T]` if `head_first=True` else `[B, T, H]`.
+            betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
         scale (Optional[int]):
             Scale factor for attention scores.
             If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
@@ -328,7 +328,7 @@ def parallel_delta_rule(
 
     Returns:
         o (torch.Tensor):
-            Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
+            Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
         attn (torch.Tensor):
             Attention scores of shape `[B, H, T, T]` if `output_attentions=True` else `None`.
     """

diff --git a/fla/ops/gated_delta_rule/chunk.py b/fla/ops/gated_delta_rule/chunk.py
@@ -26,7 +26,7 @@ def chunk_gated_delta_rule_fwd(
     output_final_state: bool,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ):
     g = chunk_local_cumsum(g, chunk_size, offsets=offsets, indices=indices, head_first=head_first)
@@ -85,7 +85,7 @@ def chunk_gated_delta_rule_bwd(
     dht: torch.Tensor,
     offsets: Optional[torch.LongTensor] = None,
     indices: Optional[torch.LongTensor] = None,
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ):
     T = q.shape[2] if head_first else q.shape[1]
@@ -193,7 +193,7 @@ def forward(
         initial_state: torch.Tensor,
         output_final_state: bool,
         offsets: Optional[torch.LongTensor] = None,
-        head_first: bool = True,
+        head_first: bool = False,
         use_qk_l2norm_in_kernel: bool = False
     ):
         chunk_size = 64

diff --git a/fla/ops/gated_delta_rule/wy_fast.py b/fla/ops/gated_delta_rule/wy_fast.py
@@ -327,7 +327,7 @@ def fwd_prepare_wy_repr(
     beta: torch.Tensor,
     offsets: Optional[torch.LongTensor],
     indices: Optional[torch.LongTensor],
-    head_first: bool = True,
+    head_first: bool = False,
     chunk_size: int = 64
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     if head_first: