From 2073c20c0dfeb1f9823d7cf1d2b98345b2ca0ab6 Mon Sep 17 00:00:00 2001
From: Michael <micmelesse@gmail.com>
Date: Tue, 29 Apr 2025 11:40:10 -0500
Subject: [PATCH 1/8] fp8 stuff

find test case

compute delta fp8

basic fp8 config passing

non causal path works
---
 .../bwd_prefill_onekernel.py                  | 192 +++++++++++++-----
 .../flash_attn_triton_amd/interface_fa.py     |  20 +-
 flash_attn/flash_attn_triton_amd/test.py      |   2 +-
 3 files changed, 158 insertions(+), 56 deletions(-)

diff --git a/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py b/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
index 62b5f3d0213..03386a0f8c3 100644
--- a/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
+++ b/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
@@ -2,8 +2,8 @@
 import triton # type: ignore
 import triton.language as tl # type: ignore
 from typing import Literal, Optional
-from .utils import AUTOTUNE, DROPOUT_USE_PYTORCH, DROPOUT_DUMP, get_shapes_from_layout, compute_fp8_scaling_factors, \
-    get_strides_from_layout, create_dropout_mask, create_dropout_mask_varlen, is_cdna, is_rdna
+from .utils import DEBUG, AUTOTUNE, DROPOUT_USE_PYTORCH, DROPOUT_DUMP, get_shapes_from_layout, compute_fp8_scaling_factors, \
+    get_strides_from_layout, create_dropout_mask, create_dropout_mask_varlen, is_cdna, is_fp8, is_rdna
 
 # NOTE: triton fails to import tl.constexprs so create them here for the file
 tl_DROPOUT_USE_PYTORCH: tl.constexpr = triton.language.constexpr(DROPOUT_USE_PYTORCH)
@@ -461,12 +461,14 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
     stride_deltab, stride_deltah, stride_deltam,
     stride_dob, stride_doh, stride_dom, stride_dod,
     stride_dropoutb, stride_dropouth, stride_dropoutm, stride_dropoutn,
+    stride_descale_q_z, stride_descale_k_z, stride_descale_v_z, stride_descale_do_z,
     stride_az, stride_ah,
     HQ, HK,
     cu_seqlens_q, cu_seqlens_k,
     max_seqlen_q, max_seqlen_k,
     Dropout_mask, dropout_p, philox_seed, philox_offset_base,
     Alibi_slopes,
+    Descale_q, Descale_k, Descale_v, Descale_do,
     BLOCK_M1: tl.constexpr,
     BLOCK_N1: tl.constexpr,
     BLOCK_M2: tl.constexpr,
@@ -478,6 +480,9 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
     IS_VARLEN: tl.constexpr,
     USE_ALIBI: tl.constexpr,
     USE_EXP2: tl.constexpr,
+    IS_FP8: tl.constexpr,
+    FP8_MAX: tl.constexpr,
+    FP8_OUTPUT: tl.constexpr,
     DEBUG_TRITON: tl.constexpr,
     DEBUG_TRITON_DETAIL: tl.constexpr,
 ):
@@ -533,17 +538,15 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
         # Mask for loading K and V
         mask_kv = offs_n[:, None] < seqlen_k
         if PADDED_HEAD:
-            mask_k = offs_d < ACTUAL_HEAD_DIM
-            mask_kv &= mask_k[None, :]
-        offs_k = offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd
-        offs_v = offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd
+            mask_d = offs_d < ACTUAL_HEAD_DIM
+            mask_kv &= mask_d[None, :]
 
         # K/V tensors not changed for the group
-        adj_k = bid * stride_kb + hkid * stride_kh + k_start * stride_kn
-        adj_v = bid * stride_vb + hkid * stride_vh + k_start * stride_vn
+        adj_k = bid * stride_kb + hkid * stride_kh + k_start * stride_kn + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd
+        adj_v = bid * stride_vb + hkid * stride_vh + k_start * stride_vn + offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd
         # load K and V: they stay in SRAM throughout the inner loop.
-        k = tl.load(K + adj_k + offs_k, mask=mask_kv, other=0.0)
-        v = tl.load(V + adj_v + offs_v, mask=mask_kv, other=0.0)
+        k = tl.load(K + adj_k, mask=mask_kv, other=0.0)
+        v = tl.load(V + adj_v, mask=mask_kv, other=0.0)
         # If MQA / GQA, set the K and V head offsets appropriately.
         # hqid = hkid
         for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE):
@@ -586,6 +589,14 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
                 dropout_offset = Dropout_mask + bid * stride_dropoutb + \
                                 hqid * stride_dropouth
 
+            if IS_FP8:
+                descale_q = tl.load(Descale_q + bid * stride_descale_q_z + hqid)
+                descale_k = tl.load(Descale_k + bid * stride_descale_k_z + hkid)
+                descale_v = tl.load(Descale_v + bid * stride_descale_v_z + hkid)
+                descale_do = tl.load(Descale_do + bid * stride_descale_do_z + hqid)
+            else:
+                descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0
+
             MASK_BLOCK_M1: tl.constexpr = BLOCK_M1 // BLK_SLICE_FACTOR
             # bound the masked operation to q len so it does not have to wast cycles
             len_m = min(len_m, seqlen_q)
@@ -610,13 +621,13 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
                 alibi_slope,
                 seqlen_q, seqlen_k,  # max sequence length for q and k
                 start_n, start_m, num_steps,  # iteration numbers
-                None, None, None, None,
+                descale_q, descale_k, descale_v, descale_do,
                 MASK=True,  # causal masking
                 ENABLE_DROPOUT=ENABLE_DROPOUT,  # activate dropout
                 USE_ALIBI=USE_ALIBI,
                 USE_EXP2=USE_EXP2,
-                IS_FP8=False,
-                FP8_MAX=None,
+                IS_FP8=IS_FP8,
+                FP8_MAX=FP8_MAX,
                 DEBUG_TRITON=DEBUG_TRITON,
                 DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
             )
@@ -640,13 +651,13 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
                 alibi_slope,
                 seqlen_q, seqlen_k,  # max sequence length for q and k
                 start_n, start_m, num_steps,  # iteration numbers
-                None, None, None, None,
+                descale_q, descale_k, descale_v, descale_do,
                 MASK=False,  # causal masking
                 ENABLE_DROPOUT=ENABLE_DROPOUT,  # activate dropout
                 USE_ALIBI=USE_ALIBI,
                 USE_EXP2=USE_EXP2,
-                IS_FP8=False,
-                FP8_MAX=None,
+                IS_FP8=IS_FP8,
+                FP8_MAX=FP8_MAX,
                 DEBUG_TRITON=DEBUG_TRITON,
                 DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
             )
@@ -674,8 +685,8 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
         # Mask for loading K and V
         mask_q = offs_m[:, None] < seqlen_q
         if PADDED_HEAD:
-            mask_k = offs_d < ACTUAL_HEAD_DIM
-            mask_q &= mask_k[None, :]
+            mask_d = offs_d < ACTUAL_HEAD_DIM
+            mask_q &= mask_d[None, :]
         offs_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
         offs_do = offs_m[:, None] * stride_dom + offs_d[None, :] * stride_dod
         # NOTE: don't assume that the strides for k and v are the same!
@@ -725,26 +736,36 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             # start can only be 0 at minimum
             start_n = max(end_n - BLOCK_M2, 0)
             num_steps = tl.cdiv(end_n - start_n, MASK_BLOCK_N2)
+
+            if IS_FP8:
+                descale_q = tl.load(Descale_q + bid * stride_descale_q_z + hqid)
+                descale_k = tl.load(Descale_k + bid * stride_descale_k_z + hkid)
+                descale_v = tl.load(Descale_v + bid * stride_descale_v_z + hkid)
+                descale_do = tl.load(Descale_do + bid * stride_descale_do_z + hqid)
+            else:
+                descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0
+
+
             dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
             dq = _bwd_dq_inner(
                 dq,
-                q, K, V, do, m, Delta_ptr, sm_scale, #
+                q, K, V, do, m, Delta_ptr, sm_scale,
                 stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd,
-                stride_dropoutm, stride_dropoutn,  #
+                stride_dropoutm, stride_dropoutn,
                 stride_deltam,
-                seqlen_q, seqlen_k,  #
-                BLOCK_M2, MASK_BLOCK_N2,  #
-                HEAD_DIM, ACTUAL_HEAD_DIM,  #
+                seqlen_q, seqlen_k,
+                BLOCK_M2, MASK_BLOCK_N2,
+                HEAD_DIM, ACTUAL_HEAD_DIM,
                 dropout_p, philox_seed, batch_philox_offset, dropout_offset,
                 alibi_slope,
-                start_m, start_n, end_n, num_steps,  #
-                None, None, None, None,
+                start_m, start_n, end_n, num_steps,
+                descale_q, descale_k, descale_v, descale_do,
                 MASK=True,  #
                 ENABLE_DROPOUT=ENABLE_DROPOUT,
                 USE_ALIBI=USE_ALIBI,
                 USE_EXP2=USE_EXP2,
-                IS_FP8=False,
-                FP8_MAX=None,
+                IS_FP8=IS_FP8,
+                FP8_MAX=FP8_MAX,
                 DEBUG_TRITON=DEBUG_TRITON,
                 DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
             )
@@ -754,8 +775,8 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             if DEBUG_TRITON: print(f"unMasked: start_m: {start_m}, start_n: {start_n}, end_n: {end_n}, num_steps: {num_steps}")  # noqa: E701
             dq = _bwd_dq_inner(
                 dq,
-                q, K, V, do, m, Delta_ptr, sm_scale, #
-                stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd, #
+                q, K, V, do, m, Delta_ptr, sm_scale,
+                stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd,
                 stride_dropoutm, stride_dropoutn,
                 stride_deltam,
                 seqlen_q, seqlen_k,
@@ -764,13 +785,13 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
                 dropout_p, philox_seed, batch_philox_offset, dropout_offset,
                 alibi_slope,
                 start_m, start_n, end_n, num_steps,
-                None, None, None, None,
+                descale_q, descale_k, descale_v, descale_do,
                 MASK=False,
                 ENABLE_DROPOUT=ENABLE_DROPOUT,
                 USE_ALIBI=USE_ALIBI,
                 USE_EXP2=USE_EXP2,
-                IS_FP8=False,
-                FP8_MAX=None,
+                IS_FP8=IS_FP8,
+                FP8_MAX=FP8_MAX,
                 DEBUG_TRITON=DEBUG_TRITON,
                 DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
             )
@@ -799,12 +820,14 @@ def bwd_kernel_noncausal(
     stride_deltab, stride_deltah, stride_deltam,
     stride_dob, stride_doh, stride_dom, stride_dod,
     stride_dropoutb, stride_dropouth, stride_dropoutm, stride_dropoutn,
+    stride_descale_q_z, stride_descale_k_z, stride_descale_v_z, stride_descale_do_z,
     stride_az, stride_ah,
     HQ, HK,
     cu_seqlens_q, cu_seqlens_k,
     max_seqlen_q, max_seqlen_k,
     Dropout_mask, dropout_p, philox_seed, philox_offset_base,
     Alibi_slopes,
+    Descale_q, Descale_k, Descale_v, Descale_do,
     BLOCK_M1: tl.constexpr,  # 32
     BLOCK_N1: tl.constexpr,  # 128
     BLOCK_M2: tl.constexpr,  # 128
@@ -816,6 +839,9 @@ def bwd_kernel_noncausal(
     IS_VARLEN: tl.constexpr,
     USE_ALIBI: tl.constexpr,
     USE_EXP2: tl.constexpr,
+    IS_FP8: tl.constexpr,
+    FP8_MAX: tl.constexpr,
+    FP8_OUTPUT: tl.constexpr,
     DEBUG_TRITON: tl.constexpr,
     DEBUG_TRITON_DETAIL: tl.constexpr,
 ):
@@ -851,18 +877,15 @@ def bwd_kernel_noncausal(
         # Mask for loading K and V
         mask_kv = offs_n[:, None] < seqlen_k
         if PADDED_HEAD:
-            mask_k = offs_d < ACTUAL_HEAD_DIM
-            mask_kv &= mask_k[None, :]
+            mask_d = offs_d < ACTUAL_HEAD_DIM
+            mask_kv &= mask_d[None, :]
         # NOTE: don't assume that the strides for k and v are the same!
-        offs_k = offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd
-        offs_v = offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd
-
         # K/V tensors not changed for the group
-        adj_k = bid * stride_kb + hkid * stride_kh + k_start * stride_kn
-        adj_v = bid * stride_vb + hkid * stride_vh + k_start * stride_vn
+        adj_k = bid * stride_kb + hkid * stride_kh + k_start * stride_kn + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd
+        adj_v = bid * stride_vb + hkid * stride_vh + k_start * stride_vn + offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd
         # load K and V: they stay in SRAM throughout the inner loop.
-        k = tl.load(K + adj_k + offs_k, mask=mask_kv, other=0.0)
-        v = tl.load(V + adj_v + offs_v, mask=mask_kv, other=0.0)
+        k = tl.load(K + adj_k, mask=mask_kv, other=0.0)
+        v = tl.load(V + adj_v, mask=mask_kv, other=0.0)
         # If MQA / GQA, set the K and V head offsets appropriately.
         for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE):
             # offset input and output tensor by batch and Q/K heads
@@ -890,6 +913,14 @@ def bwd_kernel_noncausal(
                 dropout_offset = Dropout_mask + bid * stride_dropoutb + \
                                 hqid * stride_dropouth
 
+            if IS_FP8:
+                descale_q = tl.load(Descale_q + bid * stride_descale_q_z + hqid)
+                descale_k = tl.load(Descale_k + bid * stride_descale_k_z + hkid)
+                descale_v = tl.load(Descale_v + bid * stride_descale_v_z + hkid)
+                descale_do = tl.load(Descale_do + bid * stride_descale_do_z + hqid)
+            else:
+                descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0
+
             # because there is no causal, we always start from the beginning
             start_m = 0
             num_steps = tl.cdiv(seqlen_q, BLOCK_M1)
@@ -906,13 +937,13 @@ def bwd_kernel_noncausal(
                 alibi_slope,
                 seqlen_q, seqlen_k,  # max sequence length for q and k
                 start_n, start_m, num_steps,  # iteration numbers
-                None, None, None, None,
+                descale_q, descale_k, descale_v, descale_do, # fp8 descale factors from user
                 MASK=False,  # causal masking
                 ENABLE_DROPOUT=ENABLE_DROPOUT,  # activate dropout
                 USE_ALIBI=USE_ALIBI,
                 USE_EXP2=USE_EXP2,
-                IS_FP8=False,
-                FP8_MAX=None,
+                IS_FP8=IS_FP8,
+                FP8_MAX=FP8_MAX,
                 DEBUG_TRITON=DEBUG_TRITON,
                 DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
             )
@@ -934,8 +965,8 @@ def bwd_kernel_noncausal(
         # Mask for loading K and V
         mask_q = offs_m[:, None] < seqlen_q
         if PADDED_HEAD:
-            mask_k = offs_d < ACTUAL_HEAD_DIM
-            mask_q &= mask_k[None, :]
+            mask_d = offs_d < ACTUAL_HEAD_DIM
+            mask_q &= mask_d[None, :]
         offs_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
         offs_do = offs_m[:, None] * stride_dom + offs_d[None, :] * stride_dod
         adj_k = bid * stride_kb + hkid * stride_kh + k_start * stride_kn
@@ -974,6 +1005,14 @@ def bwd_kernel_noncausal(
                         mask=offs_m < seqlen_q)
             m = m[:, None]
 
+            if IS_FP8:
+                descale_q = tl.load(Descale_q + bid * stride_descale_q_z + hqid)
+                descale_k = tl.load(Descale_k + bid * stride_descale_k_z + hkid)
+                descale_v = tl.load(Descale_v + bid * stride_descale_v_z + hkid)
+                descale_do = tl.load(Descale_do + bid * stride_descale_do_z + hqid)
+            else:
+                descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0
+
             # start can only be 0 at minimum
             start_n = 0
             end_n = seqlen_k
@@ -992,13 +1031,13 @@ def bwd_kernel_noncausal(
                 dropout_p, philox_seed, batch_philox_offset, dropout_offset,
                 alibi_slope,
                 start_m, start_n, end_n, num_steps,
-                None, None, None, None,
+                descale_q, descale_k, descale_v, descale_do,
                 MASK=False,
                 ENABLE_DROPOUT=ENABLE_DROPOUT,
                 USE_ALIBI=USE_ALIBI,
                 USE_EXP2=USE_EXP2,
-                IS_FP8=False,
-                FP8_MAX=None,
+                IS_FP8=IS_FP8,
+                FP8_MAX=FP8_MAX,
                 DEBUG_TRITON=DEBUG_TRITON,
                 DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
             )
@@ -1037,6 +1076,15 @@ def attention_prefill_backward_triton_split_oneKernel_impl(
     philox_seed: Optional[int],
     philox_offset: Optional[int],
     use_exp2: bool,
+    # fp8
+    descale_q: Optional[torch.Tensor],
+    descale_k: Optional[torch.Tensor],
+    descale_v: Optional[torch.Tensor],
+    descale_o: Optional[torch.Tensor],
+    descale_do: Optional[torch.Tensor],
+    descale_dq: Optional[torch.Tensor],
+    descale_dk: Optional[torch.Tensor],
+    descale_dv: Optional[torch.Tensor],
 ):
     # debug
     DEBUG_TRITON: bool = False
@@ -1052,6 +1100,31 @@ def attention_prefill_backward_triton_split_oneKernel_impl(
     # dk = is_contiguous(dk, "dk")
     # dv = is_contiguous(dv, "dv")
 
+    IS_FP8 = is_fp8(q)
+    if IS_FP8:
+        FP8_MAX = torch.finfo(q.dtype).max
+        # assert that the main inputs are fp8
+        assert is_fp8(do) and is_fp8(q) and is_fp8(k) and is_fp8(v), f"Non fp8 type found: do.dtype={do.dtype}, q.dtype={q.dtype}, k.dtype={k.dtype}, v.dtype={v.dtype}. All tensors must be fp8."
+        if is_fp8(o):
+            FP8_OUTPUT = True
+            assert descale_o is not None, f"descale_o is None. In fp8, you need to pass a tensor for descale_o along with a tensor o."
+            assert descale_dq is not None, f"descale_dq is None. In fp8, you need to pass a tensor for descale_dq along with a tensor dq."
+            assert descale_dk is not None, f"descale_dk is None. In fp8, you need to pass a tensor for descale_dk along with a tensor dk."
+            assert descale_dv is not None, f"descale_dv is None. In fp8, you need to pass a tensor for descale_dv along with a tensor dv."
+        else:
+            FP8_OUTPUT = False
+
+        stride_descale_q_z = descale_q.stride(0) if descale_q is not None else None
+        stride_descale_k_z = descale_k.stride(0) if descale_k is not None else None
+        stride_descale_v_z = descale_v.stride(0) if descale_v is not None else None
+        stride_descale_o_z = descale_o.stride(0) if descale_o is not None else None
+        stride_descale_do_z = descale_do.stride(0) if descale_do is not None else None
+    else:
+        FP8_MAX = None
+        FP8_OUTPUT = False
+        stride_descale_q_z = stride_descale_k_z = stride_descale_v_z = stride_descale_o_z = stride_descale_do_z = None
+
+
     # get strides and shape
     batch, nheads_q, nheads_k, head_size, max_seqlen_q_final, max_seqlen_k_final = \
         get_shapes_from_layout(
@@ -1094,15 +1167,18 @@ def attention_prefill_backward_triton_split_oneKernel_impl(
         delta,
         stride_ob, stride_oh, stride_om, stride_od,
         stride_deltab, stride_deltah, stride_deltam,
-        0,
+        stride_descale_do_z,
         cu_seqlens_q, max_seqlen_q_final,
-        None,
+        descale_do,
         HEAD_DIM=HEAD_DIM,
         ACTUAL_HEAD_DIM=ACTUAL_HEAD_DIM,
         IS_VARLEN=IS_VARLEN,
-        IS_FP8=False
+        IS_FP8=IS_FP8
     )
 
+    if DEBUG:
+        print("delta:", delta, delta.shape)
+
     # dropout mask tensor for debugging. We dump the dropout mask created in
     #   the kernel for testing
     dropout_mask = None
@@ -1146,18 +1222,23 @@ def attention_prefill_backward_triton_split_oneKernel_impl(
             stride_deltab, stride_deltah, stride_deltam,
             stride_dob, stride_doh, stride_dom, stride_dod,
             stride_dropoutb, stride_dropouth, stride_dropoutm, stride_dropoutn,
+            stride_descale_q_z, stride_descale_k_z, stride_descale_v_z, stride_descale_do_z,
             stride_az, stride_ah,
             nheads_q, nheads_k,
             cu_seqlens_q, cu_seqlens_k,
             max_seqlen_q_final, max_seqlen_k_final,
             dropout_mask, dropout_p, philox_seed, philox_offset,
             alibi_slopes,
-            HEAD_DIM=HEAD_DIM, 
+            descale_q, descale_k, descale_v, descale_do,
+            HEAD_DIM=HEAD_DIM,
             ACTUAL_HEAD_DIM=ACTUAL_HEAD_DIM,
             ENABLE_DROPOUT=use_dropout,
             IS_VARLEN=IS_VARLEN,
             USE_ALIBI=use_alibi,
             USE_EXP2=use_exp2,
+            IS_FP8=IS_FP8,
+            FP8_MAX=FP8_MAX,
+            FP8_OUTPUT=FP8_OUTPUT,
             DEBUG_TRITON=DEBUG_TRITON,
             DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
         )
@@ -1174,18 +1255,23 @@ def attention_prefill_backward_triton_split_oneKernel_impl(
             stride_deltab, stride_deltah, stride_deltam,
             stride_dob, stride_doh, stride_dom, stride_dod,
             stride_dropoutb, stride_dropouth, stride_dropoutm, stride_dropoutn,
+            stride_descale_q_z, stride_descale_k_z, stride_descale_v_z, stride_descale_do_z,
             stride_az, stride_ah,
             nheads_q, nheads_k,
             cu_seqlens_q, cu_seqlens_k,
             max_seqlen_q_final, max_seqlen_k_final,
             dropout_mask, dropout_p, philox_seed, philox_offset,
             alibi_slopes,
+            descale_q, descale_k, descale_v, descale_do,
             HEAD_DIM=HEAD_DIM,
             ACTUAL_HEAD_DIM=ACTUAL_HEAD_DIM,
             ENABLE_DROPOUT=use_dropout,
             IS_VARLEN=IS_VARLEN,
             USE_ALIBI=use_alibi,
             USE_EXP2=use_exp2,
+            IS_FP8=IS_FP8,
+            FP8_MAX=FP8_MAX,
+            FP8_OUTPUT=FP8_OUTPUT,
             DEBUG_TRITON=DEBUG_TRITON,
             DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
         )
diff --git a/flash_attn/flash_attn_triton_amd/interface_fa.py b/flash_attn/flash_attn_triton_amd/interface_fa.py
index 68260cdd91f..a92b6f5d65d 100644
--- a/flash_attn/flash_attn_triton_amd/interface_fa.py
+++ b/flash_attn/flash_attn_triton_amd/interface_fa.py
@@ -354,7 +354,15 @@ def bwd(
                 dropout_p,
                 philox_seed,
                 philox_offset,
-                USE_EXP2
+                USE_EXP2,
+                descale_q,
+                descale_k,
+                descale_v,
+                descale_o,
+                descale_do,
+                descale_dq,
+                descale_dk,
+                descale_dv,
             )
             delta = delta_triton
         else:
@@ -723,7 +731,15 @@ def varlen_bwd(
                 dropout_p,
                 philox_seed,
                 philox_offset,
-                USE_EXP2
+                USE_EXP2,
+                descale_q,
+                descale_k,
+                descale_v,
+                descale_o,
+                descale_do,
+                descale_dq,
+                descale_dk,
+                descale_dv,
             )
             delta = delta_triton
         else:
diff --git a/flash_attn/flash_attn_triton_amd/test.py b/flash_attn/flash_attn_triton_amd/test.py
index fed61583229..29477a933ec 100644
--- a/flash_attn/flash_attn_triton_amd/test.py
+++ b/flash_attn/flash_attn_triton_amd/test.py
@@ -490,7 +490,7 @@ def fp8_assert_close(tensor_a, tensor_b, atol=ATOL_fp8, rtol=RTOL_fp8, max_diff_
         (2, 6, 6, 2048, 2048, 32),
     ],
 )
-@pytest.mark.parametrize('causal', [False, True])
+@pytest.mark.parametrize('causal', [False])
 @pytest.mark.parametrize('dropout_p', [0.0])
 @pytest.mark.parametrize('layout', ["bshd", "thd"])
 @pytest.mark.parametrize('packing', [None, "qkv"])

From ff8367d5c10f32ac53ffb0644912d46cba243fe9 Mon Sep 17 00:00:00 2001
From: Michael <micmelesse@gmail.com>
Date: Fri, 2 May 2025 15:33:20 -0500
Subject: [PATCH 2/8] isolate bad case

---
 flash_attn/flash_attn_triton_amd/test.py | 128 +++++++++++------------
 1 file changed, 64 insertions(+), 64 deletions(-)

diff --git a/flash_attn/flash_attn_triton_amd/test.py b/flash_attn/flash_attn_triton_amd/test.py
index 29477a933ec..7ac6833c85b 100644
--- a/flash_attn/flash_attn_triton_amd/test.py
+++ b/flash_attn/flash_attn_triton_amd/test.py
@@ -427,75 +427,75 @@ def fp8_assert_close(tensor_a, tensor_b, atol=ATOL_fp8, rtol=RTOL_fp8, max_diff_
     "Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD",
     [
         # seqlen q == k
-        (1, 1, 1, 1, 1, 1),
-        (1, 1, 1, 2, 2, 2),  # small enough to debug
+        # (1, 1, 1, 1, 1, 1),
+        # (1, 1, 1, 2, 2, 2),  # small enough to debug
         (1, 1, 1, 4, 4, 16),
-        (1, 2, 2, 4, 4, 16),
-        (2, 1, 1, 4, 4, 16),
-        (2, 2, 2, 4, 4, 16),
-        (1, 1, 1, 128, 128, 32),  # only one block
-        (3, 3, 3, 128, 128, 64),
-        (1, 1, 1, 127, 127, 32),  # only one block but with masking
-        # (1, 1, 1, 129, 129, 1),  # two blocks with 2nd block small enough to debug # fails
-        (1, 2, 2, 129, 129, 32),  # two blocks with 2nd block small enough to debug
-        (1, 1, 1, 350, 350, 32),  # two blocks with 2nd block small enough to debug
-        (1, 1, 1, 350, 350, 68),  # generic masking on q, k and head
-        (4, 1, 1, 512, 512, 128), # batch > 1
-        (4, 2, 2, 512, 512, 128),
-        (4, 2, 2, 512, 512, 68),
-        (4, 2, 2, 500, 500, 68),
-        (2, 4, 4, 1024, 1024, 64),
-        (4, 8, 8, 2048, 2048, 128),
-        (2, 8, 8, 4096, 4096, 64),
-        (2, 4, 4, 8192, 8192, 32),
-        # seqlen q > k
-        (1, 1, 1, 4, 2, 16),
-        (1, 1, 1, 64, 32, 8),
-        (1, 1, 1, 128, 64, 16),
-        (1, 1, 1, 192, 128, 32),
-        (1, 2, 2, 1024, 512, 68),
-        (1, 4, 4, 729, 516, 68),
-        (2, 4, 4, 2753, 1528, 68),  # a comprehensive seqlen_q > seqlen_k
-        # seqlen q < k
-        (1, 1, 1, 2, 4, 16),
-        (1, 2, 2, 2, 4, 16),
-        (1, 4, 1, 2, 4, 16),
-        (1, 4, 2, 2, 4, 16),
-        (2, 2, 2, 2, 128, 1),
-        (2, 3, 3, 2, 128, 16),
-        (1, 1, 1, 32, 64, 8),
-        (1, 1, 1, 128, 192, 32),
-        (4, 6, 6, 108, 256, 32),
-        (3, 2, 2, 256, 512, 16),
-        (2, 2, 2, 512, 1024, 68),
-        (1, 1, 1, 200, 413, 32),
-        (1, 1, 1, 782, 1546, 32),
-        # gqa/mqa                   # mismatch issue on varlen
-        (4, 8, 2, 500, 500, 68), 
-        (4, 8, 2, 512, 512, 68),
-        (4, 8, 2, 512, 512, 128),
-        (4, 8, 2, 512, 1024, 68),
-        (4, 8, 2, 1024, 512, 64),
-        (4, 16, 4, 1528, 2753, 68),
-        # fa configs
-        (2, 4, 1, 113, 203, 64),
-        (2, 4, 2, 128, 217, 64),
-        (2, 6, 2, 113, 211, 128),
-        (2, 6, 2, 108, 256, 128),
-        (2, 6, 2, 256, 512, 64),
-        (2, 6, 2, 512, 256, 64),
-        (2, 6, 2, 1024, 1024, 32),
-        (2, 6, 2, 1023, 1024, 32),
-        (2, 6, 6, 1024, 1023, 32),
-        (2, 6, 6, 2048, 2048, 32),
+        # (1, 2, 2, 4, 4, 16),
+        # (2, 1, 1, 4, 4, 16),
+        # (2, 2, 2, 4, 4, 16),
+        # (1, 1, 1, 128, 128, 32),  # only one block
+        # (3, 3, 3, 128, 128, 64),
+        # (1, 1, 1, 127, 127, 32),  # only one block but with masking
+        # # (1, 1, 1, 129, 129, 1),  # two blocks with 2nd block small enough to debug # fails
+        # (1, 2, 2, 129, 129, 32),  # two blocks with 2nd block small enough to debug
+        # (1, 1, 1, 350, 350, 32),  # two blocks with 2nd block small enough to debug
+        # (1, 1, 1, 350, 350, 68),  # generic masking on q, k and head
+        # (4, 1, 1, 512, 512, 128), # batch > 1
+        # (4, 2, 2, 512, 512, 128),
+        # (4, 2, 2, 512, 512, 68),
+        # (4, 2, 2, 500, 500, 68),
+        # (2, 4, 4, 1024, 1024, 64),
+        # (4, 8, 8, 2048, 2048, 128),
+        # (2, 8, 8, 4096, 4096, 64),
+        # (2, 4, 4, 8192, 8192, 32),
+        # # seqlen q > k
+        # (1, 1, 1, 4, 2, 16),
+        # (1, 1, 1, 64, 32, 8),
+        # (1, 1, 1, 128, 64, 16),
+        # (1, 1, 1, 192, 128, 32),
+        # (1, 2, 2, 1024, 512, 68),
+        # (1, 4, 4, 729, 516, 68),
+        # (2, 4, 4, 2753, 1528, 68),  # a comprehensive seqlen_q > seqlen_k
+        # # seqlen q < k
+        # (1, 1, 1, 2, 4, 16),
+        # (1, 2, 2, 2, 4, 16),
+        # (1, 4, 1, 2, 4, 16),
+        # (1, 4, 2, 2, 4, 16),
+        # (2, 2, 2, 2, 128, 1),
+        # (2, 3, 3, 2, 128, 16),
+        # (1, 1, 1, 32, 64, 8),
+        # (1, 1, 1, 128, 192, 32),
+        # (4, 6, 6, 108, 256, 32),
+        # (3, 2, 2, 256, 512, 16),
+        # (2, 2, 2, 512, 1024, 68),
+        # (1, 1, 1, 200, 413, 32),
+        # (1, 1, 1, 782, 1546, 32),
+        # # gqa/mqa                   # mismatch issue on varlen
+        # (4, 8, 2, 500, 500, 68), 
+        # (4, 8, 2, 512, 512, 68),
+        # (4, 8, 2, 512, 512, 128),
+        # (4, 8, 2, 512, 1024, 68),
+        # (4, 8, 2, 1024, 512, 64),
+        # (4, 16, 4, 1528, 2753, 68),
+        # # fa configs
+        # (2, 4, 1, 113, 203, 64),
+        # (2, 4, 2, 128, 217, 64),
+        # (2, 6, 2, 113, 211, 128),
+        # (2, 6, 2, 108, 256, 128),
+        # (2, 6, 2, 256, 512, 64),
+        # (2, 6, 2, 512, 256, 64),
+        # (2, 6, 2, 1024, 1024, 32),
+        # (2, 6, 2, 1023, 1024, 32),
+        # (2, 6, 6, 1024, 1023, 32),
+        # (2, 6, 6, 2048, 2048, 32),
     ],
 )
-@pytest.mark.parametrize('causal', [False])
+@pytest.mark.parametrize('causal', [True])
 @pytest.mark.parametrize('dropout_p', [0.0])
-@pytest.mark.parametrize('layout', ["bshd", "thd"])
-@pytest.mark.parametrize('packing', [None, "qkv"])
+@pytest.mark.parametrize('layout', ["bshd"])
+@pytest.mark.parametrize('packing', [None])
 @pytest.mark.parametrize('DEBUG_INPUT', [False])
-@pytest.mark.flaky(reruns=3, reason="Retry failures")
+# @pytest.mark.flaky(reruns=3, reason="Retry failures")
 @pytest.mark.skipif(not arch_supports_fp8(), reason="fp8 not supported on this device")
 def test_fp8(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dropout_p, layout, packing, DEBUG_INPUT):
     torch.manual_seed(20)

From 00f511b34c9af6bd4c78ec8a796c5f80963b4ab2 Mon Sep 17 00:00:00 2001
From: Michael <micmelesse@gmail.com>
Date: Fri, 2 May 2025 17:33:27 -0500
Subject: [PATCH 3/8] fix fp8 bug

---
 .../bwd_prefill_onekernel.py                  |  28 ++--
 flash_attn/flash_attn_triton_amd/test.py      | 128 +++++++++---------
 2 files changed, 75 insertions(+), 81 deletions(-)

diff --git a/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py b/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
index 03386a0f8c3..0490d1294c3 100644
--- a/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
+++ b/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
@@ -316,7 +316,6 @@ def _bwd_dkdv_inner(
         curr_m += step_m
         qT_ptrs += step_m * stride_qm
         do_ptrs += step_m * stride_dom
-    return dk, dv
 
 # the main inner-loop logic for computing dQ
 @triton.jit
@@ -441,7 +440,6 @@ def _bwd_dq_inner(
         curr_n += step_n
         kT_ptrs += step_n * stride_kn
         vT_ptrs += step_n * stride_vn
-    return dq
 
 @triton.autotune(
     configs=causal_autotune_configs,
@@ -608,7 +606,7 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             # if start_m is negative, the current N-tile has no block on the
             #   diagonal of causal mask, so everything have no causal mask
             if DEBUG_TRITON: print(f"Masked: start_n: {start_n}; start_m: {start_m}, num_steps: {num_steps}")  # noqa: E701
-            dk, dv = _bwd_dkdv_inner(
+            _bwd_dkdv_inner(
                 dk, dv,  # output tensors
                 Q_ptr, k, v, DO_ptr, M_ptr, Delta_ptr, sm_scale, # input tensors
                 stride_qm, stride_qd,  # strides for q
@@ -638,7 +636,7 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             if DEBUG_TRITON: print(f"start_m after Masked step: {start_m}; num_steps: {num_steps}")  # noqa: E701
             if DEBUG_TRITON: print(f"unMasked: start_n: {start_n}, start_m: {start_m}, end_m: {end_m}, num_steps: {num_steps}")  # noqa: E701
             if DEBUG_TRITON: print("unMasked")  # noqa: E701
-            dk, dv = _bwd_dkdv_inner(
+            _bwd_dkdv_inner(
                 dk, dv,  # output tensors
                 Q_ptr, k, v, DO_ptr, M_ptr, Delta_ptr, sm_scale, # input tensors
                 stride_qm, stride_qd,  # strides for q
@@ -690,10 +688,9 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
         offs_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
         offs_do = offs_m[:, None] * stride_dom + offs_d[None, :] * stride_dod
         # NOTE: don't assume that the strides for k and v are the same!
-        adj_k = bid * stride_kb + hkid * stride_kh + k_start * stride_kn
-        adj_v = bid * stride_vb + hkid * stride_vh + k_start * stride_vn
-        K +=  adj_k
-        V +=  adj_v
+        K +=  bid * stride_kb + hkid * stride_kh + k_start * stride_kn
+        V +=  bid * stride_vb + hkid * stride_vh + k_start * stride_vn
+
         # If MQA / GQA, set the K and V head offsets appropriately.
         for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE):
             # seqlen_q < seqlen_k: delta_qk more kv tokens are added at the front
@@ -725,7 +722,6 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
                                         hqid * stride_dropouth
                 dropout_offset = \
                     Dropout_mask + bid * stride_dropoutb + hqid * stride_dropouth
-
             q = tl.load(Q + adj_q + offs_q, mask=mask_q, other=0.0)
             do = tl.load(DO + adj_do + offs_do, mask=mask_q, other=0.0)
             m = tl.load(M + adj_delta + offs_m * stride_deltam,
@@ -747,7 +743,7 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
 
 
             dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
-            dq = _bwd_dq_inner(
+            _bwd_dq_inner(
                 dq,
                 q, K, V, do, m, Delta_ptr, sm_scale,
                 stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd,
@@ -773,7 +769,7 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             num_steps = tl.cdiv(end_n, BLOCK_N2)
             start_n = max(end_n - num_steps * BLOCK_N2, 0)
             if DEBUG_TRITON: print(f"unMasked: start_m: {start_m}, start_n: {start_n}, end_n: {end_n}, num_steps: {num_steps}")  # noqa: E701
-            dq = _bwd_dq_inner(
+            _bwd_dq_inner(
                 dq,
                 q, K, V, do, m, Delta_ptr, sm_scale,
                 stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd,
@@ -924,7 +920,7 @@ def bwd_kernel_noncausal(
             # because there is no causal, we always start from the beginning
             start_m = 0
             num_steps = tl.cdiv(seqlen_q, BLOCK_M1)
-            dk, dv = _bwd_dkdv_inner(
+            _bwd_dkdv_inner(
                 dk, dv,  # output tensors
                 Q_ptr, k, v, DO_ptr, M_ptr, Delta_ptr, sm_scale, # input tensors
                 stride_qm, stride_qd,  # strides for q
@@ -969,10 +965,8 @@ def bwd_kernel_noncausal(
             mask_q &= mask_d[None, :]
         offs_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
         offs_do = offs_m[:, None] * stride_dom + offs_d[None, :] * stride_dod
-        adj_k = bid * stride_kb + hkid * stride_kh + k_start * stride_kn
-        adj_v = bid * stride_vb + hkid * stride_vh + k_start * stride_vn
-        K +=  adj_k
-        V +=  adj_v
+        K +=  bid * stride_kb + hkid * stride_kh + k_start * stride_kn
+        V +=  bid * stride_vb + hkid * stride_vh + k_start * stride_vn
         # If MQA / GQA, set the K and V head offsets appropriately.
         for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE):
             # offset input and output tensor by batch and Q/K heads
@@ -1019,7 +1013,7 @@ def bwd_kernel_noncausal(
             num_steps = tl.cdiv(seqlen_k, BLOCK_N2)
 
             dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
-            dq = _bwd_dq_inner(
+            _bwd_dq_inner(
                 dq,
                 q, K, V, do, m, Delta_ptr, sm_scale,
                 stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd,
diff --git a/flash_attn/flash_attn_triton_amd/test.py b/flash_attn/flash_attn_triton_amd/test.py
index 7ac6833c85b..fed61583229 100644
--- a/flash_attn/flash_attn_triton_amd/test.py
+++ b/flash_attn/flash_attn_triton_amd/test.py
@@ -427,75 +427,75 @@ def fp8_assert_close(tensor_a, tensor_b, atol=ATOL_fp8, rtol=RTOL_fp8, max_diff_
     "Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD",
     [
         # seqlen q == k
-        # (1, 1, 1, 1, 1, 1),
-        # (1, 1, 1, 2, 2, 2),  # small enough to debug
+        (1, 1, 1, 1, 1, 1),
+        (1, 1, 1, 2, 2, 2),  # small enough to debug
         (1, 1, 1, 4, 4, 16),
-        # (1, 2, 2, 4, 4, 16),
-        # (2, 1, 1, 4, 4, 16),
-        # (2, 2, 2, 4, 4, 16),
-        # (1, 1, 1, 128, 128, 32),  # only one block
-        # (3, 3, 3, 128, 128, 64),
-        # (1, 1, 1, 127, 127, 32),  # only one block but with masking
-        # # (1, 1, 1, 129, 129, 1),  # two blocks with 2nd block small enough to debug # fails
-        # (1, 2, 2, 129, 129, 32),  # two blocks with 2nd block small enough to debug
-        # (1, 1, 1, 350, 350, 32),  # two blocks with 2nd block small enough to debug
-        # (1, 1, 1, 350, 350, 68),  # generic masking on q, k and head
-        # (4, 1, 1, 512, 512, 128), # batch > 1
-        # (4, 2, 2, 512, 512, 128),
-        # (4, 2, 2, 512, 512, 68),
-        # (4, 2, 2, 500, 500, 68),
-        # (2, 4, 4, 1024, 1024, 64),
-        # (4, 8, 8, 2048, 2048, 128),
-        # (2, 8, 8, 4096, 4096, 64),
-        # (2, 4, 4, 8192, 8192, 32),
-        # # seqlen q > k
-        # (1, 1, 1, 4, 2, 16),
-        # (1, 1, 1, 64, 32, 8),
-        # (1, 1, 1, 128, 64, 16),
-        # (1, 1, 1, 192, 128, 32),
-        # (1, 2, 2, 1024, 512, 68),
-        # (1, 4, 4, 729, 516, 68),
-        # (2, 4, 4, 2753, 1528, 68),  # a comprehensive seqlen_q > seqlen_k
-        # # seqlen q < k
-        # (1, 1, 1, 2, 4, 16),
-        # (1, 2, 2, 2, 4, 16),
-        # (1, 4, 1, 2, 4, 16),
-        # (1, 4, 2, 2, 4, 16),
-        # (2, 2, 2, 2, 128, 1),
-        # (2, 3, 3, 2, 128, 16),
-        # (1, 1, 1, 32, 64, 8),
-        # (1, 1, 1, 128, 192, 32),
-        # (4, 6, 6, 108, 256, 32),
-        # (3, 2, 2, 256, 512, 16),
-        # (2, 2, 2, 512, 1024, 68),
-        # (1, 1, 1, 200, 413, 32),
-        # (1, 1, 1, 782, 1546, 32),
-        # # gqa/mqa                   # mismatch issue on varlen
-        # (4, 8, 2, 500, 500, 68), 
-        # (4, 8, 2, 512, 512, 68),
-        # (4, 8, 2, 512, 512, 128),
-        # (4, 8, 2, 512, 1024, 68),
-        # (4, 8, 2, 1024, 512, 64),
-        # (4, 16, 4, 1528, 2753, 68),
-        # # fa configs
-        # (2, 4, 1, 113, 203, 64),
-        # (2, 4, 2, 128, 217, 64),
-        # (2, 6, 2, 113, 211, 128),
-        # (2, 6, 2, 108, 256, 128),
-        # (2, 6, 2, 256, 512, 64),
-        # (2, 6, 2, 512, 256, 64),
-        # (2, 6, 2, 1024, 1024, 32),
-        # (2, 6, 2, 1023, 1024, 32),
-        # (2, 6, 6, 1024, 1023, 32),
-        # (2, 6, 6, 2048, 2048, 32),
+        (1, 2, 2, 4, 4, 16),
+        (2, 1, 1, 4, 4, 16),
+        (2, 2, 2, 4, 4, 16),
+        (1, 1, 1, 128, 128, 32),  # only one block
+        (3, 3, 3, 128, 128, 64),
+        (1, 1, 1, 127, 127, 32),  # only one block but with masking
+        # (1, 1, 1, 129, 129, 1),  # two blocks with 2nd block small enough to debug # fails
+        (1, 2, 2, 129, 129, 32),  # two blocks with 2nd block small enough to debug
+        (1, 1, 1, 350, 350, 32),  # two blocks with 2nd block small enough to debug
+        (1, 1, 1, 350, 350, 68),  # generic masking on q, k and head
+        (4, 1, 1, 512, 512, 128), # batch > 1
+        (4, 2, 2, 512, 512, 128),
+        (4, 2, 2, 512, 512, 68),
+        (4, 2, 2, 500, 500, 68),
+        (2, 4, 4, 1024, 1024, 64),
+        (4, 8, 8, 2048, 2048, 128),
+        (2, 8, 8, 4096, 4096, 64),
+        (2, 4, 4, 8192, 8192, 32),
+        # seqlen q > k
+        (1, 1, 1, 4, 2, 16),
+        (1, 1, 1, 64, 32, 8),
+        (1, 1, 1, 128, 64, 16),
+        (1, 1, 1, 192, 128, 32),
+        (1, 2, 2, 1024, 512, 68),
+        (1, 4, 4, 729, 516, 68),
+        (2, 4, 4, 2753, 1528, 68),  # a comprehensive seqlen_q > seqlen_k
+        # seqlen q < k
+        (1, 1, 1, 2, 4, 16),
+        (1, 2, 2, 2, 4, 16),
+        (1, 4, 1, 2, 4, 16),
+        (1, 4, 2, 2, 4, 16),
+        (2, 2, 2, 2, 128, 1),
+        (2, 3, 3, 2, 128, 16),
+        (1, 1, 1, 32, 64, 8),
+        (1, 1, 1, 128, 192, 32),
+        (4, 6, 6, 108, 256, 32),
+        (3, 2, 2, 256, 512, 16),
+        (2, 2, 2, 512, 1024, 68),
+        (1, 1, 1, 200, 413, 32),
+        (1, 1, 1, 782, 1546, 32),
+        # gqa/mqa                   # mismatch issue on varlen
+        (4, 8, 2, 500, 500, 68), 
+        (4, 8, 2, 512, 512, 68),
+        (4, 8, 2, 512, 512, 128),
+        (4, 8, 2, 512, 1024, 68),
+        (4, 8, 2, 1024, 512, 64),
+        (4, 16, 4, 1528, 2753, 68),
+        # fa configs
+        (2, 4, 1, 113, 203, 64),
+        (2, 4, 2, 128, 217, 64),
+        (2, 6, 2, 113, 211, 128),
+        (2, 6, 2, 108, 256, 128),
+        (2, 6, 2, 256, 512, 64),
+        (2, 6, 2, 512, 256, 64),
+        (2, 6, 2, 1024, 1024, 32),
+        (2, 6, 2, 1023, 1024, 32),
+        (2, 6, 6, 1024, 1023, 32),
+        (2, 6, 6, 2048, 2048, 32),
     ],
 )
-@pytest.mark.parametrize('causal', [True])
+@pytest.mark.parametrize('causal', [False, True])
 @pytest.mark.parametrize('dropout_p', [0.0])
-@pytest.mark.parametrize('layout', ["bshd"])
-@pytest.mark.parametrize('packing', [None])
+@pytest.mark.parametrize('layout', ["bshd", "thd"])
+@pytest.mark.parametrize('packing', [None, "qkv"])
 @pytest.mark.parametrize('DEBUG_INPUT', [False])
-# @pytest.mark.flaky(reruns=3, reason="Retry failures")
+@pytest.mark.flaky(reruns=3, reason="Retry failures")
 @pytest.mark.skipif(not arch_supports_fp8(), reason="fp8 not supported on this device")
 def test_fp8(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dropout_p, layout, packing, DEBUG_INPUT):
     torch.manual_seed(20)

From 6fe6297b16dac9914471c73d2e6c49dcb4c3a10f Mon Sep 17 00:00:00 2001
From: Michael <micmelesse@gmail.com>
Date: Fri, 2 May 2025 21:26:01 -0500
Subject: [PATCH 4/8] didnot fix fp8 bug

---
 .../bwd_prefill_onekernel.py                      | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py b/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
index 0490d1294c3..55f93616374 100644
--- a/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
+++ b/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
@@ -316,6 +316,7 @@ def _bwd_dkdv_inner(
         curr_m += step_m
         qT_ptrs += step_m * stride_qm
         do_ptrs += step_m * stride_dom
+    return dk, dv
 
 # the main inner-loop logic for computing dQ
 @triton.jit
@@ -440,6 +441,7 @@ def _bwd_dq_inner(
         curr_n += step_n
         kT_ptrs += step_n * stride_kn
         vT_ptrs += step_n * stride_vn
+    return dq
 
 @triton.autotune(
     configs=causal_autotune_configs,
@@ -606,7 +608,7 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             # if start_m is negative, the current N-tile has no block on the
             #   diagonal of causal mask, so everything have no causal mask
             if DEBUG_TRITON: print(f"Masked: start_n: {start_n}; start_m: {start_m}, num_steps: {num_steps}")  # noqa: E701
-            _bwd_dkdv_inner(
+            dk, dv = _bwd_dkdv_inner(
                 dk, dv,  # output tensors
                 Q_ptr, k, v, DO_ptr, M_ptr, Delta_ptr, sm_scale, # input tensors
                 stride_qm, stride_qd,  # strides for q
@@ -636,7 +638,7 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             if DEBUG_TRITON: print(f"start_m after Masked step: {start_m}; num_steps: {num_steps}")  # noqa: E701
             if DEBUG_TRITON: print(f"unMasked: start_n: {start_n}, start_m: {start_m}, end_m: {end_m}, num_steps: {num_steps}")  # noqa: E701
             if DEBUG_TRITON: print("unMasked")  # noqa: E701
-            _bwd_dkdv_inner(
+            dk, dv = _bwd_dkdv_inner(
                 dk, dv,  # output tensors
                 Q_ptr, k, v, DO_ptr, M_ptr, Delta_ptr, sm_scale, # input tensors
                 stride_qm, stride_qd,  # strides for q
@@ -741,9 +743,8 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             else:
                 descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0
 
-
             dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
-            _bwd_dq_inner(
+            dq = _bwd_dq_inner(
                 dq,
                 q, K, V, do, m, Delta_ptr, sm_scale,
                 stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd,
@@ -769,7 +770,7 @@ def bwd_kernel_causal( # grid = (tl.cdiv(max_seqlen_q // BLOCK_M2), batch, nhead
             num_steps = tl.cdiv(end_n, BLOCK_N2)
             start_n = max(end_n - num_steps * BLOCK_N2, 0)
             if DEBUG_TRITON: print(f"unMasked: start_m: {start_m}, start_n: {start_n}, end_n: {end_n}, num_steps: {num_steps}")  # noqa: E701
-            _bwd_dq_inner(
+            dq = _bwd_dq_inner(
                 dq,
                 q, K, V, do, m, Delta_ptr, sm_scale,
                 stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd,
@@ -920,7 +921,7 @@ def bwd_kernel_noncausal(
             # because there is no causal, we always start from the beginning
             start_m = 0
             num_steps = tl.cdiv(seqlen_q, BLOCK_M1)
-            _bwd_dkdv_inner(
+            dk, dv = _bwd_dkdv_inner(
                 dk, dv,  # output tensors
                 Q_ptr, k, v, DO_ptr, M_ptr, Delta_ptr, sm_scale, # input tensors
                 stride_qm, stride_qd,  # strides for q
@@ -1013,7 +1014,7 @@ def bwd_kernel_noncausal(
             num_steps = tl.cdiv(seqlen_k, BLOCK_N2)
 
             dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
-            _bwd_dq_inner(
+            dq = _bwd_dq_inner(
                 dq,
                 q, K, V, do, m, Delta_ptr, sm_scale,
                 stride_qm, stride_qd, stride_kn, stride_kd, stride_vn, stride_vd,

From 4b45f1d7db8cc524f2b053c7e745bc7fcf748a66 Mon Sep 17 00:00:00 2001
From: Michael <micmelesse@gmail.com>
Date: Fri, 2 May 2025 21:48:45 -0500
Subject: [PATCH 5/8] back to failing test

---
 flash_attn/flash_attn_triton_amd/test.py | 128 +++++++++++------------
 1 file changed, 64 insertions(+), 64 deletions(-)

diff --git a/flash_attn/flash_attn_triton_amd/test.py b/flash_attn/flash_attn_triton_amd/test.py
index fed61583229..7ac6833c85b 100644
--- a/flash_attn/flash_attn_triton_amd/test.py
+++ b/flash_attn/flash_attn_triton_amd/test.py
@@ -427,75 +427,75 @@ def fp8_assert_close(tensor_a, tensor_b, atol=ATOL_fp8, rtol=RTOL_fp8, max_diff_
     "Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD",
     [
         # seqlen q == k
-        (1, 1, 1, 1, 1, 1),
-        (1, 1, 1, 2, 2, 2),  # small enough to debug
+        # (1, 1, 1, 1, 1, 1),
+        # (1, 1, 1, 2, 2, 2),  # small enough to debug
         (1, 1, 1, 4, 4, 16),
-        (1, 2, 2, 4, 4, 16),
-        (2, 1, 1, 4, 4, 16),
-        (2, 2, 2, 4, 4, 16),
-        (1, 1, 1, 128, 128, 32),  # only one block
-        (3, 3, 3, 128, 128, 64),
-        (1, 1, 1, 127, 127, 32),  # only one block but with masking
-        # (1, 1, 1, 129, 129, 1),  # two blocks with 2nd block small enough to debug # fails
-        (1, 2, 2, 129, 129, 32),  # two blocks with 2nd block small enough to debug
-        (1, 1, 1, 350, 350, 32),  # two blocks with 2nd block small enough to debug
-        (1, 1, 1, 350, 350, 68),  # generic masking on q, k and head
-        (4, 1, 1, 512, 512, 128), # batch > 1
-        (4, 2, 2, 512, 512, 128),
-        (4, 2, 2, 512, 512, 68),
-        (4, 2, 2, 500, 500, 68),
-        (2, 4, 4, 1024, 1024, 64),
-        (4, 8, 8, 2048, 2048, 128),
-        (2, 8, 8, 4096, 4096, 64),
-        (2, 4, 4, 8192, 8192, 32),
-        # seqlen q > k
-        (1, 1, 1, 4, 2, 16),
-        (1, 1, 1, 64, 32, 8),
-        (1, 1, 1, 128, 64, 16),
-        (1, 1, 1, 192, 128, 32),
-        (1, 2, 2, 1024, 512, 68),
-        (1, 4, 4, 729, 516, 68),
-        (2, 4, 4, 2753, 1528, 68),  # a comprehensive seqlen_q > seqlen_k
-        # seqlen q < k
-        (1, 1, 1, 2, 4, 16),
-        (1, 2, 2, 2, 4, 16),
-        (1, 4, 1, 2, 4, 16),
-        (1, 4, 2, 2, 4, 16),
-        (2, 2, 2, 2, 128, 1),
-        (2, 3, 3, 2, 128, 16),
-        (1, 1, 1, 32, 64, 8),
-        (1, 1, 1, 128, 192, 32),
-        (4, 6, 6, 108, 256, 32),
-        (3, 2, 2, 256, 512, 16),
-        (2, 2, 2, 512, 1024, 68),
-        (1, 1, 1, 200, 413, 32),
-        (1, 1, 1, 782, 1546, 32),
-        # gqa/mqa                   # mismatch issue on varlen
-        (4, 8, 2, 500, 500, 68), 
-        (4, 8, 2, 512, 512, 68),
-        (4, 8, 2, 512, 512, 128),
-        (4, 8, 2, 512, 1024, 68),
-        (4, 8, 2, 1024, 512, 64),
-        (4, 16, 4, 1528, 2753, 68),
-        # fa configs
-        (2, 4, 1, 113, 203, 64),
-        (2, 4, 2, 128, 217, 64),
-        (2, 6, 2, 113, 211, 128),
-        (2, 6, 2, 108, 256, 128),
-        (2, 6, 2, 256, 512, 64),
-        (2, 6, 2, 512, 256, 64),
-        (2, 6, 2, 1024, 1024, 32),
-        (2, 6, 2, 1023, 1024, 32),
-        (2, 6, 6, 1024, 1023, 32),
-        (2, 6, 6, 2048, 2048, 32),
+        # (1, 2, 2, 4, 4, 16),
+        # (2, 1, 1, 4, 4, 16),
+        # (2, 2, 2, 4, 4, 16),
+        # (1, 1, 1, 128, 128, 32),  # only one block
+        # (3, 3, 3, 128, 128, 64),
+        # (1, 1, 1, 127, 127, 32),  # only one block but with masking
+        # # (1, 1, 1, 129, 129, 1),  # two blocks with 2nd block small enough to debug # fails
+        # (1, 2, 2, 129, 129, 32),  # two blocks with 2nd block small enough to debug
+        # (1, 1, 1, 350, 350, 32),  # two blocks with 2nd block small enough to debug
+        # (1, 1, 1, 350, 350, 68),  # generic masking on q, k and head
+        # (4, 1, 1, 512, 512, 128), # batch > 1
+        # (4, 2, 2, 512, 512, 128),
+        # (4, 2, 2, 512, 512, 68),
+        # (4, 2, 2, 500, 500, 68),
+        # (2, 4, 4, 1024, 1024, 64),
+        # (4, 8, 8, 2048, 2048, 128),
+        # (2, 8, 8, 4096, 4096, 64),
+        # (2, 4, 4, 8192, 8192, 32),
+        # # seqlen q > k
+        # (1, 1, 1, 4, 2, 16),
+        # (1, 1, 1, 64, 32, 8),
+        # (1, 1, 1, 128, 64, 16),
+        # (1, 1, 1, 192, 128, 32),
+        # (1, 2, 2, 1024, 512, 68),
+        # (1, 4, 4, 729, 516, 68),
+        # (2, 4, 4, 2753, 1528, 68),  # a comprehensive seqlen_q > seqlen_k
+        # # seqlen q < k
+        # (1, 1, 1, 2, 4, 16),
+        # (1, 2, 2, 2, 4, 16),
+        # (1, 4, 1, 2, 4, 16),
+        # (1, 4, 2, 2, 4, 16),
+        # (2, 2, 2, 2, 128, 1),
+        # (2, 3, 3, 2, 128, 16),
+        # (1, 1, 1, 32, 64, 8),
+        # (1, 1, 1, 128, 192, 32),
+        # (4, 6, 6, 108, 256, 32),
+        # (3, 2, 2, 256, 512, 16),
+        # (2, 2, 2, 512, 1024, 68),
+        # (1, 1, 1, 200, 413, 32),
+        # (1, 1, 1, 782, 1546, 32),
+        # # gqa/mqa                   # mismatch issue on varlen
+        # (4, 8, 2, 500, 500, 68), 
+        # (4, 8, 2, 512, 512, 68),
+        # (4, 8, 2, 512, 512, 128),
+        # (4, 8, 2, 512, 1024, 68),
+        # (4, 8, 2, 1024, 512, 64),
+        # (4, 16, 4, 1528, 2753, 68),
+        # # fa configs
+        # (2, 4, 1, 113, 203, 64),
+        # (2, 4, 2, 128, 217, 64),
+        # (2, 6, 2, 113, 211, 128),
+        # (2, 6, 2, 108, 256, 128),
+        # (2, 6, 2, 256, 512, 64),
+        # (2, 6, 2, 512, 256, 64),
+        # (2, 6, 2, 1024, 1024, 32),
+        # (2, 6, 2, 1023, 1024, 32),
+        # (2, 6, 6, 1024, 1023, 32),
+        # (2, 6, 6, 2048, 2048, 32),
     ],
 )
-@pytest.mark.parametrize('causal', [False, True])
+@pytest.mark.parametrize('causal', [True])
 @pytest.mark.parametrize('dropout_p', [0.0])
-@pytest.mark.parametrize('layout', ["bshd", "thd"])
-@pytest.mark.parametrize('packing', [None, "qkv"])
+@pytest.mark.parametrize('layout', ["bshd"])
+@pytest.mark.parametrize('packing', [None])
 @pytest.mark.parametrize('DEBUG_INPUT', [False])
-@pytest.mark.flaky(reruns=3, reason="Retry failures")
+# @pytest.mark.flaky(reruns=3, reason="Retry failures")
 @pytest.mark.skipif(not arch_supports_fp8(), reason="fp8 not supported on this device")
 def test_fp8(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dropout_p, layout, packing, DEBUG_INPUT):
     torch.manual_seed(20)

From ecdc98bbbfc4eec5c4c85bcccffd4a197acd3d8a Mon Sep 17 00:00:00 2001
From: Michael <micmelesse@gmail.com>
Date: Fri, 2 May 2025 22:48:13 -0500
Subject: [PATCH 6/8] fp8 tests passing

---
 .../bwd_prefill_onekernel.py                  |   2 +-
 flash_attn/flash_attn_triton_amd/test.py      | 128 +++++++++---------
 2 files changed, 65 insertions(+), 65 deletions(-)

diff --git a/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py b/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
index 55f93616374..9f8a1ab46a2 100644
--- a/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
+++ b/flash_attn/flash_attn_triton_amd/bwd_prefill_onekernel.py
@@ -1145,7 +1145,7 @@ def attention_prefill_backward_triton_split_oneKernel_impl(
 
     # get closest power of 2 over or equal to 32.
     padded_d_model = 1 << (head_size - 1).bit_length()
-    padded_d_model = max(padded_d_model, 16)
+    padded_d_model = max(padded_d_model, 32)
     HEAD_DIM = padded_d_model
     ACTUAL_HEAD_DIM = head_size
 
diff --git a/flash_attn/flash_attn_triton_amd/test.py b/flash_attn/flash_attn_triton_amd/test.py
index 7ac6833c85b..fed61583229 100644
--- a/flash_attn/flash_attn_triton_amd/test.py
+++ b/flash_attn/flash_attn_triton_amd/test.py
@@ -427,75 +427,75 @@ def fp8_assert_close(tensor_a, tensor_b, atol=ATOL_fp8, rtol=RTOL_fp8, max_diff_
     "Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD",
     [
         # seqlen q == k
-        # (1, 1, 1, 1, 1, 1),
-        # (1, 1, 1, 2, 2, 2),  # small enough to debug
+        (1, 1, 1, 1, 1, 1),
+        (1, 1, 1, 2, 2, 2),  # small enough to debug
         (1, 1, 1, 4, 4, 16),
-        # (1, 2, 2, 4, 4, 16),
-        # (2, 1, 1, 4, 4, 16),
-        # (2, 2, 2, 4, 4, 16),
-        # (1, 1, 1, 128, 128, 32),  # only one block
-        # (3, 3, 3, 128, 128, 64),
-        # (1, 1, 1, 127, 127, 32),  # only one block but with masking
-        # # (1, 1, 1, 129, 129, 1),  # two blocks with 2nd block small enough to debug # fails
-        # (1, 2, 2, 129, 129, 32),  # two blocks with 2nd block small enough to debug
-        # (1, 1, 1, 350, 350, 32),  # two blocks with 2nd block small enough to debug
-        # (1, 1, 1, 350, 350, 68),  # generic masking on q, k and head
-        # (4, 1, 1, 512, 512, 128), # batch > 1
-        # (4, 2, 2, 512, 512, 128),
-        # (4, 2, 2, 512, 512, 68),
-        # (4, 2, 2, 500, 500, 68),
-        # (2, 4, 4, 1024, 1024, 64),
-        # (4, 8, 8, 2048, 2048, 128),
-        # (2, 8, 8, 4096, 4096, 64),
-        # (2, 4, 4, 8192, 8192, 32),
-        # # seqlen q > k
-        # (1, 1, 1, 4, 2, 16),
-        # (1, 1, 1, 64, 32, 8),
-        # (1, 1, 1, 128, 64, 16),
-        # (1, 1, 1, 192, 128, 32),
-        # (1, 2, 2, 1024, 512, 68),
-        # (1, 4, 4, 729, 516, 68),
-        # (2, 4, 4, 2753, 1528, 68),  # a comprehensive seqlen_q > seqlen_k
-        # # seqlen q < k
-        # (1, 1, 1, 2, 4, 16),
-        # (1, 2, 2, 2, 4, 16),
-        # (1, 4, 1, 2, 4, 16),
-        # (1, 4, 2, 2, 4, 16),
-        # (2, 2, 2, 2, 128, 1),
-        # (2, 3, 3, 2, 128, 16),
-        # (1, 1, 1, 32, 64, 8),
-        # (1, 1, 1, 128, 192, 32),
-        # (4, 6, 6, 108, 256, 32),
-        # (3, 2, 2, 256, 512, 16),
-        # (2, 2, 2, 512, 1024, 68),
-        # (1, 1, 1, 200, 413, 32),
-        # (1, 1, 1, 782, 1546, 32),
-        # # gqa/mqa                   # mismatch issue on varlen
-        # (4, 8, 2, 500, 500, 68), 
-        # (4, 8, 2, 512, 512, 68),
-        # (4, 8, 2, 512, 512, 128),
-        # (4, 8, 2, 512, 1024, 68),
-        # (4, 8, 2, 1024, 512, 64),
-        # (4, 16, 4, 1528, 2753, 68),
-        # # fa configs
-        # (2, 4, 1, 113, 203, 64),
-        # (2, 4, 2, 128, 217, 64),
-        # (2, 6, 2, 113, 211, 128),
-        # (2, 6, 2, 108, 256, 128),
-        # (2, 6, 2, 256, 512, 64),
-        # (2, 6, 2, 512, 256, 64),
-        # (2, 6, 2, 1024, 1024, 32),
-        # (2, 6, 2, 1023, 1024, 32),
-        # (2, 6, 6, 1024, 1023, 32),
-        # (2, 6, 6, 2048, 2048, 32),
+        (1, 2, 2, 4, 4, 16),
+        (2, 1, 1, 4, 4, 16),
+        (2, 2, 2, 4, 4, 16),
+        (1, 1, 1, 128, 128, 32),  # only one block
+        (3, 3, 3, 128, 128, 64),
+        (1, 1, 1, 127, 127, 32),  # only one block but with masking
+        # (1, 1, 1, 129, 129, 1),  # two blocks with 2nd block small enough to debug # fails
+        (1, 2, 2, 129, 129, 32),  # two blocks with 2nd block small enough to debug
+        (1, 1, 1, 350, 350, 32),  # two blocks with 2nd block small enough to debug
+        (1, 1, 1, 350, 350, 68),  # generic masking on q, k and head
+        (4, 1, 1, 512, 512, 128), # batch > 1
+        (4, 2, 2, 512, 512, 128),
+        (4, 2, 2, 512, 512, 68),
+        (4, 2, 2, 500, 500, 68),
+        (2, 4, 4, 1024, 1024, 64),
+        (4, 8, 8, 2048, 2048, 128),
+        (2, 8, 8, 4096, 4096, 64),
+        (2, 4, 4, 8192, 8192, 32),
+        # seqlen q > k
+        (1, 1, 1, 4, 2, 16),
+        (1, 1, 1, 64, 32, 8),
+        (1, 1, 1, 128, 64, 16),
+        (1, 1, 1, 192, 128, 32),
+        (1, 2, 2, 1024, 512, 68),
+        (1, 4, 4, 729, 516, 68),
+        (2, 4, 4, 2753, 1528, 68),  # a comprehensive seqlen_q > seqlen_k
+        # seqlen q < k
+        (1, 1, 1, 2, 4, 16),
+        (1, 2, 2, 2, 4, 16),
+        (1, 4, 1, 2, 4, 16),
+        (1, 4, 2, 2, 4, 16),
+        (2, 2, 2, 2, 128, 1),
+        (2, 3, 3, 2, 128, 16),
+        (1, 1, 1, 32, 64, 8),
+        (1, 1, 1, 128, 192, 32),
+        (4, 6, 6, 108, 256, 32),
+        (3, 2, 2, 256, 512, 16),
+        (2, 2, 2, 512, 1024, 68),
+        (1, 1, 1, 200, 413, 32),
+        (1, 1, 1, 782, 1546, 32),
+        # gqa/mqa                   # mismatch issue on varlen
+        (4, 8, 2, 500, 500, 68), 
+        (4, 8, 2, 512, 512, 68),
+        (4, 8, 2, 512, 512, 128),
+        (4, 8, 2, 512, 1024, 68),
+        (4, 8, 2, 1024, 512, 64),
+        (4, 16, 4, 1528, 2753, 68),
+        # fa configs
+        (2, 4, 1, 113, 203, 64),
+        (2, 4, 2, 128, 217, 64),
+        (2, 6, 2, 113, 211, 128),
+        (2, 6, 2, 108, 256, 128),
+        (2, 6, 2, 256, 512, 64),
+        (2, 6, 2, 512, 256, 64),
+        (2, 6, 2, 1024, 1024, 32),
+        (2, 6, 2, 1023, 1024, 32),
+        (2, 6, 6, 1024, 1023, 32),
+        (2, 6, 6, 2048, 2048, 32),
     ],
 )
-@pytest.mark.parametrize('causal', [True])
+@pytest.mark.parametrize('causal', [False, True])
 @pytest.mark.parametrize('dropout_p', [0.0])
-@pytest.mark.parametrize('layout', ["bshd"])
-@pytest.mark.parametrize('packing', [None])
+@pytest.mark.parametrize('layout', ["bshd", "thd"])
+@pytest.mark.parametrize('packing', [None, "qkv"])
 @pytest.mark.parametrize('DEBUG_INPUT', [False])
-# @pytest.mark.flaky(reruns=3, reason="Retry failures")
+@pytest.mark.flaky(reruns=3, reason="Retry failures")
 @pytest.mark.skipif(not arch_supports_fp8(), reason="fp8 not supported on this device")
 def test_fp8(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dropout_p, layout, packing, DEBUG_INPUT):
     torch.manual_seed(20)

From 35ac48826337d15c96bf4ebb2a80d533513ddf6b Mon Sep 17 00:00:00 2001
From: Michael <micmelesse@gmail.com>
Date: Fri, 2 May 2025 22:59:52 -0500
Subject: [PATCH 7/8] skip

---
 .github/workflows/amd_tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/amd_tests.yml b/.github/workflows/amd_tests.yml
index 6056b9397d9..2f49567f960 100644
--- a/.github/workflows/amd_tests.yml
+++ b/.github/workflows/amd_tests.yml
@@ -51,7 +51,6 @@ jobs:
           pip install matplotlib pandas tabulate
 
       - name: AMD Internal Tests
-        if: False
         run: |
           FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0 pytest flash_attn/flash_attn_triton_amd/test.py
 

From ce03b586365e01d8eff31a3a8bc1bcd51b15f40d Mon Sep 17 00:00:00 2001
From: Michael <micmelesse@gmail.com>
Date: Fri, 2 May 2025 23:25:52 -0500
Subject: [PATCH 8/8] skip ref tests

---
 flash_attn/flash_attn_triton_amd/bwd_ref.py | 2 +-
 flash_attn/flash_attn_triton_amd/test.py    | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/flash_attn/flash_attn_triton_amd/bwd_ref.py b/flash_attn/flash_attn_triton_amd/bwd_ref.py
index 90a98ce4fcc..639211a51f6 100644
--- a/flash_attn/flash_attn_triton_amd/bwd_ref.py
+++ b/flash_attn/flash_attn_triton_amd/bwd_ref.py
@@ -122,7 +122,7 @@ def attention_backward_core_ref_impl(
             print("dp:", dp, dp.shape)
 
     # calculate ds
-    if False:
+    if True:
         delta = torch.sum(o * do, axis=-1).unsqueeze(-1)
     else:
         delta = torch.sum(p * dp, axis=-1).unsqueeze(-1)
diff --git a/flash_attn/flash_attn_triton_amd/test.py b/flash_attn/flash_attn_triton_amd/test.py
index fed61583229..ea82de065b5 100644
--- a/flash_attn/flash_attn_triton_amd/test.py
+++ b/flash_attn/flash_attn_triton_amd/test.py
@@ -23,7 +23,7 @@
 from .utils import DEBUG, input_helper, arch_supports_fp8
 from .fwd_ref import attention_forward_pytorch_ref_impl
 from .fwd_prefill import attention_prefill_forward_triton_impl
-from .bwd_prefill_split import attention_prefill_backward_triton_split_impl
+from .bwd_prefill_onekernel import attention_prefill_backward_triton_split_oneKernel_impl
 from .bwd_ref import attention_backward_pytorch_ref_impl
 
 # set print options
@@ -83,6 +83,7 @@
 @pytest.mark.parametrize('dtype', [torch.float16])
 @pytest.mark.parametrize('use_exp2', [True, False]) # works when use_exp2 is false
 @pytest.mark.parametrize('DEBUG_INPUT', [False]) # NOTE: debug input can overflow when the tensors are large. Just use to figure out issues
+@pytest.mark.skip()
 def test_op_prefill_fwd_impl(BATCH, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dropout_p, alibi_slopes, layout, dtype, use_exp2, DEBUG_INPUT):
     torch.manual_seed(42)
     device = "cuda"
@@ -258,6 +259,7 @@ def test_op_prefill_fwd_impl(BATCH, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dr
 @pytest.mark.parametrize('dtype', [torch.float16])
 @pytest.mark.parametrize('use_exp2', [False]) # FIXME: using exp2 causes issue when used with causal
 @pytest.mark.parametrize('DEBUG_INPUT', [False]) # debug output causes nans on larger tensors
+@pytest.mark.skip()
 def test_op_prefill_bwd_impl(BATCH, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dropout_p, alibi_slopes, layout, dtype, use_exp2, DEBUG_INPUT):
     torch.manual_seed(20)
     device="cuda"
@@ -332,7 +334,7 @@ def test_op_prefill_bwd_impl(BATCH, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dr
     dq_triton = torch.zeros_like(q_triton, dtype=q.dtype) # NOTE: the kernel does inplace accumlation on dq so dq has to be zeros
     dk_triton = torch.zeros_like(k_triton, dtype=k.dtype) if DEBUG_INPUT else torch.empty_like(k_triton, dtype=k.dtype)
     dv_triton = torch.zeros_like(v_triton, dtype=v.dtype) if DEBUG_INPUT else torch.empty_like(v_triton, dtype=v.dtype)
-    delta_triton = attention_prefill_backward_triton_split_impl(
+    delta_triton = attention_prefill_backward_triton_split_oneKernel_impl(
         do_triton,
         q_triton,
         k_triton,