@@ -434,15 +434,15 @@ def flash_attention(
434434 query(Tensor): The query tensor in the Attention module.
435435 4-D tensor with shape:
436436 [batch_size, seq_len, num_heads, head_dim].
437- The dtype can be float61 or bfloat16.
437+ The dtype can be float16 or bfloat16.
438438 key(Tensor): The key tensor in the Attention module.
439439 4-D tensor with shape:
440440 [batch_size, seq_len, num_heads, head_dim].
441- The dtype can be float61 or bfloat16.
441+ The dtype can be float16 or bfloat16.
442442 value(Tensor): The value tensor in the Attention module.
443443 4-D tensor with shape:
444444 [batch_size, seq_len, num_heads, head_dim].
445- The dtype can be float61 or bfloat16.
445+ The dtype can be float16 or bfloat16.
446446 dropout(float): The dropout ratio.
447447 causal(bool): Whether to enable causal mode.
448448 return_softmax(bool): Whether to return softmax.
@@ -623,6 +623,157 @@ def flash_attention(
623623 )
624624
625625
626+ @overload
627+ def flash_attention_v3_varlen(
628+ query: Tensor,
629+ key: Tensor,
630+ value: Tensor,
631+ cu_seqlens_q: Tensor,
632+ cu_seqlens_k: Tensor,
633+ dropout: float = ...,
634+ causal: bool = ...,
635+ return_softmax: Literal[False] = ...,
636+ *,
637+ fixed_seed_offset: Tensor | None = ...,
638+ rng_name: str = ...,
639+ training: bool = ...,
640+ softmax_scale: float | None = ...,
641+ max_seqlen_q: int = ...,
642+ max_seqlen_k: int = ...,
643+ name: str | None = ...,
644+ ) -> tuple[Tensor, None]: ...
645+
646+
647+ @overload
648+ def flash_attention_v3_varlen(
649+ query: Tensor,
650+ key: Tensor,
651+ value: Tensor,
652+ cu_seqlens_q: Tensor,
653+ cu_seqlens_k: Tensor,
654+ dropout: float = ...,
655+ causal: bool = ...,
656+ return_softmax: Literal[True] = ...,
657+ *,
658+ fixed_seed_offset: Tensor | None = ...,
659+ rng_name: str = ...,
660+ training: bool = ...,
661+ softmax_scale: float | None = ...,
662+ max_seqlen_q: int = ...,
663+ max_seqlen_k: int = ...,
664+ name: str | None = ...,
665+ ) -> tuple[Tensor, Tensor]: ...
666+
667+
668+ def flash_attention_v3_varlen(
669+ query,
670+ key,
671+ value,
672+ cu_seqlens_q,
673+ cu_seqlens_k,
674+ dropout=0.0,
675+ causal=False,
676+ return_softmax=False,
677+ *,
678+ fixed_seed_offset=None,
679+ rng_name="",
680+ training=True,
681+ softmax_scale=None,
682+ max_seqlen_q=0,
683+ max_seqlen_k=0,
684+ name=None,
685+ ):
686+ r"""
687+ The equation is:
688+
689+ .. math::
690+
691+ result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V
692+
693+ where ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
694+ The dimensions of the three parameters are the same.
695+ ``d`` represents the size of the last dimension of the three parameters.
696+ This is the varlen version of flash attention.
697+
698+ Warning:
699+ This API only supports inputs with dtype float16 and bfloat16.
700+
701+ Args:
702+ query(Tensor): The query tensor in the Attention module.
703+ 3-D tensor with shape:
704+ [token_num, num_heads, head_dim].
705+ The dtype can be float16 or bfloat16.
706+ key(Tensor): The key tensor in the Attention module.
707+ 3-D tensor with shape:
708+ [token_num, num_heads, head_dim].
709+ The dtype can be float16 or bfloat16.
710+ value(Tensor): The value tensor in the Attention module.
711+ 3-D tensor with shape:
712+ [token_num, num_heads, head_dim].
713+ The dtype can be float16 or bfloat16.
714+ cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
715+ used to index query.
716+ cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
717+ used to index key and value.
718+ dropout(float): The dropout ratio.
719+ causal(bool): Whether to enable causal mode.
720+ return_softmax(bool): Whether to return softmax.
721+ fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
722+ rng_name(str): The name to select Generator.
723+ training(bool): Whether it is in the training phase.
724+ softmax_scale(float|None, optional): The softmax scale of the attention. If None, it defaults to ``head_dim ** -0.5``.
725+ max_seqlen_q(int): Maximum sequence length of query in the batch. Note that this is the padding length, not the maximum actual sequence length.
726+ max_seqlen_k(int): Maximum sequence length of key/value in the batch.
727+ name(str|None, optional): The default value is None. Normally there is no need for user
728+ to set this property. For more information, please refer to
729+ :ref:`api_guide_Name`.
730+
731+ Returns:
732+ out(Tensor): The attention tensor. 3-D tensor with shape: [token_num, num_heads, head_dim]. The dtype can be float16 or bfloat16.
733+ softmax(Tensor): The softmax tensor. None if return_softmax is False.
734+
735+ Examples:
736+ .. code-block:: python
737+
738+ >>> # doctest: +SKIP('flash_attn_v3 needs H100 compile')
739+ >>> import paddle
740+
741+ >>> paddle.seed(2023)
742+ >>> q = paddle.rand((10, 2, 128), dtype="bfloat16")
743+ >>> cu_seqlens_q = paddle.to_tensor([0, 10], dtype="int32")
744+ >>> max_seq_len_q = 10
745+
746+ >>> output = paddle.nn.functional.flash_attention.flash_attention_v3_varlen(q, q, q, cu_seqlens_q, cu_seqlens_q, max_seqlen_q=max_seq_len_q, max_seqlen_k=max_seq_len_q, causal=True)
747+ >>> # doctest: -SKIP
748+
749+ """
750+ if softmax_scale is None:
751+ softmax_scale = query.shape[-1] ** (-0.5)
752+ out, softmax_lse = _C_ops.flash_attn_v3_varlen(
753+ query,
754+ key,
755+ value,
756+ cu_seqlens_q,
757+ cu_seqlens_k,
758+ None,  # q_v_
759+ None,  # q_descale_
760+ None,  # k_descale_
761+ None,  # v_descale_
762+ softmax_scale,
763+ causal,
764+ -1,  # window_size_left
765+ -1,  # window_size_right
766+ 0.0,  # softcap
767+ 1,  # num_splits
768+ False,  # manual_set_pack_gqa
769+ False,  # pack_gqa_
770+ 0,  # sm_margin
771+ max_seqlen_q,
772+ max_seqlen_k,
773+ )
774+ return out, softmax_lse
775+
776+
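The packed varlen layout above is easiest to see with a concrete call. The following sketch is editorial, not part of the patch: it assumes a build with flash_attn_v3 support (the H100 compile the doctest skip mentions), and the names `seq_lens`, `cu_seqlens`, and `q` are illustrative. It packs two sequences of lengths 3 and 7 into one token-major tensor and marks the segment boundaries with a cumulative-length tensor, mirroring the single-sequence example in the docstring.

    import paddle
    from paddle.nn.functional.flash_attention import flash_attention_v3_varlen

    seq_lens = [3, 7]               # actual per-sequence lengths
    token_num = sum(seq_lens)       # 10 tokens after packing
    # Packed query/key/value: [token_num, num_heads, head_dim], bfloat16.
    q = paddle.rand((token_num, 2, 128), dtype="bfloat16")
    # Running sum of seq_lens prefixed with 0; sequence i spans
    # tokens [cu_seqlens[i], cu_seqlens[i+1]) of the packed tensor.
    cu_seqlens = paddle.to_tensor([0, 3, 10], dtype="int32")

    out, _ = flash_attention_v3_varlen(
        q, q, q,                    # self-attention: share one packed tensor
        cu_seqlens_q=cu_seqlens,
        cu_seqlens_k=cu_seqlens,
        max_seqlen_q=max(seq_lens), # upper bound per sequence (see max_seqlen_q note)
        max_seqlen_k=max(seq_lens),
        causal=True,
    )
    print(out.shape)                # [10, 2, 128]

Because attention is computed per segment, tokens of the length-3 sequence never attend to tokens of the length-7 sequence, and the causal mask is applied within each sequence rather than across the packed batch.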
626777@overload
627778def flash_attn_qkvpacked (
628779 qkv : Tensor ,
@@ -912,15 +1063,15 @@ def flash_attn_unpadded(
9121063 query(Tensor): The query tensor in the Attention module.
9131064 3-D tensor with shape:
9141065 [total_seq_len, num_heads, head_dim].
915- The dtype can be float61 or bfloat16.
1066+ The dtype can be float16 or bfloat16.
9161067 key(Tensor): The key tensor in the Attention module.
9171068 3-D tensor with shape:
9181069 [total_seq_len, num_heads, head_dim].
919- The dtype can be float61 or bfloat16.
1070+ The dtype can be float16 or bfloat16.
9201071 value(Tensor): The value tensor in the Attention module.
9211072 3-D tensor with shape:
9221073 [total_seq_len, num_heads, head_dim].
923- The dtype can be float61 or bfloat16.
1074+ The dtype can be float16 or bfloat16.
9241075 cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
9251076 used to index query.
9261077 cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,