From 020ac15a7904a38b6369880c427be6f569080cf6 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang
Date: Wed, 4 Mar 2026 01:16:42 +0000
Subject: [PATCH 01/35] ONNX Attention thin-dispatcher: direct flash/MEA/unfused dispatch with nonpad_kv_seqlen support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace MHA/GQA routing with direct flash→MEA→unfused cascade
- Decouple GQA: direct flash (native) + MEA (LaunchUngroup head expansion)
- Add nonpad_kv_seqlen support for opset 24 (MHA and GQA)
- Add 3 CUDA kernels for nonpad seqlen conversion (flash, MEA, unfused bias)
- Register ONNX Attention opset 24 kernel
- Delete ComputeGQA (-265 lines), net reduction in code
- Add comprehensive tests: TensorScatter end-to-end, MHA/GQA nonpad

Addresses #27516 (decouple MHA/GQA kernels) and #27485 (nonpad_kv_seqlen)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../cuda/bert/group_query_attention_impl.h    |    9 +
 .../core/providers/cpu/llm/attention_helper.h |   16 +-
 .../providers/cuda/cuda_execution_provider.cc |   18 +-
 .../core/providers/cuda/llm/attention.cc      | 1255 ++++++++++-------
 .../core/providers/cuda/llm/attention.h       |   26 +
 .../providers/cuda/llm/attention_mask_impl.cu |  140 ++
 .../providers/cuda/llm/attention_mask_impl.h  |   40 +
 .../test_onnx_attention/common.py             |   28 +-
 .../test_onnx_attention/test_gqa.py           |  228 +++
 .../test_onnx_attention/test_mha.py           |  234 +++
 .../test_tensorscatter_attention.py           |  589 ++++++++
 11 files changed, 2026 insertions(+), 557 deletions(-)
 create mode 100644 onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py

diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.h
index 78b061837e402..a7fefe6509277 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.h
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.h
@@ -122,6 +122,15 @@ struct GQABufferRequirements {
   }
 };
 
+template <typename T>
+Status LaunchUngroup(const GroupQueryAttentionParameters& parameters,
+                     float2* k_buff, float2* v_buff,
+                     const float2* k_og, const float2* v_og,
+                     const int buff_seqlen, const int og_seqlen,
+                     const bool is_bsnh,
+                     cudaStream_t stream,
+                     const int max_threads_per_block);
+
 Status LaunchGetSequenceLengths(
     const int* total_seq_lens_minus_one,
     int* past_seq_lens,
diff --git a/onnxruntime/core/providers/cpu/llm/attention_helper.h b/onnxruntime/core/providers/cpu/llm/attention_helper.h
index e7df1a078472a..c41c275a61340 100644
--- a/onnxruntime/core/providers/cpu/llm/attention_helper.h
+++ b/onnxruntime/core/providers/cpu/llm/attention_helper.h
@@ -27,7 +27,8 @@ inline Status ComputeOutputShapeForAttention(
     TensorShape& y_shape,
     TensorShape& present_key_shape,
     TensorShape& present_value_shape,
-    TensorShape& output_qk_shape) {
+    TensorShape& output_qk_shape,
+    bool skip_nonpad_data_validation = false) {
   ORT_ENFORCE(Q != nullptr && K != nullptr && V != nullptr, "Q, K, and V inputs must not be null");
 
   int q_dims = onnxruntime::narrow<int>(Q->Shape().NumDimensions());
@@ -115,11 +116,14 @@ inline Status ComputeOutputShapeForAttention(
     parameters.has_nonpad_kv_seqlen = true;
     parameters.nonpad_kv_seqlen_data = nonpad_kv_seqlen->Data<int64_t>();
     // Validate each value is in [0, total_sequence_length].
- for (int i = 0; i < parameters.batch_size; ++i) { - ORT_ENFORCE(parameters.nonpad_kv_seqlen_data[i] >= 0 && - parameters.nonpad_kv_seqlen_data[i] <= parameters.total_sequence_length, - "nonpad_kv_seqlen[", i, "] = ", parameters.nonpad_kv_seqlen_data[i], - " is out of range [0, ", parameters.total_sequence_length, "]"); + // Skip when data is on GPU (CUDA path sets skip_nonpad_data_validation=true). + if (!skip_nonpad_data_validation) { + for (int i = 0; i < parameters.batch_size; ++i) { + ORT_ENFORCE(parameters.nonpad_kv_seqlen_data[i] >= 0 && + parameters.nonpad_kv_seqlen_data[i] <= parameters.total_sequence_length, + "nonpad_kv_seqlen[", i, "] = ", parameters.nonpad_kv_seqlen_data[i], + " is out of range [0, ", parameters.total_sequence_length, "]"); + } } } else { parameters.has_nonpad_kv_seqlen = false; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index bf6fcc7ccf0a8..92e1e5108cb8a 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1592,9 +1592,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, HardSwish); // Opset 23. -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, float, Attention); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, MLFloat16, Attention); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, BFloat16, Attention); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, 23, float, Attention); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, 23, MLFloat16, Attention); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, 23, BFloat16, Attention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, float_float, RMSNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, double_double, RMSNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, MLFloat16_MLFloat16, RMSNormalization); @@ -1633,6 +1633,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, // Opset 24. 
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, TensorScatter);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, float, Attention);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, MLFloat16, Attention);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, BFloat16, Attention);
 
 #endif
 
@@ -2671,9 +2674,9 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, HardSwish)>,
 
       // Opset 23
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, float, Attention)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, MLFloat16, Attention)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, BFloat16, Attention)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, 23, float, Attention)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, 23, MLFloat16, Attention)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, 23, BFloat16, Attention)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, float_float, RMSNormalization)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, double_double, RMSNormalization)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, MLFloat16_MLFloat16, RMSNormalization)>,
@@ -2711,6 +2714,9 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
 
       // Opset 24
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, TensorScatter)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, float, Attention)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, MLFloat16, Attention)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, BFloat16, Attention)>,
 #endif
   };
 
diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc
index 8fc7a49827087..d55f228ae5c4c 100644
--- a/onnxruntime/core/providers/cuda/llm/attention.cc
+++ b/onnxruntime/core/providers/cuda/llm/attention.cc
@@ -20,10 +20,11 @@ namespace onnxruntime {
 namespace cuda {
 
 #define REGISTER_KERNEL_TYPED(T)                  \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                  \
+  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(        \
       Attention,                                  \
       kOnnxDomain,                                \
       23,                                         \
+      23,                                         \
       T,                                          \
       kCudaExecutionProvider,                     \
       (*KernelDefBuilder::Create())               \
@@ -36,11 +37,26 @@ REGISTER_KERNEL_TYPED(float)
 REGISTER_KERNEL_TYPED(MLFloat16)
 REGISTER_KERNEL_TYPED(BFloat16)
 
+#define REGISTER_KERNEL_TYPED_24(T)                                \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                   \
+      Attention,                                                   \
+      kOnnxDomain,                                                 \
+      24,                                                          \
+      T,                                                           \
+      kCudaExecutionProvider,                                      \
+      (*KernelDefBuilder::Create())                                \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<T>())  \
+          .TypeConstraint("T2", DataTypeImpl::GetTensorType<T>())  \
+          .TypeConstraint("U", BuildKernelDefConstraints()),       \
+      Attention<T>);
+
+REGISTER_KERNEL_TYPED_24(float)
+REGISTER_KERNEL_TYPED_24(MLFloat16)
+REGISTER_KERNEL_TYPED_24(BFloat16)
+
 template <typename T>
 Attention<T>::Attention(const OpKernelInfo& info) : CudaKernel(info) {
   is_causal_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("is_causal", 0)) == 1;
-  // kv_num_heads and q_num_heads are mandatory for 3D inputs but not used for 4D inputs.
-  // The dimension is not yet known here. If not specified, the inputs are assumed to be 4D.
   kv_num_heads_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("kv_num_heads", 0));
   q_num_heads_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("q_num_heads", 0));
   int mode = static_cast<int>(info.GetAttrOrDefault<int64_t>("qk_matmul_output_mode", 0));
@@ -53,13 +69,636 @@ Attention<T>::Attention(const OpKernelInfo& info) : CudaKernel(info) {
                 qk_matmul_output_mode_ == attention_helper::QKMatMulOutputMode::kQKSoftCap ||
                 qk_matmul_output_mode_ == attention_helper::QKMatMulOutputMode::kQKSoftMax,
             "qk_matmul_output_mode must be 0, 1, 2, or 3.");
-  // The default scale depends on the input dimensions. It is set to nan to indicate that it should be computed.
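// Context for the quiet-NaN default used just below: the sentinel defers the
// default-scale computation to ComputeOutputShapeForAttention, where head_size is
// known. A minimal sketch of the deferred rule (illustrative helper, not part of
// this patch; ONNX Attention defaults scale to 1/sqrt(head_size) when unset):
#include <cmath>

inline float ResolveScale(float scale_attr, int head_size) {
  return std::isnan(scale_attr) ? 1.0f / std::sqrt(static_cast<float>(head_size))
                                : scale_attr;
}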
scale_ = info.GetAttrOrDefault("scale", std::numeric_limits::quiet_NaN()); softcap_ = info.GetAttrOrDefault("softcap", 0.0f); softmax_precision_ = static_cast(info.GetAttrOrDefault("softmax_precision", 0)); ORT_ENFORCE(scale_ > 0 || std::isnan(scale_), "scale must be greater than 0 if specified"); } +// ============================================================================ +// RunFlashAttention: Direct flash attention kernel call +// ============================================================================ +template +Status Attention::RunFlashAttention( + OpKernelContext* context, + const Tensor* Q, const Tensor* K, const Tensor* V, + const Tensor* attn_mask, const Tensor* past_key, const Tensor* past_value, + const Tensor* nonpad_kv_seqlen, + Tensor* Y, Tensor* present_key, Tensor* present_value, + const attention_helper::AttentionParameters& parameters) const { +#if USE_FLASH_ATTENTION + ORT_UNUSED_PARAMETER(attn_mask); + ORT_UNUSED_PARAMETER(past_key); + ORT_UNUSED_PARAMETER(past_value); + auto& device_prop = GetDeviceProp(); + auto cuda_stream = static_cast(context->GetComputeStream()->GetHandle()); + const bool is_bf16 = std::is_same::value; + const bool is_bsnh = parameters.transpose_output; // 3D inputs → BSNH + + // Allocate softmax_lse and accumulation buffers + size_t softmax_lse_bytes = onnxruntime::flash::get_softmax_lse_size( + parameters.q_sequence_length, parameters.batch_size, parameters.q_num_heads); + + auto [num_splits, softmax_lse_accum_bytes, out_accum_bytes] = + onnxruntime::flash::get_num_splits_and_buffer_sizes( + parameters.batch_size, parameters.q_sequence_length, + parameters.total_sequence_length, parameters.q_num_heads, + parameters.head_size, device_prop.multiProcessorCount); + + auto softmax_lse_buffer = GetScratchBuffer(softmax_lse_bytes, context->GetComputeStream()); + auto softmax_lse_accum_buffer = GetScratchBuffer(softmax_lse_accum_bytes, context->GetComputeStream()); + auto out_accum_buffer = GetScratchBuffer(out_accum_bytes, context->GetComputeStream()); + + if (softmax_lse_accum_bytes > 0) { + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(softmax_lse_accum_buffer.get(), 0, + softmax_lse_accum_bytes, cuda_stream)); + } + if (out_accum_bytes > 0) { + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(out_accum_buffer.get(), 0, + out_accum_bytes, cuda_stream)); + } + + // Handle nonpad_kv_seqlen: external KV cache path (opset 24) + if (nonpad_kv_seqlen != nullptr) { + ORT_ENFORCE(parameters.past_sequence_length == 0, + "RunFlashAttention with nonpad_kv_seqlen requires K/V to be the full cache " + "(past_sequence_length must be 0, got ", + parameters.past_sequence_length, ")."); + + auto seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); + ORT_RETURN_IF_ERROR(LaunchConvertNonpadKvSeqlenToFlashSeqlensK( + nonpad_kv_seqlen->Data(), + seqlens_k_buffer.get(), + parameters.batch_size, + parameters.total_sequence_length, + cuda_stream, + device_prop.maxThreadsPerBlock)); + + // K/V are the full cache in BSNH. No new tokens to append (k=nullptr, v=nullptr). 
+ ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache( + device_prop, cuda_stream, + const_cast(static_cast(Q->Data())), + const_cast(static_cast(K->Data())), + const_cast(static_cast(V->Data())), + /*k=*/nullptr, /*v=*/nullptr, + static_cast(Y->MutableData()), + softmax_lse_buffer.get(), + const_cast(static_cast(seqlens_k_buffer.get())), + /*rotary_cos=*/nullptr, /*rotary_sin=*/nullptr, + /*cache_batch_idx=*/nullptr, /*leftpad_k=*/nullptr, + /*head_sink=*/nullptr, /*block_table=*/nullptr, + parameters.batch_size, parameters.q_num_heads, parameters.kv_num_heads, + parameters.head_size, + parameters.q_sequence_length, parameters.kv_sequence_length, + /*seqlen_k_new=*/0, /*rotary_dim=*/0, + parameters.scale, parameters.softcap, + parameters.is_causal, is_bf16, /*use_smooth_softmax=*/false, + /*past_bsnh=*/is_bsnh, + static_cast(num_splits), + softmax_lse_accum_buffer.get(), out_accum_buffer.get(), + /*local_window_size=*/-1, /*is_rotary_interleaved=*/false, + /*is_packed_qkv=*/false)); + + // Populate present_key/value (BNSH) from external cache K/V (BSNH) + if (present_key != nullptr && is_bsnh) { + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + reinterpret_cast(K->Data()), + reinterpret_cast(present_key->MutableData()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), present_key->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + } + if (present_value != nullptr && is_bsnh) { + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + reinterpret_cast(V->Data()), + reinterpret_cast(present_value->MutableData()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), present_value->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + } + return Status::OK(); + } + + // Note: Flash with past_key is excluded by flash_eligible (requires past_key == nullptr). + // Those cases fall through to unfused attention which handles past concatenation. 
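// Layout contract behind the repeated Transpose_BSNH_to_BNSH calls in this file,
// as a self-contained sketch (the shipped kernel is the tuned contrib-ops one;
// this only documents the index mapping out[b, n, s, h] = in[b, s, n, h]):
template <typename T>
__global__ void TransposeBSNHToBNSHSketch(const T* in, T* out, int S, int N, int H) {
  int b = blockIdx.z;                              // batch
  int s = blockIdx.y;                              // sequence position
  int nh = blockIdx.x * blockDim.x + threadIdx.x;  // flattened (head, head_dim)
  if (nh < N * H) {
    int n = nh / H;
    int h = nh % H;
    out[((b * N + n) * S + s) * H + h] = in[((b * S + s) * N + n) * H + h];
  }
}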
+ + // No past, no nonpad: prompt-only flash attention + ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd( + device_prop, cuda_stream, + const_cast(static_cast(Q->Data())), + const_cast(static_cast(K->Data())), + const_cast(static_cast(V->Data())), + static_cast(Y->MutableData()), + softmax_lse_buffer.get(), + parameters.batch_size, parameters.q_num_heads, parameters.kv_num_heads, + parameters.head_size, + parameters.q_sequence_length, parameters.kv_sequence_length, + parameters.scale, parameters.softcap, + parameters.is_causal, is_bf16, /*use_smooth_softmax=*/false, + static_cast(num_splits), + softmax_lse_accum_buffer.get(), out_accum_buffer.get(), + is_bsnh)); + + // Populate present_key/present_value (BNSH) from K/V (BSNH) for no-past case + if (present_key != nullptr && is_bsnh) { + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + reinterpret_cast(K->Data()), + reinterpret_cast(present_key->MutableData()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), present_key->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + } + if (present_value != nullptr && is_bsnh) { + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + reinterpret_cast(V->Data()), + reinterpret_cast(present_value->MutableData()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), present_value->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + } + + return Status::OK(); +#else + ORT_UNUSED_PARAMETER(context); + ORT_UNUSED_PARAMETER(Q); + ORT_UNUSED_PARAMETER(K); + ORT_UNUSED_PARAMETER(V); + ORT_UNUSED_PARAMETER(attn_mask); + ORT_UNUSED_PARAMETER(past_key); + ORT_UNUSED_PARAMETER(past_value); + ORT_UNUSED_PARAMETER(nonpad_kv_seqlen); + ORT_UNUSED_PARAMETER(Y); + ORT_UNUSED_PARAMETER(present_key); + ORT_UNUSED_PARAMETER(present_value); + ORT_UNUSED_PARAMETER(parameters); + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, + "Flash attention is not available in this build."); +#endif +} + +// ============================================================================ +// RunMemoryEfficientAttention: Direct memory-efficient attention kernel call +// ============================================================================ +template +Status Attention::RunMemoryEfficientAttention( + OpKernelContext* context, + const Tensor* Q, const Tensor* K, const Tensor* V, + const Tensor* attn_mask, const Tensor* past_key, const Tensor* past_value, + const Tensor* nonpad_kv_seqlen, + Tensor* Y, Tensor* present_key, Tensor* present_value, + const attention_helper::AttentionParameters& parameters) const { +#if USE_MEMORY_EFFICIENT_ATTENTION + ORT_UNUSED_PARAMETER(past_key); + ORT_UNUSED_PARAMETER(past_value); + auto& device_prop = GetDeviceProp(); + auto cuda_stream = 
static_cast(context->GetComputeStream()->GetHandle()); + const bool is_bsnh = parameters.transpose_output; + const int sm = device_prop.major * 10 + device_prop.minor; + + // Q/K/V pointers — MEA expects BSNH format + const void* q_data = Q->Data(); + const void* k_data = K->Data(); + const void* v_data = V->Data(); + + // GQA head expansion: MEA requires matching num_heads for Q/K/V. + // When q_num_heads != kv_num_heads, expand K/V via LaunchUngroup. + const bool is_gqa = parameters.q_num_heads != parameters.kv_num_heads; + IAllocatorUniquePtr k_expand_buffer; + IAllocatorUniquePtr v_expand_buffer; + + if (is_gqa) { + // GQA+MEA only works with fp16/bf16 (MEA doesn't support fp32). + // Use if constexpr to avoid instantiating LaunchUngroup which has no explicit + // template instantiation in group_query_attention_impl.cu. + if constexpr (std::is_same_v) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "GQA with Memory Efficient Attention requires fp16 or bf16, not fp32."); + } else { + ORT_ENFORCE(parameters.head_size == parameters.v_head_size, + "GQA with MEA requires head_size == v_head_size for LaunchUngroup."); + ORT_ENFORCE(parameters.head_size % 4 == 0, + "GQA with MEA requires head_size divisible by 4 for LaunchUngroup (float2 access)."); + const size_t expanded_kv_elements = static_cast(parameters.batch_size) * + static_cast(parameters.total_sequence_length) * + static_cast(parameters.q_num_heads) * + static_cast(parameters.head_size); + k_expand_buffer = GetScratchBuffer(expanded_kv_elements * sizeof(T), context->GetComputeStream()); + v_expand_buffer = GetScratchBuffer(expanded_kv_elements * sizeof(T), context->GetComputeStream()); + + onnxruntime::contrib::GroupQueryAttentionParameters ungroup_params = {}; + ungroup_params.batch_size = parameters.batch_size; + ungroup_params.num_heads = parameters.q_num_heads; + ungroup_params.kv_num_heads = parameters.kv_num_heads; + ungroup_params.head_size = parameters.head_size; + + using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::LaunchUngroup( + ungroup_params, + reinterpret_cast(k_expand_buffer.get()), + reinterpret_cast(v_expand_buffer.get()), + reinterpret_cast(k_data), + reinterpret_cast(v_data), + parameters.total_sequence_length, + parameters.total_sequence_length, + is_bsnh, + cuda_stream, + device_prop.maxThreadsPerBlock)); + + k_data = k_expand_buffer.get(); + v_data = v_expand_buffer.get(); + } + } + + // Note: MEA with past_key/value is handled by the unfused fallback. + // The cascade in ComputeInternal ensures past_key == nullptr when we reach here. + + // Handle attention mask → attention_bias conversion + IAllocatorUniquePtr converted_mask_buffer; + IAllocatorUniquePtr nonpad_bias_buffer; + const void* attn_bias_data = nullptr; + bool broadcast_bias_dim_0 = false; + bool broadcast_bias_dim_1 = false; + + if (nonpad_kv_seqlen != nullptr) { + // Convert nonpad_kv_seqlen to seqlens_k for custom right padding. + // MEA expects actual token count (not count-1), so use FlashSeqlensK variant. 
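// Assumed shape of the conversion launched just below; the real kernel lives in
// attention_mask_impl.cu. Per batch, it is an int64 -> int32 copy that keeps the
// actual token count (no GroupQueryAttention-style count-1 adjustment):
__global__ void NonpadKvSeqlenToFlashSeqlensKSketch(const int64_t* nonpad_kv_seqlen,
                                                    int* seqlens_k, int batch_size) {
  int b = blockIdx.x * blockDim.x + threadIdx.x;
  if (b < batch_size) {
    seqlens_k[b] = static_cast<int>(nonpad_kv_seqlen[b]);
  }
}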
+ auto seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); + ORT_RETURN_IF_ERROR(LaunchConvertNonpadKvSeqlenToFlashSeqlensK( + nonpad_kv_seqlen->Data(), + seqlens_k_buffer.get(), + parameters.batch_size, + parameters.total_sequence_length, + cuda_stream, + device_prop.maxThreadsPerBlock)); + + onnxruntime::contrib::cuda::MemoryEfficientAttentionParams p; + p.sm = sm; + p.is_half = std::is_same::value; + p.is_bf16 = std::is_same::value; + p.is_kv_bsnh = is_bsnh; + p.batch_size = parameters.batch_size; + p.num_heads = parameters.q_num_heads; + p.sequence_length = parameters.q_sequence_length; + p.kv_sequence_length = parameters.total_sequence_length; + p.max_sequence_length = parameters.total_sequence_length; + p.qk_head_size = parameters.head_size; + p.v_head_size = parameters.v_head_size; + p.causal = parameters.is_causal; + p.scale = parameters.scale; + p.seqlen_k_ptr = seqlens_k_buffer.get(); + p.has_custom_right_padding = true; + p.query = q_data; + p.key = k_data; + p.value = v_data; + p.attn_bias = nullptr; + p.stream = cuda_stream; + p.output = Y->MutableData(); + + IAllocatorUniquePtr workspace_buffer; + if (onnxruntime::contrib::cuda::MemoryEfficientAttentionParams::need_workspace( + parameters.v_head_size, sizeof(T) == sizeof(float))) { + size_t workspace_bytes = sizeof(float) * parameters.batch_size * parameters.q_sequence_length * + parameters.q_num_heads * parameters.v_head_size; + workspace_buffer = GetScratchBuffer(workspace_bytes, context->GetComputeStream()); + p.workspace = workspace_buffer.get(); + } else { + p.workspace = nullptr; + } + onnxruntime::contrib::cuda::run_memory_efficient_attention(p); + } else { + // Standard MEA path (no nonpad) + if (attn_mask != nullptr) { + if (attn_mask->IsDataType()) { + using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; + int64_t num_elements = attn_mask->Shape().Size(); + converted_mask_buffer = GetScratchBuffer( + num_elements * sizeof(NativeCudaT), context->GetComputeStream()); + float mask_filter_value = static_cast(std::numeric_limits::lowest()); + ORT_RETURN_IF_ERROR(LaunchConvertBoolMaskToAttentionBias( + attn_mask->Data(), + reinterpret_cast(converted_mask_buffer.get()), + num_elements, mask_filter_value, cuda_stream, + device_prop.maxThreadsPerBlock)); + attn_bias_data = converted_mask_buffer.get(); + } else { + attn_bias_data = attn_mask->Data(); + } + + // Determine broadcast flags + size_t mask_dims = attn_mask->Shape().NumDimensions(); + auto dims = attn_mask->Shape().GetDims(); + if (mask_dims == 2) { + broadcast_bias_dim_0 = true; + broadcast_bias_dim_1 = true; + } else if (mask_dims == 3) { + broadcast_bias_dim_0 = true; + broadcast_bias_dim_1 = dims[0] == 1; + } else { + broadcast_bias_dim_0 = dims[0] == 1; + broadcast_bias_dim_1 = dims[1] == 1; + } + } + + onnxruntime::contrib::cuda::MemoryEfficientAttentionParams p; + p.sm = sm; + p.is_half = std::is_same::value; + p.is_bf16 = std::is_same::value; + p.is_kv_bsnh = is_bsnh; + p.batch_size = parameters.batch_size; + p.num_heads = parameters.q_num_heads; + p.sequence_length = parameters.q_sequence_length; + p.kv_sequence_length = parameters.total_sequence_length; + p.max_sequence_length = parameters.total_sequence_length; + p.qk_head_size = parameters.head_size; + p.v_head_size = parameters.v_head_size; + p.causal = parameters.is_causal; + p.scale = parameters.scale; + p.broadcast_attn_bias_dim_0 = broadcast_bias_dim_0; + p.broadcast_attn_bias_dim_1 = broadcast_bias_dim_1; + p.query = q_data; + p.key = k_data; + 
p.value = v_data; + p.attn_bias = attn_bias_data; + p.stream = cuda_stream; + p.output = Y->MutableData(); + + if (onnxruntime::contrib::cuda::MemoryEfficientAttentionParams::need_workspace( + parameters.v_head_size, sizeof(T) == sizeof(float))) { + size_t workspace_bytes = sizeof(float) * parameters.batch_size * parameters.q_sequence_length * + parameters.q_num_heads * parameters.v_head_size; + auto workspace_buffer = GetScratchBuffer(workspace_bytes, context->GetComputeStream()); + p.workspace = workspace_buffer.get(); + onnxruntime::contrib::cuda::run_memory_efficient_attention(p); + } else { + p.workspace = nullptr; + onnxruntime::contrib::cuda::run_memory_efficient_attention(p); + } + } + + // Populate present_key/present_value (BNSH) if requested + if (present_key != nullptr && is_bsnh) { + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + reinterpret_cast(K->Data()), + reinterpret_cast(present_key->MutableData()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), present_key->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), present_key->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + } + if (present_value != nullptr && is_bsnh) { + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + reinterpret_cast(V->Data()), + reinterpret_cast(present_value->MutableData()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), present_value->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), present_value->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + } + + return Status::OK(); +#else + ORT_UNUSED_PARAMETER(context); + ORT_UNUSED_PARAMETER(Q); + ORT_UNUSED_PARAMETER(K); + ORT_UNUSED_PARAMETER(V); + ORT_UNUSED_PARAMETER(attn_mask); + ORT_UNUSED_PARAMETER(past_key); + ORT_UNUSED_PARAMETER(past_value); + ORT_UNUSED_PARAMETER(nonpad_kv_seqlen); + ORT_UNUSED_PARAMETER(Y); + ORT_UNUSED_PARAMETER(present_key); + ORT_UNUSED_PARAMETER(present_value); + ORT_UNUSED_PARAMETER(parameters); + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, + "Memory efficient attention is not available in this build."); +#endif +} + +// ============================================================================ +// RunUnfusedAttention: Delegates to MHA's QkvToContext (unfused GEMM+softmax+GEMM) +// ============================================================================ 
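// For reference, the unfused path materializes the full score matrix:
//   S = scale * Q K^T + attention_bias
//   P = softmax(S)          (row-wise over total_sequence_length)
//   Y = P V
// output_qk, when requested, returns an intermediate of this pipeline selected by
// qk_matmul_output_mode. Materializing S is what makes this the feature-complete
// fallback (past KV, output_qk, arbitrary additive bias) at
// O(batch * num_heads * q_seq * total_seq) memory.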
+template +Status Attention::RunUnfusedAttention( + OpKernelContext* context, + const Tensor* Q, const Tensor* K, const Tensor* V, + const Tensor* attn_mask, const Tensor* past_key, const Tensor* past_value, + const Tensor* nonpad_kv_seqlen, + Tensor* Y, Tensor* present_key, Tensor* present_value, + Tensor* output_qk, + const attention_helper::AttentionParameters& parameters) const { + typedef typename ToCudaType::MappedType CudaT; + auto& device_prop = GetDeviceProp(); + auto cuda_stream = static_cast(context->GetComputeStream()->GetHandle()); + + // Bridge to contrib::AttentionParameters for the MHA unfused path + onnxruntime::contrib::AttentionParameters contribop_parameters; + + if (!parameters.transpose_output) { + contribop_parameters.qkv_format = onnxruntime::contrib::AttentionQkvFormat::Q_K_V_BNSH; + contribop_parameters.is_output_bnsh = true; + } else { + contribop_parameters.qkv_format = onnxruntime::contrib::AttentionQkvFormat::Q_K_V_BSNH; + contribop_parameters.is_output_bnsh = false; + } + + contribop_parameters.batch_size = parameters.batch_size; + contribop_parameters.sequence_length = parameters.q_sequence_length; + contribop_parameters.kv_sequence_length = parameters.kv_sequence_length; + contribop_parameters.past_sequence_length = parameters.past_sequence_length; + contribop_parameters.total_sequence_length = parameters.total_sequence_length; + contribop_parameters.max_sequence_length = parameters.total_sequence_length; + contribop_parameters.input_hidden_size = 0; + contribop_parameters.hidden_size = parameters.q_num_heads * parameters.head_size; + contribop_parameters.head_size = parameters.head_size; + contribop_parameters.v_head_size = parameters.v_head_size; + contribop_parameters.v_hidden_size = parameters.kv_num_heads * parameters.v_head_size; + contribop_parameters.num_heads = parameters.q_num_heads; + contribop_parameters.rotary_dim = 0; + contribop_parameters.num_splits = 1; + contribop_parameters.beam_width = 1; + contribop_parameters.is_unidirectional = parameters.is_causal; + contribop_parameters.past_present_share_buffer = false; + contribop_parameters.is_packed_qkv = false; + contribop_parameters.do_rotary = false; + contribop_parameters.mask_type = onnxruntime::contrib::AttentionMaskType::MASK_NONE; + contribop_parameters.mask_filter_value = static_cast(std::numeric_limits::lowest()); + contribop_parameters.scale = parameters.scale; + contribop_parameters.use_tf32 = UseTF32(); + + // Determine broadcast flags for attention_bias + if (attn_mask != nullptr) { + size_t attn_mask_dims_size = attn_mask->Shape().NumDimensions(); + auto attn_mask_dims = attn_mask->Shape().GetDims(); + if (attn_mask_dims_size == 2) { + contribop_parameters.broadcast_attn_bias_dim_0 = true; + contribop_parameters.broadcast_attn_bias_dim_1 = true; + } else if (attn_mask_dims_size == 3) { + contribop_parameters.broadcast_attn_bias_dim_0 = true; + contribop_parameters.broadcast_attn_bias_dim_1 = attn_mask_dims[0] == 1; + } else { + contribop_parameters.broadcast_attn_bias_dim_0 = attn_mask_dims[0] == 1; + contribop_parameters.broadcast_attn_bias_dim_1 = attn_mask_dims[1] == 1; + } + } else { + contribop_parameters.broadcast_attn_bias_dim_0 = false; + contribop_parameters.broadcast_attn_bias_dim_1 = false; + } + + // Construct AttentionData + onnxruntime::contrib::cuda::AttentionData data; + data.query = reinterpret_cast(Q->Data()); + data.key = reinterpret_cast(K->Data()); + data.value = reinterpret_cast(V->Data()); + data.mask_index = nullptr; + data.mask_index_dims = gsl::span(); + 
data.past_key = (past_key == nullptr) ? nullptr : reinterpret_cast(past_key->Data()); + data.past_value = (past_value == nullptr) ? nullptr : reinterpret_cast(past_value->Data()); + data.output = reinterpret_cast(Y->MutableData()); + data.present_key = (present_key == nullptr) ? nullptr : reinterpret_cast(present_key->MutableData()); + data.present_value = (present_value == nullptr) ? nullptr : reinterpret_cast(present_value->MutableData()); + if (output_qk != nullptr) { + data.output_qk = reinterpret_cast(output_qk->MutableData()); + } + data.bias = nullptr; + + // Handle attention mask / nonpad_kv_seqlen → attention_bias + IAllocatorUniquePtr converted_mask_buffer; + if (nonpad_kv_seqlen != nullptr) { + // Convert nonpad_kv_seqlen to additive attention bias: [B, q_seq, total_seq] + using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; + int64_t bias_elements = static_cast(parameters.batch_size) * + parameters.q_sequence_length * + parameters.total_sequence_length; + converted_mask_buffer = GetScratchBuffer(bias_elements * sizeof(NativeCudaT), context->GetComputeStream()); + ORT_RETURN_IF_ERROR(LaunchConvertNonpadKvSeqlenToAttentionBias( + nonpad_kv_seqlen->Data(), + reinterpret_cast(converted_mask_buffer.get()), + parameters.batch_size, + parameters.q_sequence_length, + parameters.total_sequence_length, + contribop_parameters.mask_filter_value, + cuda_stream, + device_prop.maxThreadsPerBlock)); + data.attention_bias = reinterpret_cast(converted_mask_buffer.get()); + // nonpad bias is [B, q_seq, total_seq] → broadcasts over heads but not batch + contribop_parameters.broadcast_attn_bias_dim_0 = false; + contribop_parameters.broadcast_attn_bias_dim_1 = true; + } else if (attn_mask != nullptr) { + if (attn_mask->IsDataType()) { + using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; + int64_t num_elements = attn_mask->Shape().Size(); + converted_mask_buffer = GetScratchBuffer(num_elements * sizeof(NativeCudaT), context->GetComputeStream()); + ORT_RETURN_IF_ERROR(LaunchConvertBoolMaskToAttentionBias( + attn_mask->Data(), + reinterpret_cast(converted_mask_buffer.get()), + num_elements, + contribop_parameters.mask_filter_value, + cuda_stream, + device_prop.maxThreadsPerBlock)); + data.attention_bias = reinterpret_cast(converted_mask_buffer.get()); + } else { + data.attention_bias = reinterpret_cast(attn_mask->Data()); + } + } + + data.qkv_format = contribop_parameters.qkv_format; + data.use_flash_attention = false; + data.use_memory_efficient_attention = false; + data.fused_runner = nullptr; + data.fused_cross_attention_kernel = nullptr; + data.kernel_type = onnxruntime::contrib::AttentionKernelType::AttentionKernel_Unfused; + + // Allocate workspace + const bool no_qkv_workspace = onnxruntime::contrib::cuda::NoQkvWorkspace(contribop_parameters, data); + size_t workspace_bytes = onnxruntime::contrib::cuda::GetAttentionWorkspaceSize( + sizeof(T), + contribop_parameters.batch_size, + contribop_parameters.num_heads, + contribop_parameters.head_size, + contribop_parameters.v_head_size, + contribop_parameters.sequence_length, + contribop_parameters.kv_sequence_length, + contribop_parameters.total_sequence_length, + nullptr, false, false, false, false, false, + no_qkv_workspace); + auto work_space = GetScratchBuffer(workspace_bytes, context->GetComputeStream()); + + data.has_qkv_workspace = !no_qkv_workspace; + data.workspace = reinterpret_cast(work_space.get()); + data.workspace_bytes = workspace_bytes; + + cublasHandle_t cublas = GetCublasHandle(context); + 
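// Assumed semantics of the LaunchConvertNonpadKvSeqlenToAttentionBias call above
// (sketch; the shipped kernel is in attention_mask_impl.cu): fill a
// [batch, q_seq, total_seq] additive bias with 0 for valid keys and
// mask_filter_value for right-padded ones.
template <typename T>
__global__ void NonpadKvSeqlenToAttentionBiasSketch(const int64_t* nonpad_kv_seqlen,
                                                    T* bias, int q_seq_len,
                                                    int total_seq_len,
                                                    float mask_filter_value) {
  int j = blockIdx.x * blockDim.x + threadIdx.x;  // key position
  int i = blockIdx.y;                             // query position
  int b = blockIdx.z;                             // batch
  if (j < total_seq_len) {
    bias[(static_cast<size_t>(b) * q_seq_len + i) * total_seq_len + j] =
        j < nonpad_kv_seqlen[b] ? T(0.0f) : T(mask_filter_value);
  }
}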
cudnnHandle_t cudnn = GetCudnnHandle(context);
+
+  return onnxruntime::contrib::cuda::QkvToContext<CudaT, CudaT>(
+      device_prop, cublas, cudnn, context->GetComputeStream(), contribop_parameters, data);
+}
+
+// ============================================================================
+// ComputeInternal: Dispatch to the appropriate attention kernel
+// ============================================================================
+// Both MHA (q_num_heads == kv_num_heads) and GQA (q_num_heads != kv_num_heads)
+// go through the same direct dispatch cascade:
+//   flash → memory efficient → unfused
+// Flash attention supports grouped KV heads natively; memory efficient attention
+// expands K/V heads via LaunchUngroup; the remaining feature combinations
+// (past KV, output_qk, etc.) fall back to the unfused contrib-ops path.
+// ============================================================================
 template <typename T>
 Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   const Tensor* Q = context->Input<Tensor>(0);
@@ -70,6 +709,9 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   const Tensor* past_value = context->Input<Tensor>(5);
   const Tensor* nonpad_kv_seqlen = context->Input<Tensor>(6);  // optional, Opset 24
 
+  ORT_ENFORCE(nonpad_kv_seqlen == nullptr || attn_mask == nullptr,
+              "nonpad_kv_seqlen and attn_mask cannot both be provided.");
+
   attention_helper::AttentionParameters parameters;
   TensorShape y_shape;
   TensorShape present_key_shape;
@@ -77,25 +719,11 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   TensorShape output_qk_shape;
 
   ORT_ENFORCE(attention_helper::ComputeOutputShapeForAttention(
-                  Q,
-                  K,
-                  V,
-                  attn_mask,
-                  past_key,
-                  past_value,
-                  nonpad_kv_seqlen,
-                  is_causal_,
-                  softcap_,
-                  softmax_precision_,
-                  qk_matmul_output_mode_,
-                  kv_num_heads_,
-                  q_num_heads_,
-                  scale_,
-                  parameters,
-                  y_shape,
-                  present_key_shape,
-                  present_value_shape,
-                  output_qk_shape)
+                  Q, K, V, attn_mask, past_key, past_value, nonpad_kv_seqlen,
+                  is_causal_, softcap_, softmax_precision_,
+                  qk_matmul_output_mode_, kv_num_heads_, q_num_heads_, scale_,
+                  parameters, y_shape, present_key_shape, present_value_shape, output_qk_shape,
+                  true /* skip_nonpad_data_validation: data is on GPU */)
                   .IsOK(),
               "Output shapes for Attention could not be computed.");
 
@@ -104,537 +732,78 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   Tensor* present_value = context->Output(2, present_value_shape);
   Tensor* output_qk = context->Output(3, output_qk_shape);
 
-  // To reuse the existing attention-cuda implementation in contrib ops,
-  // map the parameters to contribop_parameters (MHA).
-  onnxruntime::contrib::AttentionParameters contribop_parameters;
-
-  // QKV format: Determine based on input dimensions
-  // 3D inputs (B, S, D): Q_K_V_BSNH - will be transposed by PrepareQkv to BNSH
-  // transpose_output is true for 3D inputs, false for 4D inputs
-  if (!parameters.transpose_output) {
-    contribop_parameters.qkv_format = onnxruntime::contrib::AttentionQkvFormat::Q_K_V_BNSH;
-    contribop_parameters.is_output_bnsh = true;
-  } else {
-    // 3D inputs in BSNH format (will be transposed)
-    contribop_parameters.qkv_format = onnxruntime::contrib::AttentionQkvFormat::Q_K_V_BSNH;
-    contribop_parameters.is_output_bnsh = false;
-  }
-
-  // Check if this is Group Query Attention (GQA)
   const bool is_gqa = parameters.kv_num_heads != parameters.q_num_heads;
-  if (is_gqa) {
-    // Use GQA path with Flash Attention or Memory Efficient Attention
-    // GQA only supports float16 and bfloat16 types
-    if constexpr (std::is_same<T, float>::value) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                             "GQA in Attention op (CUDA) does not support float32. 
" - "Please use float16 or bfloat16."); - } else { - // GQA only supports 3D inputs (B, S, D) in BSNH format, not 4D inputs (B, num_heads, S, head_size) in BNSH format - if (!parameters.transpose_output) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "4D QKV inputs (BNSH format) are not supported yet in GQA path of Attention op (CUDA). " - "Please use 3D inputs (B, S, hidden_size) instead."); - } - // For now, GQA doesn't support qk_matmul_output_mode other than kNone - if (qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kNone) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "qk_matmul_output_mode is not supported yet in GQA path of Attention op (CUDA)."); - } - // GQA doesn't support softmax_precision yet - if (parameters.softmax_precision != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "softmax_precision is not supported yet in GQA path of Attention op (CUDA)."); - } - // causal attention is required for GQA - if (!parameters.is_causal) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "Non-causal attention is not supported yet in GQA path of Attention op (CUDA)."); - } - // GQA kernel expects K/V input sequence length == Q sequence length (self-attention only) - // Cross-attention (kv_sequence_length != q_sequence_length) is not supported - if (parameters.kv_sequence_length != parameters.q_sequence_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "Cross-attention (kv_sequence_length != q_sequence_length) is not supported in " - "GQA path of Attention op (CUDA). kv_sequence_length=", - parameters.kv_sequence_length, ", q_sequence_length=", parameters.q_sequence_length); - } - - auto& device_prop = GetDeviceProp(); - - // Bridge parameters to GroupQueryAttentionParameters - onnxruntime::contrib::GroupQueryAttentionParameters gqa_parameters; - gqa_parameters.batch_size = parameters.batch_size; - gqa_parameters.sequence_length = parameters.q_sequence_length; - gqa_parameters.seqlen_past_kv_cache = parameters.past_sequence_length; - gqa_parameters.seqlen_present_kv_cache = parameters.total_sequence_length; - gqa_parameters.total_sequence_length = parameters.total_sequence_length; - gqa_parameters.kv_sequence_length = parameters.kv_sequence_length; - gqa_parameters.hidden_size = parameters.q_num_heads * parameters.head_size; - gqa_parameters.num_heads = parameters.q_num_heads; - gqa_parameters.head_size = parameters.head_size; - gqa_parameters.v_head_size = parameters.v_head_size; - gqa_parameters.kv_hidden_size = parameters.kv_num_heads * parameters.v_head_size; - gqa_parameters.kv_num_heads = parameters.kv_num_heads; - gqa_parameters.scale = parameters.scale; - gqa_parameters.softcap = parameters.softcap; - gqa_parameters.qkv_format = contribop_parameters.qkv_format; - - // Unset or set to default values for GQA-specific fields - gqa_parameters.rotary_dim = 0; // New Attention op doesn't use rotary embeddings directly - gqa_parameters.is_unidirectional = true; // GQA requires causal attention - gqa_parameters.is_packed_qkv = false; // New Attention op has separate Q, K, V inputs - gqa_parameters.is_subsequent_prompt = false; - gqa_parameters.is_first_prompt = parameters.past_sequence_length == 0; - gqa_parameters.do_rotary = false; // New Attention op doesn't use rotary embeddings - gqa_parameters.rotary_interleaved = false; - gqa_parameters.use_smooth_softmax = false; - gqa_parameters.mask_type = onnxruntime::contrib::AttentionMaskType::MASK_NONE; - gqa_parameters.past_kv_format = 
onnxruntime::contrib::AttentionQkvFormat::Q_K_V_BNSH; - gqa_parameters.local_window_size = -1; // No local window for standard attention - gqa_parameters.zeros_count = 0; - gqa_parameters.zero_ptr = nullptr; - gqa_parameters.num_splits = 1; - - // Construct GroupQueryAttentionData - typedef typename onnxruntime::cuda::OrtToCudaType::type CudaT; - onnxruntime::contrib::cuda::GroupQueryAttentionData gqa_data; - - // Scratch buffers for flash/memory efficient attention - IAllocatorUniquePtr k_buffer; - IAllocatorUniquePtr v_buffer; - IAllocatorUniquePtr fmha_buffer; - IAllocatorUniquePtr unpacked_qkv_buffer; - IAllocatorUniquePtr seq_lens_buffer; - IAllocatorUniquePtr seqlens_k_buffer; - - // Present KV cache buffers - GQA kernel uses these as working buffers - // If outputs are not provided, we allocate scratch buffers - IAllocatorUniquePtr present_key_scratch; - IAllocatorUniquePtr present_value_scratch; - - // Set input pointers - gqa_data.query = reinterpret_cast(Q->Data()); - gqa_data.key = reinterpret_cast(K->Data()); - gqa_data.value = reinterpret_cast(V->Data()); - gqa_data.past_key = (past_key == nullptr) ? nullptr : reinterpret_cast(past_key->Data()); - gqa_data.past_value = (past_value == nullptr) ? nullptr : reinterpret_cast(past_value->Data()); - - // Set output pointers - gqa_data.output = reinterpret_cast(Y->MutableData()); - - // GQA kernel requires present_key/present_value buffers as working storage for KV cache - // Allocate scratch buffers if outputs are not provided - size_t present_kv_size = static_cast(parameters.batch_size) * - static_cast(parameters.kv_num_heads) * - static_cast(parameters.total_sequence_length) * - static_cast(parameters.head_size) * sizeof(CudaT); - if (present_key != nullptr) { - gqa_data.present_key = reinterpret_cast(present_key->MutableData()); - } else { - present_key_scratch = GetScratchBuffer(present_kv_size, context->GetComputeStream()); - gqa_data.present_key = reinterpret_cast(present_key_scratch.get()); - } - if (present_value != nullptr) { - gqa_data.present_value = reinterpret_cast(present_value->MutableData()); - } else { - present_value_scratch = GetScratchBuffer(present_kv_size, context->GetComputeStream()); - gqa_data.present_value = reinterpret_cast(present_value_scratch.get()); - } - - // Compute past_present_share_buffer early since it's needed for flash attention path selection - gqa_parameters.past_present_share_buffer = (gqa_data.past_key == gqa_data.present_key); + // === KERNEL SELECTION CASCADE === + // Priority: flash attention > memory efficient attention > unfused attention + const bool has_output_qk = (qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kNone); - // Flash Attention buffers - IAllocatorUniquePtr softmax_lse_buffer; - IAllocatorUniquePtr softmax_lse_accum_buffer; - IAllocatorUniquePtr out_accum_buffer; - - // Check Flash Attention support #if USE_FLASH_ATTENTION - bool use_flash_attention = onnxruntime::flash::is_supported(device_prop, - gqa_parameters.head_size, - gqa_parameters.num_heads, - gqa_parameters.kv_num_heads); - - gqa_data.use_flash_attention = use_flash_attention; - gqa_data.use_flash_attention_fast_decode = use_flash_attention && - !gqa_parameters.is_first_prompt && - gqa_parameters.past_present_share_buffer; - - if (use_flash_attention) { - // Allocate Flash specific buffers (Softmax LSE, Accum) - size_t softmax_lse_bytes = onnxruntime::flash::get_softmax_lse_size( - gqa_parameters.sequence_length, gqa_parameters.batch_size, gqa_parameters.num_heads); - - int num_heads_for_split 
= gqa_data.use_flash_attention_fast_decode - ? gqa_parameters.kv_num_heads - : gqa_parameters.num_heads; - auto [num_splits, softmax_lse_accum_bytes, out_accum_bytes] = - onnxruntime::flash::get_num_splits_and_buffer_sizes( - gqa_parameters.batch_size, gqa_parameters.sequence_length, - gqa_parameters.total_sequence_length, num_heads_for_split, - gqa_parameters.head_size, device_prop.multiProcessorCount); - - gqa_parameters.num_splits = static_cast(num_splits); - - if (gqa_data.use_flash_attention_fast_decode && num_splits > 1) { - // The heuristic used kv_num_heads to maximize occupancy for the GQA-aware kernel. - // However, the LSE and Accum buffers must store results for ALL num_heads. - softmax_lse_accum_bytes = onnxruntime::flash::get_softmax_lse_accum_size( - num_splits, gqa_parameters.batch_size, gqa_parameters.num_heads, gqa_parameters.sequence_length); - auto round_multiple = [](size_t x, size_t m) { return (x + m - 1) / m * m; }; - out_accum_bytes = onnxruntime::flash::get_out_accum_size( - num_splits, gqa_parameters.batch_size, gqa_parameters.num_heads, gqa_parameters.sequence_length, - round_multiple(gqa_parameters.head_size, 32)); - } - - softmax_lse_buffer = GetScratchBuffer(softmax_lse_bytes, context->GetComputeStream()); - softmax_lse_accum_buffer = GetScratchBuffer(softmax_lse_accum_bytes, context->GetComputeStream()); - out_accum_buffer = GetScratchBuffer(out_accum_bytes, context->GetComputeStream()); - - gqa_data.softmax_lse = reinterpret_cast(softmax_lse_buffer.get()); - gqa_data.softmax_lse_accum = reinterpret_cast(softmax_lse_accum_buffer.get()); - gqa_data.out_accum = reinterpret_cast(out_accum_buffer.get()); - } else { - gqa_data.softmax_lse = nullptr; - gqa_data.softmax_lse_accum = nullptr; - gqa_data.out_accum = nullptr; - } -#else - gqa_data.use_flash_attention = false; - gqa_data.use_flash_attention_fast_decode = false; - gqa_data.softmax_lse = nullptr; - gqa_data.softmax_lse_accum = nullptr; - gqa_data.out_accum = nullptr; + { + auto& device_prop = GetDeviceProp(); + bool flash_eligible = + !std::is_same::value && + onnxruntime::flash::is_supported(device_prop, parameters.head_size, + parameters.q_num_heads, parameters.kv_num_heads) && + parameters.head_size == parameters.v_head_size && + !has_output_qk && + parameters.softcap == 0.0f && + parameters.softmax_precision == 0 && + past_key == nullptr && // Flash with past requires buffer management; use unfused + attn_mask == nullptr; // Flash prompt path does not support attention mask + + if (flash_eligible) { + return RunFlashAttention(context, Q, K, V, attn_mask, past_key, past_value, + nonpad_kv_seqlen, Y, present_key, present_value, parameters); + } + } #endif - // Check Memory Efficient Attention support (fallback if flash attention not available) #if USE_MEMORY_EFFICIENT_ATTENTION - if (!gqa_data.use_flash_attention) { - int sm = (device_prop.major * 10) + device_prop.minor; - bool use_memory_efficient_attention = - onnxruntime::contrib::cuda::has_memory_efficient_attention( - sm, std::is_same::value, std::is_same::value, - gqa_parameters.head_size, gqa_parameters.head_size); - gqa_data.use_memory_efficient_attention = use_memory_efficient_attention; - - // KV buffer for head expansion (when num_heads != kv_num_heads) - size_t kv_buffer_bytes = (use_memory_efficient_attention && - (gqa_parameters.num_heads != gqa_parameters.kv_num_heads)) - ? 
(sizeof(T) * gqa_parameters.batch_size * gqa_parameters.num_heads * - gqa_parameters.seqlen_present_kv_cache * gqa_parameters.head_size) - : 0; - // FMHA workspace - size_t fmha_buffer_bytes = - (use_memory_efficient_attention && - onnxruntime::contrib::cuda::MemoryEfficientAttentionParams::need_workspace( - gqa_parameters.head_size, sizeof(T) == sizeof(float))) - ? (sizeof(float) * gqa_parameters.batch_size * gqa_parameters.sequence_length * - gqa_parameters.num_heads * gqa_parameters.head_size) - : 0; - - k_buffer = GetScratchBuffer(kv_buffer_bytes, context->GetComputeStream()); - v_buffer = GetScratchBuffer(kv_buffer_bytes, context->GetComputeStream()); - fmha_buffer = GetScratchBuffer(fmha_buffer_bytes, context->GetComputeStream()); - - gqa_data.k = reinterpret_cast(k_buffer.get()); - gqa_data.v = reinterpret_cast(v_buffer.get()); - gqa_data.fmha_buffer = reinterpret_cast(fmha_buffer.get()); - } else { - gqa_data.use_memory_efficient_attention = false; - gqa_data.k = nullptr; - gqa_data.v = nullptr; - gqa_data.fmha_buffer = nullptr; - } -#else - gqa_data.use_memory_efficient_attention = false; - gqa_data.k = nullptr; - gqa_data.v = nullptr; - gqa_data.fmha_buffer = nullptr; -#endif - - // Centralized scratch buffer allocation using GQABufferRequirements - auto buffer_req = onnxruntime::contrib::cuda::GQABufferRequirements::Compute( - gqa_parameters, - false, // use_xqa - gqa_data.use_flash_attention, - gqa_data.use_flash_attention_fast_decode, - gqa_data.use_memory_efficient_attention); - - if (buffer_req.qkv_buffer_bytes > 0) { - unpacked_qkv_buffer = GetScratchBuffer(buffer_req.qkv_buffer_bytes, context->GetComputeStream()); - gqa_data.qkv_buffer = reinterpret_cast(unpacked_qkv_buffer.get()); - } else { - gqa_data.qkv_buffer = nullptr; - } - - // Allocate GPU buffer for seqlens_k (total_sequence_length - 1) for GQA compatibility - // The GQA kernel expects sequence length information for flash/memory efficient attention - seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); - auto cuda_stream = static_cast(context->GetComputeStream()->GetHandle()); - - // GQA only supports masking, not additive bias. - // For bool mask, we need to convert it to sequence lengths on GPU. - // Note: The GQA path interprets 2D bool masks as (batch_size, total_seq_len) since it converts - // masks to seqlens_k directly (bypassing ONNX right-aligned broadcasting). This differs from - // the MHA path below, where 2D masks follow ONNX broadcasting: [A, B] → [1, 1, A, B], so - // 2D = (q_seq_len, total_seq_len) with both batch and heads broadcast. 
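// Worked example of the divergence described above, for a 2D bool mask with
// batch_size=2, total_sequence_length=4 (illustrative values):
//   mask = [[1, 1, 1, 0],
//           [1, 1, 0, 0]]
//   GQA path: rows are per-batch right padding, giving per-batch valid lengths
//             {3, 2} (stored in the GroupQueryAttention seqlens_k convention).
//   MHA path: ONNX right-aligned broadcasting reads the same mask as
//             (q_seq_len=2, total_seq_len=4), reused for every batch and head,
//             so it cannot express per-batch padding.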
- if (attn_mask != nullptr && attn_mask->IsDataType()) { - // Allocate validation result buffer on GPU - // Get mask dimensions for broadcasting - // attn_mask can be 2D, 3D, or 4D and broadcasts to (batch_size, num_heads, q_seq_len, total_seq_len) - const auto& mask_shape = attn_mask->Shape(); - int mask_dims = static_cast(mask_shape.NumDimensions()); - int64_t mask_dim0 = 0, mask_dim1 = 0, mask_dim2 = 0; - - if (mask_dims == 2) { - // Shape: (batch_size or 1, total_seq_len) - mask_dim0 = mask_shape[0]; - mask_dim1 = 0; - mask_dim2 = 0; - } else if (mask_dims == 3) { - // Shape: (num_heads or 1, q_seq_len, total_seq_len) - mask_dim0 = mask_shape[0]; - mask_dim1 = mask_shape[1]; - mask_dim2 = 0; - } else if (mask_dims == 4) { - // Shape: (batch_size or 1, num_heads or 1, q_seq_len, total_seq_len) - mask_dim0 = mask_shape[0]; - mask_dim1 = mask_shape[1]; - mask_dim2 = mask_shape[2]; - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Boolean attn_mask must be 2D, 3D, or 4D. Got ", mask_dims, "D."); - } - - // Launch CUDA kernel to convert mask to seqlens_k and validate - // Mask validity (right-padding, contiguous) is checked asynchronously via CUDA_KERNEL_ASSERT. - ORT_RETURN_IF_ERROR(LaunchConvertMaskToSeqlensK( - attn_mask->Data(), - seqlens_k_buffer.get(), - parameters.batch_size, - parameters.total_sequence_length, - mask_dims, - mask_dim0, - mask_dim1, - mask_dim2, - cuda_stream, - device_prop.maxThreadsPerBlock)); - } else if (attn_mask != nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "Non-boolean attn_mask is not supported yet in GQA path of Attention op (CUDA)."); - } else { - // No mask provided - use full sequence length for all batches - // seqlens_k is total_sequence_length - 1 for historical reasons (matching GroupQueryAttention convention) - // Fill on GPU using cudaMemset-like approach or a simple kernel - std::vector seqlens_k_host(parameters.batch_size, parameters.total_sequence_length - 1); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(seqlens_k_buffer.get(), seqlens_k_host.data(), - sizeof(int) * parameters.batch_size, - cudaMemcpyHostToDevice, cuda_stream)); - } - - // Process seqlens_k to compute past_seq_lens, total_seq_lens, and padded_seq_lens - // This is always needed for flash/memory efficient attention - seq_lens_buffer = GetScratchBuffer(3 * parameters.batch_size, context->GetComputeStream()); - gqa_data.past_seq_lens = seq_lens_buffer.get(); - gqa_data.total_seq_lens = seq_lens_buffer.get() + parameters.batch_size; - gqa_data.padded_seq_lens = gqa_data.total_seq_lens + parameters.batch_size; - - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::LaunchGetSequenceLengths( - seqlens_k_buffer.get(), - gqa_data.past_seq_lens, - gqa_data.total_seq_lens, - gqa_data.padded_seq_lens, - parameters.batch_size, - parameters.q_sequence_length, - gqa_parameters.is_first_prompt, - cuda_stream, - device_prop.maxThreadsPerBlock)); - - // Set GQA-specific fields - gqa_data.cos_cache = nullptr; // No rotary embeddings - gqa_data.sin_cache = nullptr; - gqa_data.head_sink = nullptr; - gqa_data.position_ids = nullptr; - - // Call GQA kernel (with flash or memory efficient attention) - cublasHandle_t cublas = GetCublasHandle(context); - - return onnxruntime::contrib::cuda::QkvToContext( - device_prop, cublas, context->GetComputeStream(), gqa_parameters, gqa_data); - } // else (non-float GQA path) - } else { // MHA path (kv_num_heads == q_num_heads) - typedef typename ToCudaType::MappedType CudaT; - contribop_parameters.batch_size = 
parameters.batch_size; - contribop_parameters.sequence_length = parameters.q_sequence_length; - contribop_parameters.kv_sequence_length = parameters.kv_sequence_length; - contribop_parameters.past_sequence_length = parameters.past_sequence_length; - contribop_parameters.total_sequence_length = parameters.total_sequence_length; - // max_sequence_length: For non-buffer-sharing case, this equals total_sequence_length (the present KV cache size) - contribop_parameters.max_sequence_length = parameters.total_sequence_length; - contribop_parameters.input_hidden_size = 0; // Not applicable - new Attention op takes pre-projected Q/K/V - contribop_parameters.hidden_size = parameters.q_num_heads * parameters.head_size; - contribop_parameters.head_size = parameters.head_size; - contribop_parameters.v_head_size = parameters.v_head_size; - contribop_parameters.v_hidden_size = parameters.kv_num_heads * parameters.v_head_size; - contribop_parameters.num_heads = parameters.q_num_heads; - contribop_parameters.rotary_dim = 0; - contribop_parameters.num_splits = 1; - contribop_parameters.beam_width = 1; - contribop_parameters.is_unidirectional = parameters.is_causal; - contribop_parameters.past_present_share_buffer = false; // New Attention op doesn't share buffer - contribop_parameters.is_packed_qkv = false; - contribop_parameters.do_rotary = false; - - // The new Attention op uses attn_mask as attention_bias (additive bias), not as key_padding_mask - // So mask_type should always be MASK_NONE since we don't have a separate padding mask input - contribop_parameters.mask_type = onnxruntime::contrib::AttentionMaskType::MASK_NONE; - - // Determine broadcast flags for attention_bias (if it exists) - // The MHA path uses attn_mask as attention_bias (additive bias added before softmax). - // Bool masks are element-wise converted to additive bias (true → 0.0, false → -inf), - // preserving the original shape, so the same broadcasting rules apply to both types. - // - // ONNX broadcasting is right-aligned to target shape (batch, heads, q_seq, total_seq): - // 2D [A, B] → [1, 1, A, B] : A = q_seq_len, B = total_seq_len - // 3D [A, B, C] → [1, A, B, C] : A = heads, B = q_seq_len, C = total_seq_len - // 4D [A, B, C, D] → [A, B, C, D] : A = batch, B = heads, C = q_seq_len, D = total_seq_len - // - // Note: A 2D mask cannot represent per-batch padding because the batch dimension is broadcast. - // For per-batch boolean padding masks, use 4D shape (batch, 1, 1, total_seq_len). 
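// Concrete instances of the right-alignment rule above, for batch=2, heads=8
// (illustrative sizes):
//   [q, t]        -> [1, 1, q, t] : broadcast_attn_bias_dim_0 = true,  dim_1 = true
//   [1, q, t]     -> [1, 1, q, t] : broadcast_attn_bias_dim_0 = true,  dim_1 = true
//   [8, q, t]     -> [1, 8, q, t] : broadcast_attn_bias_dim_0 = true,  dim_1 = false
//   [2, 1, 1, t]  -> unchanged    : broadcast_attn_bias_dim_0 = false, dim_1 = true
//                                   (the per-batch padding form recommended above)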
-    if (attn_mask != nullptr) {
-      size_t attn_mask_dims_size = attn_mask->Shape().NumDimensions();
-      auto attn_mask_dims = attn_mask->Shape().GetDims();
-      // For 2D mask (q_seq_len, total_seq_len): both batch and heads dimensions need broadcasting
-      // For 3D mask (heads_or_1, q_seq_len, total_seq_len): batch always broadcasts, heads broadcasts if dim[0]==1
-      // For 4D mask (B, H, q_seq_len, total_seq_len): check if B==1 and H==1
-
-      if (attn_mask_dims_size == 2) {
-        // 2D mask: both dimensions need broadcasting
-        contribop_parameters.broadcast_attn_bias_dim_0 = true;
-        contribop_parameters.broadcast_attn_bias_dim_1 = true;
-      } else if (attn_mask_dims_size == 3) {
-        // 3D mask [A, q_seq_len, total_seq_len]: right-aligned to [_, A, q_seq, total_seq]
-        // A maps to heads dimension (validated to be 1 or q_num_heads by attention_helper.h)
-        // Batch dimension is missing, so always broadcasts
-        contribop_parameters.broadcast_attn_bias_dim_0 = true;
-        contribop_parameters.broadcast_attn_bias_dim_1 = attn_mask_dims[0] == 1;
-      } else {
-        // 4D mask: check both dim 0 and dim 1 explicitly
-        contribop_parameters.broadcast_attn_bias_dim_0 = attn_mask_dims[0] == 1;
-        contribop_parameters.broadcast_attn_bias_dim_1 = attn_mask_dims[1] == 1;
-      }
-    } else {
-      contribop_parameters.broadcast_attn_bias_dim_0 = false;
-      contribop_parameters.broadcast_attn_bias_dim_1 = false;
-    }
-
-    contribop_parameters.mask_filter_value = static_cast<float>(std::numeric_limits<float>::lowest());
-    contribop_parameters.scale = parameters.scale;
-    contribop_parameters.use_tf32 = UseTF32();
-    // TODO(titaiwang, xadupre): qk_matmul_output_mode only supports kNone and kQK for now
-    if (qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kNone &&
-        qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kQK) {
-      ORT_THROW("qk_matmul_output_mode other than -1 (None) and 0 (QK) is not supported yet in Attention op (CUDA).");
-    }
-    // TODO(titaiwang, xadupre): softcap and softmax_precision are not used yet
-    if (parameters.softcap != 0.0f) {
-      ORT_THROW("softcap is not supported yet in Attention op (CUDA).");
-    }
-    if (parameters.softmax_precision != 0) {
-      ORT_THROW("softmax_precision is not supported yet in Attention op (CUDA).");
-    }
-
-    // Construct AttentionData to pass to QkvToContext
-    onnxruntime::contrib::cuda::AttentionData<CudaT> data;
-
-    // Set input pointers
-    data.query = reinterpret_cast<const CudaT*>(Q->Data<T>());
-    data.key = reinterpret_cast<const CudaT*>(K->Data<T>());
-    data.value = reinterpret_cast<const CudaT*>(V->Data<T>());
-    data.mask_index = nullptr;  // New Attention op doesn't have key_padding_mask
-    data.mask_index_dims = gsl::span<const int64_t>();
-    data.past_key = (past_key == nullptr) ? nullptr : reinterpret_cast<const CudaT*>(past_key->Data<T>());
-    data.past_value = (past_value == nullptr) ? nullptr : reinterpret_cast<const CudaT*>(past_value->Data<T>());
-
-    // Set output pointers
-    data.output = reinterpret_cast<CudaT*>(Y->MutableData<T>());
-    data.present_key = (present_key == nullptr) ? nullptr : reinterpret_cast<CudaT*>(present_key->MutableData<T>());
-    data.present_value = (present_value == nullptr) ? nullptr : reinterpret_cast<CudaT*>(present_value->MutableData<T>());
-    if (nullptr != output_qk) {
-      data.output_qk = reinterpret_cast<CudaT*>(output_qk->MutableData<T>());
-    }
-
-    // Set additional fields
-    data.bias = nullptr;  // New Attention op doesn't have bias
-    IAllocatorUniquePtr<void> converted_mask_buffer;
-    if (nullptr != attn_mask) {
-      if (attn_mask->IsDataType<bool>()) {
-        // Convert boolean mask to additive attention bias: true -> 0.0, false -> mask_filter_value.
-        // The conversion is element-wise and preserves the original shape, so the broadcast flags
-        // set above apply identically to the converted float buffer.
-        using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType<T>::type;
-        int64_t num_elements = attn_mask->Shape().Size();
-        converted_mask_buffer = GetScratchBuffer<void>(num_elements * sizeof(NativeCudaT), context->GetComputeStream());
-        auto cuda_stream = static_cast<cudaStream_t>(context->GetComputeStream()->GetHandle());
-        ORT_RETURN_IF_ERROR(LaunchConvertBoolMaskToAttentionBias(
-            attn_mask->Data<bool>(),
-            reinterpret_cast<NativeCudaT*>(converted_mask_buffer.get()),
-            num_elements,
-            contribop_parameters.mask_filter_value,
-            cuda_stream,
-            GetDeviceProp().maxThreadsPerBlock));
-        data.attention_bias = reinterpret_cast<const CudaT*>(converted_mask_buffer.get());
-      } else {
-        data.attention_bias = reinterpret_cast<const CudaT*>(attn_mask->Data<T>());
-      }
-    }
-    data.qkv_format = contribop_parameters.qkv_format;
-
-    // For now, set flags to false and let QkvToContext use the unfused path
-    data.use_flash_attention = false;
-    data.use_memory_efficient_attention = false;
-    data.fused_runner = nullptr;
-    data.fused_cross_attention_kernel = nullptr;
-    data.kernel_type = onnxruntime::contrib::AttentionKernelType::AttentionKernel_Unfused;
-
-    // Allocate workspace for Q, K, V processing and scratch buffer
-    const bool no_qkv_workspace = onnxruntime::contrib::cuda::NoQkvWorkspace(contribop_parameters, data);
-    size_t workspace_bytes = onnxruntime::contrib::cuda::GetAttentionWorkspaceSize(
-        sizeof(T),
-        contribop_parameters.batch_size,
-        contribop_parameters.num_heads,
-        contribop_parameters.head_size,
-        contribop_parameters.v_head_size,
-        contribop_parameters.sequence_length,
-        contribop_parameters.kv_sequence_length,
-        contribop_parameters.total_sequence_length,
-        nullptr,  // fused_runner
-        false,    // use_flash_attention
-        false,    // use_lean_attention
-        false,    // use_fused_cross_attention
-        false,    // use_memory_efficient_attention
-        false,    // use_cudnn_flash_attention
-        no_qkv_workspace);
-    auto work_space = GetScratchBuffer<void>(workspace_bytes, context->GetComputeStream());
-
-    data.has_qkv_workspace = !no_qkv_workspace;
-    data.workspace = reinterpret_cast<CudaT*>(work_space.get());
-    data.workspace_bytes = workspace_bytes;
-
-    // Call QkvToContext to perform the attention computation
+  {
     auto& device_prop = GetDeviceProp();
-    cublasHandle_t cublas = GetCublasHandle(context);
-    cudnnHandle_t cudnn = GetCudnnHandle(context);
+    int sm = device_prop.major * 10 + device_prop.minor;
+    bool mea_eligible =
+        onnxruntime::contrib::cuda::has_memory_efficient_attention(
+            sm, std::is_same<T, MLFloat16>::value, std::is_same<T, BFloat16>::value,
+            parameters.head_size, parameters.v_head_size) &&
+        !has_output_qk &&
+        parameters.softcap == 0.0f &&
+        parameters.softmax_precision == 0 &&
+        past_key == nullptr;
+
+    if (mea_eligible) {
+      return RunMemoryEfficientAttention(context, Q, K, V, attn_mask, past_key, past_value,
+                                         nonpad_kv_seqlen, Y, present_key, present_value, parameters);
+    }
+  }
+#endif
-
-    // QkvToContext takes two template parameters: T for computation type, QK for output_qk type
-    // For now, both are the same type (CudaT)
+  // Fallback: unfused attention
+  if (parameters.softcap != 0.0f) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
+                           "softcap is not supported yet in Attention op (CUDA).");
+  }
+  if (parameters.softmax_precision != 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
+                           "softmax_precision is not supported yet in Attention op (CUDA).");
+  }
+  if (qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kNone &&
+      qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kQK) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
+                           "qk_matmul_output_mode other than kNone and kQK is not supported yet "
+                           "in Attention op (CUDA).");
+  }

-    return onnxruntime::contrib::cuda::QkvToContext<CudaT, CudaT>(
-        device_prop, cublas, cudnn, context->GetComputeStream(), contribop_parameters, data);
+  if (is_gqa) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
+                           "GQA (q_num_heads != kv_num_heads) requires flash or memory efficient attention, "
+                           "but neither is eligible. Ensure fp16/bf16 on Ampere+ GPU, or check head_size constraints.");
   }
+
+  return RunUnfusedAttention(context, Q, K, V, attn_mask, past_key, past_value,
+                             nonpad_kv_seqlen, Y, present_key, present_value, output_qk, parameters);
 }
+
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/llm/attention.h b/onnxruntime/core/providers/cuda/llm/attention.h
index 951b3b2e2f3c1..690ae5c22bd18 100644
--- a/onnxruntime/core/providers/cuda/llm/attention.h
+++ b/onnxruntime/core/providers/cuda/llm/attention.h
@@ -14,6 +14,32 @@ class Attention final : public CudaKernel {
   Attention(const OpKernelInfo& info);
   Status ComputeInternal(OpKernelContext* context) const override;

+ private:
+  Status RunFlashAttention(
+      OpKernelContext* context,
+      const Tensor* Q, const Tensor* K, const Tensor* V,
+      const Tensor* attn_mask, const Tensor* past_key, const Tensor* past_value,
+      const Tensor* nonpad_kv_seqlen,
+      Tensor* Y, Tensor* present_key, Tensor* present_value,
+      const attention_helper::AttentionParameters& parameters) const;
+
+  Status RunMemoryEfficientAttention(
+      OpKernelContext* context,
+      const Tensor* Q, const Tensor* K, const Tensor* V,
+      const Tensor* attn_mask, const Tensor* past_key, const Tensor* past_value,
+      const Tensor* nonpad_kv_seqlen,
+      Tensor* Y, Tensor* present_key, Tensor* present_value,
+      const attention_helper::AttentionParameters& parameters) const;
+
+  Status RunUnfusedAttention(
+      OpKernelContext* context,
+      const Tensor* Q, const Tensor* K, const Tensor* V,
+      const Tensor* attn_mask, const Tensor* past_key, const Tensor* past_value,
+      const Tensor* nonpad_kv_seqlen,
+      Tensor* Y, Tensor* present_key, Tensor* present_value,
+      Tensor* output_qk,
+      const attention_helper::AttentionParameters& parameters) const;
+
 protected:
   bool is_causal_;
   int kv_num_heads_;
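[Editorial note: the three private entry points declared above replace the old MHA/GQA routing with a linear flash -> MEA -> unfused cascade. The host-only C++ sketch below restates the dispatch order for readers skimming the diff; the predicates and the Dispatch function are illustrative stand-ins, not the real checks in ComputeInternal.]

// Illustrative sketch of the thin-dispatcher cascade (assumed simplification;
// the real eligibility checks live in attention.cc and depend on dtype, SM
// version, head sizes, output_qk, softcap, and softmax_precision).
#include <cstdio>

enum class Kernel { Flash, MEA, Unfused, Unsupported };

Kernel Dispatch(bool flash_eligible, bool mea_eligible, bool is_gqa) {
  if (flash_eligible) return Kernel::Flash;  // e.g. fp16/bf16 on SM80+
  if (mea_eligible) return Kernel::MEA;      // e.g. no output_qk/softcap/softmax_precision
  if (is_gqa) return Kernel::Unsupported;    // unfused fallback cannot expand KV heads
  return Kernel::Unfused;                    // MHA fallback (supports output_qk)
}

int main() {
  // A GQA model on an eligible GPU lands on the flash kernel:
  std::printf("%d\n", static_cast<int>(Dispatch(true, true, true)));    // 0 (Flash)
  // The same model with both fused kernels ineligible is rejected,
  // matching the NOT_IMPLEMENTED status above:
  std::printf("%d\n", static_cast<int>(Dispatch(false, false, true)));  // 3 (Unsupported)
  return 0;
}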
diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
index 08f93d48ebcaa..003856f4ca7ff 100644
--- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
+++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
@@ -173,5 +173,145 @@ template Status LaunchConvertBoolMaskToAttentionBias<__half>(
     const bool*, __half*, int64_t, float, cudaStream_t, int);
 template Status LaunchConvertBoolMaskToAttentionBias<__nv_bfloat16>(
     const bool*, __nv_bfloat16*, int64_t, float, cudaStream_t, int);

+// CUDA kernel to convert nonpad_kv_seqlen (int64) to seqlens_k (int32) for GQA.
+// GQA convention: seqlens_k = nonpad_kv_seqlen - 1 (last valid index, not count).
+//
+// Validation (via CUDA_KERNEL_ASSERT, reported asynchronously):
+//   - val must be > 0 (nonpad_kv_seqlen=0 → seqlens_k=0 → attends to garbage at pos 0)
+//   - val must be <= total_sequence_length (out of bounds)
+__global__ void ConvertNonpadKvSeqlenToSeqlensKKernel(
+    const int64_t* __restrict__ nonpad_kv_seqlen,
+    int* __restrict__ seqlens_k,
+    const int batch_size,
+    const int total_sequence_length,
+    const int min_expected_seqlen) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < batch_size) {
+    int64_t val = nonpad_kv_seqlen[idx];
+    CUDA_KERNEL_ASSERT(val > 0);
+    CUDA_KERNEL_ASSERT(val <= static_cast<int64_t>(total_sequence_length));
+    if (min_expected_seqlen > 0) {
+      CUDA_KERNEL_ASSERT(val >= static_cast<int64_t>(min_expected_seqlen));
+    }
+    val = max(static_cast<int64_t>(1), min(val, static_cast<int64_t>(total_sequence_length)));
+    seqlens_k[idx] = static_cast<int>(val) - 1;
+  }
+}
+
+Status LaunchConvertNonpadKvSeqlenToSeqlensK(
+    const int64_t* nonpad_kv_seqlen,
+    int* seqlens_k,
+    int batch_size,
+    int total_sequence_length,
+    cudaStream_t stream,
+    int max_threads_per_block,
+    int min_expected_seqlen) {
+  if (batch_size == 0) {
+    return Status::OK();
+  }
+
+  int threads = std::min(batch_size, max_threads_per_block);
+  int blocks = (batch_size + threads - 1) / threads;
+
+  ConvertNonpadKvSeqlenToSeqlensKKernel<<<blocks, threads, 0, stream>>>(
+      nonpad_kv_seqlen, seqlens_k, batch_size, total_sequence_length, min_expected_seqlen);
+
+  return CUDA_CALL(cudaGetLastError());
+}
+
+// Like ConvertNonpadKvSeqlenToSeqlensKKernel but produces the actual count (no -1 offset).
+// Flash attention's mha_fwd_kvcache expects seqlens_k_ = number of valid tokens.
+__global__ void ConvertNonpadKvSeqlenToFlashSeqlensKKernel(
+    const int64_t* __restrict__ nonpad_kv_seqlen,
+    int* __restrict__ seqlens_k,
+    const int batch_size,
+    const int total_sequence_length) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < batch_size) {
+    int64_t val = nonpad_kv_seqlen[idx];
+    CUDA_KERNEL_ASSERT(val > 0);
+    CUDA_KERNEL_ASSERT(val <= static_cast<int64_t>(total_sequence_length));
+    val = max(static_cast<int64_t>(0), min(val, static_cast<int64_t>(total_sequence_length)));
+    seqlens_k[idx] = static_cast<int>(val);  // count, not index
+  }
+}
+
+Status LaunchConvertNonpadKvSeqlenToFlashSeqlensK(
+    const int64_t* nonpad_kv_seqlen,
+    int* seqlens_k,
+    int batch_size,
+    int total_sequence_length,
+    cudaStream_t stream,
+    int max_threads_per_block) {
+  if (batch_size == 0) {
+    return Status::OK();
+  }
+
+  int threads = std::min(batch_size, max_threads_per_block);
+  int blocks = (batch_size + threads - 1) / threads;
+
+  ConvertNonpadKvSeqlenToFlashSeqlensKKernel<<<blocks, threads, 0, stream>>>(
+      nonpad_kv_seqlen, seqlens_k, batch_size, total_sequence_length);
+
+  return CUDA_CALL(cudaGetLastError());
+}
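[Editorial note: the two launchers above differ only in an off-by-one convention, which is easy to get backwards when reading the diff. A host-only C++ sketch of the same arithmetic, purely illustrative (the real conversions run on the GPU):]

// Host-side illustration of the two seqlens_k conventions.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t nonpad_kv_seqlen[2] = {4, 7};  // valid KV tokens per batch
  const int total_sequence_length = 8;

  for (int b = 0; b < 2; ++b) {
    int64_t val = nonpad_kv_seqlen[b];
    assert(val > 0 && val <= total_sequence_length);
    int gqa_seqlens_k = static_cast<int>(val) - 1;  // GQA: index of last valid token
    int flash_seqlens_k = static_cast<int>(val);    // flash mha_fwd_kvcache: token count
    assert(flash_seqlens_k == gqa_seqlens_k + 1);   // the only difference
  }
  return 0;
}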
+
+// CUDA kernel to convert nonpad_kv_seqlen to an additive attention bias.
+// Generates (batch_size, q_seq_len, total_seq_len) output where:
+//   position t <  nonpad_kv_seqlen[b] → 0.0 (attend)
+//   position t >= nonpad_kv_seqlen[b] → mask_filter_value (mask out)
+template <typename T>
+__global__ void ConvertNonpadKvSeqlenToAttentionBiasKernel(
+    const int64_t* __restrict__ nonpad_kv_seqlen,
+    T* __restrict__ attention_bias,
+    const int batch_size,
+    const int q_seq_len,
+    const int total_seq_len,
+    const float mask_filter_value) {
+  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t total = static_cast<int64_t>(batch_size) * q_seq_len * total_seq_len;
+  for (; idx < total; idx += static_cast<int64_t>(gridDim.x) * blockDim.x) {
+    int b = static_cast<int>(idx / (static_cast<int64_t>(q_seq_len) * total_seq_len));
+    int t = static_cast<int>(idx % total_seq_len);
+    int64_t valid_len = nonpad_kv_seqlen[b];
+    CUDA_KERNEL_ASSERT(valid_len > 0 && valid_len <= static_cast<int64_t>(total_seq_len));
+    valid_len = max(static_cast<int64_t>(0), min(valid_len, static_cast<int64_t>(total_seq_len)));
+    attention_bias[idx] = (t < static_cast<int>(valid_len)) ? T(0.0f) : T(mask_filter_value);
+  }
+}
+
+template <typename T>
+Status LaunchConvertNonpadKvSeqlenToAttentionBias(
+    const int64_t* nonpad_kv_seqlen,
+    T* attention_bias,
+    int batch_size,
+    int q_seq_len,
+    int total_seq_len,
+    float mask_filter_value,
+    cudaStream_t stream,
+    int max_threads_per_block) {
+  int64_t total = static_cast<int64_t>(batch_size) * q_seq_len * total_seq_len;
+  if (total == 0) {
+    return Status::OK();
+  }
+
+  int threads = static_cast<int>(std::min(static_cast<int64_t>(max_threads_per_block), total));
+  int64_t blocks = (total + threads - 1) / threads;
+  constexpr int64_t kMaxGridDimX = 65535;
+  unsigned int grid_size = static_cast<unsigned int>(std::min(blocks, kMaxGridDimX));
+
+  ConvertNonpadKvSeqlenToAttentionBiasKernel<T><<<grid_size, threads, 0, stream>>>(
+      nonpad_kv_seqlen, attention_bias, batch_size, q_seq_len, total_seq_len, mask_filter_value);
+
+  return CUDA_CALL(cudaGetLastError());
+}
+
+template Status LaunchConvertNonpadKvSeqlenToAttentionBias<float>(
+    const int64_t*, float*, int, int, int, float, cudaStream_t, int);
+template Status LaunchConvertNonpadKvSeqlenToAttentionBias<__half>(
+    const int64_t*, __half*, int, int, int, float, cudaStream_t, int);
+template Status LaunchConvertNonpadKvSeqlenToAttentionBias<__nv_bfloat16>(
+    const int64_t*, __nv_bfloat16*, int, int, int, float, cudaStream_t, int);
+
 }  // namespace cuda
 }  // namespace onnxruntime
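[Editorial note: for the unfused MHA fallback, the same per-batch lengths become an additive bias rather than an integer tensor. A CPU-only C++ sketch of the layout the kernel above produces; the -3.4e38f constant stands in for mask_filter_value (lowest float) and is illustrative only:]

// CPU illustration of the (batch, q_seq, total_seq) additive-bias layout.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int batch = 2, q_seq = 1, total_seq = 4;
  const int64_t nonpad_kv_seqlen[2] = {2, 4};
  const float mask_filter_value = -3.4e38f;  // assumed stand-in for lowest<float>

  std::vector<float> bias(batch * q_seq * total_seq);
  for (int64_t idx = 0; idx < static_cast<int64_t>(bias.size()); ++idx) {
    int b = static_cast<int>(idx / (q_seq * total_seq));
    int t = static_cast<int>(idx % total_seq);
    bias[idx] = (t < nonpad_kv_seqlen[b]) ? 0.0f : mask_filter_value;
  }
  // Batch 0 attends to positions {0, 1} only; batch 1 attends to all 4:
  for (float v : bias) std::printf("%s ", v == 0.0f ? "attend" : "masked");
  std::printf("\n");  // attend attend masked masked attend attend attend attend
  return 0;
}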
diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
index e2f5383de05f1..b2f4972acfa7f 100644
--- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
+++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
@@ -62,5 +62,45 @@ Status LaunchConvertBoolMaskToAttentionBias(
     cudaStream_t stream,
     int max_threads_per_block);

+// Convert nonpad_kv_seqlen (int64, per-batch valid KV lengths) to seqlens_k (int32) for GQA.
+// GQA convention: seqlens_k[i] = nonpad_kv_seqlen[i] - 1 (last valid index, not count).
+//
+// IMPORTANT: nonpad_kv_seqlen must be >= 1 for every batch element.
+// A value of 0 would produce seqlens_k=0, which GQA interprets as "1 valid token at
+// position 0" (last-valid-index convention), causing silent attention to garbage data.
+Status LaunchConvertNonpadKvSeqlenToSeqlensK(
+    const int64_t* nonpad_kv_seqlen,
+    int* seqlens_k,
+    int batch_size,
+    int total_sequence_length,
+    cudaStream_t stream,
+    int max_threads_per_block,
+    int min_expected_seqlen = 0);
+
+// Like LaunchConvertNonpadKvSeqlenToSeqlensK but produces the actual count (no -1 offset).
+// Flash attention's mha_fwd_kvcache expects seqlens_k_ = number of valid tokens.
+Status LaunchConvertNonpadKvSeqlenToFlashSeqlensK(
+    const int64_t* nonpad_kv_seqlen,
+    int* seqlens_k,
+    int batch_size,
+    int total_sequence_length,
+    cudaStream_t stream,
+    int max_threads_per_block);
+
+// Convert nonpad_kv_seqlen to an additive attention bias for the MHA unfused path.
+// Generates a (batch_size, q_seq_len, total_seq_len) tensor where:
+//   position t <  nonpad_kv_seqlen[b] → 0.0 (attend)
+//   position t >= nonpad_kv_seqlen[b] → mask_filter_value (mask out)
+template <typename T>
+Status LaunchConvertNonpadKvSeqlenToAttentionBias(
+    const int64_t* nonpad_kv_seqlen,
+    T* attention_bias,
+    int batch_size,
+    int q_seq_len,
+    int total_seq_len,
+    float mask_filter_value,
+    cudaStream_t stream,
+    int max_threads_per_block);
+
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/common.py b/onnxruntime/test/python/transformers/test_onnx_attention/common.py
index 10a38329549a8..ad54efdbd6294 100644
--- a/onnxruntime/test/python/transformers/test_onnx_attention/common.py
+++ b/onnxruntime/test/python/transformers/test_onnx_attention/common.py
@@ -95,6 +95,7 @@ class AttentionConfig:
     has_attn_mask: bool = False
     attn_mask_dims: int = 2  # 2D, 3D, or 4D boolean mask
     attn_mask_type: str = "bool"  # "bool" for GQA path, "additive" for MHA path
+    has_nonpad_kv_seqlen: bool = False  # Opset 24: nonpad_kv_seqlen input


 # #################################################################################################
@@ -157,6 +158,7 @@ def create_attention_node_and_io(
         "attn_mask" if config.has_attn_mask else "",
         "past_key" if is_past else "",
         "past_value" if is_past else "",
+        "nonpad_kv_seqlen" if config.has_nonpad_kv_seqlen else "",
     ]

     # Remove trailing empty strings
@@ -239,6 +241,10 @@ def create_attention_node_and_io(
         ]
     )

+    # nonpad_kv_seqlen for Opset 24: int64 tensor [batch_size]
+    if config.has_nonpad_kv_seqlen:
+        graph_input.append(helper.make_tensor_value_info("nonpad_kv_seqlen", TensorProto.INT64, [config.batch_size]))
+
     # --- Graph Outputs ---
     output_k_shape = [config.batch_size, config.kv_num_heads, present_kv_seqlen, config.head_size]

@@ -262,11 +268,17 @@ def create_attention_node_and_io(
     return node, graph_input, graph_output


+def _get_opset_version(config: AttentionConfig):
+    """Return 24 when nonpad_kv_seqlen is used, otherwise 23."""
+    return 24 if config.has_nonpad_kv_seqlen else 23
+
+
 def create_attention_graph_prompt(config: AttentionConfig, ort_type):
     """Create ONNX graph for prompt phase (no past KV cache)."""
     node, graph_input, graph_output = create_attention_node_and_io(config, ort_type, is_past=False)
     graph = helper.make_graph([node], "Attention_Graph", graph_input, graph_output)
-    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 23)])
+    opset = _get_opset_version(config)
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", opset)])
     return model.SerializeToString()


@@ -274,7 +286,8 @@ def create_attention_graph_past(config: AttentionConfig, ort_type):
     """Create ONNX graph for decoding phase (with past KV cache)."""
     node, graph_input, graph_output = create_attention_node_and_io(config, ort_type, is_past=True)
     graph = helper.make_graph([node], "Attention_Graph", graph_input, graph_output)
-    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 23)])
+    opset = _get_opset_version(config)
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", opset)])
return model.SerializeToString() @@ -338,6 +351,7 @@ def attention_prompt_func( ep, device, ort_type=TensorProto.FLOAT16, + nonpad_kv_seqlen=None, ): """ Run ONNX Attention op for prompt phase (no past KV cache). @@ -351,6 +365,7 @@ def attention_prompt_func( ep: Execution provider (e.g., "CUDAExecutionProvider") device: Device string (e.g., "cuda") ort_type: ONNX tensor type + nonpad_kv_seqlen: Optional int64 tensor [batch_size] for opset 24 """ if not config.kv_cache_type: config.kv_cache_type = { @@ -383,6 +398,15 @@ def attention_prompt_func( mask_ort_type = _get_mask_ort_type(config, ort_type) bind_tensor(io_binding, "attn_mask", attn_mask, device, mask_ort_type) + # Bind optional nonpad_kv_seqlen (opset 24+) + if config.has_nonpad_kv_seqlen: + if nonpad_kv_seqlen is None: + raise ValueError( + "Invariant violated: config.has_nonpad_kv_seqlen=True but the nonpad_kv_seqlen " + "tensor is None. Either provide the tensor or set has_nonpad_kv_seqlen=False." + ) + bind_tensor(io_binding, "nonpad_kv_seqlen", nonpad_kv_seqlen, device, TensorProto.INT64) + # Bind Outputs hidden_size = config.q_num_heads * config.head_size out_dtype = _get_out_dtype(ort_type) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index d6a9246f7b792..14a0a91487a08 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -961,5 +961,233 @@ def test_gqa_past_padding_mea(self, name, config): ) +# ################################################################################################# +# Parity Check with nonpad_kv_seqlen (Opset 24) +# ################################################################################################# + + +def parity_check_gqa_prompt_with_nonpad_kv_seqlen( + config: AttentionConfig, + nonpad_seqlens: torch.Tensor, + ep, + device, + torch_type, + ort_type, + rtol, + atol, + std=0.2, +): + """ + Parity check for ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen. + + nonpad_kv_seqlen tells the op how many KV positions per batch are valid. + Positions beyond the valid length are treated as padding and masked out. + Cannot be used together with past_key/past_value. 
+ """ + torch.manual_seed(0) + + q = ( + torch.randn( + config.batch_size, + config.q_sequence_length, + config.q_num_heads, + config.head_size, + device=device, + dtype=torch_type, + ) + * std + ) + k = ( + torch.randn( + config.batch_size, + config.kv_sequence_length, + config.kv_num_heads, + config.head_size, + device=device, + dtype=torch_type, + ) + * std + ) + v = torch.randn_like(k) * std + + # Zero out padded positions in K, V for proper comparison + for b in range(config.batch_size): + valid_len = nonpad_seqlens[b].item() + if valid_len < config.kv_sequence_length: + k[b, valid_len:, :, :] = 0 + v[b, valid_len:, :, :] = 0 + + # Reference: use key_padding_mask [batch, kv_seq] + key_padding_mask = create_boolean_mask_from_seqlens( + seqlens=nonpad_seqlens.to(torch.int32), + total_seq_len=config.kv_sequence_length, + mask_dims=2, + device=device, + ) + + out_ref, _ = attention_ref( + q=q, + k=k, + v=v, + key_padding_mask=key_padding_mask, + causal=config.is_causal == 1, + softcap=config.softcap, + ) + + # ORT path: use nonpad_kv_seqlen (int64 tensor) + nonpad_kv_seqlen_tensor = nonpad_seqlens.to(torch.int64).to(device) + + out, present_k, present_v = attention_prompt_func( + q=q, + k=k, + v=v, + config=config, + attn_mask=None, + ep=ep, + device=device, + ort_type=ort_type, + nonpad_kv_seqlen=nonpad_kv_seqlen_tensor, + ) + + out = torch.reshape(out, (config.batch_size, config.q_sequence_length, config.q_num_heads, config.head_size)) + + # When nonpad_kv_seqlen=0 for a batch, all KV positions are masked → softmax yields NaN. + # Zero out those batches in both ORT and reference for comparison. + for b in range(config.batch_size): + if nonpad_seqlens[b].item() == 0: + out[b, :, :, :] = 0 + out_ref[b, :, :, :] = 0 + + out_np = out.to(torch.float32).detach().cpu().numpy() + out_ref_np = out_ref.to(torch.float32).detach().cpu().numpy() + + print_diff_statistics(torch.tensor(out_np - out_ref_np), "out") + numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol, atol=atol) + + +def gqa_nonpad_kv_seqlen_test_cases(): + """ + Generate test cases for ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen. + + In prompt mode (q_seq == kv_seq), the GQA kernel ignores seqlens_k and uses + padded_seq_lens = sequence_length unconditionally. nonpad_kv_seqlen masking is only + meaningful for decode (q_seq != kv_seq), which routes to FlashAttentionForExternalKVCache. + In the real TensorScatter workflow, prompt mode always has all tokens valid, so + nonpad_kv_seqlen = total_kv_sequence_length (mask nothing). + """ + h = 128 + sq = 16 + skv = 16 + n = 8 + n2 = 2 + + # In prompt mode, nonpad_kv_seqlen should equal total_kv_sequence_length (all tokens valid). + # Partial masking (nonpad < kv_sequence_length) is not supported by the GQA kernel in prompt mode. 
+ seqlen_scenarios = [ + (1, [16], "single_batch"), + (2, [16, 16], "full_len"), + (4, [16, 16, 16, 16], "multi_batch"), + ] + + for batch_size, seqlens, label in seqlen_scenarios: + config = AttentionConfig( + batch_size=batch_size, + q_sequence_length=sq, + kv_sequence_length=skv, + past_kv_sequence_length=0, + q_num_heads=n, + kv_num_heads=n2, + head_size=h, + is_causal=1, + has_nonpad_kv_seqlen=True, + ) + name = f"b{batch_size}_sq{sq}_skv{skv}_nh{n}_{n2}_h{h}_{label}" + yield name, config, seqlens + + +def gqa_nonpad_kv_seqlen_cpu_test_cases(): + """CPU-only test cases including zero_seqlen (triggers CUDA_KERNEL_ASSERT in debug builds).""" + yield from gqa_nonpad_kv_seqlen_test_cases() + + h = 128 + sq = 16 + skv = 16 + n = 8 + n2 = 2 + config = AttentionConfig( + batch_size=2, + q_sequence_length=sq, + kv_sequence_length=skv, + past_kv_sequence_length=0, + q_num_heads=n, + kv_num_heads=n2, + head_size=h, + is_causal=1, + has_nonpad_kv_seqlen=True, + ) + yield f"b2_sq{sq}_skv{skv}_nh{n}_{n2}_h{h}_zero_seqlen", config, [0, 16] + + +@unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") +class TestONNXAttentionGQANonpadKVSeqlen(unittest.TestCase): + """Test ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen (Flash Attention).""" + + @parameterized.expand(gqa_nonpad_kv_seqlen_test_cases()) + def test_gqa_nonpad_kv_seqlen_flash(self, name, config, seqlens): + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" + nonpad_seqlens = torch.tensor(seqlens, dtype=torch.int64, device="cuda") + + parity_check_gqa_prompt_with_nonpad_kv_seqlen( + config=config, + nonpad_seqlens=nonpad_seqlens, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) + + +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping tests.") +class TestONNXAttentionGQANonpadKVSeqlenMEA(unittest.TestCase): + """Test ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen (Memory Efficient Attention).""" + + @parameterized.expand(gqa_nonpad_kv_seqlen_test_cases()) + def test_gqa_nonpad_kv_seqlen_mea(self, name, config, seqlens): + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" + nonpad_seqlens = torch.tensor(seqlens, dtype=torch.int64, device="cuda") + + parity_check_gqa_prompt_with_nonpad_kv_seqlen( + config=config, + nonpad_seqlens=nonpad_seqlens, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) + + +class TestONNXAttentionGQANonpadKVSeqlenCPU(unittest.TestCase): + """Test ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen on CPU (includes zero_seqlen).""" + + @parameterized.expand(gqa_nonpad_kv_seqlen_cpu_test_cases()) + def test_gqa_nonpad_kv_seqlen_cpu(self, name, config, seqlens): + nonpad_seqlens = torch.tensor(seqlens, dtype=torch.int64, device="cpu") + + parity_check_gqa_prompt_with_nonpad_kv_seqlen( + config=config, + nonpad_seqlens=nonpad_seqlens, + ep="CPUExecutionProvider", + device="cpu", + torch_type=torch.float32, + ort_type=TensorProto.FLOAT, + rtol=rtol["fp32"], + atol=atol["fp32"], + ) + + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py index daa644f40ff41..02beb6ea5d4d9 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py +++ 
b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py @@ -920,5 +920,239 @@ def test_mha_bool_mask_fp16(self, name, config): ) +# ################################################################################################# +# Parity Check with nonpad_kv_seqlen (Opset 24) +# ################################################################################################# + + +def parity_check_mha_prompt_with_nonpad_kv_seqlen( + config: AttentionConfig, + nonpad_seqlens: torch.Tensor, + ep, + device, + torch_type, + ort_type, + rtol, + atol, + std=0.2, +): + """ + Parity check for ONNX Attention op (opset 24) MHA path with nonpad_kv_seqlen. + + nonpad_kv_seqlen tells the op how many KV positions per batch are valid. + Positions beyond the valid length are treated as padding and masked out. + Cannot be used together with past_key/past_value. + """ + torch.manual_seed(0) + + q = ( + torch.randn( + config.batch_size, + config.q_sequence_length, + config.q_num_heads, + config.head_size, + device=device, + dtype=torch_type, + ) + * std + ) + k = ( + torch.randn( + config.batch_size, + config.kv_sequence_length, + config.kv_num_heads, + config.head_size, + device=device, + dtype=torch_type, + ) + * std + ) + v = torch.randn_like(k) * std + + # Zero out padded positions in K, V for proper comparison + for b in range(config.batch_size): + valid_len = nonpad_seqlens[b].item() + if valid_len < config.kv_sequence_length: + k[b, valid_len:, :, :] = 0 + v[b, valid_len:, :, :] = 0 + + # Reference: use key_padding_mask [batch, kv_seq] + key_padding_mask = create_boolean_mask_from_seqlens( + seqlens=nonpad_seqlens.to(torch.int32), + total_seq_len=config.kv_sequence_length, + mask_dims=2, + device=device, + ) + + out_ref, _ = attention_ref( + q=q, + k=k, + v=v, + key_padding_mask=key_padding_mask, + causal=config.is_causal == 1, + softcap=config.softcap, + ) + + # ORT path: use nonpad_kv_seqlen (int64 tensor) + nonpad_kv_seqlen_tensor = nonpad_seqlens.to(torch.int64).to(device) + + out, present_k, present_v = attention_prompt_func( + q=q, + k=k, + v=v, + config=config, + attn_mask=None, + ep=ep, + device=device, + ort_type=ort_type, + nonpad_kv_seqlen=nonpad_kv_seqlen_tensor, + ) + + out = torch.reshape(out, (config.batch_size, config.q_sequence_length, config.q_num_heads, config.head_size)) + + # When nonpad_kv_seqlen=0 for a batch, all KV positions are masked → softmax yields NaN. + # Zero out those batches in both ORT and reference for comparison. + for b in range(config.batch_size): + if nonpad_seqlens[b].item() == 0: + out[b, :, :, :] = 0 + out_ref[b, :, :, :] = 0 + + out_np = out.to(torch.float32).detach().cpu().numpy() + out_ref_np = out_ref.to(torch.float32).detach().cpu().numpy() + + print_diff_statistics(torch.tensor(out_np - out_ref_np), "out") + numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol, atol=atol) + + +def mha_nonpad_kv_seqlen_test_cases(): + """ + Generate test cases for ONNX Attention op (opset 24) MHA path with nonpad_kv_seqlen. + + MHA supports partial masking in prompt mode via FlashAttentionForExternalKVCache. + Full-length tests verify the no-masking case; partial-length tests verify actual masking. 
+ """ + h = 128 + sq = 16 + skv = 16 + n = 8 + + seqlen_scenarios = [ + (1, [16], "single_batch"), + (2, [16, 16], "full_len"), + (2, [3, 5], "partial_mask"), + (4, [16, 16, 16, 16], "multi_batch"), + ] + + for batch_size, seqlens, label in seqlen_scenarios: + config = AttentionConfig( + batch_size=batch_size, + q_sequence_length=sq, + kv_sequence_length=skv, + past_kv_sequence_length=0, + q_num_heads=n, + kv_num_heads=n, + head_size=h, + is_causal=0, + has_nonpad_kv_seqlen=True, + attn_mask_type="additive", + ) + name = f"b{batch_size}_sq{sq}_skv{skv}_nh{n}_h{h}_{label}" + yield name, config, seqlens + + # Causal variation with full length + config_c = AttentionConfig( + batch_size=2, + q_sequence_length=sq, + kv_sequence_length=skv, + past_kv_sequence_length=0, + q_num_heads=n, + kv_num_heads=n, + head_size=h, + is_causal=1, + has_nonpad_kv_seqlen=True, + attn_mask_type="additive", + ) + yield f"b2_sq{sq}_skv{skv}_nh{n}_h{h}_causal", config_c, [16, 16] + + +def mha_nonpad_kv_seqlen_cpu_test_cases(): + """CPU-only test cases including zero_seqlen (triggers CUDA_KERNEL_ASSERT in debug builds).""" + yield from mha_nonpad_kv_seqlen_test_cases() + + h = 128 + sq = 16 + skv = 16 + n = 8 + config = AttentionConfig( + batch_size=2, + q_sequence_length=sq, + kv_sequence_length=skv, + past_kv_sequence_length=0, + q_num_heads=n, + kv_num_heads=n, + head_size=h, + is_causal=0, + has_nonpad_kv_seqlen=True, + attn_mask_type="additive", + ) + yield f"b2_sq{sq}_skv{skv}_nh{n}_h{h}_zero_seqlen", config, [0, 5] + + +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping MHA tests.") +class TestONNXAttentionMHANonpadKVSeqlen(unittest.TestCase): + """Test ONNX Attention op (opset 24) MHA path with nonpad_kv_seqlen on CUDA.""" + + @parameterized.expand(mha_nonpad_kv_seqlen_test_cases()) + def test_mha_nonpad_kv_seqlen_fp16(self, name, config, seqlens): + nonpad_seqlens = torch.tensor(seqlens, dtype=torch.int64, device="cuda") + + parity_check_mha_prompt_with_nonpad_kv_seqlen( + config=config, + nonpad_seqlens=nonpad_seqlens, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) + + @parameterized.expand(mha_nonpad_kv_seqlen_test_cases()) + def test_mha_nonpad_kv_seqlen_fp32(self, name, config, seqlens): + config.kv_cache_type = "float32" + nonpad_seqlens = torch.tensor(seqlens, dtype=torch.int64, device="cuda") + + parity_check_mha_prompt_with_nonpad_kv_seqlen( + config=config, + nonpad_seqlens=nonpad_seqlens, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float32, + ort_type=TensorProto.FLOAT, + rtol=rtol["fp32"], + atol=atol["fp32"], + ) + + +class TestONNXAttentionMHANonpadKVSeqlenCPU(unittest.TestCase): + """Test ONNX Attention op (opset 24) MHA path with nonpad_kv_seqlen on CPU (includes zero_seqlen).""" + + @parameterized.expand(mha_nonpad_kv_seqlen_cpu_test_cases()) + def test_mha_nonpad_kv_seqlen_cpu(self, name, config, seqlens): + config.kv_cache_type = "float32" + nonpad_seqlens = torch.tensor(seqlens, dtype=torch.int64, device="cpu") + + parity_check_mha_prompt_with_nonpad_kv_seqlen( + config=config, + nonpad_seqlens=nonpad_seqlens, + ep="CPUExecutionProvider", + device="cpu", + torch_type=torch.float32, + ort_type=TensorProto.FLOAT, + rtol=rtol["fp32"], + atol=atol["fp32"], + ) + + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py 
b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py new file mode 100644 index 0000000000000..521b8469ffa25 --- /dev/null +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py @@ -0,0 +1,589 @@ +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +""" +Tests for TensorScatter(opset 24) + Attention(opset 24) pattern. + +Demonstrates a decode step where new KV entries are scattered into a +pre-allocated cache via TensorScatter, then Attention uses the updated +KV cache with nonpad_kv_seqlen to mask out padding positions. + +Uses IO Binding for in-place KV cache updates, matching the real-world LLM +inference pattern where KV cache buffers are pre-allocated on the device +and reused across decode steps. + +The graph looks like: + + key_cache (B, S, kv_hidden) ──────────┐ + new_k (B, q_seq, kv_hidden) ──────────┤ + write_indices (B,) ───────────────────┤ + ├─ TensorScatter(axis=1) ─→ updated_key_cache ─┐ + │ + value_cache (B, S, kv_hidden) ────────┐ │ + new_v (B, q_seq, kv_hidden) ──────────┤ │ + write_indices (B,) ──────────────────┤ │ + ├─ TensorScatter(axis=1) ─→ updated_value_cache ┤ + │ + Q (B, q_seq, q_hidden) ──────────────┬─ Attention(opset 24) ←──────────────────────────┘ + nonpad_kv_seqlen (B,) ──────────────┘ │ + ├─ output + ├─ present_key + └─ present_value + +IO Binding enables in-place cache updates: the same OrtValue buffer is bound as +both TensorScatter input (key_cache/value_cache) and output +(updated_key_cache/updated_value_cache), avoiding unnecessary copies. + +CUDA support: + - GQA path (kv_num_heads != q_num_heads) uses flash attention for external KV cache (fp16/bf16) + - MHA path (kv_num_heads == q_num_heads) uses flash attention for fp16/bf16, + unfused attention_bias fallback for fp32 +""" + +import math +import unittest + +import numpy +import torch +from onnx import TensorProto, helper +from parameterized import parameterized + +from onnxruntime import InferenceSession, OrtValue, SessionOptions, get_available_providers + +# ################################################################################################# +# Helper Functions +# ################################################################################################# + + +def has_cuda_provider(): + return "CUDAExecutionProvider" in get_available_providers() + + +def has_cuda_device(min_capability: int = 53): + if not has_cuda_provider() or not torch.cuda.is_available(): + return False + major, minor = torch.cuda.get_device_capability() + return major * 10 + minor >= min_capability + + +def has_flash_attention(): + """Return True if the CUDA device meets the SM80+ requirement for Flash Attention.""" + return has_cuda_device(80) + + +def numpy_attention_ref(q, k, v, nonpad_kv_seqlen, is_causal=False): + """ + NumPy reference implementation of scaled dot-product attention with padding mask. 
+ + Args: + q: Query [batch, q_seq, num_heads, head_size] + k: Key [batch, kv_seq, kv_num_heads, head_size] + v: Value [batch, kv_seq, kv_num_heads, head_size] + nonpad_kv_seqlen: [batch] — number of valid KV positions per batch + is_causal: whether to apply causal masking + + Returns: + output: [batch, q_seq, num_heads, head_size] + """ + batch_size, q_seq, num_heads, head_size = q.shape + _, kv_seq, kv_num_heads, _ = k.shape + groups = num_heads // kv_num_heads + + # Repeat KV heads for GQA + if groups > 1: + k = numpy.repeat(k, groups, axis=2) + v = numpy.repeat(v, groups, axis=2) + + scale = 1.0 / math.sqrt(head_size) + + # scores: [batch, num_heads, q_seq, kv_seq] + q_t = numpy.transpose(q, (0, 2, 1, 3)) + k_t = numpy.transpose(k, (0, 2, 3, 1)) + scores = numpy.matmul(q_t, k_t) * scale + + # Apply nonpad_kv_seqlen mask: positions >= valid_len get -inf + for b in range(batch_size): + valid_len = int(nonpad_kv_seqlen[b]) + if valid_len < kv_seq: + scores[b, :, :, valid_len:] = -numpy.inf + + # Apply causal mask + if is_causal: + for sq in range(q_seq): + offset = kv_seq - q_seq + for sk in range(kv_seq): + if sk > sq + offset: + scores[:, :, sq, sk] = -numpy.inf + + # Softmax along last axis + # Handle all-masked rows: if entire row is -inf, softmax gives nan; we want 0. + # This happens when nonpad_kv_seqlen=0 for a batch (all KV positions masked). + # Callers zero out those batches in both ORT and reference outputs for comparison. + max_scores = numpy.max(scores, axis=-1, keepdims=True) + # Clip -inf max to 0 to avoid nan in exp + max_scores = numpy.where(numpy.isinf(max_scores) & (max_scores < 0), 0.0, max_scores) + exp_scores = numpy.exp(scores - max_scores) + sum_exp = numpy.sum(exp_scores, axis=-1, keepdims=True) + sum_exp = numpy.where(sum_exp == 0.0, 1.0, sum_exp) + attention = exp_scores / sum_exp + + # output: [batch, num_heads, q_seq, head_size] + v_t = numpy.transpose(v, (0, 2, 1, 3)) + output = numpy.matmul(attention, v_t) + + # Transpose back: [batch, q_seq, num_heads, head_size] + output = numpy.transpose(output, (0, 2, 1, 3)) + return output + + +def build_tensorscatter_attention_graph( + batch_size, + total_kv_seq_len, + q_seq_len, + q_num_heads, + kv_num_heads, + head_size, + ort_type, + is_causal=0, +): + """ + Build ONNX graph: TensorScatter(opset 24) → Attention(opset 24). + + TensorScatter uses write_indices [B] to scatter new KV entries into cache + at per-batch positions. Attention uses updated cache with nonpad_kv_seqlen + to mask padding. + + The graph exposes updated_key_cache and updated_value_cache as graph outputs + to enable in-place buffer binding via IO Binding. 
+ + Inputs: + 0: key_cache [B, total_kv_seq_len, kv_hidden] + 1: value_cache [B, total_kv_seq_len, kv_hidden] + 2: new_k [B, q_seq_len, kv_hidden] + 3: new_v [B, q_seq_len, kv_hidden] + 4: write_indices [B] (int64 — per-batch write position) + 5: query [B, q_seq_len, q_hidden] + 6: nonpad_kv_seqlen [B] (int64 — valid KV length after scatter) + + Outputs: + 0: output [B, q_seq_len, q_hidden] + 1: present_key [B, kv_num_heads, total_kv_seq_len, head_size] + 2: present_value [B, kv_num_heads, total_kv_seq_len, head_size] + 3: updated_key_cache [B, total_kv_seq_len, kv_hidden] + 4: updated_value_cache [B, total_kv_seq_len, kv_hidden] + """ + kv_hidden = kv_num_heads * head_size + q_hidden = q_num_heads * head_size + + # TensorScatter for key cache update (axis=1: sequence dim in [B, S, H]) + scatter_k_node = helper.make_node( + "TensorScatter", + inputs=["key_cache", "new_k", "write_indices"], + outputs=["updated_key_cache"], + name="TensorScatterKey", + axis=1, + ) + + # TensorScatter for value cache update + scatter_v_node = helper.make_node( + "TensorScatter", + inputs=["value_cache", "new_v", "write_indices"], + outputs=["updated_value_cache"], + name="TensorScatterValue", + axis=1, + ) + + # Attention node with nonpad_kv_seqlen + attention_node = helper.make_node( + "Attention", + inputs=[ + "query", + "updated_key_cache", + "updated_value_cache", + "", # attn_mask + "", # past_key + "", # past_value + "nonpad_kv_seqlen", + ], + outputs=["output", "present_key", "present_value"], + name="Attention_0", + is_causal=is_causal, + kv_num_heads=kv_num_heads, + q_num_heads=q_num_heads, + softcap=0.0, + qk_matmul_output_mode=0, + domain="", + ) + + # Graph inputs + cache_shape = [batch_size, total_kv_seq_len, kv_hidden] + graph_inputs = [ + helper.make_tensor_value_info("key_cache", ort_type, cache_shape), + helper.make_tensor_value_info("value_cache", ort_type, cache_shape), + helper.make_tensor_value_info("new_k", ort_type, [batch_size, q_seq_len, kv_hidden]), + helper.make_tensor_value_info("new_v", ort_type, [batch_size, q_seq_len, kv_hidden]), + helper.make_tensor_value_info("write_indices", TensorProto.INT64, [batch_size]), + helper.make_tensor_value_info("query", ort_type, [batch_size, q_seq_len, q_hidden]), + helper.make_tensor_value_info("nonpad_kv_seqlen", TensorProto.INT64, [batch_size]), + ] + + # Graph outputs: Attention outputs + TensorScatter outputs for in-place binding + present_shape = [batch_size, kv_num_heads, total_kv_seq_len, head_size] + graph_outputs = [ + helper.make_tensor_value_info("output", ort_type, [batch_size, q_seq_len, q_hidden]), + helper.make_tensor_value_info("present_key", ort_type, present_shape), + helper.make_tensor_value_info("present_value", ort_type, present_shape), + helper.make_tensor_value_info("updated_key_cache", ort_type, cache_shape), + helper.make_tensor_value_info("updated_value_cache", ort_type, cache_shape), + ] + + graph = helper.make_graph( + [scatter_k_node, scatter_v_node, attention_node], + "TensorScatterAttention_Graph", + graph_inputs, + graph_outputs, + ) + + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 24)]) + return model.SerializeToString() + + +def run_tensorscatter_attention( + batch_size, + total_kv_seq_len, + q_seq_len, + q_num_heads, + kv_num_heads, + head_size, + nonpad_seqlens, + scatter_positions, + ep, + device, + torch_type, + ort_type, + is_causal=0, + std=0.2, +): + """ + Run TensorScatter + Attention test with IO Binding and compare against NumPy reference. + + Uses IO Binding to: + 1. 
Pre-allocate KV cache as OrtValues on the target device + 2. Bind the same OrtValue as both TensorScatter input and output (in-place update) + 3. Feed the updated cache to Attention + 4. Pre-allocate output buffers on the target device + + Args: + scatter_positions: list of ints per batch — the write index for TensorScatter. + nonpad_seqlens: list of ints per batch — valid KV length AFTER scatter. + is_causal: 1 for causal attention, 0 for non-causal. + """ + torch.manual_seed(42) + kv_hidden = kv_num_heads * head_size + q_hidden = q_num_heads * head_size + np_type = numpy.float16 if torch_type == torch.float16 else numpy.float32 + + # Generate test data as numpy arrays via torch for reproducible seeding + key_cache_np = (torch.randn(batch_size, total_kv_seq_len, kv_hidden, dtype=torch_type) * std).numpy() + value_cache_np = (torch.randn(batch_size, total_kv_seq_len, kv_hidden, dtype=torch_type) * std).numpy() + + # Zero out padding positions in cache + for b in range(batch_size): + old_valid = max(0, nonpad_seqlens[b] - q_seq_len) + if old_valid < total_kv_seq_len: + key_cache_np[b, old_valid:, :] = 0 + value_cache_np[b, old_valid:, :] = 0 + + new_k_np = (torch.randn(batch_size, q_seq_len, kv_hidden, dtype=torch_type) * std).numpy() + new_v_np = (torch.randn(batch_size, q_seq_len, kv_hidden, dtype=torch_type) * std).numpy() + query_np = (torch.randn(batch_size, q_seq_len, q_hidden, dtype=torch_type) * std).numpy() + write_indices_np = numpy.array(scatter_positions, dtype=numpy.int64) + nonpad_kv_seqlen_np = numpy.array(nonpad_seqlens, dtype=numpy.int64) + + # --- NumPy reference --- + # Compute reference in float32 for accuracy + key_cache_ref = key_cache_np.astype(numpy.float32).copy() + value_cache_ref = value_cache_np.astype(numpy.float32).copy() + new_k_ref = new_k_np.astype(numpy.float32) + new_v_ref = new_v_np.astype(numpy.float32) + + for b in range(batch_size): + pos = scatter_positions[b] + for t in range(q_seq_len): + key_cache_ref[b, pos + t, :] = new_k_ref[b, t, :] + value_cache_ref[b, pos + t, :] = new_v_ref[b, t, :] + + # Reshape to BSNH for reference attention + q_ref = query_np.astype(numpy.float32).reshape(batch_size, q_seq_len, q_num_heads, head_size) + k_ref = key_cache_ref.reshape(batch_size, total_kv_seq_len, kv_num_heads, head_size) + v_ref = value_cache_ref.reshape(batch_size, total_kv_seq_len, kv_num_heads, head_size) + + ref_output = numpy_attention_ref(q_ref, k_ref, v_ref, nonpad_seqlens, is_causal=bool(is_causal)) + ref_output_3d = ref_output.reshape(batch_size, q_seq_len, q_hidden) + + # --- ORT execution with IO Binding --- + onnx_model_str = build_tensorscatter_attention_graph( + batch_size=batch_size, + total_kv_seq_len=total_kv_seq_len, + q_seq_len=q_seq_len, + q_num_heads=q_num_heads, + kv_num_heads=kv_num_heads, + head_size=head_size, + ort_type=ort_type, + is_causal=is_causal, + ) + + sess_options = SessionOptions() + session = InferenceSession(onnx_model_str, sess_options, providers=[ep]) + + # Determine device for OrtValue allocation + ort_device = "cuda" if "CUDA" in ep else "cpu" + device_id = 0 + + # Create OrtValues for inputs on target device + key_cache_ort = OrtValue.ortvalue_from_numpy(key_cache_np, ort_device, device_id) + value_cache_ort = OrtValue.ortvalue_from_numpy(value_cache_np, ort_device, device_id) + new_k_ort = OrtValue.ortvalue_from_numpy(new_k_np, ort_device, device_id) + new_v_ort = OrtValue.ortvalue_from_numpy(new_v_np, ort_device, device_id) + write_indices_ort = OrtValue.ortvalue_from_numpy(write_indices_np, ort_device, 
device_id) + query_ort = OrtValue.ortvalue_from_numpy(query_np, ort_device, device_id) + nonpad_ort = OrtValue.ortvalue_from_numpy(nonpad_kv_seqlen_np, ort_device, device_id) + + # Pre-allocate output buffers on target device + present_shape = [batch_size, kv_num_heads, total_kv_seq_len, head_size] + output_ort = OrtValue.ortvalue_from_shape_and_type( + [batch_size, q_seq_len, q_hidden], np_type, ort_device, device_id + ) + present_k_ort = OrtValue.ortvalue_from_shape_and_type(present_shape, np_type, ort_device, device_id) + present_v_ort = OrtValue.ortvalue_from_shape_and_type(present_shape, np_type, ort_device, device_id) + + # Set up IO binding + io_binding = session.io_binding() + + # Bind all inputs + io_binding.bind_ortvalue_input("key_cache", key_cache_ort) + io_binding.bind_ortvalue_input("value_cache", value_cache_ort) + io_binding.bind_ortvalue_input("new_k", new_k_ort) + io_binding.bind_ortvalue_input("new_v", new_v_ort) + io_binding.bind_ortvalue_input("write_indices", write_indices_ort) + io_binding.bind_ortvalue_input("query", query_ort) + io_binding.bind_ortvalue_input("nonpad_kv_seqlen", nonpad_ort) + + # Bind Attention outputs to pre-allocated buffers + io_binding.bind_ortvalue_output("output", output_ort) + io_binding.bind_ortvalue_output("present_key", present_k_ort) + io_binding.bind_ortvalue_output("present_value", present_v_ort) + + # Bind TensorScatter outputs to the SAME OrtValues as inputs (in-place update). + # TensorScatter declares MayInplace(0, 0), so ORT will skip the copy when + # input and output share the same buffer. + io_binding.bind_ortvalue_output("updated_key_cache", key_cache_ort) + io_binding.bind_ortvalue_output("updated_value_cache", value_cache_ort) + + # Execute with IO binding + io_binding.synchronize_inputs() + session.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + + # Read results from pre-bound OrtValues + output_result = output_ort.numpy() + present_k_result = present_k_ort.numpy() + present_v_result = present_v_ort.numpy() + + return output_result, ref_output_3d, present_k_result, present_v_result + + +# ################################################################################################# +# Test Case Generator +# ################################################################################################# + +# Shared test dimensions +_HEAD_SIZE = 64 +_TOTAL_KV_SEQ_LEN = 8 + +_GQA_CASES = [ + # (batch, q_seq, q_heads, kv_heads, scatter_positions, nonpad_seqlens, label) + (1, 1, 8, 2, [3], [4], "gqa_batch1"), + (2, 1, 8, 2, [2, 4], [3, 5], "gqa_diff_lens"), + (2, 1, 8, 2, [4, 4], [5, 5], "gqa_same_lens"), + (2, 1, 8, 2, [0, 3], [1, 4], "gqa_one_empty"), + (2, 1, 8, 2, [7, 7], [8, 8], "gqa_full_len"), + # Additional GQA ratios + (2, 1, 16, 4, [2, 5], [3, 6], "gqa_16h_4kvh"), + (2, 1, 6, 3, [3, 3], [4, 4], "gqa_6h_3kvh"), +] + +_MHA_CASES = [ + (1, 1, 4, 4, [3], [4], "mha_batch1"), + (2, 1, 4, 4, [2, 4], [3, 5], "mha_diff_lens"), + (2, 1, 4, 4, [4, 4], [5, 5], "mha_same_lens"), + (2, 1, 4, 4, [0, 3], [1, 4], "mha_one_empty"), + (2, 1, 4, 4, [7, 7], [8, 8], "mha_full_len"), +] + + +def _make_test_params(cases, is_causal): + """Convert raw case tuples into parameterized test parameter tuples.""" + causal_str = "causal" if is_causal else "noncausal" + for batch, q_seq, q_heads, kv_heads, scatter_pos, seqlens, label in cases: + name = f"b{batch}_qs{q_seq}_qh{q_heads}_kvh{kv_heads}_h{_HEAD_SIZE}_{label}_{causal_str}" + yield ( + name, + batch, + q_seq, + q_heads, + kv_heads, + _HEAD_SIZE, + _TOTAL_KV_SEQ_LEN, + 
scatter_pos, + seqlens, + is_causal, + ) + + +def cpu_test_cases(): + """CPU: all modes, non-causal and causal (both GQA and MHA work without restrictions).""" + yield from _make_test_params(_GQA_CASES + _MHA_CASES, is_causal=0) + yield from _make_test_params(_GQA_CASES + _MHA_CASES, is_causal=1) + + +def cuda_fp16_test_cases(): + """CUDA fp16: both GQA and MHA cases. Flash attention handles external KV cache directly.""" + yield from _make_test_params(_GQA_CASES + _MHA_CASES, is_causal=0) + yield from _make_test_params(_GQA_CASES + _MHA_CASES, is_causal=1) + + +def cuda_fp32_test_cases(): + """CUDA fp32: MHA only. GQA requires fp16/bf16, and flash attention requires fp16/bf16. + fp32 MHA uses the unfused attention_bias fallback path.""" + yield from _make_test_params(_MHA_CASES, is_causal=0) + yield from _make_test_params(_MHA_CASES, is_causal=1) + + +# ################################################################################################# +# Test Classes +# ################################################################################################# + +# Default tolerances (CUDA fp16/fp32 need looser tolerances due to TF32 and reduced precision) +rtol = {"fp16": 5e-3, "fp32": 5e-3} +atol = {"fp16": 5e-3, "fp32": 5e-3} +# CPU fp32 has no TF32 — use tighter tolerance +cpu_fp32_rtol = 1e-5 +cpu_fp32_atol = 1e-5 + + +class TestTensorScatterAttentionCPU(unittest.TestCase): + """Test TensorScatter + Attention (opset 24) on CPU with float32 and IO Binding.""" + + @parameterized.expand(cpu_test_cases()) + def test_tensorscatter_attention_cpu_fp32( + self, + name, + batch, + q_seq, + q_heads, + kv_heads, + head_size, + total_kv, + scatter_pos, + seqlens, + is_causal, + ): + output, ref_output, _, _ = run_tensorscatter_attention( + batch_size=batch, + total_kv_seq_len=total_kv, + q_seq_len=q_seq, + q_num_heads=q_heads, + kv_num_heads=kv_heads, + head_size=head_size, + nonpad_seqlens=seqlens, + scatter_positions=scatter_pos, + ep="CPUExecutionProvider", + device="cpu", + torch_type=torch.float32, + ort_type=TensorProto.FLOAT, + is_causal=is_causal, + ) + numpy.testing.assert_allclose(output, ref_output, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) + + +@unittest.skipIf(not has_flash_attention(), "Flash Attention (SM80+) is not available, skipping tests.") +class TestTensorScatterAttentionCUDAFP16(unittest.TestCase): + """Test TensorScatter + Attention (opset 24) on CUDA with float16 and IO Binding.""" + + @parameterized.expand(cuda_fp16_test_cases()) + def test_tensorscatter_attention_cuda_fp16( + self, + name, + batch, + q_seq, + q_heads, + kv_heads, + head_size, + total_kv, + scatter_pos, + seqlens, + is_causal, + ): + output, ref_output, _, _ = run_tensorscatter_attention( + batch_size=batch, + total_kv_seq_len=total_kv, + q_seq_len=q_seq, + q_num_heads=q_heads, + kv_num_heads=kv_heads, + head_size=head_size, + nonpad_seqlens=seqlens, + scatter_positions=scatter_pos, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + is_causal=is_causal, + ) + numpy.testing.assert_allclose(output, ref_output, rtol=rtol["fp16"], atol=atol["fp16"]) + + +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping tests.") +class TestTensorScatterAttentionCUDAFP32(unittest.TestCase): + """Test TensorScatter + Attention (opset 24) on CUDA with float32 and IO Binding. + + Only MHA cases: CUDA GQA path requires float16. 
+ """ + + @parameterized.expand(cuda_fp32_test_cases()) + def test_tensorscatter_attention_cuda_fp32( + self, + name, + batch, + q_seq, + q_heads, + kv_heads, + head_size, + total_kv, + scatter_pos, + seqlens, + is_causal, + ): + output, ref_output, _, _ = run_tensorscatter_attention( + batch_size=batch, + total_kv_seq_len=total_kv, + q_seq_len=q_seq, + q_num_heads=q_heads, + kv_num_heads=kv_heads, + head_size=head_size, + nonpad_seqlens=seqlens, + scatter_positions=scatter_pos, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float32, + ort_type=TensorProto.FLOAT, + is_causal=is_causal, + ) + numpy.testing.assert_allclose(output, ref_output, rtol=rtol["fp32"], atol=atol["fp32"]) + + +if __name__ == "__main__": + unittest.main() From e270a6ad0dc1460d756b8abe8b4aa0ee63109690 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 19:00:13 +0000 Subject: [PATCH 02/35] Update Operator Kernel document --- docs/OperatorKernels.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index abdcc81586909..75b43c363f371 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -655,7 +655,8 @@ Do not modify directly.* |ArgMin|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(float16)| |||12|**T** = tensor(double), tensor(float), tensor(float16)| |||[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)| -|Attention|*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*in* nonpad_kv_seqlen:**tensor(int64)**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**

or

*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**|23+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| +|Attention|*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*in* nonpad_kv_seqlen:**tensor(int64)**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**

or

*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**|24+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| +|||23|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| |AveragePool|*in* X:**T**
*out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |||[19, 21]|**T** = tensor(double), tensor(float), tensor(float16)| |||[11, 18]|**T** = tensor(double), tensor(float), tensor(float16)| From 6c5009286d193a066d1f1d23d87736b35515ecb5 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 19:56:46 +0000 Subject: [PATCH 03/35] Fix cutlass FMHA crash when attention bias stride is unaligned When attn_bias is present, bias_strideM (= kv_sequence_length) must be divisible by kAlignmentQ. The is_aligned check only verified head_size alignment, so unaligned kv_sequence_length values would dispatch to the aligned kernel path which then crashed with: 'p.bias_strideM % kAlignmentQ == 0' Add the bias stride alignment check so we fall back to the unaligned kernel path when kv_sequence_length isn't properly aligned. Fixes attention_op_test Attention4DAttnMask* test crashes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../cuda/bert/cutlass_fmha/fmha_launch_template.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index 1b68d70617744..f720a7fb2f714 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -258,6 +258,11 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { params.qk_head_size % AlignedAK::kAlignmentK == 0 && params.v_head_size % AlignedAK::kAlignmentV == 0; + // Attention bias stride (kv_sequence_length) must also satisfy alignment requirements. 
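+  // Without this check, an unaligned bias_strideM would reach the aligned kernel
+  // variant and trip its 'p.bias_strideM % kAlignmentQ == 0' runtime assertion.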
+ if (params.attn_bias != nullptr) { + is_aligned = is_aligned && params.kv_sequence_length % AlignedAK::kAlignmentQ == 0; + } + DISPATCH_BOOL(is_aligned, kIsAligned, ([&]() { LaunchCutlassFmha(params); })); From 6ac08bf16146a1a804b1500cf6ebd6818eacb26f Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 20:27:45 +0000 Subject: [PATCH 04/35] Fix GQA decode eligibility, padding mask wiring, and 4D BNSH Q transpose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix 1: GQA Decode Eligibility - Remove past_key==nullptr restriction from flash_eligible check - Add mha_fwd_kvcache decode path in RunFlashAttention for past_key - Copy past KV (BNSH) into present buffers via cudaMemcpy2DAsync - Set uniform seqlens_k = past_sequence_length per batch - Supports GQA natively via num_heads_k param in mha_fwd_kvcache Fix 2: Padding Mask Wiring - Add LaunchConvertMaskToFlashSeqlensK (stores count, not count-1) - Add seqlen_offset parameter to ConvertMaskToSeqlensKernel - Wire bool mask→seqlens_k in RunFlashAttention (mha_fwd_kvcache path) - Wire bool mask→seqlens_k in RunMemoryEfficientAttention (has_custom_right_padding) - Replaces broken mask→attention_bias conversion for padding masks Fix 3: 4D BNSH Q Transpose - Add Transpose_BNSH_to_BSNH wrappers (float, half, BFloat16) using LaunchTransCtx - Add declarations in attention_impl.h - In RunFlashAttention: transpose Q BNSH→BSNH before kernel, output BSNH→BNSH after - In RunMemoryEfficientAttention: same Q/output transposes - Also transpose K/V new tokens BNSH→BSNH for decode kvcache path Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../contrib_ops/cuda/bert/attention_impl.h | 10 + .../cuda/bert/attention_transpose.cu | 20 + .../core/providers/cuda/llm/attention.cc | 453 +++++++++++++++--- .../providers/cuda/llm/attention_mask_impl.cu | 47 +- .../providers/cuda/llm/attention_mask_impl.h | 14 + 5 files changed, 470 insertions(+), 74 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h index 6c08d7fbd9b3f..8cccc2f1a725c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h @@ -132,6 +132,16 @@ Status Transpose_BSNH_to_BNSH(const int batch_size, const int sequence_length, c Status Transpose_BSNH_to_BNSH(const int batch_size, const int sequence_length, const int num_heads, const int head_size, const BFloat16* input, BFloat16* output, cudaStream_t stream, const int max_threads_per_block); +// BxNxSxH => BxSxNxH +Status Transpose_BNSH_to_BSNH(const int batch_size, const int sequence_length, const int num_heads, const int head_size, + const float* input, float* output, cudaStream_t stream, const int max_threads_per_block); + +Status Transpose_BNSH_to_BSNH(const int batch_size, const int sequence_length, const int num_heads, const int head_size, + const half* input, half* output, cudaStream_t stream, const int max_threads_per_block); + +Status Transpose_BNSH_to_BSNH(const int batch_size, const int sequence_length, const int num_heads, const int head_size, + const BFloat16* input, BFloat16* output, cudaStream_t stream, const int max_threads_per_block); + template Status ConcatPastToPresent(int batch_size, int num_heads, int qk_head_size, int v_head_size, int sequence_length, int total_sequence_length, diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_transpose.cu 
b/onnxruntime/contrib_ops/cuda/bert/attention_transpose.cu index e7177987fa2d1..e5f1ffca251b7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_transpose.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_transpose.cu @@ -393,6 +393,26 @@ Status Transpose_BSNH_to_BNSH(const int batch_size, const int sequence_length, c max_threads_per_block, false, input, output); } +// BxNxSxH => BxSxNxH (BNSH to BSNH) — reverse of Transpose_BSNH_to_BNSH. +// Reuses the existing TransposeCtx kernel which does exactly this transformation. +Status Transpose_BNSH_to_BSNH(const int batch_size, const int sequence_length, const int num_heads, const int head_size, + const float* input, float* output, cudaStream_t stream, const int max_threads_per_block) { + return LaunchTransCtx(stream, sequence_length, batch_size, head_size, num_heads, + max_threads_per_block, false, input, output); +} + +Status Transpose_BNSH_to_BSNH(const int batch_size, const int sequence_length, const int num_heads, const int head_size, + const half* input, half* output, cudaStream_t stream, const int max_threads_per_block) { + return LaunchTransCtx(stream, sequence_length, batch_size, head_size, num_heads, + max_threads_per_block, false, input, output); +} + +Status Transpose_BNSH_to_BSNH(const int batch_size, const int sequence_length, const int num_heads, const int head_size, + const BFloat16* input, BFloat16* output, cudaStream_t stream, const int max_threads_per_block) { + return LaunchTransCtx(stream, sequence_length, batch_size, head_size, num_heads, + max_threads_per_block, false, input, output); +} + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index d55f228ae5c4c..b267f854175b7 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -87,15 +87,12 @@ Status Attention::RunFlashAttention( Tensor* Y, Tensor* present_key, Tensor* present_value, const attention_helper::AttentionParameters& parameters) const { #if USE_FLASH_ATTENTION - ORT_UNUSED_PARAMETER(attn_mask); - ORT_UNUSED_PARAMETER(past_key); - ORT_UNUSED_PARAMETER(past_value); auto& device_prop = GetDeviceProp(); auto cuda_stream = static_cast(context->GetComputeStream()->GetHandle()); const bool is_bf16 = std::is_same::value; const bool is_bsnh = parameters.transpose_output; // 3D inputs → BSNH - // Allocate softmax_lse and accumulation buffers + // --- Common buffer allocation --- size_t softmax_lse_bytes = onnxruntime::flash::get_softmax_lse_size( parameters.q_sequence_length, parameters.batch_size, parameters.q_num_heads); @@ -118,7 +115,49 @@ Status Attention::RunFlashAttention( out_accum_bytes, cuda_stream)); } - // Handle nonpad_kv_seqlen: external KV cache path (opset 24) + // --- Fix 3: Prepare Q in BSNH format (flash always expects Q as BSNH) --- + const void* q_data = Q->Data(); + IAllocatorUniquePtr q_bsnh_buffer; + if (!is_bsnh) { + size_t q_bytes = sizeof(T) * parameters.batch_size * parameters.q_sequence_length * + parameters.q_num_heads * parameters.head_size; + q_bsnh_buffer = GetScratchBuffer(q_bytes, context->GetComputeStream()); + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.head_size, + reinterpret_cast(Q->Data()), + reinterpret_cast(q_bsnh_buffer.get()), + cuda_stream, 
device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.head_size, + Q->Data(), reinterpret_cast(q_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.head_size, + Q->Data(), reinterpret_cast(q_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + q_data = q_bsnh_buffer.get(); + } + + // Flash outputs BSNH. If Y expects BNSH, write to scratch then transpose. + void* out_data = Y->MutableData(); + IAllocatorUniquePtr out_bsnh_buffer; + if (!is_bsnh) { + size_t out_bytes = sizeof(T) * parameters.batch_size * parameters.q_sequence_length * + parameters.q_num_heads * parameters.v_head_size; + out_bsnh_buffer = GetScratchBuffer(out_bytes, context->GetComputeStream()); + out_data = out_bsnh_buffer.get(); + } + + bool present_kv_already_populated = false; + + // --- Path 1: nonpad_kv_seqlen (opset 24 external KV cache) --- if (nonpad_kv_seqlen != nullptr) { ORT_ENFORCE(parameters.past_sequence_length == 0, "RunFlashAttention with nonpad_kv_seqlen requires K/V to be the full cache " @@ -134,14 +173,13 @@ Status Attention::RunFlashAttention( cuda_stream, device_prop.maxThreadsPerBlock)); - // K/V are the full cache in BSNH. No new tokens to append (k=nullptr, v=nullptr). ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache( device_prop, cuda_stream, - const_cast(static_cast(Q->Data())), + const_cast(q_data), const_cast(static_cast(K->Data())), const_cast(static_cast(V->Data())), /*k=*/nullptr, /*v=*/nullptr, - static_cast(Y->MutableData()), + out_data, softmax_lse_buffer.get(), const_cast(static_cast(seqlens_k_buffer.get())), /*rotary_cos=*/nullptr, /*rotary_sin=*/nullptr, @@ -158,8 +196,212 @@ Status Attention::RunFlashAttention( softmax_lse_accum_buffer.get(), out_accum_buffer.get(), /*local_window_size=*/-1, /*is_rotary_interleaved=*/false, /*is_packed_qkv=*/false)); + } + // --- Path 2 (Fix 1): Decode with past KV cache --- + else if (past_key != nullptr) { + ORT_ENFORCE(past_value != nullptr, "past_key requires past_value."); + ORT_ENFORCE(present_key != nullptr && present_value != nullptr, + "present_key/value outputs are required when past_key is provided."); + + // Copy past KV (BNSH) into present buffers (BNSH). Strided copy because + // past has [B, N_kv, past_seq, H] and present has [B, N_kv, total_seq, H]. 
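+    // In cudaMemcpy2DAsync terms: height = B * N_kv rows; each row copies
+    // past_seq * H * sizeof(T) bytes (spitch == width) into a destination
+    // row of pitch total_seq * H * sizeof(T), leaving room for new tokens.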
+ const size_t num_kv_rows = static_cast(parameters.batch_size) * parameters.kv_num_heads; + const size_t past_k_row_bytes = static_cast(parameters.past_sequence_length) * + parameters.head_size * sizeof(T); + const size_t present_k_row_bytes = static_cast(parameters.total_sequence_length) * + parameters.head_size * sizeof(T); + CUDA_RETURN_IF_ERROR(cudaMemcpy2DAsync( + present_key->MutableData(), present_k_row_bytes, + past_key->Data(), past_k_row_bytes, + past_k_row_bytes, num_kv_rows, + cudaMemcpyDeviceToDevice, cuda_stream)); + + const size_t past_v_row_bytes = static_cast(parameters.past_sequence_length) * + parameters.v_head_size * sizeof(T); + const size_t present_v_row_bytes = static_cast(parameters.total_sequence_length) * + parameters.v_head_size * sizeof(T); + CUDA_RETURN_IF_ERROR(cudaMemcpy2DAsync( + present_value->MutableData(), present_v_row_bytes, + past_value->Data(), past_v_row_bytes, + past_v_row_bytes, num_kv_rows, + cudaMemcpyDeviceToDevice, cuda_stream)); + + // seqlens_k = past_sequence_length (count of cached tokens before new input) + auto seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); + std::vector seqlens_k_host(parameters.batch_size, parameters.past_sequence_length); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(seqlens_k_buffer.get(), seqlens_k_host.data(), + parameters.batch_size * sizeof(int), + cudaMemcpyHostToDevice, cuda_stream)); + + // K/V new tokens: mha_fwd_kvcache expects BSNH for k_new/v_new. + // When !is_bsnh (4D BNSH input), transpose new tokens to BSNH. + const void* k_new = K->Data(); + const void* v_new = V->Data(); + IAllocatorUniquePtr k_bsnh_buffer; + IAllocatorUniquePtr v_bsnh_buffer; + if (!is_bsnh) { + size_t k_bytes = sizeof(T) * parameters.batch_size * parameters.kv_sequence_length * + parameters.kv_num_heads * parameters.head_size; + size_t v_bytes = sizeof(T) * parameters.batch_size * parameters.kv_sequence_length * + parameters.kv_num_heads * parameters.v_head_size; + k_bsnh_buffer = GetScratchBuffer(k_bytes, context->GetComputeStream()); + v_bsnh_buffer = GetScratchBuffer(v_bytes, context->GetComputeStream()); + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + reinterpret_cast(K->Data()), + reinterpret_cast(k_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + reinterpret_cast(V->Data()), + reinterpret_cast(v_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), reinterpret_cast(k_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), reinterpret_cast(v_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, 
parameters.head_size, + K->Data(), reinterpret_cast(k_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), reinterpret_cast(v_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + k_new = k_bsnh_buffer.get(); + v_new = v_bsnh_buffer.get(); + } - // Populate present_key/value (BNSH) from external cache K/V (BSNH) + // mha_fwd_kvcache: present_key/value as cache (BNSH), K/V as new tokens (BSNH). + // The kernel appends new tokens at position seqlens_k[b] and attends to all. + ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache( + device_prop, cuda_stream, + const_cast(q_data), + static_cast(present_key->MutableData()), + static_cast(present_value->MutableData()), + const_cast(k_new), const_cast(v_new), + out_data, + softmax_lse_buffer.get(), + static_cast(seqlens_k_buffer.get()), + /*rotary_cos=*/nullptr, /*rotary_sin=*/nullptr, + /*cache_batch_idx=*/nullptr, /*leftpad_k=*/nullptr, + /*head_sink=*/nullptr, /*block_table=*/nullptr, + parameters.batch_size, parameters.q_num_heads, parameters.kv_num_heads, + parameters.head_size, + parameters.q_sequence_length, parameters.total_sequence_length, + parameters.kv_sequence_length, /*rotary_dim=*/0, + parameters.scale, parameters.softcap, + parameters.is_causal, is_bf16, /*use_smooth_softmax=*/false, + /*past_bsnh=*/false, // present cache is BNSH + static_cast(num_splits), + softmax_lse_accum_buffer.get(), out_accum_buffer.get(), + /*local_window_size=*/-1, /*is_rotary_interleaved=*/false, + /*is_packed_qkv=*/false)); + + present_kv_already_populated = true; + } + // --- Path 3 (Fix 2): Bool attention mask (right-padding) --- + else if (attn_mask != nullptr && attn_mask->IsDataType()) { + // Convert bool padding mask → seqlens_k (token count per batch). + // Use mha_fwd_kvcache with seqlens_k for variable-length attention. + auto seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); + size_t mask_dims = attn_mask->Shape().NumDimensions(); + auto dims = attn_mask->Shape().GetDims(); + int64_t mask_dim0 = dims[0]; + int64_t mask_dim1 = mask_dims >= 3 ? dims[1] : 0; + int64_t mask_dim2 = mask_dims >= 4 ? 
dims[2] : 0; + ORT_RETURN_IF_ERROR(LaunchConvertMaskToFlashSeqlensK( + attn_mask->Data(), seqlens_k_buffer.get(), + parameters.batch_size, parameters.total_sequence_length, + static_cast(mask_dims), mask_dim0, mask_dim1, mask_dim2, + cuda_stream, device_prop.maxThreadsPerBlock)); + + ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache( + device_prop, cuda_stream, + const_cast(q_data), + const_cast(static_cast(K->Data())), + const_cast(static_cast(V->Data())), + /*k=*/nullptr, /*v=*/nullptr, + out_data, + softmax_lse_buffer.get(), + static_cast(seqlens_k_buffer.get()), + /*rotary_cos=*/nullptr, /*rotary_sin=*/nullptr, + /*cache_batch_idx=*/nullptr, /*leftpad_k=*/nullptr, + /*head_sink=*/nullptr, /*block_table=*/nullptr, + parameters.batch_size, parameters.q_num_heads, parameters.kv_num_heads, + parameters.head_size, + parameters.q_sequence_length, parameters.kv_sequence_length, + /*seqlen_k_new=*/0, /*rotary_dim=*/0, + parameters.scale, parameters.softcap, + parameters.is_causal, is_bf16, /*use_smooth_softmax=*/false, + /*past_bsnh=*/is_bsnh, + static_cast(num_splits), + softmax_lse_accum_buffer.get(), out_accum_buffer.get(), + /*local_window_size=*/-1, /*is_rotary_interleaved=*/false, + /*is_packed_qkv=*/false)); + } + // --- Path 4: Prompt flash (no past, no mask) --- + else { + ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd( + device_prop, cuda_stream, + const_cast(q_data), + const_cast(static_cast(K->Data())), + const_cast(static_cast(V->Data())), + out_data, + softmax_lse_buffer.get(), + parameters.batch_size, parameters.q_num_heads, parameters.kv_num_heads, + parameters.head_size, + parameters.q_sequence_length, parameters.kv_sequence_length, + parameters.scale, parameters.softcap, + parameters.is_causal, is_bf16, /*use_smooth_softmax=*/false, + static_cast(num_splits), + softmax_lse_accum_buffer.get(), out_accum_buffer.get(), + is_bsnh)); + } + + // --- Fix 3: Transpose output BSNH → BNSH if input was 4D (BNSH) --- + if (!is_bsnh && out_bsnh_buffer != nullptr) { + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.v_head_size, + reinterpret_cast(out_bsnh_buffer.get()), + reinterpret_cast(Y->MutableData()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.v_head_size, + reinterpret_cast(out_bsnh_buffer.get()), + Y->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.v_head_size, + reinterpret_cast(out_bsnh_buffer.get()), + Y->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + } + + // --- Populate present_key/value (BNSH) from K/V (BSNH) --- + // Skip for decode path where mha_fwd_kvcache already populated present buffers. 
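+  // (The decode path wrote past + new KV into present in-place; re-running the
+  // plain K/V copy below would overwrite the cache with only the new tokens.)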
+ if (!present_kv_already_populated) { if (present_key != nullptr && is_bsnh) { if constexpr (std::is_same_v) { ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( @@ -174,6 +416,12 @@ Status Attention::RunFlashAttention( parameters.kv_num_heads, parameters.head_size, K->Data(), present_key->MutableData(), cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), present_key->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); } } if (present_value != nullptr && is_bsnh) { @@ -190,63 +438,14 @@ Status Attention::RunFlashAttention( parameters.kv_num_heads, parameters.v_head_size, V->Data(), present_value->MutableData(), cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), present_value->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); } } - return Status::OK(); - } - - // Note: Flash with past_key is excluded by flash_eligible (requires past_key == nullptr). - // Those cases fall through to unfused attention which handles past concatenation. - - // No past, no nonpad: prompt-only flash attention - ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd( - device_prop, cuda_stream, - const_cast(static_cast(Q->Data())), - const_cast(static_cast(K->Data())), - const_cast(static_cast(V->Data())), - static_cast(Y->MutableData()), - softmax_lse_buffer.get(), - parameters.batch_size, parameters.q_num_heads, parameters.kv_num_heads, - parameters.head_size, - parameters.q_sequence_length, parameters.kv_sequence_length, - parameters.scale, parameters.softcap, - parameters.is_causal, is_bf16, /*use_smooth_softmax=*/false, - static_cast(num_splits), - softmax_lse_accum_buffer.get(), out_accum_buffer.get(), - is_bsnh)); - - // Populate present_key/present_value (BNSH) from K/V (BSNH) for no-past case - if (present_key != nullptr && is_bsnh) { - if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - reinterpret_cast(K->Data()), - reinterpret_cast(present_key->MutableData()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - K->Data(), present_key->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } - } - if (present_value != nullptr && is_bsnh) { - if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - reinterpret_cast(V->Data()), - reinterpret_cast(present_value->MutableData()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - V->Data(), present_value->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - 
} } return Status::OK(); @@ -287,11 +486,50 @@ Status Attention::RunMemoryEfficientAttention( const bool is_bsnh = parameters.transpose_output; const int sm = device_prop.major * 10 + device_prop.minor; - // Q/K/V pointers — MEA expects BSNH format + // Q/K/V pointers — MEA expects BSNH format for Q const void* q_data = Q->Data(); const void* k_data = K->Data(); const void* v_data = V->Data(); + // --- Fix 3: Transpose Q from BNSH to BSNH if 4D input --- + IAllocatorUniquePtr q_bsnh_buffer; + if (!is_bsnh) { + size_t q_bytes = sizeof(T) * parameters.batch_size * parameters.q_sequence_length * + parameters.q_num_heads * parameters.head_size; + q_bsnh_buffer = GetScratchBuffer(q_bytes, context->GetComputeStream()); + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.head_size, + reinterpret_cast(Q->Data()), + reinterpret_cast(q_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.head_size, + Q->Data(), reinterpret_cast(q_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.head_size, + Q->Data(), reinterpret_cast(q_bsnh_buffer.get()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + q_data = q_bsnh_buffer.get(); + } + + // MEA output is BSNH. If Y expects BNSH, write to scratch then transpose. + void* out_data = Y->MutableData(); + IAllocatorUniquePtr out_bsnh_buffer; + if (!is_bsnh) { + size_t out_bytes = sizeof(T) * parameters.batch_size * parameters.q_sequence_length * + parameters.q_num_heads * parameters.v_head_size; + out_bsnh_buffer = GetScratchBuffer(out_bytes, context->GetComputeStream()); + out_data = out_bsnh_buffer.get(); + } + // GQA head expansion: MEA requires matching num_heads for Q/K/V. // When q_num_heads != kv_num_heads, expand K/V via LaunchUngroup. const bool is_gqa = parameters.q_num_heads != parameters.kv_num_heads; @@ -384,7 +622,56 @@ Status Attention::RunMemoryEfficientAttention( p.value = v_data; p.attn_bias = nullptr; p.stream = cuda_stream; - p.output = Y->MutableData(); + p.output = out_data; + + IAllocatorUniquePtr workspace_buffer; + if (onnxruntime::contrib::cuda::MemoryEfficientAttentionParams::need_workspace( + parameters.v_head_size, sizeof(T) == sizeof(float))) { + size_t workspace_bytes = sizeof(float) * parameters.batch_size * parameters.q_sequence_length * + parameters.q_num_heads * parameters.v_head_size; + workspace_buffer = GetScratchBuffer(workspace_bytes, context->GetComputeStream()); + p.workspace = workspace_buffer.get(); + } else { + p.workspace = nullptr; + } + onnxruntime::contrib::cuda::run_memory_efficient_attention(p); + } + // --- Fix 2: Bool attention mask → seqlens_k with custom right padding --- + else if (attn_mask != nullptr && attn_mask->IsDataType()) { + auto seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); + size_t mask_dims = attn_mask->Shape().NumDimensions(); + auto dims = attn_mask->Shape().GetDims(); + int64_t mask_dim0 = dims[0]; + int64_t mask_dim1 = mask_dims >= 3 ? dims[1] : 0; + int64_t mask_dim2 = mask_dims >= 4 ? 
dims[2] : 0; + ORT_RETURN_IF_ERROR(LaunchConvertMaskToFlashSeqlensK( + attn_mask->Data(), seqlens_k_buffer.get(), + parameters.batch_size, parameters.total_sequence_length, + static_cast(mask_dims), mask_dim0, mask_dim1, mask_dim2, + cuda_stream, device_prop.maxThreadsPerBlock)); + + onnxruntime::contrib::cuda::MemoryEfficientAttentionParams p; + p.sm = sm; + p.is_half = std::is_same::value; + p.is_bf16 = std::is_same::value; + p.is_kv_bsnh = is_bsnh; + p.batch_size = parameters.batch_size; + p.num_heads = parameters.q_num_heads; + p.sequence_length = parameters.q_sequence_length; + p.kv_sequence_length = parameters.total_sequence_length; + p.max_sequence_length = parameters.total_sequence_length; + p.qk_head_size = parameters.head_size; + p.v_head_size = parameters.v_head_size; + p.causal = parameters.is_causal; + p.scale = parameters.scale; + p.seqlen_k_ptr = seqlens_k_buffer.get(); + p.has_custom_right_padding = true; + p.query = q_data; + p.key = k_data; + p.value = v_data; + p.attn_bias = nullptr; + p.stream = cuda_stream; + p.output = out_data; IAllocatorUniquePtr workspace_buffer; if (onnxruntime::contrib::cuda::MemoryEfficientAttentionParams::need_workspace( @@ -398,9 +685,10 @@ Status Attention::RunMemoryEfficientAttention( } onnxruntime::contrib::cuda::run_memory_efficient_attention(p); } else { - // Standard MEA path (no nonpad) + // Standard MEA path (no nonpad, no bool mask — float attention bias or no mask) if (attn_mask != nullptr) { if (attn_mask->IsDataType()) { + // This shouldn't be reached (bool masks are handled above) but keep as fallback using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; int64_t num_elements = attn_mask->Shape().Size(); converted_mask_buffer = GetScratchBuffer( @@ -452,7 +740,7 @@ Status Attention::RunMemoryEfficientAttention( p.value = v_data; p.attn_bias = attn_bias_data; p.stream = cuda_stream; - p.output = Y->MutableData(); + p.output = out_data; if (onnxruntime::contrib::cuda::MemoryEfficientAttentionParams::need_workspace( parameters.v_head_size, sizeof(T) == sizeof(float))) { @@ -467,6 +755,32 @@ Status Attention::RunMemoryEfficientAttention( } } + // --- Fix 3: Transpose output BSNH → BNSH if input was 4D (BNSH) --- + if (!is_bsnh && out_bsnh_buffer != nullptr) { + if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.v_head_size, + reinterpret_cast(out_bsnh_buffer.get()), + reinterpret_cast(Y->MutableData()), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else if constexpr (std::is_same_v) { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.v_head_size, + reinterpret_cast(out_bsnh_buffer.get()), + Y->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.v_head_size, + reinterpret_cast(out_bsnh_buffer.get()), + Y->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); + } + } + // Populate present_key/present_value (BNSH) if requested if (present_key != nullptr && is_bsnh) { if constexpr (std::is_same_v) { @@ -749,8 +1063,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { !has_output_qk && parameters.softcap == 0.0f && parameters.softmax_precision == 
0 && - past_key == nullptr && // Flash with past requires buffer management; use unfused - attn_mask == nullptr; // Flash prompt path does not support attention mask + (attn_mask == nullptr || attn_mask->IsDataType()); // bool masks handled via seqlens_k if (flash_eligible) { return RunFlashAttention(context, Q, K, V, attn_mask, past_key, past_value, diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu index 003856f4ca7ff..cbab99e7c2e3e 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu @@ -31,7 +31,8 @@ __global__ void ConvertMaskToSeqlensKernel( const int mask_dims, const int64_t mask_dim0, const int64_t mask_dim1, - const int64_t mask_dim2) { + const int64_t mask_dim2, + const int seqlen_offset) { int batch_idx = threadIdx.x + blockIdx.x * blockDim.x; if (batch_idx >= batch_size) { return; @@ -93,8 +94,10 @@ __global__ void ConvertMaskToSeqlensKernel( } } - // seqlens_k is total_sequence_length - 1 for GQA convention - seqlens_k[batch_idx] = seq_len - 1; + // seqlens_k output: seq_len + seqlen_offset + // GQA convention (seqlen_offset=-1): stores last valid index (count - 1) + // Flash convention (seqlen_offset=0): stores actual count + seqlens_k[batch_idx] = seq_len + seqlen_offset; } Status LaunchConvertMaskToSeqlensK( @@ -123,7 +126,43 @@ Status LaunchConvertMaskToSeqlensK( mask_dims, mask_dim0, mask_dim1, - mask_dim2); + mask_dim2, + /*seqlen_offset=*/-1); + + return CUDA_CALL(cudaGetLastError()); +} + +// Like LaunchConvertMaskToSeqlensK but stores actual token count (no -1 offset). +// Flash attention's mha_fwd_kvcache and MEA's has_custom_right_padding expect +// seqlens_k = number of valid tokens, not last-valid-index. +Status LaunchConvertMaskToFlashSeqlensK( + const bool* attn_mask_bool, + int* seqlens_k, + int batch_size, + int total_seq_len, + int mask_dims, + int64_t mask_dim0, + int64_t mask_dim1, + int64_t mask_dim2, + cudaStream_t stream, + int max_threads_per_block) { + if (batch_size == 0 || total_seq_len == 0) { + return Status::OK(); + } + + int threads = std::min(batch_size, max_threads_per_block); + int blocks = (batch_size + threads - 1) / threads; + + ConvertMaskToSeqlensKernel<<>>( + attn_mask_bool, + seqlens_k, + batch_size, + total_seq_len, + mask_dims, + mask_dim0, + mask_dim1, + mask_dim2, + /*seqlen_offset=*/0); return CUDA_CALL(cudaGetLastError()); } diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h index b2f4972acfa7f..92e3180cfe0f6 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h @@ -50,6 +50,20 @@ Status LaunchConvertMaskToSeqlensK( cudaStream_t stream, int max_threads_per_block); +// Like LaunchConvertMaskToSeqlensK but stores actual token count (no -1 offset). +// Flash attention and MEA custom right padding expect count, not last-valid-index. +Status LaunchConvertMaskToFlashSeqlensK( + const bool* attn_mask_bool, + int* seqlens_k, + int batch_size, + int total_seq_len, + int mask_dims, + int64_t mask_dim0, + int64_t mask_dim1, + int64_t mask_dim2, + cudaStream_t stream, + int max_threads_per_block); + // Convert a boolean attention mask to an additive attention bias for the MHA path. // Maps true -> 0.0 (attend) and false -> mask_filter_value (mask out). // The output has the same shape as the input mask. 
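
For reference, the two seqlens_k conventions used above differ only by a fixed
offset over the same right-padding row; a minimal Python sketch (hypothetical
helper name, assuming a contiguous True-then-False mask row):

    def seqlens_k(mask_row, seqlen_offset):
        count = sum(mask_row)           # number of valid (True) tokens
        return count + seqlen_offset

    row = [True, True, True, False, False]
    assert seqlens_k(row, -1) == 2      # GQA convention: index of last valid token
    assert seqlens_k(row, 0) == 3       # flash convention: valid token count

Likewise, the BNSH <-> BSNH transposes added for Fix 3 are pure axis swaps; an
illustrative numpy shape check (not the kernel implementation):

    import numpy as np

    x = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)      # [B, N, S, H]
    assert x.transpose(0, 2, 1, 3).shape == (2, 4, 3, 5)  # [B, S, N, H]
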
From f198af22525ea6947b4e62d96e0527959c35c9b7 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 20:34:28 +0000 Subject: [PATCH 05/35] Fix all-false mask crash in ConvertMaskToSeqlensKernel Replace CUDA_KERNEL_ASSERT(mask_row[0]) with graceful handling: when mask_row[0] is false (entire row is padding), set seq_len=0 instead of crashing the CUDA context. This prevents cascade failures in 23+ tests. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../providers/cuda/llm/attention_mask_impl.cu | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu index cbab99e7c2e3e..32625ae5cdd68 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu @@ -73,24 +73,27 @@ __global__ void ConvertMaskToSeqlensKernel( mask_row = attn_mask + effective_batch * batch_stride + h_idx * head_stride + q_idx * q_stride; } - // Validate that mask starts with True (right-padding convention) - CUDA_KERNEL_ASSERT(mask_row[0]); // mask must start with True - // Find the first False (where padding starts) // All elements before this should be True, all after should be False - int seq_len = total_seq_len; // Default: all True (no padding) - bool found_first_false = false; - - for (int i = 1; i < total_seq_len; ++i) { - bool current = mask_row[i]; - - if (!found_first_false && !current) { - // Found first False - this is where padding starts - seq_len = i; - found_first_false = true; - } else if (found_first_false && current) { - // Found True after False - mask is not contiguous (invalid) - CUDA_KERNEL_ASSERT(false); // mask must be contiguous (no True after False) + int seq_len; + if (!mask_row[0]) { + // Entire row is padding (all-false mask) + seq_len = 0; + } else { + seq_len = total_seq_len; // Default: all True (no padding) + bool found_first_false = false; + + for (int i = 1; i < total_seq_len; ++i) { + bool current = mask_row[i]; + + if (!found_first_false && !current) { + // Found first False - this is where padding starts + seq_len = i; + found_first_false = true; + } else if (found_first_false && current) { + // Found True after False - mask is not contiguous (invalid) + CUDA_KERNEL_ASSERT(false); // mask must be contiguous (no True after False) + } } } From 9eb11e4b08285e41e88dfca91307d71957209f1d Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 20:35:36 +0000 Subject: [PATCH 06/35] Replace host memcpy with device-side fill for CUDA graph capture The decode path used std::vector + cudaMemcpyAsync(H->D) to fill seqlens_k_buffer, which breaks CUDA graph capture (unpinned host memory) and has a potential lifetime issue (stack vector destroyed before async copy completes). Add FillInt32Kernel to attention_mask_impl.cu and use it instead. This is entirely device-side and CUDA-graph-capturable. Also remove now-unused include from attention.cc. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cuda/llm/attention.cc | 9 ++++---- .../providers/cuda/llm/attention_mask_impl.cu | 22 +++++++++++++++++++ .../providers/cuda/llm/attention_mask_impl.h | 4 ++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index b267f854175b7..f92b6ab5a7e86 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include #include "core/providers/cuda/cuda_common.h" #include "core/providers/cpu/llm/attention.h" #include "core/providers/cpu/llm/attention_helper.h" @@ -227,11 +226,11 @@ Status Attention::RunFlashAttention( cudaMemcpyDeviceToDevice, cuda_stream)); // seqlens_k = past_sequence_length (count of cached tokens before new input) + // Use device-side fill instead of host vector + cudaMemcpyAsync for CUDA graph capture. auto seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); - std::vector seqlens_k_host(parameters.batch_size, parameters.past_sequence_length); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(seqlens_k_buffer.get(), seqlens_k_host.data(), - parameters.batch_size * sizeof(int), - cudaMemcpyHostToDevice, cuda_stream)); + ORT_RETURN_IF_ERROR(LaunchFillInt32(seqlens_k_buffer.get(), parameters.past_sequence_length, + parameters.batch_size, cuda_stream, + device_prop.maxThreadsPerBlock)); // K/V new tokens: mha_fwd_kvcache expects BSNH for k_new/v_new. // When !is_bsnh (4D BNSH input), transpose new tokens to BSNH. diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu index 32625ae5cdd68..df64726fb8a0d 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu @@ -355,5 +355,27 @@ template Status LaunchConvertNonpadKvSeqlenToAttentionBias<__half>( template Status LaunchConvertNonpadKvSeqlenToAttentionBias<__nv_bfloat16>( const int64_t*, __nv_bfloat16*, int, int, int, float, cudaStream_t, int); +// Simple kernel to fill an int32 buffer with a constant value on device. +// Used for CUDA-graph-capturable seqlens_k initialization (no host memory). 
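+// The launch config below covers all 'count' elements (with an index guard),
+// so no grid-stride loop is needed.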
+__global__ void FillInt32Kernel(int* __restrict__ output, const int value, const int count) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < count) { + output[idx] = value; + } +} + +Status LaunchFillInt32(int* output, int value, int count, cudaStream_t stream, int max_threads_per_block) { + if (count == 0) { + return Status::OK(); + } + + int threads = std::min(count, max_threads_per_block); + int blocks = (count + threads - 1) / threads; + + FillInt32Kernel<<>>(output, value, count); + + return CUDA_CALL(cudaGetLastError()); +} + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h index 92e3180cfe0f6..6cce5f2ae5753 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h @@ -116,5 +116,9 @@ Status LaunchConvertNonpadKvSeqlenToAttentionBias( cudaStream_t stream, int max_threads_per_block); +// Fill an int32 buffer with a constant value entirely on device. +// CUDA-graph-capturable alternative to host vector + cudaMemcpyAsync. +Status LaunchFillInt32(int* output, int value, int count, cudaStream_t stream, int max_threads_per_block); + } // namespace cuda } // namespace onnxruntime From 16e8b650003e8c57f1350a63bc08b098d056eda6 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 20:36:36 +0000 Subject: [PATCH 07/35] Handle bool mask in decode path to support variable padding The decode path (Path 2, past_key != nullptr) previously filled seqlens_k uniformly with past_sequence_length, silently ignoring any bool attention mask. This caused incorrect results when decode batches have different padding lengths. Now check for bool mask in the decode path: if present, use LaunchConvertMaskToFlashSeqlensK to derive per-batch seqlens_k from the mask. Fall back to uniform fill only when no bool mask. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cuda/llm/attention.cc | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index f92b6ab5a7e86..afa7350211028 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -225,12 +225,26 @@ Status Attention::RunFlashAttention( past_v_row_bytes, num_kv_rows, cudaMemcpyDeviceToDevice, cuda_stream)); - // seqlens_k = past_sequence_length (count of cached tokens before new input) - // Use device-side fill instead of host vector + cudaMemcpyAsync for CUDA graph capture. + // seqlens_k: derive per-batch sequence lengths for the KV cache. + // When a bool mask is present, use it to get per-batch lengths (handles variable padding). + // Otherwise, fill uniformly with past_sequence_length. auto seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); - ORT_RETURN_IF_ERROR(LaunchFillInt32(seqlens_k_buffer.get(), parameters.past_sequence_length, - parameters.batch_size, cuda_stream, - device_prop.maxThreadsPerBlock)); + if (attn_mask != nullptr && attn_mask->IsDataType()) { + size_t mask_dims = attn_mask->Shape().NumDimensions(); + auto dims = attn_mask->Shape().GetDims(); + int64_t mask_dim0 = dims[0]; + int64_t mask_dim1 = mask_dims >= 3 ? dims[1] : 0; + int64_t mask_dim2 = mask_dims >= 4 ? 
dims[2] : 0; + ORT_RETURN_IF_ERROR(LaunchConvertMaskToFlashSeqlensK( + attn_mask->Data(), seqlens_k_buffer.get(), + parameters.batch_size, parameters.total_sequence_length, + static_cast(mask_dims), mask_dim0, mask_dim1, mask_dim2, + cuda_stream, device_prop.maxThreadsPerBlock)); + } else { + ORT_RETURN_IF_ERROR(LaunchFillInt32(seqlens_k_buffer.get(), parameters.past_sequence_length, + parameters.batch_size, cuda_stream, + device_prop.maxThreadsPerBlock)); + } // K/V new tokens: mha_fwd_kvcache expects BSNH for k_new/v_new. // When !is_bsnh (4D BNSH input), transpose new tokens to BSNH. From d3b49680bb40d9ecbcbe73934f58c2cab65aa4dc Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 20:44:10 +0000 Subject: [PATCH 08/35] Fix NaN output for all-false bool masks in MEA path The MEA bool mask path used custom_right_padding with seqlens_k derived from the mask. When all mask entries are false, seqlens_k=0 for all batches, causing empty softmax -> NaN output. Remove the MEA-specific bool->seqlens path and let bool masks fall through to the standard additive bias conversion (true->0.0, false->mask_filter_value). This correctly produces uniform softmax weights for all-false masks (matching CPU behavior) and also handles partial padding correctly via the extreme bias values. Flash attention retains its bool->seqlens path since it doesn't support attention bias natively. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cuda/llm/attention.cc | 58 +++---------------- 1 file changed, 7 insertions(+), 51 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index afa7350211028..424d3253ddde5 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -649,59 +649,15 @@ Status Attention::RunMemoryEfficientAttention( } onnxruntime::contrib::cuda::run_memory_efficient_attention(p); } - // --- Fix 2: Bool attention mask → seqlens_k with custom right padding --- - else if (attn_mask != nullptr && attn_mask->IsDataType()) { - auto seqlens_k_buffer = GetScratchBuffer(parameters.batch_size, context->GetComputeStream()); - size_t mask_dims = attn_mask->Shape().NumDimensions(); - auto dims = attn_mask->Shape().GetDims(); - int64_t mask_dim0 = dims[0]; - int64_t mask_dim1 = mask_dims >= 3 ? dims[1] : 0; - int64_t mask_dim2 = mask_dims >= 4 ? 
dims[2] : 0; - ORT_RETURN_IF_ERROR(LaunchConvertMaskToFlashSeqlensK( - attn_mask->Data(), seqlens_k_buffer.get(), - parameters.batch_size, parameters.total_sequence_length, - static_cast(mask_dims), mask_dim0, mask_dim1, mask_dim2, - cuda_stream, device_prop.maxThreadsPerBlock)); - - onnxruntime::contrib::cuda::MemoryEfficientAttentionParams p; - p.sm = sm; - p.is_half = std::is_same::value; - p.is_bf16 = std::is_same::value; - p.is_kv_bsnh = is_bsnh; - p.batch_size = parameters.batch_size; - p.num_heads = parameters.q_num_heads; - p.sequence_length = parameters.q_sequence_length; - p.kv_sequence_length = parameters.total_sequence_length; - p.max_sequence_length = parameters.total_sequence_length; - p.qk_head_size = parameters.head_size; - p.v_head_size = parameters.v_head_size; - p.causal = parameters.is_causal; - p.scale = parameters.scale; - p.seqlen_k_ptr = seqlens_k_buffer.get(); - p.has_custom_right_padding = true; - p.query = q_data; - p.key = k_data; - p.value = v_data; - p.attn_bias = nullptr; - p.stream = cuda_stream; - p.output = out_data; - - IAllocatorUniquePtr workspace_buffer; - if (onnxruntime::contrib::cuda::MemoryEfficientAttentionParams::need_workspace( - parameters.v_head_size, sizeof(T) == sizeof(float))) { - size_t workspace_bytes = sizeof(float) * parameters.batch_size * parameters.q_sequence_length * - parameters.q_num_heads * parameters.v_head_size; - workspace_buffer = GetScratchBuffer(workspace_bytes, context->GetComputeStream()); - p.workspace = workspace_buffer.get(); - } else { - p.workspace = nullptr; - } - onnxruntime::contrib::cuda::run_memory_efficient_attention(p); - } else { - // Standard MEA path (no nonpad, no bool mask — float attention bias or no mask) + // Standard MEA path: float attention bias, bool mask (converted to bias), or no mask. + // Bool masks are converted to additive attention bias (true→0, false→mask_filter_value) + // which correctly handles all-false masks (uniform softmax weights) unlike the + // custom_right_padding seqlens approach which would produce NaN. + else { if (attn_mask != nullptr) { if (attn_mask->IsDataType()) { - // This shouldn't be reached (bool masks are handled above) but keep as fallback + // Convert bool mask to additive attention bias (true→0.0, false→mask_filter_value). + // This handles all-false masks correctly (uniform softmax weights from extreme bias). using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; int64_t num_elements = attn_mask->Shape().Size(); converted_mask_buffer = GetScratchBuffer( From e0760a85d5bd5d17b72fc80587f48cfd1365d010 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 20:55:43 +0000 Subject: [PATCH 09/35] Fix cutlass FMHA bias alignment crash for unaligned kv_sequence_length On Sm80+ with fp32 (and Sm75+ with fp16/bf16), the cutlass FMHA kernel requires bias_strideM to be 4-element aligned even in the 'unaligned' kernel path (kMinimumAlignment=4 for TensorOp). When kv_sequence_length is not divisible by 4, both aligned and unaligned kernel variants fail the check_supported assertion. Two-level fix: 1. fmha_launch_template.h: Strengthen DispatchIsAligned to check all three bias strides (strideM, strideH, strideB) against kAlignmentQ, mirroring the exact checks in AttentionKernel::check_supported. 2. attention.cc: Add MEA eligibility check that skips memory-efficient attention when bias strides cannot satisfy the kernel's minimum alignment requirement, falling back to unfused attention instead. 
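
For a concrete instance: the failing tests use kv_sequence_length = 6, so
bias_strideM = 6 cannot satisfy kMinimumAlignment = 4 in either kernel variant.
A minimal Python sketch of the resulting eligibility rule (hypothetical helper
name):

    def mea_bias_stride_ok(total_sequence_length, min_bias_align):
        # bias_strideM equals total_sequence_length once a mask becomes an
        # additive bias; it must be a multiple of the minimum alignment.
        return total_sequence_length % min_bias_align == 0

    assert mea_bias_stride_ok(8, 4)        # MEA stays eligible
    assert not mea_bias_stride_ok(6, 4)    # falls back to unfused attention
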
This fixes Attention4DAttnMask, Attention4DAttnMaskBool, and Attention4DAttnMaskBoolAllFalse test cases where kv_sequence_length=6 (6 % 4 != 0) caused crashes with 4D float and bool attention masks. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bert/cutlass_fmha/fmha_launch_template.h | 15 +++++++++++++-- .../core/providers/cuda/llm/attention.cc | 17 +++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index f720a7fb2f714..6748aa8a29e4e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -258,9 +258,20 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { params.qk_head_size % AlignedAK::kAlignmentK == 0 && params.v_head_size % AlignedAK::kAlignmentV == 0; - // Attention bias stride (kv_sequence_length) must also satisfy alignment requirements. + // Attention bias strides must also satisfy alignment requirements. + // Mirror the checks in AttentionKernel::check_supported to avoid ORT_ENFORCE crashes. if (params.attn_bias != nullptr) { - is_aligned = is_aligned && params.kv_sequence_length % AlignedAK::kAlignmentQ == 0; + int num_keys = params.kv_sequence_length; + int num_queries = params.sequence_length; + int bias_strideM = num_keys; + int bias_strideH = params.broadcast_attn_bias_dim_1 ? 0 : num_queries * num_keys; + int bias_strideB = params.broadcast_attn_bias_dim_0 + ? 0 + : ((params.broadcast_attn_bias_dim_1 ? 1 : params.num_heads) * num_queries * num_keys); + is_aligned = is_aligned && + bias_strideM % AlignedAK::kAlignmentQ == 0 && + (params.num_heads <= 1 || bias_strideH % AlignedAK::kAlignmentQ == 0) && + (params.batch_size <= 1 || bias_strideB % AlignedAK::kAlignmentQ == 0); } DISPATCH_BOOL(is_aligned, kIsAligned, ([&]() { diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index 424d3253ddde5..1a4f4ff0e5b1b 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -1054,6 +1054,23 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { parameters.softmax_precision == 0 && past_key == nullptr; + // Cutlass FMHA requires bias strides to satisfy minimum alignment even in the + // "unaligned" kernel path. When an attention mask is present without nonpad_kv_seqlen, + // it becomes an additive bias with bias_strideM = total_sequence_length. Skip MEA if + // this stride can't satisfy the kernel's minimum alignment requirement. 
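+  // (The min_bias_align constants below mirror cutlass kMinimumAlignment for
+  // the corresponding TensorOp configurations.)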
+ if (mea_eligible && attn_mask != nullptr && nonpad_kv_seqlen == nullptr) { + int min_bias_align = 1; + if ((std::is_same::value && sm >= 80) || + (!std::is_same::value && sm >= 75)) { + min_bias_align = 4; // TensorOp on Sm80+ (float) or Sm75+ (fp16/bf16) + } else if (!std::is_same::value && sm >= 70) { + min_bias_align = 2; // TensorOp on Volta (fp16) + } + if (parameters.total_sequence_length % min_bias_align != 0) { + mea_eligible = false; + } + } + if (mea_eligible) { return RunMemoryEfficientAttention(context, Q, K, V, attn_mask, past_key, past_value, nonpad_kv_seqlen, Y, present_key, present_value, parameters); From a827a1b0a4543340d9a171bcf8d2825e8975709d Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 21:11:31 +0000 Subject: [PATCH 10/35] Fix 3D mask test to use consistent per-batch padding semantics The 3D boolean mask [H, q_seq, kv_seq] broadcasts across batches, so it can only represent one padding pattern. The test was using batch 0's seqlen for the 3D mask but per-batch seqlens for the reference comparison, causing a mismatch for batches with different padding amounts. Fix: use effective_seqlens (batch 0's pattern for all batches) when the mask is 3D, so the reference K/V zeroing, key_padding_mask, and output comparison all reflect the actual mask semantics. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../transformers/test_onnx_attention/test_gqa.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index 14a0a91487a08..685f61155e085 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -388,9 +388,17 @@ def parity_check_gqa_prompt_with_padding( ) v = torch.randn_like(k) * std + # 3D masks broadcast across batches (no batch dimension), so they can only + # represent one padding pattern. The mask uses batch 0's seqlen for all batches. + # Adjust effective_seqlens so the reference comparison matches the actual mask. 
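+    # e.g. per-batch seqlens = [2, 8] behave as [2, 2] under a 3D mask, since
+    # every batch replays batch 0's padding pattern.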
+ if config.attn_mask_dims == 3: + effective_seqlens = torch.full_like(seqlens, seqlens[0].item()) + else: + effective_seqlens = seqlens + # Zero out padded positions in K, V for proper comparison for b in range(config.batch_size): - valid_len = seqlens[b].item() + valid_len = effective_seqlens[b].item() if valid_len < config.kv_sequence_length: k[b, valid_len:, :, :] = 0 v[b, valid_len:, :, :] = 0 @@ -405,7 +413,7 @@ def parity_check_gqa_prompt_with_padding( ) key_padding_mask = create_boolean_mask_from_seqlens( - seqlens=seqlens, + seqlens=effective_seqlens, total_seq_len=config.kv_sequence_length, mask_dims=2, device=device, @@ -437,7 +445,7 @@ def parity_check_gqa_prompt_with_padding( # --- Comparison --- for b in range(config.batch_size): - valid_len = seqlens[b].item() + valid_len = effective_seqlens[b].item() if valid_len < config.q_sequence_length: out[b, valid_len:, :, :] = 0 out_ref[b, valid_len:, :, :] = 0 From 9a0b546e6abe2e44ba4345cfbdbe74e962442053 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 21:23:35 +0000 Subject: [PATCH 11/35] Remove 11 redundant GQA tests from test_gqa.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove 8 MEA past tests (GQA+past_key is ineligible for MEA since ComputeInternal requires past_key==nullptr; these always ran through flash regardless of ORT_DISABLE_FLASH_ATTENTION): - 4x TestONNXAttentionMemoryEfficientGQA.test_gqa_past_memory_efficient - 4x TestONNXAttentionMemoryEfficientGQABF16.test_gqa_past_memory_efficient_bf16 Remove 3 flash prompt padding tests (prompt+bool_mask is ineligible for flash; these duplicated the MEA prompt padding tests): - 3x TestONNXAttentionPaddingMaskGQA.test_gqa_prompt_padding_flash 62 → 51 tests, all passing. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../test_onnx_attention/test_gqa.py | 64 ++----------------- 1 file changed, 6 insertions(+), 58 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index 685f61155e085..60cba2b2dbd70 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -822,42 +822,9 @@ def test_gqa_prompt_memory_efficient(self, name, config): atol=atol["fp16"], ) - @parameterized.expand(gqa_past_test_cases()) - def test_gqa_past_memory_efficient(self, name, config): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" - parity_check_gqa_past( - config=config, - ep="CUDAExecutionProvider", - device="cuda", - torch_type=torch.float16, - ort_type=TensorProto.FLOAT16, - causal=True, - rtol=rtol["fp16"], - atol=atol["fp16"], - ) - - -@unittest.skipIf(not has_cuda_device(80), "BF16 requires Ampere or higher GPU, skipping tests.") -class TestONNXAttentionMemoryEfficientGQABF16(unittest.TestCase): - """Test ONNX Attention op (opset 23) GQA path with Memory Efficient Attention using BFloat16.""" - - @parameterized.expand(gqa_past_test_cases()) - def test_gqa_past_memory_efficient_bf16(self, name, config): - if not torch.cuda.is_bf16_supported(): - self.skipTest("BFloat16 not supported on this device") - - config.kv_cache_type = "bfloat16" - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" - parity_check_gqa_past( - config=config, - ep="CUDAExecutionProvider", - device="cuda", - torch_type=torch.bfloat16, - ort_type=TensorProto.BFLOAT16, - causal=True, - rtol=rtol["bf16"], - atol=atol["bf16"], - ) + # 
Note: GQA past tests removed — MEA is ineligible when past_key is present
+    # (ComputeInternal requires past_key == nullptr for MEA). GQA past goes through
+    # flash attention regardless of ORT_DISABLE_FLASH_ATTENTION.


 @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.")
@@ -868,29 +835,10 @@ class TestONNXAttentionPaddingMaskGQA(unittest.TestCase):

     These tests verify that the boolean attn_mask is correctly converted to
     sequence lengths on GPU and that the attention computation respects the padding.
     Tests cover 2D, 3D, and 4D mask shapes.
-    """
-
-    @parameterized.expand(gqa_prompt_padding_test_cases())
-    def test_gqa_prompt_padding_flash(self, name, config):
-        """Test prompt phase with padding mask using Flash Attention."""
-        os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0"
-
-        seqlens = torch.tensor(
-            [config.kv_sequence_length - 6, config.kv_sequence_length],
-            dtype=torch.int32,
-            device="cuda",
-        )
-        parity_check_gqa_prompt_with_padding(
-            config=config,
-            seqlens=seqlens,
-            ep="CUDAExecutionProvider",
-            device="cuda",
-            torch_type=torch.float16,
-            ort_type=TensorProto.FLOAT16,
-            rtol=rtol["fp16"],
-            atol=atol["fp16"],
-        )
+
+    Note: prompt+bool_mask is ineligible for flash (routed to MEA), so prompt
+    padding tests live in TestONNXAttentionPaddingMaskMemoryEfficientGQA only.
+    """

     @parameterized.expand(gqa_past_padding_test_cases())
     def test_gqa_past_padding_flash(self, name, config):

From 939a08cff48e588eae9899794eaec438485a44ef Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang
Date: Wed, 4 Mar 2026 21:23:45 +0000
Subject: [PATCH 12/35] Fix padding mask bugs: zero present buffers, decode
 offset, MEA 2D expansion

Four root causes fixed:

1. Present KV buffers not zeroed before strided copy (flaky decode):
   mha_fwd_kvcache reads positions [0, seqlens_k+new-1] in the cache.
   Positions beyond past_seq contained stale data from the allocator,
   causing non-deterministic results. Fix: zero present_key/value before
   cudaMemcpy2DAsync, matching the GQA contrib op pattern.

2. Flash decode seqlens_k offset (decode mask off-by-kv_seq):
   Bool mask encodes total valid tokens, but mha_fwd_kvcache expects the
   pre-append count. Added configurable seqlen_offset parameter to
   LaunchConvertMaskToFlashSeqlensK; decode path uses -kv_sequence_length.

3. MEA 2D mask stride mismatch (prompt mask garbage):
   MEA hardcodes bias_strideM=kv_seq, so a 2D mask [B, kv_seq] with
   broadcast_dim_0=true would read across batch boundaries instead of
   replaying the same mask per query. Fix: expand 2D masks to
   [B, 1, q_seq, kv_seq] before passing to MEA.

4. Flash prompt+bool mask excluded from flash_eligible:
   mha_fwd_kvcache causal semantics are decode-oriented (window offset by
   seqlens_k), wrong for standard lower-triangular prompt causal. Route
   prompt+bool mask to MEA instead.
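For illustration, the fix-3 expansion is equivalent to this numpy sketch
(hypothetical shapes, not part of the patch; numpy stands in for the CUDA
buffers): MEA reads each query row at a fixed stride of kv_seq elements, so
the 2D bias must be materialized once per q row.

    import numpy as np

    B, Q, KV = 2, 2, 3
    bias2d = np.arange(B * KV, dtype=np.float32).reshape(B, KV)  # [B, kv_seq]
    # Expand [B, kv_seq] -> [B, q_seq, kv_seq], then flatten as MEA reads it.
    expanded = np.repeat(bias2d[:, None, :], Q, axis=1).reshape(-1)

    for b in range(B):
        for q in range(Q):
            row = expanded[(b * Q + q) * KV : (b * Q + q + 1) * KV]
            assert (row == bias2d[b]).all()  # each q replays batch b's mask
    # Without expansion, row (b=0, q=1) would start at flat offset KV in the
    # original 2D buffer and read bias2d[1], the next batch's row.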
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../core/providers/cuda/llm/attention.cc           | 104 ++++++++++--------
 .../providers/cuda/llm/attention_mask_impl.cu      |  10 +-
 .../providers/cuda/llm/attention_mask_impl.h       |  13 ++-
 3 files changed, 72 insertions(+), 55 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc
index 1a4f4ff0e5b1b..9587a64352b68 100644
--- a/onnxruntime/core/providers/cuda/llm/attention.cc
+++ b/onnxruntime/core/providers/cuda/llm/attention.cc
@@ -202,9 +202,20 @@ Status Attention<T>::RunFlashAttention(
     ORT_ENFORCE(present_key != nullptr && present_value != nullptr,
                 "present_key/value outputs are required when past_key is provided.");

+    // Zero present buffers before strided copy to avoid stale data in positions
+    // beyond past_seq that mha_fwd_kvcache might read during attention (matching GQA pattern).
+    const size_t num_kv_rows = static_cast<size_t>(parameters.batch_size) * parameters.kv_num_heads;
+    const size_t present_k_bytes = num_kv_rows * parameters.total_sequence_length *
+                                   parameters.head_size * sizeof(T);
+    const size_t present_v_bytes = num_kv_rows * parameters.total_sequence_length *
+                                   parameters.v_head_size * sizeof(T);
+    CUDA_RETURN_IF_ERROR(cudaMemsetAsync(present_key->MutableData<T>(), 0,
+                                         present_k_bytes, cuda_stream));
+    CUDA_RETURN_IF_ERROR(cudaMemsetAsync(present_value->MutableData<T>(), 0,
+                                         present_v_bytes, cuda_stream));
+
     // Copy past KV (BNSH) into present buffers (BNSH). Strided copy because
     // past has [B, N_kv, past_seq, H] and present has [B, N_kv, total_seq, H].
-    const size_t num_kv_rows = static_cast<size_t>(parameters.batch_size) * parameters.kv_num_heads;
     const size_t past_k_row_bytes = static_cast<size_t>(parameters.past_sequence_length) *
                                     parameters.head_size * sizeof(T);
     const size_t present_k_row_bytes = static_cast<size_t>(parameters.total_sequence_length) *
@@ -226,8 +237,9 @@
                                          cudaMemcpyDeviceToDevice, cuda_stream));

     // seqlens_k: derive per-batch sequence lengths for the KV cache.
-    // When a bool mask is present, use it to get per-batch lengths (handles variable padding).
-    // Otherwise, fill uniformly with past_sequence_length.
+    // mha_fwd_kvcache expects seqlens_k = tokens already in cache BEFORE appending new ones.
+    // When a bool mask is present, it encodes the total valid token count (past + new).
+    // Subtract kv_sequence_length to get the pre-append count.
     auto seqlens_k_buffer = GetScratchBuffer<int>(parameters.batch_size, context->GetComputeStream());
     if (attn_mask != nullptr && attn_mask->IsDataType<bool>()) {
       size_t mask_dims = attn_mask->Shape().NumDimensions();
@@ -235,11 +247,13 @@
       int64_t mask_dim0 = dims[0];
       int64_t mask_dim1 = mask_dims >= 3 ? dims[1] : 0;
       int64_t mask_dim2 = mask_dims >= 4 ? dims[2] : 0;
+      // Offset: mask gives total valid count, subtract kv_sequence_length for pre-append count.
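+      // Worked example (hypothetical sizes): past_seq = 8, kv_sequence_length = 1,
+      // mask marks 7 of the 9 total positions valid -> num_true_tokens = 7 and
+      // seqlens_k[b] = 7 - 1 = 6 tokens already in the cache before the append.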
+      int seqlen_offset = -parameters.kv_sequence_length;
       ORT_RETURN_IF_ERROR(LaunchConvertMaskToFlashSeqlensK(
           attn_mask->Data<bool>(), seqlens_k_buffer.get(),
           parameters.batch_size, parameters.total_sequence_length,
           static_cast<int>(mask_dims), mask_dim0, mask_dim1, mask_dim2,
-          cuda_stream, device_prop.maxThreadsPerBlock));
+          cuda_stream, device_prop.maxThreadsPerBlock, seqlen_offset));
     } else {
       ORT_RETURN_IF_ERROR(LaunchFillInt32(seqlens_k_buffer.get(), parameters.past_sequence_length,
                                           parameters.batch_size, cuda_stream,
@@ -327,47 +341,8 @@
       present_kv_already_populated = true;
     }
-    // --- Path 3 (Fix 2): Bool attention mask (right-padding) ---
-    else if (attn_mask != nullptr && attn_mask->IsDataType<bool>()) {
-      // Convert bool padding mask → seqlens_k (token count per batch).
-      // Use mha_fwd_kvcache with seqlens_k for variable-length attention.
-      auto seqlens_k_buffer = GetScratchBuffer<int>(parameters.batch_size, context->GetComputeStream());
-      size_t mask_dims = attn_mask->Shape().NumDimensions();
-      auto dims = attn_mask->Shape().GetDims();
-      int64_t mask_dim0 = dims[0];
-      int64_t mask_dim1 = mask_dims >= 3 ? dims[1] : 0;
-      int64_t mask_dim2 = mask_dims >= 4 ? dims[2] : 0;
-      ORT_RETURN_IF_ERROR(LaunchConvertMaskToFlashSeqlensK(
-          attn_mask->Data<bool>(), seqlens_k_buffer.get(),
-          parameters.batch_size, parameters.total_sequence_length,
-          static_cast<int>(mask_dims), mask_dim0, mask_dim1, mask_dim2,
-          cuda_stream, device_prop.maxThreadsPerBlock));
-
-      ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache(
-          device_prop, cuda_stream,
-          const_cast<void*>(q_data),
-          const_cast<void*>(static_cast<const void*>(K->Data<T>())),
-          const_cast<void*>(static_cast<const void*>(V->Data<T>())),
-          /*k=*/nullptr, /*v=*/nullptr,
-          out_data,
-          softmax_lse_buffer.get(),
-          static_cast<int*>(seqlens_k_buffer.get()),
-          /*rotary_cos=*/nullptr, /*rotary_sin=*/nullptr,
-          /*cache_batch_idx=*/nullptr, /*leftpad_k=*/nullptr,
-          /*head_sink=*/nullptr, /*block_table=*/nullptr,
-          parameters.batch_size, parameters.q_num_heads, parameters.kv_num_heads,
-          parameters.head_size,
-          parameters.q_sequence_length, parameters.kv_sequence_length,
-          /*seqlen_k_new=*/0, /*rotary_dim=*/0,
-          parameters.scale, parameters.softcap,
-          parameters.is_causal, is_bf16, /*use_smooth_softmax=*/false,
-          /*past_bsnh=*/is_bsnh,
-          static_cast<int>(num_splits),
-          softmax_lse_accum_buffer.get(), out_accum_buffer.get(),
-          /*local_window_size=*/-1, /*is_rotary_interleaved=*/false,
-          /*is_packed_qkv=*/false));
-    }
-    // --- Path 4: Prompt flash (no past, no mask) ---
+    // --- Path 3: Prompt flash (no past, no mask) ---
+    // Note: prompt with bool mask is handled by MEA (flash_eligible excludes it).
     else {
       ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd(
           device_prop, cuda_stream,
@@ -673,11 +648,40 @@
       attn_bias_data = attn_mask->Data<T>();
     }

-    // Determine broadcast flags
+    // Determine broadcast flags based on bias logical shape [B, num_heads, q_seq, kv_seq].
+    // MEA always uses bias_strideM = kv_seq, so each query row must have kv_seq elements.
+    // For 2D masks [B, kv_seq]: the mask is constant across q positions, so we must
+    // expand to [B, 1, q_seq, kv_seq] by repeating each row q_seq times. Without this,
+    // bias_strideM would walk through batch boundaries instead of replaying the same mask.
     size_t mask_dims = attn_mask->Shape().NumDimensions();
     auto dims = attn_mask->Shape().GetDims();
     if (mask_dims == 2) {
-      broadcast_bias_dim_0 = true;
+      // Expand [B, kv_seq] → [B, 1, q_seq, kv_seq] by repeating each batch's row
+      using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType<T>::type;
+      const int kv_len = parameters.total_sequence_length;
+      const int q_len = parameters.q_sequence_length;
+      int64_t expanded_elements = static_cast<int64_t>(parameters.batch_size) * q_len * kv_len;
+      auto expanded_buffer = GetScratchBuffer<void>(
+          expanded_elements * sizeof(NativeCudaT), context->GetComputeStream());
+      const auto* src = (attn_mask->IsDataType<bool>())
+                            ? reinterpret_cast<const NativeCudaT*>(converted_mask_buffer.get())
+                            : reinterpret_cast<const NativeCudaT*>(attn_mask->Data<T>());
+      auto* dst = reinterpret_cast<NativeCudaT*>(expanded_buffer.get());
+      const size_t row_bytes = static_cast<size_t>(kv_len) * sizeof(NativeCudaT);
+      for (int b = 0; b < parameters.batch_size; ++b) {
+        const auto* src_row = src + static_cast<int64_t>(b) * kv_len;
+        auto* dst_base = dst + static_cast<int64_t>(b) * q_len * kv_len;
+        for (int q = 0; q < q_len; ++q) {
+          CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(
+              dst_base + static_cast<int64_t>(q) * kv_len,
+              src_row, row_bytes,
+              cudaMemcpyDeviceToDevice, cuda_stream));
+        }
+      }
+      attn_bias_data = expanded_buffer.get();
+      converted_mask_buffer = std::move(expanded_buffer);
+      // Expanded shape is [B, 1, q_seq, kv_seq]
+      broadcast_bias_dim_0 = false;
       broadcast_bias_dim_1 = true;
     } else if (mask_dims == 3) {
       broadcast_bias_dim_0 = true;
@@ -1032,7 +1036,11 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
       !has_output_qk &&
       parameters.softcap == 0.0f &&
       parameters.softmax_precision == 0 &&
-      (attn_mask == nullptr || attn_mask->IsDataType<bool>());  // bool masks handled via seqlens_k
+      // Bool masks without past_key (prompt) can't use flash because mha_fwd_kvcache's
+      // causal semantics are decode-oriented (window offset by seqlens_k). For causal
+      // prompt with padding, MEA handles it correctly via attention bias conversion.
+      // Flash handles: no mask, decode with past (±mask), nonpad_kv_seqlen.
+      (attn_mask == nullptr || (attn_mask->IsDataType<bool>() && past_key != nullptr));

   if (flash_eligible) {
     return RunFlashAttention(context, Q, K, V, attn_mask, past_key, past_value,

diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
index df64726fb8a0d..d105f3c16eeff 100644
--- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
+++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
@@ -135,9 +135,8 @@ Status LaunchConvertMaskToSeqlensK(
   return CUDA_CALL(cudaGetLastError());
 }

-// Like LaunchConvertMaskToSeqlensK but stores actual token count (no -1 offset).
-// Flash attention's mha_fwd_kvcache and MEA's has_custom_right_padding expect
-// seqlens_k = number of valid tokens, not last-valid-index.
+// Like LaunchConvertMaskToSeqlensK but with a configurable offset.
+// seqlens_k[b] = num_true_tokens + seqlen_offset Status LaunchConvertMaskToFlashSeqlensK( const bool* attn_mask_bool, int* seqlens_k, @@ -148,7 +147,8 @@ Status LaunchConvertMaskToFlashSeqlensK( int64_t mask_dim1, int64_t mask_dim2, cudaStream_t stream, - int max_threads_per_block) { + int max_threads_per_block, + int seqlen_offset) { if (batch_size == 0 || total_seq_len == 0) { return Status::OK(); } @@ -165,7 +165,7 @@ Status LaunchConvertMaskToFlashSeqlensK( mask_dim0, mask_dim1, mask_dim2, - /*seqlen_offset=*/0); + seqlen_offset); return CUDA_CALL(cudaGetLastError()); } diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h index 6cce5f2ae5753..8e34c73d535e2 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h @@ -50,8 +50,16 @@ Status LaunchConvertMaskToSeqlensK( cudaStream_t stream, int max_threads_per_block); -// Like LaunchConvertMaskToSeqlensK but stores actual token count (no -1 offset). +// Like LaunchConvertMaskToSeqlensK but with a configurable offset. // Flash attention and MEA custom right padding expect count, not last-valid-index. +// +// seqlen_offset adjusts the raw token count: +// seqlens_k[b] = num_true_tokens + seqlen_offset +// +// Common offsets: +// 0: actual token count (for prompt with mha_fwd_kvcache, MEA custom right padding) +// -N: subtract N from count (for decode with mha_fwd_kvcache where N=kv_sequence_length, +// giving the number of tokens already in cache BEFORE appending new ones) Status LaunchConvertMaskToFlashSeqlensK( const bool* attn_mask_bool, int* seqlens_k, @@ -62,7 +70,8 @@ Status LaunchConvertMaskToFlashSeqlensK( int64_t mask_dim1, int64_t mask_dim2, cudaStream_t stream, - int max_threads_per_block); + int max_threads_per_block, + int seqlen_offset = 0); // Convert a boolean attention mask to an additive attention bias for the MHA path. // Maps true -> 0.0 (attend) and false -> mask_filter_value (mask out). From 118546ddf5a8f4ad1be3a23170c3f2372e985b11 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 21:24:15 +0000 Subject: [PATCH 13/35] Add TODO for GQA unfused attention fallback Mark the GQA rejection point in the unfused path with a TODO referencing issue #27516 for future implementation. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- onnxruntime/core/providers/cuda/llm/attention.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index 9587a64352b68..65ec1d91f7efd 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -1103,6 +1103,9 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { } if (is_gqa) { + // TODO: Support GQA in unfused attention path for fp32/old-GPU fallback. + // Requires ~160 lines: ExpandKVHeads kernel to replicate KV heads, wiring in unfused dispatch. + // See issue #27516 for tracking. return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "GQA (q_num_heads != kv_num_heads) requires flash or memory efficient attention, " "but neither is eligible. 
Ensure fp16/bf16 on Ampere+ GPU, or check head_size constraints."); From b8ea59e3a1418c77d7432244cfccc08cf604fd74 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 21:43:51 +0000 Subject: [PATCH 14/35] Add TODO comments for GQA+float_mask and 4D present gaps Add TODO markers for two known design limitations: 1. GQA + float_mask + decode has no viable kernel path (flash rejects float masks, MEA rejects past_key, unfused rejects GQA). 2. 4D (BNSH) inputs don't populate present_key/present_value in flash prompt and MEA prompt paths due to is_bsnh guard. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- onnxruntime/core/providers/cuda/llm/attention.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index 65ec1d91f7efd..f6dafb3b628dd 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -389,6 +389,9 @@ Status Attention::RunFlashAttention( // --- Populate present_key/value (BNSH) from K/V (BSNH) --- // Skip for decode path where mha_fwd_kvcache already populated present buffers. + // TODO: 4D (BNSH) inputs don't populate present_key/present_value in prompt paths. + // The is_bsnh guard skips population when input is 4D. Need BNSH→BNSH copy. + // Only flash decode (Path 2) and unfused currently populate present for 4D. if (!present_kv_already_populated) { if (present_key != nullptr && is_bsnh) { if constexpr (std::is_same_v) { @@ -755,6 +758,9 @@ Status Attention::RunMemoryEfficientAttention( } // Populate present_key/present_value (BNSH) if requested + // TODO: 4D (BNSH) inputs don't populate present_key/present_value in prompt paths. + // The is_bsnh guard skips population when input is 4D. Need BNSH→BNSH copy. + // Only flash decode (Path 2) and unfused currently populate present for 4D. if (present_key != nullptr && is_bsnh) { if constexpr (std::is_same_v) { ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( @@ -1106,6 +1112,11 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { // TODO: Support GQA in unfused attention path for fp32/old-GPU fallback. // Requires ~160 lines: ExpandKVHeads kernel to replicate KV heads, wiring in unfused dispatch. // See issue #27516 for tracking. + // + // TODO: GQA + float_mask + decode is not supported by any kernel path. + // Flash rejects float masks, MEA rejects past_key, unfused rejects GQA. + // Fix options: (a) convert float mask to bool in decode path, or + // (b) support float masks in flash decode via mha_fwd_kvcache. return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "GQA (q_num_heads != kv_num_heads) requires flash or memory efficient attention, " "but neither is eligible. Ensure fp16/bf16 on Ampere+ GPU, or check head_size constraints."); From aca1cf8b5f66aa0a76b1510682f41c743cc79aab Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 21:45:30 +0000 Subject: [PATCH 15/35] Add TODO comments for softcap/softmax_precision and output_qk gaps Mark two more unsupported feature gaps with TODO comments: 1. softcap and softmax_precision rejected by all three CUDA kernel paths 2. 
output_qk modes beyond kNone/kQK only supported in unfused path Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- onnxruntime/core/providers/cuda/llm/attention.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index f6dafb3b628dd..7171ffc531065 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -1093,6 +1093,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { #endif // Fallback: unfused attention + // TODO: Support softcap and softmax_precision on CUDA kernels. + // Currently rejected by all three kernel paths (flash, MEA, unfused). if (parameters.softcap != 0.0f) { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "softcap is not supported yet in Attention op (CUDA)."); @@ -1101,6 +1103,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "softmax_precision is not supported yet in Attention op (CUDA)."); } + // TODO: Support additional output_qk modes beyond kNone and kQK. + // Currently only unfused handles output_qk, and only kNone/kQK modes. if (qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kNone && qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kQK) { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, From 09900762c1cd52b86248511566e721504c0953b0 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 21:45:53 +0000 Subject: [PATCH 16/35] Revert "Add TODO comments for GQA+float_mask and 4D present gaps" This reverts commit b8ea59e3a1418c77d7432244cfccc08cf604fd74. --- onnxruntime/core/providers/cuda/llm/attention.cc | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index 7171ffc531065..2b04fcaf7b5ec 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -389,9 +389,6 @@ Status Attention::RunFlashAttention( // --- Populate present_key/value (BNSH) from K/V (BSNH) --- // Skip for decode path where mha_fwd_kvcache already populated present buffers. - // TODO: 4D (BNSH) inputs don't populate present_key/present_value in prompt paths. - // The is_bsnh guard skips population when input is 4D. Need BNSH→BNSH copy. - // Only flash decode (Path 2) and unfused currently populate present for 4D. if (!present_kv_already_populated) { if (present_key != nullptr && is_bsnh) { if constexpr (std::is_same_v) { @@ -758,9 +755,6 @@ Status Attention::RunMemoryEfficientAttention( } // Populate present_key/present_value (BNSH) if requested - // TODO: 4D (BNSH) inputs don't populate present_key/present_value in prompt paths. - // The is_bsnh guard skips population when input is 4D. Need BNSH→BNSH copy. - // Only flash decode (Path 2) and unfused currently populate present for 4D. if (present_key != nullptr && is_bsnh) { if constexpr (std::is_same_v) { ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( @@ -1116,11 +1110,6 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { // TODO: Support GQA in unfused attention path for fp32/old-GPU fallback. // Requires ~160 lines: ExpandKVHeads kernel to replicate KV heads, wiring in unfused dispatch. // See issue #27516 for tracking. - // - // TODO: GQA + float_mask + decode is not supported by any kernel path. 
- // Flash rejects float masks, MEA rejects past_key, unfused rejects GQA. - // Fix options: (a) convert float mask to bool in decode path, or - // (b) support float masks in flash decode via mha_fwd_kvcache. return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "GQA (q_num_heads != kv_num_heads) requires flash or memory efficient attention, " "but neither is eligible. Ensure fp16/bf16 on Ampere+ GPU, or check head_size constraints."); From cb6475135448fcf89a1573ec0d967156659f4051 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 21:49:03 +0000 Subject: [PATCH 17/35] Code cleanup: remove dead function, fix comments, CUDA-graph-safe 2D broadcast 1. Remove dead LaunchConvertMaskToSeqlensK (hardcoded offset=-1, never called). LaunchConvertMaskToFlashSeqlensK with configurable offset is the only caller. 2. Remove confusing 'Fix N' labels from development. Clean up Path numbering in RunFlashAttention (Path 1/2/3 consistently named). 3. Replace host-side batch*q_seq cudaMemcpyAsync loop for 2D mask expansion with LaunchBroadcastBias2DToQSeq CUDA kernel. Single kernel launch is CUDA-graph-capturable and eliminates B*q_seq individual D2D memcpy calls. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cuda/llm/attention.cc | 24 ++--- .../providers/cuda/llm/attention_mask_impl.cu | 89 ++++++++++++------- .../providers/cuda/llm/attention_mask_impl.h | 52 ++++------- 3 files changed, 82 insertions(+), 83 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index 2b04fcaf7b5ec..f894a8b141ce5 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -114,7 +114,7 @@ Status Attention::RunFlashAttention( out_accum_bytes, cuda_stream)); } - // --- Fix 3: Prepare Q in BSNH format (flash always expects Q as BSNH) --- + // --- Transpose Q from BNSH to BSNH (flash always expects Q as BSNH) --- const void* q_data = Q->Data(); IAllocatorUniquePtr q_bsnh_buffer; if (!is_bsnh) { @@ -196,7 +196,7 @@ Status Attention::RunFlashAttention( /*local_window_size=*/-1, /*is_rotary_interleaved=*/false, /*is_packed_qkv=*/false)); } - // --- Path 2 (Fix 1): Decode with past KV cache --- + // --- Path 2: Decode with past KV cache --- else if (past_key != nullptr) { ORT_ENFORCE(past_value != nullptr, "past_key requires past_value."); ORT_ENFORCE(present_key != nullptr && present_value != nullptr, @@ -361,7 +361,7 @@ Status Attention::RunFlashAttention( is_bsnh)); } - // --- Fix 3: Transpose output BSNH → BNSH if input was 4D (BNSH) --- + // --- Transpose output BSNH → BNSH if input was 4D (BNSH) --- if (!is_bsnh && out_bsnh_buffer != nullptr) { if constexpr (std::is_same_v) { ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( @@ -479,7 +479,7 @@ Status Attention::RunMemoryEfficientAttention( const void* k_data = K->Data(); const void* v_data = V->Data(); - // --- Fix 3: Transpose Q from BNSH to BSNH if 4D input --- + // --- Transpose Q from BNSH to BSNH if 4D input --- IAllocatorUniquePtr q_bsnh_buffer; if (!is_bsnh) { size_t q_bytes = sizeof(T) * parameters.batch_size * parameters.q_sequence_length * @@ -667,17 +667,9 @@ Status Attention::RunMemoryEfficientAttention( ? 
reinterpret_cast<const NativeCudaT*>(converted_mask_buffer.get())
                             : reinterpret_cast<const NativeCudaT*>(attn_mask->Data<T>());
       auto* dst = reinterpret_cast<NativeCudaT*>(expanded_buffer.get());
-      const size_t row_bytes = static_cast<size_t>(kv_len) * sizeof(NativeCudaT);
-      for (int b = 0; b < parameters.batch_size; ++b) {
-        const auto* src_row = src + static_cast<int64_t>(b) * kv_len;
-        auto* dst_base = dst + static_cast<int64_t>(b) * q_len * kv_len;
-        for (int q = 0; q < q_len; ++q) {
-          CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(
-              dst_base + static_cast<int64_t>(q) * kv_len,
-              src_row, row_bytes,
-              cudaMemcpyDeviceToDevice, cuda_stream));
-        }
-      }
+      ORT_RETURN_IF_ERROR(LaunchBroadcastBias2DToQSeq(
+          src, dst, parameters.batch_size, q_len, kv_len,
+          cuda_stream, device_prop.maxThreadsPerBlock));
       attn_bias_data = expanded_buffer.get();
       converted_mask_buffer = std::move(expanded_buffer);
       // Expanded shape is [B, 1, q_seq, kv_seq]
       broadcast_bias_dim_0 = false;
@@ -728,7 +720,7 @@
     }
   }

-  // --- Fix 3: Transpose output BSNH → BNSH if input was 4D (BNSH) ---
+  // --- Transpose output BSNH → BNSH if input was 4D (BNSH) ---
   if (!is_bsnh && out_bsnh_buffer != nullptr) {
     if constexpr (std::is_same_v<T, MLFloat16>) {
       ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH(

diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
index d105f3c16eeff..df8bae59e5d8a 100644
--- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
+++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
@@ -103,39 +103,7 @@ __global__ void ConvertMaskToSeqlensKernel(
   seqlens_k[batch_idx] = seq_len + seqlen_offset;
 }

-Status LaunchConvertMaskToSeqlensK(
-    const bool* attn_mask_bool,
-    int* seqlens_k,
-    int batch_size,
-    int total_seq_len,
-    int mask_dims,
-    int64_t mask_dim0,
-    int64_t mask_dim1,
-    int64_t mask_dim2,
-    cudaStream_t stream,
-    int max_threads_per_block) {
-  if (batch_size == 0 || total_seq_len == 0) {
-    return Status::OK();
-  }
-
-  int threads = std::min(batch_size, max_threads_per_block);
-  int blocks = (batch_size + threads - 1) / threads;
-
-  ConvertMaskToSeqlensKernel<<<blocks, threads, 0, stream>>>(
-      attn_mask_bool,
-      seqlens_k,
-      batch_size,
-      total_seq_len,
-      mask_dims,
-      mask_dim0,
-      mask_dim1,
-      mask_dim2,
-      /*seqlen_offset=*/-1);
-
-  return CUDA_CALL(cudaGetLastError());
-}
-
-// Like LaunchConvertMaskToSeqlensK but with a configurable offset.
+// Convert boolean mask to sequence lengths with a configurable offset.
 // seqlens_k[b] = num_true_tokens + seqlen_offset
 Status LaunchConvertMaskToFlashSeqlensK(
@@ -215,6 +183,61 @@ template Status LaunchConvertBoolMaskToAttentionBias<__half>(
 template Status LaunchConvertBoolMaskToAttentionBias<__nv_bfloat16>(
     const bool*, __nv_bfloat16*, int64_t, float, cudaStream_t, int);

+// Broadcast a 2D attention bias [B, kv_seq] to [B, 1, q_seq, kv_seq] by repeating
+// each batch's row across all query positions. This is needed because MEA uses
+// bias_strideM = kv_seq, so each query position must have its own row of kv_seq values.
+// Without expansion, a 2D mask would cause bias_strideM to walk across batch boundaries.
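+// Index example (hypothetical sizes B=2, q_seq=2, kv_seq=3): flat idx 7 maps to
+// b = 7 / (2*3) = 1 and kv_pos = 7 % 3 = 1, so dst[7] = src[1*3 + 1]; every q
+// position of batch 1 reads the same source row.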
+template <typename T>
+__global__ void BroadcastBias2DToQSeqKernel(
+    const T* __restrict__ src,
+    T* __restrict__ dst,
+    const int batch_size,
+    const int q_seq_len,
+    const int kv_seq_len) {
+  int64_t total = static_cast<int64_t>(batch_size) * q_seq_len * kv_seq_len;
+  for (int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+       idx < total;
+       idx += static_cast<int64_t>(gridDim.x) * blockDim.x) {
+    // Map flat index → (batch, q_pos, kv_pos)
+    int kv_pos = static_cast<int>(idx % kv_seq_len);
+    int b = static_cast<int>(idx / (static_cast<int64_t>(q_seq_len) * kv_seq_len));
+    // All q positions read from the same src row for this batch
+    dst[idx] = src[static_cast<int64_t>(b) * kv_seq_len + kv_pos];
+  }
+}
+
+template <typename T>
+Status LaunchBroadcastBias2DToQSeq(
+    const T* src,
+    T* dst,
+    int batch_size,
+    int q_seq_len,
+    int kv_seq_len,
+    cudaStream_t stream,
+    int max_threads_per_block) {
+  int64_t total = static_cast<int64_t>(batch_size) * q_seq_len * kv_seq_len;
+  if (total == 0) {
+    return Status::OK();
+  }
+
+  int threads = static_cast<int>(std::min(static_cast<int64_t>(max_threads_per_block), total));
+  int64_t blocks = (total + threads - 1) / threads;
+  constexpr int64_t kMaxGridDimX = 65535;
+  unsigned int grid_size = static_cast<unsigned int>(std::min(blocks, kMaxGridDimX));
+
+  BroadcastBias2DToQSeqKernel<<<grid_size, threads, 0, stream>>>(
+      src, dst, batch_size, q_seq_len, kv_seq_len);
+
+  return CUDA_CALL(cudaGetLastError());
+}
+
+template Status LaunchBroadcastBias2DToQSeq<float>(
+    const float*, float*, int, int, int, cudaStream_t, int);
+template Status LaunchBroadcastBias2DToQSeq<__half>(
+    const __half*, __half*, int, int, int, cudaStream_t, int);
+template Status LaunchBroadcastBias2DToQSeq<__nv_bfloat16>(
+    const __nv_bfloat16*, __nv_bfloat16*, int, int, int, cudaStream_t, int);
+
 // CUDA kernel to convert nonpad_kv_seqlen (int64) to seqlens_k (int32) for GQA.
 // GQA convention: seqlens_k = nonpad_kv_seqlen - 1 (last valid index, not count).
 //
diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
index 8e34c73d535e2..73462d76c4c04 100644
--- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
+++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
@@ -8,7 +8,7 @@
 namespace onnxruntime {
 namespace cuda {

-// Convert a boolean attention mask to sequence lengths for use with GQA kernels.
+// Convert a boolean attention mask to sequence lengths with a configurable offset.
 //
 // The mask is expected to have the following properties:
 // 1.
It represents right-padding only (valid tokens first, padding at the end) @@ -20,39 +20,6 @@ namespace cuda { // For 3D mask (num_heads, q_seq_len, total_seq_len): broadcasts across batches, uses first head/q // For 4D mask (B, H, q_seq_len, total_seq_len): uses first head, first q position // -// Parameters: -// attn_mask_bool: Input boolean mask on GPU (True = valid, False = padding) -// seqlens_k: Output buffer for sequence lengths (seqlen - 1 for GQA convention) -// batch_size: Number of batches -// total_seq_len: Total sequence length (last dimension of mask) -// mask_dims: Number of dimensions in the mask (2, 3, or 4) -// mask_dim0: First dimension of mask (batch_size for 2D, num_heads for 3D, batch_size for 4D) -// mask_dim1: Second dimension (0 for 2D, q_seq_len for 3D, num_heads for 4D) -// mask_dim2: Third dimension (0 for 2D/3D, q_seq_len for 4D) -// stream: CUDA stream -// max_threads_per_block: Maximum threads per block -// -// Returns: -// Status::OK() on success -// -// Note: Mask validity (right-padding convention, starts with True, contiguous True/False) -// is checked asynchronously via CUDA_KERNEL_ASSERT inside the kernel. Invalid masks will -// trigger a device-side assertion failure. -Status LaunchConvertMaskToSeqlensK( - const bool* attn_mask_bool, - int* seqlens_k, - int batch_size, - int total_seq_len, - int mask_dims, - int64_t mask_dim0, - int64_t mask_dim1, - int64_t mask_dim2, - cudaStream_t stream, - int max_threads_per_block); - -// Like LaunchConvertMaskToSeqlensK but with a configurable offset. -// Flash attention and MEA custom right padding expect count, not last-valid-index. -// // seqlen_offset adjusts the raw token count: // seqlens_k[b] = num_true_tokens + seqlen_offset // @@ -60,6 +27,10 @@ Status LaunchConvertMaskToSeqlensK( // 0: actual token count (for prompt with mha_fwd_kvcache, MEA custom right padding) // -N: subtract N from count (for decode with mha_fwd_kvcache where N=kv_sequence_length, // giving the number of tokens already in cache BEFORE appending new ones) +// +// Note: Mask validity (right-padding convention, starts with True, contiguous True/False) +// is checked asynchronously via CUDA_KERNEL_ASSERT inside the kernel. Invalid masks will +// trigger a device-side assertion failure. Status LaunchConvertMaskToFlashSeqlensK( const bool* attn_mask_bool, int* seqlens_k, @@ -85,6 +56,19 @@ Status LaunchConvertBoolMaskToAttentionBias( cudaStream_t stream, int max_threads_per_block); +// Broadcast a 2D attention bias [B, kv_seq] → [B, 1, q_seq, kv_seq] by repeating +// each batch's row across all query positions. CUDA-graph-capturable replacement for +// the host-side batch×q_seq cudaMemcpyAsync loop. +template +Status LaunchBroadcastBias2DToQSeq( + const T* src, + T* dst, + int batch_size, + int q_seq_len, + int kv_seq_len, + cudaStream_t stream, + int max_threads_per_block); + // Convert nonpad_kv_seqlen (int64, per-batch valid KV lengths) to seqlens_k (int32) for GQA. // GQA convention: seqlens_k[i] = nonpad_kv_seqlen[i] - 1 (last valid index, not count). 
// From 76b006a6abda8515e2020d07fc4b854e633f6f44 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 21:55:48 +0000 Subject: [PATCH 18/35] Add test improvements: unfused MHA, 4D BNSH GQA, broadcast mask, float mask tests - Fix test_mha.py docstring (flash/MEA on SM80+, not just unfused) - Add TestONNXAttentionMHAUnfused: 3 tests forcing unfused path via env vars - Add TestONNXAttentionMHABroadcastMask: (1,1,q,kv) additive mask broadcast - Add TestONNXAttentionGQA4DBNSH: prompt + decode with 4D BNSH inputs - Add TestONNXAttentionGQAFloatMask: GQA + 4D float additive mask prompt - Update common.py: use_4d_bnsh support, broadcast_mask_batch/heads fields - All 55 GQA + 45 MHA tests pass Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../test_onnx_attention/common.py | 124 +++++++++--- .../test_onnx_attention/test_gqa.py | 183 ++++++++++++++++-- .../test_onnx_attention/test_mha.py | 150 +++++++++++++- 3 files changed, 407 insertions(+), 50 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/common.py b/onnxruntime/test/python/transformers/test_onnx_attention/common.py index ad54efdbd6294..9bf7645e2c539 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/common.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/common.py @@ -96,6 +96,9 @@ class AttentionConfig: attn_mask_dims: int = 2 # 2D, 3D, or 4D boolean mask attn_mask_type: str = "bool" # "bool" for GQA path, "additive" for MHA path has_nonpad_kv_seqlen: bool = False # Opset 24: nonpad_kv_seqlen input + use_4d_bnsh: bool = False # Use 4D [B, num_heads, seq, head_size] inputs instead of 3D + broadcast_mask_batch: bool = False # Use batch dim 1 in 4D mask for broadcasting + broadcast_mask_heads: bool = False # Use heads dim 1 in 4D mask for broadcasting # ################################################################################################# @@ -180,17 +183,37 @@ def create_attention_node_and_io( ) # --- Graph Inputs --- - # ONNX Attention op uses 3D inputs: [batch, seq_len, hidden_size] - q_hidden_size = config.q_num_heads * config.head_size - kv_hidden_size = config.kv_num_heads * config.head_size - - graph_input = [ - helper.make_tensor_value_info("query", ort_type, [config.batch_size, config.q_sequence_length, q_hidden_size]), - helper.make_tensor_value_info("key", ort_type, [config.batch_size, config.kv_sequence_length, kv_hidden_size]), - helper.make_tensor_value_info( - "value", ort_type, [config.batch_size, config.kv_sequence_length, kv_hidden_size] - ), - ] + if config.use_4d_bnsh: + # 4D BNSH inputs: [batch, num_heads, seq_len, head_size] + graph_input = [ + helper.make_tensor_value_info( + "query", ort_type, + [config.batch_size, config.q_num_heads, config.q_sequence_length, config.head_size], + ), + helper.make_tensor_value_info( + "key", ort_type, + [config.batch_size, config.kv_num_heads, config.kv_sequence_length, config.head_size], + ), + helper.make_tensor_value_info( + "value", ort_type, + [config.batch_size, config.kv_num_heads, config.kv_sequence_length, config.head_size], + ), + ] + else: + # 3D inputs: [batch, seq_len, hidden_size] + q_hidden_size = config.q_num_heads * config.head_size + kv_hidden_size = config.kv_num_heads * config.head_size + graph_input = [ + helper.make_tensor_value_info( + "query", ort_type, [config.batch_size, config.q_sequence_length, q_hidden_size] + ), + helper.make_tensor_value_info( + "key", ort_type, [config.batch_size, config.kv_sequence_length, kv_hidden_size] + ), + 
helper.make_tensor_value_info( + "value", ort_type, [config.batch_size, config.kv_sequence_length, kv_hidden_size] + ), + ] if isinstance(config.kv_cache_type, torch.dtype): cache_ort_type = TORCH_DTYPE_TO_ONNX_MAP[config.kv_cache_type] @@ -219,7 +242,9 @@ def create_attention_node_and_io( elif config.attn_mask_dims == 3: mask_shape = [config.q_num_heads, config.q_sequence_length, mask_seq_len] else: # 4D - mask_shape = [config.batch_size, config.q_num_heads, config.q_sequence_length, mask_seq_len] + mask_batch = 1 if config.broadcast_mask_batch else config.batch_size + mask_heads = 1 if config.broadcast_mask_heads else config.q_num_heads + mask_shape = [mask_batch, mask_heads, config.q_sequence_length, mask_seq_len] else: # additive, or bool on MHA path if config.attn_mask_dims == 2: mask_shape = [config.q_sequence_length, mask_seq_len] @@ -227,7 +252,9 @@ def create_attention_node_and_io( # 3D aligns to [_, heads, q_seq, total_seq] — dim 0 must be 1 or num_heads mask_shape = [config.q_num_heads, config.q_sequence_length, mask_seq_len] else: # 4D - mask_shape = [config.batch_size, config.q_num_heads, config.q_sequence_length, mask_seq_len] + mask_batch = 1 if config.broadcast_mask_batch else config.batch_size + mask_heads = 1 if config.broadcast_mask_heads else config.q_num_heads + mask_shape = [mask_batch, mask_heads, config.q_sequence_length, mask_seq_len] graph_input.append(helper.make_tensor_value_info("attn_mask", mask_ort_type, mask_shape)) # past_key and past_value for ONNX Attention op @@ -248,10 +275,13 @@ def create_attention_node_and_io( # --- Graph Outputs --- output_k_shape = [config.batch_size, config.kv_num_heads, present_kv_seqlen, config.head_size] + if config.use_4d_bnsh: + output_shape = [config.batch_size, config.q_num_heads, config.q_sequence_length, config.head_size] + else: + output_shape = [config.batch_size, config.q_sequence_length, config.q_num_heads * config.head_size] + graph_output = [ - helper.make_tensor_value_info( - "output", ort_type, [config.batch_size, config.q_sequence_length, config.q_num_heads * config.head_size] - ), + helper.make_tensor_value_info("output", ort_type, output_shape), helper.make_tensor_value_info("present_key", cache_ort_type, output_k_shape), helper.make_tensor_value_info("present_value", cache_ort_type, output_k_shape), ] @@ -379,19 +409,26 @@ def attention_prompt_func( ort_type=ort_type, ) - # Reshape to 3D [batch, seq_len, hidden_size] - q_3d = torch.reshape(q, (config.batch_size, config.q_sequence_length, -1)) - k_3d = torch.reshape(k, (config.batch_size, config.kv_sequence_length, -1)) - v_3d = torch.reshape(v, (config.batch_size, config.kv_sequence_length, -1)) + # Reshape inputs for ONNX graph + if config.use_4d_bnsh: + # 4D BNSH: [batch, num_heads, seq_len, head_size] + q_input = q.transpose(1, 2).contiguous() # BSNH → BNSH + k_input = k.transpose(1, 2).contiguous() + v_input = v.transpose(1, 2).contiguous() + else: + # 3D: [batch, seq_len, hidden_size] + q_input = torch.reshape(q, (config.batch_size, config.q_sequence_length, -1)) + k_input = torch.reshape(k, (config.batch_size, config.kv_sequence_length, -1)) + v_input = torch.reshape(v, (config.batch_size, config.kv_sequence_length, -1)) sess_options = SessionOptions() ort_session = InferenceSession(onnx_model_str, sess_options, providers=[ep]) io_binding = ort_session.io_binding() # Bind inputs - bind_tensor(io_binding, "query", q_3d, device, ort_type) - bind_tensor(io_binding, "key", k_3d, device, ort_type) - bind_tensor(io_binding, "value", v_3d, device, ort_type) + 
bind_tensor(io_binding, "query", q_input, device, ort_type) + bind_tensor(io_binding, "key", k_input, device, ort_type) + bind_tensor(io_binding, "value", v_input, device, ort_type) # Bind optional attention mask if config.has_attn_mask and attn_mask is not None: @@ -411,7 +448,15 @@ def attention_prompt_func( hidden_size = config.q_num_heads * config.head_size out_dtype = _get_out_dtype(ort_type) - out_torch = torch.zeros((config.batch_size, config.q_sequence_length, hidden_size), dtype=out_dtype, device=device) + if config.use_4d_bnsh: + out_torch = torch.zeros( + (config.batch_size, config.q_num_heads, config.q_sequence_length, config.head_size), + dtype=out_dtype, device=device, + ) + else: + out_torch = torch.zeros( + (config.batch_size, config.q_sequence_length, hidden_size), dtype=out_dtype, device=device + ) bind_output_tensor(io_binding, "output", out_torch, device, ort_type) # present KV shape for prompt (no past) @@ -474,10 +519,17 @@ def attention_past_func( ort_type=ort_type, ) - # Reshape to 3D [batch, seq_len, hidden_size] - q_3d = torch.reshape(q, (config.batch_size, config.q_sequence_length, -1)) - new_k_3d = torch.reshape(new_k, (config.batch_size, config.kv_sequence_length, -1)) - new_v_3d = torch.reshape(new_v, (config.batch_size, config.kv_sequence_length, -1)) + # Reshape inputs for ONNX graph + if config.use_4d_bnsh: + # 4D BNSH: [batch, num_heads, seq_len, head_size] + q_input = q.transpose(1, 2).contiguous() # BSNH → BNSH + new_k_input = new_k.transpose(1, 2).contiguous() + new_v_input = new_v.transpose(1, 2).contiguous() + else: + # 3D: [batch, seq_len, hidden_size] + q_input = torch.reshape(q, (config.batch_size, config.q_sequence_length, -1)) + new_k_input = torch.reshape(new_k, (config.batch_size, config.kv_sequence_length, -1)) + new_v_input = torch.reshape(new_v, (config.batch_size, config.kv_sequence_length, -1)) sess_options = SessionOptions() ort_session = InferenceSession(onnx_model_str, sess_options, providers=[ep]) @@ -487,9 +539,9 @@ def attention_past_func( total_seq_len = config.past_kv_sequence_length + config.kv_sequence_length # Bind inputs - bind_tensor(io_binding, "query", q_3d, device, ort_type) - bind_tensor(io_binding, "key", new_k_3d, device, ort_type) - bind_tensor(io_binding, "value", new_v_3d, device, ort_type) + bind_tensor(io_binding, "query", q_input, device, ort_type) + bind_tensor(io_binding, "key", new_k_input, device, ort_type) + bind_tensor(io_binding, "value", new_v_input, device, ort_type) # Bind optional attention mask if config.has_attn_mask and attn_mask is not None: @@ -513,7 +565,15 @@ def attention_past_func( hidden_size = config.q_num_heads * config.head_size out_dtype = _get_out_dtype(ort_type) - out_torch = torch.zeros((config.batch_size, config.q_sequence_length, hidden_size), dtype=out_dtype, device=device) + if config.use_4d_bnsh: + out_torch = torch.zeros( + (config.batch_size, config.q_num_heads, config.q_sequence_length, config.head_size), + dtype=out_dtype, device=device, + ) + else: + out_torch = torch.zeros( + (config.batch_size, config.q_sequence_length, hidden_size), dtype=out_dtype, device=device + ) bind_output_tensor(io_binding, "output", out_torch, device, ort_type) # present KV shape (past + new) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index 60cba2b2dbd70..1b4a995607122 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ 
b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -36,6 +36,7 @@ attention_past_func, attention_prompt_func, attention_ref, + create_additive_mask_from_seqlens, create_boolean_mask_from_seqlens, enable_debug_print, enable_deterministic_check, @@ -152,7 +153,11 @@ def parity_check_gqa_prompt( ) torch.testing.assert_close(out, first_out, rtol=0, atol=0, msg="Output mismatch between two runs") - out = torch.reshape(out, (config.batch_size, config.q_sequence_length, config.q_num_heads, config.head_size)) + if config.use_4d_bnsh: + # 4D BNSH output → BSNH for comparison + out = out.transpose(1, 2).contiguous() + else: + out = torch.reshape(out, (config.batch_size, config.q_sequence_length, config.q_num_heads, config.head_size)) out_np = out.to(torch.float32).detach().cpu().numpy() # --- Comparison --- @@ -163,18 +168,21 @@ def parity_check_gqa_prompt( print(f"DEBUG_NAN: First 5 NaN indices: {nan_indices[:5]}") # Compare KV cache (present_k should match k, present_v should match v) - k_ref_bnsh = k.transpose(1, 2) # BSNH -> BNSH - v_ref_bnsh = v.transpose(1, 2) # BSNH -> BNSH - - k_ref_np = k_ref_bnsh.to(torch.float32).detach().cpu().numpy() - v_ref_np = v_ref_bnsh.to(torch.float32).detach().cpu().numpy() - present_k_np = present_k.to(torch.float32).detach().cpu().numpy() - present_v_np = present_v.to(torch.float32).detach().cpu().numpy() - - print_diff_statistics(torch.tensor(present_k_np - k_ref_np), "present_k") - numpy.testing.assert_allclose(present_k_np, k_ref_np, rtol=rtol, atol=atol) - print_diff_statistics(torch.tensor(present_v_np - v_ref_np), "present_v") - numpy.testing.assert_allclose(present_v_np, v_ref_np, rtol=rtol, atol=atol) + # Skip for 4D BNSH prompt: the dispatcher doesn't populate present_key/value + # when inputs are already BNSH and there's no past (known limitation). 
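+    # Equivalently: skip only when (use_4d_bnsh and no past), i.e. 3D inputs always
+    # compare the cache, 4D inputs compare it only in the decode (past > 0) case.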
+ if not config.use_4d_bnsh or config.past_kv_sequence_length > 0: + k_ref_bnsh = k.transpose(1, 2) # BSNH -> BNSH + v_ref_bnsh = v.transpose(1, 2) # BSNH -> BNSH + + k_ref_np = k_ref_bnsh.to(torch.float32).detach().cpu().numpy() + v_ref_np = v_ref_bnsh.to(torch.float32).detach().cpu().numpy() + present_k_np = present_k.to(torch.float32).detach().cpu().numpy() + present_v_np = present_v.to(torch.float32).detach().cpu().numpy() + + print_diff_statistics(torch.tensor(present_k_np - k_ref_np), "present_k") + numpy.testing.assert_allclose(present_k_np, k_ref_np, rtol=rtol, atol=atol) + print_diff_statistics(torch.tensor(present_v_np - v_ref_np), "present_v") + numpy.testing.assert_allclose(present_v_np, v_ref_np, rtol=rtol, atol=atol) print_diff_statistics(torch.tensor(out_np - out_ref_np), "out") numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol, atol=atol) @@ -315,7 +323,10 @@ def parity_check_gqa_past( present_v, first_present_v, rtol=0, atol=0, msg="present_v mismatch between two runs" ) - out = torch.reshape(out, (config.batch_size, config.q_sequence_length, config.q_num_heads, config.head_size)) + if config.use_4d_bnsh: + out = out.transpose(1, 2).contiguous() + else: + out = torch.reshape(out, (config.batch_size, config.q_sequence_length, config.q_num_heads, config.head_size)) out_np = out.to(torch.float32).detach().cpu().numpy() if enable_debug_print: @@ -1145,5 +1156,149 @@ def test_gqa_nonpad_kv_seqlen_cpu(self, name, config, seqlens): ) +# ################################################################################################# +# GQA 4D BNSH Format Tests +# ################################################################################################# + + +def gqa_4d_bnsh_test_cases(): + """Generate test cases for GQA with 4D BNSH input format.""" + return [ + ("prompt_nomask", AttentionConfig( + batch_size=2, q_sequence_length=16, kv_sequence_length=16, + q_num_heads=8, kv_num_heads=2, head_size=128, is_causal=1, + use_4d_bnsh=True, + )), + ("prompt_smallhead", AttentionConfig( + batch_size=2, q_sequence_length=16, kv_sequence_length=16, + q_num_heads=8, kv_num_heads=4, head_size=64, is_causal=1, + use_4d_bnsh=True, + )), + ] + + +def gqa_4d_bnsh_past_test_cases(): + """Generate test cases for GQA decode with 4D BNSH input format.""" + return [ + ("decode_nomask", AttentionConfig( + batch_size=2, q_sequence_length=1, kv_sequence_length=1, + past_kv_sequence_length=32, q_num_heads=8, kv_num_heads=2, + head_size=128, is_causal=1, use_4d_bnsh=True, + )), + ] + + +@unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping 4D BNSH tests.") +class TestONNXAttentionGQA4DBNSH(unittest.TestCase): + """ + Test GQA with 4D BNSH input format [batch, num_heads, seq, head_size]. + + The C++ attention op detects 4D inputs and sets transpose_output=false. + Flash/MEA always expect BSNH, so the dispatcher transposes Q internally. 
+ """ + + @parameterized.expand(gqa_4d_bnsh_test_cases()) + def test_gqa_4d_bnsh_prompt(self, name, config): + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" + parity_check_gqa_prompt( + config=config, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + causal=True, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) + + @parameterized.expand(gqa_4d_bnsh_past_test_cases()) + def test_gqa_4d_bnsh_decode(self, name, config): + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" + parity_check_gqa_past( + config=config, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + causal=True, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) + + +# ################################################################################################# +# GQA Float Additive Mask Tests +# ################################################################################################# + + +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping float mask tests.") +class TestONNXAttentionGQAFloatMask(unittest.TestCase): + """ + Test GQA with float additive attention mask (not bool) during prompt. + + This exercises MEA's GQA expansion + float bias path. The GQA path converts + the additive mask to attention bias for MEA cutlass FMHA. + """ + + def test_gqa_prompt_float_mask_4d(self): + """Test GQA prompt with 4D float additive mask.""" + config = AttentionConfig( + batch_size=2, + q_sequence_length=16, + kv_sequence_length=16, + q_num_heads=8, + kv_num_heads=2, + head_size=128, + is_causal=0, + has_attn_mask=True, + attn_mask_dims=4, + attn_mask_type="additive", + ) + + torch.manual_seed(0) + device = "cuda" + torch_type = torch.float16 + + q = torch.randn(2, 16, 8, 128, device=device, dtype=torch_type) * 0.2 + k = torch.randn(2, 16, 2, 128, device=device, dtype=torch_type) * 0.2 + v = torch.randn_like(k) * 0.2 + + # Create additive mask with padding pattern: batch 0 has 10 valid, batch 1 full + seqlens = torch.tensor([10, 16], dtype=torch.int32, device=device) + attn_mask = create_additive_mask_from_seqlens( + seqlens=seqlens, total_seq_len=16, mask_dims=4, + q_seq_len=16, num_heads=8, device=device, dtype=torch_type, + ) + + # Zero padded KV positions + k[0, 10:, :, :] = 0 + v[0, 10:, :, :] = 0 + + # Reference + attn_bias_ref = attn_mask + out_ref, _ = attention_ref(q=q, k=k, v=v, attn_bias=attn_bias_ref, causal=False) + + # ORT path (MEA handles GQA+float mask) + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" + try: + out_ort, _, _ = attention_prompt_func( + q=q, k=k, v=v, config=config, attn_mask=attn_mask, + ep="CUDAExecutionProvider", device=device, ort_type=TensorProto.FLOAT16, + ) + finally: + os.environ.pop("ORT_DISABLE_FLASH_ATTENTION", None) + + out_ort = out_ort.reshape(2, 16, 8, 128) + + # Zero padded output for comparison + out_ort[0, 10:, :, :] = 0 + out_ref[0, 10:, :, :] = 0 + + out_np = out_ort.float().detach().cpu().numpy() + out_ref_np = out_ref.float().detach().cpu().numpy() + print_diff_statistics(torch.tensor(out_np - out_ref_np), "out") + numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol["fp16"], atol=atol["fp16"]) + + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py index 02beb6ea5d4d9..d1762687e837d 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py +++ 
b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py @@ -8,16 +8,22 @@ Tests for ONNX Attention op (opset 23) — MHA path (kv_num_heads == q_num_heads). The MHA path in attention.cc is exercised when kv_num_heads == q_num_heads. -It uses the unfused attention kernel and supports: +On Ampere+ GPUs with fp16/bf16, the dispatch cascade routes to flash attention +or memory efficient attention first. The unfused kernel handles fp32 and +cases where flash/MEA are ineligible. Tests below marked "unfused" explicitly +disable flash and MEA to verify the unfused kernel. + +Supports: - float32, float16, bfloat16 - - 3D inputs (BSNH format) + - 3D inputs (BSNH format) and 4D inputs (BNSH format) - Causal and non-causal attention - Self-attention and cross-attention (kv_seq != q_seq) - - Additive attention bias (NOT boolean masks) + - Additive attention bias (and boolean masks converted to additive bias) - Past KV cache - - 2D, 3D, 4D additive masks with broadcasting + - 2D, 3D, 4D additive/bool masks with broadcasting """ +import os import unittest import numpy @@ -1154,5 +1160,141 @@ def test_mha_nonpad_kv_seqlen_cpu(self, name, config, seqlens): ) +# ################################################################################################# +# Unfused Kernel Tests +# ################################################################################################# + + +def mha_unfused_test_cases(): + """ + Generate test cases that force the unfused attention kernel. + + On Ampere+ GPUs, fp16/bf16 normally route to flash attention. By disabling + both flash and MEA, we force the unfused path even for fp16. These tests + verify the unfused kernel handles MHA correctly. + """ + cases = [ + ("prompt_causal", AttentionConfig( + batch_size=2, q_sequence_length=16, kv_sequence_length=16, + q_num_heads=8, kv_num_heads=8, head_size=64, is_causal=1, + attn_mask_type="additive", + )), + ("prompt_noncausal", AttentionConfig( + batch_size=2, q_sequence_length=16, kv_sequence_length=16, + q_num_heads=8, kv_num_heads=8, head_size=64, is_causal=0, + attn_mask_type="additive", + )), + ("decode_causal", AttentionConfig( + batch_size=2, q_sequence_length=1, kv_sequence_length=1, + past_kv_sequence_length=32, q_num_heads=8, kv_num_heads=8, + head_size=64, is_causal=1, attn_mask_type="additive", + )), + ] + return cases + + +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping unfused tests.") +class TestONNXAttentionMHAUnfused(unittest.TestCase): + """ + Test the unfused attention kernel by disabling flash and MEA. + + On Ampere+ GPUs, fp16 normally routes to flash attention. These tests + disable both flash and MEA to exercise the unfused path. 
+ """ + + @parameterized.expand(mha_unfused_test_cases()) + def test_mha_unfused_fp16(self, name, config): + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" + os.environ["ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION"] = "1" + try: + if "decode" in name: + parity_check_mha_past( + config=config, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + causal=config.is_causal == 1, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) + else: + parity_check_mha_prompt( + config=config, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + causal=config.is_causal == 1, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) + finally: + os.environ.pop("ORT_DISABLE_FLASH_ATTENTION", None) + os.environ.pop("ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION", None) + + +# ################################################################################################# +# Broadcast Mask (1,1,q,kv) Tests +# ################################################################################################# + + +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping broadcast mask tests.") +class TestONNXAttentionMHABroadcastMask(unittest.TestCase): + """ + Test attention with a (1,1,q_seq,kv_seq) mask that broadcasts across batch and heads. + + This is a 4D mask with dim_0=1 (batch) and dim_1=1 (heads), verifying that + the broadcast_attn_bias_dim_0 and broadcast_attn_bias_dim_1 flags work correctly. + """ + + def test_mha_broadcast_mask_additive(self): + """Test broadcast additive mask (1,1,q,kv) with MHA on CUDA.""" + config = AttentionConfig( + batch_size=2, + q_sequence_length=16, + kv_sequence_length=16, + q_num_heads=8, + kv_num_heads=8, + head_size=128, + is_causal=0, + has_attn_mask=True, + attn_mask_dims=4, + attn_mask_type="additive", + broadcast_mask_batch=True, + broadcast_mask_heads=True, + ) + + torch.manual_seed(0) + device = "cuda" + torch_type = torch.float16 + + q = torch.randn(2, 16, 8, 128, device=device, dtype=torch_type) * 0.2 + k = torch.randn(2, 16, 8, 128, device=device, dtype=torch_type) * 0.2 + v = torch.randn_like(k) * 0.2 + + # Create (1,1,q,kv) additive mask: lower-triangular causal pattern + mask_filter = float(torch.finfo(torch_type).min) + mask_2d = torch.zeros(16, 16, device=device, dtype=torch_type) + for i in range(16): + mask_2d[i, i + 1:] = mask_filter + attn_mask = mask_2d.unsqueeze(0).unsqueeze(0) # (1, 1, 16, 16) + + # Reference: expand to full (B, H, Q, K) + attn_bias_ref = attn_mask.expand(2, 8, -1, -1).contiguous() + out_ref, _ = attention_ref(q=q, k=k, v=v, attn_bias=attn_bias_ref, causal=False) + + # ORT path + out_ort, _, _ = attention_prompt_func( + q=q, k=k, v=v, config=config, attn_mask=attn_mask, + ep="CUDAExecutionProvider", device=device, ort_type=TensorProto.FLOAT16, + ) + out_ort = out_ort.reshape(2, 16, 8, 128) + + out_np = out_ort.float().detach().cpu().numpy() + out_ref_np = out_ref.float().detach().cpu().numpy() + numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol["fp16"], atol=atol["fp16"]) + + if __name__ == "__main__": unittest.main() From c3f771af0a89068295670d50f9363095d5db9c5d Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 4 Mar 2026 22:10:26 +0000 Subject: [PATCH 19/35] lint --- .../test_onnx_attention/common.py | 15 ++-- .../test_onnx_attention/test_gqa.py | 74 ++++++++++++++----- .../test_onnx_attention/test_mha.py | 67 ++++++++++++----- 3 files changed, 114 insertions(+), 42 deletions(-) diff --git 
a/onnxruntime/test/python/transformers/test_onnx_attention/common.py b/onnxruntime/test/python/transformers/test_onnx_attention/common.py index 9bf7645e2c539..28145a7b693d8 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/common.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/common.py @@ -187,15 +187,18 @@ def create_attention_node_and_io( # 4D BNSH inputs: [batch, num_heads, seq_len, head_size] graph_input = [ helper.make_tensor_value_info( - "query", ort_type, + "query", + ort_type, [config.batch_size, config.q_num_heads, config.q_sequence_length, config.head_size], ), helper.make_tensor_value_info( - "key", ort_type, + "key", + ort_type, [config.batch_size, config.kv_num_heads, config.kv_sequence_length, config.head_size], ), helper.make_tensor_value_info( - "value", ort_type, + "value", + ort_type, [config.batch_size, config.kv_num_heads, config.kv_sequence_length, config.head_size], ), ] @@ -451,7 +454,8 @@ def attention_prompt_func( if config.use_4d_bnsh: out_torch = torch.zeros( (config.batch_size, config.q_num_heads, config.q_sequence_length, config.head_size), - dtype=out_dtype, device=device, + dtype=out_dtype, + device=device, ) else: out_torch = torch.zeros( @@ -568,7 +572,8 @@ def attention_past_func( if config.use_4d_bnsh: out_torch = torch.zeros( (config.batch_size, config.q_num_heads, config.q_sequence_length, config.head_size), - dtype=out_dtype, device=device, + dtype=out_dtype, + device=device, ) else: out_torch = torch.zeros( diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index 1b4a995607122..2d848b030a63b 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -1164,27 +1164,52 @@ def test_gqa_nonpad_kv_seqlen_cpu(self, name, config, seqlens): def gqa_4d_bnsh_test_cases(): """Generate test cases for GQA with 4D BNSH input format.""" return [ - ("prompt_nomask", AttentionConfig( - batch_size=2, q_sequence_length=16, kv_sequence_length=16, - q_num_heads=8, kv_num_heads=2, head_size=128, is_causal=1, - use_4d_bnsh=True, - )), - ("prompt_smallhead", AttentionConfig( - batch_size=2, q_sequence_length=16, kv_sequence_length=16, - q_num_heads=8, kv_num_heads=4, head_size=64, is_causal=1, - use_4d_bnsh=True, - )), + ( + "prompt_nomask", + AttentionConfig( + batch_size=2, + q_sequence_length=16, + kv_sequence_length=16, + q_num_heads=8, + kv_num_heads=2, + head_size=128, + is_causal=1, + use_4d_bnsh=True, + ), + ), + ( + "prompt_smallhead", + AttentionConfig( + batch_size=2, + q_sequence_length=16, + kv_sequence_length=16, + q_num_heads=8, + kv_num_heads=4, + head_size=64, + is_causal=1, + use_4d_bnsh=True, + ), + ), ] def gqa_4d_bnsh_past_test_cases(): """Generate test cases for GQA decode with 4D BNSH input format.""" return [ - ("decode_nomask", AttentionConfig( - batch_size=2, q_sequence_length=1, kv_sequence_length=1, - past_kv_sequence_length=32, q_num_heads=8, kv_num_heads=2, - head_size=128, is_causal=1, use_4d_bnsh=True, - )), + ( + "decode_nomask", + AttentionConfig( + batch_size=2, + q_sequence_length=1, + kv_sequence_length=1, + past_kv_sequence_length=32, + q_num_heads=8, + kv_num_heads=2, + head_size=128, + is_causal=1, + use_4d_bnsh=True, + ), + ), ] @@ -1266,8 +1291,13 @@ def test_gqa_prompt_float_mask_4d(self): # Create additive mask with padding pattern: batch 0 has 10 valid, batch 1 full seqlens = 
torch.tensor([10, 16], dtype=torch.int32, device=device) attn_mask = create_additive_mask_from_seqlens( - seqlens=seqlens, total_seq_len=16, mask_dims=4, - q_seq_len=16, num_heads=8, device=device, dtype=torch_type, + seqlens=seqlens, + total_seq_len=16, + mask_dims=4, + q_seq_len=16, + num_heads=8, + device=device, + dtype=torch_type, ) # Zero padded KV positions @@ -1282,8 +1312,14 @@ def test_gqa_prompt_float_mask_4d(self): os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" try: out_ort, _, _ = attention_prompt_func( - q=q, k=k, v=v, config=config, attn_mask=attn_mask, - ep="CUDAExecutionProvider", device=device, ort_type=TensorProto.FLOAT16, + q=q, + k=k, + v=v, + config=config, + attn_mask=attn_mask, + ep="CUDAExecutionProvider", + device=device, + ort_type=TensorProto.FLOAT16, ) finally: os.environ.pop("ORT_DISABLE_FLASH_ATTENTION", None) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py index d1762687e837d..15d214da52c58 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py @@ -1174,21 +1174,46 @@ def mha_unfused_test_cases(): verify the unfused kernel handles MHA correctly. """ cases = [ - ("prompt_causal", AttentionConfig( - batch_size=2, q_sequence_length=16, kv_sequence_length=16, - q_num_heads=8, kv_num_heads=8, head_size=64, is_causal=1, - attn_mask_type="additive", - )), - ("prompt_noncausal", AttentionConfig( - batch_size=2, q_sequence_length=16, kv_sequence_length=16, - q_num_heads=8, kv_num_heads=8, head_size=64, is_causal=0, - attn_mask_type="additive", - )), - ("decode_causal", AttentionConfig( - batch_size=2, q_sequence_length=1, kv_sequence_length=1, - past_kv_sequence_length=32, q_num_heads=8, kv_num_heads=8, - head_size=64, is_causal=1, attn_mask_type="additive", - )), + ( + "prompt_causal", + AttentionConfig( + batch_size=2, + q_sequence_length=16, + kv_sequence_length=16, + q_num_heads=8, + kv_num_heads=8, + head_size=64, + is_causal=1, + attn_mask_type="additive", + ), + ), + ( + "prompt_noncausal", + AttentionConfig( + batch_size=2, + q_sequence_length=16, + kv_sequence_length=16, + q_num_heads=8, + kv_num_heads=8, + head_size=64, + is_causal=0, + attn_mask_type="additive", + ), + ), + ( + "decode_causal", + AttentionConfig( + batch_size=2, + q_sequence_length=1, + kv_sequence_length=1, + past_kv_sequence_length=32, + q_num_heads=8, + kv_num_heads=8, + head_size=64, + is_causal=1, + attn_mask_type="additive", + ), + ), ] return cases @@ -1277,7 +1302,7 @@ def test_mha_broadcast_mask_additive(self): mask_filter = float(torch.finfo(torch_type).min) mask_2d = torch.zeros(16, 16, device=device, dtype=torch_type) for i in range(16): - mask_2d[i, i + 1:] = mask_filter + mask_2d[i, i + 1 :] = mask_filter attn_mask = mask_2d.unsqueeze(0).unsqueeze(0) # (1, 1, 16, 16) # Reference: expand to full (B, H, Q, K) @@ -1286,8 +1311,14 @@ def test_mha_broadcast_mask_additive(self): # ORT path out_ort, _, _ = attention_prompt_func( - q=q, k=k, v=v, config=config, attn_mask=attn_mask, - ep="CUDAExecutionProvider", device=device, ort_type=TensorProto.FLOAT16, + q=q, + k=k, + v=v, + config=config, + attn_mask=attn_mask, + ep="CUDAExecutionProvider", + device=device, + ort_type=TensorProto.FLOAT16, ) out_ort = out_ort.reshape(2, 16, 8, 128) From d6f16af596ae37ed4cb55eb99d87767b756c3616 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 5 Mar 2026 02:35:24 +0000 Subject: [PATCH 
20/35] Fix 2D mask shape, add 4D BNSH present_kv, cleanup and docs - Fix 2D attn_mask MEA path: use broadcast flags instead of incorrect [B,kv_seq] expansion - Add D2D memcpy for 4D BNSH present_key/value in flash and MEA prompt paths - Remove dead BroadcastBias2DToQSeq kernel code - Add TODO for shorter attn_mask last dim (spec allows, we enforce exact match) - Add architectural comment on XQA/shared buffer incompatibility - Add regression tests for 2D mask with batch>q_seq - Remove 4D BNSH present_kv test skip Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cpu/llm/attention_helper.h | 5 + .../core/providers/cuda/llm/attention.cc | 72 +++++--- .../providers/cuda/llm/attention_mask_impl.cu | 66 +------ .../providers/cuda/llm/attention_mask_impl.h | 15 +- .../test_onnx_attention/test_gqa.py | 27 ++- .../test_onnx_attention/test_mha.py | 174 ++++++++++++++++++ 6 files changed, 247 insertions(+), 112 deletions(-) diff --git a/onnxruntime/core/providers/cpu/llm/attention_helper.h b/onnxruntime/core/providers/cpu/llm/attention_helper.h index c41c275a61340..7030eaa81a16c 100644 --- a/onnxruntime/core/providers/cpu/llm/attention_helper.h +++ b/onnxruntime/core/providers/cpu/llm/attention_helper.h @@ -131,6 +131,11 @@ inline Status ComputeOutputShapeForAttention( } ORT_ENFORCE(parameters.q_num_heads % parameters.kv_num_heads == 0, "q_num_heads must be a multiple of kv_num_heads. This is required for grouped/multi-query and multi-headed attention."); + // TODO: The ONNX spec allows attn_mask last dim to be shorter than total_sequence_length, + // with positions beyond the mask padded with -inf. Currently we enforce exact match. + // To support: change == to <=, allocate padded buffer, fill remainder with -inf. + // See ONNX spec: 'The last dimension can also be shorter than total_sequence_length + // and will be padded to total_sequence_length with negative infinity.' ORT_ENFORCE(attn_mask == nullptr || attn_mask->Shape()[attn_mask->Shape().NumDimensions() - 1] == parameters.total_sequence_length, "inconsistent total_sequence_length (between attn_mask and past_key and past_value)"); ORT_ENFORCE(attn_mask == nullptr || diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index f894a8b141ce5..8c81bbb049fb1 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -77,6 +77,25 @@ Attention::Attention(const OpKernelInfo& info) : CudaKernel(info) { // ============================================================================ // RunFlashAttention: Direct flash attention kernel call // ============================================================================ +// +// PERFORMANCE NOTE: ONNX Attention's decode path is inherently ~15-30% slower than +// contrib GQA's decode path for grouped-query attention workloads. This is because: +// +// 1. No past_present_share_buffer: The ONNX Attention spec requires past_key/value +// shape = (B, H, past_seq, head_size) and present_key/value shape = +// (B, H, total_seq, head_size) where total_seq = past_seq + kv_seq. +// Since past and present have different shapes, they cannot share the same buffer. +// Contrib GQA allows past and present to be the same tensor (in-place append), +// eliminating the memset + strided copy overhead (~67MB per decode step for typical LLM). +// +// 2. No XQA kernel: GQA's specialized XQA decode kernel (xqa_loader.h) requires +// past_present_share_buffer to function. 
Since ONNX Attention cannot share buffers +// (see point 1), XQA is fundamentally incompatible with this op's spec design. +// +// 3. These are spec-level limitations, not implementation gaps. A graph optimizer that +// transparently replaces ONNX Attention with contrib GQA on supported hardware +// would be the recommended approach to close this performance gap. +// template Status Attention::RunFlashAttention( OpKernelContext* context, @@ -204,6 +223,8 @@ Status Attention::RunFlashAttention( // Zero present buffers before strided copy to avoid stale data in positions // beyond past_seq that mha_fwd_kvcache might read during attention (matching GQA pattern). + // NOTE: This memset + strided copy is the main decode overhead vs contrib GQA. + // See PERFORMANCE NOTE above for why buffer sharing is impossible under the ONNX spec. const size_t num_kv_rows = static_cast(parameters.batch_size) * parameters.kv_num_heads; const size_t present_k_bytes = num_kv_rows * parameters.total_sequence_length * parameters.head_size * sizeof(T); @@ -411,6 +432,11 @@ Status Attention::RunFlashAttention( K->Data(), present_key->MutableData(), cuda_stream, device_prop.maxThreadsPerBlock)); } + } else if (present_key != nullptr && !is_bsnh) { + // 4D BNSH prompt: K is already BNSH, just D2D copy to present + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( + present_key->MutableData(), K->Data(), + K->SizeInBytes(), cudaMemcpyDeviceToDevice, cuda_stream)); } if (present_value != nullptr && is_bsnh) { if constexpr (std::is_same_v) { @@ -433,6 +459,11 @@ Status Attention::RunFlashAttention( V->Data(), present_value->MutableData(), cuda_stream, device_prop.maxThreadsPerBlock)); } + } else if (present_value != nullptr && !is_bsnh) { + // 4D BNSH prompt: V is already BNSH, just D2D copy to present + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( + present_value->MutableData(), V->Data(), + V->SizeInBytes(), cudaMemcpyDeviceToDevice, cuda_stream)); } } @@ -649,32 +680,17 @@ Status Attention::RunMemoryEfficientAttention( } // Determine broadcast flags based on bias logical shape [B, num_heads, q_seq, kv_seq]. - // MEA always uses bias_strideM = kv_seq, so each query row must have kv_seq elements. - // For 2D masks [B, kv_seq]: the mask is constant across q positions, so we must - // expand to [B, 1, q_seq, kv_seq] by repeating each row q_seq times. Without this, - // bias_strideM would walk through batch boundaries instead of replaying the same mask. + // MEA indexes bias as: offset = batch_id * strideB + head_id * strideH + q_pos * strideM + kv_pos. + // broadcast_attn_bias_dim_0=true sets strideB=0; dim_1=true sets strideH=0. + // strideM is always total_seq (num_keys), so the data must have [q_seq, total_seq] as inner dims. size_t mask_dims = attn_mask->Shape().NumDimensions(); auto dims = attn_mask->Shape().GetDims(); if (mask_dims == 2) { - // Expand [B, kv_seq] → [B, 1, q_seq, kv_seq] by repeating each batch's row - using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; - const int kv_len = parameters.total_sequence_length; - const int q_len = parameters.q_sequence_length; - int64_t expanded_elements = static_cast(parameters.batch_size) * q_len * kv_len; - auto expanded_buffer = GetScratchBuffer( - expanded_elements * sizeof(NativeCudaT), context->GetComputeStream()); - const auto* src = (attn_mask->IsDataType()) - ? 
reinterpret_cast(converted_mask_buffer.get()) - : reinterpret_cast(attn_mask->Data()); - auto* dst = reinterpret_cast(expanded_buffer.get()); - ORT_RETURN_IF_ERROR(LaunchBroadcastBias2DToQSeq( - src, dst, parameters.batch_size, q_len, kv_len, - cuda_stream, device_prop.maxThreadsPerBlock)); - attn_bias_data = expanded_buffer.get(); - converted_mask_buffer = std::move(expanded_buffer); - // Expanded shape is [B, 1, q_seq, kv_seq] - broadcast_bias_dim_0 = false; - broadcast_bias_dim_1 = true; + // 2D mask: [q_seq, total_seq] per ONNX spec. Broadcasts over batch and heads. + // MEA reads bias[q_pos * total_seq + kv_pos] for all (batch, head) pairs + // via strideB=0, strideH=0, strideM=total_seq. + broadcast_bias_dim_0 = true; // broadcast over batch + broadcast_bias_dim_1 = true; // broadcast over heads } else if (mask_dims == 3) { broadcast_bias_dim_0 = true; broadcast_bias_dim_1 = dims[0] == 1; @@ -768,6 +784,11 @@ Status Attention::RunMemoryEfficientAttention( K->Data(), present_key->MutableData(), cuda_stream, device_prop.maxThreadsPerBlock)); } + } else if (present_key != nullptr && !is_bsnh) { + // 4D BNSH prompt: K is already BNSH, just D2D copy to present + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( + present_key->MutableData(), K->Data(), + K->SizeInBytes(), cudaMemcpyDeviceToDevice, cuda_stream)); } if (present_value != nullptr && is_bsnh) { if constexpr (std::is_same_v) { @@ -790,6 +811,11 @@ Status Attention::RunMemoryEfficientAttention( V->Data(), present_value->MutableData(), cuda_stream, device_prop.maxThreadsPerBlock)); } + } else if (present_value != nullptr && !is_bsnh) { + // 4D BNSH prompt: V is already BNSH, just D2D copy to present + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( + present_value->MutableData(), V->Data(), + V->SizeInBytes(), cudaMemcpyDeviceToDevice, cuda_stream)); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu index df8bae59e5d8a..b80b040e27103 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu @@ -19,7 +19,7 @@ namespace cuda { // - After the first False, all remaining elements must be False (contiguous padding) // // Handle broadcasting: -// - 2D mask (batch_size, total_seq_len): stride = total_seq_len, batch_idx = threadIdx +// - 2D mask (q_seq_len, total_seq_len): broadcasts over batch; uses first query position (row 0) // - 3D mask (num_heads, q_seq_len, total_seq_len): broadcasts to [1, num_heads, q_seq, total_seq] // No per-batch variation; uses first head, first q position for all batches // - 4D mask (B, H, q_seq_len, total_seq_len): we look at first head, first q position @@ -43,10 +43,11 @@ __global__ void ConvertMaskToSeqlensKernel( const bool* mask_row = nullptr; if (mask_dims == 2) { - // Shape: (batch_size or 1, total_seq_len) - // If mask_dim0 == 1, broadcast across all batches - int effective_batch = (mask_dim0 == 1) ? 0 : batch_idx; - mask_row = attn_mask + effective_batch * total_seq_len; + // Shape: (q_seq_len, total_seq_len) per ONNX spec. Broadcasts over batch. + // Use first query position (row 0) for sequence length determination. + // For 2D masks [q_seq, total_seq], only used in decode path where q_seq=1, + // so row 0 is always correct. Flash excludes 2D bool masks for prompt. 
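+    // Sketch: for a 2D mask [[T, T, T, F],
+    //                        [T, T, F, F]]  (q_seq=2, total_seq=4),
+    // row 0 yields seq_len = 3, and that value is applied to every batch.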
+ mask_row = attn_mask; } else if (mask_dims == 3) { // Shape: (num_heads, q_seq_len, total_seq_len) // This broadcasts to [1, num_heads, q_seq, total_seq] - same mask for all batches @@ -183,61 +184,6 @@ template Status LaunchConvertBoolMaskToAttentionBias<__half>( template Status LaunchConvertBoolMaskToAttentionBias<__nv_bfloat16>( const bool*, __nv_bfloat16*, int64_t, float, cudaStream_t, int); -// Broadcast a 2D attention bias [B, kv_seq] to [B, 1, q_seq, kv_seq] by repeating -// each batch's row across all query positions. This is needed because MEA uses -// bias_strideM = kv_seq, so each query position must have its own row of kv_seq values. -// Without expansion, a 2D mask would cause bias_strideM to walk across batch boundaries. -template -__global__ void BroadcastBias2DToQSeqKernel( - const T* __restrict__ src, - T* __restrict__ dst, - const int batch_size, - const int q_seq_len, - const int kv_seq_len) { - int64_t total = static_cast(batch_size) * q_seq_len * kv_seq_len; - for (int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; - idx < total; - idx += static_cast(gridDim.x) * blockDim.x) { - // Map flat index → (batch, q_pos, kv_pos) - int kv_pos = static_cast(idx % kv_seq_len); - int b = static_cast(idx / (static_cast(q_seq_len) * kv_seq_len)); - // All q positions read from the same src row for this batch - dst[idx] = src[static_cast(b) * kv_seq_len + kv_pos]; - } -} - -template -Status LaunchBroadcastBias2DToQSeq( - const T* src, - T* dst, - int batch_size, - int q_seq_len, - int kv_seq_len, - cudaStream_t stream, - int max_threads_per_block) { - int64_t total = static_cast(batch_size) * q_seq_len * kv_seq_len; - if (total == 0) { - return Status::OK(); - } - - int threads = static_cast(std::min(static_cast(max_threads_per_block), total)); - int64_t blocks = (total + threads - 1) / threads; - constexpr int64_t kMaxGridDimX = 65535; - unsigned int grid_size = static_cast(std::min(blocks, kMaxGridDimX)); - - BroadcastBias2DToQSeqKernel<<>>( - src, dst, batch_size, q_seq_len, kv_seq_len); - - return CUDA_CALL(cudaGetLastError()); -} - -template Status LaunchBroadcastBias2DToQSeq( - const float*, float*, int, int, int, cudaStream_t, int); -template Status LaunchBroadcastBias2DToQSeq<__half>( - const __half*, __half*, int, int, int, cudaStream_t, int); -template Status LaunchBroadcastBias2DToQSeq<__nv_bfloat16>( - const __nv_bfloat16*, __nv_bfloat16*, int, int, int, cudaStream_t, int); - // CUDA kernel to convert nonpad_kv_seqlen (int64) to seqlens_k (int32) for GQA. // GQA convention: seqlens_k = nonpad_kv_seqlen - 1 (last valid index, not count). // diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h index 73462d76c4c04..a0a6654e0d196 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h @@ -16,7 +16,7 @@ namespace cuda { // 3. True values should be contiguous, followed by contiguous False (padding) values // 4. 
The mask must be broadcastable to (batch_size, num_heads, q_seq_len, total_seq_len) // -// For 2D mask (batch_size, total_seq_len): uses the mask directly per batch +// For 2D mask (q_seq_len, total_seq_len): broadcasts over batch; uses first query position (row 0) // For 3D mask (num_heads, q_seq_len, total_seq_len): broadcasts across batches, uses first head/q // For 4D mask (B, H, q_seq_len, total_seq_len): uses first head, first q position // @@ -56,19 +56,6 @@ Status LaunchConvertBoolMaskToAttentionBias( cudaStream_t stream, int max_threads_per_block); -// Broadcast a 2D attention bias [B, kv_seq] → [B, 1, q_seq, kv_seq] by repeating -// each batch's row across all query positions. CUDA-graph-capturable replacement for -// the host-side batch×q_seq cudaMemcpyAsync loop. -template -Status LaunchBroadcastBias2DToQSeq( - const T* src, - T* dst, - int batch_size, - int q_seq_len, - int kv_seq_len, - cudaStream_t stream, - int max_threads_per_block); - // Convert nonpad_kv_seqlen (int64, per-batch valid KV lengths) to seqlens_k (int32) for GQA. // GQA convention: seqlens_k[i] = nonpad_kv_seqlen[i] - 1 (last valid index, not count). // diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index 2d848b030a63b..ab0c93db42dc1 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -168,21 +168,18 @@ def parity_check_gqa_prompt( print(f"DEBUG_NAN: First 5 NaN indices: {nan_indices[:5]}") # Compare KV cache (present_k should match k, present_v should match v) - # Skip for 4D BNSH prompt: the dispatcher doesn't populate present_key/value - # when inputs are already BNSH and there's no past (known limitation). 
- if not config.use_4d_bnsh or config.past_kv_sequence_length > 0: - k_ref_bnsh = k.transpose(1, 2) # BSNH -> BNSH - v_ref_bnsh = v.transpose(1, 2) # BSNH -> BNSH - - k_ref_np = k_ref_bnsh.to(torch.float32).detach().cpu().numpy() - v_ref_np = v_ref_bnsh.to(torch.float32).detach().cpu().numpy() - present_k_np = present_k.to(torch.float32).detach().cpu().numpy() - present_v_np = present_v.to(torch.float32).detach().cpu().numpy() - - print_diff_statistics(torch.tensor(present_k_np - k_ref_np), "present_k") - numpy.testing.assert_allclose(present_k_np, k_ref_np, rtol=rtol, atol=atol) - print_diff_statistics(torch.tensor(present_v_np - v_ref_np), "present_v") - numpy.testing.assert_allclose(present_v_np, v_ref_np, rtol=rtol, atol=atol) + k_ref_bnsh = k.transpose(1, 2) # BSNH -> BNSH + v_ref_bnsh = v.transpose(1, 2) # BSNH -> BNSH + + k_ref_np = k_ref_bnsh.to(torch.float32).detach().cpu().numpy() + v_ref_np = v_ref_bnsh.to(torch.float32).detach().cpu().numpy() + present_k_np = present_k.to(torch.float32).detach().cpu().numpy() + present_v_np = present_v.to(torch.float32).detach().cpu().numpy() + + print_diff_statistics(torch.tensor(present_k_np - k_ref_np), "present_k") + numpy.testing.assert_allclose(present_k_np, k_ref_np, rtol=rtol, atol=atol) + print_diff_statistics(torch.tensor(present_v_np - v_ref_np), "present_v") + numpy.testing.assert_allclose(present_v_np, v_ref_np, rtol=rtol, atol=atol) print_diff_statistics(torch.tensor(out_np - out_ref_np), "out") numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol, atol=atol) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py index 15d214da52c58..836469597d206 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py @@ -1327,5 +1327,179 @@ def test_mha_broadcast_mask_additive(self): numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol["fp16"], atol=atol["fp16"]) +# ################################################################################################# +# 2D Mask Broadcast Regression Test +# ################################################################################################# + + +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping 2D mask broadcast tests.") +class TestONNXAttentionMHA2DMaskBroadcast(unittest.TestCase): + """ + Regression test for 2D mask [q_seq, total_seq] broadcast correctness. + + Per ONNX spec, a 2D attention mask has shape [q_seq, total_seq] and broadcasts + over batch and heads. This test uses batch_size > q_seq with a non-uniform + mask (different values per row) to verify correct broadcast behavior. + + The old bug indexed the 2D mask by batch index instead of query position, + causing OOB reads when batch_size > q_seq. 
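+
+    Example: with q_seq=2 and batch_size=4 (as in the tests below), the old
+    code indexed mask rows 0..3 in a 2-row mask, reading out of bounds for
+    batches 2 and 3.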
+ """ + + def test_2d_additive_mask_batch_gt_qseq(self): + """2D additive mask [q_seq=2, total_seq=8] with batch=4 — would OOB on old code.""" + config = AttentionConfig( + batch_size=4, + q_sequence_length=2, + kv_sequence_length=8, + q_num_heads=4, + kv_num_heads=4, + head_size=64, + is_causal=0, + has_attn_mask=True, + attn_mask_dims=2, + attn_mask_type="additive", + ) + + torch.manual_seed(42) + device = "cuda" + torch_type = torch.float16 + mask_filter_value = torch.finfo(torch_type).min + + q = ( + torch.randn( + config.batch_size, + config.q_sequence_length, + config.q_num_heads, + config.head_size, + device=device, + dtype=torch_type, + ) + * 0.2 + ) + k = ( + torch.randn( + config.batch_size, + config.kv_sequence_length, + config.kv_num_heads, + config.head_size, + device=device, + dtype=torch_type, + ) + * 0.2 + ) + v = torch.randn_like(k) * 0.2 + + # Create a non-uniform 2D causal-style mask [q_seq=2, kv_seq=8]: + # Row 0: attend to positions 0-3, mask out 4-7 + # Row 1: attend to positions 0-5, mask out 6-7 + attn_mask = torch.zeros(config.q_sequence_length, config.kv_sequence_length, device=device, dtype=torch_type) + attn_mask[0, 4:] = mask_filter_value + attn_mask[1, 6:] = mask_filter_value + + # Reference: broadcast [2, 8] → [4, 4, 2, 8] (same mask for all batches/heads) + attn_bias_ref = attn_mask.unsqueeze(0).unsqueeze(0).expand(config.batch_size, config.q_num_heads, -1, -1) + + # PyTorch reference + out_ref, _ = attention_ref(q=q, k=k, v=v, attn_bias=attn_bias_ref, causal=False) + + # ORT path + out_ort, _, _ = attention_prompt_func( + q=q, + k=k, + v=v, + config=config, + attn_mask=attn_mask, + ep="CUDAExecutionProvider", + device=device, + ort_type=TensorProto.FLOAT16, + ) + out_ort = out_ort.reshape(config.batch_size, config.q_sequence_length, config.q_num_heads, config.head_size) + + out_np = out_ort.float().detach().cpu().numpy() + out_ref_np = out_ref.float().detach().cpu().numpy() + numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol["fp16"], atol=atol["fp16"]) + + def test_2d_bool_mask_batch_gt_qseq(self): + """2D bool mask [q_seq=2, total_seq=8] with batch=4 — would OOB on old code.""" + config = AttentionConfig( + batch_size=4, + q_sequence_length=2, + kv_sequence_length=8, + q_num_heads=4, + kv_num_heads=4, + head_size=64, + is_causal=0, + has_attn_mask=True, + attn_mask_dims=2, + attn_mask_type="bool", + ) + + torch.manual_seed(42) + device = "cuda" + torch_type = torch.float16 + + q = ( + torch.randn( + config.batch_size, + config.q_sequence_length, + config.q_num_heads, + config.head_size, + device=device, + dtype=torch_type, + ) + * 0.2 + ) + k = ( + torch.randn( + config.batch_size, + config.kv_sequence_length, + config.kv_num_heads, + config.head_size, + device=device, + dtype=torch_type, + ) + * 0.2 + ) + v = torch.randn_like(k) * 0.2 + + # Create non-uniform 2D bool mask [q_seq=2, kv_seq=8]: + # Row 0: True for positions 0-3, False for 4-7 + # Row 1: True for positions 0-5, False for 6-7 + attn_mask = torch.ones(config.q_sequence_length, config.kv_sequence_length, device=device, dtype=torch.bool) + attn_mask[0, 4:] = False + attn_mask[1, 6:] = False + + # Zero out K/V at padded positions (use row 0's pattern since 2D mask broadcasts) + # For bool mask, the effective seqlen for all batches comes from row 0 (most restrictive) + # Actually for cross-attention with different masking per query, just zero out nothing + # The reference uses key_padding_mask for padding, or we can use attn_bias directly + mask_filter_value = torch.finfo(torch_type).min 
+ attn_bias_ref = torch.where( + attn_mask.unsqueeze(0).unsqueeze(0).expand(config.batch_size, config.q_num_heads, -1, -1), + torch.tensor(0.0, dtype=torch_type, device=device), + torch.tensor(mask_filter_value, dtype=torch_type, device=device), + ) + + # PyTorch reference with explicit bias + out_ref, _ = attention_ref(q=q, k=k, v=v, attn_bias=attn_bias_ref, causal=False) + + # ORT path + out_ort, _, _ = attention_prompt_func( + q=q, + k=k, + v=v, + config=config, + attn_mask=attn_mask, + ep="CUDAExecutionProvider", + device=device, + ort_type=TensorProto.FLOAT16, + ) + out_ort = out_ort.reshape(config.batch_size, config.q_sequence_length, config.q_num_heads, config.head_size) + + out_np = out_ort.float().detach().cpu().numpy() + out_ref_np = out_ref.float().detach().cpu().numpy() + numpy.testing.assert_allclose(out_np, out_ref_np, rtol=rtol["fp16"], atol=atol["fp16"]) + + if __name__ == "__main__": unittest.main() From 68a1b0289b8908e7816c50bca7165b7b51e1e11d Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 5 Mar 2026 22:32:56 +0000 Subject: [PATCH 21/35] Address PR review feedback: transpose helpers, assert fixes, SEGFAULT fix, tests - Refactor 9 transpose type-switch blocks into TransposeBNSHtoBSNH/TransposeBSNHtoBNSH helpers (-117 lines) - Fix val > 0 assert regression in AttentionBias and FlashSeqlensK kernels (>= 0) - Add max(0, seq_len + seqlen_offset) clamp for negative seqlens_k edge case - Add explicit LaunchUngroup<__half>/<__nv_bfloat16> template instantiations (SEGFAULT fix) - Remove unused nonpad_bias_buffer variable - Move workspace_buffer to outer scope - Fix misleading docstring about all-false masks - Add bias alignment clarifying comment - Add AllMasked CUDA + decode all-false mask regression tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bert/cutlass_fmha/fmha_launch_template.h | 1 + .../cuda/bert/group_query_attention_impl.cu | 18 ++ .../core/providers/cuda/llm/attention.cc | 290 ++++++------------ .../providers/cuda/llm/attention_mask_impl.cu | 13 +- .../providers/cpu/llm/attention_op_test.cc | 165 +++++++++- 5 files changed, 278 insertions(+), 209 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index 6748aa8a29e4e..5e1fa591abaa2 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -264,6 +264,7 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { int num_keys = params.kv_sequence_length; int num_queries = params.sequence_length; int bias_strideM = num_keys; + // Broadcast dimensions use stride=0, which satisfies any alignment (0 % N == 0). int bias_strideH = params.broadcast_attn_bias_dim_1 ? 0 : num_queries * num_keys; int bias_strideB = params.broadcast_attn_bias_dim_0 ? 
0 diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu index 961c80748d228..d2f74386b701e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu @@ -1147,6 +1147,24 @@ template Status QkvToContext<__nv_bfloat16, __nv_fp8_e4m3>( GroupQueryAttentionData<__nv_bfloat16, __nv_fp8_e4m3>& data); #endif +// Explicit instantiations for cross-TU usage by core/providers/cuda/llm/attention.cc +template Status LaunchUngroup<__half>( + const GroupQueryAttentionParameters& parameters, + float2* k_buff, float2* v_buff, + const float2* k_og, const float2* v_og, + const int buff_seqlen, const int og_seqlen, + const bool is_bsnh, + cudaStream_t stream, + const int max_threads_per_block); +template Status LaunchUngroup<__nv_bfloat16>( + const GroupQueryAttentionParameters& parameters, + float2* k_buff, float2* v_buff, + const float2* k_og, const float2* v_og, + const int buff_seqlen, const int og_seqlen, + const bool is_bsnh, + cudaStream_t stream, + const int max_threads_per_block); + template Status LaunchUnpackQKV(const half* packed_qkv, half* unpacked_q, half* unpacked_k, half* unpacked_v, const int num_heads, const int kv_num_heads, const int head_size, const int sequence_length, const int batch_size, cudaStream_t stream, const int max_threads_per_block); template Status LaunchUnpackQKV<__nv_bfloat16, LAYOUT_BNSH>(const __nv_bfloat16* packed_qkv, __nv_bfloat16* unpacked_q, __nv_bfloat16* unpacked_k, __nv_bfloat16* unpacked_v, const int num_heads, const int kv_num_heads, const int head_size, const int sequence_length, const int batch_size, cudaStream_t stream, const int max_threads_per_block); diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index 5ae4f780a9220..47ac38afe161a 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -74,6 +74,39 @@ Attention::Attention(const OpKernelInfo& info) : CudaKernel(info) { ORT_ENFORCE(scale_ > 0 || std::isnan(scale_), "scale must be greater than 0 if specified"); } +// ============================================================================ +// Transpose helpers: eliminate repeated if-constexpr type-switch blocks. +// T is the ORT type (MLFloat16, BFloat16, float). The helpers map T to the +// corresponding CUDA type via ToCudaType::MappedType and forward to the +// overloaded Transpose functions in contrib_ops. 
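+//
+// Typical call site (sketch of the pattern used below):
+//   ORT_RETURN_IF_ERROR(TransposeBNSHtoBSNH<T>(
+//       parameters.batch_size, parameters.q_sequence_length,
+//       parameters.q_num_heads, parameters.head_size,
+//       Q->Data<T>(), q_bsnh_buffer.get(),
+//       cuda_stream, device_prop.maxThreadsPerBlock));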
+// ============================================================================
+
+template <typename T>
+static Status TransposeBNSHtoBSNH(int batch_size, int sequence_length,
+                                  int num_heads, int head_size,
+                                  const void* input, void* output,
+                                  cudaStream_t stream, int max_threads_per_block) {
+  using CudaT = typename ToCudaType<T>::MappedType;
+  return onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH(
+      batch_size, sequence_length, num_heads, head_size,
+      reinterpret_cast<const CudaT*>(input),
+      reinterpret_cast<CudaT*>(output),
+      stream, max_threads_per_block);
+}
+
+template <typename T>
+static Status TransposeBSNHtoBNSH(int batch_size, int sequence_length,
+                                  int num_heads, int head_size,
+                                  const void* input, void* output,
+                                  cudaStream_t stream, int max_threads_per_block) {
+  using CudaT = typename ToCudaType<T>::MappedType;
+  return onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH(
+      batch_size, sequence_length, num_heads, head_size,
+      reinterpret_cast<const CudaT*>(input),
+      reinterpret_cast<CudaT*>(output),
+      stream, max_threads_per_block);
+}
+
 // ============================================================================
 // RunFlashAttention: Direct flash attention kernel call
 // ============================================================================
@@ -149,26 +182,11 @@ Status Attention::RunFlashAttention(
     size_t q_bytes = sizeof(T) * parameters.batch_size * parameters.q_sequence_length *
                      parameters.q_num_heads * parameters.head_size;
     q_bsnh_buffer = GetScratchBuffer(q_bytes, context->GetComputeStream());
-    if constexpr (std::is_same_v) {
-      ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH(
-          parameters.batch_size, parameters.q_sequence_length,
-          parameters.q_num_heads, parameters.head_size,
-          reinterpret_cast(Q->Data()),
-          reinterpret_cast(q_bsnh_buffer.get()),
-          cuda_stream, device_prop.maxThreadsPerBlock));
-    } else if constexpr (std::is_same_v) {
-      ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH(
-          parameters.batch_size, parameters.q_sequence_length,
-          parameters.q_num_heads, parameters.head_size,
-          Q->Data(), reinterpret_cast(q_bsnh_buffer.get()),
-          cuda_stream, device_prop.maxThreadsPerBlock));
-    } else {
-      ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH(
-          parameters.batch_size, parameters.q_sequence_length,
-          parameters.q_num_heads, parameters.head_size,
-          Q->Data(), reinterpret_cast(q_bsnh_buffer.get()),
-          cuda_stream, device_prop.maxThreadsPerBlock));
-    }
+    ORT_RETURN_IF_ERROR(TransposeBNSHtoBSNH<T>(
+        parameters.batch_size, parameters.q_sequence_length,
+        parameters.q_num_heads, parameters.head_size,
+        Q->Data<T>(), q_bsnh_buffer.get(),
+        cuda_stream,
device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - K->Data(), reinterpret_cast(k_bsnh_buffer.get()), - cuda_stream, device_prop.maxThreadsPerBlock)); - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - V->Data(), reinterpret_cast(v_bsnh_buffer.get()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - K->Data(), reinterpret_cast(k_bsnh_buffer.get()), - cuda_stream, device_prop.maxThreadsPerBlock)); - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - V->Data(), reinterpret_cast(v_bsnh_buffer.get()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } + ORT_RETURN_IF_ERROR(TransposeBNSHtoBSNH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), k_bsnh_buffer.get(), + cuda_stream, device_prop.maxThreadsPerBlock)); + ORT_RETURN_IF_ERROR(TransposeBNSHtoBSNH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), v_bsnh_buffer.get(), + cuda_stream, device_prop.maxThreadsPerBlock)); k_new = k_bsnh_buffer.get(); v_new = v_bsnh_buffer.get(); } @@ -393,54 +385,22 @@ Status Attention::RunFlashAttention( // --- Transpose output BSNH → BNSH if input was 4D (BNSH) --- if (!is_bsnh && out_bsnh_buffer != nullptr) { - if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.v_head_size, - reinterpret_cast(out_bsnh_buffer.get()), - reinterpret_cast(Y->MutableData()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.v_head_size, - reinterpret_cast(out_bsnh_buffer.get()), - Y->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.v_head_size, - reinterpret_cast(out_bsnh_buffer.get()), - Y->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } + ORT_RETURN_IF_ERROR(TransposeBSNHtoBNSH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.v_head_size, + out_bsnh_buffer.get(), Y->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); } // --- Populate present_key/value (BNSH) from K/V (BSNH) --- // Skip for decode path where mha_fwd_kvcache already populated present buffers. 
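  // Two prompt cases below: 3D BSNH input is transposed into the BNSH present
  // tensors, while 4D BNSH input already matches the present layout, so a
  // plain device-to-device copy suffices.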
if (!present_kv_already_populated) { if (present_key != nullptr && is_bsnh) { - if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - reinterpret_cast(K->Data()), - reinterpret_cast(present_key->MutableData()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - K->Data(), present_key->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - K->Data(), present_key->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } + ORT_RETURN_IF_ERROR(TransposeBSNHtoBNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), present_key->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); } else if (present_key != nullptr && !is_bsnh) { // 4D BNSH prompt: K is already BNSH, just D2D copy to present CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( @@ -448,26 +408,11 @@ Status Attention::RunFlashAttention( K->SizeInBytes(), cudaMemcpyDeviceToDevice, cuda_stream)); } if (present_value != nullptr && is_bsnh) { - if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - reinterpret_cast(V->Data()), - reinterpret_cast(present_value->MutableData()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - V->Data(), present_value->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - V->Data(), present_value->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } + ORT_RETURN_IF_ERROR(TransposeBSNHtoBNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), present_value->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); } else if (present_value != nullptr && !is_bsnh) { // 4D BNSH prompt: V is already BNSH, just D2D copy to present CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( @@ -533,26 +478,11 @@ Status Attention::RunMemoryEfficientAttention( size_t q_bytes = sizeof(T) * parameters.batch_size * parameters.q_sequence_length * parameters.q_num_heads * parameters.head_size; q_bsnh_buffer = GetScratchBuffer(q_bytes, context->GetComputeStream()); - if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.head_size, - reinterpret_cast(Q->Data()), - reinterpret_cast(q_bsnh_buffer.get()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - 
ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.head_size, - Q->Data(), reinterpret_cast(q_bsnh_buffer.get()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BNSH_to_BSNH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.head_size, - Q->Data(), reinterpret_cast(q_bsnh_buffer.get()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } + ORT_RETURN_IF_ERROR(TransposeBNSHtoBSNH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.head_size, + Q->Data(), q_bsnh_buffer.get(), + cuda_stream, device_prop.maxThreadsPerBlock)); q_data = q_bsnh_buffer.get(); } @@ -620,7 +550,6 @@ Status Attention::RunMemoryEfficientAttention( // Handle attention mask → attention_bias conversion IAllocatorUniquePtr converted_mask_buffer; - IAllocatorUniquePtr nonpad_bias_buffer; const void* attn_bias_data = nullptr; bool broadcast_bias_dim_0 = false; bool broadcast_bias_dim_1 = false; @@ -740,67 +669,35 @@ Status Attention::RunMemoryEfficientAttention( p.stream = cuda_stream; p.output = out_data; + IAllocatorUniquePtr workspace_buffer; if (onnxruntime::contrib::cuda::MemoryEfficientAttentionParams::need_workspace( parameters.v_head_size, sizeof(T) == sizeof(float))) { size_t workspace_bytes = sizeof(float) * parameters.batch_size * parameters.q_sequence_length * parameters.q_num_heads * parameters.v_head_size; - auto workspace_buffer = GetScratchBuffer(workspace_bytes, context->GetComputeStream()); + workspace_buffer = GetScratchBuffer(workspace_bytes, context->GetComputeStream()); p.workspace = workspace_buffer.get(); - onnxruntime::contrib::cuda::run_memory_efficient_attention(p); } else { p.workspace = nullptr; - onnxruntime::contrib::cuda::run_memory_efficient_attention(p); } + onnxruntime::contrib::cuda::run_memory_efficient_attention(p); } // --- Transpose output BSNH → BNSH if input was 4D (BNSH) --- if (!is_bsnh && out_bsnh_buffer != nullptr) { - if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.v_head_size, - reinterpret_cast(out_bsnh_buffer.get()), - reinterpret_cast(Y->MutableData()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.v_head_size, - reinterpret_cast(out_bsnh_buffer.get()), - Y->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.q_sequence_length, - parameters.q_num_heads, parameters.v_head_size, - reinterpret_cast(out_bsnh_buffer.get()), - Y->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } + ORT_RETURN_IF_ERROR(TransposeBSNHtoBNSH( + parameters.batch_size, parameters.q_sequence_length, + parameters.q_num_heads, parameters.v_head_size, + out_bsnh_buffer.get(), Y->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); } // Populate present_key/present_value (BNSH) if requested if (present_key != nullptr && is_bsnh) { - if constexpr (std::is_same_v) { - 
ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - reinterpret_cast(K->Data()), - reinterpret_cast(present_key->MutableData()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - K->Data(), present_key->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.head_size, - K->Data(), present_key->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } + ORT_RETURN_IF_ERROR(TransposeBSNHtoBNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.head_size, + K->Data(), present_key->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); } else if (present_key != nullptr && !is_bsnh) { // 4D BNSH prompt: K is already BNSH, just D2D copy to present CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( @@ -808,26 +705,11 @@ Status Attention::RunMemoryEfficientAttention( K->SizeInBytes(), cudaMemcpyDeviceToDevice, cuda_stream)); } if (present_value != nullptr && is_bsnh) { - if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - reinterpret_cast(V->Data()), - reinterpret_cast(present_value->MutableData()), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else if constexpr (std::is_same_v) { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - V->Data(), present_value->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } else { - ORT_RETURN_IF_ERROR(onnxruntime::contrib::cuda::Transpose_BSNH_to_BNSH( - parameters.batch_size, parameters.kv_sequence_length, - parameters.kv_num_heads, parameters.v_head_size, - V->Data(), present_value->MutableData(), - cuda_stream, device_prop.maxThreadsPerBlock)); - } + ORT_RETURN_IF_ERROR(TransposeBSNHtoBNSH( + parameters.batch_size, parameters.kv_sequence_length, + parameters.kv_num_heads, parameters.v_head_size, + V->Data(), present_value->MutableData(), + cuda_stream, device_prop.maxThreadsPerBlock)); } else if (present_value != nullptr && !is_bsnh) { // 4D BNSH prompt: V is already BNSH, just D2D copy to present CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu index b80b040e27103..a2e394b908e1a 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu @@ -15,7 +15,7 @@ namespace cuda { // where padding starts. The sequence length is the index of first False. 
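// e.g. [True, True, False, False] -> seq_len = 2; an all-false row -> seq_len = 0.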
// // Validation (via CUDA_KERNEL_ASSERT, reported asynchronously): -// - The mask must start with True (first element must be True) +// - All-false masks are valid (represents fully masked / zero-length sequence) // - After the first False, all remaining elements must be False (contiguous padding) // // Handle broadcasting: @@ -101,7 +101,9 @@ __global__ void ConvertMaskToSeqlensKernel( // seqlens_k output: seq_len + seqlen_offset // GQA convention (seqlen_offset=-1): stores last valid index (count - 1) // Flash convention (seqlen_offset=0): stores actual count - seqlens_k[batch_idx] = seq_len + seqlen_offset; + // Clamp to 0: all-false mask (seq_len=0) with negative offset (e.g. GQA decode) + // would produce negative seqlens_k, which is undefined in Flash/GQA kernels. + seqlens_k[batch_idx] = max(0, seq_len + seqlen_offset); } // Convert boolean mask to sequence lengths with a configurable offset. @@ -199,6 +201,9 @@ __global__ void ConvertNonpadKvSeqlenToSeqlensKKernel( int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < batch_size) { int64_t val = nonpad_kv_seqlen[idx]; + // GQA convention: val is a token count (1-based), so val=0 is invalid. + // seqlens_k = val - 1 gives a last-valid-index; val=0 would yield -1, which is + // undefined in GQA's mha_fwd_kvcache kernel. CUDA_KERNEL_ASSERT(val > 0); CUDA_KERNEL_ASSERT(val <= static_cast(total_sequence_length)); if (min_expected_seqlen > 0) { @@ -240,7 +245,7 @@ __global__ void ConvertNonpadKvSeqlenToFlashSeqlensKKernel( int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < batch_size) { int64_t val = nonpad_kv_seqlen[idx]; - CUDA_KERNEL_ASSERT(val > 0); + CUDA_KERNEL_ASSERT(val >= 0); CUDA_KERNEL_ASSERT(val <= static_cast(total_sequence_length)); val = max(static_cast(0), min(val, static_cast(total_sequence_length))); seqlens_k[idx] = static_cast(val); // count, not index @@ -285,7 +290,7 @@ __global__ void ConvertNonpadKvSeqlenToAttentionBiasKernel( int b = static_cast(idx / (static_cast(q_seq_len) * total_seq_len)); int t = static_cast(idx % total_seq_len); int64_t valid_len = nonpad_kv_seqlen[b]; - CUDA_KERNEL_ASSERT(valid_len > 0 && valid_len <= static_cast(total_seq_len)); + CUDA_KERNEL_ASSERT(valid_len >= 0 && valid_len <= static_cast(total_seq_len)); valid_len = max(static_cast(0), min(valid_len, static_cast(total_seq_len))); attention_bias[idx] = (t < static_cast(valid_len)) ? T(0.0f) : T(mask_filter_value); } diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index b0c6c6d801c4b..0c3ea3d5c5eb6 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -481,6 +481,95 @@ TEST(AttentionTest, Attention4DAttnMaskBoolAllFalse) { ); } +// Regression test for T23: all-false bool mask in decode mode (past_sequence_length > 0). +// Before the T23 fix: seq_len=0 + negative seqlen_offset produced negative seqlens_k → UB/crash. +// After the T23 fix: clamped to max(0, ...) → uniform softmax → mean of V values. 
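+// Worked example for this test: total_seq = 4 with an all-false mask gives
+// seq_len = 0; GQA's seqlen_offset = -1 would yield seqlens_k = -1 unclamped,
+// whereas max(0, 0 + (-1)) = 0.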
+TEST(AttentionTest, Attention4DAttnMaskBoolAllFalseDecodeWithPast) { + int batch_size = 1; + int q_num_heads = 2; + int q_sequence_length = 1; // decode: single token + int head_size = 8; + int kv_sequence_length = 1; // appending 1 new token + int kv_num_heads = 2; + int v_head_size = 8; + int past_sequence_length = 3; // 3 tokens in cache → total_seq = 4 + + // Q: [1, 2, 1, 8] — values don't matter for uniform softmax test + std::vector q(batch_size * q_num_heads * q_sequence_length * head_size, 0.5f); + + // K: [1, 2, 1, 8] + std::vector k(batch_size * kv_num_heads * kv_sequence_length * head_size, 0.5f); + + // V: [1, 2, 1, 8] — new token values (will be concatenated with past_value) + std::vector v = { + // head 0 + 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, + // head 1 + 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f}; + + // past_key: [1, 2, 3, 8] + std::vector past_key(batch_size * kv_num_heads * past_sequence_length * head_size, 0.5f); + + // past_value: [1, 2, 3, 8] — distinct per-row values so mean is meaningful + std::vector past_value = { + // head 0: 3 past positions + 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, + 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, + 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, + // head 1: 3 past positions + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, + 0.6f, 0.6f, 0.6f, 0.6f, 0.6f, 0.6f, 0.6f, 0.6f, + 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f}; + + // Bool mask: [1, 4] — all false (every position masked) + std::initializer_list m = {false, false, false, false}; + + // present_key = concat(past_key, k) along seq dim → [1, 2, 4, 8] + // past_key is all 0.5, new k is all 0.5 → present_key is all 0.5 + int total_sequence_length = past_sequence_length + kv_sequence_length; + std::vector present_key(batch_size * kv_num_heads * total_sequence_length * head_size, 0.5f); + + // present_value = concat(past_value, v) along seq dim → [1, 2, 4, 8] + std::vector present_value = { + // head 0: past rows (0.1, 0.2, 0.3) + new row (0.4) + 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, + 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f, + 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, + 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, + // head 1: past rows (0.5, 0.6, 0.7) + new row (0.8) + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, + 0.6f, 0.6f, 0.6f, 0.6f, 0.6f, 0.6f, 0.6f, 0.6f, + 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, + 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f, 0.8f}; + + // With all-false mask, softmax produces uniform weights: 1/4 per position. 
+ // Output = mean of V rows (past_value concat new_v): + // head 0: mean(0.1, 0.2, 0.3, 0.4) = 0.25 + // head 1: mean(0.5, 0.6, 0.7, 0.8) = 0.65 + std::vector y = { + // head 0 + 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, + // head 1 + 0.65f, 0.65f, 0.65f, 0.65f, 0.65f, 0.65f, 0.65f, 0.65f}; + + ASSERT_EQ(q.size(), batch_size * q_num_heads * q_sequence_length * head_size); + ASSERT_EQ(k.size(), batch_size * kv_num_heads * kv_sequence_length * head_size); + ASSERT_EQ(v.size(), batch_size * kv_num_heads * kv_sequence_length * v_head_size); + ASSERT_EQ(m.size(), q_sequence_length * total_sequence_length); + ASSERT_EQ(past_key.size(), batch_size * kv_num_heads * past_sequence_length * head_size); + ASSERT_EQ(past_value.size(), batch_size * kv_num_heads * past_sequence_length * v_head_size); + ASSERT_EQ(present_key.size(), batch_size * kv_num_heads * total_sequence_length * head_size); + ASSERT_EQ(present_value.size(), batch_size * kv_num_heads * total_sequence_length * v_head_size); + ASSERT_EQ(y.size(), batch_size * q_num_heads * q_sequence_length * v_head_size); + + RunTest4D(batch_size, q_num_heads, q_sequence_length, head_size, kv_sequence_length, kv_num_heads, v_head_size, past_sequence_length, + q, k, v, std::vector(), m, past_key, past_value, + -1, -1, std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), -1, TensorType::kFloat, + y, present_key, present_value, std::vector(), + false, false, true // disable_cpu, disable_cuda, disable_dml + ); +} + TEST(AttentionTest, Attention4DDefaultFloat16) { int batch_size = 2; // Q.shape[0] int q_num_heads = 3; // Q.shape[1] @@ -1532,10 +1621,84 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_AllMasked) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); + if (HasCudaEnvironment(0)) { + execution_providers.push_back(DefaultCudaExecutionProvider()); + } test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } -// Edge case: nonpad_kv_seqlen = total_sequence_length (no positions masked). +// Edge case: nonpad_kv_seqlen=0 exercised on CUDA Flash/MEA path with fp16 and GQA. +// This verifies the val >= 0 fix in ConvertNonpadKvSeqlenToFlashSeqlensKKernel (Kernel B) +// and ConvertNonpadKvSeqlenToAttentionBiasKernel (Kernel C). +TEST(AttentionTest, Attention_NonPadKVSeqLen_AllMasked_FP16_GQA) { + if (!HasCudaEnvironment(530)) { + return; // fp16 requires SM 5.3+ + } + + OpTester test("Attention", 24, onnxruntime::kOnnxDomain); + + // batch=2: batch 0 is fully masked (nonpad=0), batch 1 has 2 valid positions (nonpad=2). + // GQA: q_num_heads=4, kv_num_heads=2 → triggers Flash/MEA GQA path. + // 4D BNSH: [B, N, S, H] + int batch_size = 2; + int q_num_heads = 4; + int kv_num_heads = 2; + int q_sequence_length = 1; + int kv_sequence_length = 4; + int head_size = 64; + + int q_elements = batch_size * q_num_heads * q_sequence_length * head_size; + int k_elements = batch_size * kv_num_heads * kv_sequence_length * head_size; + int v_elements = k_elements; + + // Use constant values for predictable uniform-softmax results. + std::vector q(q_elements, 1.0f); + std::vector k(k_elements, 1.0f); + // V: each row = row_index * 0.1 for distinct values across positions. 
+ std::vector v(v_elements); + for (int b = 0; b < batch_size; b++) { + for (int n = 0; n < kv_num_heads; n++) { + for (int s = 0; s < kv_sequence_length; s++) { + float val = static_cast(s + 1) * 0.1f; + for (int h = 0; h < head_size; h++) { + v[(b * kv_num_heads * kv_sequence_length + n * kv_sequence_length + s) * head_size + h] = val; + } + } + } + } + + test.AddAttribute("kv_num_heads", kv_num_heads); + test.AddAttribute("q_num_heads", q_num_heads); + + test.AddInput("Q", {batch_size, q_num_heads, q_sequence_length, head_size}, ToFloat16(q)); + test.AddInput("K", {batch_size, kv_num_heads, kv_sequence_length, head_size}, ToFloat16(k)); + test.AddInput("V", {batch_size, kv_num_heads, kv_sequence_length, head_size}, ToFloat16(v)); + test.AddOptionalInputEdge(); + test.AddOptionalInputEdge(); + test.AddOptionalInputEdge(); + // batch 0: nonpad=0 (all masked), batch 1: nonpad=2 (first 2 of 4 positions valid) + test.AddInput("nonpad_kv_seqlen", {batch_size}, {0, 2}); + + // Expected output shape: [B, q_num_heads, q_seq, head_size] + // Batch 0 (all masked, nonpad=0): Flash/MEA returns zeros for fully-masked sequences. + // Batch 1 (2 valid): softmax over positions 0..1 (equal K, so uniform) + // mean of first 2 rows = (0.1 + 0.2) / 2 = 0.15 for each head dim + int y_elements = batch_size * q_num_heads * q_sequence_length * head_size; + std::vector expected_y(y_elements, 0.0f); // batch 0 = zeros + for (int n = 0; n < q_num_heads; n++) { + for (int h = 0; h < head_size; h++) { + expected_y[(1 * q_num_heads + n) * head_size + h] = 0.15f; // batch 1 + } + } + test.AddOutput("Y", {batch_size, q_num_heads, q_sequence_length, head_size}, + ToFloat16(expected_y), false, 0, 0.02f); + test.AddOptionalOutputEdge(); + test.AddOptionalOutputEdge(); + + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} TEST(AttentionTest, Attention_NonPadKVSeqLen_NoneMasked) { OpTester test("Attention", 24, onnxruntime::kOnnxDomain); From 27ee9afde244f83798931bd7f00d8a9af3ace593 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 5 Mar 2026 23:16:24 +0000 Subject: [PATCH 22/35] Refine comments, fix docstrings, and remove dead code - Fix outdated/incorrect comments across 5+ files - Delete dead LaunchConvertNonpadKvSeqlenToSeqlensK function (-61 lines) - Update eligibility docstrings for Flash/MEA/Unfused paths - Correct head_size limit, MEA SM/dtype docs, val=0 descriptions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cuda/llm/attention.cc | 20 ++++--- .../providers/cuda/llm/attention_mask_impl.cu | 59 ++----------------- .../providers/cuda/llm/attention_mask_impl.h | 24 ++------ .../providers/cpu/llm/attention_op_test.cc | 16 ++--- .../test_onnx_attention/test_gqa.py | 4 +- 5 files changed, 31 insertions(+), 92 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index 47ac38afe161a..b7f3765db1ec4 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -67,7 +67,7 @@ Attention::Attention(const OpKernelInfo& info) : CudaKernel(info) { qk_matmul_output_mode_ == attention_helper::QKMatMulOutputMode::kQKMask || qk_matmul_output_mode_ == attention_helper::QKMatMulOutputMode::kQKSoftCap || qk_matmul_output_mode_ == attention_helper::QKMatMulOutputMode::kQKSoftMax, - "qk_matmul_output_mode must be 0, 1, 
2, or 3."); + "qk_matmul_output_mode must be one of: kNone(-1), kQK(0), kQKMask(1), kQKSoftCap(2), kQKSoftMax(3)."); scale_ = info.GetAttrOrDefault("scale", std::numeric_limits::quiet_NaN()); softcap_ = info.GetAttrOrDefault("softcap", 0.0f); softmax_precision_ = static_cast(info.GetAttrOrDefault("softmax_precision", 0)); @@ -118,7 +118,7 @@ static Status TransposeBSNHtoBNSH(int batch_size, int sequence_length, // - 4D BNSH: transposes Q/K/V to BSNH before kernel // Path 3: no past, no mask (prompt) -> mha_fwd // Eligibility: fp16/bf16, head_size==v_head_size, no output_qk, no softcap, -// no softmax_precision, (no mask OR bool mask + past) +// no softmax_precision, (no mask OR bool mask + past OR nonpad_kv_seqlen) // // PERFORMANCE NOTE: ONNX Attention's decode path is inherently ~15-30% slower than // contrib GQA's decode path for grouped-query attention workloads. This is because: @@ -448,8 +448,9 @@ Status Attention::RunFlashAttention( // Path 1: nonpad_kv_seqlen (opset 24 external cache) -> has_custom_right_padding mode // Path 2: no past, with mask (prompt) -> standard MEA with additive bias // Path 3: no past, no mask (prompt) -> standard MEA -// Eligibility: fp16/bf16, SM75+, no past_key (decode excluded), -// head_size <= 128, bias stride alignment +// Eligibility: see has_memory_efficient_attention() (SM50+/53+/80+ by dtype, +// head_size <= 1024), plus: no output_qk, no softcap, +// no softmax_precision, no past_key (decode excluded), bias stride alignment // template Status Attention::RunMemoryEfficientAttention( @@ -503,9 +504,9 @@ Status Attention::RunMemoryEfficientAttention( IAllocatorUniquePtr v_expand_buffer; if (is_gqa) { - // GQA+MEA only works with fp16/bf16 (MEA doesn't support fp32). - // Use if constexpr to avoid instantiating LaunchUngroup which has no explicit - // template instantiation in group_query_attention_impl.cu. + // GQA+MEA only works with fp16/bf16 (LaunchUngroup lacks fp32 template instantiation + // in group_query_attention_impl.cu). + // Use if constexpr to avoid instantiating LaunchUngroup. if constexpr (std::is_same_v) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "GQA with Memory Efficient Attention requires fp16 or bf16, not fp32."); @@ -744,7 +745,8 @@ Status Attention::RunMemoryEfficientAttention( // Universal fallback via MHA's QkvToContext. // Path 1: nonpad_kv_seqlen -> converts to attention_bias [B, q_seq, total_seq] // Path 2: all other cases -> passes mask/bias directly -// Supports: all dtypes (fp16/bf16/fp32), all mask types, all head sizes +// Supports: all dtypes (fp16/bf16/fp32), all mask types (bool/float/none), all head sizes +// Not supported: softcap, softmax_precision, output_qk modes beyond kNone/kQK // Limitation: MHA only (q_num_heads must equal kv_num_heads) // template @@ -909,7 +911,7 @@ Status Attention::RunUnfusedAttention( // ============================================================================ // MHA path (q_num_heads == kv_num_heads): uses direct kernel dispatch cascade // flash → memory efficient → unfused -// GQA path (q_num_heads != kv_num_heads): uses flash (with kvcache) or MEA +// GQA path (q_num_heads != kv_num_heads): uses flash (handles GQA natively) or MEA // (with head expansion via LaunchUngroup). Unfused fallback not yet supported for GQA. 
// ============================================================================ template diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu index a2e394b908e1a..aa627518d2902 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu @@ -99,10 +99,10 @@ __global__ void ConvertMaskToSeqlensKernel( } // seqlens_k output: seq_len + seqlen_offset - // GQA convention (seqlen_offset=-1): stores last valid index (count - 1) - // Flash convention (seqlen_offset=0): stores actual count - // Clamp to 0: all-false mask (seq_len=0) with negative offset (e.g. GQA decode) - // would produce negative seqlens_k, which is undefined in Flash/GQA kernels. + // Decode with past (seqlen_offset=-kv_seq_len): pre-append cache count + // Prompt/MEA (seqlen_offset=0): actual token count + // Clamp to 0: all-false mask (seq_len=0) with negative decode offset + // would produce negative seqlens_k, which is undefined in Flash kernels. seqlens_k[batch_idx] = max(0, seq_len + seqlen_offset); } @@ -186,56 +186,7 @@ template Status LaunchConvertBoolMaskToAttentionBias<__half>( template Status LaunchConvertBoolMaskToAttentionBias<__nv_bfloat16>( const bool*, __nv_bfloat16*, int64_t, float, cudaStream_t, int); -// CUDA kernel to convert nonpad_kv_seqlen (int64) to seqlens_k (int32) for GQA. -// GQA convention: seqlens_k = nonpad_kv_seqlen - 1 (last valid index, not count). -// -// Validation (via CUDA_KERNEL_ASSERT, reported asynchronously): -// - val must be > 0 (nonpad_kv_seqlen=0 → seqlens_k=0 → attends to garbage at pos 0) -// - val must be <= total_sequence_length (out of bounds) -__global__ void ConvertNonpadKvSeqlenToSeqlensKKernel( - const int64_t* __restrict__ nonpad_kv_seqlen, - int* __restrict__ seqlens_k, - const int batch_size, - const int total_sequence_length, - const int min_expected_seqlen) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < batch_size) { - int64_t val = nonpad_kv_seqlen[idx]; - // GQA convention: val is a token count (1-based), so val=0 is invalid. - // seqlens_k = val - 1 gives a last-valid-index; val=0 would yield -1, which is - // undefined in GQA's mha_fwd_kvcache kernel. - CUDA_KERNEL_ASSERT(val > 0); - CUDA_KERNEL_ASSERT(val <= static_cast(total_sequence_length)); - if (min_expected_seqlen > 0) { - CUDA_KERNEL_ASSERT(val >= static_cast(min_expected_seqlen)); - } - val = max(static_cast(1), min(val, static_cast(total_sequence_length))); - seqlens_k[idx] = static_cast(val) - 1; - } -} - -Status LaunchConvertNonpadKvSeqlenToSeqlensK( - const int64_t* nonpad_kv_seqlen, - int* seqlens_k, - int batch_size, - int total_sequence_length, - cudaStream_t stream, - int max_threads_per_block, - int min_expected_seqlen) { - if (batch_size == 0) { - return Status::OK(); - } - - int threads = std::min(batch_size, max_threads_per_block); - int blocks = (batch_size + threads - 1) / threads; - - ConvertNonpadKvSeqlenToSeqlensKKernel<<>>( - nonpad_kv_seqlen, seqlens_k, batch_size, total_sequence_length, min_expected_seqlen); - - return CUDA_CALL(cudaGetLastError()); -} - -// Like ConvertNonpadKvSeqlenToSeqlensKKernel but produces the actual count (no -1 offset). +// Convert nonpad_kv_seqlen (int64) to seqlens_k (int32) as actual token count. // Flash attention's mha_fwd_kvcache expects seqlens_k_ = number of valid tokens. 
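The convention difference that justified deleting the GQA variant fits in two lines (an illustrative restatement, not library code):

    #include <cstdint>

    // Kept (flash): seqlens_k carries a token COUNT, so 0 (fully masked) is legal.
    int32_t ToFlashSeqlenK(int64_t nonpad_kv_seqlen) {
      return static_cast<int32_t>(nonpad_kv_seqlen);
    }

    // Deleted (GQA): seqlens_k carried a LAST-VALID INDEX (count - 1), which is
    // why nonpad_kv_seqlen == 0 had to be rejected on that path.
    int32_t ToGqaSeqlenK(int64_t nonpad_kv_seqlen) {
      return static_cast<int32_t>(nonpad_kv_seqlen) - 1;
    }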
__global__ void ConvertNonpadKvSeqlenToFlashSeqlensKKernel( const int64_t* __restrict__ nonpad_kv_seqlen, diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h index a0a6654e0d196..1a049c6be2b49 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h @@ -12,7 +12,7 @@ namespace cuda { // // The mask is expected to have the following properties: // 1. It represents right-padding only (valid tokens first, padding at the end) -// 2. Each batch's mask should start with True (valid) values +// 2. All-false masks (zero-length sequence) are valid; otherwise mask should start with True // 3. True values should be contiguous, followed by contiguous False (padding) values // 4. The mask must be broadcastable to (batch_size, num_heads, q_seq_len, total_seq_len) // @@ -28,7 +28,7 @@ namespace cuda { // -N: subtract N from count (for decode with mha_fwd_kvcache where N=kv_sequence_length, // giving the number of tokens already in cache BEFORE appending new ones) // -// Note: Mask validity (right-padding convention, starts with True, contiguous True/False) +// Note: Mask validity (right-padding convention, contiguous True/False) // is checked asynchronously via CUDA_KERNEL_ASSERT inside the kernel. Invalid masks will // trigger a device-side assertion failure. Status LaunchConvertMaskToFlashSeqlensK( @@ -56,23 +56,9 @@ Status LaunchConvertBoolMaskToAttentionBias( cudaStream_t stream, int max_threads_per_block); -// Convert nonpad_kv_seqlen (int64, per-batch valid KV lengths) to seqlens_k (int32) for GQA. -// GQA convention: seqlens_k[i] = nonpad_kv_seqlen[i] - 1 (last valid index, not count). -// -// IMPORTANT: nonpad_kv_seqlen must be >= 1 for every batch element. -// A value of 0 would produce seqlens_k=0, which GQA interprets as "1 valid token at -// position 0" (last-valid-index convention), causing silent attention to garbage data. -Status LaunchConvertNonpadKvSeqlenToSeqlensK( - const int64_t* nonpad_kv_seqlen, - int* seqlens_k, - int batch_size, - int total_sequence_length, - cudaStream_t stream, - int max_threads_per_block, - int min_expected_seqlen = 0); - -// Like LaunchConvertNonpadKvSeqlenToSeqlensK but produces the actual count (no -1 offset). -// Flash attention's mha_fwd_kvcache expects seqlens_k_ = number of valid tokens. +// Convert nonpad_kv_seqlen (int64, per-batch valid KV lengths) to seqlens_k (int32) +// as actual token count. Flash attention's mha_fwd_kvcache expects seqlens_k_ = number +// of valid tokens. Status LaunchConvertNonpadKvSeqlenToFlashSeqlensK( const int64_t* nonpad_kv_seqlen, int* seqlens_k, diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index 0c3ea3d5c5eb6..e868efdb7289b 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -481,15 +481,15 @@ TEST(AttentionTest, Attention4DAttnMaskBoolAllFalse) { ); } -// Regression test for T23: all-false bool mask in decode mode (past_sequence_length > 0). -// Before the T23 fix: seq_len=0 + negative seqlen_offset produced negative seqlens_k → UB/crash. -// After the T23 fix: clamped to max(0, ...) → uniform softmax → mean of V values. +// Regression test: all-false bool mask in decode mode (past_sequence_length > 0). +// Before the fix: seq_len=0 + negative seqlen_offset produced negative seqlens_k → UB/crash. 
+// After the fix: clamped to max(0, ...) → uniform softmax → mean of V values. TEST(AttentionTest, Attention4DAttnMaskBoolAllFalseDecodeWithPast) { int batch_size = 1; int q_num_heads = 2; - int q_sequence_length = 1; // decode: single token + int q_sequence_length = 1; // decode: single token int head_size = 8; - int kv_sequence_length = 1; // appending 1 new token + int kv_sequence_length = 1; // appending 1 new token int kv_num_heads = 2; int v_head_size = 8; int past_sequence_length = 3; // 3 tokens in cache → total_seq = 4 @@ -1628,8 +1628,8 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_AllMasked) { } // Edge case: nonpad_kv_seqlen=0 exercised on CUDA Flash/MEA path with fp16 and GQA. -// This verifies the val >= 0 fix in ConvertNonpadKvSeqlenToFlashSeqlensKKernel (Kernel B) -// and ConvertNonpadKvSeqlenToAttentionBiasKernel (Kernel C). +// This verifies the val >= 0 assertion in ConvertNonpadKvSeqlenToFlashSeqlensKKernel +// and ConvertNonpadKvSeqlenToAttentionBiasKernel. TEST(AttentionTest, Attention_NonPadKVSeqLen_AllMasked_FP16_GQA) { if (!HasCudaEnvironment(530)) { return; // fp16 requires SM 5.3+ @@ -1687,7 +1687,7 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_AllMasked_FP16_GQA) { std::vector expected_y(y_elements, 0.0f); // batch 0 = zeros for (int n = 0; n < q_num_heads; n++) { for (int h = 0; h < head_size; h++) { - expected_y[(1 * q_num_heads + n) * head_size + h] = 0.15f; // batch 1 + expected_y[(1 * q_num_heads + n) * head_size + h] = 0.15f; // batch 1 } } test.AddOutput("Y", {batch_size, q_num_heads, q_sequence_length, head_size}, diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index 674b165e1a822..9daf7eb6c9bf8 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -833,8 +833,8 @@ def test_gqa_prompt_memory_efficient(self, name, config): ) # Note: GQA past tests removed — MEA is ineligible when past_key is present - # (ComputeInternal requires past_key == nullptr for MEA). GQA past goes through - # flash attention regardless of ORT_DISABLE_FLASH_ATTENTION. + # (ComputeInternal requires past_key == nullptr for MEA). GQA past requires + # flash attention. The ONNX Attention op does not honor ORT_DISABLE_FLASH_ATTENTION. @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") From 381cd83c44f61d637f165cec7643cc1835056ac9 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 5 Mar 2026 23:30:09 +0000 Subject: [PATCH 23/35] Add clarifying comment for DispatchIsAligned bias alignment check Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../cuda/bert/cutlass_fmha/fmha_launch_template.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index 5e1fa591abaa2..29bb4fba6a09a 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -258,8 +258,18 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { params.qk_head_size % AlignedAK::kAlignmentK == 0 && params.v_head_size % AlignedAK::kAlignmentV == 0; - // Attention bias strides must also satisfy alignment requirements. 
- // Mirror the checks in AttentionKernel::check_supported to avoid ORT_ENFORCE crashes. + // Bias stride alignment check: route to the unaligned kernel when bias strides + // don't satisfy the aligned kernel's kAlignmentQ requirement. + // + // kAlignmentQ is template-dependent (kernel_forward.h:414): + // isAligned=true: kAlignmentQ = DefaultConfig::kAlignmentA (8 for fp16/bf16 SM75+) + // isAligned=false: kAlignmentQ = GemmType::kMinimumAlignment (4 for fp16/bf16 SM75+) + // So check_supported (line 632) enforces DIFFERENT thresholds per path. + // + // The ONNX Attention kernel (core/providers/cuda/llm/attention.cc) gates MEA eligibility + // at kMinimumAlignment (4), allowing strides like 12 that the unaligned kernel handles. + // Without this check, such inputs dispatch to the aligned kernel where 12%8≠0 crashes. + // Contrib MHA gates at 4*sizeof(T)=8 for fp16, making this check redundant there. if (params.attn_bias != nullptr) { int num_keys = params.kv_sequence_length; int num_queries = params.sequence_length; From a1582510d26056711a2ba36f551f106f2ed5d338 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 18:11:51 +0000 Subject: [PATCH 24/35] Fix SM skip thresholds in attention tests (T25) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lowered TestTensorScatterAttentionCUDAFP16 from SM80+ to SM53 since it doesn't force Flash — the cascade dispatcher in attention.cc automatically picks the best available backend. Added explanatory comments to 5 Flash-gated test classes in test_gqa.py that must keep SM80+ because they explicitly force Flash via ORT_DISABLE_FLASH_ATTENTION=0. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../test_onnx_attention/test_gqa.py | 21 ++++++++++++++++--- .../test_tensorscatter_attention.py | 9 ++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index 9daf7eb6c9bf8..abac3b3fef078 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -742,7 +742,10 @@ def gqa_past_padding_test_cases(): @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") class TestONNXAttentionFlashGQA(unittest.TestCase): - """Test ONNX Attention op (opset 23) GQA path with Flash Attention.""" + """Test ONNX Attention op (opset 23) GQA path with Flash Attention. + + Requires SM80+: tests explicitly force Flash via ORT_DISABLE_FLASH_ATTENTION=0. + """ @parameterized.expand(gqa_prompt_test_cases()) def test_gqa_prompt_flash(self, name, config): @@ -775,7 +778,11 @@ def test_gqa_past_flash(self, name, config): @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") class TestONNXAttentionFlashGQABF16(unittest.TestCase): - """Test ONNX Attention op (opset 23) GQA path with Flash Attention using BFloat16.""" + """Test ONNX Attention op (opset 23) GQA path with Flash Attention using BFloat16. + + Requires SM80+: tests explicitly force Flash via ORT_DISABLE_FLASH_ATTENTION=0, + and BFloat16 requires Ampere or higher. 
+ """ @parameterized.expand(gqa_prompt_test_cases()) def test_gqa_prompt_flash_bf16(self, name, config): @@ -842,6 +849,10 @@ class TestONNXAttentionPaddingMaskGQA(unittest.TestCase): """ Test ONNX Attention op (opset 23) GQA path with boolean padding masks. + Requires SM80+: decode+padding tests explicitly force Flash via + ORT_DISABLE_FLASH_ATTENTION=0. Prompt+padding uses MEA fallback and is + tested separately in TestONNXAttentionPaddingMaskMemoryEfficientGQA. + These tests verify that the boolean attn_mask is correctly converted to sequence lengths on GPU and that the attention computation respects the padding. Tests cover 2D, 3D, and 4D mask shapes. @@ -1096,7 +1107,10 @@ def gqa_nonpad_kv_seqlen_cpu_test_cases(): @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") class TestONNXAttentionGQANonpadKVSeqlen(unittest.TestCase): - """Test ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen (Flash Attention).""" + """Test ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen (Flash Attention). + + Requires SM80+: tests explicitly force Flash via ORT_DISABLE_FLASH_ATTENTION=0. + """ @parameterized.expand(gqa_nonpad_kv_seqlen_test_cases()) def test_gqa_nonpad_kv_seqlen_flash(self, name, config, seqlens): @@ -1217,6 +1231,7 @@ class TestONNXAttentionGQA4DBNSH(unittest.TestCase): """ Test GQA with 4D BNSH input format [batch, num_heads, seq, head_size]. + Requires SM80+: tests explicitly force Flash via ORT_DISABLE_FLASH_ATTENTION=0. The C++ attention op detects 4D inputs and sets transpose_output=false. Flash/MEA always expect BSNH, so the dispatcher transposes Q internally. """ diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py index 521b8469ffa25..cb7dcb09d596e 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py @@ -510,9 +510,14 @@ def test_tensorscatter_attention_cpu_fp32( numpy.testing.assert_allclose(output, ref_output, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) -@unittest.skipIf(not has_flash_attention(), "Flash Attention (SM80+) is not available, skipping tests.") +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping tests.") class TestTensorScatterAttentionCUDAFP16(unittest.TestCase): - """Test TensorScatter + Attention (opset 24) on CUDA with float16 and IO Binding.""" + """Test TensorScatter + Attention (opset 24) on CUDA with float16 and IO Binding. + + On SM80+ Flash Attention is used; on SM75+ MEA handles the fallback; + on older GPUs the unfused path runs. The cascade in attention.cc picks + the best available backend automatically. + """ @parameterized.expand(cuda_fp16_test_cases()) def test_tensorscatter_attention_cuda_fp16( From 73da07ac4416cabe8e45be811dcbd0daffb7d737 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 18:20:44 +0000 Subject: [PATCH 25/35] Address Copilot review: env var support, SM skip fix, nonpad+mask fallback T24: Add ORT_DISABLE_FLASH_ATTENTION and ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION support to ONNX Attention kernel, matching contrib MHA pattern. T25: Lower SM threshold for tensorscatter fp16 tests to SM53, allowing MEA/unfused fallback testing on older GPUs. T26: Replace ORT_ENFORCE crash with graceful MEA fallback when both nonpad_kv_seqlen and attn_mask are present. 
Flash does not support this combination; MEA handles it natively. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cuda/llm/attention.cc | 75 +++++++++++--- .../core/providers/cuda/llm/attention.h | 2 + .../providers/cpu/llm/attention_op_test.cc | 97 +++++++++++++++++++ 3 files changed, 161 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index b7f3765db1ec4..d39164953aed6 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -72,6 +72,10 @@ Attention::Attention(const OpKernelInfo& info) : CudaKernel(info) { softcap_ = info.GetAttrOrDefault("softcap", 0.0f); softmax_precision_ = static_cast(info.GetAttrOrDefault("softmax_precision", 0)); ORT_ENFORCE(scale_ > 0 || std::isnan(scale_), "scale must be greater than 0 if specified"); + + const auto* kernel_options = this->GetAttentionKernelOptions(); + disable_flash_attention_ = !std::is_same::value || !kernel_options->UseFlashAttention(); + disable_memory_efficient_attention_ = !kernel_options->UseEfficientAttention(); } // ============================================================================ @@ -567,6 +571,39 @@ Status Attention::RunMemoryEfficientAttention( cuda_stream, device_prop.maxThreadsPerBlock)); + // When attn_mask is also provided, convert it to additive attn_bias so MEA + // applies both custom right padding (seqlens_k) and the attention mask (attn_bias). + if (attn_mask != nullptr) { + if (attn_mask->IsDataType()) { + using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; + int64_t num_elements = attn_mask->Shape().Size(); + converted_mask_buffer = GetScratchBuffer( + num_elements * sizeof(NativeCudaT), context->GetComputeStream()); + float mask_filter_value = static_cast(std::numeric_limits::lowest()); + ORT_RETURN_IF_ERROR(LaunchConvertBoolMaskToAttentionBias( + attn_mask->Data(), + reinterpret_cast(converted_mask_buffer.get()), + num_elements, mask_filter_value, cuda_stream, + device_prop.maxThreadsPerBlock)); + attn_bias_data = converted_mask_buffer.get(); + } else { + attn_bias_data = attn_mask->Data(); + } + + size_t mask_dims = attn_mask->Shape().NumDimensions(); + auto dims = attn_mask->Shape().GetDims(); + if (mask_dims == 2) { + broadcast_bias_dim_0 = true; + broadcast_bias_dim_1 = true; + } else if (mask_dims == 3) { + broadcast_bias_dim_0 = true; + broadcast_bias_dim_1 = dims[0] == 1; + } else { + broadcast_bias_dim_0 = dims[0] == 1; + broadcast_bias_dim_1 = dims[1] == 1; + } + } + onnxruntime::contrib::cuda::MemoryEfficientAttentionParams p; p.sm = sm; p.is_half = std::is_same::value; @@ -583,10 +620,12 @@ Status Attention::RunMemoryEfficientAttention( p.scale = parameters.scale; p.seqlen_k_ptr = seqlens_k_buffer.get(); p.has_custom_right_padding = true; + p.broadcast_attn_bias_dim_0 = broadcast_bias_dim_0; + p.broadcast_attn_bias_dim_1 = broadcast_bias_dim_1; p.query = q_data; p.key = k_data; p.value = v_data; - p.attn_bias = nullptr; + p.attn_bias = attn_bias_data; p.stream = cuda_stream; p.output = out_data; @@ -837,6 +876,10 @@ Status Attention::RunUnfusedAttention( IAllocatorUniquePtr converted_mask_buffer; if (nonpad_kv_seqlen != nullptr) { // Convert nonpad_kv_seqlen to additive attention bias: [B, q_seq, total_seq] + // TODO: Support nonpad_kv_seqlen + attn_mask composition in unfused path. 
+ // When both are present, the nonpad bias and mask bias should be additively composed + // into a single attention_bias buffer. Currently only nonpad_kv_seqlen is used here; + // the combined case is handled by MEA in RunMemoryEfficientAttention. using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; int64_t bias_elements = static_cast(parameters.batch_size) * parameters.q_sequence_length * @@ -924,13 +967,13 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { const Tensor* past_value = context->Input(5); const Tensor* nonpad_kv_seqlen = context->Input(6); // optional, Opset 24 - // TODO: Support nonpad_kv_seqlen + attn_mask together. The ONNX spec allows both - // (additive composition: attn_bias += attn_mask then attn_bias += padding_mask). - // CPU implementation supports it. CUDA blocks it because flash has no bias parameter. - // To support: route to MEA (set both seqlen_k_ptr and attn_bias) or unfused - // (compose two additive biases). ~20 lines. Implement when a real model needs it. - ORT_ENFORCE(nonpad_kv_seqlen == nullptr || attn_mask == nullptr, - "nonpad_kv_seqlen and attn_mask cannot both be provided."); + // When both nonpad_kv_seqlen and attn_mask are provided, Flash Attention cannot handle + // the combination (no bias parameter). Route to MEA which supports seqlen_k_ptr + attn_bias. + if (nonpad_kv_seqlen != nullptr && attn_mask != nullptr) { + LOGS_DEFAULT(WARNING) << "Both nonpad_kv_seqlen and attn_mask provided. " + << "Flash Attention does not support this combination; " + << "falling back to Memory Efficient Attention."; + } attention_helper::AttentionParameters parameters; TensorShape y_shape; @@ -962,6 +1005,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { { auto& device_prop = GetDeviceProp(); bool flash_eligible = + !disable_flash_attention_ && !std::is_same::value && onnxruntime::flash::is_supported(device_prop, parameters.head_size, parameters.q_num_heads, parameters.kv_num_heads) && @@ -976,7 +1020,10 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { // Note: contrib MHA similarly excludes flash when attention_bias is present // (no mask support in mha_fwd). Float masks and bool prompt masks route to MEA // which supports additive bias natively. - (attn_mask == nullptr || (attn_mask->IsDataType() && past_key != nullptr)); + (attn_mask == nullptr || (attn_mask->IsDataType() && past_key != nullptr)) && + // Flash cannot handle nonpad_kv_seqlen + attn_mask simultaneously (no bias parameter + // in mha_fwd/mha_fwd_kvcache when seqlens_k is used). Route to MEA instead. + !(nonpad_kv_seqlen != nullptr && attn_mask != nullptr); if (flash_eligible) { return RunFlashAttention(context, Q, K, V, attn_mask, past_key, past_value, @@ -990,6 +1037,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { auto& device_prop = GetDeviceProp(); int sm = device_prop.major * 10 + device_prop.minor; bool mea_eligible = + !disable_memory_efficient_attention_ && onnxruntime::contrib::cuda::has_memory_efficient_attention( sm, std::is_same::value, std::is_same::value, parameters.head_size, parameters.v_head_size) && @@ -999,10 +1047,11 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { past_key == nullptr; // Cutlass FMHA requires bias strides to satisfy minimum alignment even in the - // "unaligned" kernel path. When an attention mask is present without nonpad_kv_seqlen, - // it becomes an additive bias with bias_strideM = total_sequence_length. 
Skip MEA if - // this stride can't satisfy the kernel's minimum alignment requirement. - if (mea_eligible && attn_mask != nullptr && nonpad_kv_seqlen == nullptr) { + // "unaligned" kernel path. When an attention mask is present (with or without + // nonpad_kv_seqlen), it becomes an additive bias with bias_strideM = + // total_sequence_length. Skip MEA if this stride can't satisfy the kernel's + // minimum alignment requirement. + if (mea_eligible && attn_mask != nullptr) { int min_bias_align = 1; if ((std::is_same::value && sm >= 80) || (!std::is_same::value && sm >= 75)) { diff --git a/onnxruntime/core/providers/cuda/llm/attention.h b/onnxruntime/core/providers/cuda/llm/attention.h index 690ae5c22bd18..fd5bb81a2fbd3 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.h +++ b/onnxruntime/core/providers/cuda/llm/attention.h @@ -48,6 +48,8 @@ class Attention final : public CudaKernel { float scale_; float softcap_; int softmax_precision_; + bool disable_flash_attention_; + bool disable_memory_efficient_attention_; }; } // namespace cuda diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index e868efdb7289b..3fb59f8a18968 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -1783,5 +1783,102 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_ExceedsTotalSeqLen) { {}, nullptr, &execution_providers); } +// Test combined nonpad_kv_seqlen + bool attn_mask (T26). +// Both masks should compose additively: nonpad_kv_seqlen masks positions >= valid_len, +// and attn_mask further masks positions within the valid range. +// Previously this combination crashed with ORT_ENFORCE; now it falls back to MEA gracefully. 
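The expected value in the test that follows can be reproduced with a short reference sketch (plain C++, illustration only; it relies on equal scores making softmax uniform over the surviving positions):

    #include <cstdio>

    int main() {
      const bool attn_mask[4] = {true, false, true, true};  // position 1 masked
      const int nonpad = 3;                                 // positions >= 3 padded
      const float v0[4] = {10.f, 30.f, 50.f, 70.f};         // V channel 0 per position
      float weight_sum = 0.f, out = 0.f;
      for (int t = 0; t < 4; ++t) {
        const bool active = attn_mask[t] && t < nonpad;     // compose both masks
        const float w = active ? 1.f : 0.f;                 // equal scores -> uniform softmax
        weight_sum += w;
        out += w * v0[t];
      }
      std::printf("%f\n", out / weight_sum);                // 30.0 = mean(V[0], V[2])
      return 0;
    }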
+TEST(AttentionTest, Attention_NonPadKVSeqLen_WithBoolAttnMask) {
+  // batch_size=1, q_num_heads=1, kv_num_heads=1
+  // q_seq_len=1, kv_seq_len=4, head_size=2
+  // nonpad_kv_seqlen=[3] => positions 0,1,2 valid by padding
+  // attn_mask=[true, false, true, true] => position 1 masked by attn_mask
+  // Combined: only positions 0 and 2 are effective
+  OpTester test("Attention", 24, onnxruntime::kOnnxDomain);
+
+  std::vector<int64_t> q_shape = {1, 1, 1, 2};
+  std::vector<int64_t> k_shape = {1, 1, 4, 2};
+  std::vector<int64_t> v_shape = {1, 1, 4, 2};
+
+  // Q and K designed for uniform attention scores on valid positions
+  std::vector<float> q = {1.0f, 1.0f};
+  std::vector<float> k(8, 1.0f);
+  // V: [10, 20], [30, 40], [50, 60], [70, 80]
+  std::vector<float> v = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f};
+
+  test.AddInput<float>("Q", q_shape, q);
+  test.AddInput<float>("K", k_shape, k);
+  test.AddInput<float>("V", v_shape, v);
+  // 2D bool mask [1, 4]: position 1 is false (masked out)
+  test.AddInput<bool>("attn_mask", {1, 4}, {true, false, true, true});
+  test.AddOptionalInputEdge<float>();  // past_key
+  test.AddOptionalInputEdge<float>();  // past_value
+  test.AddInput<int64_t>("nonpad_kv_seqlen", {1}, {3});
+
+  // nonpad masks position 3 (>= valid_len=3), attn_mask masks position 1
+  // Active positions: 0 and 2, with uniform attention scores
+  // Output = mean(V[0], V[2]) = [(10+50)/2, (20+60)/2] = [30.0, 40.0]
+  std::vector<float> expected_y = {30.0f, 40.0f};
+  test.AddOutput<float>("Y", {1, 1, 1, 2}, expected_y, false, 0, 1e-3f);
+  test.AddOptionalOutputEdge<float>();  // present_key
+  test.AddOptionalOutputEdge<float>();  // present_value
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  if (HasCudaEnvironment(0)) {
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+  }
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
+// Test combined nonpad_kv_seqlen + float attn_mask with multi-batch.
+// Verifies additive composition works with float attention bias values.
+TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) { + // batch_size=2, q_num_heads=1, kv_num_heads=1 + // q_seq_len=1, kv_seq_len=4, head_size=2 + // nonpad_kv_seqlen=[3, 2] => batch 0 has 3 valid, batch 1 has 2 valid + // 2D float attn_mask [1, 4] = [0, -inf, 0, 0] => masks position 1 for all batches + OpTester test("Attention", 24, onnxruntime::kOnnxDomain); + + std::vector q_shape = {2, 1, 1, 2}; + std::vector k_shape = {2, 1, 4, 2}; + std::vector v_shape = {2, 1, 4, 2}; + + std::vector q = {1.0f, 1.0f, 1.0f, 1.0f}; + std::vector k(16, 1.0f); + // Batch 0 V: [1,1], [2,2], [3,3], [99,99] + // Batch 1 V: [5,5], [6,6], [99,99], [99,99] + std::vector v = { + 1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f, 99.0f, 99.0f, // batch 0 + 5.0f, 5.0f, 6.0f, 6.0f, 99.0f, 99.0f, 99.0f, 99.0f // batch 1 + }; + + float neg_inf = -std::numeric_limits::infinity(); + + test.AddInput("Q", q_shape, q); + test.AddInput("K", k_shape, k); + test.AddInput("V", v_shape, v); + // 2D float mask [1, 4]: position 1 gets -inf + test.AddInput("attn_mask", {1, 4}, {0.0f, neg_inf, 0.0f, 0.0f}); + test.AddOptionalInputEdge(); // past_key + test.AddOptionalInputEdge(); // past_value + test.AddInput("nonpad_kv_seqlen", {2}, {3, 2}); + + // Batch 0: nonpad masks pos 3, attn_mask masks pos 1 → active: 0,2 + // Output = mean(V[0], V[2]) = [(1+3)/2, (1+3)/2] = [2.0, 2.0] + // Batch 1: nonpad masks pos 2,3, attn_mask masks pos 1 → active: 0 only + // Output = V[0] = [5.0, 5.0] + std::vector expected_y = {2.0f, 2.0f, 5.0f, 5.0f}; + test.AddOutput("Y", {2, 1, 1, 2}, expected_y, false, 0, 1e-3f); + test.AddOptionalOutputEdge(); // present_key + test.AddOptionalOutputEdge(); // present_value + + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + if (HasCudaEnvironment(0)) { + execution_providers.push_back(DefaultCudaExecutionProvider()); + } + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + } // namespace test } // namespace onnxruntime From 1e018940e11ab35eb263b41876dbbff5d2a3f3c5 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 22:11:10 +0000 Subject: [PATCH 26/35] Wire up nonpad_kv_seqlen + attn_mask composition in unfused path (T28) Add LaunchAddBiasInPlace kernel for element-wise bias composition with cyclic broadcasting. When both nonpad_kv_seqlen and attn_mask are provided, the unfused path now composes them additively into a single attention_bias buffer [B, q, t]. Handles both bool and float masks. Updated RunUnfusedAttention docstring (3 paths) and ComputeInternal warning log to reflect that both MEA and Unfused handle the combination. 
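The kernel's cyclic broadcasting is easiest to see as a host-side reference (an illustrative sketch, not the CUDA code):

    #include <vector>

    // bias holds B*q*t elements; addend holds q*t (2D mask) or B*q*t (4D [B,1,q,t]).
    // idx % addend.size() walks the smaller buffer cyclically, replaying a [q, t]
    // mask once per batch; equal sizes degenerate to a plain element-wise add.
    void AddBiasInPlaceRef(std::vector<float>& bias, const std::vector<float>& addend) {
      for (size_t idx = 0; idx < bias.size(); ++idx) {
        bias[idx] += addend[idx % addend.size()];
      }
    }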
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cuda/llm/attention.cc | 110 +++++++++++++----- .../providers/cuda/llm/attention_mask_impl.cu | 44 +++++++ .../providers/cuda/llm/attention_mask_impl.h | 13 +++ 3 files changed, 136 insertions(+), 31 deletions(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index d39164953aed6..816424ba9c350 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -74,7 +74,7 @@ Attention::Attention(const OpKernelInfo& info) : CudaKernel(info) { ORT_ENFORCE(scale_ > 0 || std::isnan(scale_), "scale must be greater than 0 if specified"); const auto* kernel_options = this->GetAttentionKernelOptions(); - disable_flash_attention_ = !std::is_same::value || !kernel_options->UseFlashAttention(); + disable_flash_attention_ = std::is_same::value || !kernel_options->UseFlashAttention(); disable_memory_efficient_attention_ = !kernel_options->UseEfficientAttention(); } @@ -124,8 +124,12 @@ static Status TransposeBSNHtoBNSH(int batch_size, int sequence_length, // Eligibility: fp16/bf16, head_size==v_head_size, no output_qk, no softcap, // no softmax_precision, (no mask OR bool mask + past OR nonpad_kv_seqlen) // -// PERFORMANCE NOTE: ONNX Attention's decode path is inherently ~15-30% slower than -// contrib GQA's decode path for grouped-query attention workloads. This is because: +// PERFORMANCE NOTE: ONNX Attention's internal-cache decode path (past_key/past_value) +// is inherently ~15-30% slower than contrib GQA's decode path for grouped-query attention +// workloads. This overhead does NOT apply when using external KV cache via +// TensorScatter + nonpad_kv_seqlen (opset 24), which avoids the copy entirely. +// +// The internal-cache overhead comes from: // // 1. No past_present_share_buffer: The ONNX Attention spec requires past_key/value // shape = (B, H, past_seq, head_size) and present_key/value shape = @@ -138,9 +142,9 @@ static Status TransposeBSNHtoBNSH(int batch_size, int sequence_length, // past_present_share_buffer to function. Since ONNX Attention cannot share buffers // (see point 1), XQA is fundamentally incompatible with this op's spec design. // -// 3. These are spec-level limitations, not implementation gaps. A graph optimizer that -// transparently replaces ONNX Attention with contrib GQA on supported hardware -// would be the recommended approach to close this performance gap. +// 3. These are spec-level limitations of the internal-cache path, not implementation gaps. +// For production LLM inference, prefer the external-cache path (TensorScatter + +// nonpad_kv_seqlen) which achieves parity with contrib GQA performance. // template Status Attention::RunFlashAttention( @@ -782,8 +786,10 @@ Status Attention::RunMemoryEfficientAttention( // // Unfused Attention dispatch paths: // Universal fallback via MHA's QkvToContext. 
-// Path 1: nonpad_kv_seqlen -> converts to attention_bias [B, q_seq, total_seq] -// Path 2: all other cases -> passes mask/bias directly +// Path 1: nonpad_kv_seqlen only -> converts to attention_bias [B, q_seq, total_seq] +// Path 2: nonpad_kv_seqlen + attn_mask -> composes both into attention_bias [B, q_seq, total_seq] +// (nonpad bias + mask bias added element-wise with cyclic broadcasting) +// Path 3: all other cases -> passes mask/bias directly // Supports: all dtypes (fp16/bf16/fp32), all mask types (bool/float/none), all head sizes // Not supported: softcap, softmax_precision, output_qk modes beyond kNone/kQK // Limitation: MHA only (q_num_heads must equal kv_num_heads) @@ -874,12 +880,9 @@ Status Attention::RunUnfusedAttention( // Handle attention mask / nonpad_kv_seqlen → attention_bias IAllocatorUniquePtr converted_mask_buffer; + IAllocatorUniquePtr mask_bias_buffer; // temp buffer for mask→bias when composing if (nonpad_kv_seqlen != nullptr) { // Convert nonpad_kv_seqlen to additive attention bias: [B, q_seq, total_seq] - // TODO: Support nonpad_kv_seqlen + attn_mask composition in unfused path. - // When both are present, the nonpad bias and mask bias should be additively composed - // into a single attention_bias buffer. Currently only nonpad_kv_seqlen is used here; - // the combined case is handled by MEA in RunMemoryEfficientAttention. using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType::type; int64_t bias_elements = static_cast(parameters.batch_size) * parameters.q_sequence_length * @@ -894,8 +897,44 @@ Status Attention::RunUnfusedAttention( contribop_parameters.mask_filter_value, cuda_stream, device_prop.maxThreadsPerBlock)); + + // When attn_mask is also present, compose it into the nonpad bias additively. + // The nonpad bias is [B, q, t]; the mask is added with cyclic broadcasting + // (e.g. a 2D [q, t] mask repeats over the batch dimension). + if (attn_mask != nullptr) { + int64_t mask_elements = attn_mask->Shape().Size(); + const NativeCudaT* mask_bias_ptr = nullptr; + + if (attn_mask->IsDataType()) { + // Convert bool mask to additive bias in a temp buffer, then add in-place. + mask_bias_buffer = GetScratchBuffer(mask_elements * sizeof(NativeCudaT), context->GetComputeStream()); + ORT_RETURN_IF_ERROR(LaunchConvertBoolMaskToAttentionBias( + attn_mask->Data(), + reinterpret_cast(mask_bias_buffer.get()), + mask_elements, + contribop_parameters.mask_filter_value, + cuda_stream, + device_prop.maxThreadsPerBlock)); + mask_bias_ptr = reinterpret_cast(mask_bias_buffer.get()); + } else { + // Float mask is already in additive bias format. + mask_bias_ptr = reinterpret_cast(attn_mask->Data()); + } + + // Add mask bias into nonpad bias with cyclic broadcasting. + // 2D mask [q, t]: mask_elements = q*t, repeats for each batch → correct. + // 4D mask [B, 1, q, t]: mask_elements = B*q*t = bias_elements → direct add. + ORT_RETURN_IF_ERROR(LaunchAddBiasInPlace( + reinterpret_cast(converted_mask_buffer.get()), + mask_bias_ptr, + bias_elements, + mask_elements, + cuda_stream, + device_prop.maxThreadsPerBlock)); + } + data.attention_bias = reinterpret_cast(converted_mask_buffer.get()); - // nonpad bias is [B, q_seq, total_seq] → broadcasts over heads but not batch + // Composed bias is [B, q_seq, total_seq] → broadcasts over heads but not batch. 
contribop_parameters.broadcast_attn_bias_dim_0 = false; contribop_parameters.broadcast_attn_bias_dim_1 = true; } else if (attn_mask != nullptr) { @@ -968,11 +1007,11 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { const Tensor* nonpad_kv_seqlen = context->Input(6); // optional, Opset 24 // When both nonpad_kv_seqlen and attn_mask are provided, Flash Attention cannot handle - // the combination (no bias parameter). Route to MEA which supports seqlen_k_ptr + attn_bias. + // the combination (no bias parameter). Route to MEA or Unfused which support composition. if (nonpad_kv_seqlen != nullptr && attn_mask != nullptr) { LOGS_DEFAULT(WARNING) << "Both nonpad_kv_seqlen and attn_mask provided. " << "Flash Attention does not support this combination; " - << "falling back to Memory Efficient Attention."; + << "falling back to Memory Efficient Attention or Unfused path."; } attention_helper::AttentionParameters parameters; @@ -999,8 +1038,31 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { // === KERNEL SELECTION CASCADE === // Priority: flash attention > memory efficient attention > unfused attention + // + // 4D BNSH handling per kernel: + // Flash: strictly requires BSNH — Q is transposed BNSH→BSNH before calling mha_fwd*. + // K/V passed as BNSH to mha_fwd_kvcache (it handles both layouts). + // MEA: accepts both BSNH and BNSH natively via is_kv_bsnh flag. Q transposed to BSNH. + // Unfused: accepts both via QkvToContext's qkv_format (Q_K_V_BSNH or Q_K_V_BNSH). + // + // nonpad_kv_seqlen + attn_mask routing: + // Flash: cannot handle this combo (no bias param when seqlens_k is used) → excluded. + // MEA: supports both (custom_right_padding for seqlens + additive attn_bias for mask). + // Unfused: nonpad → attention_bias conversion only; mask composition TODO(titaiwang). const bool has_output_qk = (qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kNone); + // Early-reject features not supported by any CUDA kernel path. + // TODO(titaiwang): Support softcap and softmax_precision on CUDA kernels. + // When a kernel adds support, move these checks to the unfused fallback section. + if (parameters.softcap != 0.0f) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, + "softcap is not supported yet in Attention op (CUDA)."); + } + if (parameters.softmax_precision != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, + "softmax_precision is not supported yet in Attention op (CUDA)."); + } + #if USE_FLASH_ATTENTION { auto& device_prop = GetDeviceProp(); @@ -1011,8 +1073,6 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { parameters.q_num_heads, parameters.kv_num_heads) && parameters.head_size == parameters.v_head_size && !has_output_qk && - parameters.softcap == 0.0f && - parameters.softmax_precision == 0 && // Bool masks without past_key (prompt) can't use flash because mha_fwd_kvcache's // causal semantics are decode-oriented (window offset by seqlens_k). For causal // prompt with padding, MEA handles it correctly via attention bias conversion. 
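Condensed, the cascade described above amounts to the following (a sketch; the two predicates stand in for the eligibility checks in ComputeInternal):

    enum class Backend { kFlash, kMemoryEfficient, kUnfused };

    // First eligible backend wins: flash is fastest but most restrictive,
    // MEA additionally accepts additive bias, unfused accepts everything else.
    Backend SelectBackend(bool flash_eligible, bool mea_eligible) {
      if (flash_eligible) return Backend::kFlash;
      if (mea_eligible) return Backend::kMemoryEfficient;
      return Backend::kUnfused;
    }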
@@ -1042,8 +1102,6 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { sm, std::is_same::value, std::is_same::value, parameters.head_size, parameters.v_head_size) && !has_output_qk && - parameters.softcap == 0.0f && - parameters.softmax_precision == 0 && past_key == nullptr; // Cutlass FMHA requires bias strides to satisfy minimum alignment even in the @@ -1072,17 +1130,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { #endif // Fallback: unfused attention - // TODO: Support softcap and softmax_precision on CUDA kernels. - // Currently rejected by all three kernel paths (flash, MEA, unfused). - if (parameters.softcap != 0.0f) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "softcap is not supported yet in Attention op (CUDA)."); - } - if (parameters.softmax_precision != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "softmax_precision is not supported yet in Attention op (CUDA)."); - } - // TODO: Support additional output_qk modes beyond kNone and kQK. + // TODO(titaiwang): Support additional output_qk modes beyond kNone and kQK. // Currently only unfused handles output_qk, and only kNone/kQK modes. if (qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kNone && qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kQK) { @@ -1092,7 +1140,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { } if (is_gqa) { - // TODO: Support GQA in unfused attention path for fp32/old-GPU fallback. + // TODO(titaiwang): Support GQA in unfused attention path for fp32/old-GPU fallback. // Currently blocked because QkvToContext allocates K/V workspace assuming // num_heads == kv_num_heads. GQA needs a head expansion step (ExpandKVHeads kernel) // to replicate kv_num_heads -> q_num_heads before unfused can process. diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu index aa627518d2902..cab1447cf26d7 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu @@ -280,6 +280,50 @@ template Status LaunchConvertNonpadKvSeqlenToAttentionBias<__half>( template Status LaunchConvertNonpadKvSeqlenToAttentionBias<__nv_bfloat16>( const int64_t*, __nv_bfloat16*, int, int, int, float, cudaStream_t, int); +// Add an addend bias into an existing bias buffer using cyclic broadcasting. +// Used to compose nonpad_kv_seqlen bias [B, q, t] with an attn_mask bias that +// may be smaller (e.g. 2D [q, t] broadcasts over batch). 
+template <typename T>
+__global__ void AddBiasInPlaceKernel(
+    T* __restrict__ bias,
+    const T* __restrict__ addend,
+    int64_t total_elements,
+    int64_t addend_elements) {
+  for (int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+       idx < total_elements;
+       idx += static_cast<int64_t>(gridDim.x) * blockDim.x) {
+    float sum = static_cast<float>(bias[idx]) + static_cast<float>(addend[idx % addend_elements]);
+    bias[idx] = T(sum);
+  }
+}
+
+template <typename T>
+Status LaunchAddBiasInPlace(
+    T* bias,
+    const T* addend,
+    int64_t total_elements,
+    int64_t addend_elements,
+    cudaStream_t stream,
+    int max_threads_per_block) {
+  if (total_elements == 0) {
+    return Status::OK();
+  }
+
+  int threads = static_cast<int>(std::min(static_cast<int64_t>(max_threads_per_block), total_elements));
+  int64_t blocks = (total_elements + threads - 1) / threads;
+  constexpr int64_t kMaxGridDimX = 65535;
+  unsigned int grid_size = static_cast<unsigned int>(std::min(blocks, kMaxGridDimX));
+
+  AddBiasInPlaceKernel<<<grid_size, threads, 0, stream>>>(
+      bias, addend, total_elements, addend_elements);
+
+  return CUDA_CALL(cudaGetLastError());
+}
+
+template Status LaunchAddBiasInPlace<float>(float*, const float*, int64_t, int64_t, cudaStream_t, int);
+template Status LaunchAddBiasInPlace<__half>(__half*, const __half*, int64_t, int64_t, cudaStream_t, int);
+template Status LaunchAddBiasInPlace<__nv_bfloat16>(__nv_bfloat16*, const __nv_bfloat16*, int64_t, int64_t, cudaStream_t, int);
+
 // Simple kernel to fill an int32 buffer with a constant value on device.
 // Used for CUDA-graph-capturable seqlens_k initialization (no host memory).
 __global__ void FillInt32Kernel(int* __restrict__ output, const int value, const int count) {
diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
index a0a6654e0d196..48b0894ad5697 100644
--- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
+++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
@@ -82,6 +82,19 @@ Status LaunchConvertNonpadKvSeqlenToAttentionBias(
     cudaStream_t stream,
     int max_threads_per_block);
 
+// Additively compose an addend bias into an existing bias buffer in-place.
+// Supports cyclic broadcasting: addend of size [q, t] is repeated over batch
+// to compose with a bias of size [B, q, t]. When both have the same number
+// of elements (e.g. 4D mask [B, 1, q, t]), it performs a direct element-wise add.
+template <typename T>
+Status LaunchAddBiasInPlace(
+    T* bias,
+    const T* addend,
+    int64_t total_elements,
+    int64_t addend_elements,
+    cudaStream_t stream,
+    int max_threads_per_block);
+
 // Fill an int32 buffer with a constant value entirely on device.
 // CUDA-graph-capturable alternative to host vector + cudaMemcpyAsync.
 Status LaunchFillInt32(int* output, int value, int count, cudaStream_t stream, int max_threads_per_block);

From 31567e76f1d2db2d4d7c869e6d9246f6090e6dd4 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang
Date: Fri, 6 Mar 2026 22:14:03 +0000
Subject: [PATCH 27/35] Fix 2D mask shape in GQA tests and add mask validation
 (T29)

Fix GQA bool 2D mask shape from [batch_size, total_seq] to
[q_seq_len, total_seq_len] per ONNX spec in test common.py.

Update create_boolean_mask_from_seqlens to return [q_seq, total_seq]
for 2D using first batch's pattern (2D masks broadcast across batches).

Add effective_seqlens adjustment for 2D masks in GQA prompt padding
test (was only applied for 3D).

Add guard test case: batch_size=4, q_seq=1 to catch shape bugs where
batch_size != q_seq_len.
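The shape rule this commit enforces reduces to a one-line predicate (hypothetical helper, for illustration):

    #include <cstdint>
    #include <vector>

    // A mask broadcasts correctly only if its second-to-last dim equals
    // q_sequence_length: with batch=4, q_seq=1 the spec shape {1, 32} passes
    // while the buggy {4, 32} is rejected.
    bool MaskRowsMatchQSeq(const std::vector<int64_t>& mask_shape, int64_t q_seq_len) {
      return mask_shape.size() >= 2 && mask_shape[mask_shape.size() - 2] == q_seq_len;
    }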
Add attn_mask dim[-2] == q_seq_len validation in attention_helper.h for 3D Q path, matching existing 4D Q path check. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cpu/llm/attention_helper.h | 7 ++++++ .../test_onnx_attention/common.py | 12 ++++++---- .../test_onnx_attention/test_gqa.py | 24 +++++++++++++++++-- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/cpu/llm/attention_helper.h b/onnxruntime/core/providers/cpu/llm/attention_helper.h index 7030eaa81a16c..efc908603a990 100644 --- a/onnxruntime/core/providers/cpu/llm/attention_helper.h +++ b/onnxruntime/core/providers/cpu/llm/attention_helper.h @@ -91,6 +91,13 @@ inline Status ComputeOutputShapeForAttention( parameters.transpose_output = true; // whether to transpose the input/output with permutation (0, 2, 1, 3) parameters.q_sequence_length = onnxruntime::narrow(Q->Shape()[1]); + + // Validate mask second-to-last dim matches q_sequence_length (same check as 4D path). + // For 2D mask [A, B]: A must equal q_seq. For 3D mask [A, B, C]: B must equal q_seq. + ORT_ENFORCE(attn_mask == nullptr || + attn_mask->Shape()[attn_mask->Shape().NumDimensions() - 2] == Q->Shape()[1], + "inconsistent q_sequence_length (between attn_mask and Q)"); + parameters.head_size = onnxruntime::narrow(Q->Shape()[2]) / parameters.q_num_heads; parameters.kv_sequence_length = onnxruntime::narrow(K->Shape()[1]); parameters.v_head_size = onnxruntime::narrow(V->Shape()[2]) / parameters.kv_num_heads; diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/common.py b/onnxruntime/test/python/transformers/test_onnx_attention/common.py index 28145a7b693d8..48640fa38aca2 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/common.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/common.py @@ -234,14 +234,13 @@ def create_attention_node_and_io( mask_ort_type = ort_type # additive mask uses same type as Q/K/V # Mask shapes differ between GQA (bool) and MHA (additive/bool) paths: - # GQA bool: 2D=[batch, total_seq] — GQA converts to seqlens_k directly, bypassing ONNX broadcasting. - # MHA (additive or bool): 2D=[q_seq, total_seq] — follows ONNX right-aligned broadcasting. + # Per ONNX spec, 2D mask is [q_seq, total_seq] for all paths. # 3D and 4D are the same for both paths. # ONNX broadcasting aligns from the right: 3D [A, B, C] → [_, A, B, C] where A=heads is_gqa = config.kv_num_heads != config.q_num_heads if config.attn_mask_type == "bool" and is_gqa: if config.attn_mask_dims == 2: - mask_shape = [config.batch_size, mask_seq_len] + mask_shape = [config.q_sequence_length, mask_seq_len] elif config.attn_mask_dims == 3: mask_shape = [config.q_num_heads, config.q_sequence_length, mask_seq_len] else: # 4D @@ -740,7 +739,7 @@ def create_boolean_mask_from_seqlens( Returns: Boolean mask where True = valid, False = padding. - - 2D: [batch_size, total_seq_len] - broadcasts to [batch, 1, 1, total_seq] + - 2D: [q_seq_len, total_seq_len] - broadcasts to [batch, heads, q_seq, total_seq] - 3D: [num_heads, q_seq_len, total_seq_len] - broadcasts to [1, num_heads, q_seq, total_seq] - 4D: [batch_size, num_heads, q_seq_len, total_seq_len] - no broadcasting """ @@ -753,7 +752,10 @@ def create_boolean_mask_from_seqlens( mask_2d = arange < seqlens_expanded # [batch_size, total_seq_len] if mask_dims == 2: - return mask_2d + # 2D: [q_seq_len, total_seq_len] per ONNX spec. 
Broadcasts across batch and heads. + # Since 2D masks can't vary per batch, use the first batch's pattern. + mask_1d = mask_2d[0:1, :] # [1, total_seq_len] + return mask_1d.expand(q_seq_len, total_seq_len).contiguous() elif mask_dims == 3: # 3D mask: [num_heads, q_seq_len, total_seq_len] # For right-padding tests, all batches should have the same mask pattern per position. diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index abac3b3fef078..71e052a600f0f 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -398,10 +398,10 @@ def parity_check_gqa_prompt_with_padding( ) v = torch.randn_like(k) * std - # 3D masks broadcast across batches (no batch dimension), so they can only + # 2D and 3D masks broadcast across batches (no per-batch dimension), so they can only # represent one padding pattern. The mask uses batch 0's seqlen for all batches. # Adjust effective_seqlens so the reference comparison matches the actual mask. - if config.attn_mask_dims == 3: + if config.attn_mask_dims in (2, 3): effective_seqlens = torch.full_like(seqlens, seqlens[0].item()) else: effective_seqlens = seqlens @@ -676,6 +676,8 @@ def gqa_prompt_padding_test_cases(): Generate test cases for ONNX Attention op GQA path with boolean padding masks. Tests 2D, 3D, and 4D boolean masks for right-padding scenarios. + Includes a batch_size=4, q_seq=1 case where batch_size != q_seq_len to + guard against 2D mask shape bugs (must be [q_seq, total_seq] not [batch, total_seq]). """ batches = [2] seqs = [(16, 16)] @@ -703,6 +705,24 @@ def gqa_prompt_padding_test_cases(): name = f"b{b}_sq{sq}_skv{skv}_nh{n}_{n2}_h{h}_mask{mask_dims}d" yield name, config + # Guard case: batch_size=4 != q_seq_len=1 (decode). This catches the original bug + # where 2D mask was [batch, total_seq] instead of [q_seq, total_seq]. + for mask_dims in mask_dims_options: + config = AttentionConfig( + batch_size=4, + q_sequence_length=1, + kv_sequence_length=32, + past_kv_sequence_length=0, + q_num_heads=8, + kv_num_heads=2, + head_size=128, + is_causal=1, + has_attn_mask=True, + attn_mask_dims=mask_dims, + ) + name = f"b4_sq1_skv32_nh8_2_h128_mask{mask_dims}d_shape_guard" + yield name, config + def gqa_past_padding_test_cases(): """ From c5a1ebb041329f14a7e97304f5f3bd90c276981e Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 22:20:06 +0000 Subject: [PATCH 28/35] Fix T28 review issues: guard mask dims, prevent divide-by-zero, fix stale TODO 1. Add ORT_ENFORCE limiting nonpad+mask composition to 2D masks and 4D masks with head_dim=1 (per-head masks can't compose into [B,q,t]) 2. Guard addend_elements==0 in LaunchAddBiasInPlace to prevent divide-by-zero 3. 
Update stale TODO comment at line 1051 to reflect completed implementation

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 onnxruntime/core/providers/cuda/llm/attention.cc           | 13 +++++++++++--
 .../core/providers/cuda/llm/attention_mask_impl.cu         |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc
index 816424ba9c350..41b2f301aa318 100644
--- a/onnxruntime/core/providers/cuda/llm/attention.cc
+++ b/onnxruntime/core/providers/cuda/llm/attention.cc
@@ -901,8 +901,17 @@ Status Attention<T>::RunUnfusedAttention(
   // When attn_mask is also present, compose it into the nonpad bias additively.
   // The nonpad bias is [B, q, t]; the mask is added with cyclic broadcasting
   // (e.g. a 2D [q, t] mask repeats over the batch dimension).
+  // Only 2D masks and 4D masks with head_dim=1 are supported — per-head masks
+  // (3D [H,q,t] or 4D [B,H>1,q,t]) cannot be composed into a [B,q,t] buffer.
   if (attn_mask != nullptr) {
-    int64_t mask_elements = attn_mask->Shape().Size();
+    const auto& mask_shape = attn_mask->Shape();
+    int mask_dims = static_cast<int>(mask_shape.NumDimensions());
+    ORT_ENFORCE(mask_dims == 2 || (mask_dims == 4 && mask_shape[1] == 1),
+                "nonpad_kv_seqlen + attn_mask composition in unfused path only supports "
+                "2D masks [q, t] and 4D masks with head_dim=1 [B, 1, q, t]. "
+                "Got mask shape: ", mask_shape);
+
+    int64_t mask_elements = mask_shape.Size();
     const NativeCudaT* mask_bias_ptr = nullptr;
 
     if (attn_mask->IsDataType<bool>()) {
@@ -1048,7 +1057,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   // nonpad_kv_seqlen + attn_mask routing:
   //   Flash: cannot handle this combo (no bias param when seqlens_k is used) → excluded.
   //   MEA: supports both (custom_right_padding for seqlens + additive attn_bias for mask).
-  //   Unfused: nonpad → attention_bias conversion only; mask composition TODO(titaiwang).
+  //   Unfused: nonpad → attention_bias; mask composed additively when both present.
 
   const bool has_output_qk = (qk_matmul_output_mode_ != attention_helper::QKMatMulOutputMode::kNone);
 
   // Early-reject features not supported by any CUDA kernel path.
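For illustration only (not part of the commit), a NumPy model of the composition rule
guarded above: LaunchAddBiasInPlace adds addend[idx % addend_elements] into a flat
[B*q*t] bias, which for a 2D [q, t] mask is exactly a broadcast add over the batch
dimension. Shapes here are arbitrary:

    import numpy as np

    B, q, t = 2, 3, 5
    bias = np.random.rand(B, q, t).astype(np.float32)  # nonpad bias [B, q, t]
    mask = np.random.rand(q, t).astype(np.float32)     # 2D additive mask [q, t]

    flat = bias.reshape(-1).copy()
    addend = mask.reshape(-1)
    for idx in range(flat.size):
        flat[idx] += addend[idx % addend.size]         # the kernel's indexing rule

    # Equivalent to broadcasting the mask over batch; a 4D [B, 1, q, t] mask has
    # addend.size == flat.size and degenerates to a plain element-wise add.
    assert np.allclose(flat.reshape(B, q, t), bias + mask[None, :, :])

This is also why the guard rejects per-head masks: a 3D [H, q, t] or 4D [B, H>1, q, t]
addend cannot be laid over a [B, q, t] buffer by cyclic repetition.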
diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu index cab1447cf26d7..a1febc8a26a44 100644 --- a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu +++ b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu @@ -305,7 +305,7 @@ Status LaunchAddBiasInPlace( int64_t addend_elements, cudaStream_t stream, int max_threads_per_block) { - if (total_elements == 0) { + if (total_elements == 0 || addend_elements == 0) { return Status::OK(); } From 7111a4d8855210e93fafc44d5b673d58f4b1d44d Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 22:45:34 +0000 Subject: [PATCH 29/35] Add Python tests for nonpad_kv_seqlen + attn_mask combination (T31) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../test_tensorscatter_attention.py | 364 +++++++++++++++++- 1 file changed, 363 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py index cb7dcb09d596e..67205caf5165e 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py @@ -74,7 +74,7 @@ def has_flash_attention(): return has_cuda_device(80) -def numpy_attention_ref(q, k, v, nonpad_kv_seqlen, is_causal=False): +def numpy_attention_ref(q, k, v, nonpad_kv_seqlen, is_causal=False, attn_bias=None): """ NumPy reference implementation of scaled dot-product attention with padding mask. @@ -84,6 +84,7 @@ def numpy_attention_ref(q, k, v, nonpad_kv_seqlen, is_causal=False): v: Value [batch, kv_seq, kv_num_heads, head_size] nonpad_kv_seqlen: [batch] — number of valid KV positions per batch is_causal: whether to apply causal masking + attn_bias: optional additive attention bias, broadcastable to [batch, num_heads, q_seq, kv_seq] Returns: output: [batch, q_seq, num_heads, head_size] @@ -110,6 +111,10 @@ def numpy_attention_ref(q, k, v, nonpad_kv_seqlen, is_causal=False): if valid_len < kv_seq: scores[b, :, :, valid_len:] = -numpy.inf + # Apply additive attention bias (from attn_mask conversion) + if attn_bias is not None: + scores = scores + attn_bias + # Apply causal mask if is_causal: for sq in range(q_seq): @@ -590,5 +595,362 @@ def test_tensorscatter_attention_cuda_fp32( numpy.testing.assert_allclose(output, ref_output, rtol=rtol["fp32"], atol=atol["fp32"]) +# ################################################################################################# +# TensorScatter + Attention with nonpad_kv_seqlen + attn_mask (T26 / T31) +# ################################################################################################# + + +def build_tensorscatter_attention_graph_with_mask( + batch_size, + total_kv_seq_len, + q_seq_len, + q_num_heads, + kv_num_heads, + head_size, + ort_type, + mask_type, + mask_shape, + is_causal=0, +): + """ + Build ONNX graph: TensorScatter(opset 24) → Attention(opset 24) with both + nonpad_kv_seqlen AND attn_mask inputs. + + Args: + mask_type: TensorProto type for the mask (BOOL or same as ort_type for additive). + mask_shape: shape of the attn_mask tensor (e.g., [q_seq, total_kv_seq] for 2D). 
+ """ + kv_hidden = kv_num_heads * head_size + q_hidden = q_num_heads * head_size + + scatter_k_node = helper.make_node( + "TensorScatter", + inputs=["key_cache", "new_k", "write_indices"], + outputs=["updated_key_cache"], + name="TensorScatterKey", + axis=1, + ) + scatter_v_node = helper.make_node( + "TensorScatter", + inputs=["value_cache", "new_v", "write_indices"], + outputs=["updated_value_cache"], + name="TensorScatterValue", + axis=1, + ) + + attention_node = helper.make_node( + "Attention", + inputs=[ + "query", + "updated_key_cache", + "updated_value_cache", + "attn_mask", + "", # past_key + "", # past_value + "nonpad_kv_seqlen", + ], + outputs=["output", "present_key", "present_value"], + name="Attention_0", + is_causal=is_causal, + kv_num_heads=kv_num_heads, + q_num_heads=q_num_heads, + softcap=0.0, + qk_matmul_output_mode=0, + domain="", + ) + + cache_shape = [batch_size, total_kv_seq_len, kv_hidden] + graph_inputs = [ + helper.make_tensor_value_info("key_cache", ort_type, cache_shape), + helper.make_tensor_value_info("value_cache", ort_type, cache_shape), + helper.make_tensor_value_info("new_k", ort_type, [batch_size, q_seq_len, kv_hidden]), + helper.make_tensor_value_info("new_v", ort_type, [batch_size, q_seq_len, kv_hidden]), + helper.make_tensor_value_info("write_indices", TensorProto.INT64, [batch_size]), + helper.make_tensor_value_info("query", ort_type, [batch_size, q_seq_len, q_hidden]), + helper.make_tensor_value_info("nonpad_kv_seqlen", TensorProto.INT64, [batch_size]), + helper.make_tensor_value_info("attn_mask", mask_type, mask_shape), + ] + + present_shape = [batch_size, kv_num_heads, total_kv_seq_len, head_size] + graph_outputs = [ + helper.make_tensor_value_info("output", ort_type, [batch_size, q_seq_len, q_hidden]), + helper.make_tensor_value_info("present_key", ort_type, present_shape), + helper.make_tensor_value_info("present_value", ort_type, present_shape), + helper.make_tensor_value_info("updated_key_cache", ort_type, cache_shape), + helper.make_tensor_value_info("updated_value_cache", ort_type, cache_shape), + ] + + graph = helper.make_graph( + [scatter_k_node, scatter_v_node, attention_node], + "TensorScatterAttentionWithMask_Graph", + graph_inputs, + graph_outputs, + ) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 24)]) + return model.SerializeToString() + + +def run_tensorscatter_attention_with_mask( + batch_size, + total_kv_seq_len, + q_seq_len, + q_num_heads, + kv_num_heads, + head_size, + nonpad_seqlens, + scatter_positions, + mask_positions_to_block, + use_bool_mask, + ep, + device, + torch_type, + ort_type, + is_causal=0, + std=0.2, +): + """ + Run TensorScatter + Attention test with BOTH nonpad_kv_seqlen AND attn_mask. + + Args: + mask_positions_to_block: list of KV position indices to mask out via attn_mask + (applied uniformly across all batches since 2D mask broadcasts). + use_bool_mask: True for bool mask, False for float additive mask. 
+ """ + torch.manual_seed(42) + kv_hidden = kv_num_heads * head_size + q_hidden = q_num_heads * head_size + np_type = numpy.float16 if torch_type == torch.float16 else numpy.float32 + + # Generate test data + key_cache_np = (torch.randn(batch_size, total_kv_seq_len, kv_hidden, dtype=torch_type) * std).numpy() + value_cache_np = (torch.randn(batch_size, total_kv_seq_len, kv_hidden, dtype=torch_type) * std).numpy() + + for b in range(batch_size): + old_valid = max(0, nonpad_seqlens[b] - q_seq_len) + if old_valid < total_kv_seq_len: + key_cache_np[b, old_valid:, :] = 0 + value_cache_np[b, old_valid:, :] = 0 + + new_k_np = (torch.randn(batch_size, q_seq_len, kv_hidden, dtype=torch_type) * std).numpy() + new_v_np = (torch.randn(batch_size, q_seq_len, kv_hidden, dtype=torch_type) * std).numpy() + query_np = (torch.randn(batch_size, q_seq_len, q_hidden, dtype=torch_type) * std).numpy() + write_indices_np = numpy.array(scatter_positions, dtype=numpy.int64) + nonpad_kv_seqlen_np = numpy.array(nonpad_seqlens, dtype=numpy.int64) + + # Create attn_mask: 2D [q_seq, total_kv_seq] + if use_bool_mask: + mask_np = numpy.ones((q_seq_len, total_kv_seq_len), dtype=numpy.bool_) + for pos in mask_positions_to_block: + mask_np[:, pos] = False + mask_ort_type = TensorProto.BOOL + # Reference: convert bool to additive bias for numpy_attention_ref + ref_bias = numpy.zeros((1, 1, q_seq_len, total_kv_seq_len), dtype=numpy.float32) + for pos in mask_positions_to_block: + ref_bias[:, :, :, pos] = -numpy.inf + else: + mask_np = numpy.zeros((q_seq_len, total_kv_seq_len), dtype=np_type) + for pos in mask_positions_to_block: + mask_np[:, pos] = numpy.finfo(np_type).min + mask_ort_type = ort_type + ref_bias = numpy.zeros((1, 1, q_seq_len, total_kv_seq_len), dtype=numpy.float32) + for pos in mask_positions_to_block: + ref_bias[:, :, :, pos] = float(numpy.finfo(np_type).min) + + # --- NumPy reference --- + key_cache_ref = key_cache_np.astype(numpy.float32).copy() + value_cache_ref = value_cache_np.astype(numpy.float32).copy() + new_k_ref = new_k_np.astype(numpy.float32) + new_v_ref = new_v_np.astype(numpy.float32) + + for b in range(batch_size): + pos = scatter_positions[b] + for t in range(q_seq_len): + key_cache_ref[b, pos + t, :] = new_k_ref[b, t, :] + value_cache_ref[b, pos + t, :] = new_v_ref[b, t, :] + + q_ref = query_np.astype(numpy.float32).reshape(batch_size, q_seq_len, q_num_heads, head_size) + k_ref = key_cache_ref.reshape(batch_size, total_kv_seq_len, kv_num_heads, head_size) + v_ref = value_cache_ref.reshape(batch_size, total_kv_seq_len, kv_num_heads, head_size) + + ref_output = numpy_attention_ref( + q_ref, k_ref, v_ref, nonpad_seqlens, is_causal=bool(is_causal), attn_bias=ref_bias + ) + ref_output_3d = ref_output.reshape(batch_size, q_seq_len, q_hidden) + + # --- ORT execution --- + mask_shape = [q_seq_len, total_kv_seq_len] + onnx_model_str = build_tensorscatter_attention_graph_with_mask( + batch_size=batch_size, + total_kv_seq_len=total_kv_seq_len, + q_seq_len=q_seq_len, + q_num_heads=q_num_heads, + kv_num_heads=kv_num_heads, + head_size=head_size, + ort_type=ort_type, + mask_type=mask_ort_type, + mask_shape=mask_shape, + is_causal=is_causal, + ) + + sess_options = SessionOptions() + session = InferenceSession(onnx_model_str, sess_options, providers=[ep]) + + ort_device = "cuda" if "CUDA" in ep else "cpu" + device_id = 0 + + key_cache_ort = OrtValue.ortvalue_from_numpy(key_cache_np, ort_device, device_id) + value_cache_ort = OrtValue.ortvalue_from_numpy(value_cache_np, ort_device, device_id) + new_k_ort = 
OrtValue.ortvalue_from_numpy(new_k_np, ort_device, device_id) + new_v_ort = OrtValue.ortvalue_from_numpy(new_v_np, ort_device, device_id) + write_indices_ort = OrtValue.ortvalue_from_numpy(write_indices_np, ort_device, device_id) + query_ort = OrtValue.ortvalue_from_numpy(query_np, ort_device, device_id) + nonpad_ort = OrtValue.ortvalue_from_numpy(nonpad_kv_seqlen_np, ort_device, device_id) + mask_ort = OrtValue.ortvalue_from_numpy(mask_np, ort_device, device_id) + + present_shape = [batch_size, kv_num_heads, total_kv_seq_len, head_size] + output_ort = OrtValue.ortvalue_from_shape_and_type( + [batch_size, q_seq_len, q_hidden], np_type, ort_device, device_id + ) + present_k_ort = OrtValue.ortvalue_from_shape_and_type(present_shape, np_type, ort_device, device_id) + present_v_ort = OrtValue.ortvalue_from_shape_and_type(present_shape, np_type, ort_device, device_id) + + io_binding = session.io_binding() + io_binding.bind_ortvalue_input("key_cache", key_cache_ort) + io_binding.bind_ortvalue_input("value_cache", value_cache_ort) + io_binding.bind_ortvalue_input("new_k", new_k_ort) + io_binding.bind_ortvalue_input("new_v", new_v_ort) + io_binding.bind_ortvalue_input("write_indices", write_indices_ort) + io_binding.bind_ortvalue_input("query", query_ort) + io_binding.bind_ortvalue_input("nonpad_kv_seqlen", nonpad_ort) + io_binding.bind_ortvalue_input("attn_mask", mask_ort) + + io_binding.bind_ortvalue_output("output", output_ort) + io_binding.bind_ortvalue_output("present_key", present_k_ort) + io_binding.bind_ortvalue_output("present_value", present_v_ort) + io_binding.bind_ortvalue_output("updated_key_cache", key_cache_ort) + io_binding.bind_ortvalue_output("updated_value_cache", value_cache_ort) + + io_binding.synchronize_inputs() + session.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + + output_result = output_ort.numpy() + return output_result, ref_output_3d + + +# Test cases for nonpad_kv_seqlen + attn_mask combination +# Format: (batch, q_seq, q_heads, kv_heads, scatter_pos, nonpad_seqlens, mask_positions, label) +_NONPAD_MASK_CASES = [ + # Single batch: mask position 1 within valid range + (1, 1, 4, 4, [3], [4], [1], "mha_b1_mask1pos"), + # Multi-batch with different valid lengths, mask position 0 + (2, 1, 4, 4, [2, 4], [3, 5], [0], "mha_b2_mask_pos0"), + # GQA with mask blocking two positions + (2, 1, 8, 2, [2, 4], [3, 5], [1, 2], "gqa_b2_mask2pos"), + # Larger batch with varied lengths + (4, 1, 4, 4, [1, 3, 5, 7], [2, 4, 6, 8], [0, 3], "mha_b4_varied"), + # GQA with full valid length, mask some positions + (2, 1, 8, 2, [7, 7], [8, 8], [2, 5], "gqa_b2_full_mask2"), +] + + +def _make_mask_test_params(cases, use_bool_mask): + """Generate parameterized test cases for nonpad + mask tests.""" + mask_str = "bool" if use_bool_mask else "float" + for batch, q_seq, q_heads, kv_heads, scatter_pos, seqlens, mask_pos, label in cases: + name = f"b{batch}_qh{q_heads}_kvh{kv_heads}_{label}_{mask_str}" + yield (name, batch, q_seq, q_heads, kv_heads, _HEAD_SIZE, _TOTAL_KV_SEQ_LEN, + scatter_pos, seqlens, mask_pos, use_bool_mask) + + +def nonpad_mask_cpu_test_cases(): + """CPU test cases for nonpad_kv_seqlen + attn_mask, both bool and float masks.""" + yield from _make_mask_test_params(_NONPAD_MASK_CASES, use_bool_mask=True) + yield from _make_mask_test_params(_NONPAD_MASK_CASES, use_bool_mask=False) + + +class TestTensorScatterAttentionWithMaskCPU(unittest.TestCase): + """Test TensorScatter + Attention with both nonpad_kv_seqlen and attn_mask on CPU. 
+ + Exercises the T26 fix: graceful fallback from Flash to MEA when both inputs present. + On CPU, both masks compose additively in the reference attention implementation. + """ + + @parameterized.expand(nonpad_mask_cpu_test_cases()) + def test_nonpad_with_mask_cpu( + self, + name, + batch, + q_seq, + q_heads, + kv_heads, + head_size, + total_kv, + scatter_pos, + seqlens, + mask_pos, + use_bool_mask, + ): + output, ref_output = run_tensorscatter_attention_with_mask( + batch_size=batch, + total_kv_seq_len=total_kv, + q_seq_len=q_seq, + q_num_heads=q_heads, + kv_num_heads=kv_heads, + head_size=head_size, + nonpad_seqlens=seqlens, + scatter_positions=scatter_pos, + mask_positions_to_block=mask_pos, + use_bool_mask=use_bool_mask, + ep="CPUExecutionProvider", + device="cpu", + torch_type=torch.float32, + ort_type=TensorProto.FLOAT, + ) + numpy.testing.assert_allclose(output, ref_output, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) + + +@unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping tests.") +class TestTensorScatterAttentionWithMaskCUDA(unittest.TestCase): + """Test TensorScatter + Attention with both nonpad_kv_seqlen and attn_mask on CUDA. + + Exercises the MEA path which supports seqlen_k_ptr + attn_bias simultaneously. + Flash is excluded when both inputs are present; MEA handles the combination. + """ + + @parameterized.expand(_make_mask_test_params(_NONPAD_MASK_CASES, use_bool_mask=True)) + def test_nonpad_with_bool_mask_cuda_fp16( + self, + name, + batch, + q_seq, + q_heads, + kv_heads, + head_size, + total_kv, + scatter_pos, + seqlens, + mask_pos, + use_bool_mask, + ): + output, ref_output = run_tensorscatter_attention_with_mask( + batch_size=batch, + total_kv_seq_len=total_kv, + q_seq_len=q_seq, + q_num_heads=q_heads, + kv_num_heads=kv_heads, + head_size=head_size, + nonpad_seqlens=seqlens, + scatter_positions=scatter_pos, + mask_positions_to_block=mask_pos, + use_bool_mask=use_bool_mask, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + ) + numpy.testing.assert_allclose(output, ref_output, rtol=rtol["fp16"], atol=atol["fp16"]) + + if __name__ == "__main__": unittest.main() From f8cd68939a592f94b1db38118e0b49597e4ab900 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 22:49:15 +0000 Subject: [PATCH 30/35] Address review feedback: BF16 fix, unfused nonpad+mask, test improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit T27: Fix BFloat16 Flash accidentally disabled (sizeof check). T28: Wire nonpad_kv_seqlen + attn_mask in unfused path. T29: Fix 2D mask shape [batch,seq] → [q_seq,total_seq] per spec. T31: Add Python tests for nonpad+mask combination. T32: Extract shared ConvertAttnMaskToBias helper. T33: Update attention_helper.h comment for nonpad+mask. T34: Clarify decode perf (internal-cache vs external-cache). T38: TODO → TODO(titaiwang). T39: Early-reject softcap/softmax_precision before cascade. Plus: comment/docstring fixes from audit. 
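For illustration only (not part of the commit), a NumPy sketch of the semantics the shared
ConvertAttnMaskToBias helper (T32, shown in the diff below) implements: a bool mask becomes
an additive bias (True -> 0, False -> most-negative float), a float mask passes through
unchanged, and broadcast flags over batch (dim 0) and heads (dim 1) derive from mask rank.
The function name and shapes are illustrative:

    import numpy as np

    def convert_mask_to_bias(mask):
        if mask.dtype == np.bool_:
            bias = np.where(mask, 0.0, np.finfo(np.float32).min).astype(np.float32)
        else:
            bias = mask  # additive float mask is used as-is
        if mask.ndim == 2:    # [q, t]: broadcast over batch and heads
            bcast_batch, bcast_heads = True, True
        elif mask.ndim == 3:  # [H, q, t]: broadcast over batch
            bcast_batch, bcast_heads = True, mask.shape[0] == 1
        else:                 # [B, H, q, t]
            bcast_batch, bcast_heads = mask.shape[0] == 1, mask.shape[1] == 1
        return bias, bcast_batch, bcast_heads

    bias, bcast_b, bcast_h = convert_mask_to_bias(np.ones((3, 5), dtype=bool))
    assert bcast_b and bcast_h and (bias == 0.0).all()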
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../core/providers/cuda/llm/attention.cc   | 151 +++++++++---------
 .../core/providers/cuda/llm/attention.h    |  10 ++
 .../providers/cpu/llm/attention_op_test.cc |  59 ++++++-
 .../test_onnx_attention/test_gqa.py        |   2 +-
 4 files changed, 144 insertions(+), 78 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc
index 41b2f301aa318..964488af0b23b 100644
--- a/onnxruntime/core/providers/cuda/llm/attention.cc
+++ b/onnxruntime/core/providers/cuda/llm/attention.cc
@@ -111,6 +111,53 @@ static Status TransposeBSNHtoBNSH(int batch_size, int sequence_length,
                                   stream, max_threads_per_block);
 }
 
+// ============================================================================
+// ConvertAttnMaskToBias: shared helper for mask→additive bias conversion.
+// Used by both Flash (nonpad+mask) and MEA paths to avoid code duplication.
+// Converts bool masks to additive bias (true→0, false→mask_filter_value),
+// passes float masks through directly, and sets broadcast flags from mask shape.
+// ============================================================================
+template <typename T>
+Status Attention<T>::ConvertAttnMaskToBias(
+    OpKernelContext* context,
+    const Tensor* attn_mask,
+    cudaStream_t cuda_stream,
+    int max_threads_per_block,
+    IAllocatorUniquePtr<void>& converted_mask_buffer,
+    const void*& attn_bias_data,
+    bool& broadcast_bias_dim_0,
+    bool& broadcast_bias_dim_1) const {
+  if (attn_mask->IsDataType<bool>()) {
+    using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType<T>::type;
+    int64_t num_elements = attn_mask->Shape().Size();
+    converted_mask_buffer = GetScratchBuffer<void>(
+        num_elements * sizeof(NativeCudaT), context->GetComputeStream());
+    float mask_filter_value = static_cast<float>(std::numeric_limits<float>::lowest());
+    ORT_RETURN_IF_ERROR(LaunchConvertBoolMaskToAttentionBias(
+        attn_mask->Data<bool>(),
+        reinterpret_cast<NativeCudaT*>(converted_mask_buffer.get()),
+        num_elements, mask_filter_value, cuda_stream,
+        max_threads_per_block));
+    attn_bias_data = converted_mask_buffer.get();
+  } else {
+    attn_bias_data = attn_mask->Data<T>();
+  }
+
+  size_t mask_dims = attn_mask->Shape().NumDimensions();
+  auto dims = attn_mask->Shape().GetDims();
+  if (mask_dims == 2) {
+    broadcast_bias_dim_0 = true;
+    broadcast_bias_dim_1 = true;
+  } else if (mask_dims == 3) {
+    broadcast_bias_dim_0 = true;
+    broadcast_bias_dim_1 = dims[0] == 1;
+  } else {
+    broadcast_bias_dim_0 = dims[0] == 1;
+    broadcast_bias_dim_1 = dims[1] == 1;
+  }
+  return Status::OK();
+}
+
 // ============================================================================
 // RunFlashAttention: Direct flash attention kernel call
 // ============================================================================
@@ -121,13 +168,17 @@ static Status TransposeBSNHtoBNSH(int batch_size, int sequence_length,
 // - Supports: bool mask -> seqlens_k, no mask -> fill past_seq_len
 // - 4D BNSH: transposes Q/K/V to BSNH before kernel
 // Path 3: no past, no mask (prompt) -> mha_fwd
-// Eligibility: fp16/bf16, head_size==v_head_size, no output_qk, no softcap,
-//   no softmax_precision, (no mask OR bool mask + past OR nonpad_kv_seqlen)
+// Eligibility: fp16/bf16, head_size==v_head_size, no output_qk,
+//   (no mask OR bool mask + past OR nonpad_kv_seqlen without mask)
+// Note: softcap and softmax_precision are early-rejected before the cascade.
+// Note: nonpad_kv_seqlen + attn_mask is supported but routes to MEA/unfused,
+//   not Flash (Flash has no bias parameter for this combination).
 //
 // PERFORMANCE NOTE: ONNX Attention's internal-cache decode path (past_key/past_value)
-// is inherently ~15-30% slower than contrib GQA's decode path for grouped-query attention
-// workloads. This overhead does NOT apply when using external KV cache via
-// TensorScatter + nonpad_kv_seqlen (opset 24), which avoids the copy entirely.
+// is ~15-30% slower than contrib GQA's decode path for grouped-query attention workloads.
+// When using external KV cache via TensorScatter + nonpad_kv_seqlen (opset 24), the
+// copy overhead (point 1) is eliminated. The remaining ~5-15% gap is from the missing
+// XQA kernel (point 2).
 //
 // The internal-cache overhead comes from:
 //
@@ -137,14 +188,17 @@ static Status TransposeBSNHtoBNSH(int batch_size, int sequence_length,
 //    Since past and present have different shapes, they cannot share the same buffer.
 //    Contrib GQA allows past and present to be the same tensor (in-place append),
 //    eliminating the memset + strided copy overhead (~67MB per decode step for typical LLM).
+//    This overhead does NOT apply to the external-cache path (TensorScatter +
+//    nonpad_kv_seqlen), which bypasses past/present entirely.
 //
 // 2. No XQA kernel: GQA's specialized XQA decode kernel (xqa_loader.h) requires
 //    past_present_share_buffer to function. Since ONNX Attention cannot share buffers
 //    (see point 1), XQA is fundamentally incompatible with this op's spec design.
+//    This accounts for the remaining ~5-15% gap even on the external-cache path.
 //
-// 3. These are spec-level limitations of the internal-cache path, not implementation gaps.
-//    For production LLM inference, prefer the external-cache path (TensorScatter +
-//    nonpad_kv_seqlen) which achieves parity with contrib GQA performance.
+// 3. These are spec-level limitations, not implementation gaps. For production LLM
+//    inference, the external-cache path (TensorScatter + nonpad_kv_seqlen) is
+//    recommended and achieves near-parity with contrib GQA performance.
 //
 template <typename T>
 Status Attention<T>::RunFlashAttention(
@@ -457,8 +511,9 @@
 // Path 2: no past, with mask (prompt) -> standard MEA with additive bias
 // Path 3: no past, no mask (prompt) -> standard MEA
 // Eligibility: see has_memory_efficient_attention() (SM50+/53+/80+ by dtype,
-//   head_size <= 1024), plus: no output_qk, no softcap,
-//   no softmax_precision, no past_key (decode excluded), bias stride alignment
+//   head_size <= 1024), plus: no output_qk, no past_key (decode excluded),
+//   bias stride alignment.
+// Note: softcap and softmax_precision are early-rejected before the cascade.
 //
 template <typename T>
 Status Attention<T>::RunMemoryEfficientAttention(
@@ -578,34 +633,10 @@
   // When attn_mask is also provided, convert it to additive attn_bias so MEA
   // applies both custom right padding (seqlens_k) and the attention mask (attn_bias).
   if (attn_mask != nullptr) {
-      if (attn_mask->IsDataType<bool>()) {
-        using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType<T>::type;
-        int64_t num_elements = attn_mask->Shape().Size();
-        converted_mask_buffer = GetScratchBuffer<void>(
-            num_elements * sizeof(NativeCudaT), context->GetComputeStream());
-        float mask_filter_value = static_cast<float>(std::numeric_limits<float>::lowest());
-        ORT_RETURN_IF_ERROR(LaunchConvertBoolMaskToAttentionBias(
-            attn_mask->Data<bool>(),
-            reinterpret_cast<NativeCudaT*>(converted_mask_buffer.get()),
-            num_elements, mask_filter_value, cuda_stream,
-            device_prop.maxThreadsPerBlock));
-        attn_bias_data = converted_mask_buffer.get();
-      } else {
-        attn_bias_data = attn_mask->Data<T>();
-      }
-
-      size_t mask_dims = attn_mask->Shape().NumDimensions();
-      auto dims = attn_mask->Shape().GetDims();
-      if (mask_dims == 2) {
-        broadcast_bias_dim_0 = true;
-        broadcast_bias_dim_1 = true;
-      } else if (mask_dims == 3) {
-        broadcast_bias_dim_0 = true;
-        broadcast_bias_dim_1 = dims[0] == 1;
-      } else {
-        broadcast_bias_dim_0 = dims[0] == 1;
-        broadcast_bias_dim_1 = dims[1] == 1;
-      }
+      ORT_RETURN_IF_ERROR(ConvertAttnMaskToBias(context, attn_mask, cuda_stream,
+                                                device_prop.maxThreadsPerBlock,
+                                                converted_mask_buffer, attn_bias_data,
+                                                broadcast_bias_dim_0, broadcast_bias_dim_1));
     }
 
     onnxruntime::contrib::cuda::MemoryEfficientAttentionParams p;
@@ -651,43 +682,10 @@
   // custom_right_padding seqlens approach which would produce NaN.
   else {
     if (attn_mask != nullptr) {
-      if (attn_mask->IsDataType<bool>()) {
-        // Convert bool mask to additive attention bias (true→0.0, false→mask_filter_value).
-        // This handles all-false masks correctly (uniform softmax weights from extreme bias).
-        using NativeCudaT = typename onnxruntime::cuda::OrtToCudaType<T>::type;
-        int64_t num_elements = attn_mask->Shape().Size();
-        converted_mask_buffer = GetScratchBuffer<void>(
-            num_elements * sizeof(NativeCudaT), context->GetComputeStream());
-        float mask_filter_value = static_cast<float>(std::numeric_limits<float>::lowest());
-        ORT_RETURN_IF_ERROR(LaunchConvertBoolMaskToAttentionBias(
-            attn_mask->Data<bool>(),
-            reinterpret_cast<NativeCudaT*>(converted_mask_buffer.get()),
-            num_elements, mask_filter_value, cuda_stream,
-            device_prop.maxThreadsPerBlock));
-        attn_bias_data = converted_mask_buffer.get();
-      } else {
-        attn_bias_data = attn_mask->Data<T>();
-      }
-
-      // Determine broadcast flags based on bias logical shape [B, num_heads, q_seq, kv_seq].
-      // MEA indexes bias as: offset = batch_id * strideB + head_id * strideH + q_pos * strideM + kv_pos.
-      // broadcast_attn_bias_dim_0=true sets strideB=0; dim_1=true sets strideH=0.
-      // strideM is always total_seq (num_keys), so the data must have [q_seq, total_seq] as inner dims.
-      size_t mask_dims = attn_mask->Shape().NumDimensions();
-      auto dims = attn_mask->Shape().GetDims();
-      if (mask_dims == 2) {
-        // 2D mask: [q_seq, total_seq] per ONNX spec. Broadcasts over batch and heads.
-        // MEA reads bias[q_pos * total_seq + kv_pos] for all (batch, head) pairs
-        // via strideB=0, strideH=0, strideM=total_seq.
-        broadcast_bias_dim_0 = true;  // broadcast over batch
-        broadcast_bias_dim_1 = true;  // broadcast over heads
-      } else if (mask_dims == 3) {
-        broadcast_bias_dim_0 = true;
-        broadcast_bias_dim_1 = dims[0] == 1;
-      } else {
-        broadcast_bias_dim_0 = dims[0] == 1;
-        broadcast_bias_dim_1 = dims[1] == 1;
-      }
+      ORT_RETURN_IF_ERROR(ConvertAttnMaskToBias(context, attn_mask, cuda_stream,
+                                                device_prop.maxThreadsPerBlock,
+                                                converted_mask_buffer, attn_bias_data,
+                                                broadcast_bias_dim_0, broadcast_bias_dim_1));
     }
 
     onnxruntime::contrib::cuda::MemoryEfficientAttentionParams p;
@@ -909,7 +907,8 @@ Status Attention<T>::RunUnfusedAttention(
     ORT_ENFORCE(mask_dims == 2 || (mask_dims == 4 && mask_shape[1] == 1),
                 "nonpad_kv_seqlen + attn_mask composition in unfused path only supports "
                 "2D masks [q, t] and 4D masks with head_dim=1 [B, 1, q, t]. "
-                "Got mask shape: ", mask_shape);
+                "Got mask shape: ",
+                mask_shape);
 
     int64_t mask_elements = mask_shape.Size();
     const NativeCudaT* mask_bias_ptr = nullptr;
diff --git a/onnxruntime/core/providers/cuda/llm/attention.h b/onnxruntime/core/providers/cuda/llm/attention.h
index fd5bb81a2fbd3..3f69c7a77f497 100644
--- a/onnxruntime/core/providers/cuda/llm/attention.h
+++ b/onnxruntime/core/providers/cuda/llm/attention.h
@@ -40,6 +40,16 @@ class Attention final : public CudaKernel {
                            Tensor* output_qk,
                            const attention_helper::AttentionParameters& parameters) const;
 
+  Status ConvertAttnMaskToBias(
+      OpKernelContext* context,
+      const Tensor* attn_mask,
+      cudaStream_t cuda_stream,
+      int max_threads_per_block,
+      IAllocatorUniquePtr<void>& converted_mask_buffer,
+      const void*& attn_bias_data,
+      bool& broadcast_bias_dim_0,
+      bool& broadcast_bias_dim_1) const;
+
  protected:
   bool is_causal_;
   int kv_num_heads_;
diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
index 3fb59f8a18968..6c4d09c9b96e9 100644
--- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
@@ -1699,6 +1699,63 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_AllMasked_FP16_GQA) {
   execution_providers.push_back(DefaultCudaExecutionProvider());
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
+
+// Regression test: BFloat16 must route to Flash Attention on SM80+.
+// BFloat16 is a 2-byte type so disable_flash_attention_ should be false,
+// allowing Flash to handle GQA natively via kv_num_heads.
+TEST(AttentionTest, Attention_NonPadKVSeqLen_BF16_Flash) {
+  if (!HasCudaEnvironment(800)) {
+    return;  // BFloat16 requires SM 8.0+
+  }
+
+  OpTester test("Attention", 24, onnxruntime::kOnnxDomain);
+
+  int batch_size = 1;
+  int q_num_heads = 4;
+  int kv_num_heads = 2;
+  int q_sequence_length = 1;
+  int kv_sequence_length = 4;
+  int head_size = 64;
+
+  int q_elements = batch_size * q_num_heads * q_sequence_length * head_size;
+  int k_elements = batch_size * kv_num_heads * kv_sequence_length * head_size;
+
+  std::vector<float> q(q_elements, 1.0f);
+  std::vector<float> k(k_elements, 1.0f);
+  std::vector<float> v(k_elements);
+  for (int n = 0; n < kv_num_heads; n++) {
+    for (int s = 0; s < kv_sequence_length; s++) {
+      float val = static_cast<float>(s + 1) * 0.1f;
+      for (int h = 0; h < head_size; h++) {
+        v[(n * kv_sequence_length + s) * head_size + h] = val;
+      }
+    }
+  }
+
+  test.AddAttribute<int64_t>("kv_num_heads", kv_num_heads);
+  test.AddAttribute<int64_t>("q_num_heads", q_num_heads);
+
+  test.AddInput<BFloat16>("Q", {batch_size, q_num_heads, q_sequence_length, head_size}, FloatsToBFloat16s(q));
+  test.AddInput<BFloat16>("K", {batch_size, kv_num_heads, kv_sequence_length, head_size}, FloatsToBFloat16s(k));
+  test.AddInput<BFloat16>("V", {batch_size, kv_num_heads, kv_sequence_length, head_size}, FloatsToBFloat16s(v));
+  test.AddOptionalInputEdge<BFloat16>();
+  test.AddOptionalInputEdge<BFloat16>();
+  test.AddOptionalInputEdge<BFloat16>();
+  test.AddInput<int64_t>("nonpad_kv_seqlen", {batch_size}, {2});
+
+  // 2 valid positions: uniform softmax → mean of V[0] and V[1] = (0.1 + 0.2) / 2 = 0.15
+  int y_elements = batch_size * q_num_heads * q_sequence_length * head_size;
+  std::vector<float> expected_y(y_elements, 0.15f);
+  test.AddOutput<BFloat16>("Y", {batch_size, q_num_heads, q_sequence_length, head_size},
+                           FloatsToBFloat16s(expected_y), false, 0, 0.02f);
+  test.AddOptionalOutputEdge<BFloat16>();
+  test.AddOptionalOutputEdge<BFloat16>();
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCudaExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
 TEST(AttentionTest, Attention_NonPadKVSeqLen_NoneMasked) {
   OpTester test("Attention", 24, onnxruntime::kOnnxDomain);
@@ -1783,7 +1840,7 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_ExceedsTotalSeqLen) {
            {}, nullptr, &execution_providers);
 }
 
-// Test combined nonpad_kv_seqlen + bool attn_mask (T26).
+// Test combined nonpad_kv_seqlen + bool attn_mask.
 // Both masks should compose additively: nonpad_kv_seqlen masks positions >= valid_len,
 // and attn_mask further masks positions within the valid range.
 // Previously this combination crashed with ORT_ENFORCE; now it falls back to MEA gracefully.
diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py
index 71e052a600f0f..fa58af4e3b963 100644
--- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py
+++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py
@@ -861,7 +861,7 @@ def test_gqa_prompt_memory_efficient(self, name, config):
 
     # Note: GQA past tests removed — MEA is ineligible when past_key is present
     # (ComputeInternal requires past_key == nullptr for MEA). GQA past requires
-    # flash attention. The ONNX Attention op does not honor ORT_DISABLE_FLASH_ATTENTION.
+    # flash attention.
@unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") From 4b87dd8c426212883d42797b1b8f42d472b5b624 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 23:07:25 +0000 Subject: [PATCH 31/35] Fix test failures: reference mask shape, seqlens size, invalid configs Fix 2D mask reference computation to use per-batch [batch, kv_seq] instead of first-batch-only [q_seq, total_seq]. Correct seqlens size for batch=4 GQA tests. Remove invalid past_padding_mea tests (no kernel path supports past_key + mask in GQA). All tests pass: 39 C++, 55 GQA, 47 MHA, 73 TensorScatter. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../test_onnx_attention/test_gqa.py | 50 +++++-------------- .../test_onnx_attention/test_mha.py | 23 ++++----- .../test_tensorscatter_attention.py | 19 +++++-- 3 files changed, 36 insertions(+), 56 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index fa58af4e3b963..b3d30582196a8 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -422,12 +422,12 @@ def parity_check_gqa_prompt_with_padding( device=device, ) - key_padding_mask = create_boolean_mask_from_seqlens( - seqlens=effective_seqlens, - total_seq_len=config.kv_sequence_length, - mask_dims=2, - device=device, - ) + # Per-batch key_padding_mask [batch, kv_seq] for reference. + # Must NOT use create_boolean_mask_from_seqlens(..., mask_dims=2) here because that + # returns [q_seq, total_seq] using only the first batch's seqlen, which is wrong + # when effective_seqlens vary per batch (4D mask case). + arange_kv = torch.arange(config.kv_sequence_length, device=device).unsqueeze(0) + key_padding_mask = arange_kv < effective_seqlens.unsqueeze(1) # [batch, kv_seq] # --- PyTorch Reference Path --- out_ref, _ = attention_ref( @@ -917,8 +917,11 @@ def test_gqa_prompt_padding_mea(self, name, config): """Test prompt phase with padding mask using Memory Efficient Attention.""" os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" + # Create seqlens with config.batch_size elements. + # First batch has shorter valid length, rest at full length. 
+ seqlens_list = [config.kv_sequence_length - 6] + [config.kv_sequence_length] * (config.batch_size - 1) seqlens = torch.tensor( - [config.kv_sequence_length - 6, config.kv_sequence_length], + seqlens_list, dtype=torch.int32, device="cuda", ) @@ -934,29 +937,6 @@ def test_gqa_prompt_padding_mea(self, name, config): atol=atol["fp16"], ) - @parameterized.expand(gqa_past_padding_test_cases()) - def test_gqa_past_padding_mea(self, name, config): - """Test decoding phase with padding mask using Memory Efficient Attention.""" - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" - - past_seqlens = torch.full( - (config.batch_size,), - config.past_kv_sequence_length, - dtype=torch.int32, - device="cuda", - ) - - parity_check_gqa_past_with_padding( - config=config, - past_seqlens=past_seqlens, - ep="CUDAExecutionProvider", - device="cuda", - torch_type=torch.float16, - ort_type=TensorProto.FLOAT16, - rtol=rtol["fp16"], - atol=atol["fp16"], - ) - # ################################################################################################# # Parity Check with nonpad_kv_seqlen (Opset 24) @@ -1014,13 +994,9 @@ def parity_check_gqa_prompt_with_nonpad_kv_seqlen( k[b, valid_len:, :, :] = 0 v[b, valid_len:, :, :] = 0 - # Reference: use key_padding_mask [batch, kv_seq] - key_padding_mask = create_boolean_mask_from_seqlens( - seqlens=nonpad_seqlens.to(torch.int32), - total_seq_len=config.kv_sequence_length, - mask_dims=2, - device=device, - ) + # Per-batch key_padding_mask [batch, kv_seq] for reference + arange_kv = torch.arange(config.kv_sequence_length, device=device).unsqueeze(0) + key_padding_mask = arange_kv < nonpad_seqlens.unsqueeze(1).to(device) # [batch, kv_seq] out_ref, _ = attention_ref( q=q, diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py index e6d920b28ffc9..2e72c5e33bc01 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py @@ -683,13 +683,12 @@ def parity_check_mha_prompt_with_bool_mask( device=device, ) - # Create 2D key_padding_mask for reference (per-batch, shape [batch, total_seq]) - key_padding_mask = create_boolean_mask_from_seqlens( - seqlens=effective_seqlens, - total_seq_len=config.kv_sequence_length, - mask_dims=2, - device=device, - ) + # Per-batch key_padding_mask [batch, kv_seq] for reference. + # Must NOT use create_boolean_mask_from_seqlens(..., mask_dims=2) here because that + # returns [q_seq, total_seq] using only the first batch's seqlen, which is wrong + # when effective_seqlens vary per batch (4D mask case). 
+ arange_kv = torch.arange(config.kv_sequence_length, device=device).unsqueeze(0) + key_padding_mask = arange_kv < effective_seqlens.unsqueeze(1) # [batch, kv_seq] # --- PyTorch Reference Path --- out_ref, _ = attention_ref( @@ -982,13 +981,9 @@ def parity_check_mha_prompt_with_nonpad_kv_seqlen( k[b, valid_len:, :, :] = 0 v[b, valid_len:, :, :] = 0 - # Reference: use key_padding_mask [batch, kv_seq] - key_padding_mask = create_boolean_mask_from_seqlens( - seqlens=nonpad_seqlens.to(torch.int32), - total_seq_len=config.kv_sequence_length, - mask_dims=2, - device=device, - ) + # Per-batch key_padding_mask [batch, kv_seq] for reference + arange_kv = torch.arange(config.kv_sequence_length, device=device).unsqueeze(0) + key_padding_mask = arange_kv < nonpad_seqlens.unsqueeze(1).to(device) # [batch, kv_seq] out_ref, _ = attention_ref( q=q, diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py index 67205caf5165e..c3741cbcdb822 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py @@ -772,9 +772,7 @@ def run_tensorscatter_attention_with_mask( k_ref = key_cache_ref.reshape(batch_size, total_kv_seq_len, kv_num_heads, head_size) v_ref = value_cache_ref.reshape(batch_size, total_kv_seq_len, kv_num_heads, head_size) - ref_output = numpy_attention_ref( - q_ref, k_ref, v_ref, nonpad_seqlens, is_causal=bool(is_causal), attn_bias=ref_bias - ) + ref_output = numpy_attention_ref(q_ref, k_ref, v_ref, nonpad_seqlens, is_causal=bool(is_causal), attn_bias=ref_bias) ref_output_3d = ref_output.reshape(batch_size, q_seq_len, q_hidden) # --- ORT execution --- @@ -859,8 +857,19 @@ def _make_mask_test_params(cases, use_bool_mask): mask_str = "bool" if use_bool_mask else "float" for batch, q_seq, q_heads, kv_heads, scatter_pos, seqlens, mask_pos, label in cases: name = f"b{batch}_qh{q_heads}_kvh{kv_heads}_{label}_{mask_str}" - yield (name, batch, q_seq, q_heads, kv_heads, _HEAD_SIZE, _TOTAL_KV_SEQ_LEN, - scatter_pos, seqlens, mask_pos, use_bool_mask) + yield ( + name, + batch, + q_seq, + q_heads, + kv_heads, + _HEAD_SIZE, + _TOTAL_KV_SEQ_LEN, + scatter_pos, + seqlens, + mask_pos, + use_bool_mask, + ) def nonpad_mask_cpu_test_cases(): From a6dce1a66673250db7f5828b1e86e6021c954309 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 23:45:09 +0000 Subject: [PATCH 32/35] Validate present_key/present_value outputs in TensorScatter attention tests (T42) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All 5 test methods now assert present_key/present_value match the expected BSNH→BNSH transpose of the updated KV cache. Both runner functions (run_tensorscatter_attention and run_tensorscatter_attention_with_mask) now compute and return reference present_k/v for validation. 
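For illustration only (not part of the commit), the reference present_key/present_value
computation the tests now assert, sketched in NumPy with arbitrary sizes: split the
updated [B, S, N*H] cache into heads, then transpose BSNH -> BNSH, exactly as the
runner functions do with k_ref.transpose(0, 2, 1, 3):

    import numpy as np

    B, S, N, H = 2, 8, 4, 16  # batch, total_kv_seq, kv_num_heads, head_size
    updated_cache = np.random.rand(B, S, N * H).astype(np.float32)

    # Attention with no past_key simply reshapes and transposes K/V.
    ref_present = updated_cache.reshape(B, S, N, H).transpose(0, 2, 1, 3)
    assert ref_present.shape == (B, N, S, H)  # [batch, kv_num_heads, total_seq, head_size]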
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../test_tensorscatter_attention.py | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py index c3741cbcdb822..a6a115bb12213 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_tensorscatter_attention.py @@ -266,7 +266,6 @@ def run_tensorscatter_attention( nonpad_seqlens, scatter_positions, ep, - device, torch_type, ort_type, is_causal=0, @@ -329,6 +328,11 @@ def run_tensorscatter_attention( ref_output = numpy_attention_ref(q_ref, k_ref, v_ref, nonpad_seqlens, is_causal=bool(is_causal)) ref_output_3d = ref_output.reshape(batch_size, q_seq_len, q_hidden) + # Compute expected present_key/present_value: BSNH → BNSH transpose of updated cache. + # Attention op with no past_key simply reshapes+transposes K/V to [B, H, S, D]. + ref_present_k = k_ref.transpose(0, 2, 1, 3) # [B, kv_num_heads, total_kv_seq_len, head_size] + ref_present_v = v_ref.transpose(0, 2, 1, 3) + # --- ORT execution with IO Binding --- onnx_model_str = build_tensorscatter_attention_graph( batch_size=batch_size, @@ -398,7 +402,7 @@ def run_tensorscatter_attention( present_k_result = present_k_ort.numpy() present_v_result = present_v_ort.numpy() - return output_result, ref_output_3d, present_k_result, present_v_result + return output_result, ref_output_3d, present_k_result, present_v_result, ref_present_k, ref_present_v # ################################################################################################# @@ -497,7 +501,7 @@ def test_tensorscatter_attention_cpu_fp32( seqlens, is_causal, ): - output, ref_output, _, _ = run_tensorscatter_attention( + output, ref_output, present_k, present_v, ref_present_k, ref_present_v = run_tensorscatter_attention( batch_size=batch, total_kv_seq_len=total_kv, q_seq_len=q_seq, @@ -507,12 +511,13 @@ def test_tensorscatter_attention_cpu_fp32( nonpad_seqlens=seqlens, scatter_positions=scatter_pos, ep="CPUExecutionProvider", - device="cpu", torch_type=torch.float32, ort_type=TensorProto.FLOAT, is_causal=is_causal, ) numpy.testing.assert_allclose(output, ref_output, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) + numpy.testing.assert_allclose(present_k, ref_present_k, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) + numpy.testing.assert_allclose(present_v, ref_present_v, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) @unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping tests.") @@ -538,7 +543,7 @@ def test_tensorscatter_attention_cuda_fp16( seqlens, is_causal, ): - output, ref_output, _, _ = run_tensorscatter_attention( + output, ref_output, present_k, present_v, ref_present_k, ref_present_v = run_tensorscatter_attention( batch_size=batch, total_kv_seq_len=total_kv, q_seq_len=q_seq, @@ -548,12 +553,13 @@ def test_tensorscatter_attention_cuda_fp16( nonpad_seqlens=seqlens, scatter_positions=scatter_pos, ep="CUDAExecutionProvider", - device="cuda", torch_type=torch.float16, ort_type=TensorProto.FLOAT16, is_causal=is_causal, ) numpy.testing.assert_allclose(output, ref_output, rtol=rtol["fp16"], atol=atol["fp16"]) + numpy.testing.assert_allclose(present_k, ref_present_k, rtol=rtol["fp16"], atol=atol["fp16"]) + 
numpy.testing.assert_allclose(present_v, ref_present_v, rtol=rtol["fp16"], atol=atol["fp16"]) @unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping tests.") @@ -577,7 +583,7 @@ def test_tensorscatter_attention_cuda_fp32( seqlens, is_causal, ): - output, ref_output, _, _ = run_tensorscatter_attention( + output, ref_output, present_k, present_v, ref_present_k, ref_present_v = run_tensorscatter_attention( batch_size=batch, total_kv_seq_len=total_kv, q_seq_len=q_seq, @@ -587,12 +593,13 @@ def test_tensorscatter_attention_cuda_fp32( nonpad_seqlens=seqlens, scatter_positions=scatter_pos, ep="CUDAExecutionProvider", - device="cuda", torch_type=torch.float32, ort_type=TensorProto.FLOAT, is_causal=is_causal, ) numpy.testing.assert_allclose(output, ref_output, rtol=rtol["fp32"], atol=atol["fp32"]) + numpy.testing.assert_allclose(present_k, ref_present_k, rtol=rtol["fp32"], atol=atol["fp32"]) + numpy.testing.assert_allclose(present_v, ref_present_v, rtol=rtol["fp32"], atol=atol["fp32"]) # ################################################################################################# @@ -702,7 +709,6 @@ def run_tensorscatter_attention_with_mask( mask_positions_to_block, use_bool_mask, ep, - device, torch_type, ort_type, is_causal=0, @@ -775,6 +781,10 @@ def run_tensorscatter_attention_with_mask( ref_output = numpy_attention_ref(q_ref, k_ref, v_ref, nonpad_seqlens, is_causal=bool(is_causal), attn_bias=ref_bias) ref_output_3d = ref_output.reshape(batch_size, q_seq_len, q_hidden) + # Compute expected present_key/present_value: BSNH → BNSH transpose of updated cache. + ref_present_k = k_ref.transpose(0, 2, 1, 3) + ref_present_v = v_ref.transpose(0, 2, 1, 3) + # --- ORT execution --- mask_shape = [q_seq_len, total_kv_seq_len] onnx_model_str = build_tensorscatter_attention_graph_with_mask( @@ -833,7 +843,9 @@ def run_tensorscatter_attention_with_mask( io_binding.synchronize_outputs() output_result = output_ort.numpy() - return output_result, ref_output_3d + present_k_result = present_k_ort.numpy() + present_v_result = present_v_ort.numpy() + return output_result, ref_output_3d, present_k_result, present_v_result, ref_present_k, ref_present_v # Test cases for nonpad_kv_seqlen + attn_mask combination @@ -900,7 +912,7 @@ def test_nonpad_with_mask_cpu( mask_pos, use_bool_mask, ): - output, ref_output = run_tensorscatter_attention_with_mask( + output, ref_output, present_k, present_v, ref_present_k, ref_present_v = run_tensorscatter_attention_with_mask( batch_size=batch, total_kv_seq_len=total_kv, q_seq_len=q_seq, @@ -912,11 +924,12 @@ def test_nonpad_with_mask_cpu( mask_positions_to_block=mask_pos, use_bool_mask=use_bool_mask, ep="CPUExecutionProvider", - device="cpu", torch_type=torch.float32, ort_type=TensorProto.FLOAT, ) numpy.testing.assert_allclose(output, ref_output, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) + numpy.testing.assert_allclose(present_k, ref_present_k, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) + numpy.testing.assert_allclose(present_v, ref_present_v, rtol=cpu_fp32_rtol, atol=cpu_fp32_atol) @unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping tests.") @@ -942,7 +955,7 @@ def test_nonpad_with_bool_mask_cuda_fp16( mask_pos, use_bool_mask, ): - output, ref_output = run_tensorscatter_attention_with_mask( + output, ref_output, present_k, present_v, ref_present_k, ref_present_v = run_tensorscatter_attention_with_mask( batch_size=batch, total_kv_seq_len=total_kv, q_seq_len=q_seq, @@ -954,11 +967,12 @@ def test_nonpad_with_bool_mask_cuda_fp16( 
mask_positions_to_block=mask_pos, use_bool_mask=use_bool_mask, ep="CUDAExecutionProvider", - device="cuda", torch_type=torch.float16, ort_type=TensorProto.FLOAT16, ) numpy.testing.assert_allclose(output, ref_output, rtol=rtol["fp16"], atol=atol["fp16"]) + numpy.testing.assert_allclose(present_k, ref_present_k, rtol=rtol["fp16"], atol=atol["fp16"]) + numpy.testing.assert_allclose(present_v, ref_present_v, rtol=rtol["fp16"], atol=atol["fp16"]) if __name__ == "__main__": From a16baab8f93503445e89266075a10644d01fd203 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Fri, 6 Mar 2026 23:47:05 +0000 Subject: [PATCH 33/35] Address Copilot review round 3: log level, present_kv validation, cleanup T41: Downgrade nonpad+mask fallback log from WARNING to VERBOSE. T42: Add present_key/present_value validation in tensorscatter tests. T43: Remove unused device parameter from test helpers. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- onnxruntime/core/providers/cuda/llm/attention.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc index 964488af0b23b..4f7fd6a664bdc 100644 --- a/onnxruntime/core/providers/cuda/llm/attention.cc +++ b/onnxruntime/core/providers/cuda/llm/attention.cc @@ -1017,7 +1017,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { // When both nonpad_kv_seqlen and attn_mask are provided, Flash Attention cannot handle // the combination (no bias parameter). Route to MEA or Unfused which support composition. if (nonpad_kv_seqlen != nullptr && attn_mask != nullptr) { - LOGS_DEFAULT(WARNING) << "Both nonpad_kv_seqlen and attn_mask provided. " + LOGS_DEFAULT(VERBOSE) << "Both nonpad_kv_seqlen and attn_mask provided. " << "Flash Attention does not support this combination; " << "falling back to Memory Efficient Attention or Unfused path."; } From cb2ae8c51bc5faf2b487ee262f33cdca5f5a0f92 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Sat, 7 Mar 2026 19:01:08 +0000 Subject: [PATCH 34/35] Fix env var leak in tests: restore ORT_DISABLE_* after use Wrap env var changes in @patch.dict class decorators to prevent leaking between test cases and causing order-dependent failures. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../test_onnx_attention/test_gqa.py | 47 +++++++--------- .../test_onnx_attention/test_mha.py | 53 +++++++++---------- 2 files changed, 44 insertions(+), 56 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py index b3d30582196a8..15cc5e3c27803 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_gqa.py @@ -24,6 +24,7 @@ import os import unittest +from unittest.mock import patch import numpy import torch @@ -761,6 +762,7 @@ def gqa_past_padding_test_cases(): @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "0"}) class TestONNXAttentionFlashGQA(unittest.TestCase): """Test ONNX Attention op (opset 23) GQA path with Flash Attention. 
@@ -769,7 +771,6 @@ class TestONNXAttentionFlashGQA(unittest.TestCase): @parameterized.expand(gqa_prompt_test_cases()) def test_gqa_prompt_flash(self, name, config): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" parity_check_gqa_prompt( config=config, ep="CUDAExecutionProvider", @@ -783,7 +784,6 @@ def test_gqa_prompt_flash(self, name, config): @parameterized.expand(gqa_past_test_cases()) def test_gqa_past_flash(self, name, config): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" parity_check_gqa_past( config=config, ep="CUDAExecutionProvider", @@ -797,6 +797,7 @@ def test_gqa_past_flash(self, name, config): @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "0"}) class TestONNXAttentionFlashGQABF16(unittest.TestCase): """Test ONNX Attention op (opset 23) GQA path with Flash Attention using BFloat16. @@ -810,7 +811,6 @@ def test_gqa_prompt_flash_bf16(self, name, config): self.skipTest("BFloat16 not supported on this device") config.kv_cache_type = "bfloat16" - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" parity_check_gqa_prompt( config=config, ep="CUDAExecutionProvider", @@ -828,7 +828,6 @@ def test_gqa_past_flash_bf16(self, name, config): self.skipTest("BFloat16 not supported on this device") config.kv_cache_type = "bfloat16" - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" parity_check_gqa_past( config=config, ep="CUDAExecutionProvider", @@ -842,12 +841,12 @@ def test_gqa_past_flash_bf16(self, name, config): @unittest.skipIf(not has_cuda_device(53), "Memory Efficient Attention is not available, skipping tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "1"}) class TestONNXAttentionMemoryEfficientGQA(unittest.TestCase): """Test ONNX Attention op (opset 23) GQA path with Memory Efficient Attention.""" @parameterized.expand(gqa_prompt_test_cases()) def test_gqa_prompt_memory_efficient(self, name, config): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" parity_check_gqa_prompt( config=config, ep="CUDAExecutionProvider", @@ -865,6 +864,7 @@ def test_gqa_prompt_memory_efficient(self, name, config): @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "0"}) class TestONNXAttentionPaddingMaskGQA(unittest.TestCase): """ Test ONNX Attention op (opset 23) GQA path with boolean padding masks. 
@@ -884,8 +884,6 @@ class TestONNXAttentionPaddingMaskGQA(unittest.TestCase): @parameterized.expand(gqa_past_padding_test_cases()) def test_gqa_past_padding_flash(self, name, config): """Test decoding phase with padding mask using Flash Attention.""" - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" - past_seqlens = torch.full( (config.batch_size,), config.past_kv_sequence_length, @@ -906,6 +904,7 @@ def test_gqa_past_padding_flash(self, name, config): @unittest.skipIf(not has_cuda_device(53), "Memory Efficient Attention is not available, skipping tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "1"}) class TestONNXAttentionPaddingMaskMemoryEfficientGQA(unittest.TestCase): """ Test ONNX Attention op (opset 23) GQA path with boolean padding masks @@ -915,8 +914,6 @@ class TestONNXAttentionPaddingMaskMemoryEfficientGQA(unittest.TestCase): @parameterized.expand(gqa_prompt_padding_test_cases()) def test_gqa_prompt_padding_mea(self, name, config): """Test prompt phase with padding mask using Memory Efficient Attention.""" - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" - # Create seqlens with config.batch_size elements. # First batch has shorter valid length, rest at full length. seqlens_list = [config.kv_sequence_length - 6] + [config.kv_sequence_length] * (config.batch_size - 1) @@ -1102,6 +1099,7 @@ def gqa_nonpad_kv_seqlen_cpu_test_cases(): @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "0"}) class TestONNXAttentionGQANonpadKVSeqlen(unittest.TestCase): """Test ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen (Flash Attention). @@ -1110,7 +1108,6 @@ class TestONNXAttentionGQANonpadKVSeqlen(unittest.TestCase): @parameterized.expand(gqa_nonpad_kv_seqlen_test_cases()) def test_gqa_nonpad_kv_seqlen_flash(self, name, config, seqlens): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" nonpad_seqlens = torch.tensor(seqlens, dtype=torch.int64, device="cuda") parity_check_gqa_prompt_with_nonpad_kv_seqlen( @@ -1126,12 +1123,12 @@ def test_gqa_nonpad_kv_seqlen_flash(self, name, config, seqlens): @unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "1"}) class TestONNXAttentionGQANonpadKVSeqlenMEA(unittest.TestCase): """Test ONNX Attention op (opset 24) GQA path with nonpad_kv_seqlen (Memory Efficient Attention).""" @parameterized.expand(gqa_nonpad_kv_seqlen_test_cases()) def test_gqa_nonpad_kv_seqlen_mea(self, name, config, seqlens): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" nonpad_seqlens = torch.tensor(seqlens, dtype=torch.int64, device="cuda") parity_check_gqa_prompt_with_nonpad_kv_seqlen( @@ -1223,6 +1220,7 @@ def gqa_4d_bnsh_past_test_cases(): @unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping 4D BNSH tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "0"}) class TestONNXAttentionGQA4DBNSH(unittest.TestCase): """ Test GQA with 4D BNSH input format [batch, num_heads, seq, head_size]. 
@@ -1234,7 +1232,6 @@ class TestONNXAttentionGQA4DBNSH(unittest.TestCase): @parameterized.expand(gqa_4d_bnsh_test_cases()) def test_gqa_4d_bnsh_prompt(self, name, config): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" parity_check_gqa_prompt( config=config, ep="CUDAExecutionProvider", @@ -1248,7 +1245,6 @@ def test_gqa_4d_bnsh_prompt(self, name, config): @parameterized.expand(gqa_4d_bnsh_past_test_cases()) def test_gqa_4d_bnsh_decode(self, name, config): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" parity_check_gqa_past( config=config, ep="CUDAExecutionProvider", @@ -1267,6 +1263,7 @@ def test_gqa_4d_bnsh_decode(self, name, config): @unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping float mask tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "1"}) class TestONNXAttentionGQAFloatMask(unittest.TestCase): """ Test GQA with float additive attention mask (not bool) during prompt. @@ -1319,20 +1316,16 @@ def test_gqa_prompt_float_mask_4d(self): out_ref, _ = attention_ref(q=q, k=k, v=v, attn_bias=attn_bias_ref, causal=False) # ORT path (MEA handles GQA+float mask) - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" - try: - out_ort, _, _ = attention_prompt_func( - q=q, - k=k, - v=v, - config=config, - attn_mask=attn_mask, - ep="CUDAExecutionProvider", - device=device, - ort_type=TensorProto.FLOAT16, - ) - finally: - os.environ.pop("ORT_DISABLE_FLASH_ATTENTION", None) + out_ort, _, _ = attention_prompt_func( + q=q, + k=k, + v=v, + config=config, + attn_mask=attn_mask, + ep="CUDAExecutionProvider", + device=device, + ort_type=TensorProto.FLOAT16, + ) out_ort = out_ort.reshape(2, 16, 8, 128) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py index 2e72c5e33bc01..048461d083309 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py @@ -22,9 +22,9 @@ - Past KV cache - 2D, 3D, 4D additive/bool masks with broadcasting """ - import os import unittest +from unittest.mock import patch import numpy import torch @@ -1214,6 +1214,7 @@ def mha_unfused_test_cases(): @unittest.skipIf(not has_cuda_device(53), "CUDA device not available, skipping unfused tests.") +@patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "1", "ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION": "1"}) class TestONNXAttentionMHAUnfused(unittest.TestCase): """ Test the unfused attention kernel by disabling flash and MEA. 
@@ -1224,34 +1225,28 @@ class TestONNXAttentionMHAUnfused(unittest.TestCase): @parameterized.expand(mha_unfused_test_cases()) def test_mha_unfused_fp16(self, name, config): - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" - os.environ["ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION"] = "1" - try: - if "decode" in name: - parity_check_mha_past( - config=config, - ep="CUDAExecutionProvider", - device="cuda", - torch_type=torch.float16, - ort_type=TensorProto.FLOAT16, - causal=config.is_causal == 1, - rtol=rtol["fp16"], - atol=atol["fp16"], - ) - else: - parity_check_mha_prompt( - config=config, - ep="CUDAExecutionProvider", - device="cuda", - torch_type=torch.float16, - ort_type=TensorProto.FLOAT16, - causal=config.is_causal == 1, - rtol=rtol["fp16"], - atol=atol["fp16"], - ) - finally: - os.environ.pop("ORT_DISABLE_FLASH_ATTENTION", None) - os.environ.pop("ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION", None) + if "decode" in name: + parity_check_mha_past( + config=config, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + causal=config.is_causal == 1, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) + else: + parity_check_mha_prompt( + config=config, + ep="CUDAExecutionProvider", + device="cuda", + torch_type=torch.float16, + ort_type=TensorProto.FLOAT16, + causal=config.is_causal == 1, + rtol=rtol["fp16"], + atol=atol["fp16"], + ) # ################################################################################################# From 579af5f472b8d32b35d1bb1b9b8d3fe7f618f185 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Sat, 7 Mar 2026 19:07:30 +0000 Subject: [PATCH 35/35] lint --- .../test/python/transformers/test_onnx_attention/test_mha.py | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py index 048461d083309..5cb1e7b7c50b3 100644 --- a/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py +++ b/onnxruntime/test/python/transformers/test_onnx_attention/test_mha.py @@ -22,6 +22,7 @@ - Past KV cache - 2D, 3D, 4D additive/bool masks with broadcasting """ + import os import unittest from unittest.mock import patch
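
For reference, the isolation pattern that PATCH 34/35 apply to the test suites
reduces to the minimal, self-contained sketch below. Everything in it except
unittest.mock.patch.dict itself is illustrative (fake_attention_path and
TestEnvVarIsolation are stand-ins, not names from the suite). Used as a class
decorator, patch.dict saves os.environ before each test method and restores it
afterwards, pass or fail, which is why the try/finally blocks and
os.environ.pop calls in the earlier revisions could be deleted outright.

    import os
    import unittest
    from unittest.mock import patch


    def fake_attention_path() -> str:
        # Stand-in for kernel dispatch: reads the same env var the real
        # tests set to force a particular attention path.
        if os.environ.get("ORT_DISABLE_FLASH_ATTENTION") == "1":
            return "unfused"
        return "flash"


    # As a class decorator, patch.dict wraps every test_* method; the
    # original environment is restored on exit even if the test raises,
    # so no state leaks into later test cases and test order cannot
    # change results.
    @patch.dict(os.environ, {"ORT_DISABLE_FLASH_ATTENTION": "1"})
    class TestEnvVarIsolation(unittest.TestCase):
        def test_flash_disabled(self):
            self.assertEqual(fake_attention_path(), "unfused")


    if __name__ == "__main__":
        unittest.main()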