From c0b4397f8701854aeb6a79424815f0aa187f1490 Mon Sep 17 00:00:00 2001
From: vraspar
Date: Tue, 28 Apr 2026 17:04:52 -0700
Subject: [PATCH 1/5] Relax GQA seqlens_k shape validation for backward compat
 with older models

PR #28031 tightened seqlens_k shape validation (&&->||), correctly
rejecting non-1D tensors per spec. However, older model builders emit
seqlens_k with shape [1,1] instead of [1], breaking HuggingFace LLMs
(qwen3-0.6b, qwen3-1.7b).

Relax shape check to allow unit dimensions around the batch axis: each
dim must be 1 or batch_size (accepts [B], [B,1], [1,1] but rejects
[2,2] for B=4). Also fixes the same latent && bug in JS/WebGPU EP.

Value bounds checks in Compute() are unchanged.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../jsep/webgpu/ops/group-query-attention.ts  |  18 ++-
 .../cpu/bert/group_query_attention_helper.h   |  14 +-
 .../group_query_attention_op_test.cc          | 142 +++++++++---------
 3 files changed, 98 insertions(+), 76 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
index d218be3ce8b5f..96b5865574abd 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
@@ -193,9 +193,23 @@ export const validateInputs = (
       passPastInKv = true;
     }
   }
+  // Spec requires 1D shape (batch_size), but older model builders may add unit
+  // dimensions (e.g. [B, 1] instead of [B]). Allow shapes where each dim is 1 or batchSize.
   const seqlLens = inputs.length > 4 ? inputs[5] : undefined;
-  if (seqlLens && seqlLens.dims.length !== 1 && seqlLens.dims[0] !== batchSize) {
-    throw new Error('Input "seqlens" is expected to have 1 dimension and the same dim 0 as batch_size');
+  if (seqlLens) {
+    const seqlLenSize = seqlLens.dims.reduce((a, b) => a * b, 1);
+    if (seqlLenSize !== batchSize) {
+      throw new Error(
+        `seqlens_k must have batch_size (${batchSize}) elements, got ${seqlLenSize}.`,
+      );
+    }
+    for (let i = 0; i < seqlLens.dims.length; i++) {
+      if (seqlLens.dims[i] !== 1 && seqlLens.dims[i] !== batchSize) {
+        throw new Error(
+          `seqlens_k has unexpected shape. Each dimension must be 1 or batch_size (${batchSize}), got dims[${i}] = ${seqlLens.dims[i]}.`,
+        );
+      }
+    }
   }
   const totalSequenceLength = -1;
   const maxSequenceLength = -1;
diff --git a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
index f5399e307fbca..9146a9e90e412 100644
--- a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
@@ -261,10 +261,20 @@ Status CheckInputs(const T* query,
                            "Input 'past_key' and 'past_value' shall be both present or both absent.");
   }
 
+  // Spec requires 1D shape (batch_size), but older model builders may add unit
+  // dimensions (e.g. [B, 1] instead of [B]). Allow shapes where each dim is 1 or batch_size.
   const auto& seqlens_k_dim = seqlens_k->Shape().GetDims();
-  if (seqlens_k_dim.size() != 1 || seqlens_k_dim[0] != batch_size) {
+  if (seqlens_k->Shape().Size() != static_cast<int64_t>(batch_size)) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "seqlens_k must be shape (batch_size).");
+                           "seqlens_k must have batch_size (", batch_size, ") elements, got ",
+                           seqlens_k->Shape().Size(), ".");
+  }
+  for (size_t i = 0; i < seqlens_k_dim.size(); ++i) {
+    if (seqlens_k_dim[i] != 1 && seqlens_k_dim[i] != static_cast<int64_t>(batch_size)) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "seqlens_k has unexpected shape. Each dimension must be 1 or batch_size (",
+                             batch_size, "), got dim[", i, "] = ", seqlens_k_dim[i], ".");
+    }
   }
 
   if (!onnxruntime::IsScalarOr1ElementVector(total_seqlen)) {
diff --git a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
index 0690094031bb8..350623fac8838 100644
--- a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
@@ -22,7 +22,8 @@ static void RunGQASeqlensKTest(
     OpTester::ExpectResult expect,
     const std::string& expected_message,
     bool provide_past = false,
-    int past_seq_len = 0) {
+    int past_seq_len = 0,
+    const std::vector<int64_t>& seqlens_k_shape = {}) {
   constexpr int num_heads = 1;
   constexpr int kv_num_heads = 1;
   constexpr int head_size = 8;
@@ -52,7 +53,10 @@ static void RunGQASeqlensKTest(
     tester.AddOptionalInputEdge<float>();  // past_value
   }
 
-  tester.AddInput<int32_t>("seqlens_k", {batch_size}, seqlens_k_data);
+  std::vector<int64_t> shape = seqlens_k_shape.empty()
+                                   ? std::vector<int64_t>{batch_size}
+                                   : seqlens_k_shape;
+  tester.AddInput<int32_t>("seqlens_k", shape, seqlens_k_data);
   tester.AddInput<int32_t>("total_sequence_length", {1}, {total_seq_len});
 
   tester.AddOptionalInputEdge<float>();  // cos_cache
@@ -73,8 +77,7 @@ static void RunGQASeqlensKTest(
                           {batch_size, kv_num_heads, declared_present_seqlen, head_size},
                           std::vector<float>(batch_size * kv_num_heads * declared_present_seqlen * head_size, 0.0f));
 
-  // For success tests, we only care that validation passes without crash;
-  // exact output values are not the focus of these security regression tests.
+  // Tolerance is intentionally loose: these tests validate shape acceptance, not output values.
   if (expect == OpTester::ExpectResult::kExpectSuccess) {
     tester.SetOutputTolerance(1e6f);
   }
@@ -231,80 +234,75 @@ TEST(GroupQueryAttentionTest, TotalSeqLenNegative) {
       "total_sequence_length must be positive");
 }
 
-// Shape validation: seqlens_k with wrong rank (2D instead of 1D) must be rejected.
-TEST(GroupQueryAttentionTest, SeqlensKWrongRank) {
-  constexpr int num_heads = 1;
-  constexpr int kv_num_heads = 1;
-  constexpr int head_size = 8;
-  constexpr int hidden_size = num_heads * head_size;
-  constexpr int kv_hidden_size = kv_num_heads * head_size;
-
-  OpTester tester("GroupQueryAttention", 1, onnxruntime::kMSDomain);
-  tester.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
-  tester.AddAttribute<int64_t>("kv_num_heads", static_cast<int64_t>(kv_num_heads));
-
-  tester.AddInput<float>("query", {1, 1, hidden_size}, std::vector<float>(hidden_size, 1.0f));
-  tester.AddInput<float>("key", {1, 1, kv_hidden_size}, std::vector<float>(kv_hidden_size, 1.0f));
-  tester.AddInput<float>("value", {1, 1, kv_hidden_size}, std::vector<float>(kv_hidden_size, 1.0f));
-  tester.AddOptionalInputEdge<float>();  // past_key
-  tester.AddOptionalInputEdge<float>();  // past_value
-  // 2D shape {1, 1} instead of {1}
-  tester.AddInput<int32_t>("seqlens_k", {1, 1}, {0});
-  tester.AddInput<int32_t>("total_sequence_length", {1}, {1});
-  tester.AddOptionalInputEdge<float>();  // cos_cache
-  tester.AddOptionalInputEdge<float>();  // sin_cache
-  tester.AddOptionalInputEdge<int64_t>();  // position_ids
-  tester.AddOptionalInputEdge<float>();  // attention_bias
-  tester.AddOptionalInputEdge<float>();  // head_sink
+// Backward compat: seqlens_k shape {1, 1} accepted for batch_size=1.
+// Older model builders (e.g. qwen3-0.6b) emit this instead of {1}.
+TEST(GroupQueryAttentionTest, SeqlensKLegacy2DShape) {
+  RunGQASeqlensKTest(
+      /*seqlens_k_data=*/{0},
+      /*total_seq_len=*/1,
+      /*batch_size=*/1,
+      /*sequence_length=*/1,
+      OpTester::ExpectResult::kExpectSuccess,
+      "",
+      /*provide_past=*/false,
+      /*past_seq_len=*/0,
+      /*seqlens_k_shape=*/{1, 1});
+}
 
-  tester.AddOutput<float>("output", {1, 1, hidden_size}, std::vector<float>(hidden_size, 0.0f));
-  tester.AddOutput<float>("present_key", {1, kv_num_heads, 1, head_size},
-                          std::vector<float>(kv_num_heads * head_size, 0.0f));
-  tester.AddOutput<float>("present_value", {1, kv_num_heads, 1, head_size},
-                          std::vector<float>(kv_num_heads * head_size, 0.0f));
+// Backward compat: seqlens_k shape {2, 1} accepted for batch_size=2.
+TEST(GroupQueryAttentionTest, SeqlensKLegacy2DShapeMultiBatch) {
+  RunGQASeqlensKTest(
+      /*seqlens_k_data=*/{0, 0},
+      /*total_seq_len=*/1,
+      /*batch_size=*/2,
+      /*sequence_length=*/1,
+      OpTester::ExpectResult::kExpectSuccess,
+      "",
+      /*provide_past=*/false,
+      /*past_seq_len=*/0,
+      /*seqlens_k_shape=*/{2, 1});
+}
 
-  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-  execution_providers.push_back(DefaultCpuExecutionProvider());
-  tester.Run(OpTester::ExpectResult::kExpectFailure, "seqlens_k must be shape (batch_size)",
-             {}, nullptr, &execution_providers);
+// Shape {2, 2} with batch_size=4: correct element count but invalid factored shape.
+TEST(GroupQueryAttentionTest, SeqlensKInvalidFactoredShape) {
+  RunGQASeqlensKTest(
+      /*seqlens_k_data=*/{0, 0, 0, 0},
+      /*total_seq_len=*/1,
+      /*batch_size=*/4,
+      /*sequence_length=*/1,
+      OpTester::ExpectResult::kExpectFailure,
+      "seqlens_k has unexpected shape",
+      /*provide_past=*/false,
+      /*past_seq_len=*/0,
+      /*seqlens_k_shape=*/{2, 2});
 }
 
-// Shape validation: seqlens_k with wrong length (2 elements for batch_size=1) must be rejected.
+// Wrong element count (1D): 2 elements for batch_size=1.
 TEST(GroupQueryAttentionTest, SeqlensKWrongLength) {
-  constexpr int num_heads = 1;
-  constexpr int kv_num_heads = 1;
-  constexpr int head_size = 8;
-  constexpr int hidden_size = num_heads * head_size;
-  constexpr int kv_hidden_size = kv_num_heads * head_size;
-
-  OpTester tester("GroupQueryAttention", 1, onnxruntime::kMSDomain);
-  tester.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
-  tester.AddAttribute<int64_t>("kv_num_heads", static_cast<int64_t>(kv_num_heads));
-
-  tester.AddInput<float>("query", {1, 1, hidden_size}, std::vector<float>(hidden_size, 1.0f));
-  tester.AddInput<float>("key", {1, 1, kv_hidden_size}, std::vector<float>(kv_hidden_size, 1.0f));
-  tester.AddInput<float>("value", {1, 1, kv_hidden_size}, std::vector<float>(kv_hidden_size, 1.0f));
-  tester.AddOptionalInputEdge<float>();  // past_key
-  tester.AddOptionalInputEdge<float>();  // past_value
-  // Length 2 instead of 1 for batch_size=1
-  tester.AddInput<int32_t>("seqlens_k", {2}, {0, 0});
-  tester.AddInput<int32_t>("total_sequence_length", {1}, {1});
-  tester.AddOptionalInputEdge<float>();  // cos_cache
-  tester.AddOptionalInputEdge<float>();  // sin_cache
-  tester.AddOptionalInputEdge<int64_t>();  // position_ids
-  tester.AddOptionalInputEdge<float>();  // attention_bias
-  tester.AddOptionalInputEdge<float>();  // head_sink
-
-  tester.AddOutput<float>("output", {1, 1, hidden_size}, std::vector<float>(hidden_size, 0.0f));
-  tester.AddOutput<float>("present_key", {1, kv_num_heads, 1, head_size},
-                          std::vector<float>(kv_num_heads * head_size, 0.0f));
-  tester.AddOutput<float>("present_value", {1, kv_num_heads, 1, head_size},
-                          std::vector<float>(kv_num_heads * head_size, 0.0f));
+  RunGQASeqlensKTest(
+      /*seqlens_k_data=*/{0, 0},
+      /*total_seq_len=*/1,
+      /*batch_size=*/1,
+      /*sequence_length=*/1,
+      OpTester::ExpectResult::kExpectFailure,
+      "seqlens_k must have batch_size",
+      /*provide_past=*/false,
+      /*past_seq_len=*/0,
+      /*seqlens_k_shape=*/{2});
 }
 
-  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-  execution_providers.push_back(DefaultCpuExecutionProvider());
-  tester.Run(OpTester::ExpectResult::kExpectFailure, "seqlens_k must be shape (batch_size)",
-             {}, nullptr, &execution_providers);
+// Wrong element count (2D): shape {2, 1} has 2 elements but batch_size=1.
+TEST(GroupQueryAttentionTest, SeqlensKWrongElementCount2D) {
+  RunGQASeqlensKTest(
+      /*seqlens_k_data=*/{0, 0},
+      /*total_seq_len=*/1,
+      /*batch_size=*/1,
+      /*sequence_length=*/1,
+      OpTester::ExpectResult::kExpectFailure,
+      "seqlens_k must have batch_size",
+      /*provide_past=*/false,
+      /*past_seq_len=*/0,
+      /*seqlens_k_shape=*/{2, 1});
 }
 
 }  // namespace test

From 958c8f3db4f3ece5b70124a34da6754fad07ddd7 Mon Sep 17 00:00:00 2001
From: vraspar
Date: Tue, 28 Apr 2026 23:37:53 -0700
Subject: [PATCH 2/5] Add tests for legacy 2D seqlens_k shapes

Add JS/WebGPU test for [1,1] seqlens_k shape (the exact qwen3
regression case) and C++ test for trailing batch dim shape {1,B}.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../test/data/ops/group-query-attention.jsonc | 78 +++++++++++++++++++
 .../group_query_attention_op_test.cc          | 15 ++++
 2 files changed, 93 insertions(+)

diff --git a/js/web/test/data/ops/group-query-attention.jsonc b/js/web/test/data/ops/group-query-attention.jsonc
index f71e89f727cb1..83a5dc765280e 100644
--- a/js/web/test/data/ops/group-query-attention.jsonc
+++ b/js/web/test/data/ops/group-query-attention.jsonc
@@ -1409,5 +1409,83 @@
         ]
       }
     ]
+  },
+  {
+    // Backward compat: seqlens_k shape [1, 1] accepted for batch_size=1.
+    // Older model builders (e.g. qwen3-0.6b) emit this instead of [1].
+    "name": "GroupQueryAttention Legacy2D SeqlensK",
+    "operator": "GroupQueryAttention",
+    "opset": { "domain": "com.microsoft", "version": 1 },
+    "attributes": [
+      { "name": "num_heads", "data": 1, "type": "int" },
+      { "name": "kv_num_heads", "data": 1, "type": "int" }
+    ],
+    "cases": [
+      {
+        "name": "T[0]",
+        "inputs": [
+          {
+            "data": [0, 1, 2, 3, 4, 5, 6, 7],
+            "dims": [1, 1, 8],
+            "type": "float32"
+          },
+          // key
+          {
+            "data": [16, 17, 18, 19, 20, 21, 22, 23],
+            "dims": [1, 1, 8],
+            "type": "float32"
+          },
+          // value
+          {
+            "data": [32, 33, 34, 35, 36, 37, 38, 39],
+            "dims": [1, 1, 8],
+            "type": "float32"
+          },
+          // past key, BNSH
+          {
+            "data": [],
+            "dims": [1, 1, 0, 8],
+            "type": "float32"
+          },
+          // past value, BNSH
+          {
+            "data": [],
+            "dims": [1, 1, 0, 8],
+            "type": "float32"
+          },
+          // seqlens_k -- legacy [1, 1] shape instead of [1]
+          {
+            "data": [1],
+            "dims": [1, 1],
+            "type": "int32"
+          },
+          // total_sequence_length
+          {
+            "data": [1],
+            "dims": [1],
+            "type": "int32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [32, 33, 34, 35, 36, 37, 38, 39],
+            "dims": [1, 1, 8],
+            "type": "float32"
+          },
+          {
+            // present key, BNSH
+            "data": [16, 17, 18, 19, 20, 21, 22, 23],
+            "dims": [1, 1, 1, 8],
+            "type": "float32"
+          },
+          {
+            // present value, BNSH
+            "data": [32, 33, 34, 35, 36, 37, 38, 39],
+            "dims": [1, 1, 1, 8],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
   }
 ]
diff --git a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
index 350623fac8838..f0958dd32a9a6 100644
--- a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
@@ -263,6 +263,21 @@ TEST(GroupQueryAttentionTest, SeqlensKLegacy2DShapeMultiBatch) {
       /*seqlens_k_shape=*/{2, 1});
 }
 
+// Backward compat: seqlens_k shape {1, 2} accepted for batch_size=2.
+// Batch dimension in trailing position.
+TEST(GroupQueryAttentionTest, SeqlensKLegacy2DShapeTrailingBatch) {
+  RunGQASeqlensKTest(
+      /*seqlens_k_data=*/{0, 0},
+      /*total_seq_len=*/1,
+      /*batch_size=*/2,
+      /*sequence_length=*/1,
+      OpTester::ExpectResult::kExpectSuccess,
+      "",
+      /*provide_past=*/false,
+      /*past_seq_len=*/0,
+      /*seqlens_k_shape=*/{1, 2});
+}
+
 // Shape {2, 2} with batch_size=4: correct element count but invalid factored shape.
 TEST(GroupQueryAttentionTest, SeqlensKInvalidFactoredShape) {
   RunGQASeqlensKTest(

From dac0ffa526c38d6c375665ed5133e635c5353cee Mon Sep 17 00:00:00 2001
From: vraspar
Date: Wed, 29 Apr 2026 11:28:16 -0700
Subject: [PATCH 3/5] Reject scalar seqlens_k, use std::optional in test
 helper

Address review comments:
- Reject rank-0 (scalar) seqlens_k in both C++ and JS validation
- Use std::optional for test helper seqlens_k_shape param
- Add SeqlensKScalarRejected test case

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../jsep/webgpu/ops/group-query-attention.ts  |  3 ++
 .../cpu/bert/group_query_attention_helper.h   |  4 +++
 .../group_query_attention_op_test.cc          | 35 +++++++++++++------
 3 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
index 96b5865574abd..9620735ad9b8d 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
@@ -197,6 +197,9 @@ export const validateInputs = (
   // dimensions (e.g. [B, 1] instead of [B]). Allow shapes where each dim is 1 or batchSize.
   const seqlLens = inputs.length > 4 ? inputs[5] : undefined;
   if (seqlLens) {
+    if (seqlLens.dims.length === 0) {
+      throw new Error('seqlens_k must be at least 1D, got scalar.');
+    }
     const seqlLenSize = seqlLens.dims.reduce((a, b) => a * b, 1);
     if (seqlLenSize !== batchSize) {
       throw new Error(
diff --git a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
index 9146a9e90e412..0269523e0f34e 100644
--- a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
@@ -263,6 +263,10 @@ Status CheckInputs(const T* query,
 
   // Spec requires 1D shape (batch_size), but older model builders may add unit
   // dimensions (e.g. [B, 1] instead of [B]). Allow shapes where each dim is 1 or batch_size.
+  if (seqlens_k->Shape().NumDimensions() == 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "seqlens_k must be at least 1D, got scalar.");
+  }
   const auto& seqlens_k_dim = seqlens_k->Shape().GetDims();
   if (seqlens_k->Shape().Size() != static_cast<int64_t>(batch_size)) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
diff --git a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
index f0958dd32a9a6..26e1ea205761f 100644
--- a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include <memory>
+#include <optional>
 
 #include "gtest/gtest.h"
 #include "test/common/tensor_op_test_utils.h"
@@ -23,7 +24,7 @@ static void RunGQASeqlensKTest(
     const std::string& expected_message,
     bool provide_past = false,
     int past_seq_len = 0,
-    const std::vector<int64_t>& seqlens_k_shape = {}) {
+    const std::optional<std::vector<int64_t>>& seqlens_k_shape = std::nullopt) {
   constexpr int num_heads = 1;
   constexpr int kv_num_heads = 1;
   constexpr int head_size = 8;
@@ -53,9 +54,9 @@ static void RunGQASeqlensKTest(
   }
 
-  std::vector<int64_t> shape = seqlens_k_shape.empty()
-                                   ? std::vector<int64_t>{batch_size}
-                                   : seqlens_k_shape;
+  std::vector<int64_t> shape = seqlens_k_shape.has_value()
+                                   ? seqlens_k_shape.value()
+                                   : std::vector<int64_t>{batch_size};
   tester.AddInput<int32_t>("seqlens_k", shape, seqlens_k_data);
   tester.AddInput<int32_t>("total_sequence_length", {1}, {total_seq_len});
 
   tester.AddOptionalInputEdge<float>();  // cos_cache
@@ -246,7 +247,7 @@ TEST(GroupQueryAttentionTest, SeqlensKLegacy2DShape) {
       "",
       /*provide_past=*/false,
       /*past_seq_len=*/0,
-      /*seqlens_k_shape=*/{1, 1});
+      /*seqlens_k_shape=*/std::vector<int64_t>{1, 1});
 }
 
 // Backward compat: seqlens_k shape {2, 1} accepted for batch_size=2.
@@ -260,7 +261,7 @@ TEST(GroupQueryAttentionTest, SeqlensKLegacy2DShapeMultiBatch) {
       "",
       /*provide_past=*/false,
       /*past_seq_len=*/0,
-      /*seqlens_k_shape=*/{2, 1});
+      /*seqlens_k_shape=*/std::vector<int64_t>{2, 1});
 }
 
 // Backward compat: seqlens_k shape {1, 2} accepted for batch_size=2.
@@ -275,7 +276,7 @@ TEST(GroupQueryAttentionTest, SeqlensKLegacy2DShapeTrailingBatch) {
       "",
       /*provide_past=*/false,
       /*past_seq_len=*/0,
-      /*seqlens_k_shape=*/{1, 2});
+      /*seqlens_k_shape=*/std::vector<int64_t>{1, 2});
 }
 
 // Shape {2, 2} with batch_size=4: correct element count but invalid factored shape.
@@ -289,7 +290,7 @@ TEST(GroupQueryAttentionTest, SeqlensKInvalidFactoredShape) {
       "seqlens_k has unexpected shape",
       /*provide_past=*/false,
       /*past_seq_len=*/0,
-      /*seqlens_k_shape=*/{2, 2});
+      /*seqlens_k_shape=*/std::vector<int64_t>{2, 2});
 }
 
 // Wrong element count (1D): 2 elements for batch_size=1.
@@ -303,7 +304,7 @@ TEST(GroupQueryAttentionTest, SeqlensKWrongLength) {
      "seqlens_k must have batch_size",
       /*provide_past=*/false,
       /*past_seq_len=*/0,
-      /*seqlens_k_shape=*/{2});
+      /*seqlens_k_shape=*/std::vector<int64_t>{2});
 }
 
 // Wrong element count (2D): shape {2, 1} has 2 elements but batch_size=1.
@@ -317,7 +318,21 @@ TEST(GroupQueryAttentionTest, SeqlensKWrongElementCount2D) {
       "seqlens_k must have batch_size",
       /*provide_past=*/false,
       /*past_seq_len=*/0,
-      /*seqlens_k_shape=*/{2, 1});
+      /*seqlens_k_shape=*/std::vector<int64_t>{2, 1});
+}
+
+// Scalar seqlens_k must be rejected even when batch_size=1.
+TEST(GroupQueryAttentionTest, SeqlensKScalarRejected) {
+  RunGQASeqlensKTest(
+      /*seqlens_k_data=*/{0},
+      /*total_seq_len=*/1,
+      /*batch_size=*/1,
+      /*sequence_length=*/1,
+      OpTester::ExpectResult::kExpectFailure,
+      "seqlens_k must be at least 1D",
+      /*provide_past=*/false,
+      /*past_seq_len=*/0,
+      /*seqlens_k_shape=*/std::vector<int64_t>{});
 }
 
 }  // namespace test

From e0b7ef5a6b9eeec1adb94961e713e2fa2148bae1 Mon Sep 17 00:00:00 2001
From: vraspar
Date: Wed, 29 Apr 2026 13:15:59 -0700
Subject: [PATCH 4/5] Use unchecked operator*

Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
---
 onnxruntime/test/contrib_ops/group_query_attention_op_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
index 26e1ea205761f..508d8d0f200ac 100644
--- a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
@@ -55,7 +55,7 @@ static void RunGQASeqlensKTest(
   }
 
   std::vector<int64_t> shape = seqlens_k_shape.has_value()
-                                   ? seqlens_k_shape.value()
+                                   ? *seqlens_k_shape
                                    : std::vector<int64_t>{batch_size};
   tester.AddInput<int32_t>("seqlens_k", shape, seqlens_k_data);
   tester.AddInput<int32_t>("total_sequence_length", {1}, {total_seq_len});

From 8e935851850886332f7a9214d0c699f295b839ff Mon Sep 17 00:00:00 2001
From: vraspar
Date: Fri, 1 May 2026 18:02:58 +0000
Subject: [PATCH 5/5] Fix JS linting for group-query-attention.ts

---
 js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
index 9620735ad9b8d..9050c1bbb8816 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
@@ -202,9 +202,7 @@ export const validateInputs = (
     }
     const seqlLenSize = seqlLens.dims.reduce((a, b) => a * b, 1);
     if (seqlLenSize !== batchSize) {
-      throw new Error(
-        `seqlens_k must have batch_size (${batchSize}) elements, got ${seqlLenSize}.`,
-      );
+      throw new Error(`seqlens_k must have batch_size (${batchSize}) elements, got ${seqlLenSize}.`);
     }
     for (let i = 0; i < seqlLens.dims.length; i++) {
       if (seqlLens.dims[i] !== 1 && seqlLens.dims[i] !== batchSize) {