From 7632394677c4aca4ce66df96872bb70f74dd629b Mon Sep 17 00:00:00 2001
From: Chi Lo <Chi.Lo@microsoft.com>
Date: Tue, 24 Feb 2026 12:45:46 -0800
Subject: [PATCH 1/3] Fix int truncation of the parallel-for index in CPU Gather and add a regression test

---
 .../core/providers/cpu/tensor/gather.cc       | 20 ++++++++++--------
 .../providers/cpu/tensor/gather_op_test.cc    | 21 +++++++++++++++++++
 2 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/onnxruntime/core/providers/cpu/tensor/gather.cc b/onnxruntime/core/providers/cpu/tensor/gather.cc
index b13fcd4135f67..f171b33ee5f4f 100644
--- a/onnxruntime/core/providers/cpu/tensor/gather.cc
+++ b/onnxruntime/core/providers/cpu/tensor/gather.cc
@@ -79,9 +79,9 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin
     }
   }
 
-  auto lambda = [&](int64_t index) {
-    int64_t batch = index / N;
-    int64_t i = index % N;
+  auto lambda = [&](ptrdiff_t index) {
+    const int64_t batch = static_cast<int64_t>(index / N);
+    const int64_t i = static_cast<int64_t>(index % N);
 
     const int64_t src_offset_batch = batch * data_batch_bytes;
     const int64_t dst_offset_batch = batch * gathered_batch_bytes;
@@ -97,12 +97,14 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin
       memcpy(dst_base + dst_offset, src_base + src_offset, narrow<size_t>(block_size));
     }
   };
-  concurrency::ThreadPool::TryParallelFor(tp, SafeInt<ptrdiff_t>(M) * N, static_cast<double>(block_size),
-                                          [&lambda](ptrdiff_t first, ptrdiff_t last) {
-                                            for (int index = static_cast<int>(first), end = static_cast<int>(last); index < end; ++index) {
-                                              lambda(index);
-                                            }
-                                          });
+
+  concurrency::ThreadPool::TryParallelFor(
+      tp, SafeInt<ptrdiff_t>(M) * N, static_cast<double>(block_size),
+      [&lambda](ptrdiff_t first, ptrdiff_t last) {
+        for (ptrdiff_t index = first; index < last; ++index) {
+          lambda(index);
+        }
+      });
 
   return Status::OK();
 }
diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
index c1a5a31667315..3621bcc4bb40e 100644
--- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
@@ -341,6 +341,27 @@ TEST(GatherOpTest, Gather_axis1_indices2d_string) {
   test.Run();
 }
 
+TEST(GatherOpTest, Gather_OOB_write_read) {
+  OpTester test("Gather");
+  test.AddAttribute<int64_t>("axis", 1LL);
+
+  // Inputs: every data element and every index is 1, so every expected output element is 1.
+  const std::vector<int64_t> data_dims{65537, 2};
+  const std::vector<int64_t> indices_dims{65537};
+  std::vector<uint8_t> data_values(static_cast<size_t>(data_dims[0] * data_dims[1]), 1);
+  std::vector<int64_t> indices_values(static_cast<size_t>(indices_dims[0]), 1);
+  std::vector<uint8_t> expected_output_values(static_cast<size_t>(65537) * static_cast<size_t>(65537), 1);  // 65537 * 65537 one-byte elements
+
+  test.AddInput<uint8_t>("data", {65537, 2}, data_values);
+  test.AddInput<int64_t>("indices", {65537}, indices_values);
+  test.AddOutput<uint8_t>("output", {65537, 65537}, expected_output_values);
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.emplace_back(DefaultCpuExecutionProvider());  // the only provider handed to Run() below
+
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
 TEST(GatherOpTest, Gather_axis1_indices2d_bool) {
   OpTester test("Gather");
   test.AddAttribute<int64_t>("axis", 1LL);

From 7617bbc6fe9a680b224e2bc5dd9a7f2590af91a5 Mon Sep 17 00:00:00 2001
From: Chi Lo <Chi.Lo@microsoft.com>
Date: Tue, 24 Feb 2026 13:07:36 -0800
Subject: [PATCH 2/3] Rename Gather overflow test to Gather_overflow_check and document its purpose

---
 .../test/providers/cpu/tensor/gather_op_test.cc        | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
index 3621bcc4bb40e..4a88f1c9a454c 100644
--- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
@@ -341,7 +341,15 @@ TEST(GatherOpTest, Gather_axis1_indices2d_string) {
   test.Run();
 }
 
-TEST(GatherOpTest, Gather_OOB_write_read) {
+TEST(GatherOpTest, Gather_overflow_check) {
+  // The test uses dimensions (65537, 2) and indices of length 65537, which produce an output
+  // shape of (65537, 65537).
+  // 
+  // 65537 x 65537 = 4,295,098,369 which is greater than the maximum value of a 32-bit integer (2,147,483,647).
+  // 
+  // This test is to verify CPU implementation of the Gather operator doesn't overflow when calculating
+  // the output shape and generating the output tensor.
+
   OpTester test("Gather");
   test.AddAttribute<int64_t>("axis", 1LL);
 

From 3ca8e895365db453a5283c83b2c01f7fd93817c9 Mon Sep 17 00:00:00 2001
From: Chi Lo <Chi.Lo@microsoft.com>
Date: Tue, 24 Feb 2026 14:05:36 -0800
Subject: [PATCH 3/3] Skip Gather_overflow_check test on 32-bit platforms

---
 .../test/providers/cpu/tensor/gather_op_test.cc        | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
index 4a88f1c9a454c..82a9d86a3630a 100644
--- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
@@ -342,11 +342,17 @@ TEST(GatherOpTest, Gather_axis1_indices2d_string) {
 }
 
 TEST(GatherOpTest, Gather_overflow_check) {
+// Skip on 32-bit platforms: the 65537 x 65537 element count does not fit in a 32-bit size_t,
+// and the roughly 4 GiB reference tensor cannot be allocated there anyway.
+#if SIZE_MAX <= UINT32_MAX
+  GTEST_SKIP() << "Gather_overflow_check skipped on 32-bit platforms.";
+#endif
+
   // The test uses dimensions (65537, 2) and indices of length 65537, which produce an output
   // shape of (65537, 65537).
-  // 
+  //
   // 65537 x 65537 = 4,295,098,369 which is greater than the maximum value of a 32-bit integer (2,147,483,647).
-  // 
+  //
   // This test is to verify CPU implementation of the Gather operator doesn't overflow when calculating
   // the output shape and generating the output tensor.