diff --git a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp index 0b9313632c9a6..f354f9dc47fe5 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include @@ -467,29 +469,15 @@ static std::unique_ptr LhsPackImageDataSme(const size_t ci, const s // pad_ptr must be at least 'ci' floats for padding pixels. // Using a thread_local grow-only buffer to avoid cross-thread interference and ensure sizing is correct. + // + // The pad buffer contents are always zero. Since the buffer is grow-only and never written with non-zero data, + // we only need to zero-initialize newly-grown elements. thread_local std::vector<float> pad_ptr; - const float* old_pad_ptr = pad_ptr.data(); - bool has_pad_ptr_changed = false; if (pad_ptr.size() < padsize) { pad_ptr.resize(padsize, 0.f); - if (pad_ptr.data() != old_pad_ptr) { - has_pad_ptr_changed = true; - } - } else { - // Ensure any previously-used region remains zeroed (grow-only means it should already be zeros, - // but keep this explicit for safety). - std::fill(pad_ptr.begin(), pad_ptr.end(), 0.f); } - LhsCacheKey key = { - ci, ih, iw, - padding, sh, sw, - kh, kw, - 1, 1, - HashWeights(in) - }; - //create lhs in format required for imatmul const auto m = ComputeConvOutSize(ih, kh, padding, sh) * ComputeConvOutSize(iw, kw, padding, sw); @@ -498,18 +486,31 @@ static std::unique_ptr LhsPackImageDataSme(const size_t ci, const s auto nhwc = NChwToNhwc(1, ci, ih, iw, in, 1, 1, false, ThreadPool); - // Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions. - thread_local std::unordered_map<LhsCacheKey, std::shared_ptr<const void*[]>> lhs_ptrs_cache; - - if (has_pad_ptr_changed) - { - // If the pad buffer was resized and a re-allocation has occurred, the cached lhs ptrs are invalid as they - // would be referencing the old pad buffer.
- // See discussion in https://github.com/microsoft/onnxruntime/pull/27214. - // TODO(hasesh / JonathanC-ARM): A better approach would be to include the pad buffer address in the cache key - // or any other approach that would reduce unnecessary cache invalidations. - lhs_ptrs_cache.clear(); + // Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions. + // + // Entries include pointers to the pad buffer for out-of-bounds pixels, so we must not reuse entries after the + // pad buffer is reallocated. To avoid clearing the entire cache, we group caches by pad buffer identity and + // invalidate only the old group when the pad buffer moves. + using LhsPtrsCache = std::unordered_map<LhsCacheKey, std::shared_ptr<const void*[]>>; + thread_local std::unordered_map<const float*, LhsPtrsCache> lhs_ptrs_cache_by_pad; + + // If pad_ptr moved (vector reallocation), drop only the old group to avoid accumulating unreachable entries. + thread_local const float* last_pad_ptr = nullptr; + const float* cur_pad_ptr = pad_ptr.data(); + if (last_pad_ptr != nullptr && last_pad_ptr != cur_pad_ptr) { + lhs_ptrs_cache_by_pad.erase(last_pad_ptr); } + last_pad_ptr = cur_pad_ptr; + + LhsCacheKey key = { + ci, ih, iw, + padding, sh, sw, + kh, kw, + 1, 1, + HashWeights(in) + }; + + auto& lhs_ptrs_cache = lhs_ptrs_cache_by_pad[cur_pad_ptr]; std::shared_ptr<const void*[]> lhs_ptrs; if (auto found = lhs_ptrs_cache.find(key); found != lhs_ptrs_cache.end()) { diff --git a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp index e58bad387fdc5..9e70b0742217a 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp @@ -6,6 +6,10 @@ +#include +#include +#include + #include "mlasi_kleidiai.h" #include "kai_ukernel_interface.h" diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index c35e48b6b90ad..85ab94c0932b1 100644 ---
a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -4,13 +4,16 @@ // SPDX-License-Identifier: MIT // +#include +#include +#include + #include "mlas.h" #include "mlasi_kleidiai.h" #include "kai_ukernel_interface.h" - #include "kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h" diff --git a/onnxruntime/test/mlas/unittest/test_conv2d.h b/onnxruntime/test/mlas/unittest/test_conv2d.h index 20bf0ec84f5bf..9f406c13f03f1 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d.h +++ b/onnxruntime/test/mlas/unittest/test_conv2d.h @@ -331,5 +331,24 @@ class MlasConv2DTest : public MlasTestBase { } } } + + // + // Regression test: exercise a KleidiAI Conv2D path when KleidiAI is enabled. + // See https://github.com/microsoft/onnxruntime/issues/26669. + // + // The KleidiAI implementation uses an internal per-thread padding buffer for out-of-bounds pixels + // when constructing the LHS indirection table. Historically, if the buffer was too small for a later + // convolution (larger CI), resizing could invalidate cached indirection pointers and lead to + // non-deterministic corruption. + // + // This sequence forces pad-buffer growth by running a smaller-CI convolution followed by a larger-CI + // convolution (with padding to ensure pad pointers are used), then runs the smaller-CI convolution again. + // Repeat a few times to increase the likelihood of triggering a reallocation and verify the path. + // + for (int i = 0; i < 4; ++i) { + Test(1, 1, 64, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1); // smaller CI + Test(1, 1, 320, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1); // larger CI forces pad buffer growth + Test(1, 1, 64, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1); // sanity: back to smaller CI after growth + } } };