microsoft · xadupre · Aug 7, 2025 · Oct 9, 2025 · Oct 9, 2025
diff --git a/onnxruntime/core/providers/cpu/llm/attention.cc b/onnxruntime/core/providers/cpu/llm/attention.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/providers/cpu/llm/attention.h"
+#include "core/providers/cpu/llm/attention_helper.h"
 
 #include "core/common/common.h"
 #include "core/common/safeint.h"
@@ -140,10 +141,10 @@ Status Attention<T>::Compute(OpKernelContext* context) const {
   const Tensor* past_value = context->Input<Tensor>(5);
 
   AttentionParameters parameters;
-  std::vector<int64_t> y_shape;
-  std::vector<int64_t> present_key_shape;
-  std::vector<int64_t> present_value_shape;
-  std::vector<int64_t> output_qk_shape;
+  TensorShape y_shape;
+  TensorShape present_key_shape;
+  TensorShape present_value_shape;
+  TensorShape output_qk_shape;
 
   ORT_ENFORCE(attention_helper::ComputeOutputShapeForAttention(
                   Q,

diff --git a/onnxruntime/core/providers/cpu/llm/attention.h b/onnxruntime/core/providers/cpu/llm/attention.h
@@ -5,7 +5,7 @@
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
 #include "core/platform/threadpool.h"
-#include "core/providers/cpu/llm/attention_helper.h"
+#include "core/providers/cpu/llm/attention_parameters.h"
 
 namespace onnxruntime {
 

diff --git a/onnxruntime/core/providers/cpu/llm/attention_helper.cc b/onnxruntime/core/providers/cpu/llm/attention_helper.cc