From 3c6d75b9d09145b22d6289d20e12ae382418e329 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Mon, 26 Jan 2026 21:24:45 +0000
Subject: [PATCH 1/8] add lpnorm 22

---
 .../providers/cpu/cpu_execution_provider.cc    | 18 ++++++++++++------
 onnxruntime/core/providers/cpu/nn/lp_norm.cc   | 12 ++++++++++--
 .../onnx_backend_test_series_filters.jsonc     |  3 ---
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index db96089f7d053..c3e141e660071 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -194,8 +194,8 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDoma
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 10, ConvTranspose);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 8, Flatten);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 21, InstanceNormalization);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, LpNormalization);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, LpNormalization);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 21, float, LpNormalization);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 21, double, LpNormalization);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 12, LRN);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, 9, AveragePool);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 7, MaxPool);
@@ -1325,6 +1325,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, Softsign);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, ThresholdedRelu);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, AveragePool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, float, LpNormalization);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, double, LpNormalization);
 
 #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, MLFloat16, Conv);
@@ -1724,10 +1726,10 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 8, Flatten)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 21,
                                                                       InstanceNormalization)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float,
-                                                                  LpNormalization)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double,
-                                                                  LpNormalization)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 21, float,
+                                                                            LpNormalization)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 21, double,
+                                                                            LpNormalization)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 12, LRN)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, 9,
                                                                       AveragePool)>,
@@ -3380,6 +3382,10 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, Elu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, HardSigmoid)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, InstanceNormalization)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, float,
+                                                                  LpNormalization)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, double,
+                                                                  LpNormalization)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, LpPool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, MaxPool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 22, MaxUnpool)>,
diff --git a/onnxruntime/core/providers/cpu/nn/lp_norm.cc b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
index 2286800c9638b..03f85e8ea5705 100644
--- a/onnxruntime/core/providers/cpu/nn/lp_norm.cc
+++ b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
@@ -7,14 +7,22 @@
 #include "core/providers/common.h"
 
 namespace onnxruntime {
+#define REGISTER_LPNORMALISATION_VERSIONED_KERNEL(type, sinceVersion, endVersion)  \
+  ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL(                                        \
+      LpNormalization, sinceVersion, endVersion, type,                             \
+      KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
+      LpNorm<type>);
+
 #define REGISTER_LPNORMALISATION_KERNEL(type, sinceVersion)                        \
   ONNX_CPU_OPERATOR_TYPED_KERNEL(                                                  \
       LpNormalization, sinceVersion, type,                                         \
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
       LpNorm<type>);
 
-REGISTER_LPNORMALISATION_KERNEL(float, 1)
-REGISTER_LPNORMALISATION_KERNEL(double, 1)
+REGISTER_LPNORMALISATION_VERSIONED_KERNEL(float, 1, 21)
+REGISTER_LPNORMALISATION_VERSIONED_KERNEL(double, 1, 21)
+REGISTER_LPNORMALISATION_KERNEL(float, 22)
+REGISTER_LPNORMALISATION_KERNEL(double, 22)
 
 using InnerStride = Eigen::InnerStride<Eigen::Dynamic>;
 
diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
index 31078fc9e1e22..31bccd96491cc 100644
--- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
+++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
@@ -39,7 +39,6 @@
         "^test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal*",  // location of infinities
         "^test_attention_4d_attn_mask_3d_causal_expanded*", // webgpu
         "^test_attention_4d_diff_heads_mask4d_padded_kv*", // Need nonpad_kv_seqlen
-        "^test_l2normalization*",  // LpNormalization(22) not implemented
         // TODO: support the following tests in Attention-cuda
         "^test_attention_3d_gqa.*_cuda",  // GQA not supported in Attention-cuda
         "^test_attention_4d_gqa.*_cuda",  // GQA not supported in Attention-cuda
@@ -60,8 +59,6 @@
         "^test_attention_4d_with_qk_matmul_softmax_cuda", // QK matmul + softmax not supported in Attention-cuda
         "^test_attention_3d_with_past_and_present_qk_matmul_softmax_cuda", // QK matmul + softmax not supported in Attention-cuda
         "^test_attention_4d_with_past_and_present_qk_matmul_bias_cuda", // QK matmul + bias not supported in Attention-cuda
-        "^test_l1normalization*",  // LpNormalization(22) not implemented
-        "^test_lpnormalization*",  // LpNormalization(22) not implemented
         "^test_tensorscatter*",  // TensorScatter(24) not implemented
         "^test_castlike_no_saturate_FLOAT_to_FLOAT8*",  // ORT does not support ml_dtypes
         "^test_castlike_UINT4_to*",  // ORT does not support ml_dtypes

From d5d7f9e97c3c770d6d457a8e203583990ce8324e Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Tue, 27 Jan 2026 00:17:54 +0000
Subject: [PATCH 2/8] respect op spec in ONNX

---
 onnxruntime/core/providers/cpu/nn/lp_norm.cc | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/lp_norm.cc b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
index 03f85e8ea5705..93bcd73f5916d 100644
--- a/onnxruntime/core/providers/cpu/nn/lp_norm.cc
+++ b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
@@ -45,12 +45,7 @@ void DoNormalizeP2(
     StridedVec<T> yVec(yData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
 
     auto norm = xVec.template lpNorm<2>();
-    if (norm != 0) {
-      yVec = xVec / norm;
-    } else {
-      // norm is zero, so set the result to zero
-      yVec.setZero();
-    }
+    yVec = xVec / norm;
   }
 };
 
@@ -67,12 +62,7 @@ void DoNormalizeP1(
     StridedVec<T> yVec(yData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
 
     auto norm = xVec.template lpNorm<1>();
-    if (norm != 0) {
-      yVec = xVec / norm;
-    } else {
-      // norm is zero - set the result to zero
-      yVec.setZero();
-    }
+    yVec = xVec / norm;
   }
 };
 

From 891ffa8909d6981902bbd22d635b6f4876a1b0de Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Tue, 27 Jan 2026 00:42:41 +0000
Subject: [PATCH 3/8] update the tests

---
 onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc
index b7cead66bd7fb..2a773c20d7c6f 100644
--- a/onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include <limits>
+
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
 using namespace std;
@@ -136,12 +138,13 @@ void L1NormalizationWithZeroNorm() {
   test.AddAttribute("p", static_cast<int64_t>(1));
 
   // With default axis (axis = -1), one of the norms will be evaluated to zero
-  // for the following input
+  // for the following input. Per ONNX spec, 0/0 = NaN.
   vector<T> input = {2.f, 2.f, 0.f, 0.f};
   vector<int64_t> input_dims = {2, 2};
   test.AddInput<T>("input", input_dims, input);
 
-  vector<T> expected_output = {0.5f, 0.5f, 0.f, 0.f};
+  T nan_val = std::numeric_limits<T>::quiet_NaN();
+  vector<T> expected_output = {0.5f, 0.5f, nan_val, nan_val};
   test.AddOutput<T>("Y", input_dims, expected_output);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
 }
@@ -156,12 +159,13 @@ void L2NormalizationWithZeroNorm() {
   OpTester test("LpNormalization");
 
   // With default axis (axis = -1), one of the norms will be evaluated to zero
-  // for the following input
+  // for the following input. Per ONNX spec, 0/0 = NaN.
   vector<T> input = {1.f, 0.f, 0.f, 0.f};
   vector<int64_t> input_dims = {2, 2};
   test.AddInput<T>("input", input_dims, input);
 
-  vector<T> expected_output = {1.f, 0.f, 0.f, 0.f};
+  T nan_val = std::numeric_limits<T>::quiet_NaN();
+  vector<T> expected_output = {1.f, 0.f, nan_val, nan_val};
   test.AddOutput<T>("Y", input_dims, expected_output);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
 }

From a4d7b897517327612aacc28d52ee6733ee3e2826 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Tue, 27 Jan 2026 21:12:54 +0000
Subject: [PATCH 4/8] revert implemetation changes

---
 onnxruntime/core/providers/cpu/nn/lp_norm.cc  | 26 ++++++++++---------
 .../test/providers/cpu/nn/lp_norm_op_test.cc  | 12 +++------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/lp_norm.cc b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
index 93bcd73f5916d..2286800c9638b 100644
--- a/onnxruntime/core/providers/cpu/nn/lp_norm.cc
+++ b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
@@ -7,22 +7,14 @@
 #include "core/providers/common.h"
 
 namespace onnxruntime {
-#define REGISTER_LPNORMALISATION_VERSIONED_KERNEL(type, sinceVersion, endVersion)  \
-  ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL(                                        \
-      LpNormalization, sinceVersion, endVersion, type,                             \
-      KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
-      LpNorm<type>);
-
 #define REGISTER_LPNORMALISATION_KERNEL(type, sinceVersion)                        \
   ONNX_CPU_OPERATOR_TYPED_KERNEL(                                                  \
       LpNormalization, sinceVersion, type,                                         \
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
       LpNorm<type>);
 
-REGISTER_LPNORMALISATION_VERSIONED_KERNEL(float, 1, 21)
-REGISTER_LPNORMALISATION_VERSIONED_KERNEL(double, 1, 21)
-REGISTER_LPNORMALISATION_KERNEL(float, 22)
-REGISTER_LPNORMALISATION_KERNEL(double, 22)
+REGISTER_LPNORMALISATION_KERNEL(float, 1)
+REGISTER_LPNORMALISATION_KERNEL(double, 1)
 
 using InnerStride = Eigen::InnerStride<Eigen::Dynamic>;
 
@@ -45,7 +37,12 @@ void DoNormalizeP2(
     StridedVec<T> yVec(yData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
 
     auto norm = xVec.template lpNorm<2>();
-    yVec = xVec / norm;
+    if (norm != 0) {
+      yVec = xVec / norm;
+    } else {
+      // norm is zero, so set the result to zero
+      yVec.setZero();
+    }
   }
 };
 
@@ -62,7 +59,12 @@ void DoNormalizeP1(
     StridedVec<T> yVec(yData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
 
     auto norm = xVec.template lpNorm<1>();
-    yVec = xVec / norm;
+    if (norm != 0) {
+      yVec = xVec / norm;
+    } else {
+      // norm is zero - set the result to zero
+      yVec.setZero();
+    }
   }
 };
 
diff --git a/onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc
index 2a773c20d7c6f..b7cead66bd7fb 100644
--- a/onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/lp_norm_op_test.cc
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <limits>
-
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
 using namespace std;
@@ -138,13 +136,12 @@ void L1NormalizationWithZeroNorm() {
   test.AddAttribute("p", static_cast<int64_t>(1));
 
   // With default axis (axis = -1), one of the norms will be evaluated to zero
-  // for the following input. Per ONNX spec, 0/0 = NaN.
+  // for the following input
   vector<T> input = {2.f, 2.f, 0.f, 0.f};
   vector<int64_t> input_dims = {2, 2};
   test.AddInput<T>("input", input_dims, input);
 
-  T nan_val = std::numeric_limits<T>::quiet_NaN();
-  vector<T> expected_output = {0.5f, 0.5f, nan_val, nan_val};
+  vector<T> expected_output = {0.5f, 0.5f, 0.f, 0.f};
   test.AddOutput<T>("Y", input_dims, expected_output);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
 }
@@ -159,13 +156,12 @@ void L2NormalizationWithZeroNorm() {
   OpTester test("LpNormalization");
 
   // With default axis (axis = -1), one of the norms will be evaluated to zero
-  // for the following input. Per ONNX spec, 0/0 = NaN.
+  // for the following input
   vector<T> input = {1.f, 0.f, 0.f, 0.f};
   vector<int64_t> input_dims = {2, 2};
   test.AddInput<T>("input", input_dims, input);
 
-  T nan_val = std::numeric_limits<T>::quiet_NaN();
-  vector<T> expected_output = {1.f, 0.f, nan_val, nan_val};
+  vector<T> expected_output = {1.f, 0.f, 0.f, 0.f};
   test.AddOutput<T>("Y", input_dims, expected_output);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
 }

From 857c046690044b3f68fb74cc4943675d60ce7ce9 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Tue, 27 Jan 2026 21:15:34 +0000
Subject: [PATCH 5/8] disable not supported l2norm test for now

---
 .../test/testdata/onnx_backend_test_series_filters.jsonc        | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
index 823ed63460cc9..cd57bc82aabf4 100644
--- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
+++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
@@ -29,6 +29,8 @@
 
     // Tests that are failing temporarily and should be fixed
     "current_failing_tests": [
+        // TODO(titaiwang): onnx 1.21 should fix lpnorm zero norm issue
+        "^test_l2normalization*",  // LpNormalization(22) not implemented
         "^test_adagrad",
         "^test_adagrad_multiple",
         "^test_attention_4d_fp16*",  // precision issue: 1 / 192 mismatched elements

From 3184cdc1caefb2841f379033c1a30dd6b92a4657 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Tue, 27 Jan 2026 21:48:09 +0000
Subject: [PATCH 6/8] typo

---
 onnxruntime/core/providers/cpu/nn/lp_norm.cc | 88 +++++++++++---------
 1 file changed, 47 insertions(+), 41 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/lp_norm.cc b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
index 2286800c9638b..190ef9ede34ff 100644
--- a/onnxruntime/core/providers/cpu/nn/lp_norm.cc
+++ b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
@@ -7,14 +7,22 @@
 #include "core/providers/common.h"
 
 namespace onnxruntime {
+#define REGISTER_LPNORMALISATION_VERSIONED_KERNEL(type, sinceVersion, endVersion)  \
+  ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL(                                        \
+      LpNormalization, sinceVersion, endVersion, type,                             \
+      KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
+      LpNorm<type>);
+
 #define REGISTER_LPNORMALISATION_KERNEL(type, sinceVersion)                        \
   ONNX_CPU_OPERATOR_TYPED_KERNEL(                                                  \
       LpNormalization, sinceVersion, type,                                         \
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
       LpNorm<type>);
 
-REGISTER_LPNORMALISATION_KERNEL(float, 1)
-REGISTER_LPNORMALISATION_KERNEL(double, 1)
+REGISTER_LPNORMALISATION_VERSIONED_KERNEL(float, 1, 21)
+REGISTER_LPNORMALISATION_VERSIONED_KERNEL(double, 1, 21)
+REGISTER_LPNORMALISATION_KERNEL(float, 22)
+REGISTER_LPNORMALISATION_KERNEL(double, 22)
 
 using InnerStride = Eigen::InnerStride<Eigen::Dynamic>;
 
@@ -43,48 +51,46 @@ void DoNormalizeP2(
       // norm is zero, so set the result to zero
       yVec.setZero();
     }
-  }
-};
+  };
 
-template <typename T>
-void DoNormalizeP1(
-    const T* xData,
-    T* yData,
-    const int64_t m,
-    const int64_t n,
-    const int64_t sf) {
-  for (int i = 0; i < n; ++i) {
-    auto base = (i / sf) * sf * m + (i % sf);
-    ConstStridedVec<T> xVec(xData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
-    StridedVec<T> yVec(yData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
+  template <typename T>
+  void DoNormalizeP1(
+      const T* xData,
+      T* yData,
+      const int64_t m,
+      const int64_t n,
+      const int64_t sf) {
+    for (int i = 0; i < n; ++i) {
+      auto base = (i / sf) * sf * m + (i % sf);
+      ConstStridedVec<T> xVec(xData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
+      StridedVec<T> yVec(yData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
 
-    auto norm = xVec.template lpNorm<1>();
-    if (norm != 0) {
-      yVec = xVec / norm;
-    } else {
-      // norm is zero - set the result to zero
-      yVec.setZero();
-    }
-  }
-};
+      auto norm = xVec.template lpNorm<1>();
+      if (norm != 0) {
+        yVec = xVec / norm;
+      } else {
+        // norm is zero, so set the result to zero
+        yVec.setZero();
+      }
+    };
 
-template <typename T>
-Status LpNorm<T>::Compute(OpKernelContext* p_op_kernel_context) const {
-  const auto* input = p_op_kernel_context->Input<Tensor>(0);
-  const TensorShape& input_shape = input->Shape();
-  Tensor* output = p_op_kernel_context->Output(0, input_shape);
+    template <typename T>
+    Status LpNorm<T>::Compute(OpKernelContext * p_op_kernel_context) const {
+      const auto* input = p_op_kernel_context->Input<Tensor>(0);
+      const TensorShape& input_shape = input->Shape();
+      Tensor* output = p_op_kernel_context->Output(0, input_shape);
 
-  const auto canonical_axis = HandleNegativeAxis(axis_, static_cast<int64_t>(input_shape.NumDimensions()));
-  const int64_t m = input_shape.GetDims()[onnxruntime::narrow<size_t>(canonical_axis)];
-  const int64_t n = input_shape.Size() / m;
-  const int64_t sf = input_shape.SizeFromDimension(SafeInt<size_t>(canonical_axis) + 1);
+      const auto canonical_axis = HandleNegativeAxis(axis_, static_cast<int64_t>(input_shape.NumDimensions()));
+      const int64_t m = input_shape.GetDims()[onnxruntime::narrow<size_t>(canonical_axis)];
+      const int64_t n = input_shape.Size() / m;
+      const int64_t sf = input_shape.SizeFromDimension(SafeInt<size_t>(canonical_axis) + 1);
 
-  if (p_ == 1) {
-    DoNormalizeP1(input->Data<T>(), output->MutableData<T>(), m, n, sf);
-  } else if (p_ == 2) {
-    DoNormalizeP2(input->Data<T>(), output->MutableData<T>(), m, n, sf);
-  }
+      if (p_ == 1) {
+        DoNormalizeP1(input->Data<T>(), output->MutableData<T>(), m, n, sf);
+      } else if (p_ == 2) {
+        DoNormalizeP2(input->Data<T>(), output->MutableData<T>(), m, n, sf);
+      }
 
-  return Status::OK();
-}
-}  // namespace onnxruntime
+      return Status::OK();
+    }
+  }  // namespace onnxruntime

From a470e5a68b6320f1cb5f4cd3139c736a993a3603 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Tue, 27 Jan 2026 21:52:06 +0000
Subject: [PATCH 7/8] formatting

---
 onnxruntime/core/providers/cpu/nn/lp_norm.cc | 76 ++++++++++----------
 1 file changed, 39 insertions(+), 37 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/lp_norm.cc b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
index 190ef9ede34ff..03f85e8ea5705 100644
--- a/onnxruntime/core/providers/cpu/nn/lp_norm.cc
+++ b/onnxruntime/core/providers/cpu/nn/lp_norm.cc
@@ -51,46 +51,48 @@ void DoNormalizeP2(
       // norm is zero, so set the result to zero
       yVec.setZero();
     }
-  };
+  }
+};
 
-  template <typename T>
-  void DoNormalizeP1(
-      const T* xData,
-      T* yData,
-      const int64_t m,
-      const int64_t n,
-      const int64_t sf) {
-    for (int i = 0; i < n; ++i) {
-      auto base = (i / sf) * sf * m + (i % sf);
-      ConstStridedVec<T> xVec(xData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
-      StridedVec<T> yVec(yData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
+template <typename T>
+void DoNormalizeP1(
+    const T* xData,
+    T* yData,
+    const int64_t m,
+    const int64_t n,
+    const int64_t sf) {
+  for (int i = 0; i < n; ++i) {
+    auto base = (i / sf) * sf * m + (i % sf);
+    ConstStridedVec<T> xVec(xData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
+    StridedVec<T> yVec(yData + base, 1, onnxruntime::narrow<size_t>(m), InnerStride(onnxruntime::narrow<size_t>(sf)));
 
-      auto norm = xVec.template lpNorm<1>();
-      if (norm != 0) {
-        yVec = xVec / norm;
-      } else {
-        // norm is zero, so set the result to zero
-        yVec.setZero();
-      }
-    };
+    auto norm = xVec.template lpNorm<1>();
+    if (norm != 0) {
+      yVec = xVec / norm;
+    } else {
+      // norm is zero - set the result to zero
+      yVec.setZero();
+    }
+  }
+};
 
-    template <typename T>
-    Status LpNorm<T>::Compute(OpKernelContext * p_op_kernel_context) const {
-      const auto* input = p_op_kernel_context->Input<Tensor>(0);
-      const TensorShape& input_shape = input->Shape();
-      Tensor* output = p_op_kernel_context->Output(0, input_shape);
+template <typename T>
+Status LpNorm<T>::Compute(OpKernelContext* p_op_kernel_context) const {
+  const auto* input = p_op_kernel_context->Input<Tensor>(0);
+  const TensorShape& input_shape = input->Shape();
+  Tensor* output = p_op_kernel_context->Output(0, input_shape);
 
-      const auto canonical_axis = HandleNegativeAxis(axis_, static_cast<int64_t>(input_shape.NumDimensions()));
-      const int64_t m = input_shape.GetDims()[onnxruntime::narrow<size_t>(canonical_axis)];
-      const int64_t n = input_shape.Size() / m;
-      const int64_t sf = input_shape.SizeFromDimension(SafeInt<size_t>(canonical_axis) + 1);
+  const auto canonical_axis = HandleNegativeAxis(axis_, static_cast<int64_t>(input_shape.NumDimensions()));
+  const int64_t m = input_shape.GetDims()[onnxruntime::narrow<size_t>(canonical_axis)];
+  const int64_t n = input_shape.Size() / m;
+  const int64_t sf = input_shape.SizeFromDimension(SafeInt<size_t>(canonical_axis) + 1);
 
-      if (p_ == 1) {
-        DoNormalizeP1(input->Data<T>(), output->MutableData<T>(), m, n, sf);
-      } else if (p_ == 2) {
-        DoNormalizeP2(input->Data<T>(), output->MutableData<T>(), m, n, sf);
-      }
+  if (p_ == 1) {
+    DoNormalizeP1(input->Data<T>(), output->MutableData<T>(), m, n, sf);
+  } else if (p_ == 2) {
+    DoNormalizeP2(input->Data<T>(), output->MutableData<T>(), m, n, sf);
+  }
 
-      return Status::OK();
-    }
-  }  // namespace onnxruntime
+  return Status::OK();
+}
+}  // namespace onnxruntime

From 55bcaa9c17f1e18a3cd053bb38a20895f7bd44ef Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Tue, 27 Jan 2026 23:24:35 +0000
Subject: [PATCH 8/8] update the op doc

---
 docs/OperatorKernels.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 7cc57a636362f..08840c623b709 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -240,7 +240,8 @@ Do not modify directly.*
 |||[13, 15]|**B** = tensor(bool)<br/> **I** = tensor(int64)<br/> **V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||[11, 12]|**B** = tensor(bool)<br/> **I** = tensor(int64)<br/> **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||[1, 10]|**B** = tensor(bool)<br/> **I** = tensor(int64)<br/> **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|LpNormalization|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float)|
+|LpNormalization|*in* input:**T**<br> *out* output:**T**|22+|**T** = tensor(double), tensor(float)|
+|||[1, 21]|**T** = tensor(double), tensor(float)|
 |LpPool|*in* X:**T**<br> *out* Y:**T**|22+|**T** = tensor(float)|
 |||[18, 21]|**T** = tensor(float)|
 |||[11, 17]|**T** = tensor(float)|