From 7f32da235790990d3a85da07ac7eb198db1c5730 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 9 Mar 2026 21:34:17 +0000 Subject: [PATCH 01/18] Integrate ONNX 1.21.0: submodule, deps, patches, and build fixes Update ONNX submodule to rel-1.21.0 branch (commit fbbe45b8e2). Update cmake/deps.txt with new URL and SHA1. Update vcpkg port (portfile.cmake, vcpkg.json) for 1.21.0. Regenerate onnx.patch and binskim.patch for 1.21.0 CMakeLists.txt changes. Update all 7 requirements.txt files to onnx==1.21.0. Bump kMaxSupportedOpset from 25 to 26 in optimizer_api.h. Fix ONNX_UNUSED macro removal (replaced with [[maybe_unused]]) in contrib_defs.h, dml_defs.h, and test_opaque_api.cc. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/deps.txt | 2 +- cmake/external/onnx | 2 +- cmake/patches/onnx/onnx.patch | 16 ++++++++-------- cmake/vcpkg-ports/onnx/binskim.patch | 16 ++++++++-------- cmake/vcpkg-ports/onnx/portfile.cmake | 4 ++-- cmake/vcpkg-ports/onnx/vcpkg.json | 4 ++-- .../core/graph/contrib_ops/contrib_defs.h | 14 ++++++-------- onnxruntime/core/graph/dml_ops/dml_defs.h | 14 ++++++-------- .../transpose_optimization/optimizer_api.h | 2 +- onnxruntime/test/opaque_api/test_opaque_api.cc | 7 +++---- onnxruntime/test/python/requirements.txt | 2 +- .../aarch64/python/cpu/scripts/requirements.txt | 2 +- .../linux/docker/scripts/lort/requirements.txt | 2 +- .../docker/scripts/manylinux/requirements.txt | 2 +- .../github/linux/docker/scripts/requirements.txt | 2 +- .../github/linux/python/requirements.txt | 2 +- .../github/windows/python/requirements.txt | 2 +- 17 files changed, 45 insertions(+), 50 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index a5eaf2ed69efb..2d7196646434f 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.20.1.zip;30b80c81a1a381188896e86abe460c3c3f3091fd +onnx;https://github.com/onnx/onnx/archive/fbbe45b8e25b5b0018cc038caaf906d3b09634ee.zip;c38208d94ec0dd799a8468ac72f6058f74d44830 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnx b/cmake/external/onnx index d3f6b795aedb4..fbbe45b8e25b5 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit d3f6b795aedb48eaecc881bf5e8f5dd6efbe25b3 +Subproject commit fbbe45b8e25b5b0018cc038caaf906d3b09634ee diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index 76e94a71364bf..0a5680778790b 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -1,16 +1,16 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 584c0419a..5d4ffff99 100644 +index 044996e..ded7e39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -52,6 +52,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) +@@ -53,6 +53,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) option(ONNX_DISABLE_EXCEPTIONS "Disable exception handling." OFF) option(ONNX_DISABLE_STATIC_REGISTRATION "Disable static registration for ONNX operator schemas." 
OFF) option(ONNX_USE_UNITY_BUILD "Enable Unity (Jumbo) build for" OFF) +option(ONNX_MINIMAL_BUILD "Build only essential ONNX components" OFF) + option(ONNX_INSTALL "Install ONNX targets, headers, and CMake config files" ON) if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) - endif() -@@ -397,14 +398,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -399,14 +400,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS onnx/onnx-operators.in.proto onnx/onnx-data.in.proto) @@ -48,10 +48,10 @@ index 584c0419a..5d4ffff99 100644 set(LINKED_PROTOBUF_TARGET protobuf::libprotobuf) if(ONNX_USE_LITE_PROTO) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake -index 07f2b9071..388d9f7a3 100644 +index 1987edd..04b3088 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake -@@ -31,18 +31,7 @@ endfunction() +@@ -103,18 +103,7 @@ endfunction() function(add_onnx_compile_options target) if(MSVC) @@ -71,10 +71,10 @@ index 07f2b9071..388d9f7a3 100644 target_compile_options(${target} PRIVATE "/WX") endif() diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 887151217..ac2e8c463 100644 +index a6a8a83..153da87 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4152,7 +4152,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4026,7 +4026,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() diff --git a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index 76e94a71364bf..0a5680778790b 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -1,16 +1,16 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 584c0419a..5d4ffff99 100644 +index 044996e..ded7e39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -52,6 +52,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) +@@ -53,6 +53,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) option(ONNX_DISABLE_EXCEPTIONS "Disable exception handling." 
OFF) option(ONNX_DISABLE_STATIC_REGISTRATION "Disable static registration for ONNX operator schemas." OFF) option(ONNX_USE_UNITY_BUILD "Enable Unity (Jumbo) build for" OFF) +option(ONNX_MINIMAL_BUILD "Build only essential ONNX components" OFF) + option(ONNX_INSTALL "Install ONNX targets, headers, and CMake config files" ON) if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) - endif() -@@ -397,14 +398,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -399,14 +400,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS onnx/onnx-operators.in.proto onnx/onnx-data.in.proto) @@ -48,10 +48,10 @@ index 584c0419a..5d4ffff99 100644 set(LINKED_PROTOBUF_TARGET protobuf::libprotobuf) if(ONNX_USE_LITE_PROTO) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake -index 07f2b9071..388d9f7a3 100644 +index 1987edd..04b3088 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake -@@ -31,18 +31,7 @@ endfunction() +@@ -103,18 +103,7 @@ endfunction() function(add_onnx_compile_options target) if(MSVC) @@ -71,10 +71,10 @@ index 07f2b9071..388d9f7a3 100644 target_compile_options(${target} PRIVATE "/WX") endif() diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 887151217..ac2e8c463 100644 +index a6a8a83..153da87 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4152,7 +4152,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4026,7 +4026,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index c526529466e97..ce95fa0e4535e 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -3,8 +3,8 @@ vcpkg_check_linkage(ONLY_STATIC_LIBRARY) vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx - REF "v${VERSION}" - SHA512 4bbc4c09e4bb3eb6049d653ce49200564e8c5dcf1154a30f894f24e15f1986d1f2fe2f4ca32fe383c559e2a0b20681f33d649376bf63e4345df6972a2c78eac8 + REF fbbe45b8e25b5b0018cc038caaf906d3b09634ee + SHA512 
971a71b6d0fdb96270f82851c6a5940cc1c34d224247b678033ba179ffd8cc7bfecf59b235d013a0b94d089bd7d6fe46d01b2d6f5056bdb9fdff98fba0cc4e27 PATCHES fix-cmakelists.patch fix-dependency-protobuf.patch diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json index 5800b031aa143..5ad70ff409e05 100644 --- a/cmake/vcpkg-ports/onnx/vcpkg.json +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -1,7 +1,7 @@ { "name": "onnx", - "version-semver": "1.20.1", - "port-version": 1, + "version-semver": "1.21.0", + "port-version": 0, "description": "Open standard for machine learning interoperability", "homepage": "https://onnx.ai", "license": "Apache-2.0", diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.h b/onnxruntime/core/graph/contrib_ops/contrib_defs.h index 5b3904669f9fc..f88257b2baf08 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.h +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.h @@ -35,19 +35,17 @@ inline bool HasRawData(const ONNX_NAMESPACE::TensorProto& ten_proto) { ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, name) #define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER(Counter, name) \ ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ(Counter, name) -#define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ(Counter, name) \ - static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce( \ - op_schema_register_once##name##Counter) ONNX_UNUSED = \ - ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) +#define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ(Counter, name) \ + static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ + [[maybe_unused]] = ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) #define ONNX_CONTRIB_OPERATOR_SCHEMA_ELSEWHERE(name, schema_func) \ ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER_ELSEWHERE(__COUNTER__, name, schema_func) #define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER_ELSEWHERE(Counter, name, schema_func) \ ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) -#define 
ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) \ - static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce( \ - op_schema_register_once##name##Counter) ONNX_UNUSED = \ - schema_func(ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__)) +#define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) \ + static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ + [[maybe_unused]] = schema_func(ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__)) void RegisterContribSchemas(); void RegisterNchwcSchemas(); diff --git a/onnxruntime/core/graph/dml_ops/dml_defs.h b/onnxruntime/core/graph/dml_ops/dml_defs.h index 5479005382ec9..9551f72adfe17 100644 --- a/onnxruntime/core/graph/dml_ops/dml_defs.h +++ b/onnxruntime/core/graph/dml_ops/dml_defs.h @@ -11,19 +11,17 @@ namespace dml { MS_DML_OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, name) #define MS_DML_OPERATOR_SCHEMA_UNIQ_HELPER(Counter, name) \ MS_DML_OPERATOR_SCHEMA_UNIQ(Counter, name) -#define MS_DML_OPERATOR_SCHEMA_UNIQ(Counter, name) \ - static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce( \ - op_schema_register_once##name##Counter) ONNX_UNUSED = \ - ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) +#define MS_DML_OPERATOR_SCHEMA_UNIQ(Counter, name) \ + static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ + [[maybe_unused]] = ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) #define MS_DML_OPERATOR_SCHEMA_ELSEWHERE(name, schema_func) \ MS_DML_OPERATOR_SCHEMA_UNIQ_HELPER_ELSEWHERE(__COUNTER__, name, schema_func) #define MS_DML_OPERATOR_SCHEMA_UNIQ_HELPER_ELSEWHERE(Counter, name, schema_func) \ MS_DML_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) -#define MS_DML_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) \ - static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce( \ - op_schema_register_once##name##Counter) ONNX_UNUSED = \ - 
schema_func(ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__)) +#define MS_DML_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) \ + static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ + [[maybe_unused]] = schema_func(ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__)) void RegisterDmlSchemas(); } // namespace dml diff --git a/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h b/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h index 6ff4da05fbf57..012777897c3a1 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h +++ b/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h @@ -466,7 +466,7 @@ class GraphRef { } // namespace api constexpr int64_t kMinSupportedOpset = 7; -constexpr int64_t kMaxSupportedOpset = 25; +constexpr int64_t kMaxSupportedOpset = 26; // enum of results that a CostCheckFn can return. enum class CostCheckResult { diff --git a/onnxruntime/test/opaque_api/test_opaque_api.cc b/onnxruntime/test/opaque_api/test_opaque_api.cc index da3ad08ae1ce2..e39e76a912aec 100644 --- a/onnxruntime/test/opaque_api/test_opaque_api.cc +++ b/onnxruntime/test/opaque_api/test_opaque_api.cc @@ -118,10 +118,9 @@ ONNX_OPERATOR_KERNEL_EX( ONNX_TEST_OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, name) #define ONNX_TEST_OPERATOR_SCHEMA_UNIQ_HELPER(Counter, name) \ ONNX_TEST_OPERATOR_SCHEMA_UNIQ(Counter, name) -#define ONNX_TEST_OPERATOR_SCHEMA_UNIQ(Counter, name) \ - static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce( \ - op_schema_register_once##name##Counter) ONNX_UNUSED = \ - ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) +#define ONNX_TEST_OPERATOR_SCHEMA_UNIQ(Counter, name) \ + static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ + [[maybe_unused]] = ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) static void RegisterCustomKernel() { // Register our custom type diff --git 
a/onnxruntime/test/python/requirements.txt b/onnxruntime/test/python/requirements.txt index 8ddba809b9228..3ece2f39d4042 100644 --- a/onnxruntime/test/python/requirements.txt +++ b/onnxruntime/test/python/requirements.txt @@ -1,3 +1,3 @@ -onnx==1.20.1 +onnx==1.21.0 pytest onnx-ir diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index 7e2b6e74cfdde..b4c2f163e22ac 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -7,4 +7,4 @@ wheel protobuf==4.25.8 sympy==1.14 flatbuffers -onnx==1.20.1 +onnx==1.21.0 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index 63a8e96d8c128..eb52681341012 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.20.1 +onnx==1.21.0 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index ffcad5ee67208..9a0a6d0f51900 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -9,4 +9,4 @@ sympy==1.14 flatbuffers neural-compressor>=2.2.1 triton==3.5.0 -onnx==1.20.1 +onnx==1.21.0 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index ad57cc715589b..3d886832e1ccb 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ 
b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -12,4 +12,4 @@ protobuf==6.33.0 packaging onnxscript==0.6.2 onnx-ir==0.1.16 -onnx==1.20.1 +onnx==1.21.0 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 994776e8fb6fd..bfe9ab0d8a508 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -12,4 +12,4 @@ onnxscript==0.6.2 onnx-ir==0.1.16 jinja2 markupsafe -onnx==1.20.1 +onnx==1.21.0 diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 83593ff47e453..2dfba37c6f381 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -14,4 +14,4 @@ jinja2 markupsafe semver packaging -onnx==1.20.1 +onnx==1.21.0 From 5023110b8b65e098dd4cbe75bd8a146f254e8983 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 9 Mar 2026 21:34:41 +0000 Subject: [PATCH 02/18] Add BitCast and CumProd CPU kernels for ONNX opset 26 BitCast (opset 26): Zero-copy tensor type reinterpretation for types with matching bit-widths. Supports all standard numeric types. Registered in cpu_execution_provider.cc with 17 passing tests. CumProd (opset 26): Cumulative product along a given axis with optional exclusive and reverse attributes. Supports float, double, int32, int64, uint32, uint64. Identity element is 1 (vs 0 for CumSum). Registered in cpu_execution_provider.cc with 33 passing tests. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../providers/cpu/cpu_execution_provider.cc | 18 + .../core/providers/cpu/math/cumprod.cc | 222 ++++++++++ onnxruntime/core/providers/cpu/math/cumprod.h | 28 ++ .../core/providers/cpu/tensor/bitcast_op.cc | 78 ++++ .../core/providers/cpu/tensor/bitcast_op.h | 21 + .../test/providers/cpu/math/cumprod_test.cc | 409 ++++++++++++++++++ .../providers/cpu/tensor/bitcast_op_test.cc | 183 ++++++++ 7 files changed, 959 insertions(+) create mode 100644 onnxruntime/core/providers/cpu/math/cumprod.cc create mode 100644 onnxruntime/core/providers/cpu/math/cumprod.h create mode 100644 onnxruntime/core/providers/cpu/tensor/bitcast_op.cc create mode 100644 onnxruntime/core/providers/cpu/tensor/bitcast_op.h create mode 100644 onnxruntime/test/providers/cpu/math/cumprod_test.cc create mode 100644 onnxruntime/test/providers/cpu/tensor/bitcast_op_test.cc diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 74b8f8e468097..f7b4918546067 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -1497,6 +1497,15 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 25, Un class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 25, Scan); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 25, Size); +// Opset 26 +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 26, BitCast); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 26, float, CumProd); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 26, double, CumProd); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 26, int32_t, CumProd); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 26, int64_t, 
CumProd); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 26, uint32_t, CumProd); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 26, uint64_t, CumProd); + // !!PLEASE READ BELOW!! Following that, add new entries above this comment /* *** IMPORTANT! *** @@ -3661,6 +3670,15 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + // opset 26 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { KernelCreateInfo info = function_table_entry(); diff --git a/onnxruntime/core/providers/cpu/math/cumprod.cc b/onnxruntime/core/providers/cpu/math/cumprod.cc new file mode 100644 index 0000000000000..6706c2d3ea8d0 --- /dev/null +++ b/onnxruntime/core/providers/cpu/math/cumprod.cc @@ -0,0 +1,222 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include + +#include "cumprod.h" +#include "core/providers/common.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/framework/op_kernel.h" +#include "core/framework/tensorprotoutils.h" + +using namespace onnxruntime; + +namespace onnxruntime { + +namespace cumprod_op { +Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out) { + if (!axis_tensor) + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor must be provided to the CumProd op"); + + if (axis_tensor->Shape().NumDimensions() > 1) + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be 0D or 1D"); + + if (axis_tensor->IsDataType()) { + axis_out = static_cast(axis_tensor->Data()[0]); + } else if (axis_tensor->IsDataType()) { + axis_out = axis_tensor->Data()[0]; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be of type `int32_t` or `int64_t`"); + } + + axis_out = HandleNegativeAxis(axis_out, input_rank); + + return Status::OK(); +} + +} // namespace cumprod_op + +// Opset 26 kernels +ONNX_CPU_OPERATOR_TYPED_KERNEL( + CumProd, + 26, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + CumProd); + +ONNX_CPU_OPERATOR_TYPED_KERNEL( + CumProd, + 26, + double, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + CumProd); + +ONNX_CPU_OPERATOR_TYPED_KERNEL( + CumProd, + 26, + int32_t, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + CumProd); + +ONNX_CPU_OPERATOR_TYPED_KERNEL( + CumProd, + 26, + int64_t, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", 
std::vector{DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + CumProd); + +ONNX_CPU_OPERATOR_TYPED_KERNEL( + CumProd, + 26, + uint32_t, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + CumProd); + +ONNX_CPU_OPERATOR_TYPED_KERNEL( + CumProd, + 26, + uint64_t, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + CumProd); + +template +CumProd::CumProd(const OpKernelInfo& info) : OpKernel(info), exclusive_(), reverse_() { + int64_t exclusive = 0; + auto status = info.GetAttr("exclusive", &exclusive); + if (status.IsOK()) { + if (exclusive == 1 || exclusive == 0) { + exclusive_ = exclusive; + } else { + ORT_ENFORCE(false, "attribute exclusive can only be 0 or 1"); + } + } + int64_t reverse = 0; + status = info.GetAttr("reverse", &reverse); + if (status.IsOK()) { + if (reverse == 1 || reverse == 0) { + reverse_ = reverse; + } else { + ORT_ENFORCE(false, "attribute reverse can only be 0 or 1"); + } + } +} + +template +Status CumProd::Compute(OpKernelContext* ctx) const { + const Tensor* input = ctx->Input(0); + size_t rank = input->Shape().NumDimensions(); + if (rank == 0) + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot apply CumProd operator on a scalar"); + + const Tensor* axis_tensor = ctx->Input(1); + + TensorShape output_shape(input->Shape()); + auto& output_tensor = *ctx->Output(0, output_shape); + + if (output_shape.Size() == 0) + return Status::OK(); + + int64_t axis_input = 0; + ORT_THROW_IF_ERROR(cumprod_op::GetAxis(axis_tensor, rank, axis_input)); + + // We solve the problem by using the identity that (in the case of exclusive) + // 1) out[upper_dims...][0][lower_dims...] = 1 + // 2) out[upper_dims...][i][lower_dims...] = + // in[upper_dims...][i-1][lower_dims...] 
* out[upper_dims...][i-1][lower_dims...] + // We loop through the [upper_dims...] and start applying the identity in each slice. + // Since the [lower_dims...] are adjacent in memory, we can multiply them like vectors. + + const auto input_shape = input->Shape().GetDims(); + const size_t axis = onnxruntime::narrow(axis_input); + const int64_t dim = input->Shape()[axis]; // dimension size for the axis + const int64_t upper_dim_count = // number of slices we can walk through iteratively + std::accumulate(input_shape.begin(), input_shape.begin() + axis, static_cast(1), std::multiplies()); + const int64_t lower_dim_size = // sizes of the slices we can treat as 1D arrays + std::accumulate(input_shape.begin() + axis + 1, input_shape.end(), static_cast(1), std::multiplies()); + + if (!reverse_) { + const auto* input_iter = input->Data(); + auto* output_iter = output_tensor.MutableData(); + const auto* prev_output_iter = output_iter; + + if (exclusive_) { + for (int64_t outer = 0; outer < upper_dim_count; outer++) { + prev_output_iter = output_iter; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + *(output_iter++) = static_cast(1); + } + for (int64_t cum_axis = 1; cum_axis < dim; cum_axis++) { + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + *(output_iter++) = *(prev_output_iter++) * *(input_iter++); + } + } + input_iter += lower_dim_size; + } + } else { + for (int64_t outer = 0; outer < upper_dim_count; outer++) { + prev_output_iter = output_iter; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + *(output_iter++) = *(input_iter++); + } + for (int64_t cum_axis = 1; cum_axis < dim; cum_axis++) { + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + *(output_iter++) = *(prev_output_iter++) * *(input_iter++); + } + } + } + } + } else { + const auto* input_iter = input->Data() + input->Shape().Size(); + auto* output_iter = output_tensor.MutableData() + output_shape.Size(); + const auto* prev_output_iter = output_iter; + + if 
(exclusive_) { + for (int64_t outer = upper_dim_count - 1; outer >= 0; outer--) { + prev_output_iter = output_iter; + for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { + *(--output_iter) = static_cast(1); + } + for (int64_t cum_axis = dim - 1; cum_axis > 0; cum_axis--) { + for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { + *(--output_iter) = *(--prev_output_iter) * *(--input_iter); + } + } + input_iter -= lower_dim_size; + } + } else { + for (int64_t outer = upper_dim_count - 1; outer >= 0; outer--) { + prev_output_iter = output_iter; + for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { + *(--output_iter) = *(--input_iter); + } + for (int64_t cum_axis = dim - 1; cum_axis > 0; cum_axis--) { + for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { + *(--output_iter) = *(--prev_output_iter) * *(--input_iter); + } + } + } + } + } + + return Status::OK(); +} + +}; // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/math/cumprod.h b/onnxruntime/core/providers/cpu/math/cumprod.h new file mode 100644 index 0000000000000..9b8c6a83cc187 --- /dev/null +++ b/onnxruntime/core/providers/cpu/math/cumprod.h @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" + +namespace onnxruntime { + +template +class CumProd final : public OpKernel { + public: + explicit CumProd(const OpKernelInfo& op_kernel_info); + + Status Compute(OpKernelContext* p_op_kernel_context) const override; + + private: + int64_t exclusive_; + int64_t reverse_; +}; + +namespace cumprod_op { + +Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out); + +} // namespace cumprod_op +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc b/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc new file mode 100644 index 0000000000000..9d9fa3e0c462b --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "bitcast_op.h" +#include "core/framework/op_kernel.h" +#include "core/framework/tensor.h" + +#include + +namespace onnxruntime { + +ONNX_CPU_OPERATOR_KERNEL( + BitCast, + 26, + KernelDefBuilder() + .TypeConstraint("T1", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .TypeConstraint("T2", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .MayInplace(0, 0), + BitCast); + 
+BitCast::BitCast(const OpKernelInfo& info) : OpKernel(info) { + int64_t to; + Status status = info.GetAttr("to", &to); + ORT_ENFORCE(status.IsOK(), "Attribute 'to' is not set."); + to_ = gsl::narrow_cast(to); +} + +Status BitCast::Compute(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + ORT_ENFORCE(input != nullptr, "BitCast: input tensor is null."); + + const size_t input_element_size = input->DataType()->Size(); + + const auto* output_type = DataTypeImpl::TensorTypeFromONNXEnum(to_); + const size_t output_element_size = output_type->GetElementType()->Size(); + + ORT_RETURN_IF_NOT(input_element_size == output_element_size, + "BitCast requires input and output types to have the same bit-width. ", + "Input element size: ", input_element_size, " bytes, ", + "output element size: ", output_element_size, " bytes."); + + Tensor* output = context->Output(0, input->Shape()); + + const size_t num_bytes = input->SizeInBytes(); + if (num_bytes > 0) { + const void* src = input->DataRaw(); + void* dst = output->MutableDataRaw(); + if (src != dst) { + std::memcpy(dst, src, num_bytes); + } + } + + return Status::OK(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/bitcast_op.h b/onnxruntime/core/providers/cpu/tensor/bitcast_op.h new file mode 100644 index 0000000000000..dd5bdecde56c4 --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/bitcast_op.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/graph/onnx_protobuf.h" + +namespace onnxruntime { + +class BitCast final : public OpKernel { + public: + explicit BitCast(const OpKernelInfo& info); + Status Compute(OpKernelContext* context) const override; + + private: + ONNX_NAMESPACE::TensorProto_DataType to_; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/math/cumprod_test.cc b/onnxruntime/test/providers/cpu/math/cumprod_test.cc new file mode 100644 index 0000000000000..3f9596e43aa40 --- /dev/null +++ b/onnxruntime/test/providers/cpu/math/cumprod_test.cc @@ -0,0 +1,409 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" +#include "test/util/include/default_providers.h" +#include "core/util/math.h" + +namespace onnxruntime { +namespace test { + +// 1D tests - basic functionality +TEST(CumProdTest, _1DTest) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {1.f, 2.f, 6.f, 24.f, 120.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _1DTestInvalidAxis) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); + test.AddInput("axis", {}, {-3}); + test.AddOutput("y", {5}, {1.f, 2.f, 6.f, 24.f, 120.f}); + test.Run(OpTester::ExpectResult::kExpectFailure, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _1DTestNegAxis) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); + test.AddInput("axis", {}, {-1}); + test.AddOutput("y", {5}, {1.f, 2.f, 6.f, 24.f, 120.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} 
+ +// Exclusive mode: identity element is 1, shift right +// input: [1, 2, 3, 4, 5] +// output: [1, 1, 2, 6, 24] +TEST(CumProdTest, _1DTestExclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {1.f, 1.f, 2.f, 6.f, 24.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// Exclusive with axis dim=1: all elements should be identity (1) +TEST(CumProdTest, _1DTestExclusiveAxisHasSingleValue) { + { + // forward + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {1, 2}, {3.f, 4.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {1, 2}, {1.f, 1.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + } + { + // reverse + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddAttribute("reverse", 1); + test.AddInput("x", {1, 2}, {3.f, 4.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {1, 2}, {1.f, 1.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + } +} + +// 2D tests +// input: [[1, 2, 3], [4, 5, 6]], axis=0 +// output: [[1, 2, 3], [4, 10, 18]] +TEST(CumProdTest, _2DTestAxis0) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {2, 3}, {1.f, 2.f, 3.f, 4.f, 10.f, 18.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// input: [[1, 2, 3], [4, 5, 6]], axis=1 +// output: [[1, 2, 6], [4, 20, 120]] +TEST(CumProdTest, _2DTestAxis1) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); + test.AddInput("axis", {}, {1}); + 
test.AddOutput("y", {2, 3}, {1.f, 2.f, 6.f, 4.f, 20.f, 120.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// Exclusive 2D axis=0: identity row, then element-wise product with input +// input: [[1, 2, 3], [4, 5, 6]], axis=0, exclusive +// output: [[1, 1, 1], [1, 2, 3]] +TEST(CumProdTest, _2DTestExclusiveAxis0) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {2, 3}, {1.f, 1.f, 1.f, 1.f, 2.f, 3.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// Exclusive 2D axis=1 +// input: [[1, 2, 3], [4, 5, 6]], axis=1, exclusive +// output: [[1, 1, 2], [1, 4, 20]] +TEST(CumProdTest, _2DTestExclusiveAxis1) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); + test.AddInput("axis", {}, {1}); + test.AddOutput("y", {2, 3}, {1.f, 1.f, 2.f, 1.f, 4.f, 20.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// 3D tests with shape {2, 3, 4} +// Using values 1..24 +TEST(CumProdTest, _3DTestAxis0) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {0}); + // axis=0: product along first dimension + // out[0,:,:] = x[0,:,:], out[1,:,:] = x[0,:,:] * x[1,:,:] + test.AddOutput("y", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 28.f, 45.f, 64.f, 85.f, 108.f, 133.f, 160.f, 189.f, 220.f, 253.f, 288.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _3DTestAxis1) { + OpTester 
test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {1}); + // axis=1: product along second dimension + // out[:,0,:] = x[:,0,:], out[:,1,:] = x[:,0,:]*x[:,1,:], out[:,2,:] = x[:,0,:]*x[:,1,:]*x[:,2,:] + test.AddOutput("y", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 12.f, 21.f, 32.f, 45.f, 120.f, 231.f, 384.f, + 13.f, 14.f, 15.f, 16.f, 221.f, 252.f, 285.f, 320.f, 4641.f, 5544.f, 6555.f, 7680.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _3DTestAxis2) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {2}); + // axis=2: product along last dimension + // out[:,:,0] = x[:,:,0], out[:,:,1] = x[:,:,0]*x[:,:,1], etc. 
+ test.AddOutput("y", {2, 3, 4}, + {1.f, 2.f, 6.f, 24.f, 5.f, 30.f, 210.f, 1680.f, 9.f, 90.f, 990.f, 11880.f, + 13.f, 182.f, 2730.f, 43680.f, 17.f, 306.f, 5814.f, 116280.f, 21.f, 462.f, 10626.f, 255024.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// 3D exclusive tests +TEST(CumProdTest, _3DTestAxis0Exclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {0}); + // exclusive axis=0: first slice is all 1s, second = x[0,:,:] + test.AddOutput("y", {2, 3, 4}, + {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, + 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _3DTestAxis1Exclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {1}); + // exclusive axis=1: out[:,0,:] = 1, out[:,1,:] = x[:,0,:], out[:,2,:] = x[:,0,:]*x[:,1,:] + test.AddOutput("y", {2, 3, 4}, + {1.f, 1.f, 1.f, 1.f, 1.f, 2.f, 3.f, 4.f, 5.f, 12.f, 21.f, 32.f, + 1.f, 1.f, 1.f, 1.f, 13.f, 14.f, 15.f, 16.f, 221.f, 252.f, 285.f, 320.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _3DTestAxis2Exclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 
24.f}); + test.AddInput("axis", {}, {2}); + // exclusive axis=2: out[:,:,0] = 1, out[:,:,1] = x[:,:,0], out[:,:,2] = x[:,:,0]*x[:,:,1], etc. + test.AddOutput("y", {2, 3, 4}, + {1.f, 1.f, 2.f, 6.f, 1.f, 5.f, 30.f, 210.f, 1.f, 9.f, 90.f, 990.f, + 1.f, 13.f, 182.f, 2730.f, 1.f, 17.f, 306.f, 5814.f, 1.f, 21.f, 462.f, 10626.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// Reverse tests +// input: [1, 2, 3, 4, 5], reverse +// output: [120, 120, 60, 20, 5] +TEST(CumProdTest, _1DTestReverse) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("reverse", 1); + test.AddInput("x", {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {120.f, 120.f, 60.f, 20.f, 5.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// Reverse exclusive +// input: [1, 2, 3, 4, 5], reverse, exclusive +// output: [120, 60, 20, 5, 1] +TEST(CumProdTest, _1DTestReverseExclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddAttribute("reverse", 1); + test.AddInput("x", {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {120.f, 60.f, 20.f, 5.f, 1.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// 3D reverse tests +TEST(CumProdTest, _3DTestAxis0Reverse) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("reverse", 1); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {0}); + // reverse axis=0: out[1,:,:]=x[1,:,:], out[0,:,:]=x[0,:,:]*x[1,:,:] + test.AddOutput("y", {2, 3, 4}, + {13.f, 28.f, 45.f, 64.f, 85.f, 108.f, 133.f, 160.f, 189.f, 220.f, 253.f, 288.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 
22.f, 23.f, 24.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _3DTestAxis1Reverse) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("reverse", 1); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {1}); + // reverse axis=1: out[:,2,:]=x[:,2,:], out[:,1,:]=x[:,1,:]*x[:,2,:], out[:,0,:]=x[:,0,:]*x[:,1,:]*x[:,2,:] + test.AddOutput("y", {2, 3, 4}, + {45.f, 120.f, 231.f, 384.f, 45.f, 60.f, 77.f, 96.f, 9.f, 10.f, 11.f, 12.f, + 4641.f, 5544.f, 6555.f, 7680.f, 357.f, 396.f, 437.f, 480.f, 21.f, 22.f, 23.f, 24.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _3DTestAxis2Reverse) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("reverse", 1); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {2}); + // reverse axis=2: out[:,:,3]=x[:,:,3], out[:,:,2]=x[:,:,2]*x[:,:,3], etc. 
+ test.AddOutput("y", {2, 3, 4}, + {24.f, 24.f, 12.f, 4.f, 1680.f, 336.f, 56.f, 8.f, 11880.f, 1320.f, 132.f, 12.f, + 43680.f, 3360.f, 240.f, 16.f, 116280.f, 6840.f, 380.f, 20.f, 255024.f, 12144.f, 552.f, 24.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// 3D reverse exclusive tests +TEST(CumProdTest, _3DTestAxis0ReverseExclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("reverse", 1); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {0}); + // reverse exclusive axis=0: out[1,:,:]=1, out[0,:,:]=x[1,:,:] + test.AddOutput("y", {2, 3, 4}, + {13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, + 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _3DTestAxis1ReverseExclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("reverse", 1); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {1}); + // reverse exclusive axis=1: out[:,2,:]=1, out[:,1,:]=x[:,2,:], out[:,0,:]=x[:,1,:]*x[:,2,:] + test.AddOutput("y", {2, 3, 4}, + {45.f, 60.f, 77.f, 96.f, 9.f, 10.f, 11.f, 12.f, 1.f, 1.f, 1.f, 1.f, + 357.f, 396.f, 437.f, 480.f, 21.f, 22.f, 23.f, 24.f, 1.f, 1.f, 1.f, 1.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _3DTestAxis2ReverseExclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("reverse", 1); + test.AddAttribute("exclusive", 1); + 
test.AddInput("x", {2, 3, 4}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f}); + test.AddInput("axis", {}, {2}); + // reverse exclusive axis=2: out[:,:,3]=1, out[:,:,2]=x[:,:,3], out[:,:,1]=x[:,:,2]*x[:,:,3], etc. + test.AddOutput("y", {2, 3, 4}, + {24.f, 12.f, 4.f, 1.f, 336.f, 56.f, 8.f, 1.f, 1320.f, 132.f, 12.f, 1.f, + 3360.f, 240.f, 16.f, 1.f, 6840.f, 380.f, 20.f, 1.f, 12144.f, 552.f, 24.f, 1.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// Type-specific tests +TEST(CumProdTest, _1DTestInt32) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {5}, {1, 2, 3, 4, 5}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {1, 2, 6, 24, 120}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _1DTestInt64) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {5}, {1, 2, 3, 4, 5}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {1, 2, 6, 24, 120}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _1DTestDouble) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {5}, {1., 2., 3., 4., 5.}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {1., 2., 6., 24., 120.}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _1DTestDouble_WithInt64Axis) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {5}, {1., 2., 3., 4., 5.}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {1., 2., 6., 24., 120.}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _1DTestUint32) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + 
test.AddInput("x", {5}, {1, 2, 3, 4, 5}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {1, 2, 6, 24, 120}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(CumProdTest, _1DTestUint64) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {5}, {1, 2, 3, 4, 5}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {5}, {1, 2, 6, 24, 120}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// Matches ONNX spec example exactly +// input: [1, 2, 3], axis=0 -> [1, 2, 6] +TEST(CumProdTest, _OnnxSpecExample) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddInput("x", {3}, {1.f, 2.f, 3.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {3}, {1.f, 2.f, 6.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// ONNX spec example: exclusive=1 -> [1, 1, 2] +TEST(CumProdTest, _OnnxSpecExampleExclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddInput("x", {3}, {1.f, 2.f, 3.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {3}, {1.f, 1.f, 2.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// ONNX spec example: reverse=1 -> [6, 6, 3] +TEST(CumProdTest, _OnnxSpecExampleReverse) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("reverse", 1); + test.AddInput("x", {3}, {1.f, 2.f, 3.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {3}, {6.f, 6.f, 3.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// ONNX spec example: exclusive=1, reverse=1 -> [6, 3, 1] +TEST(CumProdTest, _OnnxSpecExampleReverseExclusive) { + OpTester test("CumProd", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("exclusive", 1); + test.AddAttribute("reverse", 1); + test.AddInput("x", {3}, {1.f, 
2.f, 3.f}); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {3}, {6.f, 3.f, 1.f}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/bitcast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/bitcast_op_test.cc new file mode 100644 index 0000000000000..d2c674b1081b2 --- /dev/null +++ b/onnxruntime/test/providers/cpu/tensor/bitcast_op_test.cc @@ -0,0 +1,183 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" +#include "core/framework/to_tensor_proto_element_type.h" + +#include +#include + +namespace onnxruntime { +namespace test { + +template +void TestBitCastOp(const std::vector& shape, + const std::vector& input, + const std::vector& expected_output) { + OpTester test("BitCast", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("to", utils::ToTensorProtoElementType()); + test.AddInput("input", shape, input); + test.AddOutput("output", shape, expected_output); + // BitCast is CPU-only for now; exclude providers that don't support it. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); +} + +// float32 and int32 are both 4 bytes. 
+// IEEE 754: 1.0f = 0x3F800000 = 1065353216 as int32 +TEST(BitCastTest, Float32ToInt32) { + std::vector input = {0.0f, 1.0f, -1.0f, 0.5f}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(float)); + + TestBitCastOp({4}, input, expected); +} + +TEST(BitCastTest, Int32ToFloat32) { + // 0x3F800000 = 1065353216 -> 1.0f + // 0x40000000 = 1073741824 -> 2.0f + std::vector input = {0, 1065353216, 1073741824}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(int32_t)); + + TestBitCastOp({3}, input, expected); +} + +// double and int64 are both 8 bytes. +TEST(BitCastTest, DoubleToInt64) { + std::vector input = {0.0, 1.0, -1.0}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(double)); + + TestBitCastOp({3}, input, expected); +} + +TEST(BitCastTest, Int64ToDouble) { + std::vector input = {0, 4607182418800017408}; // 0 and 1.0 as int64 + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(int64_t)); + + TestBitCastOp({2}, input, expected); +} + +// float16 and uint16 are both 2 bytes. +TEST(BitCastTest, Float16ToUInt16) { + std::vector input = {MLFloat16(0.0f), MLFloat16(1.0f), MLFloat16(0.5f)}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(MLFloat16)); + + TestBitCastOp({3}, input, expected); +} + +TEST(BitCastTest, UInt16ToFloat16) { + std::vector input = {0x0000, 0x3C00, 0x3800}; // 0.0, 1.0, 0.5 in float16 + std::vector expected; + expected.reserve(input.size()); + for (auto v : input) { + expected.push_back(MLFloat16::FromBits(v)); + } + + TestBitCastOp({3}, input, expected); +} + +// BFloat16 and int16 are both 2 bytes. 
+TEST(BitCastTest, BFloat16ToInt16) { + std::vector input = {BFloat16(0.0f), BFloat16(1.0f)}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(BFloat16)); + + TestBitCastOp({2}, input, expected); +} + +// int8 and uint8 are both 1 byte. +TEST(BitCastTest, Int8ToUInt8) { + std::vector input = {0, 1, -1, 127, -128}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(int8_t)); + + TestBitCastOp({5}, input, expected); +} + +TEST(BitCastTest, UInt8ToInt8) { + std::vector input = {0, 1, 127, 128, 255}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(uint8_t)); + + TestBitCastOp({5}, input, expected); +} + +// Same type (identity-like). +TEST(BitCastTest, Float32ToFloat32) { + std::vector input = {1.0f, 2.0f, 3.0f}; + TestBitCastOp({3}, input, input); +} + +// Multi-dimensional input. +TEST(BitCastTest, Float32ToInt32_2D) { + std::vector input = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(float)); + + TestBitCastOp({2, 3}, input, expected); +} + +TEST(BitCastTest, Float32ToInt32_3D) { + std::vector input = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(float)); + + TestBitCastOp({2, 2, 3}, input, expected); +} + +// Empty tensor. +TEST(BitCastTest, EmptyTensor) { + std::vector input = {}; + std::vector expected = {}; + TestBitCastOp({0}, input, expected); +} + +// Scalar (0-dim) tensor. 
+TEST(BitCastTest, ScalarTensor) { + std::vector input = {42.0f}; + std::vector expected(1); + std::memcpy(expected.data(), input.data(), sizeof(float)); + + OpTester test("BitCast", 26, onnxruntime::kOnnxDomain); + test.AddAttribute("to", utils::ToTensorProtoElementType()); + test.AddInput("input", {}, input); + test.AddOutput("output", {}, expected); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); +} + +// uint32 and float32 (same size, 4 bytes). +TEST(BitCastTest, UInt32ToFloat32) { + std::vector input = {0, 0x3F800000, 0x40000000}; // 0.0f, 1.0f, 2.0f + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(uint32_t)); + + TestBitCastOp({3}, input, expected); +} + +// uint64 and double (same size, 8 bytes). +TEST(BitCastTest, UInt64ToDouble) { + std::vector input = {0, 0x3FF0000000000000ULL}; // 0.0 and 1.0 as uint64 + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(uint64_t)); + + TestBitCastOp({2}, input, expected); +} + +// int16 and uint16 (same size, 2 bytes). +TEST(BitCastTest, Int16ToUInt16) { + std::vector input = {0, 1, -1, 32767, -32768}; + std::vector expected(input.size()); + std::memcpy(expected.data(), input.data(), input.size() * sizeof(int16_t)); + + TestBitCastOp({5}, input, expected); +} + +} // namespace test +} // namespace onnxruntime From f2bc8bf95efd6a3fb3e48e7f759b097b64023f5f Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 9 Mar 2026 21:34:49 +0000 Subject: [PATCH 03/18] Regenerate OperatorKernels.md for ONNX opset 26 Add BitCast and CumProd entries to the CPU provider kernel documentation. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/OperatorKernels.md | 446 +--------------------------------------- 1 file changed, 4 insertions(+), 442 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index abdcc81586909..10fc961865314 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -6,7 +6,6 @@ Do not modify directly.* - [CPUExecutionProvider](#cpuexecutionprovider) - [CUDAExecutionProvider](#cudaexecutionprovider) -- [DmlExecutionProvider](#dmlexecutionprovider) --------------- @@ -54,6 +53,7 @@ Do not modify directly.* |||14|**T** = tensor(double), tensor(float)
**U** = tensor(double), tensor(float)|
 |||[9, 13]|**T** = tensor(double), tensor(float)|
 |||[7, 8]|**T** = tensor(double), tensor(float)|
+|BitCast|*in* input:**T1**<br/>*out* output:**T2**|26+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/>**T2** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |BitShift|*in* X:**T**
*in* Y:**T**
*out* Z:**T**|11+|**T** = tensor(uint32), tensor(uint64), tensor(uint8)| |BitwiseAnd|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |BitwiseNot|*in* X:**T**
*out* Y:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| @@ -99,6 +99,7 @@ Do not modify directly.* |Cosh|*in* input:**T**
*out* output:**T**|22+|**T** = tensor(float)| |||[9, 21]|**T** = tensor(float)| |Crop|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float)|
+|CumProd|*in* x:**T**<br/>*in* axis:**T2**<br/>*out* y:**T**|26+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br/>**T2** = tensor(int32), tensor(int64)|
 |CumSum|*in* x:**T**
*in* axis:**T2**
*out* y:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int32), tensor(int64)| |||[11, 13]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int32), tensor(int64)| |DFT|*in* input:**T1**
*in* dft_length:**T2**
*in* axis:**tensor(int64)**
*out* output:**T1**

or

*in* input:**T1**
*in* dft_length:**T2**
*out* output:**T1**|20+|**T1** = tensor(double), tensor(float)
**T2** = tensor(int32), tensor(int64)| @@ -655,7 +656,8 @@ Do not modify directly.* |ArgMin|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(float16)| |||12|**T** = tensor(double), tensor(float), tensor(float16)| |||[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)| -|Attention|*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*in* nonpad_kv_seqlen:**tensor(int64)**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**

or

*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**|23+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| +|Attention|*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*in* nonpad_kv_seqlen:**tensor(int64)**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**

or

*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**|24+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| +|||23|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| |AveragePool|*in* X:**T**
*out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |||[19, 21]|**T** = tensor(double), tensor(float), tensor(float16)| |||[11, 18]|**T** = tensor(double), tensor(float), tensor(float16)| @@ -1077,443 +1079,3 @@ Do not modify directly.* |||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)| | | | | - - - - -## Operators implemented by DmlExecutionProvider - -| Op Name | Parameters | OpSet Version | Types Supported | -|---------|------------|---------------|-----------------| -|**Operator Domain:** *ai.onnx*|||| -|Abs|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)| -|||6+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)| -|Acos|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| -|Acosh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| -|Add|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Affine|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|And|*in* A:**T**
*in* B:**T**
*out* C:**T1**|7+|**T** = tensor(bool)| -|ArgMax|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|ArgMin|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Asin|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| -|Asinh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| -|Atan|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| -|Atanh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| -|AveragePool|*in* X:**T**
*out* Y:**T**|19+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||10+|**T** = tensor(float), tensor(float16)| -|||7+|**T** = tensor(float), tensor(float16)| -|BatchNormalization|*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* input_mean:**U**
*in* input_var:**U**
*out* Y:**T**
*out* running_mean:**U**
*out* running_var:**U**

or

*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* mean:**T**
*in* var:**T**
*out* Y:**T**
*out* mean:**T**
*out* var:**T**
*out* saved_mean:**T**
*out* saved_var:**T**

or

*in* X:**T**
*in* scale:**T1**
*in* B:**T1**
*in* input_mean:**T2**
*in* input_var:**T2**
*out* Y:**T**
*out* running_mean:**T2**
*out* running_var:**T2**|15+|**T** = tensor(float), tensor(float16)| -|||14+|**T** = tensor(float), tensor(float16)| -|||9+|**T** = tensor(float), tensor(float16)| -|||7+|**T** = tensor(float), tensor(float16)| -|BitShift|*in* X:**T**
*in* Y:**T**
*out* Z:**T**|11+|**T** = tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|BitwiseAnd|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|BitwiseNot|*in* X:**T**
*out* Y:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|BitwiseOr|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|BitwiseXor|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Cast|*in* input:**T1**
*out* output:**T2**|21+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||19+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||9+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||6+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|CastLike|*in* input:**T1**
*in* target_type:**T2**
*out* output:**T2**|21+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||19+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||15+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Ceil|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Celu|*in* X:**T**
*out* Y:**T**|12+|**T** = tensor(float), tensor(float16)| -|Clip|*in* input:**T**
*in* min:**T**
*in* max:**T**
*out* output:**T**

or

*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Col2Im|*in* input:**T**
*in* image_shape:**tensor(int64)**
*in* block_shape:**tensor(int64)**
*out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Concat|*in* inputs:**T**
*out* concat_result:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||4+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|ConcatFromSequence|*in* input_sequence:**S**
*out* concat_result:**T**|11+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|ConstantOfShape|*in* input:**T1**
*out* output:**T2**|21+|**T1** = tensor(int64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||9+|**T1** = tensor(int64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Conv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|ConvInteger|*in* x:**T1**
*in* w:**T2**
*in* x_zero_point:**T1**
*in* w_zero_point:**T2**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int32)| -|ConvTranspose|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|Cos|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| -|Cosh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| -|Crop|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|CumSum|*in* x:**T**
*in* axis:**T2**
*out* y:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|DFT|*in* input:**T1**
*in* dft_length:**T2**
*in* axis:**tensor(int64)**
*out* output:**T1**

or

*in* input:**T1**
*in* dft_length:**T2**
*out* output:**T1**|20+|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| -|||17+|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| -|DepthToSpace|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T3**|21+|**T1** = tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)
**T2** = tensor(float), tensor(float16)| -|||19+|**T1** = tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| -|||13+|**T** = tensor(int32), tensor(int8), tensor(uint8)| -|||10+|**T** = tensor(int32), tensor(int8), tensor(uint8)| -|Div|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Dropout|*in* data:**T**
*in* ratio:**T1**
*in* training_mode:**T2**
*out* output:**T**
*out* mask:**T2**

or

*in* data:**T**
*out* output:**T**
*out* mask:**T**

or

*in* data:**T**
*out* output:**T**
*out* mask:**T1**|7+|**T** = tensor(float), tensor(float16)| -|DynamicQuantizeLinear|*in* x:**T1**
*out* y:**T2**
*out* y_scale:**tensor(float)**
*out* y_zero_point:**T2**|11+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| -|Einsum|*in* Inputs:**T**
*out* Output:**T**|12+|**T** = tensor(float), tensor(float16)| -|Elu|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)| -|Equal|*in* A:**T**
*in* B:**T**
*out* C:**T1**|19+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||11+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||7+|**T** = tensor(float), tensor(float16)
**T1** = tensor(bool)| -|Erf|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| -|||9+|**T** = tensor(float), tensor(float16)| -|Exp|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Expand|*in* input:**T**
*in* shape:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||8+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|EyeLike|*in* input:**T1**
*out* output:**T2**|9+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Flatten|*in* input:**T**
*out* output:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||9+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Floor|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|GRU|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*out* Y:**T**
*out* Y_h:**T**|14+|**T** = tensor(float), tensor(float16)| -|||7+|**T** = tensor(float), tensor(float16)| -|Gather|*in* data:**T**
*in* indices:**Tind**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|GatherElements|*in* data:**T**
*in* indices:**Tind**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|GatherND|*in* data:**T**
*in* indices:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Gemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||9+|**T** = tensor(float), tensor(float16)| -|||7+|**T** = tensor(float), tensor(float16)| -|GlobalAveragePool|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|GlobalLpPool|*in* X:**T**
*out* Y:**T**|2+|**T** = tensor(float), tensor(float16)| -|GlobalMaxPool|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|Greater|*in* A:**T**
*in* B:**T**
*out* C:**T1**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||7+|**T** = tensor(float), tensor(float16)
**T1** = tensor(bool)| -|GreaterOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(float), tensor(float16)| -|GroupNorm||21+|**M** = tensor(float), tensor(float16)
**T** = tensor(float), tensor(float16)| -|HardSigmoid|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)| -|Hardmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|Identity|*in* input:**T**
*out* output:**T**

or

*in* input:**V**
*out* output:**V**|21+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||19+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||16+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||14+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|If|*in* cond:**B**
*out* outputs:**V**|19+|**B** = tensor(bool)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||16+|**B** = tensor(bool)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**B** = tensor(bool)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**B** = tensor(bool)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||7+|**B** = tensor(bool)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|ImageScaler|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|InstanceNormalization|*in* input:**T**
*in* scale:**T**
*in* B:**T**
*out* output:**T**|6+|**T** = tensor(float), tensor(float16)| -|IsInf|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(float)
**T2** = tensor(bool)| -|||10+|**T1** = tensor(float)
**T2** = tensor(bool)| -|IsNaN|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(bool)| -|||13+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(bool)| -|||9+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(bool)| -|LRN|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|LSTM|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*in* initial_c:**T**
*in* P:**T**
*out* Y:**T**
*out* Y_h:**T**
*out* Y_c:**T**|14+|**T** = tensor(float), tensor(float16)| -|||7+|**T** = tensor(float), tensor(float16)| -|LayerNormalization|*in* X:**T**
*in* Scale:**T**
*in* B:**T**
*out* Y:**T**
*out* Mean:**U**
*out* InvStdDev:**U**

or

*in* X:**T**
*in* Scale:**V**
*in* B:**V**
*out* Y:**V**
*out* Mean:**U**
*out* InvStdDev:**U**|17+|**T** = tensor(float), tensor(float16)
**U** = tensor(float)| -|||1+|**T** = tensor(float), tensor(float16)
**U** = tensor(float), tensor(float16)
**V** = tensor(float), tensor(float16)| -|LeakyRelu|*in* X:**T**
*out* Y:**T**|16+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Less|*in* A:**T**
*in* B:**T**
*out* C:**T1**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||7+|**T** = tensor(float), tensor(float16)
**T1** = tensor(bool)| -|LessOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| -|Log|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|LogSoftmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|LpNormalization|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|LpPool|*in* X:**T**
*out* Y:**T**|18+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||2+|**T** = tensor(float), tensor(float16)| -|MatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| -|||9+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|MatMulInteger|*in* A:**T1**
*in* B:**T2**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*out* Y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int32)| -|Max|*in* data_0:**T**
*out* max:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||8+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|MaxPool|*in* X:**T**
*out* Y:**T**

or

*in* X:**T**
*out* Y:**T**
*out* Indices:**I**|12+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| -|||11+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| -|||10+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| -|||8+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| -|||1+|**T** = tensor(float), tensor(float16)| -|MaxRoiPool|*in* X:**T**
*in* rois:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|MaxUnpool|*in* X:**T1**
*in* I:**T2**
*in* output_shape:**T2**
*out* output:**T1**|11+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int64)| -|||9+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int64)| -|Mean|*in* data_0:**T**
*out* mean:**T**|13+|**T** = tensor(float), tensor(float16)| -|||8+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|MeanVarianceNormalization|*in* X:**T**
*out* Y:**T**

or

*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| -|||9+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|MemcpyFromHost|*in* X:**T**
*out* Y:**T**|1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)| -|MemcpyToHost|*in* X:**T**
*out* Y:**T**|1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)| -|Min|*in* data_0:**T**
*out* min:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||8+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Mod|*in* A:**T**
*in* B:**T**
*out* C:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| -|||10+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| -|Mul|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Neg|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)| -|||6+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)| -|NonZero|*in* X:**T**
*out* Y:**tensor(int64)**|13+|**T** = tensor(bool), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| -|||9+|**T** = tensor(bool), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| -|Not|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(bool)| -|OneHot|*in* indices:**T1**
*in* depth:**T2**
*in* values:**T3**
*out* output:**T3**|11+|**T1** = tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T3** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||9+|**T1** = tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T3** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|OptionalGetElement|*in* input:**O**
*out* output:**V**|18+|**O** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||15+|**O** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8))
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|OptionalHasElement|*in* input:**O**
*out* output:**B**|18+|**B** = tensor(bool)
**O** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||15+|**B** = tensor(bool)
**O** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8))| -|Or|*in* A:**T**
*in* B:**T**
*out* C:**T1**|7+|**T** = tensor(bool)| -|PRelu|*in* X:**T**
*in* slope:**T**
*out* Y:**T**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| -|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| -|||7+|**T** = tensor(float), tensor(float16)| -|Pad|*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*in* axes:**Tind**
*out* output:**T**

or

*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||2+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|ParametricSoftplus|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|Pow|*in* X:**T**
*in* Y:**T**
*out* Z:**T**

or

*in* X:**T**
*in* Y:**T1**
*out* Z:**T**|15+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| -|||7+|**T** = tensor(float), tensor(float16)| -|QLinearConv|*in* x:**T1**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T1**
*in* w:**T2**
*in* w_scale:**tensor(float)**
*in* w_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*in* B:**T4**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)
**T4** = tensor(int32)| -|QLinearMatMul|*in* a:**T1**
*in* a_scale:**TS**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**TS**
*in* b_zero_point:**T2**
*in* y_scale:**TS**
*in* y_zero_point:**T3**
*out* y:**T3**

or

*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|21+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)| -|||10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)| -|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**

or

*in* x:**T1**
*in* y_scale:**T2**
*in* y_zero_point:**T3**
*out* y:**T3**

or

*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)| -|||19+|**T1** = tensor(float), tensor(float16), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| -|||13+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| -|||10+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| -|RNN|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*out* Y:**T**
*out* Y_h:**T**|14+|**T** = tensor(float), tensor(float16)| -|||7+|**T** = tensor(float), tensor(float16)| -|Range|*in* start:**T**
*in* limit:**T**
*in* delta:**T**
*out* output:**T**|11+|**T** = tensor(float), tensor(int16), tensor(int32), tensor(int64)| -|Reciprocal|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|ReduceL1|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||1+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|ReduceL2|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|ReduceLogSum|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|ReduceLogSumExp|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|ReduceMax|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|ReduceMean|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|ReduceMin|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|ReduceProd|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||1+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|ReduceSum|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||1+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|ReduceSumSquare|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|||1+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| -|Relu|*in* X:**T**
*out* Y:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| -|||13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Reshape|*in* data:**T**
*in* shape:**tensor(int64)**
*out* reshaped:**T**

or

*in* data:**T**
*out* reshaped:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||5+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Resize|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T1**
*in* roi:**T2**
*in* scales:**tensor(float)**
*in* sizes:**tensor(int64)**
*out* Y:**T1**|19+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| -|||18+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| -|||13+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| -|||11+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| -|||10+|**T** = tensor(float), tensor(float16)| -|ReverseSequence|*in* input:**T**
*in* sequence_lens:**tensor(int64)**
*out* Y:**T**|10+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|RoiAlign|*in* X:**T1**
*in* rois:**T1**
*in* batch_indices:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| -|||10+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| -|Round|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| -|STFT|*in* signal:**T1**
*in* frame_step:**T2**
*in* window:**T1**
*in* frame_length:**T2**
*out* output:**T1**|17+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| -|ScaledTanh|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|Scatter|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||9+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|ScatterElements|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|16+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|ScatterND|*in* data:**T**
*in* indices:**tensor(int64)**
*in* updates:**T**
*out* output:**T**|16+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Selu|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)| -|SequenceAt|*in* input_sequence:**S**
*in* position:**I**
*out* tensor:**T**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))
**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|SequenceConstruct|*in* inputs:**T**
*out* output_sequence:**S**|11+|**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))
**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|SequenceEmpty|*out* output:**S**|11+|**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| -|SequenceErase|*in* input_sequence:**S**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| -|SequenceInsert|*in* input_sequence:**S**
*in* tensor:**T**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| -|SequenceLength|*in* input_sequence:**S**
*out* length:**I**|11+|**I** = tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| -|Shape|*in* data:**T**
*out* shape:**T1**|21+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|||19+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|||15+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|||13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|||1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|Shrink|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| -|Sigmoid|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Sign|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|SimplifiedLayerNormalization|*in* X:**T**
*in* scale:**V**
*out* Y:**V**
*out* inv_std_var:**U**|1+|**T** = tensor(float), tensor(float16)
**U** = tensor(float), tensor(float16)
**V** = tensor(float), tensor(float16)| -|Sin|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| -|Sinh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| -|Size|*in* data:**T**
*out* size:**T1**|21+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|||19+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|||13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|||1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| -|Slice|*in* data:**T**
*in* starts:**Tind**
*in* ends:**Tind**
*in* axes:**Tind**
*in* steps:**Tind**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||10+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Softmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| -|||11+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|Softplus|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|Softsign|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|SpaceToDepth|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Split|*in* input:**T**
*in* split:**T**
*out* outputs...:**T**

or

*in* input:**T**
*in* split:**tensor(int64)**
*out* outputs:**T**

or

*in* input:**T**
*out* outputs:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||2+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Sqrt|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**

or

*in* data:**T**
*out* squeezed:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Sub|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Sum|*in* data_0:**T**
*out* sum:**T**|13+|**T** = tensor(float), tensor(float16)| -|||8+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|Tan|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| -|Tanh|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| -|||6+|**T** = tensor(float), tensor(float16)| -|ThresholdedRelu|*in* X:**T**
*out* Y:**T**|10+|**T** = tensor(float), tensor(float16)| -|||1+|**T** = tensor(float), tensor(float16)| -|Tile|*in* input:**T**
*in* repeats:**T1**
*out* output:**T**

or

*in* input:**T**
*in* tiles:**T**
*in* axis:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||6+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|TopK|*in* X:**T**
*in* K:**tensor(int64)**
*out* Values:**T**
*out* Indices:**I**

or

*in* X:**T**
*out* Values:**T**
*out* Indices:**I**|11+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||10+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Transpose|*in* data:**T**
*out* transposed:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Trilu|*in* input:**T**
*in* k:**tensor(int64)**
*out* output:**T**|14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Unsqueeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* expanded:**T**

or

*in* data:**T**
*out* expanded:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Upsample|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T**
*out* Y:**T**|10+|**T** = tensor(float), tensor(float16)| -|||9+|**T** = tensor(float), tensor(float16)| -|||7+|**T** = tensor(float), tensor(float16)| -|Where|*in* condition:**B**
*in* X:**T**
*in* Y:**T**
*out* output:**T**|16+|**B** = tensor(bool)
**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||9+|**B** = tensor(bool)
**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Xor|*in* A:**T**
*in* B:**T**
*out* C:**T1**|7+|**T** = tensor(bool)| -| | -| | -|**Operator Domain:** *com.microsoft*|||| -|Attention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* mask_index:**M**
*in* past:**T**
*in* attention_bias:**T**
*in* past_sequence_length:**M**
*out* output:**T**
*out* present:**T**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| -|BiasAdd|*in* X:**T**
*in* bias:**T**
*in* skip:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|BiasGelu|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(float), tensor(float16)| -|BiasSplitGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|ConvTransposeWithDynamicPads|*in* X:**T**
*in* W:**T**
*in* Pads:**tensor(int64)**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|DequantizeLinear|*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|1+|**T1** = tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| -|DynamicQuantizeMatMul|*in* A:**T1**
*in* B:**T2**
*in* b_scale:**T1**
*in* b_zero_point:**T2**
*in* bias:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| -|EmbedLayerNormalization|*in* input_ids:**T1**
*in* segment_ids:**T1**
*in* word_embedding:**T**
*in* position_embedding:**T**
*in* segment_embedding:**T**
*in* gamma:**T**
*in* beta:**T**
*in* mask:**T1**
*in* position_ids:**T1**
*out* output:**T**
*out* mask_index:**T1**
*out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)| -|FastGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|FusedMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|FusedMatMulActivation|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|Gelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|GroupNorm|*in* X:**T**
*in* gamma:**M**
*in* beta:**M**
*out* Y:**T**|1+|**M** = tensor(float), tensor(float16)
**T** = tensor(float), tensor(float16)| -|GroupQueryAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* past_key:**T_CACHE**
*in* past_value:**T_CACHE**
*in* seqlens_k:**M**
*in* total_sequence_length:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*in* position_ids:**tensor(int64)**
*in* attention_bias:**T**
*in* head_sink:**T**
*in* k_scale:**T_KV_SCALE**
*in* v_scale:**T_KV_SCALE**
*out* output:**T**
*out* present_key:**T_CACHE**
*out* present_value:**T_CACHE**
*out* output_qk:**T**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| -|MatMulIntegerToFloat|*in* A:**T1**
*in* B:**T2**
*in* a_scale:**T3**
*in* b_scale:**T3**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*in* bias:**T3**
*out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float), tensor(float16)| -|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T3**
*in* g_idx:**T4**
*in* bias:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| -|MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* attention_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*in* past_sequence_length:**M**
*in* cache_indirection:**M**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**
*out* qk:**QK**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| -|NhwcConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|QAttention|*in* input:**T1**
*in* weight:**T2**
*in* bias:**T3**
*in* input_scale:**T3**
*in* weight_scale:**T3**
*in* mask_index:**T4**
*in* input_zero_point:**T1**
*in* weight_zero_point:**T2**
*in* past:**T3**
*out* output:**T3**
*out* present:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float), tensor(float16)
**T4** = tensor(int32)| -|QLinearAdd|*in* A:**T**
*in* A_scale:**tensor(float)**
*in* A_zero_point:**T**
*in* B:**T**
*in* B_scale:**tensor(float)**
*in* B_zero_point:**T**
*in* C_scale:**tensor(float)**
*in* C_zero_point:**T**
*out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)| -|QLinearAveragePool|*in* X:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| -|QLinearConcat|*in* Y_scale:**TF**
*in* Y_zero_point:**T8**
*in* inputs:**TV**
*out* Y:**T8**|1+|**T8** = tensor(int8), tensor(uint8)
**TF** = tensor(float)
**TV** = tensor(float), tensor(int8), tensor(uint8)| -|QLinearGlobalAveragePool|*in* X:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| -|QLinearSigmoid|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* X_zero_point:**T**
*in* Y_scale:**tensor(float)**
*in* Y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| -|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**|1+|**T1** = tensor(float), tensor(float16), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| -|QuickGelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|RotaryEmbedding|*in* input:**T**
*in* position_ids:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*out* output:**T**|1+|**M** = tensor(int64)
**T** = tensor(float), tensor(float16)| -|SkipLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* beta:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)| -|SkipSimplifiedLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)| -| | -| | -|**Operator Domain:** *com.microsoft.dml*|||| -|DmlFusedAdd|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(float), tensor(float16)| -|DmlFusedBatchNormalization|*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* mean:**T**
*in* var:**T**
*out* Y:**T**
*out* mean:**T**
*out* var:**T**
*out* saved_mean:**T**
*out* saved_var:**T**|1+|**T** = tensor(float), tensor(float16)| -|DmlFusedConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|DmlFusedConvTranspose|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|DmlFusedGemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|DmlFusedInstanceNormalization|*in* input:**T**
*in* scale:**T**
*in* B:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|DmlFusedMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|DmlFusedMeanVarianceNormalization|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|DmlFusedSum|*in* data_0:**T**
*out* sum:**T**|1+|**T** = tensor(float), tensor(float16)| -| | -| | From 0189eff60123fcd0229ff6754ffa5a6ced482a25 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 9 Mar 2026 22:26:45 +0000 Subject: [PATCH 04/18] Fix onnx version to 1.21.0rc1 and apply lintrunner formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change onnx==1.21.0 to onnx==1.21.0rc1 in all 7 requirements.txt files since the final 1.21.0 release is not yet published. Apply lintrunner auto-formatting fixes to whitespace/alignment. Verified SHA1 (deps.txt) and SHA512 (vcpkg portfile) hashes match the downloaded archives. No v1.21.0 tag exists yet — commit hash URL is correct. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/graph/contrib_ops/contrib_defs.h | 4 +- onnxruntime/core/graph/dml_ops/dml_defs.h | 4 +- .../core/providers/cpu/math/cumprod.cc | 12 ++--- .../core/providers/cpu/tensor/bitcast_op.cc | 44 +++++++++---------- .../test/opaque_api/test_opaque_api.cc | 2 +- .../providers/cpu/tensor/bitcast_op_test.cc | 2 +- onnxruntime/test/python/requirements.txt | 2 +- .../python/cpu/scripts/requirements.txt | 2 +- .../docker/scripts/lort/requirements.txt | 2 +- .../docker/scripts/manylinux/requirements.txt | 2 +- .../linux/docker/scripts/requirements.txt | 2 +- .../github/linux/python/requirements.txt | 2 +- .../github/windows/python/requirements.txt | 2 +- 13 files changed, 41 insertions(+), 41 deletions(-) diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.h b/onnxruntime/core/graph/contrib_ops/contrib_defs.h index f88257b2baf08..ceb18386de9e4 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.h +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.h @@ -35,7 +35,7 @@ inline bool HasRawData(const ONNX_NAMESPACE::TensorProto& ten_proto) { ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, name) #define 
ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER(Counter, name) \ ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ(Counter, name) -#define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ(Counter, name) \ +#define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ(Counter, name) \ static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ [[maybe_unused]] = ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) @@ -43,7 +43,7 @@ inline bool HasRawData(const ONNX_NAMESPACE::TensorProto& ten_proto) { ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER_ELSEWHERE(__COUNTER__, name, schema_func) #define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER_ELSEWHERE(Counter, name, schema_func) \ ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) -#define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) \ +#define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) \ static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ [[maybe_unused]] = schema_func(ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__)) diff --git a/onnxruntime/core/graph/dml_ops/dml_defs.h b/onnxruntime/core/graph/dml_ops/dml_defs.h index 9551f72adfe17..ca97b655be3b3 100644 --- a/onnxruntime/core/graph/dml_ops/dml_defs.h +++ b/onnxruntime/core/graph/dml_ops/dml_defs.h @@ -11,7 +11,7 @@ namespace dml { MS_DML_OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, name) #define MS_DML_OPERATOR_SCHEMA_UNIQ_HELPER(Counter, name) \ MS_DML_OPERATOR_SCHEMA_UNIQ(Counter, name) -#define MS_DML_OPERATOR_SCHEMA_UNIQ(Counter, name) \ +#define MS_DML_OPERATOR_SCHEMA_UNIQ(Counter, name) \ static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ [[maybe_unused]] = ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) @@ -19,7 +19,7 @@ namespace dml { MS_DML_OPERATOR_SCHEMA_UNIQ_HELPER_ELSEWHERE(__COUNTER__, name, schema_func) #define MS_DML_OPERATOR_SCHEMA_UNIQ_HELPER_ELSEWHERE(Counter, name, schema_func) \ 
MS_DML_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) -#define MS_DML_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) \ +#define MS_DML_OPERATOR_SCHEMA_UNIQ_ELSEWHERE(Counter, name, schema_func) \ static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ [[maybe_unused]] = schema_func(ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__)) diff --git a/onnxruntime/core/providers/cpu/math/cumprod.cc b/onnxruntime/core/providers/cpu/math/cumprod.cc index 6706c2d3ea8d0..a37bbe32a9c7a 100644 --- a/onnxruntime/core/providers/cpu/math/cumprod.cc +++ b/onnxruntime/core/providers/cpu/math/cumprod.cc @@ -44,7 +44,7 @@ ONNX_CPU_OPERATOR_TYPED_KERNEL( KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::GetTensorType()) .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + DataTypeImpl::GetTensorType()}), CumProd); ONNX_CPU_OPERATOR_TYPED_KERNEL( @@ -54,7 +54,7 @@ ONNX_CPU_OPERATOR_TYPED_KERNEL( KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::GetTensorType()) .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + DataTypeImpl::GetTensorType()}), CumProd); ONNX_CPU_OPERATOR_TYPED_KERNEL( @@ -64,7 +64,7 @@ ONNX_CPU_OPERATOR_TYPED_KERNEL( KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::GetTensorType()) .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + DataTypeImpl::GetTensorType()}), CumProd); ONNX_CPU_OPERATOR_TYPED_KERNEL( @@ -74,7 +74,7 @@ ONNX_CPU_OPERATOR_TYPED_KERNEL( KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::GetTensorType()) .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + DataTypeImpl::GetTensorType()}), CumProd); ONNX_CPU_OPERATOR_TYPED_KERNEL( @@ -84,7 +84,7 @@ ONNX_CPU_OPERATOR_TYPED_KERNEL( KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::GetTensorType()) .TypeConstraint("T2", 
std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + DataTypeImpl::GetTensorType()}), CumProd); ONNX_CPU_OPERATOR_TYPED_KERNEL( @@ -94,7 +94,7 @@ ONNX_CPU_OPERATOR_TYPED_KERNEL( KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::GetTensorType()) .TypeConstraint("T2", std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + DataTypeImpl::GetTensorType()}), CumProd); template diff --git a/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc b/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc index 9d9fa3e0c462b..e932438f6a914 100644 --- a/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc +++ b/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc @@ -14,29 +14,29 @@ ONNX_CPU_OPERATOR_KERNEL( 26, KernelDefBuilder() .TypeConstraint("T1", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}) + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) .TypeConstraint("T2", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}) + 
DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) .MayInplace(0, 0), BitCast); diff --git a/onnxruntime/test/opaque_api/test_opaque_api.cc b/onnxruntime/test/opaque_api/test_opaque_api.cc index e39e76a912aec..e4479ce473939 100644 --- a/onnxruntime/test/opaque_api/test_opaque_api.cc +++ b/onnxruntime/test/opaque_api/test_opaque_api.cc @@ -118,7 +118,7 @@ ONNX_OPERATOR_KERNEL_EX( ONNX_TEST_OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, name) #define ONNX_TEST_OPERATOR_SCHEMA_UNIQ_HELPER(Counter, name) \ ONNX_TEST_OPERATOR_SCHEMA_UNIQ(Counter, name) -#define ONNX_TEST_OPERATOR_SCHEMA_UNIQ(Counter, name) \ +#define ONNX_TEST_OPERATOR_SCHEMA_UNIQ(Counter, name) \ static ONNX_NAMESPACE::OpSchemaRegistry::OpSchemaRegisterOnce op_schema_register_once##name##Counter \ [[maybe_unused]] = ONNX_NAMESPACE::OpSchema(#name, __FILE__, __LINE__) diff --git a/onnxruntime/test/providers/cpu/tensor/bitcast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/bitcast_op_test.cc index d2c674b1081b2..65a2de1ee7f3f 100644 --- a/onnxruntime/test/providers/cpu/tensor/bitcast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/bitcast_op_test.cc @@ -124,7 +124,7 @@ TEST(BitCastTest, Float32ToInt32_2D) { TEST(BitCastTest, Float32ToInt32_3D) { std::vector input = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, - 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; std::vector expected(input.size()); std::memcpy(expected.data(), input.data(), input.size() * sizeof(float)); diff --git a/onnxruntime/test/python/requirements.txt b/onnxruntime/test/python/requirements.txt index 3ece2f39d4042..c3f671ebf7de9 100644 --- a/onnxruntime/test/python/requirements.txt +++ 
b/onnxruntime/test/python/requirements.txt @@ -1,3 +1,3 @@ -onnx==1.21.0 +onnx==1.21.0rc1 pytest onnx-ir diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index b4c2f163e22ac..d4511e31964ba 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -7,4 +7,4 @@ wheel protobuf==4.25.8 sympy==1.14 flatbuffers -onnx==1.21.0 +onnx==1.21.0rc1 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index eb52681341012..c33fd1d102a5c 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.21.0 +onnx==1.21.0rc1 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 9a0a6d0f51900..cdd375d49f0b6 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -9,4 +9,4 @@ sympy==1.14 flatbuffers neural-compressor>=2.2.1 triton==3.5.0 -onnx==1.21.0 +onnx==1.21.0rc1 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 3d886832e1ccb..8628e81251eac 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -12,4 +12,4 @@ protobuf==6.33.0 packaging onnxscript==0.6.2 onnx-ir==0.1.16 -onnx==1.21.0 
+onnx==1.21.0rc1 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index bfe9ab0d8a508..8b7af9d97a6b1 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -12,4 +12,4 @@ onnxscript==0.6.2 onnx-ir==0.1.16 jinja2 markupsafe -onnx==1.21.0 +onnx==1.21.0rc1 diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 2dfba37c6f381..110d3785d1369 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -14,4 +14,4 @@ jinja2 markupsafe semver packaging -onnx==1.21.0 +onnx==1.21.0rc1 From 86dbc8e60dea7a37c884f89e71f746929e7c7223 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 9 Mar 2026 22:29:48 +0000 Subject: [PATCH 05/18] Address PR review feedback: code fixes and restore DML docs - cumprod.cc: Add #include , validate axis tensor has exactly one element (0-D scalar or 1-D shape [1]) - bitcast_op.cc: Add null check for TensorTypeFromONNXEnum return value - OperatorKernels.md: Restore DML section that was accidentally removed during regeneration, add BitCast and CumProd entries manually Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/OperatorKernels.md | 444 +++++++++++++++++- .../core/providers/cpu/math/cumprod.cc | 6 +- .../core/providers/cpu/tensor/bitcast_op.cc | 2 + 3 files changed, 448 insertions(+), 4 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 10fc961865314..e172045020572 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -6,6 +6,7 @@ Do not modify directly.* - [CPUExecutionProvider](#cpuexecutionprovider) - [CUDAExecutionProvider](#cudaexecutionprovider) +- [DmlExecutionProvider](#dmlexecutionprovider) --------------- @@ -656,8 
+657,7 @@ Do not modify directly.* |ArgMin|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(float16)| |||12|**T** = tensor(double), tensor(float), tensor(float16)| |||[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)| -|Attention|*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*in* nonpad_kv_seqlen:**tensor(int64)**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**

or

*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**|24+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| -|||23|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| +|Attention|*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*in* nonpad_kv_seqlen:**tensor(int64)**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**

or

*in* Q:**T1**
*in* K:**T1**
*in* V:**T2**
*in* attn_mask:**U**
*in* past_key:**T1**
*in* past_value:**T2**
*out* Y:**T1**
*out* present_key:**T1**
*out* present_value:**T2**
*out* qk_matmul_output:**T1**|23+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(bfloat16), tensor(float), tensor(float16)
**U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)| |AveragePool|*in* X:**T**
*out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |||[19, 21]|**T** = tensor(double), tensor(float), tensor(float16)| |||[11, 18]|**T** = tensor(double), tensor(float), tensor(float16)| @@ -1079,3 +1079,443 @@ Do not modify directly.* |||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)| | | | | + + +
+ +## Operators implemented by DmlExecutionProvider + +| Op Name | Parameters | OpSet Version | Types Supported | +|---------|------------|---------------|-----------------| +|**Operator Domain:** *ai.onnx*|||| +|Abs|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)| +|||6+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)| +|Acos|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| +|Acosh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| +|Add|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Affine|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|And|*in* A:**T**
*in* B:**T**
*out* C:**T1**|7+|**T** = tensor(bool)| +|ArgMax|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|ArgMin|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Asin|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| +|Asinh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| +|Atan|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| +|Atanh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| +|AveragePool|*in* X:**T**
*out* Y:**T**|19+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||10+|**T** = tensor(float), tensor(float16)| +|||7+|**T** = tensor(float), tensor(float16)| +|BatchNormalization|*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* input_mean:**U**
*in* input_var:**U**
*out* Y:**T**
*out* running_mean:**U**
*out* running_var:**U**

or

*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* mean:**T**
*in* var:**T**
*out* Y:**T**
*out* mean:**T**
*out* var:**T**
*out* saved_mean:**T**
*out* saved_var:**T**

or

*in* X:**T**
*in* scale:**T1**
*in* B:**T1**
*in* input_mean:**T2**
*in* input_var:**T2**
*out* Y:**T**
*out* running_mean:**T2**
*out* running_var:**T2**|15+|**T** = tensor(float), tensor(float16)| +|||14+|**T** = tensor(float), tensor(float16)| +|||9+|**T** = tensor(float), tensor(float16)| +|||7+|**T** = tensor(float), tensor(float16)| +|BitShift|*in* X:**T**
*in* Y:**T**
*out* Z:**T**|11+|**T** = tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|BitwiseAnd|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|BitwiseNot|*in* X:**T**
*out* Y:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|BitwiseOr|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|BitwiseXor|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Cast|*in* input:**T1**
*out* output:**T2**|21+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||19+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||9+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||6+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|CastLike|*in* input:**T1**
*in* target_type:**T2**
*out* output:**T2**|21+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||19+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||15+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Ceil|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Celu|*in* X:**T**
*out* Y:**T**|12+|**T** = tensor(float), tensor(float16)| +|Clip|*in* input:**T**
*in* min:**T**
*in* max:**T**
*out* output:**T**

or

*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Col2Im|*in* input:**T**
*in* image_shape:**tensor(int64)**
*in* block_shape:**tensor(int64)**
*out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Concat|*in* inputs:**T**
*out* concat_result:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||4+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|ConcatFromSequence|*in* input_sequence:**S**
*out* concat_result:**T**|11+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|ConstantOfShape|*in* input:**T1**
*out* output:**T2**|21+|**T1** = tensor(int64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||9+|**T1** = tensor(int64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Conv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|ConvInteger|*in* x:**T1**
*in* w:**T2**
*in* x_zero_point:**T1**
*in* w_zero_point:**T2**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int32)| +|ConvTranspose|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|Cos|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| +|Cosh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| +|Crop|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|CumSum|*in* x:**T**
*in* axis:**T2**
*out* y:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|DFT|*in* input:**T1**
*in* dft_length:**T2**
*in* axis:**tensor(int64)**
*out* output:**T1**

or

*in* input:**T1**
*in* dft_length:**T2**
*out* output:**T1**|20+|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| +|||17+|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| +|DepthToSpace|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T3**|21+|**T1** = tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||19+|**T1** = tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||13+|**T** = tensor(int32), tensor(int8), tensor(uint8)| +|||10+|**T** = tensor(int32), tensor(int8), tensor(uint8)| +|Div|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Dropout|*in* data:**T**
*in* ratio:**T1**
*in* training_mode:**T2**
*out* output:**T**
*out* mask:**T2**

or

*in* data:**T**
*out* output:**T**
*out* mask:**T**

or

*in* data:**T**
*out* output:**T**
*out* mask:**T1**|7+|**T** = tensor(float), tensor(float16)| +|DynamicQuantizeLinear|*in* x:**T1**
*out* y:**T2**
*out* y_scale:**tensor(float)**
*out* y_zero_point:**T2**|11+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| +|Einsum|*in* Inputs:**T**
*out* Output:**T**|12+|**T** = tensor(float), tensor(float16)| +|Elu|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)| +|Equal|*in* A:**T**
*in* B:**T**
*out* C:**T1**|19+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||11+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||7+|**T** = tensor(float), tensor(float16)
**T1** = tensor(bool)| +|Erf|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| +|||9+|**T** = tensor(float), tensor(float16)| +|Exp|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Expand|*in* input:**T**
*in* shape:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||8+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|EyeLike|*in* input:**T1**
*out* output:**T2**|9+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Flatten|*in* input:**T**
*out* output:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||9+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Floor|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|GRU|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*out* Y:**T**
*out* Y_h:**T**|14+|**T** = tensor(float), tensor(float16)| +|||7+|**T** = tensor(float), tensor(float16)| +|Gather|*in* data:**T**
*in* indices:**Tind**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|GatherElements|*in* data:**T**
*in* indices:**Tind**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|GatherND|*in* data:**T**
*in* indices:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Gemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||9+|**T** = tensor(float), tensor(float16)| +|||7+|**T** = tensor(float), tensor(float16)| +|GlobalAveragePool|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|GlobalLpPool|*in* X:**T**
*out* Y:**T**|2+|**T** = tensor(float), tensor(float16)| +|GlobalMaxPool|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|Greater|*in* A:**T**
*in* B:**T**
*out* C:**T1**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||7+|**T** = tensor(float), tensor(float16)
**T1** = tensor(bool)| +|GreaterOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|GroupNorm||21+|**M** = tensor(float), tensor(float16)
**T** = tensor(float), tensor(float16)| +|HardSigmoid|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)| +|Hardmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|Identity|*in* input:**T**
*out* output:**T**

or

*in* input:**V**
*out* output:**V**|21+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||19+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||16+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||14+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|If|*in* cond:**B**
*out* outputs:**V**|19+|**B** = tensor(bool)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||16+|**B** = tensor(bool)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**B** = tensor(bool)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**B** = tensor(bool)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||7+|**B** = tensor(bool)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|ImageScaler|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|InstanceNormalization|*in* input:**T**
*in* scale:**T**
*in* B:**T**
*out* output:**T**|6+|**T** = tensor(float), tensor(float16)| +|IsInf|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(float)
**T2** = tensor(bool)| +|||10+|**T1** = tensor(float)
**T2** = tensor(bool)| +|IsNaN|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(bool)| +|||13+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(bool)| +|||9+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(bool)| +|LRN|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|LSTM|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*in* initial_c:**T**
*in* P:**T**
*out* Y:**T**
*out* Y_h:**T**
*out* Y_c:**T**|14+|**T** = tensor(float), tensor(float16)| +|||7+|**T** = tensor(float), tensor(float16)| +|LayerNormalization|*in* X:**T**
*in* Scale:**T**
*in* B:**T**
*out* Y:**T**
*out* Mean:**U**
*out* InvStdDev:**U**

or

*in* X:**T**
*in* Scale:**V**
*in* B:**V**
*out* Y:**V**
*out* Mean:**U**
*out* InvStdDev:**U**|17+|**T** = tensor(float), tensor(float16)
**U** = tensor(float)| +|||1+|**T** = tensor(float), tensor(float16)
**U** = tensor(float), tensor(float16)
**V** = tensor(float), tensor(float16)| +|LeakyRelu|*in* X:**T**
*out* Y:**T**|16+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Less|*in* A:**T**
*in* B:**T**
*out* C:**T1**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||7+|**T** = tensor(float), tensor(float16)
**T1** = tensor(bool)| +|LessOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|Log|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|LogSoftmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|LpNormalization|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|LpPool|*in* X:**T**
*out* Y:**T**|18+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||2+|**T** = tensor(float), tensor(float16)| +|MatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| +|||9+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|MatMulInteger|*in* A:**T1**
*in* B:**T2**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*out* Y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int32)| +|Max|*in* data_0:**T**
*out* max:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||8+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|MaxPool|*in* X:**T**
*out* Y:**T**

or

*in* X:**T**
*out* Y:**T**
*out* Indices:**I**|12+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| +|||11+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| +|||10+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| +|||8+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| +|||1+|**T** = tensor(float), tensor(float16)| +|MaxRoiPool|*in* X:**T**
*in* rois:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|MaxUnpool|*in* X:**T1**
*in* I:**T2**
*in* output_shape:**T2**
*out* output:**T1**|11+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int64)| +|||9+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int64)| +|Mean|*in* data_0:**T**
*out* mean:**T**|13+|**T** = tensor(float), tensor(float16)| +|||8+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|MeanVarianceNormalization|*in* X:**T**
*out* Y:**T**

or

*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| +|||9+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|MemcpyFromHost|*in* X:**T**
*out* Y:**T**|1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)| +|MemcpyToHost|*in* X:**T**
*out* Y:**T**|1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)| +|Min|*in* data_0:**T**
*out* min:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||8+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Mod|*in* A:**T**
*in* B:**T**
*out* C:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| +|||10+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| +|Mul|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Neg|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)| +|||6+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8)| +|NonZero|*in* X:**T**
*out* Y:**tensor(int64)**|13+|**T** = tensor(bool), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| +|||9+|**T** = tensor(bool), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| +|Not|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(bool)| +|OneHot|*in* indices:**T1**
*in* depth:**T2**
*in* values:**T3**
*out* output:**T3**|11+|**T1** = tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T3** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||9+|**T1** = tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T3** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|OptionalGetElement|*in* input:**O**
*out* output:**V**|18+|**O** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||15+|**O** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8))
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|OptionalHasElement|*in* input:**O**
*out* output:**B**|18+|**B** = tensor(bool)
**O** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||15+|**B** = tensor(bool)
**O** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8))| +|Or|*in* A:**T**
*in* B:**T**
*out* C:**T1**|7+|**T** = tensor(bool)| +|PRelu|*in* X:**T**
*in* slope:**T**
*out* Y:**T**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| +|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| +|||7+|**T** = tensor(float), tensor(float16)| +|Pad|*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*in* axes:**Tind**
*out* output:**T**

or

*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||2+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|ParametricSoftplus|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|Pow|*in* X:**T**
*in* Y:**T**
*out* Z:**T**

or

*in* X:**T**
*in* Y:**T1**
*out* Z:**T**|15+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| +|||7+|**T** = tensor(float), tensor(float16)| +|QLinearConv|*in* x:**T1**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T1**
*in* w:**T2**
*in* w_scale:**tensor(float)**
*in* w_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*in* B:**T4**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)
**T4** = tensor(int32)| +|QLinearMatMul|*in* a:**T1**
*in* a_scale:**TS**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**TS**
*in* b_zero_point:**T2**
*in* y_scale:**TS**
*in* y_zero_point:**T3**
*out* y:**T3**

or

*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|21+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)| +|||10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)| +|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**

or

*in* x:**T1**
*in* y_scale:**T2**
*in* y_zero_point:**T3**
*out* y:**T3**

or

*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)| +|||19+|**T1** = tensor(float), tensor(float16), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| +|||13+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| +|||10+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| +|RNN|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*out* Y:**T**
*out* Y_h:**T**|14+|**T** = tensor(float), tensor(float16)| +|||7+|**T** = tensor(float), tensor(float16)| +|Range|*in* start:**T**
*in* limit:**T**
*in* delta:**T**
*out* output:**T**|11+|**T** = tensor(float), tensor(int16), tensor(int32), tensor(int64)| +|Reciprocal|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|ReduceL1|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||1+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|ReduceL2|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|ReduceLogSum|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|ReduceLogSumExp|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|ReduceMax|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|ReduceMean|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|ReduceMin|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|ReduceProd|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||1+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|ReduceSum|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||1+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|ReduceSumSquare|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|||1+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| +|Relu|*in* X:**T**
*out* Y:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| +|||13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Reshape|*in* data:**T**
*in* shape:**tensor(int64)**
*out* reshaped:**T**

or

*in* data:**T**
*out* reshaped:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||5+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Resize|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T1**
*in* roi:**T2**
*in* scales:**tensor(float)**
*in* sizes:**tensor(int64)**
*out* Y:**T1**|19+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||18+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||13+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||11+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||10+|**T** = tensor(float), tensor(float16)| +|ReverseSequence|*in* input:**T**
*in* sequence_lens:**tensor(int64)**
*out* Y:**T**|10+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|RoiAlign|*in* X:**T1**
*in* rois:**T1**
*in* batch_indices:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| +|||10+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| +|Round|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| +|STFT|*in* signal:**T1**
*in* frame_step:**T2**
*in* window:**T1**
*in* frame_length:**T2**
*out* output:**T1**|17+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)| +|ScaledTanh|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|Scatter|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||9+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|ScatterElements|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|16+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|ScatterND|*in* data:**T**
*in* indices:**tensor(int64)**
*in* updates:**T**
*out* output:**T**|16+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Selu|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)| +|SequenceAt|*in* input_sequence:**S**
*in* position:**I**
*out* tensor:**T**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))
**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|SequenceConstruct|*in* inputs:**T**
*out* output_sequence:**S**|11+|**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))
**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|SequenceEmpty|*out* output:**S**|11+|**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| +|SequenceErase|*in* input_sequence:**S**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| +|SequenceInsert|*in* input_sequence:**S**
*in* tensor:**T**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| +|SequenceLength|*in* input_sequence:**S**
*out* length:**I**|11+|**I** = tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| +|Shape|*in* data:**T**
*out* shape:**T1**|21+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||19+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||15+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|Shrink|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| +|Sigmoid|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Sign|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|SimplifiedLayerNormalization|*in* X:**T**
*in* scale:**V**
*out* Y:**V**
*out* inv_std_var:**U**|1+|**T** = tensor(float), tensor(float16)
**U** = tensor(float), tensor(float16)
**V** = tensor(float), tensor(float16)| +|Sin|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| +|Sinh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| +|Size|*in* data:**T**
*out* size:**T1**|21+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||19+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|Slice|*in* data:**T**
*in* starts:**Tind**
*in* ends:**Tind**
*in* axes:**Tind**
*in* steps:**Tind**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||10+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Softmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|Softplus|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|Softsign|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|SpaceToDepth|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Split|*in* input:**T**
*in* split:**T**
*out* outputs...:**T**

or

*in* input:**T**
*in* split:**tensor(int64)**
*out* outputs:**T**

or

*in* input:**T**
*out* outputs:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||2+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Sqrt|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**

or

*in* data:**T**
*out* squeezed:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Sub|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Sum|*in* data_0:**T**
*out* sum:**T**|13+|**T** = tensor(float), tensor(float16)| +|||8+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|Tan|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| +|Tanh|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| +|||6+|**T** = tensor(float), tensor(float16)| +|ThresholdedRelu|*in* X:**T**
*out* Y:**T**|10+|**T** = tensor(float), tensor(float16)| +|||1+|**T** = tensor(float), tensor(float16)| +|Tile|*in* input:**T**
*in* repeats:**T1**
*out* output:**T**

or

*in* input:**T**
*in* tiles:**T**
*in* axis:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||6+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|TopK|*in* X:**T**
*in* K:**tensor(int64)**
*out* Values:**T**
*out* Indices:**I**

or

*in* X:**T**
*out* Values:**T**
*out* Indices:**I**|11+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||10+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Transpose|*in* data:**T**
*out* transposed:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Trilu|*in* input:**T**
*in* k:**tensor(int64)**
*out* output:**T**|14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Unsqueeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* expanded:**T**

or

*in* data:**T**
*out* expanded:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Upsample|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T**
*out* Y:**T**|10+|**T** = tensor(float), tensor(float16)| +|||9+|**T** = tensor(float), tensor(float16)| +|||7+|**T** = tensor(float), tensor(float16)| +|Where|*in* condition:**B**
*in* X:**T**
*in* Y:**T**
*out* output:**T**|16+|**B** = tensor(bool)
**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||9+|**B** = tensor(bool)
**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Xor|*in* A:**T**
*in* B:**T**
*out* C:**T1**|7+|**T** = tensor(bool)| +| | +| | +|**Operator Domain:** *com.microsoft*|||| +|Attention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* mask_index:**M**
*in* past:**T**
*in* attention_bias:**T**
*in* past_sequence_length:**M**
*out* output:**T**
*out* present:**T**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| +|BiasAdd|*in* X:**T**
*in* bias:**T**
*in* skip:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|BiasGelu|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(float), tensor(float16)| +|BiasSplitGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|ConvTransposeWithDynamicPads|*in* X:**T**
*in* W:**T**
*in* Pads:**tensor(int64)**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|DequantizeLinear|*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|1+|**T1** = tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|DynamicQuantizeMatMul|*in* A:**T1**
*in* B:**T2**
*in* b_scale:**T1**
*in* b_zero_point:**T2**
*in* bias:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| +|EmbedLayerNormalization|*in* input_ids:**T1**
*in* segment_ids:**T1**
*in* word_embedding:**T**
*in* position_embedding:**T**
*in* segment_embedding:**T**
*in* gamma:**T**
*in* beta:**T**
*in* mask:**T1**
*in* position_ids:**T1**
*out* output:**T**
*out* mask_index:**T1**
*out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)| +|FastGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|FusedMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|FusedMatMulActivation|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|Gelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|GroupNorm|*in* X:**T**
*in* gamma:**M**
*in* beta:**M**
*out* Y:**T**|1+|**M** = tensor(float), tensor(float16)
**T** = tensor(float), tensor(float16)| +|GroupQueryAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* past_key:**T_CACHE**
*in* past_value:**T_CACHE**
*in* seqlens_k:**M**
*in* total_sequence_length:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*in* position_ids:**tensor(int64)**
*in* attention_bias:**T**
*in* head_sink:**T**
*in* k_scale:**T_KV_SCALE**
*in* v_scale:**T_KV_SCALE**
*out* output:**T**
*out* present_key:**T_CACHE**
*out* present_value:**T_CACHE**
*out* output_qk:**T**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| +|MatMulIntegerToFloat|*in* A:**T1**
*in* B:**T2**
*in* a_scale:**T3**
*in* b_scale:**T3**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*in* bias:**T3**
*out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float), tensor(float16)| +|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T3**
*in* g_idx:**T4**
*in* bias:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| +|MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* attention_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*in* past_sequence_length:**M**
*in* cache_indirection:**M**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**
*out* qk:**QK**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| +|NhwcConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|QAttention|*in* input:**T1**
*in* weight:**T2**
*in* bias:**T3**
*in* input_scale:**T3**
*in* weight_scale:**T3**
*in* mask_index:**T4**
*in* input_zero_point:**T1**
*in* weight_zero_point:**T2**
*in* past:**T3**
*out* output:**T3**
*out* present:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float), tensor(float16)
**T4** = tensor(int32)| +|QLinearAdd|*in* A:**T**
*in* A_scale:**tensor(float)**
*in* A_zero_point:**T**
*in* B:**T**
*in* B_scale:**tensor(float)**
*in* B_zero_point:**T**
*in* C_scale:**tensor(float)**
*in* C_zero_point:**T**
*out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)| +|QLinearAveragePool|*in* X:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| +|QLinearConcat|*in* Y_scale:**TF**
*in* Y_zero_point:**T8**
*in* inputs:**TV**
*out* Y:**T8**|1+|**T8** = tensor(int8), tensor(uint8)
**TF** = tensor(float)
**TV** = tensor(float), tensor(int8), tensor(uint8)| +|QLinearGlobalAveragePool|*in* X:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| +|QLinearSigmoid|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* X_zero_point:**T**
*in* Y_scale:**tensor(float)**
*in* Y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| +|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**|1+|**T1** = tensor(float), tensor(float16), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| +|QuickGelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|RotaryEmbedding|*in* input:**T**
*in* position_ids:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*out* output:**T**|1+|**M** = tensor(int64)
**T** = tensor(float), tensor(float16)| +|SkipLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* beta:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)| +|SkipSimplifiedLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)| +| | +| | +|**Operator Domain:** *com.microsoft.dml*|||| +|DmlFusedAdd|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(float), tensor(float16)| +|DmlFusedBatchNormalization|*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* mean:**T**
*in* var:**T**
*out* Y:**T**
*out* mean:**T**
*out* var:**T**
*out* saved_mean:**T**
*out* saved_var:**T**|1+|**T** = tensor(float), tensor(float16)| +|DmlFusedConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|DmlFusedConvTranspose|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|DmlFusedGemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|DmlFusedInstanceNormalization|*in* input:**T**
*in* scale:**T**
*in* B:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|DmlFusedMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|DmlFusedMeanVarianceNormalization|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|DmlFusedSum|*in* data_0:**T**
*out* sum:**T**|1+|**T** = tensor(float), tensor(float16)| +| | +| | diff --git a/onnxruntime/core/providers/cpu/math/cumprod.cc b/onnxruntime/core/providers/cpu/math/cumprod.cc index a37bbe32a9c7a..2a7e63a68e9b3 100644 --- a/onnxruntime/core/providers/cpu/math/cumprod.cc +++ b/onnxruntime/core/providers/cpu/math/cumprod.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include +#include #include "cumprod.h" #include "core/providers/common.h" @@ -18,8 +19,9 @@ Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out) if (!axis_tensor) return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor must be provided to the CumProd op"); - if (axis_tensor->Shape().NumDimensions() > 1) - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be 0D or 1D"); + if (axis_tensor->Shape().NumDimensions() > 1 || axis_tensor->Shape().Size() != 1) + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Axis tensor must be a scalar (0-D) or 1-D tensor with exactly one element"); if (axis_tensor->IsDataType()) { axis_out = static_cast(axis_tensor->Data()[0]); diff --git a/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc b/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc index e932438f6a914..e447f66e07635 100644 --- a/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc +++ b/onnxruntime/core/providers/cpu/tensor/bitcast_op.cc @@ -54,6 +54,8 @@ Status BitCast::Compute(OpKernelContext* context) const { const size_t input_element_size = input->DataType()->Size(); const auto* output_type = DataTypeImpl::TensorTypeFromONNXEnum(to_); + ORT_RETURN_IF_NOT(output_type != nullptr, + "BitCast: unsupported target type (ONNX enum value: ", to_, ")."); const size_t output_element_size = output_type->GetElementType()->Size(); ORT_RETURN_IF_NOT(input_element_size == output_element_size, From 8ff65796b9c4de7e5eebf44a226e4db856f9a69e Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 11 Mar 2026 17:17:50 +0000 Subject: [PATCH 06/18] Fix 
MakeInitializer shape/data mismatch in ReshapeFusion test ONNX 1.21.0 (onnx/onnx#7675) added stricter raw_data size validation in ParseData. The test had shape {4} but only 3 values {2, 64, 32}, which old ONNX silently ignored. Fix shape to {3}. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- onnxruntime/test/optimizer/graph_transform_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index d7780da36626c..356ac1be7d34e 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -4355,7 +4355,7 @@ TEST_F(GraphTransformationTests, ReshapeFusion_Contiguous_Reshape) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput({{8, 16, 32}}); auto* shape_initializer_1 = builder.MakeInitializer({4}, {2, 4, 16, 32}); - auto* shape_initializer_2 = builder.MakeInitializer({4}, {2, 64, 32}); + auto* shape_initializer_2 = builder.MakeInitializer({3}, {2, 64, 32}); auto* axes_initializer = builder.MakeInitializer({1}, {1}); auto* reshape_out_1 = builder.MakeIntermediate(); auto* reshape_out_2 = builder.MakeIntermediate(); From 29744dae8b95f0f91911a0f4167f2de92942823b Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 11 Mar 2026 17:38:18 +0000 Subject: [PATCH 07/18] webgl-operators.md update --- js/web/docs/webgl-operators.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/js/web/docs/webgl-operators.md b/js/web/docs/webgl-operators.md index 8e55d8c4e2564..98a79de61727c 100644 --- a/js/web/docs/webgl-operators.md +++ b/js/web/docs/webgl-operators.md @@ -24,6 +24,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [AveragePool](https://github.com/onnx/onnx/blob/main/docs/Operators.md#AveragePool) | [7-9](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#AveragePool-7), 
[10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#AveragePool-10), [11-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#AveragePool-11), [19-21](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#AveragePool-19), [22+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#AveragePool-22) | | [BatchNormalization](https://github.com/onnx/onnx/blob/main/docs/Operators.md#BatchNormalization) | [7-8](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#BatchNormalization-7), [9-13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#BatchNormalization-9), [14](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#BatchNormalization-14), [15+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#BatchNormalization-15) | | [Bernoulli](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Bernoulli) | | +| [BitCast](https://github.com/onnx/onnx/blob/main/docs/Operators.md#BitCast) | | | [BitShift](https://github.com/onnx/onnx/blob/main/docs/Operators.md#BitShift) | | | [BitwiseAnd](https://github.com/onnx/onnx/blob/main/docs/Operators.md#BitwiseAnd) | | | [BitwiseNot](https://github.com/onnx/onnx/blob/main/docs/Operators.md#BitwiseNot) | | @@ -47,6 +48,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [ConvTranspose](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ConvTranspose) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ConvTranspose-1), [11-21](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ConvTranspose-11), [22+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ConvTranspose-22) | | [Cos](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Cos) | [7-21](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cos-7), [22+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cos-22) | | [Cosh](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Cosh) | | +| 
[CumProd](https://github.com/onnx/onnx/blob/main/docs/Operators.md#CumProd) | | | [CumSum](https://github.com/onnx/onnx/blob/main/docs/Operators.md#CumSum) | | | [DFT](https://github.com/onnx/onnx/blob/main/docs/Operators.md#DFT) | | | [DeformConv](https://github.com/onnx/onnx/blob/main/docs/Operators.md#DeformConv) | | From a1b929c67a1e233e52bb73b403a8f41c446ed3c9 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 12 Mar 2026 19:56:27 +0000 Subject: [PATCH 08/18] Update ONNX to 1.21.0rc2 and address PR review comments - Update ONNX submodule, deps.txt, vcpkg portfile to rc2 commit a51ac075 - Update onnx==1.21.0rc2 in all 7 requirements.txt files - Fix cumprod.cc review comments (namespace, ORT_ENFORCE, type, closing brace) - Add 5 test exclusions: 4 DFT rfft/irfft tests (ORT lacks IRFFT) + 1 BitCast bool test Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/deps.txt | 2 +- cmake/external/onnx | 2 +- cmake/vcpkg-ports/onnx/portfile.cmake | 4 ++-- .../core/providers/cpu/math/cumprod.cc | 20 ++++++------------- onnxruntime/test/python/requirements.txt | 2 +- .../onnx_backend_test_series_filters.jsonc | 11 +++++++++- .../python/cpu/scripts/requirements.txt | 2 +- .../docker/scripts/lort/requirements.txt | 2 +- .../docker/scripts/manylinux/requirements.txt | 2 +- .../linux/docker/scripts/requirements.txt | 2 +- .../github/linux/python/requirements.txt | 2 +- .../github/windows/python/requirements.txt | 2 +- 12 files changed, 27 insertions(+), 26 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 2d7196646434f..9ee8dc85a4ffc 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/fbbe45b8e25b5b0018cc038caaf906d3b09634ee.zip;c38208d94ec0dd799a8468ac72f6058f74d44830 +onnx;https://github.com/onnx/onnx/archive/a51ac0754e0f61d3a4fa70a3821aeaeb740ac7a5.zip;9f250d23582b974a1bd5119ada7306a298bc6411 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnx b/cmake/external/onnx index fbbe45b8e25b5..a51ac0754e0f6 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit fbbe45b8e25b5b0018cc038caaf906d3b09634ee +Subproject commit a51ac0754e0f61d3a4fa70a3821aeaeb740ac7a5 diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index ce95fa0e4535e..b411799a647a4 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -3,8 +3,8 @@ vcpkg_check_linkage(ONLY_STATIC_LIBRARY) vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx - REF fbbe45b8e25b5b0018cc038caaf906d3b09634ee - SHA512 971a71b6d0fdb96270f82851c6a5940cc1c34d224247b678033ba179ffd8cc7bfecf59b235d013a0b94d089bd7d6fe46d01b2d6f5056bdb9fdff98fba0cc4e27 + REF a51ac0754e0f61d3a4fa70a3821aeaeb740ac7a5 + SHA512 e29c75cf22fea46f659f03cff470e0aecc22fbe3ee4baf082208371a192c6fa58c2080391d4959b95db12b239eb329e215020c85a40be756a2a44fa5b375cb6f PATCHES fix-cmakelists.patch fix-dependency-protobuf.patch diff --git a/onnxruntime/core/providers/cpu/math/cumprod.cc b/onnxruntime/core/providers/cpu/math/cumprod.cc index 2a7e63a68e9b3..3c0646b3d6b9a 100644 --- a/onnxruntime/core/providers/cpu/math/cumprod.cc +++ b/onnxruntime/core/providers/cpu/math/cumprod.cc @@ -10,8 +10,6 
@@ #include "core/framework/op_kernel.h" #include "core/framework/tensorprotoutils.h" -using namespace onnxruntime; - namespace onnxruntime { namespace cumprod_op { @@ -104,27 +102,21 @@ CumProd::CumProd(const OpKernelInfo& info) : OpKernel(info), exclusive_(), re int64_t exclusive = 0; auto status = info.GetAttr("exclusive", &exclusive); if (status.IsOK()) { - if (exclusive == 1 || exclusive == 0) { - exclusive_ = exclusive; - } else { - ORT_ENFORCE(false, "attribute exclusive can only be 0 or 1"); - } + ORT_ENFORCE(exclusive == 0 || exclusive == 1, "exclusive attribute must be 0 or 1, got: ", exclusive); + exclusive_ = exclusive; } int64_t reverse = 0; status = info.GetAttr("reverse", &reverse); if (status.IsOK()) { - if (reverse == 1 || reverse == 0) { - reverse_ = reverse; - } else { - ORT_ENFORCE(false, "attribute reverse can only be 0 or 1"); - } + ORT_ENFORCE(reverse == 0 || reverse == 1, "reverse attribute must be 0 or 1, got: ", reverse); + reverse_ = reverse; } } template Status CumProd::Compute(OpKernelContext* ctx) const { const Tensor* input = ctx->Input(0); - size_t rank = input->Shape().NumDimensions(); + int64_t rank = static_cast(input->Shape().NumDimensions()); if (rank == 0) return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot apply CumProd operator on a scalar"); @@ -221,4 +213,4 @@ Status CumProd::Compute(OpKernelContext* ctx) const { return Status::OK(); } -}; // namespace onnxruntime +} // namespace onnxruntime diff --git a/onnxruntime/test/python/requirements.txt b/onnxruntime/test/python/requirements.txt index c3f671ebf7de9..58a0214d4f0c8 100644 --- a/onnxruntime/test/python/requirements.txt +++ b/onnxruntime/test/python/requirements.txt @@ -1,3 +1,3 @@ -onnx==1.21.0rc1 +onnx==1.21.0rc2 pytest onnx-ir diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index b40d0bc67d3ed..262d5dfe797e6 100644 --- 
a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -379,7 +379,16 @@ "^test_quantizelinear_int4", "^test_quantizelinear_uint4", // topk uint64 is not implemented in ORT yet. - "^test_top_k_uint64" + "^test_top_k_uint64", + // ORT DFT kernel does not implement IRFFT (inverse real FFT) — it always outputs + // complex (last dim=2) which is wrong for IRFFT. These ONNX backend tests were + // added in onnx commit ee910d0e4 for DFT-20 spec clarification. + "^test_dft_rfft", + "^test_dft_irfft", + "^test_dft_rfft_opset19", + "^test_dft_irfft_opset19", + // ORT BitCast kernel does not support bool type. + "^test_bitcast_bool_to_uint8" ], "current_failing_tests_x86": [ "^test_vgg19", diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index d4511e31964ba..97a98c32f7aaf 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -7,4 +7,4 @@ wheel protobuf==4.25.8 sympy==1.14 flatbuffers -onnx==1.21.0rc1 +onnx==1.21.0rc2 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index c33fd1d102a5c..5428b0253a9d1 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.21.0rc1 +onnx==1.21.0rc2 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index cdd375d49f0b6..ea9fe01438082 100644 
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -9,4 +9,4 @@ sympy==1.14 flatbuffers neural-compressor>=2.2.1 triton==3.5.0 -onnx==1.21.0rc1 +onnx==1.21.0rc2 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 8628e81251eac..392956c4d0b21 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -12,4 +12,4 @@ protobuf==6.33.0 packaging onnxscript==0.6.2 onnx-ir==0.1.16 -onnx==1.21.0rc1 +onnx==1.21.0rc2 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 8b7af9d97a6b1..9f9028ea37459 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -12,4 +12,4 @@ onnxscript==0.6.2 onnx-ir==0.1.16 jinja2 markupsafe -onnx==1.21.0rc1 +onnx==1.21.0rc2 diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 110d3785d1369..5f1e3b9b19c0b 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -14,4 +14,4 @@ jinja2 markupsafe semver packaging -onnx==1.21.0rc1 +onnx==1.21.0rc2 From 54948d8c4a74f1436b9faee645b1a7b56b26c4da Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 12 Mar 2026 20:29:46 +0000 Subject: [PATCH 09/18] Add test_bitcast_bool_to_uint8 and DFT rfft/irfft to C++ test runner broken tests The JSONC filter only covers Python backend tests. The C++ onnx_test_runner uses hardcoded arrays in TestCase.cc GetBrokenTests(). Add BitCast bool and DFT rfft/irfft filters to cover the C++ test runner path. ORT BitCast kernel doesn't register bool type, and ORT DFT kernel lacks IRFFT (inverse real FFT) support. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Agent-signed-off: Developer (45720d0d) [claude-opus-4.6] Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- onnxruntime/test/onnx/TestCase.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index fbb9fb2797a88..e9758f1071c85 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1462,6 +1462,12 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded", "unknown version"}); broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal_expanded", "unknown version"}); broken_tests->insert({"convinteger_with_padding", "unknown version"}); + // Fails since ONNX==1.21.0 + broken_tests->insert({"dft_irfft", "unknown version"}); + broken_tests->insert({"dft_irfft_opset19", "unknown version"}); + broken_tests->insert({"dft_rfft", "unknown version"}); + broken_tests->insert({"dft_rfft_opset19", "unknown version"}); + broken_tests->insert({"bitcast_bool_to_uint8", "ORT BitCast kernel does not register bool type"}); } #ifdef DISABLE_CONTRIB_OPS From aed5d3d154a1a3a710c1edc2596a1ddceb2f0ee4 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 12 Mar 2026 22:00:56 +0000 Subject: [PATCH 10/18] Patch ONNX Slice shape inference for dim_value==0 UB ONNX 1.21.0rc2 enables _GLIBCXX_ASSERTIONS (onnx/onnx#7601) which exposes pre-existing undefined behavior in Slice shape inference: std::clamp(start, 0, dim_value-1) with dim_value=0 violates lo<=hi. Add early-exit guard for both opset 10 and 11 locations in old.cc. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/patches/onnx/onnx.patch | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index 0a5680778790b..e8e29f99a7cf1 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -82,3 +82,37 @@ index a6a8a83..153da87 100644 .SetDoc(GroupNormalization_ver18_doc) .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( +diff --git a/onnx/defs/tensor/old.cc b/onnx/defs/tensor/old.cc +index c2ba43b7c..c0f229c6e 100644 +--- a/onnx/defs/tensor/old.cc ++++ b/onnx/defs/tensor/old.cc +@@ -2632,6 +2632,16 @@ ONNX_OPERATOR_SET_SCHEMA( + + const auto input_dim_value = input_dim.dim_value(); + ++ // empty dimension: output is always 0 regardless of starts/ends/steps ++ if (input_dim_value == 0) { ++ ctx.getOutputType(0) ++ ->mutable_tensor_type() ++ ->mutable_shape() ++ ->mutable_dim(static_cast(axis)) ++ ->set_dim_value(0); ++ continue; ++ } ++ + // process step + auto step = steps[axis_index]; + if (step == 0) { +@@ -6315,6 +6325,12 @@ ONNX_OPERATOR_SET_SCHEMA( + + const auto input_dim_value = input_dim.dim_value(); + ++ // empty dimension: output is always 0 regardless of starts/ends/steps ++ if (input_dim_value == 0) { ++ ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->mutable_dim(axis)->set_dim_value(0); ++ continue; ++ } ++ + // process step + auto step = steps[axis_index]; + if (step == 0) { From 1d32aea331098fb959425c803f094becb1668f61 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 12 Mar 2026 22:03:06 +0000 Subject: [PATCH 11/18] Sync Slice dim_value==0 fix to vcpkg binskim.patch The onnx.patch fix must also be in binskim.patch for Windows CI builds. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/vcpkg-ports/onnx/binskim.patch | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index 0a5680778790b..e8e29f99a7cf1 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -82,3 +82,37 @@ index a6a8a83..153da87 100644 .SetDoc(GroupNormalization_ver18_doc) .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( +diff --git a/onnx/defs/tensor/old.cc b/onnx/defs/tensor/old.cc +index c2ba43b7c..c0f229c6e 100644 +--- a/onnx/defs/tensor/old.cc ++++ b/onnx/defs/tensor/old.cc +@@ -2632,6 +2632,16 @@ ONNX_OPERATOR_SET_SCHEMA( + + const auto input_dim_value = input_dim.dim_value(); + ++ // empty dimension: output is always 0 regardless of starts/ends/steps ++ if (input_dim_value == 0) { ++ ctx.getOutputType(0) ++ ->mutable_tensor_type() ++ ->mutable_shape() ++ ->mutable_dim(static_cast(axis)) ++ ->set_dim_value(0); ++ continue; ++ } ++ + // process step + auto step = steps[axis_index]; + if (step == 0) { +@@ -6315,6 +6325,12 @@ ONNX_OPERATOR_SET_SCHEMA( + + const auto input_dim_value = input_dim.dim_value(); + ++ // empty dimension: output is always 0 regardless of starts/ends/steps ++ if (input_dim_value == 0) { ++ ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->mutable_dim(axis)->set_dim_value(0); ++ continue; ++ } ++ + // process step + auto step = steps[axis_index]; + if (step == 0) { From b691c5250c70da2564062c456eec8f532f6fb34c Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 12 Mar 2026 22:27:26 +0000 Subject: [PATCH 12/18] Add Slice dim_value==0 fix for defs.cc (opset 13) Covers the third std::clamp UB location in processSliceInputs. All three sites now patched: old.cc:2646, old.cc:6329, defs.cc:792. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/patches/onnx/onnx.patch | 17 +++++++++++++++++ cmake/vcpkg-ports/onnx/binskim.patch | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index e8e29f99a7cf1..925669ae2d719 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -116,3 +116,20 @@ index c2ba43b7c..c0f229c6e 100644 // process step auto step = steps[axis_index]; if (step == 0) { +diff --git a/onnx/defs/tensor/defs.cc b/onnx/defs/tensor/defs.cc +index 4e0c5f913..a7cd6171a 100644 +--- a/onnx/defs/tensor/defs.cc ++++ b/onnx/defs/tensor/defs.cc +@@ -794,6 +794,12 @@ static void processSliceInputs(const int64_t input_rank, int64_t& start, int64_ + if (step == 0) { + fail_shape_inference("'step' cannot be 0 for Slice"); + } ++ // empty dimension: start=0, end=0 so caller computes output_dim=0 ++ if (input_rank == 0) { ++ start = 0; ++ end = 0; ++ return; ++ } + // process start + if (start < 0) + start += input_rank; diff --git a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index e8e29f99a7cf1..925669ae2d719 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -116,3 +116,20 @@ index c2ba43b7c..c0f229c6e 100644 // process step auto step = steps[axis_index]; if (step == 0) { +diff --git a/onnx/defs/tensor/defs.cc b/onnx/defs/tensor/defs.cc +index 4e0c5f913..a7cd6171a 100644 +--- a/onnx/defs/tensor/defs.cc ++++ b/onnx/defs/tensor/defs.cc +@@ -794,6 +794,12 @@ static void processSliceInputs(const int64_t input_rank, int64_t& start, int64_ + if (step == 0) { + fail_shape_inference("'step' cannot be 0 for Slice"); + } ++ // empty dimension: start=0, end=0 so caller computes output_dim=0 ++ if (input_rank == 0) { ++ start = 0; ++ end = 0; ++ return; ++ } + // process start + if (start < 0) + start += input_rank; From d05c769293517e37e95316adcebf49a055b4d61c Mon 
Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 16 Mar 2026 18:58:36 +0000 Subject: [PATCH 13/18] Update ONNX to 1.21.0rc3 - Update cmake/deps.txt: commit hash and SHA1 for rc3 zip - Update cmake/external/onnx submodule to rc3 commit (e6c12c5fa) - Update cmake/vcpkg-ports/onnx/portfile.cmake: REF and SHA512 - Update onnx==1.21.0rc3 in all 7 requirements.txt files - Verified all vcpkg patches (binskim, fix-cmakelists, fix-dependency-protobuf) and cmake/patches/onnx/onnx.patch apply cleanly to rc3 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Agent-signed-off: Developer (257e49bb) [claude-opus-4.6] Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/deps.txt | 2 +- cmake/external/onnx | 2 +- cmake/vcpkg-ports/onnx/portfile.cmake | 4 ++-- onnxruntime/test/python/requirements.txt | 2 +- .../inference/aarch64/python/cpu/scripts/requirements.txt | 2 +- .../github/linux/docker/scripts/lort/requirements.txt | 2 +- .../github/linux/docker/scripts/manylinux/requirements.txt | 2 +- tools/ci_build/github/linux/docker/scripts/requirements.txt | 2 +- tools/ci_build/github/linux/python/requirements.txt | 2 +- tools/ci_build/github/windows/python/requirements.txt | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 9ee8dc85a4ffc..b7d233d5b19a2 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/a51ac0754e0f61d3a4fa70a3821aeaeb740ac7a5.zip;9f250d23582b974a1bd5119ada7306a298bc6411 
+onnx;https://github.com/onnx/onnx/archive/e6c12c5fa7857729e081e2ec90f96dfefeb79b83.zip;de083cbccbd6e427e94deb263c63aa474059fd01 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnx b/cmake/external/onnx index a51ac0754e0f6..e6c12c5fa7857 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit a51ac0754e0f61d3a4fa70a3821aeaeb740ac7a5 +Subproject commit e6c12c5fa7857729e081e2ec90f96dfefeb79b83 diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index b411799a647a4..c590d8bc70b4f 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -3,8 +3,8 @@ vcpkg_check_linkage(ONLY_STATIC_LIBRARY) vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx - REF a51ac0754e0f61d3a4fa70a3821aeaeb740ac7a5 - SHA512 e29c75cf22fea46f659f03cff470e0aecc22fbe3ee4baf082208371a192c6fa58c2080391d4959b95db12b239eb329e215020c85a40be756a2a44fa5b375cb6f + REF e6c12c5fa7857729e081e2ec90f96dfefeb79b83 + SHA512 f54b7020486a80ab4942d9e6aa9f7c393ca16b6a51717c7bd81cfbaf3cd9cbf8c45281f3bf55388dcf16885d8f0b8ae5656d8b5a3af15757aad7fa2f372a2be2 PATCHES fix-cmakelists.patch fix-dependency-protobuf.patch diff --git a/onnxruntime/test/python/requirements.txt b/onnxruntime/test/python/requirements.txt index 58a0214d4f0c8..42c5f0c12450e 100644 --- a/onnxruntime/test/python/requirements.txt +++ b/onnxruntime/test/python/requirements.txt @@ -1,3 +1,3 @@ -onnx==1.21.0rc2 +onnx==1.21.0rc3 pytest onnx-ir diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index 
97a98c32f7aaf..cc94f92fb57d6 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -7,4 +7,4 @@ wheel protobuf==4.25.8 sympy==1.14 flatbuffers -onnx==1.21.0rc2 +onnx==1.21.0rc3 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index 5428b0253a9d1..0207f6a3cf6f1 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.21.0rc2 +onnx==1.21.0rc3 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index ea9fe01438082..8eaa5d326975a 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -9,4 +9,4 @@ sympy==1.14 flatbuffers neural-compressor>=2.2.1 triton==3.5.0 -onnx==1.21.0rc2 +onnx==1.21.0rc3 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 392956c4d0b21..c654621f00879 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -12,4 +12,4 @@ protobuf==6.33.0 packaging onnxscript==0.6.2 onnx-ir==0.1.16 -onnx==1.21.0rc2 +onnx==1.21.0rc3 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 9f9028ea37459..71642228fee80 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -12,4 
+12,4 @@ onnxscript==0.6.2 onnx-ir==0.1.16 jinja2 markupsafe -onnx==1.21.0rc2 +onnx==1.21.0rc3 diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 5f1e3b9b19c0b..eae136cf452bd 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -14,4 +14,4 @@ jinja2 markupsafe semver packaging -onnx==1.21.0rc2 +onnx==1.21.0rc3 From d43772b5972d74c8efa5140b49af4374ea4e96bd Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 16 Mar 2026 19:07:18 +0000 Subject: [PATCH 14/18] Remove Slice dim_value==0 patch hunks already in ONNX rc3 The Slice shape inference fix for dim_value==0 (tensor/old.cc and tensor/defs.cc) was cherry-picked into ONNX rc3 natively (commit 33afebf43, PR #7739). The parameter was also renamed from 'input_rank' to 'input_dim_size_or_value'. Remove these 3 hunks from both onnx.patch and binskim.patch to prevent build failures. Retained hunks: CMakeLists ONNX_MINIMAL_BUILD, Utils.cmake protobuf warnings, GroupNormalization Deprecate removal. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Agent-signed-off: Developer (257e49bb) [claude-opus-4.6] Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/patches/onnx/onnx.patch | 51 ---------------------------- cmake/vcpkg-ports/onnx/binskim.patch | 51 ---------------------------- 2 files changed, 102 deletions(-) diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index 925669ae2d719..0a5680778790b 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -82,54 +82,3 @@ index a6a8a83..153da87 100644 .SetDoc(GroupNormalization_ver18_doc) .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( -diff --git a/onnx/defs/tensor/old.cc b/onnx/defs/tensor/old.cc -index c2ba43b7c..c0f229c6e 100644 ---- a/onnx/defs/tensor/old.cc -+++ b/onnx/defs/tensor/old.cc -@@ -2632,6 +2632,16 @@ ONNX_OPERATOR_SET_SCHEMA( - - const auto input_dim_value = input_dim.dim_value(); - -+ // empty dimension: output is always 0 regardless of starts/ends/steps -+ if (input_dim_value == 0) { -+ ctx.getOutputType(0) -+ ->mutable_tensor_type() -+ ->mutable_shape() -+ ->mutable_dim(static_cast(axis)) -+ ->set_dim_value(0); -+ continue; -+ } -+ - // process step - auto step = steps[axis_index]; - if (step == 0) { -@@ -6315,6 +6325,12 @@ ONNX_OPERATOR_SET_SCHEMA( - - const auto input_dim_value = input_dim.dim_value(); - -+ // empty dimension: output is always 0 regardless of starts/ends/steps -+ if (input_dim_value == 0) { -+ ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->mutable_dim(axis)->set_dim_value(0); -+ continue; -+ } -+ - // process step - auto step = steps[axis_index]; - if (step == 0) { -diff --git a/onnx/defs/tensor/defs.cc b/onnx/defs/tensor/defs.cc -index 4e0c5f913..a7cd6171a 100644 ---- a/onnx/defs/tensor/defs.cc -+++ b/onnx/defs/tensor/defs.cc -@@ -794,6 +794,12 @@ static void processSliceInputs(const int64_t input_rank, 
int64_t& start, int64_ - if (step == 0) { - fail_shape_inference("'step' cannot be 0 for Slice"); - } -+ // empty dimension: start=0, end=0 so caller computes output_dim=0 -+ if (input_rank == 0) { -+ start = 0; -+ end = 0; -+ return; -+ } - // process start - if (start < 0) - start += input_rank; diff --git a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index 925669ae2d719..0a5680778790b 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -82,54 +82,3 @@ index a6a8a83..153da87 100644 .SetDoc(GroupNormalization_ver18_doc) .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( -diff --git a/onnx/defs/tensor/old.cc b/onnx/defs/tensor/old.cc -index c2ba43b7c..c0f229c6e 100644 ---- a/onnx/defs/tensor/old.cc -+++ b/onnx/defs/tensor/old.cc -@@ -2632,6 +2632,16 @@ ONNX_OPERATOR_SET_SCHEMA( - - const auto input_dim_value = input_dim.dim_value(); - -+ // empty dimension: output is always 0 regardless of starts/ends/steps -+ if (input_dim_value == 0) { -+ ctx.getOutputType(0) -+ ->mutable_tensor_type() -+ ->mutable_shape() -+ ->mutable_dim(static_cast(axis)) -+ ->set_dim_value(0); -+ continue; -+ } -+ - // process step - auto step = steps[axis_index]; - if (step == 0) { -@@ -6315,6 +6325,12 @@ ONNX_OPERATOR_SET_SCHEMA( - - const auto input_dim_value = input_dim.dim_value(); - -+ // empty dimension: output is always 0 regardless of starts/ends/steps -+ if (input_dim_value == 0) { -+ ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->mutable_dim(axis)->set_dim_value(0); -+ continue; -+ } -+ - // process step - auto step = steps[axis_index]; - if (step == 0) { -diff --git a/onnx/defs/tensor/defs.cc b/onnx/defs/tensor/defs.cc -index 4e0c5f913..a7cd6171a 100644 ---- a/onnx/defs/tensor/defs.cc -+++ b/onnx/defs/tensor/defs.cc -@@ -794,6 +794,12 @@ static void processSliceInputs(const int64_t input_rank, int64_t& start, int64_ - if (step == 0) { 
- fail_shape_inference("'step' cannot be 0 for Slice"); - } -+ // empty dimension: start=0, end=0 so caller computes output_dim=0 -+ if (input_rank == 0) { -+ start = 0; -+ end = 0; -+ return; -+ } - // process start - if (start < 0) - start += input_rank; From 29fcbe3d5d56750ed41dcd293b9f1cc4fc7251ca Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 16 Mar 2026 21:35:31 +0000 Subject: [PATCH 15/18] Parallelize CumProd outer loop with thread pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the 4 sequential outer loops (forward/reverse × exclusive/non-exclusive) with concurrency::ThreadPool::TryBatchParallelFor. Each outer iteration processes an independent slice, making them safe to parallelize. Refactored from sequential pointer arithmetic (input_iter++/output_iter++) to index-based access using base offset = outer * dim * lower_dim_size, which is required for parallel execution where iterations cannot share mutable iterators. Agent-signed-off: Developer (257e49bb) [claude-opus-4.6] Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/providers/cpu/math/cumprod.cc | 138 +++++++++++------- 1 file changed, 84 insertions(+), 54 deletions(-) diff --git a/onnxruntime/core/providers/cpu/math/cumprod.cc b/onnxruntime/core/providers/cpu/math/cumprod.cc index 3c0646b3d6b9a..f03b19a7fea0d 100644 --- a/onnxruntime/core/providers/cpu/math/cumprod.cc +++ b/onnxruntime/core/providers/cpu/math/cumprod.cc @@ -9,6 +9,7 @@ #include "core/providers/cpu/tensor/utils.h" #include "core/framework/op_kernel.h" #include "core/framework/tensorprotoutils.h" +#include "core/platform/threadpool.h" namespace onnxruntime { @@ -146,67 +147,96 @@ Status CumProd::Compute(OpKernelContext* ctx) const { const int64_t lower_dim_size = // sizes of the slices we can treat as 1D arrays std::accumulate(input_shape.begin() + axis + 1, input_shape.end(), static_cast(1), std::multiplies()); - if (!reverse_) { - const auto* 
input_iter = input->Data(); - auto* output_iter = output_tensor.MutableData(); - const auto* prev_output_iter = output_iter; + const T* input_data = input->Data(); + T* output_data = output_tensor.MutableData(); + const int64_t slice_size = dim * lower_dim_size; + auto* tp = ctx->GetOperatorThreadPool(); + if (!reverse_) { if (exclusive_) { - for (int64_t outer = 0; outer < upper_dim_count; outer++) { - prev_output_iter = output_iter; - for (int64_t inner = 0; inner < lower_dim_size; inner++) { - *(output_iter++) = static_cast(1); - } - for (int64_t cum_axis = 1; cum_axis < dim; cum_axis++) { - for (int64_t inner = 0; inner < lower_dim_size; inner++) { - *(output_iter++) = *(prev_output_iter++) * *(input_iter++); - } - } - input_iter += lower_dim_size; - } + concurrency::ThreadPool::TryBatchParallelFor( + tp, static_cast(upper_dim_count), + [&](ptrdiff_t outer) { + const int64_t base = outer * slice_size; + const T* in = input_data + base; + T* out = output_data + base; + + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + out[inner] = static_cast(1); + } + for (int64_t cum_axis = 1; cum_axis < dim; cum_axis++) { + const int64_t curr_offset = cum_axis * lower_dim_size; + const int64_t prev_offset = (cum_axis - 1) * lower_dim_size; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + out[curr_offset + inner] = out[prev_offset + inner] * in[prev_offset + inner]; + } + } + }, + 0); } else { - for (int64_t outer = 0; outer < upper_dim_count; outer++) { - prev_output_iter = output_iter; - for (int64_t inner = 0; inner < lower_dim_size; inner++) { - *(output_iter++) = *(input_iter++); - } - for (int64_t cum_axis = 1; cum_axis < dim; cum_axis++) { - for (int64_t inner = 0; inner < lower_dim_size; inner++) { - *(output_iter++) = *(prev_output_iter++) * *(input_iter++); - } - } - } + concurrency::ThreadPool::TryBatchParallelFor( + tp, static_cast(upper_dim_count), + [&](ptrdiff_t outer) { + const int64_t base = outer * slice_size; + const T* in = 
input_data + base; + T* out = output_data + base; + + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + out[inner] = in[inner]; + } + for (int64_t cum_axis = 1; cum_axis < dim; cum_axis++) { + const int64_t curr_offset = cum_axis * lower_dim_size; + const int64_t prev_offset = (cum_axis - 1) * lower_dim_size; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + out[curr_offset + inner] = out[prev_offset + inner] * in[curr_offset + inner]; + } + } + }, + 0); } } else { - const auto* input_iter = input->Data() + input->Shape().Size(); - auto* output_iter = output_tensor.MutableData() + output_shape.Size(); - const auto* prev_output_iter = output_iter; - if (exclusive_) { - for (int64_t outer = upper_dim_count - 1; outer >= 0; outer--) { - prev_output_iter = output_iter; - for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { - *(--output_iter) = static_cast(1); - } - for (int64_t cum_axis = dim - 1; cum_axis > 0; cum_axis--) { - for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { - *(--output_iter) = *(--prev_output_iter) * *(--input_iter); - } - } - input_iter -= lower_dim_size; - } + concurrency::ThreadPool::TryBatchParallelFor( + tp, static_cast(upper_dim_count), + [&](ptrdiff_t outer) { + const int64_t base = outer * slice_size; + const T* in = input_data + base; + T* out = output_data + base; + + const int64_t last_offset = (dim - 1) * lower_dim_size; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + out[last_offset + inner] = static_cast(1); + } + for (int64_t cum_axis = dim - 2; cum_axis >= 0; cum_axis--) { + const int64_t curr_offset = cum_axis * lower_dim_size; + const int64_t next_offset = (cum_axis + 1) * lower_dim_size; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + out[curr_offset + inner] = out[next_offset + inner] * in[next_offset + inner]; + } + } + }, + 0); } else { - for (int64_t outer = upper_dim_count - 1; outer >= 0; outer--) { - prev_output_iter = output_iter; - for (int64_t 
inner = lower_dim_size - 1; inner >= 0; inner--) { - *(--output_iter) = *(--input_iter); - } - for (int64_t cum_axis = dim - 1; cum_axis > 0; cum_axis--) { - for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { - *(--output_iter) = *(--prev_output_iter) * *(--input_iter); - } - } - } + concurrency::ThreadPool::TryBatchParallelFor( + tp, static_cast(upper_dim_count), + [&](ptrdiff_t outer) { + const int64_t base = outer * slice_size; + const T* in = input_data + base; + T* out = output_data + base; + + const int64_t last_offset = (dim - 1) * lower_dim_size; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + out[last_offset + inner] = in[last_offset + inner]; + } + for (int64_t cum_axis = dim - 2; cum_axis >= 0; cum_axis--) { + const int64_t curr_offset = cum_axis * lower_dim_size; + const int64_t next_offset = (cum_axis + 1) * lower_dim_size; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + out[curr_offset + inner] = out[next_offset + inner] * in[curr_offset + inner]; + } + } + }, + 0); } } From ffe98ecbb8260f419149bf104f4fcdc198692910 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 23 Mar 2026 20:06:17 +0000 Subject: [PATCH 16/18] Update ONNX to 1.21.0rc4 Update ONNX dependency from 1.21.0rc3 to 1.21.0rc4 (commit c751ddbce897). RC4 includes bug fixes (Slice SIGABRT on empty dimensions) and security hardening (ExternalDataInfo attribute injection). 
Changes: - cmake/deps.txt: Updated archive URL and SHA1 hash - cmake/external/onnx: Updated submodule to rc4 commit - cmake/vcpkg-ports/onnx/portfile.cmake: Updated REF and SHA512 - 7 requirements.txt files: onnx==1.21.0rc4 Agent-signed-off: Developer (dc55daf6) [claude-opus-4.6] Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/deps.txt | 2 +- cmake/external/onnx | 2 +- cmake/vcpkg-ports/onnx/portfile.cmake | 4 ++-- onnxruntime/test/python/requirements.txt | 2 +- .../inference/aarch64/python/cpu/scripts/requirements.txt | 2 +- .../github/linux/docker/scripts/lort/requirements.txt | 2 +- .../github/linux/docker/scripts/manylinux/requirements.txt | 2 +- tools/ci_build/github/linux/docker/scripts/requirements.txt | 2 +- tools/ci_build/github/linux/python/requirements.txt | 2 +- tools/ci_build/github/windows/python/requirements.txt | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index b7d233d5b19a2..754151ea75d3b 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/e6c12c5fa7857729e081e2ec90f96dfefeb79b83.zip;de083cbccbd6e427e94deb263c63aa474059fd01 +onnx;https://github.com/onnx/onnx/archive/c751ddbce897302ab57802ec506a3ee0e41ae717.zip;0a7cdc4f43478098bbd44bf2e7a99a95d9e9c809 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnx b/cmake/external/onnx index e6c12c5fa7857..c751ddbce8973 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit e6c12c5fa7857729e081e2ec90f96dfefeb79b83 +Subproject commit c751ddbce897302ab57802ec506a3ee0e41ae717 diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index c590d8bc70b4f..cc59eddc6642c 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -3,8 +3,8 @@ vcpkg_check_linkage(ONLY_STATIC_LIBRARY) vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx - REF e6c12c5fa7857729e081e2ec90f96dfefeb79b83 - SHA512 f54b7020486a80ab4942d9e6aa9f7c393ca16b6a51717c7bd81cfbaf3cd9cbf8c45281f3bf55388dcf16885d8f0b8ae5656d8b5a3af15757aad7fa2f372a2be2 + REF c751ddbce897302ab57802ec506a3ee0e41ae717 + SHA512 b23c6ee83334b8b19db7106e18327546e0630f799f5b6355febf5d3c6d59eae133c9a7b1a2e6b35765ddde9478d32596af4c914891381b4018a82b665d59db64 PATCHES fix-cmakelists.patch fix-dependency-protobuf.patch diff --git a/onnxruntime/test/python/requirements.txt b/onnxruntime/test/python/requirements.txt index 42c5f0c12450e..361879108bebd 100644 --- a/onnxruntime/test/python/requirements.txt +++ b/onnxruntime/test/python/requirements.txt @@ -1,3 +1,3 @@ -onnx==1.21.0rc3 +onnx==1.21.0rc4 pytest onnx-ir diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index cc94f92fb57d6..39ceca68de935 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -7,4 +7,4 @@ wheel protobuf==4.25.8 sympy==1.14 flatbuffers -onnx==1.21.0rc3 +onnx==1.21.0rc4 diff 
--git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index 0207f6a3cf6f1..8c17b02dc25ea 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.21.0rc3 +onnx==1.21.0rc4 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 8eaa5d326975a..1a75a90ea6024 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -9,4 +9,4 @@ sympy==1.14 flatbuffers neural-compressor>=2.2.1 triton==3.5.0 -onnx==1.21.0rc3 +onnx==1.21.0rc4 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index c654621f00879..3228a052ae885 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -12,4 +12,4 @@ protobuf==6.33.0 packaging onnxscript==0.6.2 onnx-ir==0.1.16 -onnx==1.21.0rc3 +onnx==1.21.0rc4 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 71642228fee80..9eec8e52cc45d 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -12,4 +12,4 @@ onnxscript==0.6.2 onnx-ir==0.1.16 jinja2 markupsafe -onnx==1.21.0rc3 +onnx==1.21.0rc4 diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index eae136cf452bd..24639d52e7ebe 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ 
b/tools/ci_build/github/windows/python/requirements.txt @@ -14,4 +14,4 @@ jinja2 markupsafe semver packaging -onnx==1.21.0rc3 +onnx==1.21.0rc4 From ff7b87e3094fc837e708312757fb37c8cccaba46 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Mon, 30 Mar 2026 19:42:42 +0000 Subject: [PATCH 17/18] Update ONNX to official 1.21.0 release Update from rc4 to the official ONNX 1.21.0 release (tagged v1.21.0, commit be2b5fde82d9c8874f3d19328bdfe3b6962dc67b, March 27 2026). - cmake/deps.txt: commit hash + SHA1 - cmake/external/onnx: submodule to v1.21.0 - cmake/vcpkg-ports/onnx/portfile.cmake: REF + SHA512 - 7 requirements.txt files: onnx==1.21.0rc4 -> onnx==1.21.0 - All 4 patches verified to apply cleanly Agent-signed-off: Developer (257e49bb) [claude-opus-4.6] Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cmake/deps.txt | 2 +- cmake/external/onnx | 2 +- cmake/vcpkg-ports/onnx/portfile.cmake | 4 ++-- onnxruntime/test/python/requirements.txt | 2 +- .../inference/aarch64/python/cpu/scripts/requirements.txt | 2 +- .../github/linux/docker/scripts/lort/requirements.txt | 2 +- .../github/linux/docker/scripts/manylinux/requirements.txt | 2 +- tools/ci_build/github/linux/docker/scripts/requirements.txt | 2 +- tools/ci_build/github/linux/python/requirements.txt | 2 +- tools/ci_build/github/windows/python/requirements.txt | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 754151ea75d3b..95398b1a31f20 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 
-onnx;https://github.com/onnx/onnx/archive/c751ddbce897302ab57802ec506a3ee0e41ae717.zip;0a7cdc4f43478098bbd44bf2e7a99a95d9e9c809 +onnx;https://github.com/onnx/onnx/archive/be2b5fde82d9c8874f3d19328bdfe3b6962dc67b.zip;451dd6ad7ffafc76ab26c9508adaa91d935f10ba # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnx b/cmake/external/onnx index c751ddbce8973..be2b5fde82d9c 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit c751ddbce897302ab57802ec506a3ee0e41ae717 +Subproject commit be2b5fde82d9c8874f3d19328bdfe3b6962dc67b diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index cc59eddc6642c..2e372e048d8a1 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -3,8 +3,8 @@ vcpkg_check_linkage(ONLY_STATIC_LIBRARY) vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx - REF c751ddbce897302ab57802ec506a3ee0e41ae717 - SHA512 b23c6ee83334b8b19db7106e18327546e0630f799f5b6355febf5d3c6d59eae133c9a7b1a2e6b35765ddde9478d32596af4c914891381b4018a82b665d59db64 + REF be2b5fde82d9c8874f3d19328bdfe3b6962dc67b + SHA512 486eb73a6e66f0d39bde2e5fbbe69ef2f6db46573714224824ee854d9a553ff47d642eb0526cd173c651e5f30093fcb0f97b4e0441ad77818cf0bc98ae422ad7 PATCHES fix-cmakelists.patch fix-dependency-protobuf.patch diff --git a/onnxruntime/test/python/requirements.txt b/onnxruntime/test/python/requirements.txt index 361879108bebd..3ece2f39d4042 100644 --- a/onnxruntime/test/python/requirements.txt +++ b/onnxruntime/test/python/requirements.txt @@ -1,3 +1,3 @@ -onnx==1.21.0rc4 +onnx==1.21.0 pytest onnx-ir diff --git 
a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index 39ceca68de935..b4c2f163e22ac 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -7,4 +7,4 @@ wheel protobuf==4.25.8 sympy==1.14 flatbuffers -onnx==1.21.0rc4 +onnx==1.21.0 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index 8c17b02dc25ea..eb52681341012 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.21.0rc4 +onnx==1.21.0 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 1a75a90ea6024..9a0a6d0f51900 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -9,4 +9,4 @@ sympy==1.14 flatbuffers neural-compressor>=2.2.1 triton==3.5.0 -onnx==1.21.0rc4 +onnx==1.21.0 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 3228a052ae885..3d886832e1ccb 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -12,4 +12,4 @@ protobuf==6.33.0 packaging onnxscript==0.6.2 onnx-ir==0.1.16 -onnx==1.21.0rc4 +onnx==1.21.0 diff --git a/tools/ci_build/github/linux/python/requirements.txt 
b/tools/ci_build/github/linux/python/requirements.txt index 9eec8e52cc45d..bfe9ab0d8a508 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -12,4 +12,4 @@ onnxscript==0.6.2 onnx-ir==0.1.16 jinja2 markupsafe -onnx==1.21.0rc4 +onnx==1.21.0 diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 24639d52e7ebe..2dfba37c6f381 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -14,4 +14,4 @@ jinja2 markupsafe semver packaging -onnx==1.21.0rc4 +onnx==1.21.0 From ea5315e0ee4d14d3baef87423bed62aa60e14a11 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 2 Apr 2026 20:06:42 +0000 Subject: [PATCH 18/18] fix SHA --- cmake/deps.txt | 2 +- cmake/vcpkg-ports/onnx/portfile.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 95398b1a31f20..3ab9b81663f87 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/be2b5fde82d9c8874f3d19328bdfe3b6962dc67b.zip;451dd6ad7ffafc76ab26c9508adaa91d935f10ba +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.21.0.zip;321d4acc807c8e0fb0bbcc0424a143dffde1e846 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index 2e372e048d8a1..3450dcb2e80ce 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -3,8 +3,8 @@ vcpkg_check_linkage(ONLY_STATIC_LIBRARY) vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx - REF be2b5fde82d9c8874f3d19328bdfe3b6962dc67b - SHA512 486eb73a6e66f0d39bde2e5fbbe69ef2f6db46573714224824ee854d9a553ff47d642eb0526cd173c651e5f30093fcb0f97b4e0441ad77818cf0bc98ae422ad7 + REF "v${VERSION}" + SHA512 3cee4c0fbc9e260e360a62a59e324e0b127a5749f958e0704989b407a4c1179c637ef86e41a406e7868537a62a11a821e3433005eb0725f979145f8d514926bd PATCHES fix-cmakelists.patch fix-dependency-protobuf.patch