microsoft · tianleiwu · Feb 24, 2026 · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026
diff --git a/.github/actions/macos-ci-setup/action.yml b/.github/actions/macos-ci-setup/action.yml
@@ -8,7 +8,7 @@ inputs:
   python_version:
     required: false
     type: string
-    default: "3.11"
+    default: "3.14"
   node_version:
     required: false
     type: string

diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
@@ -68,6 +68,21 @@ jobs:
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
+  build-linux-x64-release-py314:
+    name: Build Linux x64 Release (Python 3.14)
+    uses: ./.github/workflows/reusable_linux_build.yml
+    with:
+      pool_name: "onnxruntime-github-Ubuntu2204-AMD-CPU"
+      build_config: Release
+      architecture: x64
+      dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
+      docker_image_repo: onnxruntimecpubuildpythonx64
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --build_nuget --enable_transformers_tool_test --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON'
+      python_path_prefix: 'PATH=/opt/python/cp314-cp314/bin:$PATH' # $ needs escaping in single quotes
+      job_identifier: build-linux-x64-release-py314
+    secrets:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
   orttraining-linux-ci-pipeline:
     name: Build Linux x64 Release with training
     uses: ./.github/workflows/reusable_linux_build.yml
@@ -109,7 +124,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
       docker_image_repo: onnxruntimecpubuildpythonaarch64
       extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON'
-      python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes
+      python_path_prefix: 'PATH=/opt/python/cp314-cp314/bin:$PATH' # $ needs escaping in single quotes
       job_identifier: build-linux-arm64-release
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
@@ -16,7 +16,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  python_version: 3.11
+  python_version: "3.14"
 
 jobs:
   cpu:
@@ -28,6 +28,7 @@ jobs:
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"},
           {"machine": "arm64", "target": "arm64", "build_config": "Release"}
         ]
+      python_version: "3.14"
 
   coreml:
     uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -39,6 +40,7 @@ jobs:
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"},
           {"machine": "arm64", "target": "arm64", "build_config": "Release"}
         ]
+      python_version: "3.14"
 
   xnnpack:
     uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -49,6 +51,7 @@ jobs:
         [
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"}
         ]
+      python_version: "3.14"
 
   webgpu:
     uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -60,6 +63,7 @@ jobs:
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"},
           {"machine": "arm64", "target": "arm64", "build_config": "Release"}
         ]
+      python_version: "3.14"
 
   iphone_simulator:
     runs-on: macos-15

diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -19,7 +19,7 @@ on:
       python_version:
         required: false
         type: string
-        default: "3.11"
+        default: "3.14"
       matrix_include:
         required: false
         type: string

diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
@@ -32,7 +32,7 @@ jobs:
 
       - uses: actions/setup-python@v6
         with:
-          python-version: '3.12'
+          python-version: '3.14'
           architecture: x64
 
       - name: Locate vcvarsall and Setup Env
@@ -173,7 +173,7 @@ jobs:
 
       - uses: actions/setup-python@v6
         with:
-          python-version: '3.12'
+          python-version: '3.14'
           architecture: x64
 
       - uses: actions/setup-node@v6

diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -46,7 +46,7 @@ protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/downlo
 protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef
 psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013
 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2
-pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f780292da9db273c8ef06ccf5fd4b623624143e9
+pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v3.0.2.zip;a064e663b4d7a337ac291d1bef7337ef4e60a1ae
 pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b
 re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88
 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac

diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
@@ -6,7 +6,6 @@ onnxruntime_fetchcontent_declare(
     URL ${DEP_URL_pybind11}
     URL_HASH SHA1=${DEP_SHA1_pybind11}
     EXCLUDE_FROM_ALL
-    FIND_PACKAGE_ARGS 2.13 NAMES pybind11
+    FIND_PACKAGE_ARGS 3.0 NAMES pybind11
 )
 onnxruntime_fetchcontent_makeavailable(pybind11_project)
-
diff --git a/cmake/vcpkg-ports/pybind11/portfile.cmake b/cmake/vcpkg-ports/pybind11/portfile.cmake
@@ -2,7 +2,8 @@ vcpkg_from_github(
     OUT_SOURCE_PATH SOURCE_PATH
     REPO pybind/pybind11
     REF "v${VERSION}"
-    SHA512 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a
+    # SHA512 for the zip (not tar.gz) file.
+    SHA512 786b1bf534ac67a8d5669f8babf67bb13e48b3a3da1b6344e43ae10a84b80bbc8fea5f12a65fd18739c341fefef5622c5dc096db964dff33cc62ea4259b2e2c1
     HEAD_REF master
 )
 

diff --git a/cmake/vcpkg-ports/pybind11/vcpkg.json b/cmake/vcpkg-ports/pybind11/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "pybind11",
-  "version": "2.13.6",
+  "version": "3.0.2",
   "description": "pybind11 is a lightweight header-only library that exposes C++ types in Python and vice versa, mainly to create Python bindings of existing C++ code",
   "homepage": "https://github.com/pybind/pybind11",
   "license": "BSD-3-Clause",

diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py
@@ -6,6 +6,7 @@
 PyTorch-ONNX exporter (torch.onnx.export).
 """
 
+import contextlib
 import typing
 
 try:
@@ -22,7 +23,7 @@
 _registered_ops: typing.AbstractSet[str] = set()
 
 
-def _reg(symbolic_fn: typing.Callable, namespace: str = ""):
+def _reg(symbolic_fn: typing.Callable, namespace: str = "aten"):
     name = f"{namespace}::{symbolic_fn.__name__}"
     torch.onnx.register_custom_op_symbolic(name, symbolic_fn, _OPSET_VERSION)
     _registered_ops.add(name)
@@ -49,13 +50,6 @@ def grid_sampler(g, input, grid, mode, padding_mode, align_corners):
         padding_mode_str = ["zeros", "border", "reflection"][padding_mode]
         align_corners = int(symbolic_helper._maybe_get_const(align_corners, "b"))
 
-        # From opset v13 onward, the output shape can be specified with
-        # (N, C, H, W) (N, H_out, W_out, 2) => (N, C, H_out, W_out)
-        # input_shape = input.type().sizes()
-        # gird_shape = grid.type().sizes()
-        # output_shape = input_shape[:2] + gird_shape[1:3]
-        # g.op(...).setType(input.type().with_sizes(output_shape))
-
         return g.op(
             "com.microsoft::GridSample",
             input,
@@ -71,15 +65,24 @@ def inverse(g, self):
         return g.op("com.microsoft::Inverse", self).setType(self.type())
 
     _reg(inverse)
+    torch.onnx.register_custom_op_symbolic("aten::linalg_inv", inverse, _OPSET_VERSION)
+    _registered_ops.add("aten::linalg_inv")
+
+    def gelu(g, self: torch._C.Value, approximate="none"):
+        # PyTorch can emit aten::gelu with or without the optional approximate arg.
+        if not isinstance(approximate, str):
+            approximate = symbolic_helper._maybe_get_const(approximate, "s")
 
-    @torch.onnx.symbolic_helper.parse_args("v", "s")
-    def gelu(g, self: torch._C.Value, approximate: str = "none"):
-        # Use microsoft::Gelu for performance if possible. It only supports approximate == "none"
+        # Use microsoft::Gelu for performance if possible. It only supports approximate == "none".
         if approximate == "none":
             return g.op("com.microsoft::Gelu", self).setType(self.type())
         return torch.onnx.symbolic_opset9.gelu(g, self, approximate)
 
     _reg(gelu)
+    # Some PyTorch versions dispatch GELU symbolic lookup by exporter opset.
+    # Registering across stable opsets keeps ORT Gelu fusion consistently enabled.
+    for opset in range(9, 21):
+        torch.onnx.register_custom_op_symbolic("aten::gelu", gelu, opset)
 
     def triu(g, self, diagonal):
         return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1).setType(self.type())
@@ -127,3 +130,8 @@ def unregister():
             for version in symbolic_helper._onnx_stable_opsets:
                 if version >= _OPSET_VERSION and symbolic_registry.is_registered_op(kind, namespace, version):
                     del symbolic_registry._registry[(namespace, version)][kind]
+
+    # Also clean up gelu's multi-opset registrations (see register()).
+    for opset in range(9, 21):
+        with contextlib.suppress(Exception):
+            torch.onnx.unregister_custom_op_symbolic("aten::gelu", opset)
diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py
@@ -1112,11 +1112,11 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node):
             if (
                 (mul_val is None)
                 or not (isinstance(mul_val, np.ndarray) and mul_val.size == 1)
-                or (float(mul_val) >= 0)
+                or (mul_val.item() >= 0)
             ):
                 return
-            if float(mul_val) != -10000:
-                self.mask_filter_value = float(mul_val)
+            if mul_val.item() != -10000:
+                self.mask_filter_value = mul_val.item()
 
         if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input:
             mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) if not is_no_mask_attention else None

diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py
@@ -290,6 +290,7 @@ def do_export_internal(model: nn.Module, onnx_io_tuple: tuple, onnx_inputs: tupl
             input_names=onnx_inp_names,
             output_names=onnx_out_names,
             dynamic_axes=onnx_dynamic_axes,
+            dynamo=False,
         )
 
         onnx_path.unlink(missing_ok=True)

diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py
@@ -473,7 +473,7 @@ def export_onnx(
                     input_names=input_names,
                     output_names=output_names,
                     dynamic_axes=dynamic_axes,
-                    opset_version=11,
+                    opset_version=14,
                     do_constant_folding=True,
                     use_external_data_format=True,
                     verbose=verbose,

diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
@@ -235,6 +235,7 @@ def run_torchscript_separate_export(
         opset_version=torch_export_onnx_opset_version,
         do_constant_folding=True,
         verbose=args.verbose,
+        dynamo=False,
     )
 
     # Check decoder_model.onnx and save all external data to one file
@@ -294,6 +295,7 @@ def run_torchscript_separate_export(
         opset_version=torch_export_onnx_opset_version,
         do_constant_folding=True,
         verbose=args.verbose,
+        dynamo=False,
     )
 
     # Check decoder_with_past_model.onnx and save all external data to one file

diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py
@@ -391,8 +391,9 @@ def export_onnx(
                 input_names=input_names,
                 output_names=output_names,
                 dynamic_axes=dynamic_axes,
-                opset_version=17,
+                opset_version=18,
                 do_constant_folding=True,
+                dynamo=False,
                 verbose=verbose,
             )
 

diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py
@@ -110,8 +110,9 @@ def export_onnx(
                 input_names=input_names,
                 output_names=output_names,
                 dynamic_axes=dynamic_axes,
-                opset_version=17,
+                opset_version=18,
                 do_constant_folding=True,
+                dynamo=False,
                 verbose=verbose,
             )
 

diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py
@@ -293,8 +293,9 @@ def export_onnx(
                 input_names=input_names,
                 output_names=output_names,
                 dynamic_axes=dynamic_axes,
-                opset_version=17,
+                opset_version=18,
                 do_constant_folding=True,
+                dynamo=False,
                 verbose=verbose,
             )
 

diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py
@@ -49,6 +49,7 @@ def torch_onnx_export(
             keep_initializers_as_inputs=keep_initializers_as_inputs,
             custom_opsets=custom_opsets,
             export_modules_as_functions=export_modules_as_functions,
+            dynamo=False,
         )
     else:
         torch.onnx.export(

diff --git a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py
@@ -59,6 +59,9 @@ def setUp(self):
         torch.manual_seed(0)
         pytorch_export_contrib_ops.register()
 
+    def tearDown(self):
+        pytorch_export_contrib_ops.unregister()
+
     def run_test(
         self,
         model,
@@ -101,6 +104,7 @@ def run_test(
                 input_names=input_names,
                 output_names=output_names,
                 custom_opsets=custom_opsets,
+                dynamo=False,
             )
 
             # compute onnxruntime output prediction
@@ -143,12 +147,13 @@ def test_gelu_is_fused_by_default(self):
             f,
             opset_version=self.opset_version,
             custom_opsets={"com.microsoft": 1},
+            dynamo=False,
         )
         f.seek(0)
         onnx_model = onnx.load(f)
-        node = onnx_model.graph.node[0]
-        self.assertEqual(node.op_type, "Gelu")
-        self.assertEqual(node.domain, "com.microsoft")
+        # Default GELU should be mapped to ORT contrib Gelu for performance.
+        gelu_nodes = [n for n in onnx_model.graph.node if n.op_type == "Gelu" and n.domain == "com.microsoft"]
+        self.assertEqual(len(gelu_nodes), 1)
 
     @parameterized.parameterized.expand([("default_approximate", "none"), ("tanh_approximate", "tanh")])
     @unittest.skipIf(_torch_version_lower_than("1.12"), "Gelu's approximate parameter unsupported in PyTorch < 1.12")
@@ -230,8 +235,8 @@ def forward(self, input):
 # IR version 4 style export.
 ONNXExporterTest_opset9_IRv4 = type(
     "TestONNXRuntime_opset9_IRv4",
-    (unittest.TestCase,),
-    dict(ONNXExporterTest.__dict__, keep_initializers_as_inputs=False),
+    (ONNXExporterTest,),
+    dict(keep_initializers_as_inputs=False),
 )
 
 

diff --git a/onnxruntime/test/python/transformers/parity_utilities.py b/onnxruntime/test/python/transformers/parity_utilities.py
@@ -92,6 +92,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, device):
         dynamic_axes=dynamic_axes,
         opset_version=11,
         do_constant_folding=True,
+        dynamo=False,
     )
     print("exported:", onnx_model_path)
 

diff --git a/onnxruntime/test/python/transformers/test_gelu_fusions.py b/onnxruntime/test/python/transformers/test_gelu_fusions.py
@@ -75,17 +75,22 @@ def test_fusions(self, test_case, dynamo):
         dummy_input = torch.ones(3, dtype=torch.float32)
         test_name = f"{operator}_{source}"
         onnx_path = f"{test_name}.onnx"
+
+        # With Torch 2.10+, torch.nn.functional.gelu(approximate="tanh") can export as a single Gelu node,
+        # which leaves nothing to fuse, so pin opset_version=18 (ONNX Gelu was only added in opset 20).
         torch.onnx.export(
             model,
             (dummy_input,),
             onnx_path,
             input_names=["input"],
             output_names=["output"],
-            dynamo=dynamo,
+            opset_version=18,
+            dynamo=False,
             optimize=True,  # Only meaningful when dynamo is True
         )
         optimizer = optimize_model(onnx_path, "bert")
         # optimizer.save_model_to_file(f"{operator}_{source}_opt.onnx")
+
         os.remove(onnx_path)
         # Remove the associated .data file (dynamo)
         data_path = onnx_path + ".data"