From ee355a9d592d841ee74703c69d166b9425aeb387 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 28 Jan 2026 12:08:23 -0800 Subject: [PATCH 1/4] [MLAS] Fix Data Race in MlasLutGemm by Serializing LUT Generation (#27179) ## Problem Description The `MatMulNBitsLutGemm.Float32_2Bits_Asymmetric_Batch32_256x256` test was exhibiting flaky behavior (failure rate ~2-20%) with numerical mismatches. Investigation revealed a **race condition** in the [GenerateLUT](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L326) step within [MlasLutGemm](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/inc/mlas_qnbit.h#L328). When the batch size `M > 1`, [MlasLutGemm](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/inc/mlas_qnbit.h#L328) attempted to parallelize the LUT generation over the batch dimension using `MlasTrySimpleParallel`. However, the underlying [GenerateLUT](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L326) implementation (specifically shared usage of `lut_scales`/`lut_biases` or internal buffers) is not thread-safe for concurrent execution on the same destination buffers or related state. This led to corruption of the Look-Up Tables or scales, causing random output errors. ## Solution This PR modifies [onnxruntime/core/mlas/lib/qlutgemm.cpp](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/qlutgemm.cpp) to **serialize the [GenerateLUT](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L326) loop**. Instead of using `MlasTrySimpleParallel`, we now use a simple `for` loop to process each row of the batch sequentially. 
**Performance Impact:** The [GenerateLUT](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L326) step is computationally lightweight compared to the subsequent [TMACComputeGemm](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L505) matrix multiplication. Serializing this setup step has negligible impact on overall inference latency (micro-benchmarks showed no measurable regression), but effectively eliminates the race condition. ## Verification * **Reproduction:** The issue was reliably reproduced by running `MatMulNBitsLutGemm.Float32_2Bits_Asymmetric_Batch32_256x256` in a loop (failing ~1 in 5 times). * **Verification:** After applying the fix, the same test passed **50/50 iterations** consistently. * **Regression Testing:** Standard `MatMulNBitsLutGemm` tests (including `BlkLen64` and `M=1` cases) continue to pass. --- onnxruntime/core/mlas/lib/qlutgemm.cpp | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/onnxruntime/core/mlas/lib/qlutgemm.cpp b/onnxruntime/core/mlas/lib/qlutgemm.cpp index f029e539f02a1..cb099c2409a44 100644 --- a/onnxruntime/core/mlas/lib/qlutgemm.cpp +++ b/onnxruntime/core/mlas/lib/qlutgemm.cpp @@ -548,32 +548,32 @@ MlasLutGemm( // const int num_groups = static_cast(K / BlkLen); - // Parallelize over M (batch dimension) - // Each iteration processes one row of the activation matrix + // Iterate over M (batch dimension) + // Each iteration processes one row of the activation matrix. + // NOTE: This loop is intentionally serialized. Previous attempts to parallelize + // using MlasTrySimpleParallel caused flaky test failures (race conditions) + // when M > 1 (e.g., Batch32 case). Since GenerateLUT is lightweight, + // serial execution ensures correctness with negligible performance impact. 
// TODO(vraspar): Ideally we have to do block parallelism here - MlasTrySimpleParallel( - threadpool, - static_cast(M), - [&](ptrdiff_t ine11) { - const size_t row_offset = static_cast(ine11) * K; - const size_t lut_offset = static_cast(ine11) * K * 4; // 4 bytes per K element for 2-bit LUT - const size_t scale_bias_offset = static_cast(ine11) * lut_scales_size; - - // Call the dispatch function for this row - // ggml_tmac_mul_mat_task_init - Dispatch->GenerateLUT( - const_cast(a_float + row_offset), // Input activation for this row - qlut + lut_offset, // Output LUT for this row - lut_scales + scale_bias_offset, // Scales for this row - lut_biases + scale_bias_offset, // Biases for this row - M, - K, - N, - tmac_params.act_group_size - ); - } - ); + for (size_t ine11 = 0; ine11 < static_cast(M); ine11++) { + const size_t row_offset = ine11 * K; + const size_t lut_offset = ine11 * K * 4; // 4 bytes per K element for 2-bit LUT + const size_t scale_bias_offset = ine11 * lut_scales_size; + + // Call the dispatch function for this row + // ggml_tmac_mul_mat_task_init + Dispatch->GenerateLUT( + const_cast(a_float + row_offset), // Input activation for this row + qlut + lut_offset, // Output LUT for this row + lut_scales + scale_bias_offset, // Scales for this row + lut_biases + scale_bias_offset, // Biases for this row + M, + K, + N, + tmac_params.act_group_size + ); + } // all relevant LUT's have been generated // equivalent of lut_mul_mat's ggml_backend_tmac_mul_mat function ggml_barrier line From c29365a3a613db8816f1ddc079102ca83f92c9bb Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 28 Jan 2026 12:43:39 -0800 Subject: [PATCH 2/4] remove coloredlogs (#27135) See related issues: https://github.com/microsoft/onnxruntime/issues/26889 --- dockerfiles/Dockerfile.source | 2 +- docs/python/requirements.txt | 1 - onnxruntime/python/tools/tensorrt/perf/benchmark.py | 10 +++++----- .../python/tools/tensorrt/perf/benchmark_wrapper.py | 1 - 
onnxruntime/python/tools/tensorrt/perf/perf_utils.py | 2 -- .../python/tools/tensorrt/perf/requirements.txt | 1 - .../python/tools/transformers/benchmark_helper.py | 9 ++++----- .../tools/transformers/convert_to_packing_mode.py | 9 ++++----- .../models/longformer/benchmark_longformer.py | 2 +- .../transformers/models/longformer/convert_to_onnx.py | 2 +- .../transformers/models/stable_diffusion/benchmark.py | 5 ++--- .../models/stable_diffusion/demo_txt2img.py | 5 +++-- .../models/stable_diffusion/demo_txt2img_xl.py | 5 +++-- .../models/stable_diffusion/optimize_pipeline.py | 3 +-- .../stable_diffusion/requirements/requirements.txt | 1 - .../Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb | 4 ++-- .../notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb | 10 +++++----- ...Tensorflow_Tf2onnx_Bert-Squad_OnnxRuntime_CPU.ipynb | 4 ++-- onnxruntime/python/tools/transformers/optimizer.py | 8 +++----- onnxruntime/python/tools/transformers/requirements.txt | 1 - onnxruntime/python/tools/transformers/run_benchmark.sh | 2 +- .../test/python/transformers/test_gpt2_benchmark.py | 3 +-- .../test/python/transformers/test_gpt2_to_onnx.py | 3 +-- requirements.txt | 1 - .../linux-gpu-tensorrt-daily-perf-pipeline.yml | 6 +++--- .../templates/py-package-smoking-test.yml | 2 +- tools/ci_build/github/windows/python/requirements.txt | 1 - .../requirements/transformers-test/requirements.txt | 1 - 28 files changed, 44 insertions(+), 60 deletions(-) diff --git a/dockerfiles/Dockerfile.source b/dockerfiles/Dockerfile.source index ea28e144ee95a..51291e59aa0d5 100644 --- a/dockerfiles/Dockerfile.source +++ b/dockerfiles/Dockerfile.source @@ -16,4 +16,4 @@ RUN cd /code && /bin/bash ./build.sh --allow_running_as_root --skip_submodule_sy FROM mcr.microsoft.com/azurelinux/base/python:3 COPY --from=0 /code/build/Linux/Release/dist /root COPY --from=0 /code/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt -RUN tdnf install -y ca-certificates python3-setuptools python3-wheel python3-pip python3-numpy 
python3-flatbuffers python3-packaging python3-protobuf python3-mpmath python3-sympy && python3 -m pip install coloredlogs humanfriendly && python3 -m pip install --no-index --find-links /root onnxruntime && rm -rf /root/*.whl +RUN tdnf install -y ca-certificates python3-setuptools python3-wheel python3-pip python3-numpy python3-flatbuffers python3-packaging python3-protobuf python3-mpmath python3-sympy && python3 -m pip install humanfriendly && python3 -m pip install --no-index --find-links /root onnxruntime && rm -rf /root/*.whl diff --git a/docs/python/requirements.txt b/docs/python/requirements.txt index 0be11c8760892..0b0e5d464b26e 100644 --- a/docs/python/requirements.txt +++ b/docs/python/requirements.txt @@ -11,7 +11,6 @@ furo pyquickhelper pandas pydot -coloredlogs flatbuffers numpy<2.0.0 packaging diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 66ab0c44f8814..2017cf154f21e 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -12,7 +12,6 @@ import timeit from datetime import datetime -import coloredlogs import numpy as np from perf_utils import ( acl, @@ -2259,12 +2258,13 @@ def parse_arguments(): def setup_logger(verbose): if verbose: - coloredlogs.install( - level="DEBUG", - fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + logging.basicConfig( + level=logging.DEBUG, + format="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + force=True, ) else: - coloredlogs.install(fmt="%(message)s") + logging.basicConfig(format="%(message)s", force=True) logging.getLogger("transformers").setLevel(logging.WARNING) diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py index 204fe61396663..7bfe25b1549cf 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py +++ 
b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py @@ -11,7 +11,6 @@ import pprint import re -import coloredlogs # noqa: F401 from benchmark import * # noqa: F403 from perf_utils import * # noqa: F403 diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py index 8d2f4b07b7984..4b83e1a8fc41f 100644 --- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py +++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py @@ -5,8 +5,6 @@ import subprocess import sys -import coloredlogs # noqa: F401 - debug = False debug_verbose = False diff --git a/onnxruntime/python/tools/tensorrt/perf/requirements.txt b/onnxruntime/python/tools/tensorrt/perf/requirements.txt index 0afbf47e88307..2a4b319cfc57e 100644 --- a/onnxruntime/python/tools/tensorrt/perf/requirements.txt +++ b/onnxruntime/python/tools/tensorrt/perf/requirements.txt @@ -1,4 +1,3 @@ onnxconverter-common onnxmltools pandas -coloredlogs \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index 8055e5e4ae876..56b670e8f2306 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -18,7 +18,6 @@ from time import sleep from typing import Any -import coloredlogs import numpy import torch import transformers @@ -147,12 +146,12 @@ def create_onnxruntime_session( def setup_logger(verbose=True): if verbose: - coloredlogs.install( - level="DEBUG", - fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + logging.basicConfig( + format="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + level=logging.DEBUG, ) else: - coloredlogs.install(fmt="%(message)s") + logging.basicConfig(format="%(message)s", level=logging.INFO) logging.getLogger("transformers").setLevel(logging.WARNING) diff --git a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py 
b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py index 9a6388b3f350d..d8177fcd3cb02 100644 --- a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py +++ b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py @@ -7,7 +7,6 @@ import logging import os -import coloredlogs from constants import ( AttentionInputIDs, AttentionOutputIDs, @@ -358,12 +357,12 @@ def _parse_arguments(): def _setup_logger(verbose): if verbose: - coloredlogs.install( - level="DEBUG", - fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + logging.basicConfig( + format="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + level=logging.DEBUG, ) else: - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO) def main(): diff --git a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py index 21848deaf99fe..674dc831d70f9 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py +++ b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py @@ -11,7 +11,7 @@ # conda create -n gpu_env python=3.8 # conda activate gpu_env # pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 -# pip3 install onnx transformers onnxruntime-gpu numpy sympy coloredlogs psutil py3nvml +# pip3 install onnx transformers onnxruntime-gpu numpy sympy psutil py3nvml # python benchmark_longformer.py # # When there is no parameter, pre-defined tests will run on the longformer-base-4096 model. 
diff --git a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py index b80feec892994..513a115352556 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py @@ -18,7 +18,7 @@ # conda create -n longformer python=3.8 # conda activate longformer # python3 -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html -# python3 -m pip install coloredlogs flatbuffers numpy packaging sympy protobuf==3.20.1 onnx==1.12.0 transformers==4.18.0 +# python3 -m pip install flatbuffers numpy packaging sympy protobuf==3.20.1 onnx==1.12.0 transformers==4.18.0 # python3 -m pip install -i https://test.pypi.org/simple/ ort-nightly-gpu # cd ./torch_extensions # rm -rf build diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py index ed2e346972a6c..e90af970032e5 100755 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -5,14 +5,13 @@ import argparse import csv +import logging import os import statistics import sys import time from pathlib import Path -import coloredlogs - # import torch before onnxruntime so that onnxruntime uses the cuDNN in the torch package. 
import torch from benchmark_helper import measure_memory @@ -1332,7 +1331,7 @@ def main(): if version.parse(ort_version) < version.parse("1.16"): raise ValueError("CUDA graph requires ONNX Runtime 1.16 or later") - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO, force=True) memory_monitor_type = "cuda" diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py index a3caba138f44a..d851e785e8d84 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py @@ -20,7 +20,8 @@ # limitations under the License. # -------------------------------------------------------------------------- -import coloredlogs +import logging + from cuda import cudart from demo_utils import ( add_controlnet_arguments, @@ -86,7 +87,7 @@ def run_inference(warmup=False): if __name__ == "__main__": - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO) parser = arg_parser("Options for Stable Diffusion Demo") add_controlnet_arguments(parser) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py index c3e91a405b53f..739f3cb5025e7 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py @@ -20,7 +20,8 @@ # limitations under the License. 
# -------------------------------------------------------------------------- -import coloredlogs +import logging + from cuda import cudart from demo_utils import ( add_controlnet_arguments, @@ -252,7 +253,7 @@ def main(args): if __name__ == "__main__": - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO) parser = arg_parser("Options for Stable Diffusion XL Demo") add_controlnet_arguments(parser) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index 33397cf75e127..25c034f7b70b5 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -23,7 +23,6 @@ import warnings from pathlib import Path -import coloredlogs import onnx from fusion_options import FusionOptions from onnx_model_clip import ClipOnnxModel @@ -587,5 +586,5 @@ def main(argv: list[str] | None = None): if __name__ == "__main__": - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO) main() diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt index 73929214b22ea..e7852f7478db8 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt @@ -4,7 +4,6 @@ transformers==4.50.0 numpy>=1.24.1 accelerate onnx==1.18.0 -coloredlogs packaging # Use newer version of protobuf might cause crash protobuf==4.25.8 diff --git a/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb 
b/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb index 5e81e754e1109..6603c9c387517 100644 --- a/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb @@ -52,7 +52,7 @@ "else:\n", " !{sys.executable} -m pip install install torch --index-url https://download.pytorch.org/whl/cpu -q\n", "\n", - "!{sys.executable} -m pip install onnxruntime transformers==4.18 onnx psutil pandas py-cpuinfo py3nvml netron coloredlogs --no-warn-script-location -q" + "!{sys.executable} -m pip install onnxruntime transformers==4.18 onnx psutil pandas py-cpuinfo py3nvml netron --no-warn-script-location -q" ] }, { @@ -719,4 +719,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb index 7295ae1436c99..76458ca3220c9 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb @@ -59,7 +59,7 @@ "\n", "if sys.platform in ['linux', 'win32']: # Linux or Windows\n", " !{sys.executable} -m pip install torch --index-url https://download.pytorch.org/whl/cu118 -q\n", - " !{sys.executable} -m pip install onnxruntime-gpu onnx transformers psutil pandas py-cpuinfo py3nvml coloredlogs wget netron sympy protobuf==3.20.3 -q\n", + " !{sys.executable} -m pip install onnxruntime-gpu onnx transformers psutil pandas py-cpuinfo py3nvml wget netron sympy protobuf==3.20.3 -q\n", "else: # Mac\n", " print(\"CUDA is not available on MacOS\")" ] @@ -196,9 +196,9 @@ "Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing 
BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n", "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:02<00:00, 16.27it/s]\n", - "convert squad examples to features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 256.11it/s]\n", - "add example index and unique id: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00= 1.8 numpy >= 1.19.0 -coloredlogs psutil py-cpuinfo py3nvml diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh index 25997f40d348f..c16d60d0d5046 100755 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -95,7 +95,7 @@ if [ "$run_install" = true ] ; then else pip install onnxruntime-gpu fi - pip install --upgrade onnx coloredlogs packaging psutil py3nvml numpy transformers sympy + pip install --upgrade onnx packaging psutil py3nvml numpy transformers sympy fi if [ "$use_package" = true ] ; then diff --git a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py 
index 2d9bc035fe4fd..40be872250f1a 100644 --- a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py +++ b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py @@ -9,7 +9,6 @@ import os import unittest -import coloredlogs import pytest from parity_utilities import find_transformers_source @@ -50,6 +49,6 @@ def test_gpt2_int8(self): if __name__ == "__main__": - coloredlogs.install(fmt="%(message)s") + logging.basicConfig(format="%(message)s") logging.getLogger("transformers").setLevel(logging.ERROR) unittest.main() diff --git a/onnxruntime/test/python/transformers/test_gpt2_to_onnx.py b/onnxruntime/test/python/transformers/test_gpt2_to_onnx.py index e179d3d087120..bda99abbb7287 100644 --- a/onnxruntime/test/python/transformers/test_gpt2_to_onnx.py +++ b/onnxruntime/test/python/transformers/test_gpt2_to_onnx.py @@ -7,7 +7,6 @@ import logging import unittest -import coloredlogs import pytest from parity_utilities import find_transformers_source @@ -58,6 +57,6 @@ def test_auto_mixed_precision(self): if __name__ == "__main__": - coloredlogs.install(fmt="%(message)s") + logging.basicConfig(format="%(message)s") logging.getLogger("transformers").setLevel(logging.ERROR) unittest.main() diff --git a/requirements.txt b/requirements.txt index 2fd9362c949dd..ff8cc04d6f219 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -coloredlogs flatbuffers numpy >= 1.21.6 packaging diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index c00cbb06f26fd..4bfb9c630fede 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -142,7 +142,7 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/' condition: always() - - script: 'python3 -m pip install pandas 
azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs' + - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas]' displayName: 'Install dashboard dependencies' - script: | @@ -165,7 +165,7 @@ jobs: - ${{ if eq(parameters.PostToDashboard, true) }}: - - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs' + - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas]' displayName: 'Install dashboard dependencies' - script: | @@ -191,4 +191,4 @@ jobs: pathtoPublish: '$(Build.SourcesDirectory)/Artifact' artifactName: 'result-$(Build.BuildNumber)' - - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file + - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml index 9b15f389e5349..4ec074055fcc2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml @@ -54,7 +54,7 @@ jobs: FILE_NAME="${files[0]}" FILE_NAME=$(basename $FILE_NAME) PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-') - python3 -m pip install coloredlogs flatbuffers numpy packaging protobuf sympy + python3 -m pip install flatbuffers numpy packaging protobuf sympy python3 -m pip install --no-index --find-links . 
$PYTHON_PACKAGE_NAME python3 -m pip show $PYTHON_PACKAGE_NAME python3 -c "import onnxruntime as ort; print(ort.__version__)" diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 4e24bf7cbfa97..a86eef170bc25 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -14,5 +14,4 @@ jinja2 markupsafe semver packaging -coloredlogs onnx==1.20.1; python_version < "3.14" diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index e95509c7ddec3..1523b420bfdbd 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -6,7 +6,6 @@ numpy==2.2.6; python_version < "3.14" numpy==2.3.2; python_version >= "3.14" torch==2.8.0 torchvision==0.23.0 -coloredlogs==15.0 transformers==4.52.1 parameterized>=0.8.1 sentencepiece From f0d7b3aa32087727489a3535f53420e8acc746fe Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 28 Jan 2026 14:54:01 -0800 Subject: [PATCH 3/4] Add API GetTensorElementTypeAndShapeDataReference (#27175) Adds C/C++ API named `GetTensorElementTypeAndShapeDataReference` that returns an OrtValue tensor's shape and type without allocating a new buffer for the shape data. This new API function can be used instead of `OrtApi::GetTypeInfo()` or `OrtApi::GetTensorTypeAndShape` to decrease the number of heap allocations and thus improve inference latency for plugin EPs kernels that frequently retrieve tensor shapes during inference. 
(e.g., WebGPU plugin EP) --- .../core/session/onnxruntime_c_api.h | 25 ++++++ .../core/session/onnxruntime_cxx_api.h | 13 +++ .../core/session/onnxruntime_cxx_inline.h | 7 ++ .../core/framework/tensor_type_and_shape.cc | 58 ++++++++++++ onnxruntime/core/session/onnxruntime_c_api.cc | 3 + onnxruntime/core/session/ort_apis.h | 5 ++ onnxruntime/test/shared_lib/test_inference.cc | 88 +++++++++++++++++++ 7 files changed, 199 insertions(+) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index cd49d6d9573bc..6ae1539d4c294 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -7195,6 +7195,31 @@ struct OrtApi { * \since 1.24 */ ORT_API_T(void, RunOptionsSetSyncStream, _Inout_ OrtRunOptions* options, _In_ OrtSyncStream* sync_stream); + + /** \brief Get the element data type and shape for an OrtValue that represents a Tensor (scalar, dense, or sparse). + * + * \note This function is an alternative to ::GetTensorTypeAndShape() that does not allocate a new array for + * the shape data. The OrtValue instance's internal shape data is returned directly. + * + * \note Returns an error if the underlying OrtValue is not a Tensor. + * + * \param[in] value The OrtValue instance. + * \param[out] elem_type Output parameter set to the tensor element data type. + * \param[out] shape_data Output parameter set to the OrtValue instance's internal shape data array. + * For a scalar, `shape_data` is NULL and `shape_data_count` is 0. + * Must not be released as it is owned by the OrtValue instance. This pointer becomes invalid + * when the OrtValue is released or if the underlying shape data is updated or reallocated. + * \param[out] shape_data_count Output parameter set to the number of elements in `shape_data`. + * `shape_data_count` is 0 for a scalar. + * + * \snippet{doc} snippets.dox OrtStatus Return Value + * + * \since Version 1.24. 
+ */ + ORT_API2_STATUS(GetTensorElementTypeAndShapeDataReference, _In_ const OrtValue* value, + _Out_ ONNXTensorElementDataType* elem_type, + _Outptr_result_maybenull_ const int64_t** shape_data, + _Out_ size_t* shape_data_count); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 0dbf206bea992..5cf8cf88bb054 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2220,6 +2220,19 @@ struct ConstValueImpl : Base { const R* GetSparseTensorValues() const; #endif + + /// + /// Returns the tensor's element type and a reference to the tensor's internal shape data. The shape data is owned + /// by the Ort::Value and becomes invalid when the Ort::Value is destroyed or if the underlying shape data is + /// updated or reallocated. + /// + /// For a scalar, shape.shape is nullptr and shape.shape_len is 0. + /// + /// Wraps OrtApi::GetTensorElementTypeAndShapeDataReference. + /// + /// Output parameter set to the element's data type. + /// Output parameter set to the OrtValue instance's shape data and number of elements. 
+ void GetTensorElementTypeAndShapeDataReference(ONNXTensorElementDataType& elem_type, Shape& shape) const; }; template diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 9d95c6a880467..1a3e49130a1d1 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -2377,6 +2377,13 @@ inline const R* ConstValueImpl::GetSparseTensorValues() const { #endif +template +void ConstValueImpl::GetTensorElementTypeAndShapeDataReference(ONNXTensorElementDataType& elem_type, + Shape& shape) const { + ThrowOnError(GetApi().GetTensorElementTypeAndShapeDataReference(this->p_, &elem_type, &shape.shape, + &shape.shape_len)); +} + template void ValueImpl::FillStringTensor(const char* const* s, size_t s_len) { ThrowOnError(GetApi().FillStringTensor(this->p_, s, s_len)); diff --git a/onnxruntime/core/framework/tensor_type_and_shape.cc b/onnxruntime/core/framework/tensor_type_and_shape.cc index 0bac24a2c3aa0..16817ba1707bd 100644 --- a/onnxruntime/core/framework/tensor_type_and_shape.cc +++ b/onnxruntime/core/framework/tensor_type_and_shape.cc @@ -310,6 +310,64 @@ std::unique_ptr OrtTensorTypeAndShapeInfo::GetTensorS return GetTensorShapeAndTypeHelper(type, shape, dim_params); } +ORT_API_STATUS_IMPL(OrtApis::GetTensorElementTypeAndShapeDataReference, _In_ const OrtValue* value, + _Out_ ONNXTensorElementDataType* elem_type, + _Outptr_result_maybenull_ const int64_t** shape_data, + _Out_ size_t* shape_data_count) { + API_IMPL_BEGIN + if (!value->IsAllocated() || (!value->IsTensor() && !value->IsSparseTensor())) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "Input parameter `value` must contain a constructed tensor or sparse tensor"); + } + + if (elem_type == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "Output parameter `elem_type` must not be NULL"); + } + + if (shape_data == nullptr) { + return 
OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "Output parameter `shape_data` must not be NULL"); + } + + if (shape_data_count == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "Output parameter `shape_data_count` must not be NULL"); + } + + gsl::span shape_span; + onnxruntime::MLDataType ml_data_type = nullptr; + ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + + if (value->IsTensor()) { + const Tensor& tensor = value->Get(); + ml_data_type = tensor.DataType(); + shape_span = tensor.Shape().GetDims(); + } else { +#if !defined(DISABLE_SPARSE_TENSORS) + const SparseTensor& tensor = value->Get(); + ml_data_type = tensor.DataType(); + shape_span = tensor.DenseShape().GetDims(); +#else + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "SparseTensor is not supported in this build."); +#endif + } + + if (ml_data_type != nullptr) { + type = MLDataTypeToOnnxRuntimeTensorElementDataType(ml_data_type); + } + + if (type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED) { + return OrtApis::CreateStatus(ORT_FAIL, "Tensor does not have a valid or supported tensor element data type"); + } + + *elem_type = type; + *shape_data = shape_span.empty() ? 
nullptr : shape_span.data(); + *shape_data_count = shape_span.size(); + return nullptr; + API_IMPL_END +} + ORT_API_STATUS_IMPL(OrtApis::GetTensorTypeAndShape, _In_ const OrtValue* v, _Outptr_ OrtTensorTypeAndShapeInfo** out) { API_IMPL_BEGIN diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 0e60a7c5c39ae..1b40ab5edfff8 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -4802,6 +4802,8 @@ static constexpr OrtApi ort_api_1_to_24 = { &OrtApis::EpAssignedNode_GetDomain, &OrtApis::EpAssignedNode_GetOperatorType, &OrtApis::RunOptionsSetSyncStream, + &OrtApis::GetTensorElementTypeAndShapeDataReference, + // End of Version 24 - DO NOT MODIFY ABOVE (see above text for more information) }; // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase. @@ -4838,6 +4840,7 @@ static_assert(offsetof(OrtApi, SetEpDynamicOptions) / sizeof(void*) == 284, "Siz static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 317, "Size of version 22 API cannot change"); static_assert(offsetof(OrtApi, CreateExternalInitializerInfo) / sizeof(void*) == 389, "Size of version 23 API cannot change"); +static_assert(offsetof(OrtApi, GetTensorElementTypeAndShapeDataReference) / sizeof(void*) == 414, "Size of version 24 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: static_assert(std::string_view(ORT_VERSION) == "1.24.0", diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index efea582e3f798..ab3dd45629777 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -808,4 +808,9 @@ ORT_API_STATUS_IMPL(EpAssignedSubgraph_GetNodes, _In_ const OrtEpAssignedSubgrap ORT_API_STATUS_IMPL(EpAssignedNode_GetName, _In_ const OrtEpAssignedNode* ep_node, _Outptr_ const char** out); 
ORT_API_STATUS_IMPL(EpAssignedNode_GetDomain, _In_ const OrtEpAssignedNode* ep_node, _Outptr_ const char** out); ORT_API_STATUS_IMPL(EpAssignedNode_GetOperatorType, _In_ const OrtEpAssignedNode* ep_node, _Outptr_ const char** out); + +ORT_API_STATUS_IMPL(GetTensorElementTypeAndShapeDataReference, _In_ const OrtValue* value, + _Out_ ONNXTensorElementDataType* elem_type, + _Outptr_result_maybenull_ const int64_t** shape_data, + _Out_ size_t* shape_data_count); } // namespace OrtApis diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index a96a2c48b4ca6..4e991716dd108 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -480,6 +480,94 @@ TEST(CApiTest, dim_param) { ASSERT_EQ(strcmp(dim_param, ""), 0); } +// Tests calling OrtApi::GetTensorElementTypeAndShapeDataReference for a dense OrtValue tensor. +TEST(CApiTest, Value_GetTensorElementTypeAndShapeDataReference_DenseTensor) { + Ort::MemoryInfo info_cpu = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemTypeDefault); + + const std::array x_shape = {3, 2}; + std::array x_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + Ort::Value x_value = Ort::Value::CreateTensor(info_cpu, x_values.data(), x_values.size(), + x_shape.data(), x_shape.size()); + Ort::TensorTypeAndShapeInfo type_shape_info = x_value.GetTensorTypeAndShapeInfo(); + + ONNXTensorElementDataType elem_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + Ort::Value::Shape shape{}; + x_value.GetTensorElementTypeAndShapeDataReference(elem_type, shape); + + ASSERT_EQ(elem_type, type_shape_info.GetElementType()); + + std::vector expected_shape = type_shape_info.GetShape(); + gsl::span actual_shape(shape.shape, shape.shape_len); + ASSERT_EQ(actual_shape, gsl::span(expected_shape)); +} + +// Tests calling OrtApi::GetTensorElementTypeAndShapeDataReference for a scalar OrtValue tensor. 
+TEST(CApiTest, Value_GetTensorElementTypeAndShapeDataReference_Scalar) { + Ort::MemoryInfo info_cpu = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemTypeDefault); + + std::vector x_shape = {}; // Scalar (no shape) + std::array x_values = {1.0f}; + Ort::Value x_value = Ort::Value::CreateTensor(info_cpu, x_values.data(), x_values.size(), + x_shape.data(), x_shape.size()); + Ort::TensorTypeAndShapeInfo type_shape_info = x_value.GetTensorTypeAndShapeInfo(); + + ONNXTensorElementDataType elem_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + Ort::Value::Shape shape{}; + x_value.GetTensorElementTypeAndShapeDataReference(elem_type, shape); + + ASSERT_EQ(elem_type, type_shape_info.GetElementType()); + + std::vector expected_shape = type_shape_info.GetShape(); + gsl::span actual_shape(shape.shape, shape.shape_len); + ASSERT_EQ(actual_shape, gsl::span(expected_shape)); + ASSERT_EQ(shape.shape, nullptr); + ASSERT_EQ(shape.shape_len, 0); +} + +#if !defined(DISABLE_SPARSE_TENSORS) +// Tests calling OrtApi::GetTensorElementTypeAndShapeDataReference for a sparse OrtValue tensor. 
+TEST(CApiTest, Value_GetTensorElementTypeAndShapeDataReference_SparseTensor) { + std::vector common_shape{9, 9}; + std::vector A_values{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, + 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, + 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, + 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0}; + + // 2 - D index + std::vector indices_shape{gsl::narrow(A_values.size()), 2}; + std::vector A_indices{0, 1, 0, 2, 0, 6, 0, 7, 0, 8, 1, 0, 1, + 1, 1, 2, 1, 6, 1, 7, 1, 8, 2, 0, 2, 1, + 2, 2, 2, 6, 2, 7, 2, 8, 3, 3, 3, 4, 3, + 5, 3, 6, 3, 7, 3, 8, 4, 3, 4, 4, 4, 5, + 4, 6, 4, 7, 4, 8, 5, 3, 5, 4, 5, 5, 5, + 6, 5, 7, 5, 8, 6, 0, 6, 1, 6, 2, 6, 3, + 6, 4, 6, 5, 7, 0, 7, 1, 7, 2, 7, 3, 7, + 4, 7, 5, 8, 0, 8, 1, 8, 2, 8, 3, 8, 4, + 8, 5}; + + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + Ort::Value::Shape ort_dense_shape{common_shape.data(), common_shape.size()}; + Ort::Value::Shape ort_values_shape{&indices_shape[0], 1U}; + auto value_sparse = Ort::Value::CreateSparseTensor(info, A_values.data(), ort_dense_shape, ort_values_shape); + value_sparse.UseCooIndices(A_indices.data(), A_indices.size()); + + Ort::TensorTypeAndShapeInfo type_shape_info = value_sparse.GetTensorTypeAndShapeInfo(); + + ONNXTensorElementDataType elem_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + Ort::Value::Shape shape{}; + value_sparse.GetTensorElementTypeAndShapeDataReference(elem_type, shape); + + ASSERT_EQ(elem_type, type_shape_info.GetElementType()); + + std::vector expected_shape = type_shape_info.GetShape(); + gsl::span actual_shape(shape.shape, shape.shape_len); + ASSERT_EQ(actual_shape, gsl::span(expected_shape)); +} +#endif // !defined(DISABLE_SPARSE_TENSORS) + static std::pair LoadAndGetInputShapePresent(const ORTCHAR_T* const model_url) { Ort::Session session(*ort_env, model_url, Ort::SessionOptions{}); 
const auto input_num = session.GetInputCount(); From 061b03e788f5d8401388db4d440d9b1ccbfc7caa Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 28 Jan 2026 23:27:43 -0800 Subject: [PATCH 4/4] [MLAS] Fix Flaky LuT GEMM Tests by Replacing Gather with Shuffle (#27174) ## Problem Description The `MatMulNBitsLutGemm` test suite, specifically `Float32_2Bits_Symmetric_256x256_BlkLen64`, was exhibiting intermittent failures (flakiness). The failure manifested as numerical mismatches exceeding the tolerance, suggesting non-deterministic behavior in the kernel execution. ## Root Cause Analysis The issue was traced to the usage of `_mm256_i32gather_ps` in sqnbitgemm_lut_kernel_avx2.cpp. While the gather indices were technically calculating addresses within the bounds of the allocated buffer, gather instructions on certain AVX2 hardware implementations can exhibit non-deterministic behavior or subtle performance/prefetching artifacts when operating on specific stride patterns (in this case, gathering with a stride of 4 floats).
### Performance Micro-benchmark on MatMulNBitsLutGemm (256x256, BlkLen=64). Original (Gather): ~55.55 us Fixed (Load+Shuffle): ~57.79 us Delta: +2.24 us (~4% slower) The slight performance regression is expected because replacing a single hardware gather instruction with a sequence of loadu, unpack, and permute instructions adds instruction count overhead. However, this is a necessary tradeoff to ensure deterministic behavior and memory safety across all AVX2 implementations. ## Verification * **Tests**: All 9 tests in `MatMulNBitsLutGemm` passed successfully (including the previously flaky `BlkLen64` case). --- .../mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp | 54 ++++++++++++++----- .../test/contrib_ops/matmul_2bits_test.cc | 6 ++- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp index b54f051ca1504..a89993d4515b8 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp @@ -187,21 +187,53 @@ get_bias_scale() return 3; } +static inline void +MlasAvx2LoaduDeinterleave32Ps(const float* src, __m256& v0, __m256& v1, __m256& v2, __m256& v3) +{ + // Process 32 activations contiguously using loadu + shuffle. + // This allows us to mix neighbors (src[4i], src[4i+1], src[4i+2], src[4i+3]) across lanes, + // which matches the T-MAC weight packing. + // We use loadu + shuffle instead of gather to avoid potential issues with gather + // on some hardware and ensure deterministic behavior. 
+ __m256 vec_b0 = _mm256_loadu_ps(src + 0); + __m256 vec_b1 = _mm256_loadu_ps(src + 8); + __m256 vec_b2 = _mm256_loadu_ps(src + 16); + __m256 vec_b3 = _mm256_loadu_ps(src + 24); + + __m256 t0 = _mm256_unpacklo_ps(vec_b0, vec_b1); + __m256 t1 = _mm256_unpackhi_ps(vec_b0, vec_b1); + __m256 t2 = _mm256_unpacklo_ps(vec_b2, vec_b3); + __m256 t3 = _mm256_unpackhi_ps(vec_b2, vec_b3); + + __m256 u0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t2))); + __m256 u1 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t2))); + __m256 u2 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t1), _mm256_castps_pd(t3))); + __m256 u3 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t1), _mm256_castps_pd(t3))); + + const __m256i perm_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + v0 = _mm256_permutevar8x32_ps(u0, perm_idx); + v1 = _mm256_permutevar8x32_ps(u1, perm_idx); + v2 = _mm256_permutevar8x32_ps(u2, perm_idx); + v3 = _mm256_permutevar8x32_ps(u3, perm_idx); +} + void partial_max_g4_int8_k8(float* lut_scales, const float* b) { - // TODO(vraspar): add support for arm neon - const __m256i vec_bi = _mm256_set_epi32(112, 96, 80, 64, 48, 32, 16, 0); - __m256 vec_b0 = _mm256_i32gather_ps(b + 0, vec_bi, 1); - __m256 vec_b1 = _mm256_i32gather_ps(b + 1, vec_bi, 1); - __m256 vec_b2 = _mm256_i32gather_ps(b + 2, vec_bi, 1); - __m256 vec_b3 = _mm256_i32gather_ps(b + 3, vec_bi, 1); + __m256 vec_b0, vec_b1, vec_b2, vec_b3; + MlasAvx2LoaduDeinterleave32Ps(b, vec_b0, vec_b1, vec_b2, vec_b3); + const __m256 vec_sign = _mm256_set1_ps(-0.0f); __m256 vec_babs0 = _mm256_andnot_ps(vec_sign, vec_b0); __m256 vec_babs1 = _mm256_andnot_ps(vec_sign, vec_b1); __m256 vec_babs2 = _mm256_andnot_ps(vec_sign, vec_b2); __m256 vec_babs3 = _mm256_andnot_ps(vec_sign, vec_b3); + + // The upper bound for the LUT values (mixtures of 4 activations) is the sum + // of their absolute values. 
__m256 abssum = _mm256_add_ps(_mm256_add_ps(vec_babs0, vec_babs1), _mm256_add_ps(vec_babs2, vec_babs3)); + + // Reduce max across lanes to find the global maximum sum in this chunk. __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(abssum, 1), _mm256_castps256_ps128(abssum)); max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4)); max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4)); @@ -222,16 +254,14 @@ lut_ctor_g4_int8_impl( ) { __m256 vec_lut[16]; - float biases = 0.0; - const __m256i vec_bi = _mm256_set_epi32(112, 96, 80, 64, 48, 32, 16, 0); + float biases = 0.0f; float scales = *lut_scales; float t_scales = scales ? 1.0f / scales : 0.0f; for (int k = 0; k < act_k / 32; ++k) { - __m256 vec_b0 = _mm256_i32gather_ps(b + k * 32 + 0, vec_bi, 1); - __m256 vec_b1 = _mm256_i32gather_ps(b + k * 32 + 1, vec_bi, 1); - __m256 vec_b2 = _mm256_i32gather_ps(b + k * 32 + 2, vec_bi, 1); - __m256 vec_b3 = _mm256_i32gather_ps(b + k * 32 + 3, vec_bi, 1); + const float* b_chunk = b + k * 32; + __m256 vec_b0, vec_b1, vec_b2, vec_b3; + MlasAvx2LoaduDeinterleave32Ps(b_chunk, vec_b0, vec_b1, vec_b2, vec_b3); PRAGMA_UNROLL for (int g = 1; g < 16; g += 2) { diff --git a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc index 853458312cd1f..3d5e3e5f360b4 100644 --- a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc @@ -371,8 +371,10 @@ TEST(MatMulNBitsLutGemm, Float32_2Bits_Symmetric_256x256) { TestMatMul2BitsLutGemm(1, 256, 256, 32, false); } -// TODO: Re-enable once LUT GEMM asymmetric quantization accuracy issue is resolved -TEST(MatMulNBitsLutGemm, DISABLED_Float32_2Bits_Asymmetric_256x256) { +// This test was previously disabled due to accuracy issues related to non-deterministic +// gather operations. It is now re-enabled after replacing gather with deterministic +// load+shuffle operations to improve determinism and stability. 
+TEST(MatMulNBitsLutGemm, Float32_2Bits_Asymmetric_256x256) { TestMatMul2BitsLutGemm(1, 256, 256, 32, true); }