From ee355a9d592d841ee74703c69d166b9425aeb387 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 28 Jan 2026 12:08:23 -0800 Subject: [PATCH 1/4] [MLAS] Fix Data Race in MlasLutGemm by Serializing LUT Generation (#27179) ## Problem Description The `MatMulNBitsLutGemm.Float32_2Bits_Asymmetric_Batch32_256x256` test was exhibiting flaky behavior (failure rate ~2-20%) with numerical mismatches. Investigation revealed a **race condition** in the [GenerateLUT](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L326) step within [MlasLutGemm](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/inc/mlas_qnbit.h#L328). When the batch size `M > 1`, [MlasLutGemm](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/inc/mlas_qnbit.h#L328) attempted to parallelize the LUT generation over the batch dimension using `MlasTrySimpleParallel`. However, the underlying [GenerateLUT](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L326) implementation (specifically shared usage of `lut_scales`/`lut_biases` or internal buffers) is not thread-safe for concurrent execution on the same destination buffers or related state. This led to corruption of the Look-Up Tables or scales, causing random output errors. ## Solution This PR modifies [onnxruntime/core/mlas/lib/qlutgemm.cpp](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/qlutgemm.cpp) to **serialize the [GenerateLUT](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L326) loop**. Instead of using `MlasTrySimpleParallel`, we now use a simple `for` loop to process each row of the batch sequentially. 
**Performance Impact:** The [GenerateLUT](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L326) step is computationally lightweight compared to the subsequent [TMACComputeGemm](https://github.com/microsoft/onnxruntime/blob/38dfc91f38fe53da9eaf7e9fb9b158904eb3cd5b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp#L505) matrix multiplication. Serializing this setup step has negligible impact on overall inference latency (micro-benchmarks showed no measurable regression), but effectively eliminates the race condition. ## Verification * **Reproduction:** The issue was reliably reproduced by running `MatMulNBitsLutGemm.Float32_2Bits_Asymmetric_Batch32_256x256` in a loop (failing ~1 in 5 times). * **Verification:** After applying the fix, the same test passed **50/50 iterations** consistently. * **Regression Testing:** Standard `MatMulNBitsLutGemm` tests (including `BlkLen64` and `M=1` cases) continue to pass. --- onnxruntime/core/mlas/lib/qlutgemm.cpp | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/onnxruntime/core/mlas/lib/qlutgemm.cpp b/onnxruntime/core/mlas/lib/qlutgemm.cpp index f029e539f02a1..cb099c2409a44 100644 --- a/onnxruntime/core/mlas/lib/qlutgemm.cpp +++ b/onnxruntime/core/mlas/lib/qlutgemm.cpp @@ -548,32 +548,32 @@ MlasLutGemm( // const int num_groups = static_cast(K / BlkLen); - // Parallelize over M (batch dimension) - // Each iteration processes one row of the activation matrix + // Iterate over M (batch dimension) + // Each iteration processes one row of the activation matrix. + // NOTE: This loop is intentionally serialized. Previous attempts to parallelize + // using MlasTrySimpleParallel caused flaky test failures (race conditions) + // when M > 1 (e.g., Batch32 case). Since GenerateLUT is lightweight, + // serial execution ensures correctness with negligible performance impact. 
// TODO(vraspar): Ideally we have to do block parallelism here - MlasTrySimpleParallel( - threadpool, - static_cast(M), - [&](ptrdiff_t ine11) { - const size_t row_offset = static_cast(ine11) * K; - const size_t lut_offset = static_cast(ine11) * K * 4; // 4 bytes per K element for 2-bit LUT - const size_t scale_bias_offset = static_cast(ine11) * lut_scales_size; - - // Call the dispatch function for this row - // ggml_tmac_mul_mat_task_init - Dispatch->GenerateLUT( - const_cast(a_float + row_offset), // Input activation for this row - qlut + lut_offset, // Output LUT for this row - lut_scales + scale_bias_offset, // Scales for this row - lut_biases + scale_bias_offset, // Biases for this row - M, - K, - N, - tmac_params.act_group_size - ); - } - ); + for (size_t ine11 = 0; ine11 < static_cast(M); ine11++) { + const size_t row_offset = ine11 * K; + const size_t lut_offset = ine11 * K * 4; // 4 bytes per K element for 2-bit LUT + const size_t scale_bias_offset = ine11 * lut_scales_size; + + // Call the dispatch function for this row + // ggml_tmac_mul_mat_task_init + Dispatch->GenerateLUT( + const_cast(a_float + row_offset), // Input activation for this row + qlut + lut_offset, // Output LUT for this row + lut_scales + scale_bias_offset, // Scales for this row + lut_biases + scale_bias_offset, // Biases for this row + M, + K, + N, + tmac_params.act_group_size + ); + } // all relevant LUT's have been generated // equivalent of lut_mul_mat's ggml_backend_tmac_mul_mat function ggml_barrier line From c29365a3a613db8816f1ddc079102ca83f92c9bb Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 28 Jan 2026 12:43:39 -0800 Subject: [PATCH 2/4] remove coloredlogs (#27135) See related issues: https://github.com/microsoft/onnxruntime/issues/26889 --- dockerfiles/Dockerfile.source | 2 +- docs/python/requirements.txt | 1 - onnxruntime/python/tools/tensorrt/perf/benchmark.py | 10 +++++----- .../python/tools/tensorrt/perf/benchmark_wrapper.py | 1 - 
onnxruntime/python/tools/tensorrt/perf/perf_utils.py | 2 -- .../python/tools/tensorrt/perf/requirements.txt | 1 - .../python/tools/transformers/benchmark_helper.py | 9 ++++----- .../tools/transformers/convert_to_packing_mode.py | 9 ++++----- .../models/longformer/benchmark_longformer.py | 2 +- .../transformers/models/longformer/convert_to_onnx.py | 2 +- .../transformers/models/stable_diffusion/benchmark.py | 5 ++--- .../models/stable_diffusion/demo_txt2img.py | 5 +++-- .../models/stable_diffusion/demo_txt2img_xl.py | 5 +++-- .../models/stable_diffusion/optimize_pipeline.py | 3 +-- .../stable_diffusion/requirements/requirements.txt | 1 - .../Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb | 4 ++-- .../notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb | 10 +++++----- ...Tensorflow_Tf2onnx_Bert-Squad_OnnxRuntime_CPU.ipynb | 4 ++-- onnxruntime/python/tools/transformers/optimizer.py | 8 +++----- onnxruntime/python/tools/transformers/requirements.txt | 1 - onnxruntime/python/tools/transformers/run_benchmark.sh | 2 +- .../test/python/transformers/test_gpt2_benchmark.py | 3 +-- .../test/python/transformers/test_gpt2_to_onnx.py | 3 +-- requirements.txt | 1 - .../linux-gpu-tensorrt-daily-perf-pipeline.yml | 6 +++--- .../templates/py-package-smoking-test.yml | 2 +- tools/ci_build/github/windows/python/requirements.txt | 1 - .../requirements/transformers-test/requirements.txt | 1 - 28 files changed, 44 insertions(+), 60 deletions(-) diff --git a/dockerfiles/Dockerfile.source b/dockerfiles/Dockerfile.source index ea28e144ee95a..51291e59aa0d5 100644 --- a/dockerfiles/Dockerfile.source +++ b/dockerfiles/Dockerfile.source @@ -16,4 +16,4 @@ RUN cd /code && /bin/bash ./build.sh --allow_running_as_root --skip_submodule_sy FROM mcr.microsoft.com/azurelinux/base/python:3 COPY --from=0 /code/build/Linux/Release/dist /root COPY --from=0 /code/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt -RUN tdnf install -y ca-certificates python3-setuptools python3-wheel python3-pip python3-numpy 
python3-flatbuffers python3-packaging python3-protobuf python3-mpmath python3-sympy && python3 -m pip install coloredlogs humanfriendly && python3 -m pip install --no-index --find-links /root onnxruntime && rm -rf /root/*.whl +RUN tdnf install -y ca-certificates python3-setuptools python3-wheel python3-pip python3-numpy python3-flatbuffers python3-packaging python3-protobuf python3-mpmath python3-sympy && python3 -m pip install humanfriendly && python3 -m pip install --no-index --find-links /root onnxruntime && rm -rf /root/*.whl diff --git a/docs/python/requirements.txt b/docs/python/requirements.txt index 0be11c8760892..0b0e5d464b26e 100644 --- a/docs/python/requirements.txt +++ b/docs/python/requirements.txt @@ -11,7 +11,6 @@ furo pyquickhelper pandas pydot -coloredlogs flatbuffers numpy<2.0.0 packaging diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 66ab0c44f8814..2017cf154f21e 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -12,7 +12,6 @@ import timeit from datetime import datetime -import coloredlogs import numpy as np from perf_utils import ( acl, @@ -2259,12 +2258,13 @@ def parse_arguments(): def setup_logger(verbose): if verbose: - coloredlogs.install( - level="DEBUG", - fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + logging.basicConfig( + level=logging.DEBUG, + format="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + force=True, ) else: - coloredlogs.install(fmt="%(message)s") + logging.basicConfig(format="%(message)s", force=True) logging.getLogger("transformers").setLevel(logging.WARNING) diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py index 204fe61396663..7bfe25b1549cf 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py +++ 
b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py @@ -11,7 +11,6 @@ import pprint import re -import coloredlogs # noqa: F401 from benchmark import * # noqa: F403 from perf_utils import * # noqa: F403 diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py index 8d2f4b07b7984..4b83e1a8fc41f 100644 --- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py +++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py @@ -5,8 +5,6 @@ import subprocess import sys -import coloredlogs # noqa: F401 - debug = False debug_verbose = False diff --git a/onnxruntime/python/tools/tensorrt/perf/requirements.txt b/onnxruntime/python/tools/tensorrt/perf/requirements.txt index 0afbf47e88307..2a4b319cfc57e 100644 --- a/onnxruntime/python/tools/tensorrt/perf/requirements.txt +++ b/onnxruntime/python/tools/tensorrt/perf/requirements.txt @@ -1,4 +1,3 @@ onnxconverter-common onnxmltools pandas -coloredlogs \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index 8055e5e4ae876..56b670e8f2306 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -18,7 +18,6 @@ from time import sleep from typing import Any -import coloredlogs import numpy import torch import transformers @@ -147,12 +146,12 @@ def create_onnxruntime_session( def setup_logger(verbose=True): if verbose: - coloredlogs.install( - level="DEBUG", - fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + logging.basicConfig( + format="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + level=logging.DEBUG, ) else: - coloredlogs.install(fmt="%(message)s") + logging.basicConfig(format="%(message)s", level=logging.INFO) logging.getLogger("transformers").setLevel(logging.WARNING) diff --git a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py 
b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py index 9a6388b3f350d..d8177fcd3cb02 100644 --- a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py +++ b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py @@ -7,7 +7,6 @@ import logging import os -import coloredlogs from constants import ( AttentionInputIDs, AttentionOutputIDs, @@ -358,12 +357,12 @@ def _parse_arguments(): def _setup_logger(verbose): if verbose: - coloredlogs.install( - level="DEBUG", - fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + logging.basicConfig( + format="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s", + level=logging.DEBUG, ) else: - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO) def main(): diff --git a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py index 21848deaf99fe..674dc831d70f9 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py +++ b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py @@ -11,7 +11,7 @@ # conda create -n gpu_env python=3.8 # conda activate gpu_env # pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 -# pip3 install onnx transformers onnxruntime-gpu numpy sympy coloredlogs psutil py3nvml +# pip3 install onnx transformers onnxruntime-gpu numpy sympy psutil py3nvml # python benchmark_longformer.py # # When there is no parameter, pre-defined tests will run on the longformer-base-4096 model. 
diff --git a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py index b80feec892994..513a115352556 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py @@ -18,7 +18,7 @@ # conda create -n longformer python=3.8 # conda activate longformer # python3 -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html -# python3 -m pip install coloredlogs flatbuffers numpy packaging sympy protobuf==3.20.1 onnx==1.12.0 transformers==4.18.0 +# python3 -m pip install flatbuffers numpy packaging sympy protobuf==3.20.1 onnx==1.12.0 transformers==4.18.0 # python3 -m pip install -i https://test.pypi.org/simple/ ort-nightly-gpu # cd ./torch_extensions # rm -rf build diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py index ed2e346972a6c..e90af970032e5 100755 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -5,14 +5,13 @@ import argparse import csv +import logging import os import statistics import sys import time from pathlib import Path -import coloredlogs - # import torch before onnxruntime so that onnxruntime uses the cuDNN in the torch package. 
import torch from benchmark_helper import measure_memory @@ -1332,7 +1331,7 @@ def main(): if version.parse(ort_version) < version.parse("1.16"): raise ValueError("CUDA graph requires ONNX Runtime 1.16 or later") - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO, force=True) memory_monitor_type = "cuda" diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py index a3caba138f44a..d851e785e8d84 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py @@ -20,7 +20,8 @@ # limitations under the License. # -------------------------------------------------------------------------- -import coloredlogs +import logging + from cuda import cudart from demo_utils import ( add_controlnet_arguments, @@ -86,7 +87,7 @@ def run_inference(warmup=False): if __name__ == "__main__": - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO) parser = arg_parser("Options for Stable Diffusion Demo") add_controlnet_arguments(parser) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py index c3e91a405b53f..739f3cb5025e7 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py @@ -20,7 +20,8 @@ # limitations under the License. 
# -------------------------------------------------------------------------- -import coloredlogs +import logging + from cuda import cudart from demo_utils import ( add_controlnet_arguments, @@ -252,7 +253,7 @@ def main(args): if __name__ == "__main__": - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO) parser = arg_parser("Options for Stable Diffusion XL Demo") add_controlnet_arguments(parser) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index 33397cf75e127..25c034f7b70b5 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -23,7 +23,6 @@ import warnings from pathlib import Path -import coloredlogs import onnx from fusion_options import FusionOptions from onnx_model_clip import ClipOnnxModel @@ -587,5 +586,5 @@ def main(argv: list[str] | None = None): if __name__ == "__main__": - coloredlogs.install(fmt="%(funcName)20s: %(message)s") + logging.basicConfig(format="%(funcName)20s: %(message)s", level=logging.INFO) main() diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt index 73929214b22ea..e7852f7478db8 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt @@ -4,7 +4,6 @@ transformers==4.50.0 numpy>=1.24.1 accelerate onnx==1.18.0 -coloredlogs packaging # Use newer version of protobuf might cause crash protobuf==4.25.8 diff --git a/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb 
b/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb index 5e81e754e1109..6603c9c387517 100644 --- a/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb @@ -52,7 +52,7 @@ "else:\n", " !{sys.executable} -m pip install install torch --index-url https://download.pytorch.org/whl/cpu -q\n", "\n", - "!{sys.executable} -m pip install onnxruntime transformers==4.18 onnx psutil pandas py-cpuinfo py3nvml netron coloredlogs --no-warn-script-location -q" + "!{sys.executable} -m pip install onnxruntime transformers==4.18 onnx psutil pandas py-cpuinfo py3nvml netron --no-warn-script-location -q" ] }, { @@ -719,4 +719,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb index 7295ae1436c99..76458ca3220c9 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb @@ -59,7 +59,7 @@ "\n", "if sys.platform in ['linux', 'win32']: # Linux or Windows\n", " !{sys.executable} -m pip install torch --index-url https://download.pytorch.org/whl/cu118 -q\n", - " !{sys.executable} -m pip install onnxruntime-gpu onnx transformers psutil pandas py-cpuinfo py3nvml coloredlogs wget netron sympy protobuf==3.20.3 -q\n", + " !{sys.executable} -m pip install onnxruntime-gpu onnx transformers psutil pandas py-cpuinfo py3nvml wget netron sympy protobuf==3.20.3 -q\n", "else: # Mac\n", " print(\"CUDA is not available on MacOS\")" ] @@ -196,9 +196,9 @@ "Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing 
BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n", "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:02<00:00, 16.27it/s]\n", - "convert squad examples to features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 256.11it/s]\n", - "add example index and unique id: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00= 1.8 numpy >= 1.19.0 -coloredlogs psutil py-cpuinfo py3nvml diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh index 25997f40d348f..c16d60d0d5046 100755 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -95,7 +95,7 @@ if [ "$run_install" = true ] ; then else pip install onnxruntime-gpu fi - pip install --upgrade onnx coloredlogs packaging psutil py3nvml numpy transformers sympy + pip install --upgrade onnx packaging psutil py3nvml numpy transformers sympy fi if [ "$use_package" = true ] ; then diff --git a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py 
index 2d9bc035fe4fd..40be872250f1a 100644 --- a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py +++ b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py @@ -9,7 +9,6 @@ import os import unittest -import coloredlogs import pytest from parity_utilities import find_transformers_source @@ -50,6 +49,6 @@ def test_gpt2_int8(self): if __name__ == "__main__": - coloredlogs.install(fmt="%(message)s") + logging.basicConfig(format="%(message)s") logging.getLogger("transformers").setLevel(logging.ERROR) unittest.main() diff --git a/onnxruntime/test/python/transformers/test_gpt2_to_onnx.py b/onnxruntime/test/python/transformers/test_gpt2_to_onnx.py index e179d3d087120..bda99abbb7287 100644 --- a/onnxruntime/test/python/transformers/test_gpt2_to_onnx.py +++ b/onnxruntime/test/python/transformers/test_gpt2_to_onnx.py @@ -7,7 +7,6 @@ import logging import unittest -import coloredlogs import pytest from parity_utilities import find_transformers_source @@ -58,6 +57,6 @@ def test_auto_mixed_precision(self): if __name__ == "__main__": - coloredlogs.install(fmt="%(message)s") + logging.basicConfig(format="%(message)s") logging.getLogger("transformers").setLevel(logging.ERROR) unittest.main() diff --git a/requirements.txt b/requirements.txt index 2fd9362c949dd..ff8cc04d6f219 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -coloredlogs flatbuffers numpy >= 1.21.6 packaging diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index c00cbb06f26fd..4bfb9c630fede 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -142,7 +142,7 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/' condition: always() - - script: 'python3 -m pip install pandas 
azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs' + - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas]' displayName: 'Install dashboard dependencies' - script: | @@ -165,7 +165,7 @@ jobs: - ${{ if eq(parameters.PostToDashboard, true) }}: - - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs' + - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas]' displayName: 'Install dashboard dependencies' - script: | @@ -191,4 +191,4 @@ jobs: pathtoPublish: '$(Build.SourcesDirectory)/Artifact' artifactName: 'result-$(Build.BuildNumber)' - - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file + - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml index 9b15f389e5349..4ec074055fcc2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml @@ -54,7 +54,7 @@ jobs: FILE_NAME="${files[0]}" FILE_NAME=$(basename $FILE_NAME) PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-') - python3 -m pip install coloredlogs flatbuffers numpy packaging protobuf sympy + python3 -m pip install flatbuffers numpy packaging protobuf sympy python3 -m pip install --no-index --find-links . 
$PYTHON_PACKAGE_NAME python3 -m pip show $PYTHON_PACKAGE_NAME python3 -c "import onnxruntime as ort; print(ort.__version__)" diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 4e24bf7cbfa97..a86eef170bc25 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -14,5 +14,4 @@ jinja2 markupsafe semver packaging -coloredlogs onnx==1.20.1; python_version < "3.14" diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index e95509c7ddec3..1523b420bfdbd 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -6,7 +6,6 @@ numpy==2.2.6; python_version < "3.14" numpy==2.3.2; python_version >= "3.14" torch==2.8.0 torchvision==0.23.0 -coloredlogs==15.0 transformers==4.52.1 parameterized>=0.8.1 sentencepiece From f0d7b3aa32087727489a3535f53420e8acc746fe Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 28 Jan 2026 14:54:01 -0800 Subject: [PATCH 3/4] Add API GetTensorElementTypeAndShapeDataReference (#27175) Adds C/C++ API named `GetTensorElementTypeAndShapeDataReference` that returns an OrtValue tensor's shape and type without allocating a new buffer for the shape data. This new API function can be used instead of `OrtApi::GetTypeInfo()` or `OrtApi::GetTensorTypeAndShape` to decrease the number of heap allocations and thus improve inference latency for plugin EPs kernels that frequently retrieve tensor shapes during inference. 
(e.g., WebGPU plugin EP) --- .../core/session/onnxruntime_c_api.h | 25 ++++++ .../core/session/onnxruntime_cxx_api.h | 13 +++ .../core/session/onnxruntime_cxx_inline.h | 7 ++ .../core/framework/tensor_type_and_shape.cc | 58 ++++++++++++ onnxruntime/core/session/onnxruntime_c_api.cc | 3 + onnxruntime/core/session/ort_apis.h | 5 ++ onnxruntime/test/shared_lib/test_inference.cc | 88 +++++++++++++++++++ 7 files changed, 199 insertions(+) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index cd49d6d9573bc..6ae1539d4c294 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -7195,6 +7195,31 @@ struct OrtApi { * \since 1.24 */ ORT_API_T(void, RunOptionsSetSyncStream, _Inout_ OrtRunOptions* options, _In_ OrtSyncStream* sync_stream); + + /** \brief Get the element data type and shape for an OrtValue that represents a Tensor (scalar, dense, or sparse). + * + * \note This function is an alternative to ::GetTensorTypeAndShape() that does not allocate a new array for + * the shape data. The OrtValue instance's internal shape data is returned directly. + * + * \note Returns an error if the underlying OrtValue is not a Tensor. + * + * \param[in] value The OrtValue instance. + * \param[out] elem_type Output parameter set to the tensor element data type. + * \param[out] shape_data Output parameter set to the OrtValue instance's internal shape data array. + * For a scalar, `shape_data` is NULL and `shape_data_count` is 0. + * Must not be released as it is owned by the OrtValue instance. This pointer becomes invalid + * when the OrtValue is released or if the underlying shape data is updated or reallocated. + * \param[out] shape_data_count Output parameter set to the number of elements in `shape_data`. + * `shape_data_count` is 0 for a scalar. + * + * \snippet{doc} snippets.dox OrtStatus Return Value + * + * \since Version 1.24. 
+ */ + ORT_API2_STATUS(GetTensorElementTypeAndShapeDataReference, _In_ const OrtValue* value, + _Out_ ONNXTensorElementDataType* elem_type, + _Outptr_result_maybenull_ const int64_t** shape_data, + _Out_ size_t* shape_data_count); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 0dbf206bea992..5cf8cf88bb054 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2220,6 +2220,19 @@ struct ConstValueImpl : Base { const R* GetSparseTensorValues() const; #endif + + /// + /// Returns the tensor's element type and a reference to the tensor's internal shape data. The shape data is owned + /// by the Ort::Value and becomes invalid when the Ort::Value is destroyed or if the underlying shape data is + /// updated or reallocated. + /// + /// For a scalar, shape.shape is nullptr and shape.shape_len is 0. + /// + /// Wraps OrtApi::GetTensorElementTypeAndShapeDataReference. + /// + /// Output parameter set to the element's data type. + /// Output parameter set to the OrtValue instance's shape data and number of elements. 
+ void GetTensorElementTypeAndShapeDataReference(ONNXTensorElementDataType& elem_type, Shape& shape) const; }; template diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 9d95c6a880467..1a3e49130a1d1 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -2377,6 +2377,13 @@ inline const R* ConstValueImpl::GetSparseTensorValues() const { #endif +template +void ConstValueImpl::GetTensorElementTypeAndShapeDataReference(ONNXTensorElementDataType& elem_type, + Shape& shape) const { + ThrowOnError(GetApi().GetTensorElementTypeAndShapeDataReference(this->p_, &elem_type, &shape.shape, + &shape.shape_len)); +} + template void ValueImpl::FillStringTensor(const char* const* s, size_t s_len) { ThrowOnError(GetApi().FillStringTensor(this->p_, s, s_len)); diff --git a/onnxruntime/core/framework/tensor_type_and_shape.cc b/onnxruntime/core/framework/tensor_type_and_shape.cc index 0bac24a2c3aa0..16817ba1707bd 100644 --- a/onnxruntime/core/framework/tensor_type_and_shape.cc +++ b/onnxruntime/core/framework/tensor_type_and_shape.cc @@ -310,6 +310,64 @@ std::unique_ptr OrtTensorTypeAndShapeInfo::GetTensorS return GetTensorShapeAndTypeHelper(type, shape, dim_params); } +ORT_API_STATUS_IMPL(OrtApis::GetTensorElementTypeAndShapeDataReference, _In_ const OrtValue* value, + _Out_ ONNXTensorElementDataType* elem_type, + _Outptr_result_maybenull_ const int64_t** shape_data, + _Out_ size_t* shape_data_count) { + API_IMPL_BEGIN + if (!value->IsAllocated() || (!value->IsTensor() && !value->IsSparseTensor())) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "Input parameter `value` must contain a constructed tensor or sparse tensor"); + } + + if (elem_type == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "Output parameter `elem_type` must not be NULL"); + } + + if (shape_data == nullptr) { + return 
OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "Output parameter `shape_data` must not be NULL"); + } + + if (shape_data_count == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "Output parameter `shape_data_count` must not be NULL"); + } + + gsl::span shape_span; + onnxruntime::MLDataType ml_data_type = nullptr; + ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + + if (value->IsTensor()) { + const Tensor& tensor = value->Get(); + ml_data_type = tensor.DataType(); + shape_span = tensor.Shape().GetDims(); + } else { +#if !defined(DISABLE_SPARSE_TENSORS) + const SparseTensor& tensor = value->Get(); + ml_data_type = tensor.DataType(); + shape_span = tensor.DenseShape().GetDims(); +#else + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "SparseTensor is not supported in this build."); +#endif + } + + if (ml_data_type != nullptr) { + type = MLDataTypeToOnnxRuntimeTensorElementDataType(ml_data_type); + } + + if (type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED) { + return OrtApis::CreateStatus(ORT_FAIL, "Tensor does not have a valid or supported tensor element data type"); + } + + *elem_type = type; + *shape_data = shape_span.empty() ? 
nullptr : shape_span.data(); + *shape_data_count = shape_span.size(); + return nullptr; + API_IMPL_END +} + ORT_API_STATUS_IMPL(OrtApis::GetTensorTypeAndShape, _In_ const OrtValue* v, _Outptr_ OrtTensorTypeAndShapeInfo** out) { API_IMPL_BEGIN diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 0e60a7c5c39ae..1b40ab5edfff8 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -4802,6 +4802,8 @@ static constexpr OrtApi ort_api_1_to_24 = { &OrtApis::EpAssignedNode_GetDomain, &OrtApis::EpAssignedNode_GetOperatorType, &OrtApis::RunOptionsSetSyncStream, + &OrtApis::GetTensorElementTypeAndShapeDataReference, + // End of Version 24 - DO NOT MODIFY ABOVE (see above text for more information) }; // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase. @@ -4838,6 +4840,7 @@ static_assert(offsetof(OrtApi, SetEpDynamicOptions) / sizeof(void*) == 284, "Siz static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 317, "Size of version 22 API cannot change"); static_assert(offsetof(OrtApi, CreateExternalInitializerInfo) / sizeof(void*) == 389, "Size of version 23 API cannot change"); +static_assert(offsetof(OrtApi, GetTensorElementTypeAndShapeDataReference) / sizeof(void*) == 414, "Size of version 24 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: static_assert(std::string_view(ORT_VERSION) == "1.24.0", diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index efea582e3f798..ab3dd45629777 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -808,4 +808,9 @@ ORT_API_STATUS_IMPL(EpAssignedSubgraph_GetNodes, _In_ const OrtEpAssignedSubgrap ORT_API_STATUS_IMPL(EpAssignedNode_GetName, _In_ const OrtEpAssignedNode* ep_node, _Outptr_ const char** out); 
ORT_API_STATUS_IMPL(EpAssignedNode_GetDomain, _In_ const OrtEpAssignedNode* ep_node, _Outptr_ const char** out); ORT_API_STATUS_IMPL(EpAssignedNode_GetOperatorType, _In_ const OrtEpAssignedNode* ep_node, _Outptr_ const char** out); + +ORT_API_STATUS_IMPL(GetTensorElementTypeAndShapeDataReference, _In_ const OrtValue* value, + _Out_ ONNXTensorElementDataType* elem_type, + _Outptr_result_maybenull_ const int64_t** shape_data, + _Out_ size_t* shape_data_count); } // namespace OrtApis diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index a96a2c48b4ca6..4e991716dd108 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -480,6 +480,94 @@ TEST(CApiTest, dim_param) { ASSERT_EQ(strcmp(dim_param, ""), 0); } +// Tests calling OrtApi::GetTensorElementTypeAndShapeDataReference for a dense OrtValue tensor. +TEST(CApiTest, Value_GetTensorElementTypeAndShapeDataReference_DenseTensor) { + Ort::MemoryInfo info_cpu = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemTypeDefault); + + const std::array x_shape = {3, 2}; + std::array x_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + Ort::Value x_value = Ort::Value::CreateTensor(info_cpu, x_values.data(), x_values.size(), + x_shape.data(), x_shape.size()); + Ort::TensorTypeAndShapeInfo type_shape_info = x_value.GetTensorTypeAndShapeInfo(); + + ONNXTensorElementDataType elem_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + Ort::Value::Shape shape{}; + x_value.GetTensorElementTypeAndShapeDataReference(elem_type, shape); + + ASSERT_EQ(elem_type, type_shape_info.GetElementType()); + + std::vector expected_shape = type_shape_info.GetShape(); + gsl::span actual_shape(shape.shape, shape.shape_len); + ASSERT_EQ(actual_shape, gsl::span(expected_shape)); +} + +// Tests calling OrtApi::GetTensorElementTypeAndShapeDataReference for a scalar OrtValue tensor. 
+TEST(CApiTest, Value_GetTensorElementTypeAndShapeDataReference_Scalar) { + Ort::MemoryInfo info_cpu = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemTypeDefault); + + std::vector x_shape = {}; // Scalar (no shape) + std::array x_values = {1.0f}; + Ort::Value x_value = Ort::Value::CreateTensor(info_cpu, x_values.data(), x_values.size(), + x_shape.data(), x_shape.size()); + Ort::TensorTypeAndShapeInfo type_shape_info = x_value.GetTensorTypeAndShapeInfo(); + + ONNXTensorElementDataType elem_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + Ort::Value::Shape shape{}; + x_value.GetTensorElementTypeAndShapeDataReference(elem_type, shape); + + ASSERT_EQ(elem_type, type_shape_info.GetElementType()); + + std::vector expected_shape = type_shape_info.GetShape(); + gsl::span actual_shape(shape.shape, shape.shape_len); + ASSERT_EQ(actual_shape, gsl::span(expected_shape)); + ASSERT_EQ(shape.shape, nullptr); + ASSERT_EQ(shape.shape_len, 0); +} + +#if !defined(DISABLE_SPARSE_TENSORS) +// Tests calling OrtApi::GetTensorElementTypeAndShapeDataReference for a sparse OrtValue tensor. 
+TEST(CApiTest, Value_GetTensorElementTypeAndShapeDataReference_SparseTensor) { + std::vector common_shape{9, 9}; + std::vector A_values{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, + 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, + 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, + 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0}; + + // 2 - D index + std::vector indices_shape{gsl::narrow(A_values.size()), 2}; + std::vector A_indices{0, 1, 0, 2, 0, 6, 0, 7, 0, 8, 1, 0, 1, + 1, 1, 2, 1, 6, 1, 7, 1, 8, 2, 0, 2, 1, + 2, 2, 2, 6, 2, 7, 2, 8, 3, 3, 3, 4, 3, + 5, 3, 6, 3, 7, 3, 8, 4, 3, 4, 4, 4, 5, + 4, 6, 4, 7, 4, 8, 5, 3, 5, 4, 5, 5, 5, + 6, 5, 7, 5, 8, 6, 0, 6, 1, 6, 2, 6, 3, + 6, 4, 6, 5, 7, 0, 7, 1, 7, 2, 7, 3, 7, + 4, 7, 5, 8, 0, 8, 1, 8, 2, 8, 3, 8, 4, + 8, 5}; + + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + Ort::Value::Shape ort_dense_shape{common_shape.data(), common_shape.size()}; + Ort::Value::Shape ort_values_shape{&indices_shape[0], 1U}; + auto value_sparse = Ort::Value::CreateSparseTensor(info, A_values.data(), ort_dense_shape, ort_values_shape); + value_sparse.UseCooIndices(A_indices.data(), A_indices.size()); + + Ort::TensorTypeAndShapeInfo type_shape_info = value_sparse.GetTensorTypeAndShapeInfo(); + + ONNXTensorElementDataType elem_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + Ort::Value::Shape shape{}; + value_sparse.GetTensorElementTypeAndShapeDataReference(elem_type, shape); + + ASSERT_EQ(elem_type, type_shape_info.GetElementType()); + + std::vector expected_shape = type_shape_info.GetShape(); + gsl::span actual_shape(shape.shape, shape.shape_len); + ASSERT_EQ(actual_shape, gsl::span(expected_shape)); +} +#endif // !defined(DISABLE_SPARSE_TENSORS) + static std::pair LoadAndGetInputShapePresent(const ORTCHAR_T* const model_url) { Ort::Session session(*ort_env, model_url, Ort::SessionOptions{}); 
const auto input_num = session.GetInputCount(); From 061b03e788f5d8401388db4d440d9b1ccbfc7caa Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 28 Jan 2026 23:27:43 -0800 Subject: [PATCH 4/4] [MLAS] Fix Flaky LuT GEMM Tests by Replacing Gather with Shuffle (#27174) ## Problem Description The `MatMulNBitsLutGemm` test suite, specifically `Float32_2Bits_Symmetric_256x256_BlkLen64`, was exhibiting intermittent failures (flakiness). The failure manifested as numerical mismatches exceeding the tolerance, suggesting non-deterministic behavior in the kernel execution. ## Root Cause Analysis The issue was traced to the usage of `_mm256_i32gather_ps` in sqnbitgemm_lut_kernel_avx2.cpp. While the gather indices were technically calculating addresses within the bounds of the allocated buffer, gather instructions on certain AVX2 hardware implementations can exhibit non-deterministic behavior or subtle performance/prefetching artifacts when operating on specific stride patterns (in this case, gathering with a stride of 4 floats).
### Performance Micro-benchmark on MatMulNBitsLutGemm (256x256, BlkLen=64). Original (Gather): ~55.55 us Fixed (Load+Shuffle): ~57.79 us Delta: +2.24 us (~4% slower) The slight performance regression is expected because replacing a single hardware gather instruction with a sequence of loadu, unpack, and permute instructions adds instruction count overhead. However, this is a necessary tradeoff to ensure deterministic behavior and memory safety across all AVX2 implementations. ## Verification * **Tests**: All 9 tests in `MatMulNBitsLutGemm` passed successfully (including the previously flaky `BlkLen64` case). --- .../mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp | 54 ++++++++++++++----- .../test/contrib_ops/matmul_2bits_test.cc | 6 ++- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp index b54f051ca1504..a89993d4515b8 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp @@ -187,21 +187,53 @@ get_bias_scale() return 3; } +static inline void +MlasAvx2LoaduDeinterleave32Ps(const float* src, __m256& v0, __m256& v1, __m256& v2, __m256& v3) +{ + // Process 32 activations contiguously using loadu + shuffle. + // This allows us to mix neighbors (src[4i], src[4i+1], src[4i+2], src[4i+3]) across lanes, + // which matches the T-MAC weight packing. + // We use loadu + shuffle instead of gather to avoid potential issues with gather + // on some hardware and ensure deterministic behavior. 
+ __m256 vec_b0 = _mm256_loadu_ps(src + 0); + __m256 vec_b1 = _mm256_loadu_ps(src + 8); + __m256 vec_b2 = _mm256_loadu_ps(src + 16); + __m256 vec_b3 = _mm256_loadu_ps(src + 24); + + __m256 t0 = _mm256_unpacklo_ps(vec_b0, vec_b1); + __m256 t1 = _mm256_unpackhi_ps(vec_b0, vec_b1); + __m256 t2 = _mm256_unpacklo_ps(vec_b2, vec_b3); + __m256 t3 = _mm256_unpackhi_ps(vec_b2, vec_b3); + + __m256 u0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t2))); + __m256 u1 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t2))); + __m256 u2 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t1), _mm256_castps_pd(t3))); + __m256 u3 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t1), _mm256_castps_pd(t3))); + + const __m256i perm_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + v0 = _mm256_permutevar8x32_ps(u0, perm_idx); + v1 = _mm256_permutevar8x32_ps(u1, perm_idx); + v2 = _mm256_permutevar8x32_ps(u2, perm_idx); + v3 = _mm256_permutevar8x32_ps(u3, perm_idx); +} + void partial_max_g4_int8_k8(float* lut_scales, const float* b) { - // TODO(vraspar): add support for arm neon - const __m256i vec_bi = _mm256_set_epi32(112, 96, 80, 64, 48, 32, 16, 0); - __m256 vec_b0 = _mm256_i32gather_ps(b + 0, vec_bi, 1); - __m256 vec_b1 = _mm256_i32gather_ps(b + 1, vec_bi, 1); - __m256 vec_b2 = _mm256_i32gather_ps(b + 2, vec_bi, 1); - __m256 vec_b3 = _mm256_i32gather_ps(b + 3, vec_bi, 1); + __m256 vec_b0, vec_b1, vec_b2, vec_b3; + MlasAvx2LoaduDeinterleave32Ps(b, vec_b0, vec_b1, vec_b2, vec_b3); + const __m256 vec_sign = _mm256_set1_ps(-0.0f); __m256 vec_babs0 = _mm256_andnot_ps(vec_sign, vec_b0); __m256 vec_babs1 = _mm256_andnot_ps(vec_sign, vec_b1); __m256 vec_babs2 = _mm256_andnot_ps(vec_sign, vec_b2); __m256 vec_babs3 = _mm256_andnot_ps(vec_sign, vec_b3); + + // The upper bound for the LUT values (mixtures of 4 activations) is the sum + // of their absolute values. 
__m256 abssum = _mm256_add_ps(_mm256_add_ps(vec_babs0, vec_babs1), _mm256_add_ps(vec_babs2, vec_babs3)); + + // Reduce max across lanes to find the global maximum sum in this chunk. __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(abssum, 1), _mm256_castps256_ps128(abssum)); max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4)); max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4)); @@ -222,16 +254,14 @@ lut_ctor_g4_int8_impl( ) { __m256 vec_lut[16]; - float biases = 0.0; - const __m256i vec_bi = _mm256_set_epi32(112, 96, 80, 64, 48, 32, 16, 0); + float biases = 0.0f; float scales = *lut_scales; float t_scales = scales ? 1.0f / scales : 0.0f; for (int k = 0; k < act_k / 32; ++k) { - __m256 vec_b0 = _mm256_i32gather_ps(b + k * 32 + 0, vec_bi, 1); - __m256 vec_b1 = _mm256_i32gather_ps(b + k * 32 + 1, vec_bi, 1); - __m256 vec_b2 = _mm256_i32gather_ps(b + k * 32 + 2, vec_bi, 1); - __m256 vec_b3 = _mm256_i32gather_ps(b + k * 32 + 3, vec_bi, 1); + const float* b_chunk = b + k * 32; + __m256 vec_b0, vec_b1, vec_b2, vec_b3; + MlasAvx2LoaduDeinterleave32Ps(b_chunk, vec_b0, vec_b1, vec_b2, vec_b3); PRAGMA_UNROLL for (int g = 1; g < 16; g += 2) { diff --git a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc index 853458312cd1f..3d5e3e5f360b4 100644 --- a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc @@ -371,8 +371,10 @@ TEST(MatMulNBitsLutGemm, Float32_2Bits_Symmetric_256x256) { TestMatMul2BitsLutGemm(1, 256, 256, 32, false); } -// TODO: Re-enable once LUT GEMM asymmetric quantization accuracy issue is resolved -TEST(MatMulNBitsLutGemm, DISABLED_Float32_2Bits_Asymmetric_256x256) { +// This test was previously disabled due to accuracy issues related to non-deterministic +// gather operations. It is now re-enabled after replacing gather with deterministic +// load+shuffle operations to improve determinism and stability. 
+TEST(MatMulNBitsLutGemm, Float32_2Bits_Asymmetric_256x256) { TestMatMul2BitsLutGemm(1, 256, 256, 32, true); }