From dd6024dbb5ea1dfd65d7772130969b9a6e2ef4b3 Mon Sep 17 00:00:00 2001
From: Herman Semenoff
Date: Mon, 1 Dec 2025 11:57:04 +0300
Subject: [PATCH] ggml-quants: use _mm256_testz_si256 for mask checks in AVX2

`_mm256_testz_si256` directly checks whether all bits of a vector are
zero, which is a more efficient approach for conditional branching than
extracting a 32-bit byte mask with `_mm256_movemask_epi8` and then
checking whether that mask is non-zero. This leverages the AVX `vptest`
instruction, potentially reducing latency and improving overall
performance by avoiding an unnecessary vector-to-general-purpose
register transfer for the mask.

References:

* When to use _mm256_testz_si256 vs _mm256_movemask_epi8:
  https://stackoverflow.com/questions/27643534/when-to-use-mm256-testz-si256-vs-mm256-movemask-epi8
* AVX2: _mm256_testz_si256 vs _mm256_cmpeq_epi32 and _mm256_movemask_epi8:
  https://stackoverflow.com/questions/43206253/avx2-mm256-testz-si256-vs-mm256-cmpeq-epi32-and-mm256-movemask-epi8
* Intel Intrinsics Guide for `_mm256_testz_si256`:
  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256
* Intel Intrinsics Guide for `_mm256_movemask_epi8`:
  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8
* Efficiently checking for zero vectors with AVX2:
  https://lemire.me/blog/2018/06/18/efficiently-checking-for-zero-vectors-with-avx2/
---
 ggml/src/ggml-quants.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index de5cbd75e86..3e7a05498a8 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -5119,8 +5119,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
         __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
         __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
-        int mask = _mm256_movemask_epi8(cmp);
-        if (mask) {
+        if (!_mm256_testz_si256(cmp, cmp)) {
             for (size_t j = 0; j < 16; ++j) {
                 if (!validate_fp16(f[i + j], i + j)) {
                     return false;
@@ -5160,8 +5159,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
         __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
         __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
-        int mask = _mm256_movemask_epi8(cmp);
-        if (mask) {
+        if (!_mm256_testz_si256(cmp, cmp)) {
             for (size_t j = 0; j < 8; ++j) {
                 if (!validate_float(f[i + j], i + j)) {
                     return false;