From dd6024dbb5ea1dfd65d7772130969b9a6e2ef4b3 Mon Sep 17 00:00:00 2001
From: Herman Semenoff
Date: Mon, 1 Dec 2025 11:57:04 +0300
Subject: [PATCH] ggml-quants: use _mm256_testz_si256 for mask checks in AVX2

`_mm256_testz_si256` directly checks whether all bits of a vector are
zero, which is a more efficient approach for conditional branching than
extracting a 32-bit byte mask with `_mm256_movemask_epi8` and then
checking whether that mask is non-zero. This leverages the AVX `vptest`
instruction, potentially reducing latency and improving overall
performance by avoiding an unnecessary vector-to-general-purpose
register transfer for the mask.

References:

* When to use _mm256_testz_si256 vs _mm256_movemask_epi8:
  https://stackoverflow.com/questions/27643534/when-to-use-mm256-testz-si256-vs-mm256-movemask-epi8
* AVX2: _mm256_testz_si256 vs _mm256_cmpeq_epi32 and _mm256_movemask_epi8:
  https://stackoverflow.com/questions/43206253/avx2-mm256-testz-si256-vs-mm256-cmpeq-epi32-and-mm256-movemask-epi8
* Intel Intrinsics Guide for `_mm256_testz_si256`:
  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256
* Intel Intrinsics Guide for `_mm256_movemask_epi8`:
  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8
* Efficiently checking for zero vectors with AVX2:
  https://lemire.me/blog/2018/06/18/efficiently-checking-for-zero-vectors-with-avx2/
---
 ggml/src/ggml-quants.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index de5cbd75e86..3e7a05498a8 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -5119,8 +5119,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
         __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
         __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
-        int mask = _mm256_movemask_epi8(cmp);
-        if (mask) {
+        if (!_mm256_testz_si256(cmp, cmp)) {
             for (size_t j = 0; j < 16; ++j) {
                 if (!validate_fp16(f[i + j], i + j)) {
                     return false;
@@ -5160,8 +5159,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
         __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
         __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
-        int mask = _mm256_movemask_epi8(cmp);
-        if (mask) {
+        if (!_mm256_testz_si256(cmp, cmp)) {
             for (size_t j = 0; j < 8; ++j) {
                 if (!validate_float(f[i + j], i + j)) {
                     return false;