From 44e5ca6b24ecce2c64ac4fbfddcff2e50d7b3d39 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 17 Oct 2024 01:16:42 -0500 Subject: [PATCH] Docs: LUT instruction latencies on x86 --- README.md | 4 ++-- include/stringzilla/stringzilla.h | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 821be609..dbfd3f9b 100644 --- a/README.md +++ b/README.md @@ -171,7 +171,7 @@ __Who is this for?__ arm: 9.4 MB/s - uniform_int_distribution
+ std::uniform_int_distribution
x86: 47.2 · arm: 20.4 MB/s @@ -193,7 +193,7 @@ __Who is this for?__ ⚪ - transform
+ std::transform
x86: 3.81 · arm: 2.65 GB/s diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index fbfbf28b..8176ee13 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -5323,8 +5323,16 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls. // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards. // - // - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel. - // - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512. + // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)": + // - On Ice Lake: 3 cycles latency, ports: 1*p5 + // - On Genoa: 6 cycles latency, ports: 1*FP12 + // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)": + // - On Ice Lake: 3 cycles latency, ports: 1*p05 + // - On Genoa: 1 cycle latency, ports: 1*FP0123 + // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)": + // - On Ice Lake: 3 cycles latency, ports: 1*p5 + // - On Genoa: 4 cycles latency, ports: 1*FP01 + // sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut)); lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64));