From 44e5ca6b24ecce2c64ac4fbfddcff2e50d7b3d39 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 17 Oct 2024 01:16:42 -0500
Subject: [PATCH] Docs: LUT instruction latencies on x86
---
README.md | 4 ++--
include/stringzilla/stringzilla.h | 12 ++++++++++--
2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index 821be609..dbfd3f9b 100644
--- a/README.md
+++ b/README.md
@@ -171,7 +171,7 @@ __Who is this for?__
arm: 9.4 MB/s
- uniform_int_distribution
+ std::uniform_int_distribution
x86: 47.2 ·
arm: 20.4 MB/s
|
@@ -193,7 +193,7 @@ __Who is this for?__
⚪ |
- transform
+ std::transform
x86: 3.81 ·
arm: 2.65 GB/s
|
diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h
index fbfbf28b..8176ee13 100644
--- a/include/stringzilla/stringzilla.h
+++ b/include/stringzilla/stringzilla.h
@@ -5323,8 +5323,16 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s
// operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls.
// Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards.
//
- // - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel.
- // - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512.
+ // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)":
+ // - On Ice Lake: 3 cycles latency, ports: 1*p5
+ // - On Genoa: 6 cycles latency, ports: 1*FP12
+ // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB (ZMM, K, ZMM, ZMM)":
+ // - On Ice Lake: 3 cycles latency, ports: 1*p05
+ // - On Genoa: 1 cycle latency, ports: 1*FP0123
+ // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)":
+ // - On Ice Lake: 3 cycles latency, ports: 1*p5
+ // - On Genoa: 4 cycles latency, ports: 1*FP01
+ //
sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut));
lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64));