From 44e5ca6b24ecce2c64ac4fbfddcff2e50d7b3d39 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 17 Oct 2024 01:16:42 -0500
Subject: [PATCH] Docs: LUT instruction latencies on x86

---
 README.md                         |  4 ++--
 include/stringzilla/stringzilla.h | 12 ++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index 821be609..dbfd3f9b 100644
--- a/README.md
+++ b/README.md
@@ -171,7 +171,7 @@ __Who is this for?__
       <span style="color:#ABABAB;">arm:</span> <b>9.4</b> MB/s
     </td>
     <td align="center">
-      <code>uniform_int_distribution</code><br/>
+      <code>std::uniform_int_distribution</code><br/>
       <span style="color:#ABABAB;">x86:</span> <b>47.2</b> &centerdot;
       <span style="color:#ABABAB;">arm:</span> <b>20.4</b> MB/s
     </td>
@@ -193,7 +193,7 @@ __Who is this for?__
   <tr>
     <td align="center">⚪</td>
     <td align="center">
-      <code>transform</code><br/>
+      <code>std::transform</code><br/>
       <span style="color:#ABABAB;">x86:</span> <b>3.81</b> &centerdot;
       <span style="color:#ABABAB;">arm:</span> <b>2.65</b> GB/s
     </td>
diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h
index fbfbf28b..8176ee13 100644
--- a/include/stringzilla/stringzilla.h
+++ b/include/stringzilla/stringzilla.h
@@ -5323,8 +5323,16 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s
     // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls.
     // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards.
     //
-    //  - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel.
-    //  - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512.
+    //  - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)":
+    //      - On Ice Lake: 3 cycles latency, ports: 1*p5
+    //      - On Genoa: 6 cycles latency, ports: 1*FP12
+    //  - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)":
+    //      - On Ice Lake: 3 cycles latency, ports: 1*p05
+    //      - On Genoa: 1 cycle latency, ports: 1*FP0123
+    //  - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)":
+    //      - On Ice Lake: 3 cycles latency, ports: 1*p5
+    //      - On Genoa: 4 cycles latency, ports: 1*FP01
+    //
     sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
     lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut));
     lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64));