Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…into main-dev
  • Loading branch information
ashvardanian committed Oct 17, 2024
2 parents fb55d54 + 1891dbf commit 19e6998
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 4 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,11 @@ if(${STRINGZILLA_BUILD_SHARED})
target_compile_definitions(stringzillite PRIVATE "SZ_AVOID_LIBC=1")
target_compile_definitions(stringzillite PRIVATE "SZ_OVERRIDE_LIBC=1")

if (MSVC)
target_link_libraries(stringzilla_shared PRIVATE msvcrt.lib)
target_link_libraries(stringzillite PRIVATE msvcrt.lib)
endif()

# Avoid built-ins on MSVC and other compilers, as that will cause compilation errors
target_compile_options(stringzillite PRIVATE
"$<$<CXX_COMPILER_ID:GNU,Clang>:-fno-builtin;-nostdlib>"
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ __Who is this for?__
<span style="color:#ABABAB;">arm:</span> <b>9.4</b> MB/s
</td>
<td align="center">
<code>uniform_int_distribution</code><br/>
<code>std::uniform_int_distribution</code><br/>
<span style="color:#ABABAB;">x86:</span> <b>47.2</b> &centerdot;
<span style="color:#ABABAB;">arm:</span> <b>20.4</b> MB/s
</td>
Expand All @@ -193,7 +193,7 @@ __Who is this for?__
<tr>
<td align="center">⚪</td>
<td align="center">
<code>transform</code><br/>
<code>std::transform</code><br/>
<span style="color:#ABABAB;">x86:</span> <b>3.81</b> &centerdot;
<span style="color:#ABABAB;">arm:</span> <b>2.65</b> GB/s
</td>
Expand Down
12 changes: 10 additions & 2 deletions include/stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -5323,8 +5323,16 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s
// operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls.
// The results are then combined with 2x `_mm512_test_epi8_mask` and 3x blends.
//
// - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel.
// - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512.
// - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p5
// - On Genoa: 6 cycles latency, ports: 1*FP12
// - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p05
// - On Genoa: 1 cycle latency, ports: 1*FP0123
// - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p5
// - On Genoa: 4 cycles latency, ports: 1*FP01
//
sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut));
lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64));
Expand Down

0 comments on commit 19e6998

Please sign in to comment.