src/native/containers/dn-simdhash-arch.h (73 changes: 37 additions & 36 deletions)

@@ -9,31 +9,29 @@
 // HACK: for better language server parsing
 #include "dn-simdhash.h"

-// FIXME: Temporary workaround until the NEON path is replaced with an optimized one.
-// See https://github.com/dotnet/runtime/issues/113074
-#ifdef __ARM_ARCH
-#define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
+#if defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#define DN_SIMDHASH_USE_SSE2 1
 #endif

 #if defined(__clang__) || defined (__GNUC__) // use vector intrinsics

 #if defined(__wasm_simd128__)
 #include <wasm_simd128.h>
-#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#elif DN_SIMDHASH_USE_SSE2
 #include <emmintrin.h>
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_ARCH_ISA_A64)
 #include <arm_neon.h>
 #elif defined(__wasm)
 #define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
 #ifdef DN_SIMDHASH_WARNINGS
 #pragma message("WARNING: Building dn_simdhash for WASM without -msimd128! Performance will be terrible!")
 #endif
-#else
+#else // target identification
 #define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
 #ifdef DN_SIMDHASH_WARNINGS
 #pragma message("WARNING: Unsupported architecture for dn_simdhash! Performance will be terrible!")
 #endif
-#endif
+#endif // target identification

 // extract/replace lane opcodes require constant indices on some target architectures,
 // and in some cases it is profitable to do a single-byte memory load/store instead of
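Net effect of this hunk: SSE2 detection is computed once into DN_SIMDHASH_USE_SSE2, and the ARM vector path is narrowed from any __ARM_NEON target to AArch64 only (__ARM_ARCH_ISA_A64); every other target keeps the scalar fallback. A hypothetical standalone probe (not part of the repo; DEMO_PATH is a made-up name) that mirrors the new detection order, useful for checking which path a given toolchain would select:

#include <stdio.h>

#if defined(__wasm_simd128__)
#define DEMO_PATH "wasm simd128"
#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
#define DEMO_PATH "SSE2"
#elif defined(__ARM_ARCH_ISA_A64)
#define DEMO_PATH "NEON (AArch64 only)"
#else
#define DEMO_PATH "scalar fallback"
#endif

int main (void)
{
	puts("dn_simdhash would select: " DEMO_PATH);
	return 0;
}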
@@ -42,7 +40,7 @@
 typedef uint8_t dn_u8x16 __attribute__ ((vector_size (DN_SIMDHASH_VECTOR_WIDTH), aligned(DN_SIMDHASH_VECTOR_WIDTH)));
 typedef union {
 	_Alignas(DN_SIMDHASH_VECTOR_WIDTH) dn_u8x16 vec;
-#if defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#if DN_SIMDHASH_USE_SSE2
 	_Alignas(DN_SIMDHASH_VECTOR_WIDTH) __m128i m128;
 #endif
 	_Alignas(DN_SIMDHASH_VECTOR_WIDTH) uint8_t values[DN_SIMDHASH_VECTOR_WIDTH];
@@ -54,18 +52,24 @@ typedef uint8_t dn_simdhash_search_vector;
 typedef dn_simdhash_suffixes dn_simdhash_search_vector;
 #endif

-// Extracting lanes from a vector register on x86/x64 has horrible latency,
-// so it's better to do regular byte loads from the stack
 #if defined(__wasm_simd128__)
 // For wasm with -msimd128, clang generates truly bizarre load/store code
 // where it does two byte memory loads, then a vector load, then two
 // lane insertions to write the byte loads into the loaded vector
 // before finally passing it to find_first_matching_suffix. So we have to vec[].
 // See https://github.com/llvm/llvm-project/issues/87398#issuecomment-2050696298
 // Also see https://github.com/llvm/llvm-project/issues/88460
 #define dn_simdhash_extract_lane(suffixes, lane) \
 	suffixes.vec[lane]
+#elif defined(__ARM_ARCH_ISA_A64)
+// On Ampere ARM64, lane extracts are a single cheap opcode, and by using lane
+// extracts only, the eager load of the suffixes does a single vector load instead
+// of two 64-bit low/high loads
+#define dn_simdhash_extract_lane(suffixes, lane) \
+	suffixes.vec[lane]
 #else
+// Extracting lanes from a vector register on x86/x64 has horrible latency,
+// so it's better to do regular byte loads from the stack
 #define dn_simdhash_extract_lane(suffixes, lane) \
 	suffixes.values[lane]
 #endif
@@ -79,6 +83,15 @@ ctz (uint32_t value)
 	return (uint32_t)__builtin_ctz(value);
 }

+static DN_FORCEINLINE(uint64_t)
+ctzll (uint64_t value)
+{
+	// __builtin_ctzll is undefined for 0
+	if (value == 0)
+		return 64;
+	return (uint64_t)__builtin_ctzll(value);
+}
+
 static DN_FORCEINLINE(dn_simdhash_search_vector)
 build_search_vector (uint8_t needle)
 {
@@ -103,37 +116,25 @@ find_first_matching_suffix_simd (
 	// Only used by the vectorized implementations; discarded by scalar.
 	dn_simdhash_suffixes haystack
 ) {
-#if defined(__wasm_simd128__)
+#ifdef DN_SIMDHASH_USE_SCALAR_FALLBACK
+	dn_simdhash_assert(!"Scalar fallback should be in use here");
+	return 32;
+#elif defined(__wasm_simd128__)
 	return ctz(wasm_i8x16_bitmask(wasm_i8x16_eq(needle.vec, haystack.vec)));
-#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#elif DN_SIMDHASH_USE_SSE2
 	return ctz(_mm_movemask_epi8(_mm_cmpeq_epi8(needle.m128, haystack.m128)));
-#elif defined(__ARM_NEON)
-	dn_simdhash_assert(!"Scalar fallback should be in use here");
-	return 32;
-	/*
-	dn_simdhash_suffixes match_vector;
-	// Completely untested.
-	static const dn_simdhash_suffixes byte_mask = {
-		{ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }
-	};
-	union {
-		uint8_t b[4];
-		uint32_t u;
-	} msb;
-	match_vector.vec = vceqq_u8(needle.vec, haystack.vec);
-	dn_simdhash_suffixes masked;
-	masked.vec = vandq_u8(match_vector.vec, byte_mask.vec);
-	msb.b[0] = vaddv_u8(vget_low_u8(masked.vec));
-	msb.b[1] = vaddv_u8(vget_high_u8(masked.vec));
-	return ctz(msb.u);
-	*/
+#elif defined(__ARM_ARCH_ISA_A64)
+	// See https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
+	uint16x8_t match_vector16 = vreinterpretq_u16_u8(vceqq_u8(needle.vec, haystack.vec));
+	uint8x8_t match_bits = vshrn_n_u16(match_vector16, 4);
+	uint64_t match_bits_scalar = vget_lane_u64(vreinterpret_u64_u8(match_bits), 0);
+	return ctzll(match_bits_scalar) >> 2;
 #else
-	dn_simdhash_assert(!"Scalar fallback should be in use here");
-	return 32;
+#error "Missing platform implementation of find_first_matching_suffix_simd"
 #endif
 }

-#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#elif DN_SIMDHASH_USE_SSE2
 // neither clang or gcc, but we have SSE2 available, so assume this is MSVC on x86 or x86-64
 // msvc neon intrinsics don't seem to expose a 128-bit wide vector so there's no neon in here
 #include <intrin.h> // for _BitScanForward
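Why the new AArch64 body works: vceqq_u8 sets each matching byte lane to 0xFF, and vshrn_n_u16(..., 4) shifts every 16-bit lane right by four and narrows it to a byte, so the resulting 64-bit scalar carries four bits per original byte lane. ctzll of that mask is therefore 4 * lane for the first match, and the trailing >> 2 converts the bit position back into a lane index. The zero guard added to ctzll above matters here: a bucket with no matching suffix yields an all-zero mask, __builtin_ctzll(0) is undefined, and returning 64 makes the search report lane 16, one past the end. The following scalar model is a sketch of that arithmetic only, with made-up names (model_vshrn_mask and friends are not from the PR), assuming a GCC/Clang-style __builtin_ctzll:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Models vceqq_u8 + vshrn_n_u16(..., 4): byte lane i of the comparison
// owns bits [4*i, 4*i + 4) of the 64-bit mask.
static uint64_t
model_vshrn_mask (const uint8_t needle[16], const uint8_t haystack[16])
{
	uint64_t mask = 0;
	for (int lane = 0; lane < 16; lane++)
		if (needle[lane] == haystack[lane])
			mask |= 0xFull << (4 * lane);
	return mask;
}

static uint32_t
model_find_first_match (const uint8_t needle[16], const uint8_t haystack[16])
{
	uint64_t mask = model_vshrn_mask(needle, haystack);
	if (mask == 0)
		return 16; // mirrors ctzll(0) == 64 followed by 64 >> 2
	return (uint32_t)(__builtin_ctzll(mask) >> 2); // (4 * lane) >> 2 == lane
}

int main (void)
{
	uint8_t haystack[16] = { 9, 9, 7, 9, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 };
	uint8_t needle[16];
	memset(needle, 7, sizeof(needle)); // build_search_vector splats the suffix like this
	printf("first match at lane %u\n", model_find_first_match(needle, haystack)); // prints 2
	return 0;
}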
src/native/containers/dn-simdhash-specialization.h (26 changes: 15 additions & 11 deletions)

@@ -182,15 +182,17 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
 #define bucket_suffixes (bucket->suffixes)
 #elif !defined(DN_SIMDHASH_USE_SCALAR_FALLBACK)
 // Perform an eager load of the vector if SIMD is in use, even though we do
-// byte loads to extract lanes on non-wasm platforms. It's faster on x64 for
+// byte loads to extract lanes on some platforms. It's faster on x64 for
 // a reason I can't identify, and it significantly improves wasm codegen
 	dn_simdhash_suffixes bucket_suffixes = bucket->suffixes;
 #else
 // Load through the pointer instead. An eager load just copies to the stack for
 // no good reason.
 #define bucket_suffixes (bucket->suffixes)
 #endif

 	uint8_t count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_COUNT_SLOT),
+		// Loading this late at the point of the if with DN_UNLIKELY doesn't seem to improve codegen or perf
 		overflow_count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_CASCADED_SLOT);
 	// We could early-out here when count==0, but it doesn't appear to meaningfully improve
 	// search performance to do so, and might actually worsen it
@@ -200,18 +202,20 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
 	uint32_t index = find_first_matching_suffix_simd(search_vector, bucket_suffixes);
 #endif
 #undef bucket_suffixes
-	for (; index < count; index++) {
-		// FIXME: Could be profitable to manually hoist the data load outside of the loop,
-		// if not out of SCAN_BUCKET_INTERNAL entirely. Clang appears to do LICM on it.
-		// It's better to index bucket->keys each iteration inside the loop than to precompute
-		// a pointer outside and bump the pointer, because in many cases the bucket will be
-		// empty, and in many other cases it will have one match. Putting the index inside the
-		// loop means that for empty/no-match buckets we don't do the index calculation at all.
-		if (DN_SIMDHASH_KEY_EQUALS(DN_SIMDHASH_GET_DATA(hash), needle, bucket->keys[index]))
-			return index;
-	}
+	if (DN_LIKELY(index < count)) {
+		DN_SIMDHASH_KEY_T *key = &bucket->keys[index];
+		do {
+			// FIXME: Could be profitable to manually hoist the data load outside of the loop,
+			// if not out of SCAN_BUCKET_INTERNAL entirely. Clang appears to do LICM on it.
+			if (DN_SIMDHASH_KEY_EQUALS(DN_SIMDHASH_GET_DATA(hash), needle, *key))
+				return index;
+			key++;
+			index++;
+		} while (index < count);
+	}

-	if (overflow_count)
+	if (DN_UNLIKELY(overflow_count))
 		return DN_SIMDHASH_SCAN_BUCKET_OVERFLOWED;
 	else
 		return DN_SIMDHASH_SCAN_BUCKET_NO_OVERFLOW;
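The scan-loop change above swaps the index-per-iteration for loop for a DN_LIKELY-guarded do/while that computes the key pointer once and bumps it, so empty and no-match buckets skip the loop body entirely while match-heavy buckets avoid re-indexing. A simplified model of the two shapes, with plain ints standing in for the macro-generated key type and __builtin_expect standing in for DN_LIKELY (both substitutions are mine, not the PR's):

#include <stdio.h>

#define MODEL_LIKELY(x) __builtin_expect(!!(x), 1)

// Old shape: keys[index] is re-indexed on every iteration.
static int
scan_for_loop (const int *keys, int index, int count, int needle)
{
	for (; index < count; index++)
		if (keys[index] == needle)
			return index;
	return -1;
}

// New shape: one likely-taken bounds check, then a pointer computed once
// and bumped per iteration, matching the do/while in the diff.
static int
scan_do_while (const int *keys, int index, int count, int needle)
{
	if (MODEL_LIKELY(index < count)) {
		const int *key = &keys[index];
		do {
			if (*key == needle)
				return index;
			key++;
			index++;
		} while (index < count);
	}
	return -1;
}

int main (void)
{
	int keys[] = { 11, 22, 33, 44 };
	// Both shapes agree; prints "2 2"
	printf("%d %d\n", scan_for_loop(keys, 0, 4, 33), scan_do_while(keys, 0, 4, 33));
	return 0;
}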