src/native/containers/dn-simdhash-arch.h (73 changes: 37 additions & 36 deletions)

@@ -9,31 +9,29 @@
 // HACK: for better language server parsing
 #include "dn-simdhash.h"

-// FIXME: Temporary workaround until the NEON path is replaced with an optimized one.
-// See https://github.com/dotnet/runtime/issues/113074
-#ifdef __ARM_ARCH
-#define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
+#if defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#define DN_SIMDHASH_USE_SSE2 1
 #endif

 #if defined(__clang__) || defined (__GNUC__) // use vector intrinsics

 #if defined(__wasm_simd128__)
 #include <wasm_simd128.h>
-#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#elif DN_SIMDHASH_USE_SSE2
 #include <emmintrin.h>
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_ARCH_ISA_A64)
 #include <arm_neon.h>
 #elif defined(__wasm)
 #define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
 #ifdef DN_SIMDHASH_WARNINGS
 #pragma message("WARNING: Building dn_simdhash for WASM without -msimd128! Performance will be terrible!")
 #endif
-#else
+#else // target identification
 #define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
 #ifdef DN_SIMDHASH_WARNINGS
 #pragma message("WARNING: Unsupported architecture for dn_simdhash! Performance will be terrible!")
 #endif
-#endif
+#endif // target identification

 // extract/replace lane opcodes require constant indices on some target architectures,
 // and in some cases it is profitable to do a single-byte memory load/store instead of
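Net effect of this hunk: SSE2 detection is computed once into DN_SIMDHASH_USE_SSE2, and the ARM vector path is narrowed from any __ARM_NEON target to AArch64 only (__ARM_ARCH_ISA_A64); every other target keeps the scalar fallback. A hypothetical standalone probe (not part of the repo; DEMO_PATH is a made-up name) that mirrors the new detection order, useful for checking which path a given toolchain would select:

#include <stdio.h>

#if defined(__wasm_simd128__)
#define DEMO_PATH "wasm simd128"
#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
#define DEMO_PATH "SSE2"
#elif defined(__ARM_ARCH_ISA_A64)
#define DEMO_PATH "NEON (AArch64 only)"
#else
#define DEMO_PATH "scalar fallback"
#endif

int main (void)
{
	puts("dn_simdhash would select: " DEMO_PATH);
	return 0;
}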
@@ -42,7 +40,7 @@
 typedef uint8_t dn_u8x16 __attribute__ ((vector_size (DN_SIMDHASH_VECTOR_WIDTH), aligned(DN_SIMDHASH_VECTOR_WIDTH)));
 typedef union {
 	_Alignas(DN_SIMDHASH_VECTOR_WIDTH) dn_u8x16 vec;
-#if defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#if DN_SIMDHASH_USE_SSE2
 	_Alignas(DN_SIMDHASH_VECTOR_WIDTH) __m128i m128;
 #endif
 	_Alignas(DN_SIMDHASH_VECTOR_WIDTH) uint8_t values[DN_SIMDHASH_VECTOR_WIDTH];
@@ -54,18 +52,24 @@ typedef uint8_t dn_simdhash_search_vector;
 typedef dn_simdhash_suffixes dn_simdhash_search_vector;
 #endif

-// Extracting lanes from a vector register on x86/x64 has horrible latency,
-// so it's better to do regular byte loads from the stack
 #if defined(__wasm_simd128__)
 // For wasm with -msimd128, clang generates truly bizarre load/store code
 // where it does two byte memory loads, then a vector load, then two
 // lane insertions to write the byte loads into the loaded vector
 // before finally passing it to find_first_matching_suffix. So we have to vec[].
 // See https://github.com/llvm/llvm-project/issues/87398#issuecomment-2050696298
 // Also see https://github.com/llvm/llvm-project/issues/88460
 #define dn_simdhash_extract_lane(suffixes, lane) \
 	suffixes.vec[lane]
+#elif defined(__ARM_ARCH_ISA_A64)
+// On Ampere ARM64, lane extracts are a single cheap opcode, and by using lane
+// extracts only, the eager load of the suffixes does a single vector load instead
+// of two 64-bit low/high loads
+#define dn_simdhash_extract_lane(suffixes, lane) \
+	suffixes.vec[lane]
 #else
+// Extracting lanes from a vector register on x86/x64 has horrible latency,
+// so it's better to do regular byte loads from the stack
 #define dn_simdhash_extract_lane(suffixes, lane) \
 	suffixes.values[lane]
 #endif
@@ -79,6 +83,15 @@ ctz (uint32_t value)
 	return (uint32_t)__builtin_ctz(value);
 }

+static DN_FORCEINLINE(uint64_t)
+ctzll (uint64_t value)
+{
+	// __builtin_ctzll is undefined for 0
+	if (value == 0)
+		return 64;
+	return (uint64_t)__builtin_ctzll(value);
+}
+
 static DN_FORCEINLINE(dn_simdhash_search_vector)
 build_search_vector (uint8_t needle)
 {
@@ -103,37 +116,25 @@ find_first_matching_suffix_simd (
 	// Only used by the vectorized implementations; discarded by scalar.
 	dn_simdhash_suffixes haystack
 ) {
-#if defined(__wasm_simd128__)
+#ifdef DN_SIMDHASH_USE_SCALAR_FALLBACK
+	dn_simdhash_assert(!"Scalar fallback should be in use here");
+	return 32;
+#elif defined(__wasm_simd128__)
 	return ctz(wasm_i8x16_bitmask(wasm_i8x16_eq(needle.vec, haystack.vec)));
-#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#elif DN_SIMDHASH_USE_SSE2
 	return ctz(_mm_movemask_epi8(_mm_cmpeq_epi8(needle.m128, haystack.m128)));
-#elif defined(__ARM_NEON)
-	dn_simdhash_assert(!"Scalar fallback should be in use here");
-	return 32;
-	/*
-	dn_simdhash_suffixes match_vector;
-	// Completely untested.
-	static const dn_simdhash_suffixes byte_mask = {
-		{ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }
-	};
-	union {
-		uint8_t b[4];
-		uint32_t u;
-	} msb;
-	match_vector.vec = vceqq_u8(needle.vec, haystack.vec);
-	dn_simdhash_suffixes masked;
-	masked.vec = vandq_u8(match_vector.vec, byte_mask.vec);
-	msb.b[0] = vaddv_u8(vget_low_u8(masked.vec));
-	msb.b[1] = vaddv_u8(vget_high_u8(masked.vec));
-	return ctz(msb.u);
-	*/
+#elif defined(__ARM_ARCH_ISA_A64)
+	// See https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
+	uint16x8_t match_vector16 = vreinterpretq_u16_u8(vceqq_u8(needle.vec, haystack.vec));
+	uint8x8_t match_bits = vshrn_n_u16(match_vector16, 4);
+	uint64_t match_bits_scalar = vget_lane_u64(vreinterpret_u64_u8(match_bits), 0);
+	return ctzll(match_bits_scalar) >> 2;
 #else
-	dn_simdhash_assert(!"Scalar fallback should be in use here");
-	return 32;
+#error "Missing platform implementation of find_first_matching_suffix_simd"
 #endif
 }

-#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__)
+#elif DN_SIMDHASH_USE_SSE2
 // neither clang or gcc, but we have SSE2 available, so assume this is MSVC on x86 or x86-64
 // msvc neon intrinsics don't seem to expose a 128-bit wide vector so there's no neon in here
 #include <intrin.h> // for _BitScanForward
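Why the new AArch64 body works: vceqq_u8 sets each matching byte lane to 0xFF, and vshrn_n_u16(..., 4) shifts every 16-bit lane right by four and narrows it to a byte, so the resulting 64-bit scalar carries four bits per original byte lane. ctzll of that mask is therefore 4 * lane for the first match, and the trailing >> 2 converts the bit position back into a lane index. The zero guard added to ctzll above matters here: a bucket with no matching suffix yields an all-zero mask, __builtin_ctzll(0) is undefined, and returning 64 makes the search report lane 16, one past the end. The following scalar model is a sketch of that arithmetic only, with made-up names (model_vshrn_mask and friends are not from the PR), assuming a GCC/Clang-style __builtin_ctzll:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Models vceqq_u8 + vshrn_n_u16(..., 4): byte lane i of the comparison
// owns bits [4*i, 4*i + 4) of the 64-bit mask.
static uint64_t
model_vshrn_mask (const uint8_t needle[16], const uint8_t haystack[16])
{
	uint64_t mask = 0;
	for (int lane = 0; lane < 16; lane++)
		if (needle[lane] == haystack[lane])
			mask |= 0xFull << (4 * lane);
	return mask;
}

static uint32_t
model_find_first_match (const uint8_t needle[16], const uint8_t haystack[16])
{
	uint64_t mask = model_vshrn_mask(needle, haystack);
	if (mask == 0)
		return 16; // mirrors ctzll(0) == 64 followed by 64 >> 2
	return (uint32_t)(__builtin_ctzll(mask) >> 2); // (4 * lane) >> 2 == lane
}

int main (void)
{
	uint8_t haystack[16] = { 9, 9, 7, 9, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 };
	uint8_t needle[16];
	memset(needle, 7, sizeof(needle)); // build_search_vector splats the suffix like this
	printf("first match at lane %u\n", model_find_first_match(needle, haystack)); // prints 2
	return 0;
}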
src/native/containers/dn-simdhash-specialization.h (26 changes: 15 additions & 11 deletions)

@@ -182,15 +182,17 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
 #define bucket_suffixes (bucket->suffixes)
 #elif !defined(DN_SIMDHASH_USE_SCALAR_FALLBACK)
 // Perform an eager load of the vector if SIMD is in use, even though we do
-// byte loads to extract lanes on non-wasm platforms. It's faster on x64 for
+// byte loads to extract lanes on some platforms. It's faster on x64 for
 // a reason I can't identify, and it significantly improves wasm codegen
 	dn_simdhash_suffixes bucket_suffixes = bucket->suffixes;
 #else
 // Load through the pointer instead. An eager load just copies to the stack for
 // no good reason.
 #define bucket_suffixes (bucket->suffixes)
 #endif

 	uint8_t count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_COUNT_SLOT),
+		// Loading this late at the point of the if with DN_UNLIKELY doesn't seem to improve codegen or perf
 		overflow_count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_CASCADED_SLOT);
 	// We could early-out here when count==0, but it doesn't appear to meaningfully improve
 	// search performance to do so, and might actually worsen it
@@ -200,18 +202,20 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
 	uint32_t index = find_first_matching_suffix_simd(search_vector, bucket_suffixes);
 #endif
 #undef bucket_suffixes
-	for (; index < count; index++) {
-		// FIXME: Could be profitable to manually hoist the data load outside of the loop,
-		// if not out of SCAN_BUCKET_INTERNAL entirely. Clang appears to do LICM on it.
-		// It's better to index bucket->keys each iteration inside the loop than to precompute
-		// a pointer outside and bump the pointer, because in many cases the bucket will be
-		// empty, and in many other cases it will have one match. Putting the index inside the
-		// loop means that for empty/no-match buckets we don't do the index calculation at all.
-		if (DN_SIMDHASH_KEY_EQUALS(DN_SIMDHASH_GET_DATA(hash), needle, bucket->keys[index]))
-			return index;
-	}
+	if (DN_LIKELY(index < count)) {
+		DN_SIMDHASH_KEY_T *key = &bucket->keys[index];
+		do {
+			// FIXME: Could be profitable to manually hoist the data load outside of the loop,
+			// if not out of SCAN_BUCKET_INTERNAL entirely. Clang appears to do LICM on it.
+			if (DN_SIMDHASH_KEY_EQUALS(DN_SIMDHASH_GET_DATA(hash), needle, *key))
+				return index;
+			key++;
+			index++;
+		} while (index < count);
+	}

-	if (overflow_count)
+	if (DN_UNLIKELY(overflow_count))
 		return DN_SIMDHASH_SCAN_BUCKET_OVERFLOWED;
 	else
 		return DN_SIMDHASH_SCAN_BUCKET_NO_OVERFLOW;
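The scan-loop change above swaps the index-per-iteration for loop for a DN_LIKELY-guarded do/while that computes the key pointer once and bumps it, so empty and no-match buckets skip the loop body entirely while match-heavy buckets avoid re-indexing. A simplified model of the two shapes, with plain ints standing in for the macro-generated key type and __builtin_expect standing in for DN_LIKELY (both substitutions are mine, not the PR's):

#include <stdio.h>

#define MODEL_LIKELY(x) __builtin_expect(!!(x), 1)

// Old shape: keys[index] is re-indexed on every iteration.
static int
scan_for_loop (const int *keys, int index, int count, int needle)
{
	for (; index < count; index++)
		if (keys[index] == needle)
			return index;
	return -1;
}

// New shape: one likely-taken bounds check, then a pointer computed once
// and bumped per iteration, matching the do/while in the diff.
static int
scan_do_while (const int *keys, int index, int count, int needle)
{
	if (MODEL_LIKELY(index < count)) {
		const int *key = &keys[index];
		do {
			if (*key == needle)
				return index;
			key++;
			index++;
		} while (index < count);
	}
	return -1;
}

int main (void)
{
	int keys[] = { 11, 22, 33, 44 };
	// Both shapes agree; prints "2 2"
	printf("%d %d\n", scan_for_loop(keys, 0, 4, 33), scan_do_while(keys, 0, 4, 33));
	return 0;
}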