// NOTE(review): SOURCE arrived as a whitespace-collapsed git unified diff in which
// every generic type argument was stripped by extraction (e.g. "ReadOnlySpan" for
// "ReadOnlySpan<byte>", "Unsafe.As>" for "Unsafe.As<byte, Vector128<byte>>").
// Below is the reconstructed POST-PATCH content of the three touched files, merged
// into one compilation unit (block-scoped namespaces) with all stripped type
// arguments restored. Logic, constants, and runtime strings are unchanged from the
// patch. Members only partially visible as diff context (e.g. ToPooledListRef)
// are intentionally omitted — they exist unchanged in the real files.

// SPDX-FileCopyrightText: 2024 Demerzel Solutions Limited
// SPDX-License-Identifier: LGPL-3.0-only

using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using BenchmarkDotNet.Attributes;
using FluentAssertions;
using Nethermind.Core.Extensions;
using NUnit.Framework;
using x64 = System.Runtime.Intrinsics.X86;
using Arm = System.Runtime.Intrinsics.Arm;

namespace Nethermind.Benchmarks.Core
{
    /// <summary>
    /// Benchmarks the public FastHash entry point against the two internal
    /// implementations (AES-based and CRC-based) across common payload sizes.
    /// </summary>
    [ShortRunJob]
    [DisassemblyDiagnoser]
    [MemoryDiagnoser]
    public class FastHashBenchmarks
    {
        private byte[] _data = null!;

        [Params(16, 20, 32, 64, 128, 256, 512, 1024)]
        public int Size;

        [GlobalSetup]
        public void Setup()
        {
            _data = new byte[Size];
            Random.Shared.NextBytes(_data);
        }

        [Benchmark(Baseline = true)]
        public int FastHash()
        {
            return ((ReadOnlySpan<byte>)_data).FastHash();
        }

        [Benchmark]
        public int FastHashAes()
        {
            // Call the x64 AES path directly, bypassing the hardware dispatch in FastHash.
            ref byte start = ref MemoryMarshal.GetReference<byte>(_data);
            return SpanExtensions.FastHashAesX64(ref start, _data.Length, SpanExtensions.ComputeSeed(_data.Length));
        }

        [Benchmark]
        public int FastHashCrc()
        {
            // Call the CRC fallback directly, bypassing the hardware dispatch in FastHash.
            ref byte start = ref MemoryMarshal.GetReference<byte>(_data);
            return SpanExtensions.FastHashCrc(ref start, _data.Length, SpanExtensions.ComputeSeed(_data.Length));
        }
    }
}

namespace Nethermind.Core.Test
{
    // NOTE(review): only the test methods added by the patch are reconstructed here;
    // the real BytesTests class contains many more members.
    public class BytesTests
    {
        [Test]
        public void FastHash_EmptyInput_ReturnsZero()
        {
            // Contract choice: empty input hashes to 0.
            ReadOnlySpan<byte> empty = ReadOnlySpan<byte>.Empty;
            empty.FastHash().Should().Be(0);
        }

        [Test]
        public void FastHash_SameInput_ReturnsSameHash()
        {
            byte[] input = new byte[100];
            TestContext.CurrentContext.Random.NextBytes(input);

            int hash1 = ((ReadOnlySpan<byte>)input).FastHash();
            int hash2 = ((ReadOnlySpan<byte>)input).FastHash();

            hash1.Should().Be(hash2);
        }

        [Test]
        public void FastHash_DifferentInput_ReturnsDifferentHash()
        {
            byte[] input1 = new byte[100];
            byte[] input2 = new byte[100];
            TestContext.CurrentContext.Random.NextBytes(input1);
            Array.Copy(input1, input2, input1.Length);
            input2[50] ^= 0xFF; // Flip bits at position 50

            int hash1 = ((ReadOnlySpan<byte>)input1).FastHash();
            int hash2 = ((ReadOnlySpan<byte>)input2).FastHash();

            hash1.Should().NotBe(hash2);
        }

        // Test cases for the fold-back bug fix: remaining in [49-63] after 64-byte initial load
        // For len=113 to 127, remaining = len-64 = 49 to 63, which requires the last64 fold-back
        [TestCase(113)] // remaining=49, boundary case for last64
        [TestCase(120)] // remaining=56, middle of the gap range
        [TestCase(127)] // remaining=63, upper boundary
        [TestCase(65)]  // remaining=1, lower boundary for >64 path
        [TestCase(80)]  // remaining=16
        [TestCase(96)]  // remaining=32
        [TestCase(112)] // remaining=48, boundary where last64 is NOT needed
        public void FastHash_AllBytesAreHashed_FoldBackCoverage(int length)
        {
            byte[] input = new byte[length];
            TestContext.CurrentContext.Random.NextBytes(input);

            int originalHash = ((ReadOnlySpan<byte>)input).FastHash();

            // Verify that changing any byte changes the hash
            // This catches the gap bug where bytes[64-71] weren't being hashed
            for (int i = 0; i < length; i++)
            {
                byte[] modified = (byte[])input.Clone();
                modified[i] ^= 0xFF;

                int modifiedHash = ((ReadOnlySpan<byte>)modified).FastHash();
                modifiedHash.Should().NotBe(originalHash, $"Changing byte at index {i} should change the hash for length {length}");
            }
        }

        // Specifically test the gap range that was buggy: bytes[64-71] for len=120
        [Test]
        public void FastHash_GapBytesAreHashed_Len120()
        {
            byte[] input = new byte[120];
            TestContext.CurrentContext.Random.NextBytes(input);

            int originalHash = ((ReadOnlySpan<byte>)input).FastHash();

            // The bug was that bytes[64-71] weren't hashed for len=120
            // Test each byte in the gap
            for (int i = 64; i < 72; i++)
            {
                byte[] modified = (byte[])input.Clone();
                modified[i] ^= 0xFF;

                int modifiedHash = ((ReadOnlySpan<byte>)modified).FastHash();
                modifiedHash.Should().NotBe(originalHash, $"Changing byte at index {i} (in gap range) should change the hash");
            }
        }

        // Test medium-large case (33-64 bytes) with overlap to verify it works
        [TestCase(50)] // Tests overlap in medium-large path
        public void FastHash_MediumLarge_AllBytesContribute(int length)
        {
            byte[] input = new byte[length];
            TestContext.CurrentContext.Random.NextBytes(input);

            int originalHash = ((ReadOnlySpan<byte>)input).FastHash();

            // Test ALL bytes to verify overlap handling works
            for (int i = 0; i < length; i++)
            {
                byte[] modified = (byte[])input.Clone();
                modified[i] ^= 0xFF;

                int modifiedHash = ((ReadOnlySpan<byte>)modified).FastHash();
                modifiedHash.Should().NotBe(originalHash, $"Changing byte at index {i} should change the hash for length {length}");
            }
        }

        [TestCase(1)]
        [TestCase(7)]
        [TestCase(8)]
        [TestCase(15)]
        [TestCase(16)]
        [TestCase(31)]
        [TestCase(32)]
        [TestCase(33)]
        [TestCase(64)]
        [TestCase(128)]
        [TestCase(256)]
        [TestCase(500)]
        public void FastHash_VariousLengths_AllBytesContribute(int length)
        {
            byte[] input = new byte[length];
            TestContext.CurrentContext.Random.NextBytes(input);

            int originalHash = ((ReadOnlySpan<byte>)input).FastHash();

            // Test first, middle, and last bytes to ensure all contribute
            int[] indicesToTest = [0, length / 2, length - 1];
            foreach (int i in indicesToTest)
            {
                byte[] modified = (byte[])input.Clone();
                modified[i] ^= 0xFF;

                int modifiedHash = ((ReadOnlySpan<byte>)modified).FastHash();
                modifiedHash.Should().NotBe(originalHash, $"Changing byte at index {i} should change the hash for length {length}");
            }
        }
    }
}

namespace Nethermind.Core.Extensions
{
    // NOTE(review): only the members touched by the patch are reconstructed here;
    // the real SpanExtensions class contains many more members (ToHexString overloads, etc.).
    public static class SpanExtensions
    {
        // Per-process random seed so attackers cannot trivially engineer same-bucket
        // keys; randomized hashing protects the performance of the network as a whole.
        private static readonly uint s_instanceRandom = (uint)System.Security.Cryptography.RandomNumberGenerator.GetInt32(int.MinValue, int.MaxValue);

        // Seed derivation shared by FastHash and by benchmarks calling the internal
        // implementations directly; mixing in length decorrelates same-prefix inputs.
        internal static uint ComputeSeed(int len) => s_instanceRandom + (uint)len;

        public static string ToHexString(this in Memory<byte> memory, bool withZeroX = false)
        {
            return ToHexString(memory.Span, withZeroX, false, false);
        }

        /// <summary>
        /// Fast, hardware-accelerated, non-cryptographic hash of a byte span.
        /// Dispatches to an AES-intrinsic implementation (x64 or ARM) for inputs of
        /// 16+ bytes when supported, otherwise falls back to a CRC32C-based hash.
        /// Empty input hashes to 0. Seeded per-process, so values are NOT stable
        /// across runs — suitable for in-memory hash tables only.
        /// </summary>
        [SkipLocalsInit]
        public static int FastHash(this ReadOnlySpan<byte> input)
        {
            int len = input.Length;
            if (len == 0) return 0;

            ref byte start = ref MemoryMarshal.GetReference(input);
            uint seed = s_instanceRandom + (uint)len;

            // AES paths read whole 16-byte vectors, so they require len >= 16.
            if (len >= 16)
            {
                if (x64.Aes.IsSupported) return FastHashAesX64(ref start, len, seed);
                if (Arm.Aes.IsSupported) return FastHashAesArm(ref start, len, seed);
            }

            return FastHashCrc(ref start, len, seed);
        }

        /// <summary>
        /// AES-NI implementation for x64; requires len &gt;= 16 (reads 16-byte vectors,
        /// using overlapping loads from the end for non-multiple-of-16 lengths).
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        [SkipLocalsInit]
        internal static int FastHashAesX64(ref byte start, int len, uint seed)
        {
            Vector128<byte> seedVec = Vector128.CreateScalar(seed).AsByte();
            Vector128<byte> acc0 = Unsafe.As<byte, Vector128<byte>>(ref start) ^ seedVec;

            if (len > 64)
            {
                // Four independent lanes hide AESENC latency and increase ILP.
                Vector128<byte> acc1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, 16)) ^ seedVec;
                Vector128<byte> acc2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, 32)) ^ seedVec;
                Vector128<byte> acc3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, 48)) ^ seedVec;

                ref byte p = ref Unsafe.Add(ref start, 64);
                int remaining = len - 64;

                while (remaining >= 64)
                {
                    acc0 = x64.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref p), acc0);
                    acc1 = x64.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref p, 16)), acc1);
                    acc2 = x64.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref p, 32)), acc2);
                    acc3 = x64.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref p, 48)), acc3);

                    p = ref Unsafe.Add(ref p, 64);
                    remaining -= 64;
                }

                // Fold 4 lanes: 3 XOR + 1 AES (minimal serial latency)
                acc0 ^= acc1;
                acc2 ^= acc3;
                acc0 ^= acc2;
                acc0 = x64.Aes.Encrypt(seedVec, acc0);

                // Drain remaining 0-63 bytes
                while (remaining >= 16)
                {
                    acc0 = x64.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref p), acc0);
                    p = ref Unsafe.Add(ref p, 16);
                    remaining -= 16;
                }

                // Remaining 1-15 bytes: use CRC to avoid overlap with drain blocks
                if (remaining > 0)
                {
                    uint crc = seed;
                    if (remaining >= 8)
                    {
                        crc = BitOperations.Crc32C(crc, Unsafe.ReadUnaligned<ulong>(ref p));
                        p = ref Unsafe.Add(ref p, 8);
                        remaining -= 8;
                    }
                    if ((remaining & 4) != 0)
                    {
                        crc = BitOperations.Crc32C(crc, Unsafe.ReadUnaligned<uint>(ref p));
                        p = ref Unsafe.Add(ref p, 4);
                    }
                    if ((remaining & 2) != 0)
                    {
                        crc = BitOperations.Crc32C(crc, Unsafe.ReadUnaligned<ushort>(ref p));
                        p = ref Unsafe.Add(ref p, 2);
                    }
                    if ((remaining & 1) != 0)
                    {
                        crc = BitOperations.Crc32C(crc, p);
                    }
                    acc0 = x64.Aes.Encrypt(Vector128.CreateScalar(crc).AsByte(), acc0);
                }
            }
            else if (len > 32)
            {
                // 33-64 bytes: single lane; the final overlapping load from len-16
                // guarantees every byte is covered without over-reading.
                ref byte p = ref Unsafe.Add(ref start, 16);
                int remaining = len - 16;

                while (remaining > 16)
                {
                    acc0 = x64.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref p), acc0);
                    p = ref Unsafe.Add(ref p, 16);
                    remaining -= 16;
                }

                Vector128<byte> last = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, len - 16));
                acc0 = x64.Aes.Encrypt(last, acc0);
            }
            else
            {
                // 16-32 bytes: one overlapping load from the end covers the tail.
                Vector128<byte> data = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, len - 16));
                acc0 = x64.Aes.Encrypt(data, acc0);
            }

            // Compress 128 -> 64 -> 32 bits.
            ulong compressed = acc0.AsUInt64().GetElement(0) ^ acc0.AsUInt64().GetElement(1);
            return (int)(uint)(compressed ^ (compressed >> 32));
        }

        /// <summary>
        /// ARMv8 AES implementation; mirrors <see cref="FastHashAesX64"/>. ARM's AESE
        /// does not include MixColumns, so it is applied explicitly for diffusion.
        /// Requires len &gt;= 16.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        [SkipLocalsInit]
        internal static int FastHashAesArm(ref byte start, int len, uint seed)
        {
            Vector128<byte> seedVec = Vector128.CreateScalar(seed).AsByte();
            Vector128<byte> acc0 = Unsafe.As<byte, Vector128<byte>>(ref start) ^ seedVec;

            if (len > 64)
            {
                Vector128<byte> acc1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, 16)) ^ seedVec;
                Vector128<byte> acc2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, 32)) ^ seedVec;
                Vector128<byte> acc3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, 48)) ^ seedVec;

                ref byte p = ref Unsafe.Add(ref start, 64);
                int remaining = len - 64;

                while (remaining >= 64)
                {
                    acc0 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref p), acc0));
                    acc1 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref p, 16)), acc1));
                    acc2 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref p, 32)), acc2));
                    acc3 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref p, 48)), acc3));

                    p = ref Unsafe.Add(ref p, 64);
                    remaining -= 64;
                }

                acc0 ^= acc1;
                acc2 ^= acc3;
                acc0 ^= acc2;
                acc0 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(seedVec, acc0));

                while (remaining >= 16)
                {
                    acc0 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref p), acc0));
                    p = ref Unsafe.Add(ref p, 16);
                    remaining -= 16;
                }

                if (remaining > 0)
                {
                    uint crc = seed;
                    if (remaining >= 8)
                    {
                        crc = BitOperations.Crc32C(crc, Unsafe.ReadUnaligned<ulong>(ref p));
                        p = ref Unsafe.Add(ref p, 8);
                        remaining -= 8;
                    }
                    if ((remaining & 4) != 0)
                    {
                        crc = BitOperations.Crc32C(crc, Unsafe.ReadUnaligned<uint>(ref p));
                        p = ref Unsafe.Add(ref p, 4);
                    }
                    if ((remaining & 2) != 0)
                    {
                        crc = BitOperations.Crc32C(crc, Unsafe.ReadUnaligned<ushort>(ref p));
                        p = ref Unsafe.Add(ref p, 2);
                    }
                    if ((remaining & 1) != 0)
                    {
                        crc = BitOperations.Crc32C(crc, p);
                    }
                    acc0 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(Vector128.CreateScalar(crc).AsByte(), acc0));
                }
            }
            else if (len > 32)
            {
                ref byte p = ref Unsafe.Add(ref start, 16);
                int remaining = len - 16;

                while (remaining > 16)
                {
                    acc0 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(Unsafe.As<byte, Vector128<byte>>(ref p), acc0));
                    p = ref Unsafe.Add(ref p, 16);
                    remaining -= 16;
                }

                Vector128<byte> last = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, len - 16));
                acc0 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(last, acc0));
            }
            else
            {
                Vector128<byte> data = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref start, len - 16));
                acc0 = Arm.Aes.MixColumns(Arm.Aes.Encrypt(data, acc0));
            }

            ulong compressed = acc0.AsUInt64().GetElement(0) ^ acc0.AsUInt64().GetElement(1);
            return (int)(uint)(compressed ^ (compressed >> 32));
        }

        /// <summary>
        /// CRC32C-based fallback for any length &gt;= 1 (and the only path for len &lt; 16).
        /// Four CRC lanes hide instruction latency for larger inputs; a final
        /// xor-mul-xor mix breaks CRC's GF(2) linearity for better avalanche.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        [SkipLocalsInit]
        internal static int FastHashCrc(ref byte start, int len, uint seed)
        {
            uint hash;
            if (len < 16)
            {
                if (len >= 8)
                {
                    // Two (possibly overlapping) 8-byte loads cover 8-15 bytes exactly.
                    ulong lo = Unsafe.ReadUnaligned<ulong>(ref start);
                    ulong hi = Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref start, len - 8));
                    uint h0 = BitOperations.Crc32C(seed, lo);
                    uint h1 = BitOperations.Crc32C(seed ^ 0x9E3779B9u, hi);
                    hash = h0 + BitOperations.RotateLeft(h1, 11);
                }
                else
                {
                    // 1-7 bytes: strict in-order tail, no over-read.
                    hash = CrcTailOrdered(seed, ref start, len);
                }
            }
            else
            {
                // Lane seeds use golden-ratio/finalizer constants for separation.
                uint h0 = seed;
                uint h1 = seed ^ 0x9E3779B9u;
                uint h2 = seed ^ 0x85EBCA6Bu;
                uint h3 = seed ^ 0xC2B2AE35u;

                ref byte q = ref start;
                int aligned = len & ~7; // round down to multiple of 8
                int remaining = aligned;

                while (remaining >= 64)
                {
                    h0 = BitOperations.Crc32C(h0, Unsafe.ReadUnaligned<ulong>(ref q));
                    h1 = BitOperations.Crc32C(h1, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 8)));
                    h2 = BitOperations.Crc32C(h2, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 16)));
                    h3 = BitOperations.Crc32C(h3, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 24)));

                    h0 = BitOperations.Crc32C(h0, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 32)));
                    h1 = BitOperations.Crc32C(h1, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 40)));
                    h2 = BitOperations.Crc32C(h2, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 48)));
                    h3 = BitOperations.Crc32C(h3, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 56)));

                    q = ref Unsafe.Add(ref q, 64);
                    remaining -= 64;
                }

                if (remaining >= 32)
                {
                    h0 = BitOperations.Crc32C(h0, Unsafe.ReadUnaligned<ulong>(ref q));
                    h1 = BitOperations.Crc32C(h1, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 8)));
                    h2 = BitOperations.Crc32C(h2, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 16)));
                    h3 = BitOperations.Crc32C(h3, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 24)));

                    q = ref Unsafe.Add(ref q, 32);
                    remaining -= 32;
                }

                // remaining is a multiple of 8 here: 0, 8, 16, or 24.
                if (remaining >= 8) h0 = BitOperations.Crc32C(h0, Unsafe.ReadUnaligned<ulong>(ref q));
                if (remaining >= 16) h1 = BitOperations.Crc32C(h1, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 8)));
                if (remaining >= 24) h2 = BitOperations.Crc32C(h2, Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref q, 16)));

                // Fold lanes; rotates + adds (carries) break CRC linearity.
                h2 = BitOperations.RotateLeft(h2, 17) + BitOperations.RotateLeft(h3, 23);
                h0 += BitOperations.RotateLeft(h1, 11);
                hash = h2 + h0;

                int tailBytes = len - aligned;
                if (tailBytes != 0)
                {
                    ref byte tailRef = ref Unsafe.Add(ref start, aligned);
                    hash = CrcTailOrdered(hash, ref tailRef, tailBytes);
                }
            }

            // Final mix: xor-fold, odd-constant multiply, xor-fold for avalanche.
            hash ^= hash >> 16;
            hash *= 0x9E3779B1u;
            hash ^= hash >> 16;
            return (int)hash;

            // length is 1..7: process 4-2-1 bytes in natural order, no over-read.
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            static uint CrcTailOrdered(uint hash, ref byte p, int length)
            {
                if ((length & 4) != 0)
                {
                    hash = BitOperations.Crc32C(hash, Unsafe.ReadUnaligned<uint>(ref p));
                    p = ref Unsafe.Add(ref p, 4);
                }
                if ((length & 2) != 0)
                {
                    hash = BitOperations.Crc32C(hash, Unsafe.ReadUnaligned<ushort>(ref p));
                    p = ref Unsafe.Add(ref p, 2);
                }
                if ((length & 1) != 0)
                {
                    hash = BitOperations.Crc32C(hash, p);
                }
                return hash;
            }
        }
    }
}