diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32.Vectorized.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32.Vectorized.cs
index 3a26cabadf34f..47cc9f1609f95 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32.Vectorized.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32.Vectorized.cs
@@ -5,6 +5,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.InteropServices;
+using static System.IO.Hashing.VectorHelper;
 
 namespace System.IO.Hashing
 {
@@ -17,7 +18,9 @@ public partial class Crc32
         private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
             BitConverter.IsLittleEndian
             && VectorHelper.IsSupported
-            && source.Length >= Vector128<byte>.Count * 4;
+            // Vectorization can process spans as short as a single vector (16 bytes), but if ARM intrinsics are supported they
+            // seem to be more performant for spans less than 8 vectors (128 bytes).
+            && source.Length >= Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1);
 
         // Processes the bytes in source in 64 byte chunks using carryless/polynomial multiplication intrinsics,
         // followed by processing 16 byte chunks, and then processing remaining bytes individually. Requires
@@ -35,102 +38,81 @@ private static uint UpdateVectorized(uint crc, ReadOnlySpan<byte> source)
             ref byte srcRef = ref MemoryMarshal.GetReference(source);
             int length = source.Length;
 
-            Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-            Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
-            Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
-            Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
-            Vector128<ulong> x5;
+            Vector128<ulong> kConstants;
+            Vector128<ulong> x1; // Accumulator for the new CRC
+            Vector128<ulong> x2;
 
-            x1 ^= Vector128.CreateScalar(crc).AsUInt64();
-            Vector128<ulong> x0 = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
-
-            srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
-            length -= Vector128<byte>.Count * 4;
-
-            // Parallel fold blocks of 64, if any.
-            while (length >= Vector128<byte>.Count * 4)
+            if (length >= Vector128<byte>.Count * 8)
             {
-                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-                Vector128<ulong> x6 = VectorHelper.CarrylessMultiplyLower(x2, x0);
-                Vector128<ulong> x7 = VectorHelper.CarrylessMultiplyLower(x3, x0);
-                Vector128<ulong> x8 = VectorHelper.CarrylessMultiplyLower(x4, x0);
-
-                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-                x2 = VectorHelper.CarrylessMultiplyUpper(x2, x0);
-                x3 = VectorHelper.CarrylessMultiplyUpper(x3, x0);
-                x4 = VectorHelper.CarrylessMultiplyUpper(x4, x0);
-
-                Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-                Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
-                Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
-                Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
-
-                x1 ^= x5;
-                x2 ^= x6;
-                x3 ^= x7;
-                x4 ^= x8;
-
-                x1 ^= y5;
-                x2 ^= y6;
-                x3 ^= y7;
-                x4 ^= y8;
+                x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
+                Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
+                Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
                 length -= Vector128<byte>.Count * 4;
-            }
-
-            // Fold into 128-bits.
-            x0 = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x2;
-            x1 ^= x5;
+                // Load and XOR the initial CRC value
+                x1 ^= Vector128.CreateScalar(crc).AsUInt64();
+
+                kConstants = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
+
+                // Parallel fold blocks of 64, if any.
+                do
+                {
+                    Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                    Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
+                    Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
+                    Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
+
+                    x1 = FoldPolynomialPair(y5, x1, kConstants);
+                    x2 = FoldPolynomialPair(y6, x2, kConstants);
+                    x3 = FoldPolynomialPair(y7, x3, kConstants);
+                    x4 = FoldPolynomialPair(y8, x4, kConstants);
+
+                    srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
+                    length -= Vector128<byte>.Count * 4;
+                } while (length >= Vector128<byte>.Count * 4);
+
+                // Fold into 128-bits.
+                kConstants = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
+                x1 = FoldPolynomialPair(x2, x1, kConstants);
+                x1 = FoldPolynomialPair(x3, x1, kConstants);
+                x1 = FoldPolynomialPair(x4, x1, kConstants);
+            }
+            else
+            {
+                // For shorter sources just load the first vector and XOR with the CRC
+                Debug.Assert(length >= 16);
 
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x3;
-            x1 ^= x5;
+                x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                x1 ^= Vector128.CreateScalar(crc).AsUInt64();
 
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x4;
-            x1 ^= x5;
+                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
+                length -= Vector128<byte>.Count;
+            }
 
             // Single fold blocks of 16, if any.
             while (length >= Vector128<byte>.Count)
             {
-                x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-
-                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-                x1 ^= x2;
-                x1 ^= x5;
+                x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1,
+                    Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL));
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
                 length -= Vector128<byte>.Count;
             }
 
             // Fold 128 bits to 64 bits.
-            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x1, x0);
-            x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
-            x1 = VectorHelper.ShiftRightBytesInVector(x1, 8);
-            x1 ^= x2;
-
-            x0 = Vector128.CreateScalar(0x0163cd6124UL); // k5, k0
-
-            x2 = VectorHelper.ShiftRightBytesInVector(x1, 4);
-            x1 &= x3;
-            x1 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 ^= x2;
+            Vector128<ulong> bitmask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
+            x1 = ShiftRightBytesInVector(x1, 8) ^
+                 CarrylessMultiplyLower(x1, Vector128.CreateScalar(0x00ccaa009eUL));
+            x1 = CarrylessMultiplyLower(x1 & bitmask, Vector128.CreateScalar(0x0163cd6124UL)) ^ // k5, k0
+                 ShiftRightBytesInVector(x1, 4);
 
             // Reduce to 32 bits.
-            x0 = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
-
-            x2 = x1 & x3;
-            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x2, x0);
-            x2 &= x3;
-            x2 = VectorHelper.CarrylessMultiplyLower(x2, x0);
+            kConstants = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
+            x2 = CarrylessMultiplyLeftLowerRightUpper(x1 & bitmask, kConstants) & bitmask;
+            x2 = CarrylessMultiplyLower(x2, kConstants);
             x1 ^= x2;
 
             // Process the remaining bytes, if any
diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs
index cb2339c128d70..095bbce58ae46 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs
@@ -5,6 +5,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.InteropServices;
+using static System.IO.Hashing.VectorHelper;
 
 namespace System.IO.Hashing
 {
@@ -72,7 +73,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
                 // Load and XOR the initial CRC value
                 // CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
                 // because data will be byte-reflected and will align with initial crc at correct place.
-                x0 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
+                x0 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));
 
                 kConstants = Vector128.Create(0x5cf79dea9ac37d6UL, 0x001067e571d7d5c2UL); // k3, k4
 
@@ -81,36 +82,36 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
                 {
                     Vector128<ulong> y1 = LoadFromSource(ref srcRef, 0);
                     Vector128<ulong> y2 = LoadFromSource(ref srcRef, 16);
 
-                    x0 = VectorHelper.FoldPolynomialPair(y1, x0, kConstants);
-                    x1 = VectorHelper.FoldPolynomialPair(y2, x1, kConstants);
+                    x0 = FoldPolynomialPair(y1, x0, kConstants);
+                    x1 = FoldPolynomialPair(y2, x1, kConstants);
 
                     y1 = LoadFromSource(ref srcRef, 32);
                     y2 = LoadFromSource(ref srcRef, 48);
-                    x2 = VectorHelper.FoldPolynomialPair(y1, x2, kConstants);
-                    x3 = VectorHelper.FoldPolynomialPair(y2, x3, kConstants);
+                    x2 = FoldPolynomialPair(y1, x2, kConstants);
+                    x3 = FoldPolynomialPair(y2, x3, kConstants);
 
                     y1 = LoadFromSource(ref srcRef, 64);
                     y2 = LoadFromSource(ref srcRef, 80);
-                    x4 = VectorHelper.FoldPolynomialPair(y1, x4, kConstants);
-                    x5 = VectorHelper.FoldPolynomialPair(y2, x5, kConstants);
+                    x4 = FoldPolynomialPair(y1, x4, kConstants);
+                    x5 = FoldPolynomialPair(y2, x5, kConstants);
 
                     y1 = LoadFromSource(ref srcRef, 96);
                     y2 = LoadFromSource(ref srcRef, 112);
-                    x6 = VectorHelper.FoldPolynomialPair(y1, x6, kConstants);
-                    x7 = VectorHelper.FoldPolynomialPair(y2, x7, kConstants);
+                    x6 = FoldPolynomialPair(y1, x6, kConstants);
+                    x7 = FoldPolynomialPair(y2, x7, kConstants);
 
                     srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 8);
                     length -= Vector128<byte>.Count * 8;
                 } while (length >= Vector128<byte>.Count * 8);
 
                 // Fold into 128-bits in x7
-                x7 = VectorHelper.FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
-                x7 = VectorHelper.FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
-                x7 = VectorHelper.FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
-                x7 = VectorHelper.FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
-                x7 = VectorHelper.FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
-                x7 = VectorHelper.FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
-                x7 = VectorHelper.FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
+                x7 = FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
+                x7 = FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
+                x7 = FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
+                x7 = FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
+                x7 = FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
+                x7 = FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
+                x7 = FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
             }
             else
             {
@@ -122,7 +123,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
                 // Load and XOR the initial CRC value
                 // CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
                 // because the data will be byte-reflected and will align with initial crc at correct place.
-                x7 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
+                x7 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
                 length -= Vector128<byte>.Count;
@@ -131,7 +132,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
             // Single fold blocks of 16, if any, into x7
             while (length >= Vector128<byte>.Count)
             {
-                x7 = VectorHelper.FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
+                x7 = FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
                     Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
@@ -139,14 +140,14 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
             }
 
             // Compute CRC of a 128-bit value and fold to the upper 64-bits
-            x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
-                 VectorHelper.ShiftLowerToUpper(x7);
+            x7 = CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
+                 ShiftLowerToUpper(x7);
 
             // Barrett reduction
             kConstants = Vector128.Create(0x578d29d06cc4f872UL, 0x42f0e1eba9ea3693UL); // k7, k8
 
             Vector128<ulong> temp = x7;
-            x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
-            x7 = VectorHelper.CarrylessMultiplyUpper(x7, kConstants);
+            x7 = CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
+            x7 = CarrylessMultiplyUpper(x7, kConstants);
             x7 ^= temp;
 
             // Process the remaining bytes, if any
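
Note for reviewers: `FoldPolynomialPair` is the helper that replaces the expanded `CarrylessMultiplyLower`/`CarrylessMultiplyUpper` plus XOR sequences the old Crc32 loop spelled out inline. Below is a minimal sketch of what that folding step does, assuming the x64 `Pclmulqdq` intrinsic and the argument order visible at the call sites above; the real helper lives in VectorHelper.cs, also covers the ARM64 PMULL path, and its parameter names may differ, so treat this as an illustration rather than the actual implementation.

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class CrcFoldSketch
{
    // Carryless multiply of the lower 64-bit lanes (PCLMULQDQ, imm8 = 0x00).
    private static Vector128<ulong> ClmulLower(Vector128<ulong> left, Vector128<ulong> right) =>
        Pclmulqdq.CarrylessMultiply(left, right, 0x00);

    // Carryless multiply of the upper 64-bit lanes (PCLMULQDQ, imm8 = 0x11).
    private static Vector128<ulong> ClmulUpper(Vector128<ulong> left, Vector128<ulong> right) =>
        Pclmulqdq.CarrylessMultiply(left, right, 0x11);

    // One folding step: multiply the vector being folded forward by the pair of
    // k constants (lower and upper lanes separately) and XOR the two products into value:
    //   result = value ^ clmul(toFold.lower, keys.lower) ^ clmul(toFold.upper, keys.upper)
    // At the call sites above, "value" is the freshly loaded block (or the running x7)
    // and "toFold" is the accumulator being advanced.
    public static Vector128<ulong> FoldPolynomialPair(
        Vector128<ulong> value, Vector128<ulong> toFold, Vector128<ulong> keys) =>
        value ^ ClmulLower(toFold, keys) ^ ClmulUpper(toFold, keys);
}
```

Since the single-vector Crc32 path is new, a scalar reference is handy for spot-checking it around the new 16- and 128-byte thresholds. The sketch below is a hypothetical test helper, not part of this change; it computes the standard reflected CRC-32 (polynomial 0xEDB88320) bit by bit, so its result can be compared against `Crc32.HashToUInt32` for inputs of, say, 15, 16, 127, and 128 bytes.

```csharp
using System;

internal static class Crc32Reference
{
    // Bitwise reflected CRC-32 (IEEE polynomial 0xEDB88320) with the usual initial
    // and final complement. Slow but obviously correct, which makes it a convenient
    // oracle for the vectorized path.
    public static uint Compute(ReadOnlySpan<byte> source)
    {
        uint crc = 0xFFFFFFFFu;
        foreach (byte b in source)
        {
            crc ^= b;
            for (int i = 0; i < 8; i++)
                crc = (crc & 1) != 0 ? (crc >> 1) ^ 0xEDB88320u : crc >> 1;
        }
        return ~crc;
    }
}
```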