Skip to content

Commit

Permalink
Vectorize shorter buffers for CRC-32 on Intel (#86539)
Browse files Browse the repository at this point in the history
  • Loading branch information
brantburnett authored May 22, 2023
1 parent 52597f5 commit ed33e6c
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 99 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.InteropServices;
using static System.IO.Hashing.VectorHelper;

namespace System.IO.Hashing
{
Expand All @@ -17,7 +18,9 @@ public partial class Crc32
private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
BitConverter.IsLittleEndian
&& VectorHelper.IsSupported
&& source.Length >= Vector128<byte>.Count * 4;
// Vectorization can process spans as short as a single vector (16 bytes), but when the ARM CRC-32 intrinsics are
// supported, those intrinsics appear to be faster for spans shorter than 8 vectors (128 bytes).
&& source.Length >= Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1);

// Processes the bytes in source in 64 byte chunks using carryless/polynomial multiplication intrinsics,
// followed by processing 16 byte chunks, and then processing remaining bytes individually. Requires
Expand All @@ -35,102 +38,81 @@ private static uint UpdateVectorized(uint crc, ReadOnlySpan<byte> source)
ref byte srcRef = ref MemoryMarshal.GetReference(source);
int length = source.Length;

Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
Vector128<ulong> x5;
Vector128<ulong> kConstants;
Vector128<ulong> x1; // Accumulator for the new CRC
Vector128<ulong> x2;

x1 ^= Vector128.CreateScalar(crc).AsUInt64();
Vector128<ulong> x0 = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
length -= Vector128<byte>.Count * 4;

// Parallel fold blocks of 64, if any.
while (length >= Vector128<byte>.Count * 4)
if (length >= Vector128<byte>.Count * 8)
{
x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
Vector128<ulong> x6 = VectorHelper.CarrylessMultiplyLower(x2, x0);
Vector128<ulong> x7 = VectorHelper.CarrylessMultiplyLower(x3, x0);
Vector128<ulong> x8 = VectorHelper.CarrylessMultiplyLower(x4, x0);

x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x2 = VectorHelper.CarrylessMultiplyUpper(x2, x0);
x3 = VectorHelper.CarrylessMultiplyUpper(x3, x0);
x4 = VectorHelper.CarrylessMultiplyUpper(x4, x0);

Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();

x1 ^= x5;
x2 ^= x6;
x3 ^= x7;
x4 ^= x8;

x1 ^= y5;
x2 ^= y6;
x3 ^= y7;
x4 ^= y8;
x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
length -= Vector128<byte>.Count * 4;
}

// Fold into 128-bits.
x0 = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4

x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x1 ^= x2;
x1 ^= x5;
// Load and XOR the initial CRC value
x1 ^= Vector128.CreateScalar(crc).AsUInt64();

kConstants = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2

// Parallel fold blocks of 64, if any.
do
{
Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();

x1 = FoldPolynomialPair(y5, x1, kConstants);
x2 = FoldPolynomialPair(y6, x2, kConstants);
x3 = FoldPolynomialPair(y7, x3, kConstants);
x4 = FoldPolynomialPair(y8, x4, kConstants);

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
length -= Vector128<byte>.Count * 4;
} while (length >= Vector128<byte>.Count * 4);

// Fold into 128-bits.
kConstants = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
x1 = FoldPolynomialPair(x2, x1, kConstants);
x1 = FoldPolynomialPair(x3, x1, kConstants);
x1 = FoldPolynomialPair(x4, x1, kConstants);
}
else
{
// For shorter sources just load the first vector and XOR with the CRC
Debug.Assert(length >= 16);

x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x1 ^= x3;
x1 ^= x5;
x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
x1 ^= Vector128.CreateScalar(crc).AsUInt64();

x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x1 ^= x4;
x1 ^= x5;
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
length -= Vector128<byte>.Count;
}

// Single fold blocks of 16, if any.
while (length >= Vector128<byte>.Count)
{
x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();

x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x1 ^= x2;
x1 ^= x5;
x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1,
Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL));

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
length -= Vector128<byte>.Count;
}

// Fold 128 bits to 64 bits.
x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x1, x0);
x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
x1 = VectorHelper.ShiftRightBytesInVector(x1, 8);
x1 ^= x2;

x0 = Vector128.CreateScalar(0x0163cd6124UL); // k5, k0

x2 = VectorHelper.ShiftRightBytesInVector(x1, 4);
x1 &= x3;
x1 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 ^= x2;
Vector128<ulong> bitmask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
x1 = ShiftRightBytesInVector(x1, 8) ^
CarrylessMultiplyLower(x1, Vector128.CreateScalar(0x00ccaa009eUL));
x1 = CarrylessMultiplyLower(x1 & bitmask, Vector128.CreateScalar(0x0163cd6124UL)) ^ // k5, k0
ShiftRightBytesInVector(x1, 4);

// Reduce to 32 bits.
x0 = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial

x2 = x1 & x3;
x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x2, x0);
x2 &= x3;
x2 = VectorHelper.CarrylessMultiplyLower(x2, x0);
kConstants = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
x2 = CarrylessMultiplyLeftLowerRightUpper(x1 & bitmask, kConstants) & bitmask;
x2 = CarrylessMultiplyLower(x2, kConstants);
x1 ^= x2;

// Process the remaining bytes, if any
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.InteropServices;
using static System.IO.Hashing.VectorHelper;

namespace System.IO.Hashing
{
Expand Down Expand Up @@ -72,7 +73,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
// Load and XOR the initial CRC value
// The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register
// because the data will be byte-reflected and will then align with the initial CRC at the correct place.
x0 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
x0 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));

kConstants = Vector128.Create(0x5cf79dea9ac37d6UL, 0x001067e571d7d5c2UL); // k3, k4

Expand All @@ -81,36 +82,36 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
{
Vector128<ulong> y1 = LoadFromSource(ref srcRef, 0);
Vector128<ulong> y2 = LoadFromSource(ref srcRef, 16);
x0 = VectorHelper.FoldPolynomialPair(y1, x0, kConstants);
x1 = VectorHelper.FoldPolynomialPair(y2, x1, kConstants);
x0 = FoldPolynomialPair(y1, x0, kConstants);
x1 = FoldPolynomialPair(y2, x1, kConstants);

y1 = LoadFromSource(ref srcRef, 32);
y2 = LoadFromSource(ref srcRef, 48);
x2 = VectorHelper.FoldPolynomialPair(y1, x2, kConstants);
x3 = VectorHelper.FoldPolynomialPair(y2, x3, kConstants);
x2 = FoldPolynomialPair(y1, x2, kConstants);
x3 = FoldPolynomialPair(y2, x3, kConstants);

y1 = LoadFromSource(ref srcRef, 64);
y2 = LoadFromSource(ref srcRef, 80);
x4 = VectorHelper.FoldPolynomialPair(y1, x4, kConstants);
x5 = VectorHelper.FoldPolynomialPair(y2, x5, kConstants);
x4 = FoldPolynomialPair(y1, x4, kConstants);
x5 = FoldPolynomialPair(y2, x5, kConstants);

y1 = LoadFromSource(ref srcRef, 96);
y2 = LoadFromSource(ref srcRef, 112);
x6 = VectorHelper.FoldPolynomialPair(y1, x6, kConstants);
x7 = VectorHelper.FoldPolynomialPair(y2, x7, kConstants);
x6 = FoldPolynomialPair(y1, x6, kConstants);
x7 = FoldPolynomialPair(y2, x7, kConstants);

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 8);
length -= Vector128<byte>.Count * 8;
} while (length >= Vector128<byte>.Count * 8);

// Fold into 128-bits in x7
x7 = VectorHelper.FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
x7 = VectorHelper.FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
x7 = VectorHelper.FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
x7 = VectorHelper.FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
x7 = VectorHelper.FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
x7 = VectorHelper.FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
x7 = VectorHelper.FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
x7 = FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
x7 = FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
x7 = FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
x7 = FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
x7 = FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
x7 = FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
x7 = FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
}
else
{
Expand All @@ -122,7 +123,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
// Load and XOR the initial CRC value
// The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register
// because the data will be byte-reflected and will then align with the initial CRC at the correct place.
x7 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
x7 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
length -= Vector128<byte>.Count;
Expand All @@ -131,22 +132,22 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
// Single fold blocks of 16, if any, into x7
while (length >= Vector128<byte>.Count)
{
x7 = VectorHelper.FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
x7 = FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
length -= Vector128<byte>.Count;
}

// Compute CRC of a 128-bit value and fold to the upper 64-bits
x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
VectorHelper.ShiftLowerToUpper(x7);
x7 = CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
ShiftLowerToUpper(x7);

// Barrett reduction
kConstants = Vector128.Create(0x578d29d06cc4f872UL, 0x42f0e1eba9ea3693UL); // k7, k8
Vector128<ulong> temp = x7;
x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
x7 = VectorHelper.CarrylessMultiplyUpper(x7, kConstants);
x7 = CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
x7 = CarrylessMultiplyUpper(x7, kConstants);
x7 ^= temp;

// Process the remaining bytes, if any
Expand Down

0 comments on commit ed33e6c

Please sign in to comment.