-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vectorized MemoryExtensions.CommonPrefixLength #68210
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2033,6 +2033,18 @@ public static int CommonPrefixLength<T>(this Span<T> span, ReadOnlySpan<T> other | |
/// <returns>The length of the common prefix shared by the two spans. If there's no shared prefix, 0 is returned.</returns> | ||
public static int CommonPrefixLength<T>(this ReadOnlySpan<T> span, ReadOnlySpan<T> other) | ||
{ | ||
if (RuntimeHelpers.IsBitwiseEquatable<T>()) | ||
{ | ||
nuint length = Math.Min((nuint)(uint)span.Length, (nuint)(uint)other.Length); | ||
nuint size = (uint)Unsafe.SizeOf<T>(); | ||
nuint index = SpanHelpers.CommonPrefixLength( | ||
ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)), | ||
ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(other)), | ||
length * size); | ||
|
||
return (int)(index / size); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The use of byte and then dividing to get rid of partials is clever... but deserves a big comment :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Too verbose now? |
||
} | ||
|
||
// Shrink one of the spans if necessary to ensure they're both the same length. We can then iterate until | ||
// the Length of one of them and at least have bounds checks removed from that one. | ||
SliceLongerSpanToMatchShorterLength(ref span, ref other); | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -2111,6 +2111,95 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref | |||
return firstLength - secondLength; | ||||
} | ||||
|
||||
public static nuint CommonPrefixLength(ref byte first, ref byte second, nuint length) | ||||
{ | ||||
nuint i; | ||||
|
||||
if (!Vector128.IsHardwareAccelerated || length < (nuint)Vector128<byte>.Count) | ||||
gfoidl marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
{ | ||||
// To have kind of fast path for small inputs, we handle as much elements needed | ||||
// so that either we are done or can use the unrolled loop below. | ||||
i = length % 4; | ||||
|
||||
if (i > 0) | ||||
{ | ||||
if (first != second) | ||||
{ | ||||
return 0; | ||||
} | ||||
|
||||
if (i > 1) | ||||
{ | ||||
if (Unsafe.Add(ref first, 1) != Unsafe.Add(ref second, 1)) | ||||
{ | ||||
return 1; | ||||
} | ||||
|
||||
if (i > 2 && Unsafe.Add(ref first, 2) != Unsafe.Add(ref second, 2)) | ||||
{ | ||||
return 2; | ||||
} | ||||
} | ||||
} | ||||
|
||||
for (; (nint)i <= (nint)length - 4; i += 4) | ||||
{ | ||||
if (Unsafe.Add(ref first, i + 0) != Unsafe.Add(ref second, i + 0)) return i + 0; | ||||
if (Unsafe.Add(ref first, i + 1) != Unsafe.Add(ref second, i + 1)) return i + 1; | ||||
if (Unsafe.Add(ref first, i + 2) != Unsafe.Add(ref second, i + 2)) return i + 2; | ||||
if (Unsafe.Add(ref first, i + 3) != Unsafe.Add(ref second, i + 3)) return i + 3; | ||||
} | ||||
|
||||
return length; | ||||
} | ||||
|
||||
Debug.Assert(length >= (uint)Vector128<byte>.Count); | ||||
|
||||
int mask; | ||||
nuint lengthToExamine = length - (nuint)Vector128<byte>.Count; | ||||
|
||||
Vector128<byte> firstVec; | ||||
Vector128<byte> secondVec; | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Remove this and do -- It does still fold the load most of the time, but there are cases where storing to a local can make it "worse" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Good to know, thanks! Checked codegen, no difference. |
||||
Vector128<byte> maskVec; | ||||
i = 0; | ||||
|
||||
if (lengthToExamine != 0) | ||||
{ | ||||
do | ||||
{ | ||||
firstVec = Vector128.LoadUnsafe(ref first, i); | ||||
secondVec = Vector128.LoadUnsafe(ref second, i); | ||||
maskVec = Vector128.Equals(firstVec, secondVec); | ||||
mask = (int)maskVec.ExtractMostSignificantBits(); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why convert to So keeping it as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍🏻
So it's a bit more work for the JIT. What is peferable now? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We don't have any official guidance right yet. My personal recommendation is that if you need a |
||||
|
||||
if (mask != 0xFFFF) | ||||
{ | ||||
goto Found; | ||||
} | ||||
|
||||
i += (nuint)Vector128<byte>.Count; | ||||
} while (i < lengthToExamine); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: using |
||||
} | ||||
|
||||
// Do final compare as Vector128<byte>.Count from end rather than start | ||||
i = lengthToExamine; | ||||
firstVec = Vector128.LoadUnsafe(ref first, i); | ||||
secondVec = Vector128.LoadUnsafe(ref second, i); | ||||
maskVec = Vector128.Equals(firstVec, secondVec); | ||||
mask = (int)maskVec.ExtractMostSignificantBits(); | ||||
|
||||
if (mask != 0xFFFF) | ||||
{ | ||||
goto Found; | ||||
} | ||||
|
||||
return length; | ||||
|
||||
Found: | ||||
mask = ~mask; | ||||
return i + (uint)BitOperations.TrailingZeroCount(mask); | ||||
} | ||||
|
||||
// Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138 | ||||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||||
private static int LocateLastFoundByte(Vector<byte> match) | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was going to suggest
nuint.Min
but then I remembered thatIntPtr
andUIntPtr
still have them explicitly implemented until the work around dotnet/csharplang#6031 goes in.