From 2b87d850cb6db16d9dca92177713920730d3a1f2 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Wed, 23 Nov 2022 05:02:57 +0000 Subject: [PATCH] Improve IndexOfAnyAsciiSearcher ARM throughput (#78739) * Improve IndexOfAnyAsciiSearcher ARM throughput Also solves an edge-case bug for byte overloads on ARM64 where we'd get false positives (negatives for Except) when haystack values were exactly 128 above a value in the needle * Remove type name asserts --- .../tests/Span/IndexOfAny.byte.cs | 44 +++++++++++++ .../tests/Span/IndexOfAny.char.cs | 66 +++++++++++++++++++ .../IndexOfAnyAsciiSearcher.cs | 23 ++++--- 3 files changed, 123 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Memory/tests/Span/IndexOfAny.byte.cs b/src/libraries/System.Memory/tests/Span/IndexOfAny.byte.cs index 5690c54941ba4..5791f29982095 100644 --- a/src/libraries/System.Memory/tests/Span/IndexOfAny.byte.cs +++ b/src/libraries/System.Memory/tests/Span/IndexOfAny.byte.cs @@ -553,6 +553,50 @@ static int IndexOfAnyReferenceImpl(ReadOnlySpan searchSpace, ReadOnlySpan< } } + [Theory] + [InlineData(true)] + [InlineData(false)] + public static void AsciiNeedle_ProperlyHandlesEdgeCases_Byte(bool needleContainsZero) + { + // There is some special handling we have to do for ASCII needles to properly filter out non-ASCII results + ReadOnlySpan needleValues = needleContainsZero ? "AEIOU\0"u8 : "AEIOU!"u8; + IndexOfAnyValues needle = IndexOfAnyValues.Create(needleValues); + + ReadOnlySpan repeatingHaystack = "AaAaAaAaAaAa"u8; + Assert.Equal(0, repeatingHaystack.IndexOfAny(needle)); + Assert.Equal(1, repeatingHaystack.IndexOfAnyExcept(needle)); + Assert.Equal(10, repeatingHaystack.LastIndexOfAny(needle)); + Assert.Equal(11, repeatingHaystack.LastIndexOfAnyExcept(needle)); + + ReadOnlySpan haystackWithZeroes = "Aa\0Aa\0Aa\0"u8; + Assert.Equal(0, haystackWithZeroes.IndexOfAny(needle)); + Assert.Equal(1, haystackWithZeroes.IndexOfAnyExcept(needle)); + Assert.Equal(needleContainsZero ? 8 : 6, haystackWithZeroes.LastIndexOfAny(needle)); + Assert.Equal(needleContainsZero ? 7 : 8, haystackWithZeroes.LastIndexOfAnyExcept(needle)); + + Span haystackWithOffsetNeedle = new byte[100]; + for (int i = 0; i < haystackWithOffsetNeedle.Length; i++) + { + haystackWithOffsetNeedle[i] = (byte)(128 + needleValues[i % needleValues.Length]); + } + + Assert.Equal(-1, haystackWithOffsetNeedle.IndexOfAny(needle)); + Assert.Equal(0, haystackWithOffsetNeedle.IndexOfAnyExcept(needle)); + Assert.Equal(-1, haystackWithOffsetNeedle.LastIndexOfAny(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 1, haystackWithOffsetNeedle.LastIndexOfAnyExcept(needle)); + + // Mix matching characters back in + for (int i = 0; i < haystackWithOffsetNeedle.Length; i += 3) + { + haystackWithOffsetNeedle[i] = needleValues[i % needleValues.Length]; + } + + Assert.Equal(0, haystackWithOffsetNeedle.IndexOfAny(needle)); + Assert.Equal(1, haystackWithOffsetNeedle.IndexOfAnyExcept(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 1, haystackWithOffsetNeedle.LastIndexOfAny(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 2, haystackWithOffsetNeedle.LastIndexOfAnyExcept(needle)); + } + private static int IndexOf(Span span, byte value) { int index = span.IndexOf(value); diff --git a/src/libraries/System.Memory/tests/Span/IndexOfAny.char.cs b/src/libraries/System.Memory/tests/Span/IndexOfAny.char.cs index 7189231af38cd..c616ec2adf865 100644 --- a/src/libraries/System.Memory/tests/Span/IndexOfAny.char.cs +++ b/src/libraries/System.Memory/tests/Span/IndexOfAny.char.cs @@ -799,6 +799,72 @@ static int IndexOfAnyReferenceImpl(ReadOnlySpan searchSpace, ReadOnlySpan< } } + [Theory] + [InlineData(true)] + [InlineData(false)] + public static void AsciiNeedle_ProperlyHandlesEdgeCases_Char(bool needleContainsZero) + { + // There is some special handling we have to do for ASCII needles to properly filter out non-ASCII results + ReadOnlySpan needleValues = needleContainsZero ? "AEIOU\0" : "AEIOU!"; + IndexOfAnyValues needle = IndexOfAnyValues.Create(needleValues); + + ReadOnlySpan repeatingHaystack = "AaAaAaAaAaAa"; + Assert.Equal(0, repeatingHaystack.IndexOfAny(needle)); + Assert.Equal(1, repeatingHaystack.IndexOfAnyExcept(needle)); + Assert.Equal(10, repeatingHaystack.LastIndexOfAny(needle)); + Assert.Equal(11, repeatingHaystack.LastIndexOfAnyExcept(needle)); + + ReadOnlySpan haystackWithZeroes = "Aa\0Aa\0Aa\0"; + Assert.Equal(0, haystackWithZeroes.IndexOfAny(needle)); + Assert.Equal(1, haystackWithZeroes.IndexOfAnyExcept(needle)); + Assert.Equal(needleContainsZero ? 8 : 6, haystackWithZeroes.LastIndexOfAny(needle)); + Assert.Equal(needleContainsZero ? 7 : 8, haystackWithZeroes.LastIndexOfAnyExcept(needle)); + + Span haystackWithOffsetNeedle = new char[100]; + for (int i = 0; i < haystackWithOffsetNeedle.Length; i++) + { + haystackWithOffsetNeedle[i] = (char)(128 + needleValues[i % needleValues.Length]); + } + + Assert.Equal(-1, haystackWithOffsetNeedle.IndexOfAny(needle)); + Assert.Equal(0, haystackWithOffsetNeedle.IndexOfAnyExcept(needle)); + Assert.Equal(-1, haystackWithOffsetNeedle.LastIndexOfAny(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 1, haystackWithOffsetNeedle.LastIndexOfAnyExcept(needle)); + + // Mix matching characters back in + for (int i = 0; i < haystackWithOffsetNeedle.Length; i += 3) + { + haystackWithOffsetNeedle[i] = needleValues[i % needleValues.Length]; + } + + Assert.Equal(0, haystackWithOffsetNeedle.IndexOfAny(needle)); + Assert.Equal(1, haystackWithOffsetNeedle.IndexOfAnyExcept(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 1, haystackWithOffsetNeedle.LastIndexOfAny(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 2, haystackWithOffsetNeedle.LastIndexOfAnyExcept(needle)); + + // With chars, the lower byte could be matching, but we have to check that the higher byte is also 0 + for (int i = 0; i < haystackWithOffsetNeedle.Length; i++) + { + haystackWithOffsetNeedle[i] = (char)(((i + 1) * 256) + needleValues[i % needleValues.Length]); + } + + Assert.Equal(-1, haystackWithOffsetNeedle.IndexOfAny(needle)); + Assert.Equal(0, haystackWithOffsetNeedle.IndexOfAnyExcept(needle)); + Assert.Equal(-1, haystackWithOffsetNeedle.LastIndexOfAny(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 1, haystackWithOffsetNeedle.LastIndexOfAnyExcept(needle)); + + // Mix matching characters back in + for (int i = 0; i < haystackWithOffsetNeedle.Length; i += 3) + { + haystackWithOffsetNeedle[i] = needleValues[i % needleValues.Length]; + } + + Assert.Equal(0, haystackWithOffsetNeedle.IndexOfAny(needle)); + Assert.Equal(1, haystackWithOffsetNeedle.IndexOfAnyExcept(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 1, haystackWithOffsetNeedle.LastIndexOfAny(needle)); + Assert.Equal(haystackWithOffsetNeedle.Length - 2, haystackWithOffsetNeedle.LastIndexOfAnyExcept(needle)); + } + private static int IndexOf(Span span, char value) { int index = span.IndexOf(value); diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs index c33807fd33a18..16d6f4e72f2e7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs @@ -490,26 +490,23 @@ private static Vector128 IndexOfAnyLookup(Vector // X86: Downcast every character using saturation. // - Values <= 32767 result in min(value, 255). // - Values > 32767 result in 0. Because of this we must do more work to handle needles that contain 0. - // ARM64: Take the low byte of each character. - // - All values result in (value & 0xFF). + // ARM64: Do narrowing saturation over unsigned values. + // - All values result in min(value, 255) Vector128 source = Sse2.IsSupported ? Sse2.PackUnsignedSaturate(source0, source1) - : AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte()); + : AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(source0.AsUInt16()), source1.AsUInt16()); Vector128 result = IndexOfAnyLookupCore(source, bitmapLookup); - // On ARM64, we ignored the high byte of every character when packing (see above). - // The 'result' can therefore contain false positives - e.g. 0x141 would match 0x41 ('A'). // On X86, PackUnsignedSaturate resulted in values becoming 0 for inputs above 32767. // Any value above 32767 would therefore match against 0. If 0 is present in the needle, we must clear the false positives. // In both cases, we can correct the result by clearing any bits that matched with a non-ascii source character. - if (AdvSimd.Arm64.IsSupported || TOptimizations.NeedleContainsZero) + if (TOptimizations.NeedleContainsZero) { + Debug.Assert(Sse2.IsSupported); Vector128 ascii0 = Vector128.LessThan(source0.AsUInt16(), Vector128.Create((ushort)128)).AsInt16(); Vector128 ascii1 = Vector128.LessThan(source1.AsUInt16(), Vector128.Create((ushort)128)).AsInt16(); - Vector128 ascii = Sse2.IsSupported - ? Sse2.PackSignedSaturate(ascii0, ascii1).AsByte() - : AdvSimd.Arm64.UnzipEven(ascii0.AsByte(), ascii1.AsByte()); + Vector128 ascii = Sse2.PackSignedSaturate(ascii0, ascii1).AsByte(); result &= ascii; } @@ -542,7 +539,13 @@ private static Vector128 IndexOfAnyLookupCore(Vector128 source, Vect ? source : source & Vector128.Create((byte)0xF); - Vector128 highNibbles = Vector128.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector128.Create((byte)0xF); + // On ARM, we have an instruction for an arithmetic right shift of 1-byte signed values. + // The shift will map values above 127 to values above 16, which the shuffle will then map to 0. + // This is how we exclude non-ASCII values from results on ARM. + // On X86, use a 4-byte value shift with AND 15 to emulate a 1-byte value logical shift. + Vector128 highNibbles = AdvSimd.IsSupported + ? AdvSimd.ShiftRightArithmetic(source.AsSByte(), 4).AsByte() + : Sse2.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector128.Create((byte)0xF); // The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle. // Lookup the rows via the lower nibble and the column via the higher nibble.