Skip to content

Commit 74d69fd

Browse files
authored
Improve choice of IndexOfXx routine for some TryFindNextStartingPosition implementations (#89099)
Earlier in .NET 8, we updated the Regex compiler and source generator to be able to vectorize a search for any set, not just simple ones. When one of the main routines couldn't be used, we emit a specialized IndexOfAny helper that uses SearchValues to search for any matching ASCII character or a Unicode character, and if it encounters a Unicode character, it falls back to a linear scan. This meant that a bunch of sets that wouldn't previously have taken these paths now do, but some of those sets have more efficient means of searching; for example, for the set `[^aA]` that searches case-insensitively for anything other than an 'A', with this scheme we'll emit a whole routine that uses SearchValues with a fallback, but we could just use IndexOfAnyExcept('A', 'a'). This fixes the compiler / source generator to prefer such helpers instead when available.
1 parent bced584 commit 74d69fd

File tree

5 files changed

+142
-86
lines changed

5 files changed

+142
-86
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,25 +1063,49 @@ void EmitFixedSet_LeftToRight()
10631063
(true, _) => $"{span}.Slice(i + {primarySet.Distance})",
10641064
};
10651065

1066-
Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
1066+
// Get the IndexOf* expression to use to perform the search.
1067+
string indexOf;
1068+
if (primarySet.Chars is not null)
1069+
{
1070+
// We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
1071+
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
1072+
if (primarySet.Negated)
1073+
{
1074+
indexOfName = indexOfAnyName = "IndexOfAnyExcept";
1075+
}
10671076

1068-
string indexOf =
1069-
primarySet.Chars is not null ? primarySet.Chars.Length switch
1077+
indexOf = primarySet.Chars.Length switch
10701078
{
1071-
1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})",
1072-
2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
1073-
3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
1074-
_ => $"{span}.IndexOfAny({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
1075-
} :
1076-
primarySet.AsciiSet is not null ? $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})" :
1077-
primarySet.Range is not null ? (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
1079+
1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
1080+
2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
1081+
3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
1082+
_ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
1083+
};
1084+
}
1085+
else if (primarySet.AsciiSet is not null)
1086+
{
1087+
// We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
1088+
Debug.Assert(!primarySet.Negated);
1089+
indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
1090+
}
1091+
else if (primarySet.Range is not null)
1092+
{
1093+
// We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
1094+
// where we end up with a set of a single char, we can use IndexOf instead.
1095+
indexOf = (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
10781096
{
10791097
(false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
10801098
(true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})",
10811099
(false, true) => $"{span}.IndexOfAnyExceptInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
10821100
(true, true) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Range.Value.LowInclusive)})",
1083-
} :
1084-
$"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
1101+
};
1102+
}
1103+
else
1104+
{
1105+
// We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
1106+
// will perform the search as efficiently as possible.
1107+
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
1108+
}
10851109

10861110
if (needLoop)
10871111
{
@@ -1184,6 +1208,7 @@ void EmitFixedSet_RightToLeft()
11841208

11851209
if (set.Chars is { Length: 1 })
11861210
{
1211+
Debug.Assert(!set.Negated);
11871212
writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(set.Chars[0])});");
11881213
using (EmitBlock(writer, "if (pos >= 0)"))
11891214
{
@@ -3307,7 +3332,7 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
33073332
{
33083333
if (iterationCount is null &&
33093334
node.Kind is RegexNodeKind.Notonelazy &&
3310-
subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch
3335+
subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max efficiently optimized by IndexOfAny, and we need to reserve 1 for node.Ch
33113336
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
33123337
(literal.String is not null ||
33133338
literal.SetChars is not null ||

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -901,37 +901,35 @@ void EmitFixedSet_LeftToRight()
901901
Ldloc(textSpanLocal);
902902
}
903903

904-
Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
905-
906904
if (primarySet.Chars is not null)
907905
{
908906
switch (primarySet.Chars.Length)
909907
{
910908
case 1:
911909
// tmp = ...IndexOf(setChars[0]);
912910
Ldc(primarySet.Chars[0]);
913-
Call(s_spanIndexOfChar);
911+
Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar);
914912
break;
915913

916914
case 2:
917915
// tmp = ...IndexOfAny(setChars[0], setChars[1]);
918916
Ldc(primarySet.Chars[0]);
919917
Ldc(primarySet.Chars[1]);
920-
Call(s_spanIndexOfAnyCharChar);
918+
Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar);
921919
break;
922920

923921
case 3:
924922
// tmp = ...IndexOfAny(setChars[0], setChars[1], setChars[2]});
925923
Ldc(primarySet.Chars[0]);
926924
Ldc(primarySet.Chars[1]);
927925
Ldc(primarySet.Chars[2]);
928-
Call(s_spanIndexOfAnyCharCharChar);
926+
Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
929927
break;
930928

931929
default:
932930
Ldstr(new string(primarySet.Chars));
933931
Call(s_stringAsSpanMethod);
934-
Call(s_spanIndexOfAnySpan);
932+
Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
935933
break;
936934
}
937935
}
@@ -1166,6 +1164,8 @@ void EmitFixedSet_RightToLeft()
11661164

11671165
if (set.Chars is { Length: 1 })
11681166
{
1167+
Debug.Assert(!set.Negated);
1168+
11691169
// pos = inputSpan.Slice(0, pos).LastIndexOf(set.Chars[0]);
11701170
Ldloca(inputSpan);
11711171
Ldc(0);

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
using System.Collections.Generic;
55
using System.Diagnostics;
6-
using System.Globalization;
76

87
namespace System.Text.RegularExpressions
98
{
@@ -95,7 +94,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
9594
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
9695
{
9796
// See if the set is limited to holding only a few characters.
98-
Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
97+
Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
9998
int scratchCount;
10099
char[]? chars = null;
101100
if (!RegexCharClass.IsNegated(charClass) &&
@@ -109,12 +108,14 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
109108
{
110109
// The set contains one and only one character, meaning every match starts
111110
// with the same literal value (potentially case-insensitive). Search for that.
111+
Debug.Assert(!RegexCharClass.IsNegated(charClass));
112112
FixedDistanceLiteral = (chars[0], null, 0);
113113
FindMode = FindNextStartingPositionMode.LeadingChar_RightToLeft;
114114
}
115115
else
116116
{
117117
// The set may match multiple characters. Search for that.
118+
Debug.Assert(!RegexCharClass.IsNegated(charClass) || chars is null);
118119
FixedDistanceSets = new List<FixedDistanceSet>()
119120
{
120121
new FixedDistanceSet(chars, charClass, 0)
@@ -154,22 +155,32 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
154155

155156
// As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so
156157
// we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading
157-
// set if it's something for which we can vectorize a search).
158+
// set if it's something for which we can search efficiently).
158159
(RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
159160

160-
// If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support a vectorized
161-
// search and we did successfully find a literal after an atomic loop we could search instead, we prefer the vectorizable search.
161+
// If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support an efficient
162+
// search and we did successfully find a literal after an atomic loop we could search instead, we prefer the efficient search.
163+
// For example, if we have a negated set, we will still prefer the literal-after-an-atomic-loop because negated sets typically
164+
// contain _many_ characters (e.g. [^a] is everything but 'a') and are thus more likely to very quickly match, which means any
165+
// vectorization employed is less likely to kick in and be worth the startup overhead.
162166
if (fixedDistanceSets is not null)
163167
{
168+
// Sort the sets by "quality", such that whatever set is first is the one deemed most efficient to use.
169+
// In some searches, we may use multiple sets, so we want the subsequent ones to also be the efficiency runners-up.
164170
RegexPrefixAnalyzer.SortFixedDistanceSetsByQuality(fixedDistanceSets);
165-
if (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null)
171+
172+
// If there is no literal after the loop, use whatever set we got.
173+
// If there is a literal after the loop, consider it to be better than a negated set and better than a set with many characters.
174+
if (literalAfterLoop is null ||
175+
(fixedDistanceSets[0].Chars is not null && !fixedDistanceSets[0].Negated))
166176
{
167177
// Determine whether to do searching based on one or more sets or on a single literal. Compiled engines
168178
// don't need to special-case literals as they already do codegen to create the optimal lookup based on
169179
// the set's characteristics.
170180
if (!compiled &&
171181
fixedDistanceSets.Count == 1 &&
172-
fixedDistanceSets[0].Chars is { Length: 1 })
182+
fixedDistanceSets[0].Chars is { Length: 1 } &&
183+
!fixedDistanceSets[0].Negated)
173184
{
174185
FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance);
175186
FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight;
@@ -186,8 +197,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
186197

187198
// Store the sets, and compute which mode to use.
188199
FixedDistanceSets = fixedDistanceSets;
189-
FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight
190-
: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
200+
FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ?
201+
FindNextStartingPositionMode.LeadingSet_LeftToRight :
202+
FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
191203
_asciiLookups = new uint[fixedDistanceSets.Count][];
192204
}
193205
return;
@@ -322,6 +334,7 @@ private static (string String, int Distance)? FindFixedDistanceString(List<Fixed
322334
return best;
323335
}
324336

337+
#if SYSTEM_TEXT_REGULAREXPRESSIONS
325338
/// <summary>Try to advance to the next starting position that might be a location for a match.</summary>
326339
/// <param name="textSpan">The text to search.</param>
327340
/// <param name="pos">The position in <paramref name="textSpan"/>. This is updated with the found position.</param>
@@ -578,12 +591,11 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
578591
{
579592
FixedDistanceSet primarySet = FixedDistanceSets![0];
580593
char[]? chars = primarySet.Chars;
581-
string set = primarySet.Set;
582594

583595
ReadOnlySpan<char> span = textSpan.Slice(pos);
584596
if (chars is not null)
585597
{
586-
int i = span.IndexOfAny(chars);
598+
int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
587599
if (i >= 0)
588600
{
589601
pos += i;
@@ -595,7 +607,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
595607
ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
596608
for (int i = 0; i < span.Length; i++)
597609
{
598-
if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
610+
if (RegexCharClass.CharInClass(span[i], primarySet.Set, ref startingAsciiLookup))
599611
{
600612
pos += i;
601613
return true;
@@ -653,7 +665,8 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
653665
for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
654666
{
655667
int offset = inputPosition + primarySet.Distance;
656-
int index = textSpan.Slice(offset).IndexOfAny(primarySet.Chars);
668+
ReadOnlySpan<char> textSpanAtOffset = textSpan.Slice(offset);
669+
int index = primarySet.Negated ? textSpanAtOffset.IndexOfAnyExcept(primarySet.Chars) : textSpanAtOffset.IndexOfAny(primarySet.Chars);
657670
if (index < 0)
658671
{
659672
break;
@@ -769,6 +782,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
769782
return true;
770783
}
771784
}
785+
#endif
772786
}
773787

774788
/// <summary>Mode to use for searching for the next location of a possible match.</summary>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1419,7 +1419,7 @@ public char FirstCharOfOneOrMulti()
14191419
/// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
14201420
/// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
14211421
/// </returns>
1422-
public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
1422+
public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max efficiently optimized by IndexOfAny today
14231423
{
14241424
Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
14251425

0 commit comments

Comments
 (0)