Skip to content

Commit 99b7601

Browse files
authored
Enable Regex to use SearchValues<string> in compiled / source generator for IgnoreCase multi-strings (#98791)
* Enable Regex to use SearchValues<string> in compiled / source generator TryFindNextStartingPosition

  The analyzer determines a set of prefixes that can start any match, and then uses SearchValues with IndexOfAny to find the next one from that set. It's currently only enabled for case-insensitive matching; we need to do some more perf validation before enabling it for case-sensitive matching.

* Address PR feedback

* Fix unit test
1 parent 6f8d3e3 commit 99b7601

File tree

7 files changed

+473
-22
lines changed

7 files changed

+473
-22
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,11 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w
732732
EmitIndexOfString_RightToLeft();
733733
break;
734734

735+
case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
736+
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
737+
EmitIndexOfStrings_LeftToRight();
738+
break;
739+
735740
case FindNextStartingPositionMode.LeadingSet_LeftToRight:
736741
case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight:
737742
EmitFixedSet_LeftToRight();
@@ -1041,6 +1046,37 @@ UnicodeCategory.NonSpacingMark or
10411046
}
10421047
}
10431048

1049+
// Emits a left-to-right search for any one of multiple leading prefixes. The search is ordinal
// or ordinal case-insensitive, depending on which of the two LeadingStrings find modes is active.
void EmitIndexOfStrings_LeftToRight()
{
    RegexFindOptimizations findOptimizations = regexTree.FindOptimizations;
    Debug.Assert(findOptimizations.FindMode is
        FindNextStartingPositionMode.LeadingStrings_LeftToRight or
        FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);

    // Render every prefix as a literal and join them into the argument list for the generated field.
    string prefixList = string.Join(", ", findOptimizations.LeadingPrefixes.Select(p => Literal(p)));

    // The comparison kind follows directly from the find mode.
    bool ignoreCase = findOptimizations.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight;
    StringComparison comparison = ignoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;

    // The field name is derived from both the comparison kind and the prefix list.
    string searchValuesField = GetSHA256FieldName($"s_indexOfAnyStrings_{comparison}_", prefixList);

    // Register the SearchValues<string> helper field only if it hasn't been registered already.
    if (!requiredHelpers.ContainsKey(searchValuesField))
    {
        requiredHelpers.Add(searchValuesField,
        [
            "/// <summary>Supports searching for the specified strings.</summary>",
            // explicitly using an array in case prefixes is large
            $"internal static readonly SearchValues<string> {searchValuesField} = SearchValues.Create([{prefixList}], StringComparison.{comparison});",
        ]);
    }

    // Emit the search itself: on a hit, record the position and report success.
    writer.WriteLine("// The pattern has multiple strings that could begin the match. Search for any of them.");
    writer.WriteLine("// If none can be found, there's no match.");
    writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOfAny({HelpersTypeName}.{searchValuesField});");
    using (EmitBlock(writer, "if (i >= 0)"))
    {
        writer.WriteLine("base.runtextpos = pos + i;");
        writer.WriteLine("return true;");
    }
}
1079+
10441080
// Emits a case-sensitive right-to-left search for a substring.
10451081
void EmitIndexOfString_RightToLeft()
10461082
{

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,6 +1054,21 @@ public static bool IsAscii(ReadOnlySpan<char> s)
10541054
#endif
10551055
}
10561056

1057+
/// <summary>
/// Determines whether the set description string is for exactly two distinct ASCII letters
/// that case to each other under OrdinalIgnoreCase rules.
/// </summary>
/// <param name="set">The set description string to examine.</param>
/// <param name="twoChars">
/// A scratch buffer of at least two elements. It is passed to <c>GetSetChars</c> and its first two
/// elements are then inspected.
/// </param>
/// <returns><see langword="true"/> if the set qualifies; otherwise, <see langword="false"/>.</returns>
public static bool SetContainsAsciiOrdinalIgnoreCaseCharacter(string set, Span<char> twoChars)
{
    Debug.Assert(twoChars.Length >= 2);

    // A negated set never qualifies, and the set must yield exactly two characters.
    if (IsNegated(set) || GetSetChars(set, twoChars) != 2)
    {
        return false;
    }

    char first = twoChars[0];
    char second = twoChars[1];

    // Both characters must be ASCII and they must differ from one another.
    if (first >= 128 || second >= 128 || first == second)
    {
        return false;
    }

    // Both must be letters, and they must be equal once the 0x20 bit is set on each.
    return
        char.IsLetter(first) &&
        char.IsLetter(second) &&
        (first | 0x20) == (second | 0x20);
}
1071+
10571072
/// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
10581073
/// <remarks>This may enumerate negated characters if the set is negated. This will return false if the set has subtraction.</remarks>
10591074
private static bool CanEasilyEnumerateSetContents(string set) =>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,8 @@ protected void EmitTryFindNextPossibleStartingPosition()
460460
{
461461
case FindNextStartingPositionMode.LeadingString_LeftToRight:
462462
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
463+
case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
464+
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
463465
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
464466
EmitIndexOfString_LeftToRight();
465467
break;
@@ -745,15 +747,19 @@ bool EmitAnchors()
745747
return false;
746748
}
747749

748-
// Emits a case-sensitive left-to-right search for a substring.
750+
// Emits a left-to-right search for a substring or substrings (case-sensitive or ordinal case-insensitive, depending on the find mode).
749751
void EmitIndexOfString_LeftToRight()
750752
{
751753
RegexFindOptimizations opts = _regexTree.FindOptimizations;
752-
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight);
754+
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or
755+
FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or
756+
FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
757+
FindNextStartingPositionMode.LeadingStrings_LeftToRight or
758+
FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);
753759

754760
using RentedLocalBuilder i = RentInt32Local();
755761

756-
// int i = inputSpan.Slice(pos).IndexOf(prefix);
762+
// int i = inputSpan.Slice(pos)...
757763
Ldloca(inputSpan);
758764
Ldloc(pos);
759765
if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight &&
@@ -763,11 +769,21 @@ void EmitIndexOfString_LeftToRight()
763769
Add();
764770
}
765771
Call(s_spanSliceIntMethod);
766-
string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
767-
opts.LeadingPrefix :
768-
opts.FixedDistanceLiteral.String!;
769-
LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
770-
Call(s_spanIndexOfAnySearchValuesString);
772+
773+
// ...IndexOfAny(searchValues);
774+
if (opts.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight)
775+
{
776+
LoadSearchValues(opts.LeadingPrefixes, opts.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
777+
Call(s_spanIndexOfAnySearchValuesString);
778+
}
779+
else
780+
{
781+
string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
782+
opts.LeadingPrefix :
783+
opts.FixedDistanceLiteral.String!;
784+
LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
785+
Call(s_spanIndexOfAnySearchValuesString);
786+
}
771787
Stloc(i);
772788

773789
// if (i < 0) goto ReturnFalse;

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,28 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
137137
return;
138138
}
139139

140-
// We're now left-to-right only and looking for sets.
140+
// We're now left-to-right only and looking for multiple prefixes and/or sets.
141+
142+
// If there are multiple leading strings, we can search for any of them.
143+
if (compiled)
144+
{
145+
if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: true) is { Length: > 1 } caseInsensitivePrefixes)
146+
{
147+
LeadingPrefixes = caseInsensitivePrefixes;
148+
FindMode = FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight;
149+
return;
150+
}
151+
152+
// TODO: While some benchmarks benefit from this significantly, others regressed a bit (in particular those with few
153+
// matches). Before enabling this, we need to investigate the performance impact on real-world scenarios,
154+
// and see if there are ways to reduce the impact.
155+
//if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: false) is { Length: > 1 } caseSensitivePrefixes)
156+
//{
157+
// LeadingPrefixes = caseSensitivePrefixes;
158+
// FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight;
159+
// return;
160+
//}
161+
}
141162

142163
// Build up a list of all of the sets that are a fixed distance from the start of the expression.
143164
List<FixedDistanceSet>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter);
@@ -244,6 +265,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
244265
/// <summary>Gets the leading prefix. May be an empty string.</summary>
245266
public string LeadingPrefix { get; } = string.Empty;
246267

268+
/// <summary>Gets the leading prefixes. May be an empty array.</summary>
269+
public string[] LeadingPrefixes { get; } = Array.Empty<string>();
270+
247271
/// <summary>When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern.</summary>
248272
public (char Char, string? String, int Distance) FixedDistanceLiteral { get; }
249273

@@ -767,10 +791,16 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
767791
return false;
768792
}
769793

794+
// Not supported in the interpreter, but we could end up here for patterns so complex the compiler gave up on them.
795+
796+
case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
797+
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
798+
return true;
799+
770800
// Nothing special to look for. Just return true indicating this is a valid position to try to match.
771801

772802
default:
773-
Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
803+
Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch, $"Unexpected FindMode {FindMode}");
774804
return true;
775805
}
776806
}
@@ -810,6 +840,11 @@ internal enum FindNextStartingPositionMode
810840
/// <summary>A multi-character ordinal case-insensitive substring at the beginning of the pattern.</summary>
811841
LeadingString_OrdinalIgnoreCase_LeftToRight,
812842

843+
/// <summary>Multiple leading prefix strings.</summary>
844+
LeadingStrings_LeftToRight,
845+
/// <summary>Multiple leading ordinal case-insensitive prefix strings.</summary>
846+
LeadingStrings_OrdinalIgnoreCase_LeftToRight,
847+
813848
/// <summary>A set starting the pattern.</summary>
814849
LeadingSet_LeftToRight,
815850
/// <summary>A set starting the right-to-left pattern.</summary>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2561,14 +2561,7 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
25612561
{
25622562
// In particular we want to look for sets that contain only the upper and lowercase variant
25632563
// of the same ASCII letter.
2564-
if (RegexCharClass.IsNegated(child.Str!) ||
2565-
RegexCharClass.GetSetChars(child.Str!, twoChars) != 2 ||
2566-
twoChars[0] >= 128 ||
2567-
twoChars[1] >= 128 ||
2568-
twoChars[0] == twoChars[1] ||
2569-
!char.IsLetter(twoChars[0]) ||
2570-
!char.IsLetter(twoChars[1]) ||
2571-
((twoChars[0] | 0x20) != (twoChars[1] | 0x20)))
2564+
if (!RegexCharClass.SetContainsAsciiOrdinalIgnoreCaseCharacter(child.Str!, twoChars))
25722565
{
25732566
break;
25742567
}

0 commit comments

Comments
 (0)