diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index 19488703f07f7..63b660251e909 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -1247,14 +1247,17 @@ void EmitFixedSet_RightToLeft()
void EmitLiteralAfterAtomicLoop()
{
Debug.Assert(regexTree.FindOptimizations.LiteralAfterLoop is not null);
- (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = regexTree.FindOptimizations.LiteralAfterLoop.Value;
+ (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal) target = regexTree.FindOptimizations.LiteralAfterLoop.Value;
Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic);
Debug.Assert(target.LoopNode.N == int.MaxValue);
+ string stringComparisonComment = target.Literal.StringComparison == StringComparison.OrdinalIgnoreCase ? "ordinal case-insensitive " : "";
+ string stringComparisonArgument = target.Literal.StringComparison == StringComparison.OrdinalIgnoreCase ? ", StringComparison.OrdinalIgnoreCase" : "";
+
writer.Write($"// The pattern begins with an atomic loop for {DescribeSet(target.LoopNode.Str!)}, followed by ");
writer.WriteLine(
- target.Literal.String is not null ? $"the string {Literal(target.Literal.String)}." :
+ target.Literal.String is not null ? $"the {stringComparisonComment}string {Literal(target.Literal.String)}." :
target.Literal.Chars is not null ? $"one of the characters {Literal(new string(target.Literal.Chars))}" :
$"the character {Literal(target.Literal.Char)}.");
writer.WriteLine($"// Search for the literal, and then walk backwards to the beginning of the loop.");
@@ -1275,7 +1278,7 @@ void EmitLiteralAfterAtomicLoop()
// Find the literal. If we can't find it, we're done searching.
writer.Write("int i = slice.");
writer.WriteLine(
- target.Literal.String is string literalString ? $"IndexOf({Literal(literalString)});" :
+ target.Literal.String is string literalString ? $"IndexOf({Literal(literalString)}{stringComparisonArgument});" :
target.Literal.Chars is not char[] literalChars ? $"IndexOf({Literal(target.Literal.Char)});" :
literalChars.Length switch
{
@@ -2582,6 +2585,9 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
if (rm.Tree.FindOptimizations.FindMode == FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight &&
rm.Tree.FindOptimizations.LiteralAfterLoop?.LoopNode == node)
{
+ // This is the set loop that's part of the literal-after-loop optimization: the end of the loop
+ // is stored in runtrackpos, so we just need to transfer that to pos. The optimization is only
+ // selected if the shape of the tree is amenable.
Debug.Assert(sliceStaticPos == 0, "This should be the first node and thus static position shouldn't have advanced.");
writer.WriteLine("// Skip loop already matched in TryFindNextPossibleStartingPosition.");
writer.WriteLine("pos = base.runtrackpos;");
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index 20ccc3afefcca..0cbcac44de04b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -1234,7 +1234,7 @@ void EmitFixedSet_RightToLeft()
void EmitLiteralAfterAtomicLoop()
{
Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null);
- (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = _regexTree.FindOptimizations.LiteralAfterLoop.Value;
+ (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal) target = _regexTree.FindOptimizations.LiteralAfterLoop.Value;
Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic);
Debug.Assert(target.LoopNode.N == int.MaxValue);
@@ -1260,7 +1260,16 @@ void EmitLiteralAfterAtomicLoop()
{
Ldstr(literalString);
Call(s_stringAsSpanMethod);
- Call(s_spanIndexOfSpan);
+ if (target.Literal.StringComparison is StringComparison.OrdinalIgnoreCase)
+ {
+ Ldc((int)target.Literal.StringComparison);
+ Call(s_spanIndexOfSpanStringComparison);
+ }
+ else
+ {
+ Debug.Assert(target.Literal.StringComparison is StringComparison.Ordinal);
+ Call(s_spanIndexOfSpan);
+ }
}
else if (target.Literal.Chars is not char[] literalChars)
{
@@ -2605,10 +2614,12 @@ void EmitNegativeLookaroundAssertion(RegexNode node)
void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
{
// Before we handle general-purpose matching logic for nodes, handle any special-casing.
- // -
if (_regexTree!.FindOptimizations.FindMode == FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight &&
_regexTree!.FindOptimizations.LiteralAfterLoop?.LoopNode == node)
{
+ // This is the set loop that's part of the literal-after-loop optimization: the end of the loop
+ // is stored in runtrackpos, so we just need to transfer that to pos. The optimization is only
+ // selected if the shape of the tree is amenable.
Debug.Assert(sliceStaticPos == 0, "This should be the first node and thus static position shouldn't have advanced.");
// pos = base.runtrackpos;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
index 517c9da6b4270..973187385c26a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -156,7 +156,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
// As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so
// we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading
// set if it's something for which we can search efficiently).
- (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
+ (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
// If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support an efficient
// search and we did successfully find a literal after an atomic loop we could search instead, we prefer the efficient search.
@@ -274,7 +274,7 @@ public FixedDistanceSet(char[]? chars, string set, int distance)
}
/// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop.
- public (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? LiteralAfterLoop { get; }
+ public (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? LiteralAfterLoop { get; }
/// Analyzes a list of fixed-distance sets to extract a case-sensitive string at a fixed distance.
private static (string String, int Distance)? FindFixedDistanceString(List fixedDistanceSets)
@@ -731,7 +731,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan,
case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight:
{
Debug.Assert(LiteralAfterLoop is not null);
- (RegexNode loopNode, (char Char, string? String, char[]? Chars) literal) = LiteralAfterLoop.GetValueOrDefault();
+ (RegexNode loopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) literal) = LiteralAfterLoop.GetValueOrDefault();
Debug.Assert(loopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic);
Debug.Assert(loopNode.N == int.MaxValue);
@@ -742,7 +742,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan,
ReadOnlySpan slice = textSpan.Slice(startingPos);
// Find the literal. If we can't find it, we're done searching.
- int i = literal.String is not null ? slice.IndexOf(literal.String.AsSpan()) :
+ int i = literal.String is not null ? slice.IndexOf(literal.String.AsSpan(), literal.StringComparison) :
literal.Chars is not null ? slice.IndexOfAny(literal.Chars.AsSpan()) :
slice.IndexOf(literal.Char);
if (i < 0)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
index f306f5504f059..a2c0304483a15 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -776,7 +776,7 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne
/// Analyzes the pattern for a leading set loop followed by a non-overlapping literal. If such a pattern is found, an implementation
/// can search for the literal and then walk backward through all matches for the loop until the beginning is found.
///
- public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexNode node)
+ public static (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexNode node)
{
if ((node.Options & RegexOptions.RightToLeft) != 0)
{
@@ -804,6 +804,10 @@ public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Li
// could also be made to support Oneloopatomic and Notoneloopatomic, but the scenarios for that are rare.
Debug.Assert(node.ChildCount() >= 2);
RegexNode firstChild = node.Child(0);
+ while (firstChild.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture)
+ {
+ firstChild = firstChild.Child(0);
+ }
if (firstChild.Kind is not (RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy) || firstChild.N != int.MaxValue)
{
return null;
@@ -816,37 +820,84 @@ public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Li
{
if (node.ChildCount() == 2)
{
+ // If the UpdateBumpalong is the last node, nothing meaningful follows the set loop.
return null;
}
nextChild = node.Child(2);
}
- // If the subsequent node is a literal, we need to ensure it doesn't overlap with the prior set.
- // If there's no overlap, we have a winner.
- switch (nextChild.Kind)
+ // Is the set loop followed by a case-sensitive string we can search for?
+ if (FindPrefix(nextChild) is { Length: >= 1 } prefix)
{
- case RegexNodeKind.One when !RegexCharClass.CharInClass(nextChild.Ch, firstChild.Str!):
- return (firstChild, (nextChild.Ch, null, null));
+ // The literal can be searched for as either a single char or as a string.
+ // But we need to make sure that its starting character isn't part of the preceding
+ // set, as then we can't know for certain where the set loop ends.
+ return
+ RegexCharClass.CharInClass(prefix[0], firstChild.Str!) ? null :
+ prefix.Length == 1 ? (firstChild, (prefix[0], null, StringComparison.Ordinal, null)) :
+ (firstChild, ('\0', prefix, StringComparison.Ordinal, null));
+ }
+
+ // Is the set loop followed by an ordinal case-insensitive string we can search for? We could
+ // search for a string with at least one char, but if it has only one, we're better off just
+ // searching as a set, so we look for strings with at least two chars.
+ if (FindPrefixOrdinalCaseInsensitive(nextChild) is { Length: >= 2 } ordinalCaseInsensitivePrefix)
+ {
+ // The literal can be searched for as a case-insensitive string. As with ordinal above,
+ // though, we need to make sure its starting character isn't part of the previous set.
+ // If that starting character participates in case conversion, then we need to test out
+ // both casings (FindPrefixOrdinalCaseInsensitive will only return strings composed of
+ // characters that either are ASCII or that don't participate in case conversion).
+ Debug.Assert(
+ !RegexCharClass.ParticipatesInCaseConversion(ordinalCaseInsensitivePrefix[0]) ||
+ ordinalCaseInsensitivePrefix[0] < 128);
+
+ if (RegexCharClass.ParticipatesInCaseConversion(ordinalCaseInsensitivePrefix[0]))
+ {
+ if (RegexCharClass.CharInClass((char)(ordinalCaseInsensitivePrefix[0] | 0x20), firstChild.Str!) ||
+ RegexCharClass.CharInClass((char)(ordinalCaseInsensitivePrefix[0] & ~0x20), firstChild.Str!))
+ {
+ return null;
+ }
+ }
+ else if (RegexCharClass.CharInClass(ordinalCaseInsensitivePrefix[0], firstChild.Str!))
+ {
+ return null;
+ }
- case RegexNodeKind.Multi when !RegexCharClass.CharInClass(nextChild.Str![0], firstChild.Str!):
- return (firstChild, ('\0', nextChild.Str, null));
+ return (firstChild, ('\0', ordinalCaseInsensitivePrefix, StringComparison.OrdinalIgnoreCase, null));
+ }
+
+ // Is the set loop followed by a set we can search for? Whereas the above helpers will drill down into
+ // children as is appropriate, to examine a set here, we need to drill in ourselves. We can drill through
+ // atomic and capture nodes, as they don't affect flow control, and into the left-most node of a concatenate,
+ // as its first child is guaranteed to be matched next. We can also drill into a loop or lazy loop that has a guaranteed
+ // iteration, for the same reason as with concatenate.
+ while ((nextChild.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture or RegexNodeKind.Concatenate) ||
+ (nextChild.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop && nextChild.M >= 1))
+ {
+ nextChild = nextChild.Child(0);
+ }
- case RegexNodeKind.Set when !RegexCharClass.IsNegated(nextChild.Str!):
- Span chars = stackalloc char[5]; // maximum number of chars optimized by IndexOfAny
- chars = chars.Slice(0, RegexCharClass.GetSetChars(nextChild.Str!, chars));
- if (!chars.IsEmpty)
+ // If the resulting node is a set with at least one iteration, we can search for it.
+ if (nextChild.IsSetFamily &&
+ !RegexCharClass.IsNegated(nextChild.Str!) &&
+ (nextChild.Kind is RegexNodeKind.Set || nextChild.M >= 1))
+ {
+ Span chars = stackalloc char[5]; // maximum number of chars optimized by IndexOfAny
+ chars = chars.Slice(0, RegexCharClass.GetSetChars(nextChild.Str!, chars));
+ if (!chars.IsEmpty)
+ {
+ foreach (char c in chars)
{
- foreach (char c in chars)
+ if (RegexCharClass.CharInClass(c, firstChild.Str!))
{
- if (RegexCharClass.CharInClass(c, firstChild.Str!))
- {
- return null;
- }
+ return null;
}
-
- return (firstChild, ('\0', null, chars.ToArray()));
}
- break;
+
+ return (firstChild, ('\0', null, StringComparison.Ordinal, chars.ToArray()));
+ }
}
// Otherwise, we couldn't find the pattern of an atomic set loop followed by a literal.
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs
index 9aed94346faee..c3ff5b595caa2 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs
@@ -958,6 +958,17 @@ public static IEnumerable