Improve literal-after-loop regex optimization (#93190)

* Improve literal-after-loop regex optimization

  Regex currently has an optimization that looks to see whether the pattern begins with a set loop followed by some literal, in which case it can optimize the search for matches by searching for the literal and then walking backwards through the starting set. However, it's missing a handful of cases we can easily support:

  - It currently gives up if the set loop is wrapped in an atomic and/or a capture.
  - It currently gives up if the literal is a set that's wrapped in an atomic, capture, concatenate, loop, or lazy loop.
  - If the set loop is followed by an ignore-case string, it currently only searches for the starting set of that string, rather than more of it.
  - If the literal is a set, we'd only examine it if it was exactly one iteration (RegexNodeKind.Set) rather than a loop with at least one iteration.

  This fixes all of those issues, such that the optimization extends to more patterns.

* Add a few more tests and comments

* Address PR feedback
dotnet · Oct 19, 2023 · 0cd1774 · 0cd1774
1 parent 3a3e1c7
commit 0cd1774
Show file tree

Hide file tree

Showing 6 changed files with 124 additions and 33 deletions.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -1247,14 +1247,17 @@ void EmitFixedSet_RightToLeft()
             void EmitLiteralAfterAtomicLoop()
             {
                 Debug.Assert(regexTree.FindOptimizations.LiteralAfterLoop is not null);
-                (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = regexTree.FindOptimizations.LiteralAfterLoop.Value;
+                (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal) target = regexTree.FindOptimizations.LiteralAfterLoop.Value;
 
                 Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic);
                 Debug.Assert(target.LoopNode.N == int.MaxValue);
 
+                string stringComparisonComment = target.Literal.StringComparison == StringComparison.OrdinalIgnoreCase ? "ordinal case-insensitive " : "";
+                string stringComparisonArgument = target.Literal.StringComparison == StringComparison.OrdinalIgnoreCase ? ", StringComparison.OrdinalIgnoreCase" : "";
+
                 writer.Write($"// The pattern begins with an atomic loop for {DescribeSet(target.LoopNode.Str!)}, followed by ");
                 writer.WriteLine(
-                    target.Literal.String is not null ? $"the string {Literal(target.Literal.String)}." :
+                    target.Literal.String is not null ? $"the {stringComparisonComment}string {Literal(target.Literal.String)}." :
                     target.Literal.Chars is not null ? $"one of the characters {Literal(new string(target.Literal.Chars))}" :
                     $"the character {Literal(target.Literal.Char)}.");
                 writer.WriteLine($"// Search for the literal, and then walk backwards to the beginning of the loop.");
@@ -1275,7 +1278,7 @@ void EmitLiteralAfterAtomicLoop()
                     // Find the literal.  If we can't find it, we're done searching.
                     writer.Write("int i = slice.");
                     writer.WriteLine(
-                        target.Literal.String is string literalString ? $"IndexOf({Literal(literalString)});" :
+                        target.Literal.String is string literalString ? $"IndexOf({Literal(literalString)}{stringComparisonArgument});" :
                         target.Literal.Chars is not char[] literalChars ? $"IndexOf({Literal(target.Literal.Char)});" :
                         literalChars.Length switch
                         {
@@ -2582,6 +2585,9 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
                 if (rm.Tree.FindOptimizations.FindMode == FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight &&
                     rm.Tree.FindOptimizations.LiteralAfterLoop?.LoopNode == node)
                 {
+                    // This is the set loop that's part of the literal-after-loop optimization: the end of the loop
+                    // is stored in runtrackpos, so we just need to transfer that to pos. The optimization is only
+                    // selected if the shape of the tree is amenable.
                     Debug.Assert(sliceStaticPos == 0, "This should be the first node and thus static position shouldn't have advanced.");
                     writer.WriteLine("// Skip loop already matched in TryFindNextPossibleStartingPosition.");
                     writer.WriteLine("pos = base.runtrackpos;");

diff --git a/...raries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/...raries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -1234,7 +1234,7 @@ void EmitFixedSet_RightToLeft()
             void EmitLiteralAfterAtomicLoop()
             {
                 Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null);
-                (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = _regexTree.FindOptimizations.LiteralAfterLoop.Value;
+                (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal) target = _regexTree.FindOptimizations.LiteralAfterLoop.Value;
 
                 Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic);
                 Debug.Assert(target.LoopNode.N == int.MaxValue);
@@ -1260,7 +1260,16 @@ void EmitLiteralAfterAtomicLoop()
                 {
                     Ldstr(literalString);
                     Call(s_stringAsSpanMethod);
-                    Call(s_spanIndexOfSpan);
+                    if (target.Literal.StringComparison is StringComparison.OrdinalIgnoreCase)
+                    {
+                        Ldc((int)target.Literal.StringComparison);
+                        Call(s_spanIndexOfSpanStringComparison);
+                    }
+                    else
+                    {
+                        Debug.Assert(target.Literal.StringComparison is StringComparison.Ordinal);
+                        Call(s_spanIndexOfSpan);
+                    }
                 }
                 else if (target.Literal.Chars is not char[] literalChars)
                 {
@@ -2605,10 +2614,12 @@ void EmitNegativeLookaroundAssertion(RegexNode node)
             void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
             {
                 // Before we handle general-purpose matching logic for nodes, handle any special-casing.
-                // -
                 if (_regexTree!.FindOptimizations.FindMode == FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight &&
                     _regexTree!.FindOptimizations.LiteralAfterLoop?.LoopNode == node)
                 {
+                    // This is the set loop that's part of the literal-after-loop optimization: the end of the loop
+                    // is stored in runtrackpos, so we just need to transfer that to pos. The optimization is only
+                    // selected if the shape of the tree is amenable.
                     Debug.Assert(sliceStaticPos == 0, "This should be the first node and thus static position shouldn't have advanced.");
 
                     // pos = base.runtrackpos;

diff --git a/...stem.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/...stem.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -156,7 +156,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
             // As a backup, see if we can find a literal after a leading atomic loop.  That might be better than whatever sets we find, so
             // we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading
             // set if it's something for which we can search efficiently).
-            (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
+            (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
 
             // If we got such sets, we'll likely use them.  However, if the best of them is something that doesn't support an efficient
             // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the efficient search.
@@ -274,7 +274,7 @@ public FixedDistanceSet(char[]? chars, string set, int distance)
         }
 
         /// <summary>When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop.</summary>
-        public (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? LiteralAfterLoop { get; }
+        public (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? LiteralAfterLoop { get; }
 
         /// <summary>Analyzes a list of fixed-distance sets to extract a case-sensitive string at a fixed distance.</summary>
         private static (string String, int Distance)? FindFixedDistanceString(List<FixedDistanceSet> fixedDistanceSets)
@@ -731,7 +731,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
                 case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight:
                     {
                         Debug.Assert(LiteralAfterLoop is not null);
-                        (RegexNode loopNode, (char Char, string? String, char[]? Chars) literal) = LiteralAfterLoop.GetValueOrDefault();
+                        (RegexNode loopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) literal) = LiteralAfterLoop.GetValueOrDefault();
 
                         Debug.Assert(loopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic);
                         Debug.Assert(loopNode.N == int.MaxValue);
@@ -742,7 +742,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
                             ReadOnlySpan<char> slice = textSpan.Slice(startingPos);
 
                             // Find the literal.  If we can't find it, we're done searching.
-                            int i = literal.String is not null ? slice.IndexOf(literal.String.AsSpan()) :
+                            int i = literal.String is not null ? slice.IndexOf(literal.String.AsSpan(), literal.StringComparison) :
                                     literal.Chars is not null ? slice.IndexOfAny(literal.Chars.AsSpan()) :
                                     slice.IndexOf(literal.Char);
                             if (i < 0)

diff --git a/.../System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/.../System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -776,7 +776,7 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne
         /// Analyzes the pattern for a leading set loop followed by a non-overlapping literal. If such a pattern is found, an implementation
         /// can search for the literal and then walk backward through all matches for the loop until the beginning is found.
         /// </summary>
-        public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexNode node)
+        public static (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexNode node)
         {
             if ((node.Options & RegexOptions.RightToLeft) != 0)
             {
@@ -804,6 +804,10 @@ public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Li
             // could also be made to support Oneloopatomic and Notoneloopatomic, but the scenarios for that are rare.
             Debug.Assert(node.ChildCount() >= 2);
             RegexNode firstChild = node.Child(0);
+            while (firstChild.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture)
+            {
+                firstChild = firstChild.Child(0);
+            }
             if (firstChild.Kind is not (RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy) || firstChild.N != int.MaxValue)
             {
                 return null;
@@ -816,37 +820,84 @@ public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Li
             {
                 if (node.ChildCount() == 2)
                 {
+                    // If the UpdateBumpalong is the last node, nothing meaningful follows the set loop.
                     return null;
                 }
                 nextChild = node.Child(2);
             }
 
-            // If the subsequent node is a literal, we need to ensure it doesn't overlap with the prior set.
-            // If there's no overlap, we have a winner.
-            switch (nextChild.Kind)
+            // Is the set loop followed by a case-sensitive string we can search for?
+            if (FindPrefix(nextChild) is { Length: >= 1 } prefix)
             {
-                case RegexNodeKind.One when !RegexCharClass.CharInClass(nextChild.Ch, firstChild.Str!):
-                    return (firstChild, (nextChild.Ch, null, null));
+                // The literal can be searched for as either a single char or as a string.
+                // But we need to make sure that its starting character isn't part of the preceding
+                // set, as then we can't know for certain where the set loop ends.
+                return
+                    RegexCharClass.CharInClass(prefix[0], firstChild.Str!) ? null :
+                    prefix.Length == 1 ? (firstChild, (prefix[0], null, StringComparison.Ordinal, null)) :
+                    (firstChild, ('\0', prefix, StringComparison.Ordinal, null));
+            }
+
+            // Is the set loop followed by an ordinal case-insensitive string we can search for? We could
+            // search for a string with at least one char, but if it has only one, we're better off just
+            // searching as a set, so we look for strings with at least two chars.
+            if (FindPrefixOrdinalCaseInsensitive(nextChild) is { Length: >= 2 } ordinalCaseInsensitivePrefix)
+            {
+                // The literal can be searched for as a case-insensitive string. As with ordinal above,
+                // though, we need to make sure its starting character isn't part of the previous set.
+                // If that starting character participates in case conversion, then we need to test out
+                // both casings (FindPrefixOrdinalCaseInsensitive will only return strings composed of
+                // characters that either are ASCII or that don't participate in case conversion).
+                Debug.Assert(
+                    !RegexCharClass.ParticipatesInCaseConversion(ordinalCaseInsensitivePrefix[0]) ||
+                    ordinalCaseInsensitivePrefix[0] < 128);
+
+                if (RegexCharClass.ParticipatesInCaseConversion(ordinalCaseInsensitivePrefix[0]))
+                {
+                    if (RegexCharClass.CharInClass((char)(ordinalCaseInsensitivePrefix[0] | 0x20), firstChild.Str!) ||
+                        RegexCharClass.CharInClass((char)(ordinalCaseInsensitivePrefix[0] & ~0x20), firstChild.Str!))
+                    {
+                        return null;
+                    }
+                }
+                else if (RegexCharClass.CharInClass(ordinalCaseInsensitivePrefix[0], firstChild.Str!))
+                {
+                    return null;
+                }
 
-                case RegexNodeKind.Multi when !RegexCharClass.CharInClass(nextChild.Str![0], firstChild.Str!):
-                    return (firstChild, ('\0', nextChild.Str, null));
+                return (firstChild, ('\0', ordinalCaseInsensitivePrefix, StringComparison.OrdinalIgnoreCase, null));
+            }
+
+            // Is the set loop followed by a set we can search for? Whereas the above helpers will drill down into
+            // children as is appropriate, to examine a set here, we need to drill in ourselves. We can drill through
+            // atomic and capture nodes, as they don't affect flow control, and into the left-most node of a concatenate,
+            // as the first child is guaranteed next. We can also drill into a loop or lazy loop that has a guaranteed
+            // iteration, for the same reason as with concatenate.
+            while ((nextChild.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture or RegexNodeKind.Concatenate) ||
+                   (nextChild.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop && nextChild.M >= 1))
+            {
+                nextChild = nextChild.Child(0);
+            }
 
-                case RegexNodeKind.Set when !RegexCharClass.IsNegated(nextChild.Str!):
-                    Span<char> chars = stackalloc char[5]; // maximum number of chars optimized by IndexOfAny
-                    chars = chars.Slice(0, RegexCharClass.GetSetChars(nextChild.Str!, chars));
-                    if (!chars.IsEmpty)
+            // If the resulting node is a set with at least one iteration, we can search for it.
+            if (nextChild.IsSetFamily &&
+                !RegexCharClass.IsNegated(nextChild.Str!) &&
+                (nextChild.Kind is RegexNodeKind.Set || nextChild.M >= 1))
+            {
+                Span<char> chars = stackalloc char[5]; // maximum number of chars optimized by IndexOfAny
+                chars = chars.Slice(0, RegexCharClass.GetSetChars(nextChild.Str!, chars));
+                if (!chars.IsEmpty)
+                {
+                    foreach (char c in chars)
                     {
-                        foreach (char c in chars)
+                        if (RegexCharClass.CharInClass(c, firstChild.Str!))
                         {
-                            if (RegexCharClass.CharInClass(c, firstChild.Str!))
-                            {
-                                return null;
-                            }
+                            return null;
                         }
-
-                        return (firstChild, ('\0', null, chars.ToArray()));
                     }
-                    break;
+
+                    return (firstChild, ('\0', null, StringComparison.Ordinal, chars.ToArray()));
+                }
             }
 
             // Otherwise, we couldn't find the pattern of an atomic set loop followed by a literal.

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs
@@ -958,6 +958,17 @@ public static IEnumerable<object[]> Groups_MemberData()
                 yield return (enUS, @"(?((\w{3}))\1\1|no)", "no", RegexOptions.None, new string[] { "no", "" });
             }
 
+            // Special cases involving starting position search optimizations
+            yield return (enUS, @"(\d*)(hello)(\d*)", "123hello456", RegexOptions.None, new string[] { "123hello456", "123", "hello", "456" });
+            yield return (enUS, @"((\d*))[AaBbCc](\d*)", "1b", RegexOptions.None, new string[] { "1b", "1", "1", "" });
+            yield return (enUS, @"((\d*))[AaBbCc](\d*)", "b1", RegexOptions.None, new string[] { "b1", "", "", "1" });
+            yield return (enUS, @"(\w*)(hello)(\w*)", "hello", RegexOptions.None, new string[] { "hello", "", "hello", "" });
+            if (!RegexHelpers.IsNonBacktracking(engine)) // atomic not supported
+            {
+                yield return (enUS, @"(?>(\d*))(hello)(\d*)", "123hello456", RegexOptions.None, new string[] { "123hello456", "123", "hello", "456" });
+                yield return (enUS, @"((?>\d*))(hello)(\d*)", "123hello456", RegexOptions.None, new string[] { "123hello456", "123", "hello", "456" });
+            }
+
             // Invalid unicode
             yield return (enUS, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" });
             yield return (enUS, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" });