diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 19488703f07f7..63b660251e909 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -1247,14 +1247,17 @@ void EmitFixedSet_RightToLeft() void EmitLiteralAfterAtomicLoop() { Debug.Assert(regexTree.FindOptimizations.LiteralAfterLoop is not null); - (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = regexTree.FindOptimizations.LiteralAfterLoop.Value; + (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal) target = regexTree.FindOptimizations.LiteralAfterLoop.Value; Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic); Debug.Assert(target.LoopNode.N == int.MaxValue); + string stringComparisonComment = target.Literal.StringComparison == StringComparison.OrdinalIgnoreCase ? "ordinal case-insensitive " : ""; + string stringComparisonArgument = target.Literal.StringComparison == StringComparison.OrdinalIgnoreCase ? ", StringComparison.OrdinalIgnoreCase" : ""; + writer.Write($"// The pattern begins with an atomic loop for {DescribeSet(target.LoopNode.Str!)}, followed by "); writer.WriteLine( - target.Literal.String is not null ? $"the string {Literal(target.Literal.String)}." : + target.Literal.String is not null ? $"the {stringComparisonComment}string {Literal(target.Literal.String)}." : target.Literal.Chars is not null ? $"one of the characters {Literal(new string(target.Literal.Chars))}" : $"the character {Literal(target.Literal.Char)}."); writer.WriteLine($"// Search for the literal, and then walk backwards to the beginning of the loop."); @@ -1275,7 +1278,7 @@ void EmitLiteralAfterAtomicLoop() // Find the literal. If we can't find it, we're done searching. writer.Write("int i = slice."); writer.WriteLine( - target.Literal.String is string literalString ? $"IndexOf({Literal(literalString)});" : + target.Literal.String is string literalString ? $"IndexOf({Literal(literalString)}{stringComparisonArgument});" : target.Literal.Chars is not char[] literalChars ? $"IndexOf({Literal(target.Literal.Char)});" : literalChars.Length switch { @@ -2582,6 +2585,9 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck if (rm.Tree.FindOptimizations.FindMode == FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight && rm.Tree.FindOptimizations.LiteralAfterLoop?.LoopNode == node) { + // This is the set loop that's part of the literal-after-loop optimization: the end of the loop + // is stored in runtrackpos, so we just need to transfer that to pos. The optimization is only + // selected if the shape of the tree is amenable. Debug.Assert(sliceStaticPos == 0, "This should be the first node and thus static position shouldn't have advanced."); writer.WriteLine("// Skip loop already matched in TryFindNextPossibleStartingPosition."); writer.WriteLine("pos = base.runtrackpos;"); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 20ccc3afefcca..0cbcac44de04b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1234,7 +1234,7 @@ void EmitFixedSet_RightToLeft() void EmitLiteralAfterAtomicLoop() { Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null); - (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = _regexTree.FindOptimizations.LiteralAfterLoop.Value; + (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal) target = _regexTree.FindOptimizations.LiteralAfterLoop.Value; Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic); Debug.Assert(target.LoopNode.N == int.MaxValue); @@ -1260,7 +1260,16 @@ void EmitLiteralAfterAtomicLoop() { Ldstr(literalString); Call(s_stringAsSpanMethod); - Call(s_spanIndexOfSpan); + if (target.Literal.StringComparison is StringComparison.OrdinalIgnoreCase) + { + Ldc((int)target.Literal.StringComparison); + Call(s_spanIndexOfSpanStringComparison); + } + else + { + Debug.Assert(target.Literal.StringComparison is StringComparison.Ordinal); + Call(s_spanIndexOfSpan); + } } else if (target.Literal.Chars is not char[] literalChars) { @@ -2605,10 +2614,12 @@ void EmitNegativeLookaroundAssertion(RegexNode node) void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) { // Before we handle general-purpose matching logic for nodes, handle any special-casing. - // - if (_regexTree!.FindOptimizations.FindMode == FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight && _regexTree!.FindOptimizations.LiteralAfterLoop?.LoopNode == node) { + // This is the set loop that's part of the literal-after-loop optimization: the end of the loop + // is stored in runtrackpos, so we just need to transfer that to pos. The optimization is only + // selected if the shape of the tree is amenable. Debug.Assert(sliceStaticPos == 0, "This should be the first node and thus static position shouldn't have advanced."); // pos = base.runtrackpos; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 517c9da6b4270..973187385c26a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -156,7 +156,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) // As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so // we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading // set if it's something for which we can search efficiently). - (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root); + (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root); // If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support an efficient // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the efficient search. @@ -274,7 +274,7 @@ public FixedDistanceSet(char[]? chars, string set, int distance) } /// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop. - public (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? LiteralAfterLoop { get; } + public (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? LiteralAfterLoop { get; } /// Analyzes a list of fixed-distance sets to extract a case-sensitive string at a fixed distance. private static (string String, int Distance)? FindFixedDistanceString(List fixedDistanceSets) @@ -731,7 +731,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight: { Debug.Assert(LiteralAfterLoop is not null); - (RegexNode loopNode, (char Char, string? String, char[]? Chars) literal) = LiteralAfterLoop.GetValueOrDefault(); + (RegexNode loopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) literal) = LiteralAfterLoop.GetValueOrDefault(); Debug.Assert(loopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic); Debug.Assert(loopNode.N == int.MaxValue); @@ -742,7 +742,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, ReadOnlySpan slice = textSpan.Slice(startingPos); // Find the literal. If we can't find it, we're done searching. - int i = literal.String is not null ? slice.IndexOf(literal.String.AsSpan()) : + int i = literal.String is not null ? slice.IndexOf(literal.String.AsSpan(), literal.StringComparison) : literal.Chars is not null ? slice.IndexOfAny(literal.Chars.AsSpan()) : slice.IndexOf(literal.Char); if (i < 0) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index f306f5504f059..a2c0304483a15 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -776,7 +776,7 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne /// Analyzes the pattern for a leading set loop followed by a non-overlapping literal. If such a pattern is found, an implementation /// can search for the literal and then walk backward through all matches for the loop until the beginning is found. /// - public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexNode node) + public static (RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexNode node) { if ((node.Options & RegexOptions.RightToLeft) != 0) { @@ -804,6 +804,10 @@ public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Li // could also be made to support Oneloopatomic and Notoneloopatomic, but the scenarios for that are rare. Debug.Assert(node.ChildCount() >= 2); RegexNode firstChild = node.Child(0); + while (firstChild.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture) + { + firstChild = firstChild.Child(0); + } if (firstChild.Kind is not (RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy) || firstChild.N != int.MaxValue) { return null; @@ -816,37 +820,84 @@ public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Li { if (node.ChildCount() == 2) { + // If the UpdateBumpalong is the last node, nothing meaningful follows the set loop. return null; } nextChild = node.Child(2); } - // If the subsequent node is a literal, we need to ensure it doesn't overlap with the prior set. - // If there's no overlap, we have a winner. - switch (nextChild.Kind) + // Is the set loop followed by a case-sensitive string we can search for? + if (FindPrefix(nextChild) is { Length: >= 1 } prefix) { - case RegexNodeKind.One when !RegexCharClass.CharInClass(nextChild.Ch, firstChild.Str!): - return (firstChild, (nextChild.Ch, null, null)); + // The literal can be searched for as either a single char or as a string. + // But we need to make sure that its starting character isn't part of the preceding + // set, as then we can't know for certain where the set loop ends. + return + RegexCharClass.CharInClass(prefix[0], firstChild.Str!) ? null : + prefix.Length == 1 ? (firstChild, (prefix[0], null, StringComparison.Ordinal, null)) : + (firstChild, ('\0', prefix, StringComparison.Ordinal, null)); + } + + // Is the set loop followed by an ordinal case-insensitive string we can search for? We could + // search for a string with at least one char, but if it has only one, we're better off just + // searching as a set, so we look for strings with at least two chars. + if (FindPrefixOrdinalCaseInsensitive(nextChild) is { Length: >= 2 } ordinalCaseInsensitivePrefix) + { + // The literal can be searched for as a case-insensitive string. As with ordinal above, + // though, we need to make sure its starting character isn't part of the previous set. + // If that starting character participates in case conversion, then we need to test out + // both casings (FindPrefixOrdinalCaseInsensitive will only return strings composed of + // characters that either are ASCII or that don't participate in case conversion). + Debug.Assert( + !RegexCharClass.ParticipatesInCaseConversion(ordinalCaseInsensitivePrefix[0]) || + ordinalCaseInsensitivePrefix[0] < 128); + + if (RegexCharClass.ParticipatesInCaseConversion(ordinalCaseInsensitivePrefix[0])) + { + if (RegexCharClass.CharInClass((char)(ordinalCaseInsensitivePrefix[0] | 0x20), firstChild.Str!) || + RegexCharClass.CharInClass((char)(ordinalCaseInsensitivePrefix[0] & ~0x20), firstChild.Str!)) + { + return null; + } + } + else if (RegexCharClass.CharInClass(ordinalCaseInsensitivePrefix[0], firstChild.Str!)) + { + return null; + } - case RegexNodeKind.Multi when !RegexCharClass.CharInClass(nextChild.Str![0], firstChild.Str!): - return (firstChild, ('\0', nextChild.Str, null)); + return (firstChild, ('\0', ordinalCaseInsensitivePrefix, StringComparison.OrdinalIgnoreCase, null)); + } + + // Is the set loop followed by a set we can search for? Whereas the above helpers will drill down into + // children as is appropriate, to examine a set here, we need to drill in ourselves. We can drill through + // atomic and capture nodes, as they don't affect flow control, and into the left-most node of a concatenate, + // as the first child is guaranteed next. We can also drill into a loop or lazy loop that has a guaranteed + // iteration, for the same reason as with concatenate. + while ((nextChild.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture or RegexNodeKind.Concatenate) || + (nextChild.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop && nextChild.M >= 1)) + { + nextChild = nextChild.Child(0); + } - case RegexNodeKind.Set when !RegexCharClass.IsNegated(nextChild.Str!): - Span chars = stackalloc char[5]; // maximum number of chars optimized by IndexOfAny - chars = chars.Slice(0, RegexCharClass.GetSetChars(nextChild.Str!, chars)); - if (!chars.IsEmpty) + // If the resulting node is a set with at least one iteration, we can search for it. + if (nextChild.IsSetFamily && + !RegexCharClass.IsNegated(nextChild.Str!) && + (nextChild.Kind is RegexNodeKind.Set || nextChild.M >= 1)) + { + Span chars = stackalloc char[5]; // maximum number of chars optimized by IndexOfAny + chars = chars.Slice(0, RegexCharClass.GetSetChars(nextChild.Str!, chars)); + if (!chars.IsEmpty) + { + foreach (char c in chars) { - foreach (char c in chars) + if (RegexCharClass.CharInClass(c, firstChild.Str!)) { - if (RegexCharClass.CharInClass(c, firstChild.Str!)) - { - return null; - } + return null; } - - return (firstChild, ('\0', null, chars.ToArray())); } - break; + + return (firstChild, ('\0', null, StringComparison.Ordinal, chars.ToArray())); + } } // Otherwise, we couldn't find the pattern of an atomic set loop followed by a literal. diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs index 9aed94346faee..c3ff5b595caa2 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs @@ -958,6 +958,17 @@ public static IEnumerable Groups_MemberData() yield return (enUS, @"(?((\w{3}))\1\1|no)", "no", RegexOptions.None, new string[] { "no", "" }); } + // Special cases involving starting position search optimizations + yield return (enUS, @"(\d*)(hello)(\d*)", "123hello456", RegexOptions.None, new string[] { "123hello456", "123", "hello", "456" }); + yield return (enUS, @"((\d*))[AaBbCc](\d*)", "1b", RegexOptions.None, new string[] { "1b", "1", "1", "" }); + yield return (enUS, @"((\d*))[AaBbCc](\d*)", "b1", RegexOptions.None, new string[] { "b1", "", "", "1" }); + yield return (enUS, @"(\w*)(hello)(\w*)", "hello", RegexOptions.None, new string[] { "hello", "", "hello", "" }); + if (!RegexHelpers.IsNonBacktracking(engine)) // atomic not supported + { + yield return (enUS, @"(?>(\d*))(hello)(\d*)", "123hello456", RegexOptions.None, new string[] { "123hello456", "123", "hello", "456" }); + yield return (enUS, @"((?>\d*))(hello)(\d*)", "123hello456", RegexOptions.None, new string[] { "123hello456", "123", "hello", "456" }); + } + // Invalid unicode yield return (enUS, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" }); yield return (enUS, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" }); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index acec9ac285741..c962bc072b6fd 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -120,15 +120,27 @@ public void LeadingSet(string pattern, int options, int expectedMode, string exp } [Theory] - [InlineData(@"\d*a", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, null, 'a')] - [InlineData(@"\d*abc", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, "abc", 0)] - public void LiteralAfterLoop(string pattern, int options, int expectedMode, string? expectedString, char expectedChar) + [InlineData(@"\d*a", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, null, StringComparison.Ordinal, 'a', null)] + [InlineData(@"\d*abc", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, "abc", StringComparison.Ordinal, 0, null)] + [InlineData(@"(\d*)(abc)", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, "abc", StringComparison.Ordinal, 0, null)] + [InlineData(@"((\d*)(abc))", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, "abc", StringComparison.Ordinal, 0, null)] + [InlineData(@"(?>\s*)(((abc)+){2,})", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, "abc", StringComparison.Ordinal, 0, null)] + [InlineData(@"((((\s*)))((((?i)abc)+){2,}))", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, "abc", StringComparison.OrdinalIgnoreCase, 0, null)] + [InlineData(@"((((\s*)))((((?i)a)+){2,}))", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, null, StringComparison.Ordinal, 0, new char[] { 'A', 'a' })] + [InlineData(@"((((?>\s*)))((([Aa][Bb][Cc])+){2,}))", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, "abc", StringComparison.OrdinalIgnoreCase, 0, null)] + [InlineData(@"((((\s*)))((([Aa][Bb]c)+){2,}))", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, "ab", StringComparison.OrdinalIgnoreCase, 0, null)] + [InlineData(@"((((?>\s*)))((([Aa]bc)+){2,}))", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, null, StringComparison.Ordinal, 0, new char[] { 'A', 'a' })] + [InlineData(@"((((\s*)))((([Sst])+){2,}))", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, null, StringComparison.Ordinal, 0, new char[] { 'S', 's', 't' })] + [InlineData(@"\d*[AaBb]{3,}", 0, (int)FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight, null, StringComparison.Ordinal, 0, new char[] { 'A', 'B', 'a', 'b' })] + public void LiteralAfterLoop(string pattern, int options, int expectedMode, string? expectedString, StringComparison expectedStringComparison, char expectedChar, char[]? expectedSet) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); Assert.Equal((FindNextStartingPositionMode)expectedMode, opts.FindMode); Assert.NotNull(opts.LiteralAfterLoop); Assert.Equal(expectedString, opts.LiteralAfterLoop.Value.Literal.String); + Assert.Equal(expectedStringComparison, opts.LiteralAfterLoop.Value.Literal.StringComparison); Assert.Equal(expectedChar, opts.LiteralAfterLoop.Value.Literal.Char); + Assert.Equal(expectedSet, opts.LiteralAfterLoop.Value.Literal.Chars); } [Theory]