diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index f3c5552b8404a..e3c760c2728be 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -50,7 +50,7 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) } // If there's a leading case-sensitive substring, just use IndexOf and inherit all of its optimizations. - string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree); + string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree.Root); if (caseSensitivePrefix.Length > 1) { LeadingCaseSensitivePrefix = caseSensitivePrefix; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 869b5cc5aa285..692ee7d5557c1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -50,11 +50,11 @@ private RegexPrefixAnalyzer(Span intStack) _skipAllChildren = false; } - /// <summary>Computes the leading substring in <paramref name="tree"/>; may be empty.</summary> - public static string FindCaseSensitivePrefix(RegexTree tree) + /// <summary>Computes the leading substring in <paramref name="node"/>; may be empty.</summary> + public static string FindCaseSensitivePrefix(RegexNode node) { var vsb = new ValueStringBuilder(stackalloc char[64]); - Process(tree.Root, ref vsb); + Process(node, ref vsb); return vsb.ToString(); // Processes the node, adding any prefix text to the builder. 
@@ -87,6 +87,59 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) return !rtl; } + // Alternation: find a string that's a shared prefix of all branches + case RegexNodeKind.Alternate: + { + int childCount = node.ChildCount(); + + // Store the initial branch into the target builder, keeping track + // of how much was appended. Any of this content that doesn't overlap + // with every other branch will be removed before returning. + int initialLength = vsb.Length; + Process(node.Child(0), ref vsb); + int addedLength = vsb.Length - initialLength; + + // Then explore the rest of the branches, finding the length + // of prefix they all share in common with the initial branch. + if (addedLength != 0) + { + var alternateSb = new ValueStringBuilder(64); + + // Process each branch. If we reach a point where we've proven there's + // no overlap, we can bail early. + for (int i = 1; i < childCount && addedLength != 0; i++) + { + alternateSb.Length = 0; + + // Process the branch into a temporary builder. + Process(node.Child(i), ref alternateSb); + + // Find how much overlap there is between this branch's prefix + // and the smallest amount of prefix that overlapped with all + // the previously seen branches. + addedLength = Math.Min(addedLength, alternateSb.Length); + for (int j = 0; j < addedLength; j++) + { + if (vsb[initialLength + j] != alternateSb[j]) + { + addedLength = j; + break; + } + } + } + + alternateSb.Dispose(); + + // Then cull back on what was added based on the other branches. + vsb.Length = initialLength + addedLength; + } + + // Don't explore anything after the alternation. We could make this work if desirable, + // but it's currently not worth the extra complication. The entire contents of every + // branch would need to be identical other than zero-width anchors/assertions. 
+ return false; + } + // One character case RegexNodeKind.One when (node.Options & RegexOptions.IgnoreCase) == 0: vsb.Append(node.Ch); diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 8fd3ea6e32231..005e9b27bb203 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -205,6 +205,10 @@ public static IEnumerable Match_MemberData() yield return (@"(^|($|a+))bc", " aabc", RegexOptions.None, 0, 5, true, "aabc"); yield return (@"yz(^|a+)bc", " yzaabc", RegexOptions.None, 0, 7, true, "yzaabc"); yield return (@"(^a|a$) bc", "a bc", RegexOptions.None, 0, 4, true, "a bc"); + yield return (@"(abcdefg|abcdef|abc|a)h", " ah ", RegexOptions.None, 0, 8, true, "ah"); + yield return (@"(^abcdefg|abcdef|^abc|a)h", " abcdefh ", RegexOptions.None, 0, 13, true, "abcdefh"); + yield return (@"(a|^abcdefg|abcdef|^abc)h", " abcdefh ", RegexOptions.None, 0, 13, true, "abcdefh"); + yield return (@"(abcdefg|abcdef)h", " abcdefghij ", RegexOptions.None, 0, 16, true, "abcdefgh"); if (!RegexHelpers.IsNonBacktracking(engine)) {