From 609d85a43deab2cc3032c7cc7e458bd91b9f5738 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 21 Jul 2025 13:52:06 -0400 Subject: [PATCH] Auto-atomic for more loops followed by boundaries Today we will make a loop like `\w+\b` or `\d+\b` atomic, because the only thing the `\b` can match after that point is a non-word character, and that means it can't give back a word character or digit to satisfy the loop. But we can extend that further, since we can use the same logic to make such a loop atomic as long as the only things it can match are any subset of word characters. So, for example `[a-f0-9]+\b`. --- .../Text/RegularExpressions/RegexCharClass.cs | 54 ++++++++++++++++++- .../Text/RegularExpressions/RegexNode.cs | 2 +- .../tests/UnitTests/RegexReductionTests.cs | 12 +++++ 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 83de73a49759f1..22dd227990406c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1272,6 +1272,55 @@ public static bool IsWordChar(char ch) (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0; } + /// Determines whether the characters that match the specified set are known to all be word characters. + public static bool IsKnownWordClassSubset(string set) + { + // Check for common sets that we know to be subsets of \w. + if (set is + WordClass or DigitClass or LetterClass or LetterOrDigitClass or + AsciiLetterClass or AsciiLetterOrDigitClass or + HexDigitClass or HexDigitUpperClass or HexDigitLowerClass) + { + return true; + } + + // Check for sets composed of Unicode categories that are part of \w. 
+ Span<UnicodeCategory> categories = stackalloc UnicodeCategory[16]; + if (TryGetOnlyCategories(set, categories, out int numCategories, out bool negated) && !negated) + { + foreach (UnicodeCategory cat in categories.Slice(0, numCategories)) + { + if (!IsWordCategory(cat)) + { + return false; + } + } + + return true; + } + + // If we can enumerate every character in the set quickly, do so, checking to see whether they're all in \w. + if (CanEasilyEnumerateSetContents(set)) + { + for (int i = SetStartIndex; i < SetStartIndex + set[SetLengthIndex]; i += 2) + { + int curSetEnd = set[i + 1]; + for (int c = set[i]; c < curSetEnd; c++) + { + if (!CharInClass((char)c, WordClass)) + { + return false; + } + } + } + + return true; + } + + // Unlikely to be a subset of \w, and we don't know for sure. + return false; + } + /// Determines whether a character is considered a word character for the purposes of testing a word character boundary. public static bool IsBoundaryWordChar(char ch) { @@ -1288,10 +1337,13 @@ public static bool IsBoundaryWordChar(char ch) int chDiv8 = ch >> 3; return (uint)chDiv8 < (uint)ascii.Length ? (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 : - ((WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 || + (IsWordCategory(CharUnicodeInfo.GetUnicodeCategory(ch)) || (ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner)); } + private static bool IsWordCategory(UnicodeCategory category) => + (WordCategoriesMask & (1 << (int)category)) != 0; + /// Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'. /// This isn't specific to RegexCharClass; it's just a convenient place to host it. 
public static bool DifferByOneBit(char a, char b, out int mask) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index b81599c77ee787..633c0fdff99d78 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2181,7 +2181,7 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!): case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!): - case RegexNodeKind.Boundary when node.M > 0 && node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass: + case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(node.Str!): case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass: case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass: case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass: diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 96c324cda81818..f063d790c06745 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -341,6 +341,15 @@ public class RegexReductionTests 
[InlineData("\\d+\\b", "(?>\\d+)\\b")] [InlineData("\\W+\\B", "(?>\\W+)\\B")] [InlineData("\\D+\\B", "(?>\\D+)\\B")] + [InlineData(@"[0-9]+\b", @"(?>[0-9]+)\b")] + [InlineData(@"[a-z]+\b", @"(?>[a-z]+)\b")] + [InlineData(@"[A-Z]+\b", @"(?>[A-Z]+)\b")] + [InlineData(@"[a-zA-Z]+\b", @"(?>[a-zA-Z]+)\b")] + [InlineData(@"[a-fA-F0-9]+\b", @"(?>[a-fA-F0-9]+)\b")] + [InlineData(@"[A-F0-9]+\b", @"(?>[A-F0-9]+)\b")] + [InlineData(@"[a-f0-9]+\b", @"(?>[a-f0-9]+)\b")] + [InlineData(@"[\p{L}\d]+\b", @"(?>[\p{L}\d]+)\b")] + [InlineData(@"[\p{L}\p{Mn}]+\b", @"(?>[\p{L}\p{Mn}]+)\b")] [InlineData(@"\d+\D", @"(?>\d+)\D")] [InlineData(@"\D+\d", @"(?>\D+)\d")] [InlineData(@"\s+\S", @"(?>\s+)\S")] @@ -494,6 +503,9 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData(@"\d*\b", @"(?>\d*)\b")] [InlineData(@"\W*\B", @"(?>\W*)\B")] [InlineData(@"\D*\B", @"(?>\D*)\B")] + [InlineData(@"\b[a-z ]+\b", @"\b(?>[a-z ]+)\b")] + [InlineData(@"\b[\p{L}\p{Mn}a]+\b", @"\b(?>[\p{L}\p{Mn}a]+)\b")] + [InlineData(@"\b[\p{C}]+\b", @"\b(?>[\p{C}]+)\b")] // Loops inside alternation constructs [InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")] [InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]