From 609d85a43deab2cc3032c7cc7e458bd91b9f5738 Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Mon, 21 Jul 2025 13:52:06 -0400
Subject: [PATCH] Auto-atomic for more loops followed by boundaries

Today we will make a loop like `\w+\b` or `\d+\b` atomic, because the only thing that can follow the loop at a position where the `\b` matches is a non-word character, and that means the loop can't give back a word character or digit in order to satisfy the boundary. But we can extend that further, since the same logic lets us make such a loop atomic as long as everything it can match is a subset of the word characters. So, for example, `[a-f0-9]+\b`.
---
 .../Text/RegularExpressions/RegexCharClass.cs | 54 ++++++++++++++++++-
 .../Text/RegularExpressions/RegexNode.cs      |  2 +-
 .../tests/UnitTests/RegexReductionTests.cs    | 12 +++++
 3 files changed, 66 insertions(+), 2 deletions(-)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index 83de73a49759f1..22dd227990406c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -1272,6 +1272,55 @@ public static bool IsWordChar(char ch)
                 (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
         }
 
+        /// <summary>Determines whether every character that matches the specified set is known to be a word character (\w); returns false when that cannot be established.</summary>
+        public static bool IsKnownWordClassSubset(string set)
+        {
+            // Fast path: compare against well-known set strings that we know to be subsets of \w.
+            if (set is
+                WordClass or DigitClass or LetterClass or LetterOrDigitClass or
+                AsciiLetterClass or AsciiLetterOrDigitClass or
+                HexDigitClass or HexDigitUpperClass or HexDigitLowerClass)
+            {
+                return true;
+            }
+
+            // Otherwise, check for non-negated sets composed of Unicode categories, all of which must be part of \w.
+            Span<UnicodeCategory> categories = stackalloc UnicodeCategory[16]; // scratch space filled by TryGetOnlyCategories; NOTE(review): presumably sets listing more than 16 categories are rejected there - confirm
+            if (TryGetOnlyCategories(set, categories, out int numCategories, out bool negated) && !negated)
+            {
+                foreach (UnicodeCategory cat in categories.Slice(0, numCategories))
+                {
+                    if (!IsWordCategory(cat))
+                    {
+                        return false; // at least one listed category is not a word category
+                    }
+                }
+
+                return true;
+            }
+
+            // If we can enumerate every character in the set quickly, do so, checking whether each one is in \w.
+            if (CanEasilyEnumerateSetContents(set))
+            {
+                for (int i = SetStartIndex; i < SetStartIndex + set[SetLengthIndex]; i += 2) // entries are read as (start, end) pairs, hence the step of 2
+                {
+                    int curSetEnd = set[i + 1]; // treated as an exclusive upper bound by the loop below
+                    for (int c = set[i]; c < curSetEnd; c++)
+                    {
+                        if (!CharInClass((char)c, WordClass))
+                        {
+                            return false; // the set matches a character that \w does not
+                        }
+                    }
+                }
+
+                return true;
+            }
+
+            // Unlikely to be a subset of \w, and we don't know for sure, so conservatively report false.
+            return false;
+        }
+
         /// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
         public static bool IsBoundaryWordChar(char ch)
         {
@@ -1288,10 +1337,13 @@ public static bool IsBoundaryWordChar(char ch)
             int chDiv8 = ch >> 3;
             return (uint)chDiv8 < (uint)ascii.Length ?
                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
-                ((WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 ||
+                (IsWordCategory(CharUnicodeInfo.GetUnicodeCategory(ch)) ||
                  (ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner));
         }
 
+        private static bool IsWordCategory(UnicodeCategory category) =>
+            (WordCategoriesMask & (1 << (int)category)) != 0; // tests the WordCategoriesMask bit indexed by the category's numeric value
+
         /// <summary>Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'.</summary>
         /// <remarks>This isn't specific to RegexCharClass; it's just a convenient place to host it.</remarks>
         public static bool DifferByOneBit(char a, char b, out int mask)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index b81599c77ee787..633c0fdff99d78 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -2181,7 +2181,7 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
 
                             case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
                             case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
-                            case RegexNodeKind.Boundary when node.M > 0 && node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
+                            case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(node.Str!):
                             case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
                             case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
                             case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
index 96c324cda81818..f063d790c06745 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
@@ -341,6 +341,15 @@ public class RegexReductionTests
         [InlineData("\\d+\\b", "(?>\\d+)\\b")]
         [InlineData("\\W+\\B", "(?>\\W+)\\B")]
         [InlineData("\\D+\\B", "(?>\\D+)\\B")]
+        [InlineData(@"[0-9]+\b", @"(?>[0-9]+)\b")]
+        [InlineData(@"[a-z]+\b", @"(?>[a-z]+)\b")]
+        [InlineData(@"[A-Z]+\b", @"(?>[A-Z]+)\b")]
+        [InlineData(@"[a-zA-Z]+\b", @"(?>[a-zA-Z]+)\b")]
+        [InlineData(@"[a-fA-F0-9]+\b", @"(?>[a-fA-F0-9]+)\b")]
+        [InlineData(@"[A-F0-9]+\b", @"(?>[A-F0-9]+)\b")]
+        [InlineData(@"[a-f0-9]+\b", @"(?>[a-f0-9]+)\b")]
+        [InlineData(@"[\p{L}\d]+\b", @"(?>[\p{L}\d]+)\b")]
+        [InlineData(@"[\p{L}\p{Mn}]+\b", @"(?>[\p{L}\p{Mn}]+)\b")]
         [InlineData(@"\d+\D", @"(?>\d+)\D")]
         [InlineData(@"\D+\d", @"(?>\D+)\d")]
         [InlineData(@"\s+\S", @"(?>\s+)\S")]
@@ -494,6 +503,9 @@ public void PatternsReduceIdentically(string actual, string expected)
         [InlineData(@"\d*\b", @"(?>\d*)\b")]
         [InlineData(@"\W*\B", @"(?>\W*)\B")]
         [InlineData(@"\D*\B", @"(?>\D*)\B")]
+        [InlineData(@"\b[a-z ]+\b", @"\b(?>[a-z ]+)\b")]
+        [InlineData(@"\b[\p{L}\p{Mn}a]+\b", @"\b(?>[\p{L}\p{Mn}a]+)\b")]
+        [InlineData(@"\b[\p{C}]+\b", @"\b(?>[\p{C}]+)\b")]
         // Loops inside alternation constructs
         [InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
         [InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]