From 5277c17d7527400b23e94afc830afe9a7547e5a7 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 29 Jul 2025 21:48:44 -0400 Subject: [PATCH] Improve boundary handling in atomic tests These were set up to require what comes after the boundary to also be disjoint from its predecessor being tested. But that's not necessary; the boundary itself is sufficient to determine atomicity. --- .../Text/RegularExpressions/RegexNode.cs | 24 +++++++++---------- .../tests/UnitTests/RegexReductionTests.cs | 4 ++++ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 0f3d427de9c639..0b0c0a8dd6a672 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2350,15 +2350,15 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i case RegexNodeKind.Multi when node.Ch != subsequent.Str![0]: case RegexNodeKind.End: case RegexNodeKind.EndZ or RegexNodeKind.Eol when node.Ch != '\n': + case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsBoundaryWordChar(node.Ch): + case RegexNodeKind.NonBoundary when node.M > 0 && !RegexCharClass.IsBoundaryWordChar(node.Ch): + case RegexNodeKind.ECMABoundary when node.M > 0 && RegexCharClass.IsECMAWordChar(node.Ch): + case RegexNodeKind.NonECMABoundary when node.M > 0 && !RegexCharClass.IsECMAWordChar(node.Ch): return true; case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch: case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch: case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!): - case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsBoundaryWordChar(node.Ch): - case RegexNodeKind.NonBoundary when node.M > 0 && !RegexCharClass.IsBoundaryWordChar(node.Ch): - case RegexNodeKind.ECMABoundary when node.M > 0 && RegexCharClass.IsECMAWordChar(node.Ch): - case RegexNodeKind.NonECMABoundary when node.M > 0 && !RegexCharClass.IsECMAWordChar(node.Ch): // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well. break; @@ -2397,14 +2397,14 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i case RegexNodeKind.Multi when !RegexCharClass.CharInClass(subsequent.Str![0], node.Str!): case RegexNodeKind.End: case RegexNodeKind.EndZ or RegexNodeKind.Eol when !RegexCharClass.CharInClass('\n', node.Str!): - return true; - - case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!): - case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!): case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(node.Str!): case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass: case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass: case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass: + return true; + + case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!): + case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!): // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well. break; @@ -2444,14 +2444,14 @@ bool MayOverlapStartingOrEndingSet(string set) => case RegexNodeKind.Multi when !CharInStartingOrEndingSet(subsequent.Str![0]): case RegexNodeKind.End: case RegexNodeKind.EndZ or RegexNodeKind.Eol when !CharInStartingOrEndingSet('\n'): - return true; - - case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !CharInStartingOrEndingSet(subsequent.Ch): - case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !MayOverlapStartingOrEndingSet(subsequent.Str!): case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(loopStartingSet) && RegexCharClass.IsKnownWordClassSubset(loopEndingSet): case RegexNodeKind.NonBoundary when node.M > 0 && (loopStartingSet is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass) && (loopEndingSet is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass): case RegexNodeKind.ECMABoundary when node.M > 0 && (loopStartingSet is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass) && (loopEndingSet is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass): case RegexNodeKind.NonECMABoundary when node.M > 0 && (loopStartingSet is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass) && (loopEndingSet is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass): + return true; + + case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !CharInStartingOrEndingSet(subsequent.Ch): + case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !MayOverlapStartingOrEndingSet(subsequent.Str!): // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well. break; diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 5adb53144e3e23..68fb3925972dd1 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -381,8 +381,12 @@ public class RegexReductionTests [InlineData("(a+)b", "((?>a+))b")] [InlineData("a*(?:bcd|efg)", "(?>a*)(?:bcd|efg)")] [InlineData("\\w+\\b", "(?>\\w+)\\b")] + [InlineData("\\w+\\ba", "(?>\\w+)\\ba")] + [InlineData("\\w+\\b\\w", "(?>\\w+)\\b\\w")] [InlineData("\\d+\\b", "(?>\\d+)\\b")] [InlineData("\\W+\\B", "(?>\\W+)\\B")] + [InlineData("\\W+\\B#", "(?>\\W+)\\B#")] + [InlineData("\\W+\\B\\W", "(?>\\W+)\\B\\W")] [InlineData("\\D+\\B", "(?>\\D+)\\B")] [InlineData(@"[0-9]+\b", @"(?>[0-9]+)\b")] [InlineData(@"[a-z]+\b", @"(?>[a-z]+)\b")]