diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 859c41b6c34988..45ea2d6e6e83e3 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -790,6 +790,11 @@ private RegexNode ReduceAtomic() start = endExclusive; } + // Force a re-reduction if we know we've exposed new opportunities that'll be handled. + reordered |= + child.ChildCount() == 2 && + (child.Child(0).Kind is RegexNodeKind.Empty || child.Child(1).Kind is RegexNodeKind.Empty); // can be transformed into a ? or ?? + // If anything was reordered, there may be new optimization opportunities inside // of the alternation, so reduce it again. if (reordered) @@ -1032,6 +1037,22 @@ private RegexNode ReduceAlternation() if (node.Kind == RegexNodeKind.Alternate) { node = RemoveRedundantEmptiesAndNothings(node); + + // If the alternation is actually just a ? or ?? in disguise, transform it accordingly. + // (a|) becomes a? + // (|a) becomes a?? + // Such "optional" nodes are processed more efficiently, including being able to be better coalesced with surrounding nodes. + if (node.Kind is RegexNodeKind.Alternate && node.ChildCount() == 2) + { + if (node.Child(1).Kind is RegexNodeKind.Empty) + { + node = node.Child(0).MakeQuantifier(lazy: false, min: 0, max: 1); + } + else if (node.Child(0).Kind is RegexNodeKind.Empty) + { + node = node.Child(1).MakeQuantifier(lazy: true, min: 0, max: 1); + } + } } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 226f9c3be4d50f..8cbf120e660186 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -306,6 +306,12 @@ public class RegexReductionTests [InlineData("abcd|aefg", "a(?>bcd|efg)")] [InlineData("abcd|abc|ab|a", "a(?>bcd|bc|b|)")] [InlineData("^abcd|^abce", "^(?:abc[de])")] + [InlineData("abc|", "(?:abc)?")] + [InlineData("a|", "a?")] + [InlineData("(?:abc|)d", "(?>(?:abc)?)d")] + [InlineData("(?:a|)a", "a{1,2}")] + [InlineData("(?:a|)a*", "a*")] + [InlineData("a+(?:a|)", "a+")] // [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree [InlineData("abcdef|abcde", "abcde(?>f|)")] [InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")]