Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -893,10 +893,7 @@ private RegexNode ReduceLoops()

// If the Loop or Lazyloop now only has one child node and it's a Set, One, or Notone,
// reduce to just Setloop/lazy, Oneloop/lazy, or Notoneloop/lazy. The parser will
// generally have only produced the latter, but other reductions could have exposed
// this. We can also reduce or eliminate certain loops that are nops, e.g.
// a loop with a minimum of 0 that wraps a zero-width assertion is either asserting something
// or not, and is thus useless.
// generally have only produced the latter, but other reductions could have exposed this.
if (u.ChildCount() == 1)
{
RegexNode child = u.Child(0);
Expand All @@ -910,14 +907,27 @@ private RegexNode ReduceLoops()
break;

case RegexNodeKind.Empty:
case RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround or
// A loop around an empty is itself empty, regardless of iteration counts.
u = child;
break;

case RegexNodeKind.PositiveLookaround when ContainsKind(child, [RegexNodeKind.Capture]) is false:
case RegexNodeKind.NegativeLookaround or
RegexNodeKind.Beginning or RegexNodeKind.Start or
RegexNodeKind.Bol or RegexNodeKind.Eol or
RegexNodeKind.End or RegexNodeKind.EndZ or
RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary
when u.M == 0:
u = new RegexNode(RegexNodeKind.Empty, Options);
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary:
// A loop around (most) zero-width assertions can also be reduced. If it has a lower bound of 0,
// then it's either asserting something or not, and is thus useless and replaceable by empty.
// If it has a lower bound > 0, then the contents are still needed, but the loop isn't, since
// it's non-consuming and thus any more repetitions than 1 are redundant. The one zero-width assertion
// that can't be handled in this way is a PositiveLookaround, because it might contain capture groups
// with captures that must persist past the lookaround (in contrast, negative lookarounds undo all
// captures); if it were to be removed, it could affect both subsequent backreferences as well as access
// to capture information in the resulting Match. Thus, we can only transform a PositiveLookaround in
// this manner if it doesn't contain any captures.
u = u.M == 0 ? new RegexNode(RegexNodeKind.Empty, Options) : child;
break;
}
}
Expand Down Expand Up @@ -2058,7 +2068,7 @@ private RegexNode ReduceLookaround()
// Captures inside of negative lookarounds are undone after the lookaround. Thus, if there's nothing
// inside of the negative lookaround that needs that capture group (namely a backreference), we can
// remove the capture.
if (Kind is RegexNodeKind.NegativeLookaround && ContainsBackreference(Child(0)) is false)
if (Kind is RegexNodeKind.NegativeLookaround && ContainsKind(Child(0), [RegexNodeKind.Backreference, RegexNodeKind.BackreferenceConditional]) is false)
{
if (RemoveCaptures(this, 0))
{
Expand Down Expand Up @@ -2131,26 +2141,32 @@ RegexNodeKind.Beginning or RegexNodeKind.Start or
RegexNodeKind.Bol or RegexNodeKind.Eol or
RegexNodeKind.End or RegexNodeKind.EndZ or
RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary;
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary or
RegexNodeKind.UpdateBumpalong;

/// <summary>Gets whether the node contains a backreference anywhere in its tree.</summary>
private static bool? ContainsBackreference(RegexNode node)
/// <summary>Gets whether the node contains any of the specified kinds anywhere in its tree.</summary>
/// <returns><see langword="true"/> if it does, <see langword="false"/> if it doesn't, and <see langword="null"/> if it can't be determined.</returns>
private static bool? ContainsKind(RegexNode node, ReadOnlySpan<RegexNodeKind> kinds)
{
if (node.Kind is RegexNodeKind.Backreference or RegexNodeKind.BackreferenceConditional)
foreach (RegexNodeKind kind in kinds)
{
return true;
if (node.Kind == kind)
{
return true;
}
}

if (!StackHelper.TryEnsureSufficientExecutionStack())
{
// If we can't recur further, just stop optimizing.
// If we can't recur further, just stop optimizing. We need to return null to signal
// that the result can't be trusted.
return null;
}

int childCount = node.ChildCount();
for (int i = 0; i < childCount; i++)
{
if (ContainsBackreference(node.Child(i)) is true)
if (ContainsKind(node.Child(i), kinds) is true)
{
return true;
}
Expand Down Expand Up @@ -2787,25 +2803,10 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
// Skip over empty nodes, as they're pure nops. They would ideally have been optimized away,
// but can still remain in some situations.
}
else if (consumeZeroWidthNodes &&
// anchors
child.Kind is RegexNodeKind.Beginning or
RegexNodeKind.Bol or
RegexNodeKind.Start or
// boundaries
RegexNodeKind.Boundary or
RegexNodeKind.ECMABoundary or
RegexNodeKind.NonBoundary or
RegexNodeKind.NonECMABoundary or
// lookarounds
RegexNodeKind.NegativeLookaround or
RegexNodeKind.PositiveLookaround or
// logic
RegexNodeKind.UpdateBumpalong)
else if (consumeZeroWidthNodes && IsZeroWidthAssertion(child.Kind))
{
// Skip over zero-width nodes that might be reasonable at the beginning of or within a substring.
// We can only do these if consumeZeroWidthNodes is true, as otherwise we'd be producing a string that
// may not fully represent the semantics of this portion of the pattern.
// Skip over zero-width nodes. We can only do these if consumeZeroWidthNodes is true, as otherwise we'd
// be producing a string that may not fully represent the semantics of this portion of the pattern.
}
else
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ public static IEnumerable<object[]> Match_MemberData()
yield return (@"(?:(?!(b)b)\1a)*", "babababa", RegexOptions.None, 0, 8, true, string.Empty);
yield return (@"(.*?)a(?!(a+)b\2c)\2(.*)", "baaabaac", RegexOptions.None, 0, 8, false, string.Empty);
yield return (@"(?!(abc))+\w\w\w", "abcdef", RegexOptions.None, 0, 6, true, "bcd");
yield return (@"(?=(abc))?\1", "abc", RegexOptions.None, 0, 3, true, "abc");
yield return (@"(?=(abc))+\1", "abc", RegexOptions.None, 0, 3, true, "abc");
yield return (@"(?=(abc))*\1", "abc", RegexOptions.None, 0, 3, true, "abc");

// Zero-width positive lookbehind assertion
yield return (@"(\w){6}(?<=XXX)def", "abcXXXdef", RegexOptions.None, 0, 9, true, "abcXXXdef");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,15 @@ public class RegexReductionTests
[InlineData("(?!(abc))", "(?!abc)")]
[InlineData("(?!a(b*)c)", "(?!ab*c)")]
[InlineData("(?!a((((b))))c)", "(?!abc)")]
[InlineData(@"(?=(?=(?=abc)))", @"(?=abc)")]
[InlineData(@"(?=(?<=(?=abc)))", @"(?<=(?=abc))")]
[InlineData(@"(?=\G)abc", @"\Gabc")]
[InlineData(@"(?=^)abc", @"^abc")]
[InlineData(@"(?=\b)abc", @"\babc")]
[InlineData(@"abc(?=\z)", @"abc\z")]
[InlineData(@"abc(?=\Z)", @"abc\Z")]
[InlineData(@"abc(?=\A)", @"abc\A")]
[InlineData(@"abc(?=$)", @"abc$")]
// Alternation reduction
[InlineData("a|b", "[ab]")]
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
Expand Down Expand Up @@ -409,16 +418,6 @@ public class RegexReductionTests
[InlineData(@"\z\z", @"\z")]
[InlineData(@"\G\G", @"\G")]
[InlineData(@"\A\A", @"\A")]
// Lookarounds
[InlineData(@"(?=^)abc", @"^abc")]
[InlineData(@"(?=\G)abc", @"\Gabc")]
[InlineData(@"abc(?=$)", @"abc$")]
[InlineData(@"(?=\b)abc", @"\babc")]
[InlineData(@"abc(?=\z)", @"abc\z")]
[InlineData(@"abc(?=\Z)", @"abc\Z")]
[InlineData(@"abc(?=\A)", @"abc\A")]
[InlineData(@"(?=(?=(?=abc)))", @"(?=abc)")]
[InlineData(@"(?=(?<=(?=abc)))", @"(?<=(?=abc))")]
// Nothing handling
[InlineData(@"\wabc(?!)def", "(?!)")]
[InlineData(@"\wabc(?!)def|ghi(?!)", "(?!)")]
Expand Down
Loading