Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Put back FindCaseSensitivePrefix regex alternation support #64204

Merged
merged 2 commits into from
Jan 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
}

// If there's a leading case-sensitive substring, just use IndexOf and inherit all of its optimizations.
string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree);
string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree.Root);
if (caseSensitivePrefix.Length > 1)
{
LeadingCaseSensitivePrefix = caseSensitivePrefix;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ private RegexPrefixAnalyzer(Span<int> intStack)
_skipAllChildren = false;
}

/// <summary>Computes the leading substring in <paramref name="tree"/>; may be empty.</summary>
public static string FindCaseSensitivePrefix(RegexTree tree)
/// <summary>Computes the leading substring in <paramref name="node"/>; may be empty.</summary>
public static string FindCaseSensitivePrefix(RegexNode node)
{
var vsb = new ValueStringBuilder(stackalloc char[64]);
Process(tree.Root, ref vsb);
Process(node, ref vsb);
return vsb.ToString();

// Processes the node, adding any prefix text to the builder.
Expand Down Expand Up @@ -87,6 +87,59 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
return !rtl;
}

// Alternation: find a string that's a shared prefix of all branches
case RegexNodeKind.Alternate:
{
int childCount = node.ChildCount();

// Store the initial branch into the target builder, keeping track
// of how much was appended. Any of this content that doesn't overlap
// with every other branch will be removed before returning.
int initialLength = vsb.Length;
Process(node.Child(0), ref vsb);
int addedLength = vsb.Length - initialLength;

// Then explore the rest of the branches, finding the length
// of prefix they all share in common with the initial branch.
if (addedLength != 0)
{
var alternateSb = new ValueStringBuilder(64);

// Process each branch. If we reach a point where we've proven there's
// no overlap, we can bail early.
for (int i = 1; i < childCount && addedLength != 0; i++)
{
alternateSb.Length = 0;

// Process the branch into a temporary builder.
Process(node.Child(i), ref alternateSb);

// Find how much overlap there is between this branch's prefix
// and the smallest amount of prefix that overlapped with all
// the previously seen branches.
addedLength = Math.Min(addedLength, alternateSb.Length);
for (int j = 0; j < addedLength; j++)
{
if (vsb[initialLength + j] != alternateSb[j])
{
addedLength = j;
break;
}
}
}

alternateSb.Dispose();

// Then cull back on what was added based on the other branches.
vsb.Length = initialLength + addedLength;
}

// Don't explore anything after the alternation. We could make this work if desirable,
// but it's currently not worth the extra complication. The entire contents of every
// branch would need to be identical other than zero-width anchors/assertions.
return false;
}

// One character
case RegexNodeKind.One when (node.Options & RegexOptions.IgnoreCase) == 0:
vsb.Append(node.Ch);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ public static IEnumerable<object[]> Match_MemberData()
yield return (@"(^|($|a+))bc", " aabc", RegexOptions.None, 0, 5, true, "aabc");
yield return (@"yz(^|a+)bc", " yzaabc", RegexOptions.None, 0, 7, true, "yzaabc");
yield return (@"(^a|a$) bc", "a bc", RegexOptions.None, 0, 4, true, "a bc");
yield return (@"(abcdefg|abcdef|abc|a)h", " ah ", RegexOptions.None, 0, 8, true, "ah");
yield return (@"(^abcdefg|abcdef|^abc|a)h", " abcdefh ", RegexOptions.None, 0, 13, true, "abcdefh");
yield return (@"(a|^abcdefg|abcdef|^abc)h", " abcdefh ", RegexOptions.None, 0, 13, true, "abcdefh");
yield return (@"(abcdefg|abcdef)h", " abcdefghij ", RegexOptions.None, 0, 16, true, "abcdefgh");

if (!RegexHelpers.IsNonBacktracking(engine))
{
Expand Down