Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1272,6 +1272,55 @@ public static bool IsWordChar(char ch)
(WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
}

/// <summary>Reports whether every character matched by the specified set is known to be a word character.</summary>
/// <param name="set">The character class string to examine.</param>
/// <returns>
/// <see langword="true"/> if one of the checks below establishes that the set is contained in \w;
/// otherwise <see langword="false"/>, which also covers the case where containment could not be determined.
/// </returns>
public static bool IsKnownWordClassSubset(string set)
{
    // Fast path: well-known class strings that are already known to be subsets of \w.
    switch (set)
    {
        case WordClass:
        case DigitClass:
        case LetterClass:
        case LetterOrDigitClass:
        case AsciiLetterClass:
        case AsciiLetterOrDigitClass:
        case HexDigitClass:
        case HexDigitUpperClass:
        case HexDigitLowerClass:
            return true;
    }

    // A non-negated set made up only of Unicode categories qualifies
    // only when each of those categories is itself a word category.
    Span<UnicodeCategory> categoryBuffer = stackalloc UnicodeCategory[16];
    if (TryGetOnlyCategories(set, categoryBuffer, out int categoryCount, out bool isNegated) && !isNegated)
    {
        Span<UnicodeCategory> found = categoryBuffer.Slice(0, categoryCount);
        for (int k = 0; k < found.Length; k++)
        {
            if (!IsWordCategory(found[k]))
            {
                return false;
            }
        }

        return true;
    }

    // Without a cheap way to walk the set's contents we cannot prove anything;
    // such a set is unlikely to be a subset of \w anyway.
    if (!CanEasilyEnumerateSetContents(set))
    {
        return false;
    }

    // Walk the set two entries at a time: set[pos] is the first character of a range
    // and set[pos + 1] is the exclusive upper bound. Every character must be in \w.
    int rangesEnd = SetStartIndex + set[SetLengthIndex];
    for (int pos = SetStartIndex; pos < rangesEnd; pos += 2)
    {
        int upperBound = set[pos + 1];
        int current = set[pos];
        while (current < upperBound)
        {
            if (!CharInClass((char)current, WordClass))
            {
                return false;
            }

            current++;
        }
    }

    return true;
}

/// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
public static bool IsBoundaryWordChar(char ch)
{
Expand All @@ -1288,10 +1337,13 @@ public static bool IsBoundaryWordChar(char ch)
int chDiv8 = ch >> 3;
return (uint)chDiv8 < (uint)ascii.Length ?
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
((WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 ||
(IsWordCategory(CharUnicodeInfo.GetUnicodeCategory(ch)) ||
(ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner));
}

/// <summary>Tests whether the bit for the given Unicode category is set in <c>WordCategoriesMask</c>.</summary>
/// <param name="category">The Unicode category to test.</param>
/// <returns><see langword="true"/> if the category's bit is set in the mask; otherwise <see langword="false"/>.</returns>
private static bool IsWordCategory(UnicodeCategory category)
{
    // Each category is represented in the mask by the bit at its enum ordinal.
    int categoryBit = 1 << (int)category;
    return (WordCategoriesMask & categoryBit) != 0;
}

/// <summary>Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'.</summary>
/// <remarks>This isn't specific to RegexCharClass; it's just a convenient place to host it.</remarks>
public static bool DifferByOneBit(char a, char b, out int mask)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2181,7 +2181,7 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i

case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
case RegexNodeKind.Boundary when node.M > 0 && node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(node.Str!):
case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,15 @@ public class RegexReductionTests
[InlineData("\\d+\\b", "(?>\\d+)\\b")]
[InlineData("\\W+\\B", "(?>\\W+)\\B")]
[InlineData("\\D+\\B", "(?>\\D+)\\B")]
[InlineData(@"[0-9]+\b", @"(?>[0-9]+)\b")]
[InlineData(@"[a-z]+\b", @"(?>[a-z]+)\b")]
[InlineData(@"[A-Z]+\b", @"(?>[A-Z]+)\b")]
[InlineData(@"[a-zA-Z]+\b", @"(?>[a-zA-Z]+)\b")]
[InlineData(@"[a-fA-F0-9]+\b", @"(?>[a-fA-F0-9]+)\b")]
[InlineData(@"[A-F0-9]+\b", @"(?>[A-F0-9]+)\b")]
[InlineData(@"[a-f0-9]+\b", @"(?>[a-f0-9]+)\b")]
[InlineData(@"[\p{L}\d]+\b", @"(?>[\p{L}\d]+)\b")]
[InlineData(@"[\p{L}\p{Mn}]+\b", @"(?>[\p{L}\p{Mn}]+)\b")]
[InlineData(@"\d+\D", @"(?>\d+)\D")]
[InlineData(@"\D+\d", @"(?>\D+)\d")]
[InlineData(@"\s+\S", @"(?>\s+)\S")]
Expand Down Expand Up @@ -494,6 +503,9 @@ public void PatternsReduceIdentically(string actual, string expected)
[InlineData(@"\d*\b", @"(?>\d*)\b")]
[InlineData(@"\W*\B", @"(?>\W*)\B")]
[InlineData(@"\D*\B", @"(?>\D*)\B")]
[InlineData(@"\b[a-z ]+\b", @"\b(?>[a-z ]+)\b")]
[InlineData(@"\b[\p{L}\p{Mn}a]+\b", @"\b(?>[\p{L}\p{Mn}a]+)\b")]
[InlineData(@"\b[\p{C}]+\b", @"\b(?>[\p{C}]+)\b")]
// Loops inside alternation constructs
[InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
[InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]
Expand Down
Loading