Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,17 @@ private static void AddIsWordCharHelper(Dictionary<string, string[]> requiredHel
"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
"internal static bool IsWordChar(char ch)",
"{",
" // Mask of Unicode categories that combine to form [\\w]",
" const int WordCategories =",
" 1 << (int)UnicodeCategory.UppercaseLetter |",
" 1 << (int)UnicodeCategory.LowercaseLetter |",
" 1 << (int)UnicodeCategory.TitlecaseLetter |",
" 1 << (int)UnicodeCategory.ModifierLetter |",
" 1 << (int)UnicodeCategory.OtherLetter |",
" 1 << (int)UnicodeCategory.NonSpacingMark |",
" 1 << (int)UnicodeCategory.DecimalDigitNumber |",
" 1 << (int)UnicodeCategory.ConnectorPunctuation;",
"",
" // Bitmap for whether each character 0 through 127 is in [\\w]",
" ReadOnlySpan<byte> ascii = new byte[]",
" {",
Expand All @@ -309,18 +320,7 @@ private static void AddIsWordCharHelper(Dictionary<string, string[]> requiredHel
" int chDiv8 = ch >> 3;",
" return (uint)chDiv8 < (uint)ascii.Length ?",
" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :",
" CharUnicodeInfo.GetUnicodeCategory(ch) switch",
" {",
" UnicodeCategory.UppercaseLetter or",
" UnicodeCategory.LowercaseLetter or",
" UnicodeCategory.TitlecaseLetter or",
" UnicodeCategory.ModifierLetter or",
" UnicodeCategory.OtherLetter or",
" UnicodeCategory.NonSpacingMark or",
" UnicodeCategory.DecimalDigitNumber or",
" UnicodeCategory.ConnectorPunctuation => true,",
" _ => false,",
" };",
" (WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;",
"}",
});
}
Expand Down Expand Up @@ -4641,11 +4641,16 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s
Span<UnicodeCategory> categories = stackalloc UnicodeCategory[30]; // number of UnicodeCategory values (though it's unheard of to have a set with all of them)
if (RegexCharClass.TryGetOnlyCategories(charClass, categories, out int numCategories, out bool negated))
{
// TODO https://github.com/dotnet/roslyn/issues/58246: Use pattern matching instead of switch once C# code gen quality improves.
int categoryMask = 0;
foreach (UnicodeCategory category in categories.Slice(0, numCategories))
{
categoryMask |= 1 << (int)category;
}

negate ^= negated;
return numCategories == 1 ?
$"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} UnicodeCategory.{categories[0]})" :
$"(char.GetUnicodeCategory({chExpr}) switch {{ {string.Join(" or ", categories.Slice(0, numCategories).ToArray().Select(c => $"UnicodeCategory.{c}"))} => {(negate ? "false" : "true")}, _ => {(negate ? "true" : "false")} }})";
$"((0x{categoryMask:X} & (1 << (int)char.GetUnicodeCategory({chExpr}))) {(negate ? "==" : "!=")} 0)";
}

// Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1148,31 +1148,25 @@ public static bool IsWordChar(char ch)
// This is the same as IsBoundaryWordChar, except that IsBoundaryWordChar also
// returns true for \u200c and \u200d.

// Fast lookup in our lookup table for ASCII characters. This is purely an optimization, and has the
// behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
int chDiv8 = ch >> 3;
if ((uint)chDiv8 < (uint)asciiLookup.Length)
{
return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
}

// For non-ASCII, fall back to checking the Unicode category.
switch (CharUnicodeInfo.GetUnicodeCategory(ch))
{
case UnicodeCategory.UppercaseLetter:
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.TitlecaseLetter:
case UnicodeCategory.ModifierLetter:
case UnicodeCategory.OtherLetter:
case UnicodeCategory.NonSpacingMark:
case UnicodeCategory.DecimalDigitNumber:
case UnicodeCategory.ConnectorPunctuation:
return true;

default:
return false;
}
// Mask of Unicode categories that combine to form [\w]
const int WordCategories =
1 << (int)UnicodeCategory.UppercaseLetter |
1 << (int)UnicodeCategory.LowercaseLetter |
1 << (int)UnicodeCategory.TitlecaseLetter |
1 << (int)UnicodeCategory.ModifierLetter |
1 << (int)UnicodeCategory.OtherLetter |
1 << (int)UnicodeCategory.NonSpacingMark |
1 << (int)UnicodeCategory.DecimalDigitNumber |
1 << (int)UnicodeCategory.ConnectorPunctuation;

// Bitmap for whether each character 0 through 127 is in [\w]
ReadOnlySpan<byte> ascii = WordCharAsciiLookup;

// If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
int chDiv8 = ch >> 3;
return (uint)chDiv8 < (uint)ascii.Length ?
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
(WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
}

/// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
Expand All @@ -1182,33 +1176,28 @@ public static bool IsBoundaryWordChar(char ch)
// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.

// Fast lookup in our lookup table for ASCII characters. This is purely an optimization, and has the
// behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';

// Mask of Unicode categories that combine to form [\w]
const int WordCategories =
1 << (int)UnicodeCategory.UppercaseLetter |
1 << (int)UnicodeCategory.LowercaseLetter |
1 << (int)UnicodeCategory.TitlecaseLetter |
1 << (int)UnicodeCategory.ModifierLetter |
1 << (int)UnicodeCategory.OtherLetter |
1 << (int)UnicodeCategory.NonSpacingMark |
1 << (int)UnicodeCategory.DecimalDigitNumber |
1 << (int)UnicodeCategory.ConnectorPunctuation;

// Bitmap for whether each character 0 through 127 is in [\w]
ReadOnlySpan<byte> ascii = WordCharAsciiLookup;

// If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category,
// and additionally accept U+200C (zero width non-joiner) and U+200D (zero width joiner).
int chDiv8 = ch >> 3;
if ((uint)chDiv8 < (uint)asciiLookup.Length)
{
return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
}

// For non-ASCII, fall back to checking the Unicode category.
switch (CharUnicodeInfo.GetUnicodeCategory(ch))
{
case UnicodeCategory.UppercaseLetter:
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.TitlecaseLetter:
case UnicodeCategory.ModifierLetter:
case UnicodeCategory.OtherLetter:
case UnicodeCategory.NonSpacingMark:
case UnicodeCategory.DecimalDigitNumber:
case UnicodeCategory.ConnectorPunctuation:
return true;

default:
const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';
return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner;
}
return (uint)chDiv8 < (uint)ascii.Length ?
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
((WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 ||
(ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner));
}

/// <summary>Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'.</summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -354,29 +354,29 @@ file static class Utilities
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static bool IsWordChar(char ch)
{
// Mask of Unicode categories that combine to form [\w]
const int WordCategories =
1 << (int)UnicodeCategory.UppercaseLetter |
1 << (int)UnicodeCategory.LowercaseLetter |
1 << (int)UnicodeCategory.TitlecaseLetter |
1 << (int)UnicodeCategory.ModifierLetter |
1 << (int)UnicodeCategory.OtherLetter |
1 << (int)UnicodeCategory.NonSpacingMark |
1 << (int)UnicodeCategory.DecimalDigitNumber |
1 << (int)UnicodeCategory.ConnectorPunctuation;

// Bitmap for whether each character 0 through 127 is in [\w]
ReadOnlySpan<byte> ascii = new byte[]
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
};

// If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
int chDiv8 = ch >> 3;
return (uint)chDiv8 < (uint)ascii.Length ?
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
CharUnicodeInfo.GetUnicodeCategory(ch) switch
{
UnicodeCategory.UppercaseLetter or
UnicodeCategory.LowercaseLetter or
UnicodeCategory.TitlecaseLetter or
UnicodeCategory.ModifierLetter or
UnicodeCategory.OtherLetter or
UnicodeCategory.NonSpacingMark or
UnicodeCategory.DecimalDigitNumber or
UnicodeCategory.ConnectorPunctuation => true,
_ => false,
};
(WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
}

/// <summary>Pushes 2 values onto the backtracking stack.</summary>
Expand Down