Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ private static void AddIsWordCharHelper(Dictionary<string, string[]> requiredHel
"internal static bool IsWordChar(char ch)",
"{",
" // Mask of Unicode categories that combine to form [\\w]",
" const int WordCategories =",
" const int WordCategoriesMask =",
" 1 << (int)UnicodeCategory.UppercaseLetter |",
" 1 << (int)UnicodeCategory.LowercaseLetter |",
" 1 << (int)UnicodeCategory.TitlecaseLetter |",
Expand All @@ -321,7 +321,7 @@ private static void AddIsWordCharHelper(Dictionary<string, string[]> requiredHel
" int chDiv8 = ch >> 3;",
" return (uint)chDiv8 < (uint)ascii.Length ?",
" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :",
" (WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;",
" (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;",
"}",
});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1142,14 +1142,8 @@ public static bool IsECMAWordChar(char ch) =>
0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
};

/// <summary>Determines whether a character is considered a word character for the purposes of testing the \w set.</summary>
public static bool IsWordChar(char ch)
{
// This is the same as IsBoundaryWordChar, except that IsBoundaryWordChar also
// returns true for \u200c and \u200d.

// Mask of Unicode categories that combine to form [\\w]"
const int WordCategories =
/// <summary>Mask of Unicode categories that combine to form [\w]</summary>
private const int WordCategoriesMask =
1 << (int)UnicodeCategory.UppercaseLetter |
1 << (int)UnicodeCategory.LowercaseLetter |
1 << (int)UnicodeCategory.TitlecaseLetter |
Expand All @@ -1159,14 +1153,20 @@ public static bool IsWordChar(char ch)
1 << (int)UnicodeCategory.DecimalDigitNumber |
1 << (int)UnicodeCategory.ConnectorPunctuation;

// Bitmap for whether each character 0 through 127 is in [\\w]",
ReadOnlySpan<byte> ascii = WordCharAsciiLookup;
/// <summary>Determines whether a character is considered a word character for the purposes of testing the \w set.</summary>
public static bool IsWordChar(char ch)
{
// This is the same as IsBoundaryWordChar, except that IsBoundaryWordChar also
// returns true for \u200c and \u200d.

// Bitmap for whether each character 0 through 127 is in [\w]
ReadOnlySpan<byte> ascii = WordCharAsciiLookup;

// If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.",
int chDiv8 = ch >> 3;
return (uint)chDiv8 < (uint)ascii.Length ?
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
(WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
// If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
int chDiv8 = ch >> 3;
return (uint)chDiv8 < (uint)ascii.Length ?
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
(WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
}

/// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
Expand All @@ -1178,25 +1178,14 @@ public static bool IsBoundaryWordChar(char ch)
// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';

// Mask of Unicode categories that combine to form [\\w]"
const int WordCategories =
1 << (int)UnicodeCategory.UppercaseLetter |
1 << (int)UnicodeCategory.LowercaseLetter |
1 << (int)UnicodeCategory.TitlecaseLetter |
1 << (int)UnicodeCategory.ModifierLetter |
1 << (int)UnicodeCategory.OtherLetter |
1 << (int)UnicodeCategory.NonSpacingMark |
1 << (int)UnicodeCategory.DecimalDigitNumber |
1 << (int)UnicodeCategory.ConnectorPunctuation;

// Bitmap for whether each character 0 through 127 is in [\\w]",
// Bitmap for whether each character 0 through 127 is in [\w]
ReadOnlySpan<byte> ascii = WordCharAsciiLookup;

// If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.",
// If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
int chDiv8 = ch >> 3;
return (uint)chDiv8 < (uint)ascii.Length ?
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
((WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 ||
((WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 ||
(ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ file static class Utilities
internal static bool IsWordChar(char ch)
{
// Mask of Unicode categories that combine to form [\w]
const int WordCategories =
const int WordCategoriesMask =
1 << (int)UnicodeCategory.UppercaseLetter |
1 << (int)UnicodeCategory.LowercaseLetter |
1 << (int)UnicodeCategory.TitlecaseLetter |
Expand All @@ -364,19 +364,19 @@ internal static bool IsWordChar(char ch)
1 << (int)UnicodeCategory.NonSpacingMark |
1 << (int)UnicodeCategory.DecimalDigitNumber |
1 << (int)UnicodeCategory.ConnectorPunctuation;

// Bitmap for whether each character 0 through 127 is in [\w]
ReadOnlySpan<byte> ascii = new byte[]
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
};

// If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
int chDiv8 = ch >> 3;
return (uint)chDiv8 < (uint)ascii.Length ?
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
(WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
(WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
}

/// <summary>Pushes 2 values onto the backtracking stack.</summary>
Expand Down