diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 8d8c5c9ab753d0..a68ee5dd029549 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -302,42 +302,104 @@ private static string GetTimeoutExpression(int matchTimeout) => "Regex.InfiniteMatchTimeout" : $"TimeSpan.FromMilliseconds({matchTimeout.ToString(CultureInfo.InvariantCulture)})"; + private const string IsBoundary = nameof(IsBoundary); + private const string IsECMABoundary = nameof(IsECMABoundary); + private const string IsWordChar = nameof(IsWordChar); + private const string IsBoundaryWordChar = nameof(IsBoundaryWordChar); + private const string IsPostWordCharBoundary = nameof(IsPostWordCharBoundary); + private const string IsPreWordCharBoundary = nameof(IsPreWordCharBoundary); + private const string IsECMABoundaryWordChar = nameof(IsECMABoundaryWordChar); + private const string WordCategoriesMask = nameof(WordCategoriesMask); + private const string WordCharBitmap = nameof(WordCharBitmap); + + private static void AddWordCharHelpersSupport(Dictionary requiredHelpers) + { + const string WordCharHelpersSupport = nameof(WordCharHelpersSupport); + if (!requiredHelpers.ContainsKey(WordCharHelpersSupport)) + { + requiredHelpers.Add(WordCharHelpersSupport, + [ + "/// Provides a mask of Unicode categories that combine to form [\\w].", + $"private const int {WordCategoriesMask} =", + " 1 << (int)UnicodeCategory.UppercaseLetter |", + " 1 << (int)UnicodeCategory.LowercaseLetter |", + " 1 << (int)UnicodeCategory.TitlecaseLetter |", + " 1 << (int)UnicodeCategory.ModifierLetter |", + " 1 << (int)UnicodeCategory.OtherLetter |", + " 1 << (int)UnicodeCategory.NonSpacingMark |", + " 1 << (int)UnicodeCategory.DecimalDigitNumber |", + " 1 << (int)UnicodeCategory.ConnectorPunctuation;", + "", + "/// 
Gets a bitmap for whether each character 0 through 127 is in [\\w]", + $"private static ReadOnlySpan {WordCharBitmap} => new byte[]", + "{", + " 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,", + " 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07", + "};", + ]); + } + } + /// Adds the IsWordChar helper to the required helpers collection. private static void AddIsWordCharHelper(Dictionary requiredHelpers) { - const string IsWordChar = nameof(IsWordChar); if (!requiredHelpers.ContainsKey(IsWordChar)) { requiredHelpers.Add(IsWordChar, [ - "/// Determines whether the character is part of the [\\w] set.", - "[MethodImpl(MethodImplOptions.AggressiveInlining)]", - "internal static bool IsWordChar(char ch)", - "{", - " // Mask of Unicode categories that combine to form [\\w]", - " const int WordCategoriesMask =", - " 1 << (int)UnicodeCategory.UppercaseLetter |", - " 1 << (int)UnicodeCategory.LowercaseLetter |", - " 1 << (int)UnicodeCategory.TitlecaseLetter |", - " 1 << (int)UnicodeCategory.ModifierLetter |", - " 1 << (int)UnicodeCategory.OtherLetter |", - " 1 << (int)UnicodeCategory.NonSpacingMark |", - " 1 << (int)UnicodeCategory.DecimalDigitNumber |", - " 1 << (int)UnicodeCategory.ConnectorPunctuation;", - "", - " // Bitmap for whether each character 0 through 127 is in [\\w]", - " ReadOnlySpan ascii = new byte[]", - " {", - " 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,", - " 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07", - " };", - "", - " // If the char is ASCII, look it up in the bitmap. 
Otherwise, query its Unicode category.", - " int chDiv8 = ch >> 3;", - " return (uint)chDiv8 < (uint)ascii.Length ?", - " (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :", - " (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;", - "}", + $"/// Determines whether the character is part of the [\\w] set.", + $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", + $"internal static bool {IsWordChar}(char ch)", + $"{{", + $" // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.", + $" ReadOnlySpan ascii = {WordCharBitmap};", + $" int chDiv8 = ch >> 3;", + $" return (uint)chDiv8 < (uint)ascii.Length ?", + $" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :", + $" ({WordCategoriesMask} & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;", + $"}}", + ]); + + AddWordCharHelpersSupport(requiredHelpers); + } + } + + /// Adds the IsBoundaryWordChar helper to the required helpers collection. + private static void AddIsBoundaryWordCharHelper(Dictionary requiredHelpers) + { + if (!requiredHelpers.ContainsKey(IsBoundaryWordChar)) + { + requiredHelpers.Add(IsBoundaryWordChar, + [ + $"/// Determines whether the specified character is a boundary word character.", + $"/// This is the same as \\w plus U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.", + $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", + $"internal static bool {IsBoundaryWordChar}(char ch)", + $"{{", + $" ReadOnlySpan ascii = {WordCharBitmap};", + $" int chDiv8 = ch >> 3;", + $" return (uint)chDiv8 < (uint)ascii.Length ?", + $" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :", + $" (({WordCategoriesMask} & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0) || (ch is '\\u200C' or '\\u200D');", + $"}}", + ]); + + AddWordCharHelpersSupport(requiredHelpers); + } + } + + /// Adds the IsECMABoundaryWordChar helper to the required helpers collection. 
+ private static void AddIsECMABoundaryWordCharHelper(Dictionary requiredHelpers) + { + if (!requiredHelpers.ContainsKey(IsECMABoundaryWordChar)) + { + requiredHelpers.Add(IsECMABoundaryWordChar, + [ + $"/// Determines whether the specified character is a boundary (ECMAScript) word character.", + $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", + $"internal static bool {IsECMABoundaryWordChar}(char ch) =>", + $" char.IsAsciiLetterOrDigit(ch) ||", + $" ch is '_' or '\\u0130'; // latin capital letter I with dot above", ]); } } @@ -345,7 +407,6 @@ private static void AddIsWordCharHelper(Dictionary requiredHel /// Adds the IsBoundary helper to the required helpers collection. private static void AddIsBoundaryHelper(Dictionary requiredHelpers, bool checkOverflow) { - const string IsBoundary = nameof(IsBoundary); if (!requiredHelpers.ContainsKey(IsBoundary)) { string uncheckedKeyword = checkOverflow ? "unchecked" : ""; @@ -353,24 +414,62 @@ private static void AddIsBoundaryHelper(Dictionary requiredHel [ $"/// Determines whether the specified index is a boundary.", $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", - $"internal static bool IsBoundary(ReadOnlySpan inputSpan, int index)", + $"internal static bool {IsBoundary}(ReadOnlySpan inputSpan, int index)", $"{{", $" int indexMinus1 = index - 1;", - $" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=", - $" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));", - $"", - $" static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');", + $" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && {IsBoundaryWordChar}(inputSpan[indexMinus1])) !=", + $" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && {IsBoundaryWordChar}(inputSpan[index]));", $"}}", ]); - AddIsWordCharHelper(requiredHelpers); + AddIsBoundaryWordCharHelper(requiredHelpers); 
+ } + } + + /// Adds the IsPreWordCharBoundary helper to the required helpers collection. + private static void AddIsPreWordCharBoundaryHelper(Dictionary requiredHelpers, bool checkOverflow) + { + if (!requiredHelpers.ContainsKey(IsPreWordCharBoundary)) + { + string uncheckedKeyword = checkOverflow ? "unchecked" : ""; + requiredHelpers.Add(IsPreWordCharBoundary, + [ + $"/// Determines whether the specified index is a boundary.", + $"/// This variant is only employed when the subsequent character will separately be validated as a word character.", + $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", + $"internal static bool {IsPreWordCharBoundary}(ReadOnlySpan inputSpan, int index)", + $"{{", + $" int indexMinus1 = index - 1;", + $" return {uncheckedKeyword}((uint)indexMinus1 >= (uint)inputSpan.Length || !{IsBoundaryWordChar}(inputSpan[indexMinus1]));", + $"}}", + ]); + + AddIsBoundaryWordCharHelper(requiredHelpers); + } + } + + /// Adds the IsPostWordCharBoundary helper to the required helpers collection. + private static void AddIsPostWordCharBoundaryHelper(Dictionary requiredHelpers, bool checkOverflow) + { + if (!requiredHelpers.ContainsKey(IsPostWordCharBoundary)) + { + string uncheckedKeyword = checkOverflow ? "unchecked" : ""; + requiredHelpers.Add(IsPostWordCharBoundary, + [ + $"/// Determines whether the specified index is a boundary.", + $"/// This variant is only employed when the previous character has already been validated as a word character.", + $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", + $"internal static bool {IsPostWordCharBoundary}(ReadOnlySpan inputSpan, int index) =>", + $" {uncheckedKeyword}((uint)index >= (uint)inputSpan.Length || !{IsBoundaryWordChar}(inputSpan[index]));", + ]); + + AddIsBoundaryWordCharHelper(requiredHelpers); } } /// Adds the IsECMABoundary helper to the required helpers collection. 
private static void AddIsECMABoundaryHelper(Dictionary requiredHelpers, bool checkOverflow) { - const string IsECMABoundary = nameof(IsECMABoundary); if (!requiredHelpers.ContainsKey(IsECMABoundary)) { string uncheckedKeyword = checkOverflow ? "unchecked" : ""; @@ -378,18 +477,15 @@ private static void AddIsECMABoundaryHelper(Dictionary require [ $"/// Determines whether the specified index is a boundary (ECMAScript).", $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", - $"internal static bool IsECMABoundary(ReadOnlySpan inputSpan, int index)", + $"internal static bool {IsECMABoundary}(ReadOnlySpan inputSpan, int index)", $"{{", $" int indexMinus1 = index - 1;", - $" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexMinus1])) !=", - $" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));", - $"", - $" static bool IsECMAWordChar(char ch) =>", - $" char.IsAsciiLetterOrDigit(ch) ||", - $" ch == '_' ||", - $" ch == '\\u0130'; // latin capital letter I with dot above", + $" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && {IsECMABoundaryWordChar}(inputSpan[indexMinus1])) !=", + $" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && {IsECMABoundaryWordChar}(inputSpan[index]));", $"}}", ]); + + AddIsECMABoundaryWordCharHelper(requiredHelpers); } } @@ -3177,20 +3273,33 @@ void EmitBoundary(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary, $"Unexpected kind: {node.Kind}"); + string negation = node.Kind is RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary ? "!" : ""; + string call; - if (node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary) - { - call = node.Kind is RegexNodeKind.Boundary ? 
- $"!{HelpersTypeName}.IsBoundary" : - $"{HelpersTypeName}.IsBoundary"; - AddIsBoundaryHelper(requiredHelpers, checkOverflow); - } - else + switch (node.Kind) { - call = node.Kind is RegexNodeKind.ECMABoundary ? - $"!{HelpersTypeName}.IsECMABoundary" : - $"{HelpersTypeName}.IsECMABoundary"; - AddIsECMABoundaryHelper(requiredHelpers, checkOverflow); + case RegexNodeKind.Boundary or RegexNodeKind.NonBoundary: + if (node.IsKnownPrecededByWordChar()) + { + call = $"{negation}{HelpersTypeName}.{IsPostWordCharBoundary}"; + AddIsPostWordCharBoundaryHelper(requiredHelpers, checkOverflow); + } + else if (node.IsKnownSucceededByWordChar()) + { + call = $"{negation}{HelpersTypeName}.{IsPreWordCharBoundary}"; + AddIsPreWordCharBoundaryHelper(requiredHelpers, checkOverflow); + } + else + { + call = $"{negation}{HelpersTypeName}.{IsBoundary}"; + AddIsBoundaryHelper(requiredHelpers, checkOverflow); + } + break; + + default: + call = $"{negation}{HelpersTypeName}.{IsECMABoundary}"; + AddIsECMABoundaryHelper(requiredHelpers, checkOverflow); + break; } using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? 
$" + {sliceStaticPos}" : "")}))")) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 3026d35bdf3338..43e9dd324eb162 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -35,6 +35,8 @@ internal abstract class RegexCompiler private static MethodInfo MatchLengthMethod => field ??= RegexRunnerMethod("MatchLength"); private static MethodInfo MatchIndexMethod => field ??= RegexRunnerMethod("MatchIndex"); private static MethodInfo IsBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan), typeof(int)])!; + private static MethodInfo IsPreWordCharBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsPreWordCharBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan), typeof(int)])!; + private static MethodInfo IsPostWordCharBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsPostWordCharBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan), typeof(int)])!; private static MethodInfo IsWordCharMethod => field ??= RegexRunnerMethod("IsWordChar"); private static MethodInfo IsECMABoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsECMABoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan), typeof(int)])!; private static MethodInfo CrawlposMethod => field ??= RegexRunnerMethod("Crawlpos"); @@ -3050,25 +3052,41 @@ void EmitBoundary(RegexNode node) } switch (node.Kind) { - case RegexNodeKind.Boundary: - Call(IsBoundaryMethod); - BrfalseFar(doneLabel); - break; - - case RegexNodeKind.NonBoundary: - Call(IsBoundaryMethod); - BrtrueFar(doneLabel); - break; + case RegexNodeKind.Boundary 
or RegexNodeKind.NonBoundary: + if (node.IsKnownPrecededByWordChar()) + { + Call(IsPostWordCharBoundaryMethod); + } + else if (node.IsKnownSucceededByWordChar()) + { + Call(IsPreWordCharBoundaryMethod); + } + else + { + Call(IsBoundaryMethod); + } - case RegexNodeKind.ECMABoundary: - Call(IsECMABoundaryMethod); - BrfalseFar(doneLabel); + if (node.Kind is RegexNodeKind.Boundary) + { + BrfalseFar(doneLabel); + } + else + { + BrtrueFar(doneLabel); + } break; default: - Debug.Assert(node.Kind == RegexNodeKind.NonECMABoundary); Call(IsECMABoundaryMethod); - BrtrueFar(doneLabel); + + if (node.Kind is RegexNodeKind.ECMABoundary) + { + BrfalseFar(doneLabel); + } + else + { + BrtrueFar(doneLabel); + } break; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 496e6d368bd5fc..d15dc22eb01b1a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2452,6 +2452,55 @@ bool MayOverlapStartingOrEndingSet(string set) => } } + /// Gets whether this node is known to be immediately preceded by a word character. + public bool IsKnownPrecededByWordChar() => IsKnownPrecededOrSucceededByWordChar(false); + + /// Gets whether this node is known to be immediately succeeded by a word character. + public bool IsKnownSucceededByWordChar() => IsKnownPrecededOrSucceededByWordChar(true); + + private bool IsKnownPrecededOrSucceededByWordChar(bool succeeded) + { + RegexNode node = this; + Debug.Assert(node.Kind is not RegexNodeKind.Concatenate, "The existing logic assumes that the node itself isn't a concatenation."); + + // As in CanBeMadeAtomic, conservatively walk up through a limited set of constructs to the next concatenation. 
+ while (true) + { + if ((node.Options & RegexOptions.RightToLeft) != 0 || + node.Parent is not RegexNode parent) + { + return false; + } + + switch (parent.Kind) + { + case RegexNodeKind.Atomic: + case RegexNodeKind.Alternate: + case RegexNodeKind.Capture: + node = parent; + continue; + + case RegexNodeKind.Concatenate: + var peers = (List)parent.Children!; + int index = peers.IndexOf(node) + (succeeded ? 1 : -1); + if ((uint)index < (uint)peers.Count) + { + // Now that we've found the concatenation, build a set that represents the characters that could come + // before or after this node, depending on whether we're looking for a preceding or succeeding word character. + return + RegexPrefixAnalyzer.FindFirstOrLastCharClass(peers[index], findFirst: succeeded) is string set && + RegexCharClass.IsKnownWordClassSubset(set); + } + + node = parent; + continue; + + default: + return false; + } + } + } + /// Computes a min bound on the required length of any string that could possibly match. /// The min computed length. If the result is 0, there is no minimum we can enforce. /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 34e5a84c85a5da..40a6affed70314 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -910,7 +910,7 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne /// public static string? FindLastCharClass(RegexNode root) => FindFirstOrLastCharClass(root, findFirst: false); - private static string? FindFirstOrLastCharClass(RegexNode root, bool findFirst) + public static string? 
FindFirstOrLastCharClass(RegexNode root, bool findFirst) { // Explore the graph, adding found chars into a result set, which is lazily initialized so that // we can initialize it to a parsed set if we discover one first (this is helpful not just for allocation diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index f07f0c821d0fb0..2e699c8e85a8d4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -418,6 +418,21 @@ internal static bool IsBoundary(ReadOnlySpan inputSpan, int index) ((uint)index < (uint)inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[index])); } + /// Determines whether the specified index is a boundary. + /// This variant is only employed when the subsequent character will separately be validated as a word character. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool IsPreWordCharBoundary(ReadOnlySpan inputSpan, int index) + { + int indexMinus1 = index - 1; + return (uint)indexMinus1 >= (uint)inputSpan.Length || !RegexCharClass.IsBoundaryWordChar(inputSpan[indexMinus1]); + } + + /// Determines whether the specified index is a boundary. + /// This variant is only employed when the previous character has already been validated as a word character. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool IsPostWordCharBoundary(ReadOnlySpan inputSpan, int index) => + (uint)index >= (uint)inputSpan.Length || !RegexCharClass.IsBoundaryWordChar(inputSpan[index]); + /// Called to determine a char's inclusion in the \w set. 
internal static bool IsWordChar(char ch) => RegexCharClass.IsWordChar(ch); diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs index e8896708277097..e340314f81e58e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs @@ -367,25 +367,8 @@ file static class Utilities [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool IsWordChar(char ch) { - // Mask of Unicode categories that combine to form [\w] - const int WordCategoriesMask = - 1 << (int)UnicodeCategory.UppercaseLetter | - 1 << (int)UnicodeCategory.LowercaseLetter | - 1 << (int)UnicodeCategory.TitlecaseLetter | - 1 << (int)UnicodeCategory.ModifierLetter | - 1 << (int)UnicodeCategory.OtherLetter | - 1 << (int)UnicodeCategory.NonSpacingMark | - 1 << (int)UnicodeCategory.DecimalDigitNumber | - 1 << (int)UnicodeCategory.ConnectorPunctuation; - - // Bitmap for whether each character 0 through 127 is in [\w] - ReadOnlySpan ascii = new byte[] - { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03, - 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07 - }; - // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category. + ReadOnlySpan ascii = WordCharBitmap; int chDiv8 = ch >> 3; return (uint)chDiv8 < (uint)ascii.Length ? (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 : @@ -454,6 +437,24 @@ internal static int ValidateStackCookie(int expected, int actual) } return actual; } + + /// Provides a mask of Unicode categories that combine to form [\w]. 
+ private const int WordCategoriesMask = + 1 << (int)UnicodeCategory.UppercaseLetter | + 1 << (int)UnicodeCategory.LowercaseLetter | + 1 << (int)UnicodeCategory.TitlecaseLetter | + 1 << (int)UnicodeCategory.ModifierLetter | + 1 << (int)UnicodeCategory.OtherLetter | + 1 << (int)UnicodeCategory.NonSpacingMark | + 1 << (int)UnicodeCategory.DecimalDigitNumber | + 1 << (int)UnicodeCategory.ConnectorPunctuation; + + /// Gets a bitmap for whether each character 0 through 127 is in [\w] + private static ReadOnlySpan WordCharBitmap => new byte[] + { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03, + 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07 + }; } } """