Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -302,94 +302,190 @@ private static string GetTimeoutExpression(int matchTimeout) =>
"Regex.InfiniteMatchTimeout" :
$"TimeSpan.FromMilliseconds({matchTimeout.ToString(CultureInfo.InvariantCulture)})";

// Names of the helper members that can be emitted into the generated helpers type.
// Each constant's value is its own identifier (via nameof). The same constant is used both as the
// key under which a helper's source lines are registered in the required-helpers dictionary and as
// the member name interpolated into the emitted code, so the two cannot drift apart.
private const string IsBoundary = nameof(IsBoundary);
private const string IsECMABoundary = nameof(IsECMABoundary);
private const string IsWordChar = nameof(IsWordChar);
private const string IsBoundaryWordChar = nameof(IsBoundaryWordChar);
private const string IsPostWordCharBoundary = nameof(IsPostWordCharBoundary);
private const string IsPreWordCharBoundary = nameof(IsPreWordCharBoundary);
private const string IsECMABoundaryWordChar = nameof(IsECMABoundaryWordChar);
// Names of the shared support members (Unicode category mask and ASCII bitmap) that the
// word-character helpers reference from their emitted bodies.
private const string WordCategoriesMask = nameof(WordCategoriesMask);
private const string WordCharBitmap = nameof(WordCharBitmap);

/// <summary>
/// Adds the shared word-character support members (the Unicode category mask and the ASCII bitmap)
/// to the required helpers collection, unless they have already been added.
/// </summary>
/// <param name="requiredHelpers">The collection of helper source lines, keyed by helper name.</param>
private static void AddWordCharHelpersSupport(Dictionary<string, string[]> requiredHelpers)
{
    const string WordCharHelpersSupport = nameof(WordCharHelpersSupport);

    // Another helper has already registered the support members; nothing more to do.
    if (requiredHelpers.ContainsKey(WordCharHelpersSupport))
    {
        return;
    }

    string[] supportLines =
    [
        "/// <summary>Provides a mask of Unicode categories that combine to form [\\w].</summary>",
        $"private const int {WordCategoriesMask} =",
        " 1 << (int)UnicodeCategory.UppercaseLetter |",
        " 1 << (int)UnicodeCategory.LowercaseLetter |",
        " 1 << (int)UnicodeCategory.TitlecaseLetter |",
        " 1 << (int)UnicodeCategory.ModifierLetter |",
        " 1 << (int)UnicodeCategory.OtherLetter |",
        " 1 << (int)UnicodeCategory.NonSpacingMark |",
        " 1 << (int)UnicodeCategory.DecimalDigitNumber |",
        " 1 << (int)UnicodeCategory.ConnectorPunctuation;",
        "",
        "/// <summary>Gets a bitmap for whether each character 0 through 127 is in [\\w]</summary>",
        $"private static ReadOnlySpan<byte> {WordCharBitmap} => new byte[]",
        "{",
        " 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,",
        " 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07",
        "};",
    ];

    requiredHelpers.Add(WordCharHelpersSupport, supportLines);
}

/// <summary>Adds the IsWordChar helper to the required helpers collection.</summary>
/// <param name="requiredHelpers">The collection of helper source lines, keyed by helper name.</param>
/// <remarks>
/// The emitted helper is written exactly once. It reads the shared category mask and ASCII bitmap
/// rather than declaring private copies, so the shared support members are registered alongside it.
/// </remarks>
private static void AddIsWordCharHelper(Dictionary<string, string[]> requiredHelpers)
{
    if (!requiredHelpers.ContainsKey(IsWordChar))
    {
        requiredHelpers.Add(IsWordChar,
        [
            $"/// <summary>Determines whether the character is part of the [\\w] set.</summary>",
            $"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
            $"internal static bool {IsWordChar}(char ch)",
            $"{{",
            $" // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.",
            $" ReadOnlySpan<byte> ascii = {WordCharBitmap};",
            $" int chDiv8 = ch >> 3;",
            $" return (uint)chDiv8 < (uint)ascii.Length ?",
            $" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :",
            $" ({WordCategoriesMask} & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;",
            $"}}",
        ]);

        // The emitted body references the shared mask and bitmap, so make sure they are emitted too.
        AddWordCharHelpersSupport(requiredHelpers);
    }
}

/// <summary>Adds the IsBoundaryWordChar helper to the required helpers collection.</summary>
/// <param name="requiredHelpers">The collection of helper source lines, keyed by helper name.</param>
/// <remarks>
/// The emitted helper treats a character as a boundary word character when it is in the [\w] set
/// or is U+200C / U+200D. The shared category mask and ASCII bitmap it reads are registered alongside it.
/// </remarks>
private static void AddIsBoundaryWordCharHelper(Dictionary<string, string[]> requiredHelpers)
{
    if (!requiredHelpers.ContainsKey(IsBoundaryWordChar))
    {
        requiredHelpers.Add(IsBoundaryWordChar,
        [
            $"/// <summary>Determines whether the specified character is a boundary word character.</summary>",
            $"/// <remarks>This is the same as \\w plus U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.</remarks>",
            $"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
            $"internal static bool {IsBoundaryWordChar}(char ch)",
            $"{{",
            $" ReadOnlySpan<byte> ascii = {WordCharBitmap};",
            $" int chDiv8 = ch >> 3;",
            $" return (uint)chDiv8 < (uint)ascii.Length ?",
            $" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :",
            // The backslashes are doubled so the generated source contains the visible escape sequences
            // \u200C and \u200D rather than the raw, invisible zero-width characters themselves.
            $" (({WordCategoriesMask} & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0) || (ch is '\\u200C' or '\\u200D');",
            $"}}",
        ]);

        // The emitted body references the shared mask and bitmap, so make sure they are emitted too.
        AddWordCharHelpersSupport(requiredHelpers);
    }
}

/// <summary>Adds the IsECMABoundaryWordChar helper to the required helpers collection.</summary>
/// <param name="requiredHelpers">The collection of helper source lines, keyed by helper name.</param>
/// <remarks>
/// The emitted helper accepts ASCII letters and digits, '_' and U+0130. It does not use the shared
/// word-character support members, so nothing else is registered here.
/// </remarks>
private static void AddIsECMABoundaryWordCharHelper(Dictionary<string, string[]> requiredHelpers)
{
    if (!requiredHelpers.ContainsKey(IsECMABoundaryWordChar))
    {
        requiredHelpers.Add(IsECMABoundaryWordChar,
        [
            $"/// <summary>Determines whether the specified character is a boundary (ECMAScript) word character.</summary>",
            $"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
            $"internal static bool {IsECMABoundaryWordChar}(char ch) =>",
            $" char.IsAsciiLetterOrDigit(ch) ||",
            $" ch is '_' or '\\u0130'; // latin capital letter I with dot above",
        ]);
    }
}

/// <summary>Adds the IsBoundary helper to the required helpers collection.</summary>
/// <param name="requiredHelpers">The collection of helper source lines, keyed by helper name.</param>
/// <param name="checkOverflow">
/// When true, the emitted bounds comparisons are wrapped in <c>unchecked(...)</c>; otherwise they are
/// emitted as plain parenthesized expressions.
/// </param>
/// <remarks>
/// The emitted helper reports a boundary when exactly one of the characters at index - 1 and index is a
/// boundary word character. It calls the separately emitted IsBoundaryWordChar helper, which is
/// registered alongside it.
/// </remarks>
private static void AddIsBoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
    if (!requiredHelpers.ContainsKey(IsBoundary))
    {
        string uncheckedKeyword = checkOverflow ? "unchecked" : "";
        requiredHelpers.Add(IsBoundary,
        [
            $"/// <summary>Determines whether the specified index is a boundary.</summary>",
            $"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
            $"internal static bool {IsBoundary}(ReadOnlySpan<char> inputSpan, int index)",
            $"{{",
            $" int indexMinus1 = index - 1;",
            $" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && {IsBoundaryWordChar}(inputSpan[indexMinus1])) !=",
            $" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && {IsBoundaryWordChar}(inputSpan[index]));",
            $"}}",
        ]);

        // The emitted body calls IsBoundaryWordChar, so make sure that helper is emitted too.
        AddIsBoundaryWordCharHelper(requiredHelpers);
    }
}

/// <summary>Adds the IsPreWordCharBoundary helper to the required helpers collection.</summary>
/// <param name="requiredHelpers">The collection of helper source lines, keyed by helper name.</param>
/// <param name="checkOverflow">
/// When true, the emitted bounds comparison is wrapped in <c>unchecked(...)</c>; otherwise it is
/// emitted as a plain parenthesized expression.
/// </param>
private static void AddIsPreWordCharBoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
    // Already registered; the dependent helper was registered at the same time.
    if (requiredHelpers.ContainsKey(IsPreWordCharBoundary))
    {
        return;
    }

    string overflowWrapper = checkOverflow ? "unchecked" : string.Empty;

    string[] helperLines =
    [
        "/// <summary>Determines whether the specified index is a boundary.</summary>",
        "/// <remarks>This variant is only employed when the subsequent character will separately be validated as a word character.</remarks>",
        "[MethodImpl(MethodImplOptions.AggressiveInlining)]",
        $"internal static bool {IsPreWordCharBoundary}(ReadOnlySpan<char> inputSpan, int index)",
        "{",
        " int indexMinus1 = index - 1;",
        $" return {overflowWrapper}((uint)indexMinus1 >= (uint)inputSpan.Length || !{IsBoundaryWordChar}(inputSpan[indexMinus1]));",
        "}",
    ];
    requiredHelpers.Add(IsPreWordCharBoundary, helperLines);

    // The emitted body calls IsBoundaryWordChar, so that helper must be emitted as well.
    AddIsBoundaryWordCharHelper(requiredHelpers);
}

/// <summary>Adds the IsPostWordCharBoundary helper to the required helpers collection.</summary>
/// <param name="requiredHelpers">The collection of helper source lines, keyed by helper name.</param>
/// <param name="checkOverflow">
/// When true, the emitted bounds comparison is wrapped in <c>unchecked(...)</c>; otherwise it is
/// emitted as a plain parenthesized expression.
/// </param>
private static void AddIsPostWordCharBoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
    // Already registered; the dependent helper was registered at the same time.
    if (requiredHelpers.ContainsKey(IsPostWordCharBoundary))
    {
        return;
    }

    string overflowWrapper = checkOverflow ? "unchecked" : string.Empty;

    string[] helperLines =
    [
        "/// <summary>Determines whether the specified index is a boundary.</summary>",
        "/// <remarks>This variant is only employed when the previous character has already been validated as a word character.</remarks>",
        "[MethodImpl(MethodImplOptions.AggressiveInlining)]",
        $"internal static bool {IsPostWordCharBoundary}(ReadOnlySpan<char> inputSpan, int index) =>",
        $" {overflowWrapper}((uint)index >= (uint)inputSpan.Length || !{IsBoundaryWordChar}(inputSpan[index]));",
    ];
    requiredHelpers.Add(IsPostWordCharBoundary, helperLines);

    // The emitted body calls IsBoundaryWordChar, so that helper must be emitted as well.
    AddIsBoundaryWordCharHelper(requiredHelpers);
}

/// <summary>Adds the IsECMABoundary helper to the required helpers collection.</summary>
/// <param name="requiredHelpers">The collection of helper source lines, keyed by helper name.</param>
/// <param name="checkOverflow">
/// When true, the emitted bounds comparisons are wrapped in <c>unchecked(...)</c>; otherwise they are
/// emitted as plain parenthesized expressions.
/// </param>
/// <remarks>
/// The emitted helper reports a boundary when exactly one of the characters at index - 1 and index is an
/// ECMAScript boundary word character. It calls the separately emitted IsECMABoundaryWordChar helper,
/// which is registered alongside it.
/// </remarks>
private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
    if (!requiredHelpers.ContainsKey(IsECMABoundary))
    {
        string uncheckedKeyword = checkOverflow ? "unchecked" : "";
        requiredHelpers.Add(IsECMABoundary,
        [
            $"/// <summary>Determines whether the specified index is a boundary (ECMAScript).</summary>",
            $"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
            $"internal static bool {IsECMABoundary}(ReadOnlySpan<char> inputSpan, int index)",
            $"{{",
            $" int indexMinus1 = index - 1;",
            $" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && {IsECMABoundaryWordChar}(inputSpan[indexMinus1])) !=",
            $" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && {IsECMABoundaryWordChar}(inputSpan[index]));",
            $"}}",
        ]);

        // The emitted body calls IsECMABoundaryWordChar, so make sure that helper is emitted too.
        AddIsECMABoundaryWordCharHelper(requiredHelpers);
    }
}

Expand Down Expand Up @@ -3177,20 +3273,33 @@ void EmitBoundary(RegexNode node)
{
Debug.Assert(node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary, $"Unexpected kind: {node.Kind}");

string negation = node.Kind is RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary ? "!" : "";

string call;
if (node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary)
{
call = node.Kind is RegexNodeKind.Boundary ?
$"!{HelpersTypeName}.IsBoundary" :
$"{HelpersTypeName}.IsBoundary";
AddIsBoundaryHelper(requiredHelpers, checkOverflow);
}
else
switch (node.Kind)
{
call = node.Kind is RegexNodeKind.ECMABoundary ?
$"!{HelpersTypeName}.IsECMABoundary" :
$"{HelpersTypeName}.IsECMABoundary";
AddIsECMABoundaryHelper(requiredHelpers, checkOverflow);
case RegexNodeKind.Boundary or RegexNodeKind.NonBoundary:
if (node.IsKnownPrecededByWordChar())
{
call = $"{negation}{HelpersTypeName}.{IsPostWordCharBoundary}";
AddIsPostWordCharBoundaryHelper(requiredHelpers, checkOverflow);
}
else if (node.IsKnownSucceededByWordChar())
{
call = $"{negation}{HelpersTypeName}.{IsPreWordCharBoundary}";
AddIsPreWordCharBoundaryHelper(requiredHelpers, checkOverflow);
}
else
{
call = $"{negation}{HelpersTypeName}.{IsBoundary}";
AddIsBoundaryHelper(requiredHelpers, checkOverflow);
}
break;

default:
call = $"{negation}{HelpersTypeName}.{IsECMABoundary}";
AddIsECMABoundaryHelper(requiredHelpers, checkOverflow);
break;
}

using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}))"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ internal abstract class RegexCompiler
private static MethodInfo MatchLengthMethod => field ??= RegexRunnerMethod("MatchLength");
private static MethodInfo MatchIndexMethod => field ??= RegexRunnerMethod("MatchIndex");
private static MethodInfo IsBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan<char>), typeof(int)])!;
private static MethodInfo IsPreWordCharBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsPreWordCharBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan<char>), typeof(int)])!;
private static MethodInfo IsPostWordCharBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsPostWordCharBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan<char>), typeof(int)])!;
private static MethodInfo IsWordCharMethod => field ??= RegexRunnerMethod("IsWordChar");
private static MethodInfo IsECMABoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsECMABoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan<char>), typeof(int)])!;
private static MethodInfo CrawlposMethod => field ??= RegexRunnerMethod("Crawlpos");
Expand Down Expand Up @@ -3050,25 +3052,41 @@ void EmitBoundary(RegexNode node)
}
switch (node.Kind)
{
case RegexNodeKind.Boundary:
Call(IsBoundaryMethod);
BrfalseFar(doneLabel);
break;

case RegexNodeKind.NonBoundary:
Call(IsBoundaryMethod);
BrtrueFar(doneLabel);
break;
case RegexNodeKind.Boundary or RegexNodeKind.NonBoundary:
if (node.IsKnownPrecededByWordChar())
{
Call(IsPostWordCharBoundaryMethod);
}
else if (node.IsKnownSucceededByWordChar())
{
Call(IsPreWordCharBoundaryMethod);
}
else
{
Call(IsBoundaryMethod);
}

case RegexNodeKind.ECMABoundary:
Call(IsECMABoundaryMethod);
BrfalseFar(doneLabel);
if (node.Kind is RegexNodeKind.Boundary)
{
BrfalseFar(doneLabel);
}
else
{
BrtrueFar(doneLabel);
}
break;

default:
Debug.Assert(node.Kind == RegexNodeKind.NonECMABoundary);
Call(IsECMABoundaryMethod);
BrtrueFar(doneLabel);

if (node.Kind is RegexNodeKind.ECMABoundary)
{
BrfalseFar(doneLabel);
}
else
{
BrtrueFar(doneLabel);
}
break;
}
}
Expand Down
Loading
Loading