Skip to content

[RegexDiff X64] [stephentoub] Avoid extra boundary checks when preceded/suc ... #1303

@MihuBot

Description

@MihuBot

Job completed in 18 minutes 16 seconds (remote runner delay: 1 minute 14 seconds).
dotnet/runtime#118105
Using arguments: regexdiff

3865 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"{\\s*(?<P>\\D\\w*)\\s*\\:\\s*var\\(\\s*(?<B> ..." (9881 uses)
[GeneratedRegex("{\\s*(?<P>\\D\\w*)\\s*\\:\\s*var\\(\\s*(?<B>\\D\\w*)\\s*\\)\\s*(;\\s*(?<P>\\D\\w*)\\s*\\:\\s*var\\(\\s*(?<B>\\D\\w*)\\s*\\)\\s*\\s*)*}")]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsWordChar(char ch)
         {
-            // Mask of Unicode categories that combine to form [\w]
-            const int WordCategoriesMask =
-                1 << (int)UnicodeCategory.UppercaseLetter |
-                1 << (int)UnicodeCategory.LowercaseLetter |
-                1 << (int)UnicodeCategory.TitlecaseLetter |
-                1 << (int)UnicodeCategory.ModifierLetter |
-                1 << (int)UnicodeCategory.OtherLetter |
-                1 << (int)UnicodeCategory.NonSpacingMark |
-                1 << (int)UnicodeCategory.DecimalDigitNumber |
-                1 << (int)UnicodeCategory.ConnectorPunctuation;
-        
-            // Bitmap for whether each character 0 through 127 is in [\w]
-            ReadOnlySpan<byte> ascii = new byte[]
-            {
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-            };
-        
             // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+            ReadOnlySpan<byte> ascii = WordCharBitmap;
             int chDiv8 = ch >> 3;
             return (uint)chDiv8 < (uint)ascii.Length ?
                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                 StackPush(ref stack, ref pos, arg0, arg1, arg2);
             }
         }
+        
+        /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+        private const int WordCategoriesMask =
+            1 << (int)UnicodeCategory.UppercaseLetter |
+            1 << (int)UnicodeCategory.LowercaseLetter |
+            1 << (int)UnicodeCategory.TitlecaseLetter |
+            1 << (int)UnicodeCategory.ModifierLetter |
+            1 << (int)UnicodeCategory.OtherLetter |
+            1 << (int)UnicodeCategory.NonSpacingMark |
+            1 << (int)UnicodeCategory.DecimalDigitNumber |
+            1 << (int)UnicodeCategory.ConnectorPunctuation;
+        
+        /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+        private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+            {
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+            };
+        
     }
 }
"[A-z-[dDfFiIoOqQuUwWzZ]]\\d[A-z-[dDfFiIoOqQu ..." (5703 uses)
[GeneratedRegex("[A-z-[dDfFiIoOqQuUwWzZ]]\\d[A-z-[dDfFiIoOqQuU]] *\\d[A-z-[dDfFiIoOqQuU]]\\d\\b", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
              }
              
              // Match if at a word boundary.
-               if (!Utilities.IsBoundary(inputSpan, pos + 6))
+               if (!Utilities.IsPostWordCharBoundary(inputSpan, pos + 6))
              {
                  return false; // The input didn't match.
              }
  /// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
  internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
  
-   /// <summary>Determines whether the specified index is a boundary.</summary>
+   /// <summary>Determines whether the specified index is a boundary word character.</summary>
+   /// <remarks>This is the same as \w plus U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.</remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
-   internal static bool IsBoundary(ReadOnlySpan<char> inputSpan, int index)
+   internal static bool IsBoundaryWordChar(char ch)
  {
-       int indexMinus1 = index - 1;
-       return ((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=
-              ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));
-   
-       static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\u200C' | ch == '\u200D');
+       ReadOnlySpan<byte> ascii = WordCharBitmap;
+       int chDiv8 = ch >> 3;
+       return (uint)chDiv8 < (uint)ascii.Length ?
+           (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
+           ((WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0) || (ch is '‌' or '‍');
  }
  
-   /// <summary>Determines whether the character is part of the [\w] set.</summary>
+   /// <summary>Determines whether the specified index is a boundary.</summary>
+   /// <remarks>This variant is only employed when the previous character has already been validated as a word character.</remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
-   internal static bool IsWordChar(char ch)
-   {
-       // Mask of Unicode categories that combine to form [\w]
-       const int WordCategoriesMask =
-           1 << (int)UnicodeCategory.UppercaseLetter |
-           1 << (int)UnicodeCategory.LowercaseLetter |
-           1 << (int)UnicodeCategory.TitlecaseLetter |
-           1 << (int)UnicodeCategory.ModifierLetter |
-           1 << (int)UnicodeCategory.OtherLetter |
-           1 << (int)UnicodeCategory.NonSpacingMark |
-           1 << (int)UnicodeCategory.DecimalDigitNumber |
-           1 << (int)UnicodeCategory.ConnectorPunctuation;
+   internal static bool IsPostWordCharBoundary(ReadOnlySpan<char> inputSpan, int index) =>
+       ((uint)index >= (uint)inputSpan.Length || !IsBoundaryWordChar(inputSpan[index]));
  
-       // Bitmap for whether each character 0 through 127 is in [\w]
-       ReadOnlySpan<byte> ascii = new byte[]
+   /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+   private const int WordCategoriesMask =
+       1 << (int)UnicodeCategory.UppercaseLetter |
+       1 << (int)UnicodeCategory.LowercaseLetter |
+       1 << (int)UnicodeCategory.TitlecaseLetter |
+       1 << (int)UnicodeCategory.ModifierLetter |
+       1 << (int)UnicodeCategory.OtherLetter |
+       1 << (int)UnicodeCategory.NonSpacingMark |
+       1 << (int)UnicodeCategory.DecimalDigitNumber |
+       1 << (int)UnicodeCategory.ConnectorPunctuation;
+   
+   /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+   private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
      {
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
          0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
      };
  
-       // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
-       int chDiv8 = ch >> 3;
-       return (uint)chDiv8 < (uint)ascii.Length ?
-           (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
-           (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
-   }
  
  /// <summary>Supports searching for characters in or not in "ABCEGHJKLMNPRSTVXY[\\]^_`abceghjklmnprstvxyK".</summary>
  internal static readonly SearchValues<char> s_nonAscii_0DD9414ACADF36B5FCB9FD5EDD16B6170F356585861BFF97C0F99F5B6EB09472 = SearchValues.Create("ABCEGHJKLMNPRSTVXY[\\]^_`abceghjklmnprstvxyK");
"^\\w+([_.-]\\w+)*$" (5006 uses)
[GeneratedRegex("^\\w+([_.-]\\w+)*$", RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture)]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsWordChar(char ch)
         {
-            // Mask of Unicode categories that combine to form [\w]
-            const int WordCategoriesMask =
-                1 << (int)UnicodeCategory.UppercaseLetter |
-                1 << (int)UnicodeCategory.LowercaseLetter |
-                1 << (int)UnicodeCategory.TitlecaseLetter |
-                1 << (int)UnicodeCategory.ModifierLetter |
-                1 << (int)UnicodeCategory.OtherLetter |
-                1 << (int)UnicodeCategory.NonSpacingMark |
-                1 << (int)UnicodeCategory.DecimalDigitNumber |
-                1 << (int)UnicodeCategory.ConnectorPunctuation;
-        
-            // Bitmap for whether each character 0 through 127 is in [\w]
-            ReadOnlySpan<byte> ascii = new byte[]
-            {
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-            };
-        
             // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+            ReadOnlySpan<byte> ascii = WordCharBitmap;
             int chDiv8 = ch >> 3;
             return (uint)chDiv8 < (uint)ascii.Length ?
                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                 StackPush(ref stack, ref pos, arg0, arg1);
             }
         }
+        
+        /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+        private const int WordCategoriesMask =
+            1 << (int)UnicodeCategory.UppercaseLetter |
+            1 << (int)UnicodeCategory.LowercaseLetter |
+            1 << (int)UnicodeCategory.TitlecaseLetter |
+            1 << (int)UnicodeCategory.ModifierLetter |
+            1 << (int)UnicodeCategory.OtherLetter |
+            1 << (int)UnicodeCategory.NonSpacingMark |
+            1 << (int)UnicodeCategory.DecimalDigitNumber |
+            1 << (int)UnicodeCategory.ConnectorPunctuation;
+        
+        /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+        private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+            {
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+            };
+        
     }
 }
"^(\\w*)=(.*?)" (3778 uses)
[GeneratedRegex("^(\\w*)=(.*?)")]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsWordChar(char ch)
         {
-            // Mask of Unicode categories that combine to form [\w]
-            const int WordCategoriesMask =
-                1 << (int)UnicodeCategory.UppercaseLetter |
-                1 << (int)UnicodeCategory.LowercaseLetter |
-                1 << (int)UnicodeCategory.TitlecaseLetter |
-                1 << (int)UnicodeCategory.ModifierLetter |
-                1 << (int)UnicodeCategory.OtherLetter |
-                1 << (int)UnicodeCategory.NonSpacingMark |
-                1 << (int)UnicodeCategory.DecimalDigitNumber |
-                1 << (int)UnicodeCategory.ConnectorPunctuation;
-        
-            // Bitmap for whether each character 0 through 127 is in [\w]
-            ReadOnlySpan<byte> ascii = new byte[]
-            {
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-            };
-        
             // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+            ReadOnlySpan<byte> ascii = WordCharBitmap;
             int chDiv8 = ch >> 3;
             return (uint)chDiv8 < (uint)ascii.Length ?
                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                 (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
         }
+        
+        /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+        private const int WordCategoriesMask =
+            1 << (int)UnicodeCategory.UppercaseLetter |
+            1 << (int)UnicodeCategory.LowercaseLetter |
+            1 << (int)UnicodeCategory.TitlecaseLetter |
+            1 << (int)UnicodeCategory.ModifierLetter |
+            1 << (int)UnicodeCategory.OtherLetter |
+            1 << (int)UnicodeCategory.NonSpacingMark |
+            1 << (int)UnicodeCategory.DecimalDigitNumber |
+            1 << (int)UnicodeCategory.ConnectorPunctuation;
+        
+        /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+        private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+            {
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+            };
+        
     }
 }
"^(\\w+\\.)+\\w+$" (2468 uses)
[GeneratedRegex("^(\\w+\\.)+\\w+$")]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsWordChar(char ch)
         {
-            // Mask of Unicode categories that combine to form [\w]
-            const int WordCategoriesMask =
-                1 << (int)UnicodeCategory.UppercaseLetter |
-                1 << (int)UnicodeCategory.LowercaseLetter |
-                1 << (int)UnicodeCategory.TitlecaseLetter |
-                1 << (int)UnicodeCategory.ModifierLetter |
-                1 << (int)UnicodeCategory.OtherLetter |
-                1 << (int)UnicodeCategory.NonSpacingMark |
-                1 << (int)UnicodeCategory.DecimalDigitNumber |
-                1 << (int)UnicodeCategory.ConnectorPunctuation;
-        
-            // Bitmap for whether each character 0 through 127 is in [\w]
-            ReadOnlySpan<byte> ascii = new byte[]
-            {
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-            };
-        
             // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+            ReadOnlySpan<byte> ascii = WordCharBitmap;
             int chDiv8 = ch >> 3;
             return (uint)chDiv8 < (uint)ascii.Length ?
                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                 StackPush(ref stack, ref pos, arg0, arg1);
             }
         }
+        
+        /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+        private const int WordCategoriesMask =
+            1 << (int)UnicodeCategory.UppercaseLetter |
+            1 << (int)UnicodeCategory.LowercaseLetter |
+            1 << (int)UnicodeCategory.TitlecaseLetter |
+            1 << (int)UnicodeCategory.ModifierLetter |
+            1 << (int)UnicodeCategory.OtherLetter |
+            1 << (int)UnicodeCategory.NonSpacingMark |
+            1 << (int)UnicodeCategory.DecimalDigitNumber |
+            1 << (int)UnicodeCategory.ConnectorPunctuation;
+        
+        /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+        private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+            {
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+            };
+        
     }
 }
"{(?<env>env:)??\\w+(\\s+(\\?\\?)??\\s+\\w+)??}" (2282 uses)
[GeneratedRegex("{(?<env>env:)??\\w+(\\s+(\\?\\?)??\\s+\\w+)??}")]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsWordChar(char ch)
         {
-            // Mask of Unicode categories that combine to form [\w]
-            const int WordCategoriesMask =
-                1 << (int)UnicodeCategory.UppercaseLetter |
-                1 << (int)UnicodeCategory.LowercaseLetter |
-                1 << (int)UnicodeCategory.TitlecaseLetter |
-                1 << (int)UnicodeCategory.ModifierLetter |
-                1 << (int)UnicodeCategory.OtherLetter |
-                1 << (int)UnicodeCategory.NonSpacingMark |
-                1 << (int)UnicodeCategory.DecimalDigitNumber |
-                1 << (int)UnicodeCategory.ConnectorPunctuation;
-        
-            // Bitmap for whether each character 0 through 127 is in [\w]
-            ReadOnlySpan<byte> ascii = new byte[]
-            {
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-            };
-        
             // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+            ReadOnlySpan<byte> ascii = WordCharBitmap;
             int chDiv8 = ch >> 3;
             return (uint)chDiv8 < (uint)ascii.Length ?
                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                 StackPush(ref stack, ref pos, arg0, arg1, arg2);
             }
         }
+        
+        /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+        private const int WordCategoriesMask =
+            1 << (int)UnicodeCategory.UppercaseLetter |
+            1 << (int)UnicodeCategory.LowercaseLetter |
+            1 << (int)UnicodeCategory.TitlecaseLetter |
+            1 << (int)UnicodeCategory.ModifierLetter |
+            1 << (int)UnicodeCategory.OtherLetter |
+            1 << (int)UnicodeCategory.NonSpacingMark |
+            1 << (int)UnicodeCategory.DecimalDigitNumber |
+            1 << (int)UnicodeCategory.ConnectorPunctuation;
+        
+        /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+        private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+            {
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+            };
+        
     }
 }
", Version=\\d+.\\d+.\\d+.\\d+, Culture=\\w+, ..." (2239 uses)
[GeneratedRegex(", Version=\\d+.\\d+.\\d+.\\d+, Culture=\\w+, PublicKeyToken=\\w+")]
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool IsWordChar(char ch)
      {
-           // Mask of Unicode categories that combine to form [\w]
-           const int WordCategoriesMask =
-               1 << (int)UnicodeCategory.UppercaseLetter |
-               1 << (int)UnicodeCategory.LowercaseLetter |
-               1 << (int)UnicodeCategory.TitlecaseLetter |
-               1 << (int)UnicodeCategory.ModifierLetter |
-               1 << (int)UnicodeCategory.OtherLetter |
-               1 << (int)UnicodeCategory.NonSpacingMark |
-               1 << (int)UnicodeCategory.DecimalDigitNumber |
-               1 << (int)UnicodeCategory.ConnectorPunctuation;
-       
-           // Bitmap for whether each character 0 through 127 is in [\w]
-           ReadOnlySpan<byte> ascii = new byte[]
-           {
-               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-               0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-           };
-       
          // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+           ReadOnlySpan<byte> ascii = WordCharBitmap;
          int chDiv8 = ch >> 3;
          return (uint)chDiv8 < (uint)ascii.Length ?
              (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
              (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
      }
      
+       /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+       private const int WordCategoriesMask =
+           1 << (int)UnicodeCategory.UppercaseLetter |
+           1 << (int)UnicodeCategory.LowercaseLetter |
+           1 << (int)UnicodeCategory.TitlecaseLetter |
+           1 << (int)UnicodeCategory.ModifierLetter |
+           1 << (int)UnicodeCategory.OtherLetter |
+           1 << (int)UnicodeCategory.NonSpacingMark |
+           1 << (int)UnicodeCategory.DecimalDigitNumber |
+           1 << (int)UnicodeCategory.ConnectorPunctuation;
+       
+       /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+       private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+           {
+               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+               0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+           };
+       
+       
      /// <summary>Supports searching for the string ", Version=".</summary>
      internal static readonly SearchValues<string> s_indexOfString_F484FBA9DDF61CC32D17E4ED223128BF4D7C62347668A9B369CE2C1E6BBB3513 = SearchValues.Create([", Version="], StringComparison.Ordinal);
  }
"^-+ *BEGIN (?<keyName>\\w+( \\w+)*) PRIVATE ..." (1964 uses)
[GeneratedRegex("^-+ *BEGIN (?<keyName>\\w+( \\w+)*) PRIVATE KEY *-+\\r?\\n(Proc-Type: 4,ENCRYPTED\\r?\\nDEK-Info: (?<cipherName>[A-Z0-9-]+),(?<salt>[A-F0-9]+)\\r?\\n\\r?\\n)?(?<data>([a-zA-Z0-9/+=]{1,80}\\r?\\n)+)-+ *END \\k<keyName> PRIVATE KEY *-+", RegexOptions.Multiline)]
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  internal static bool IsWordChar(char ch)
  {
-       // Mask of Unicode categories that combine to form [\w]
-       const int WordCategoriesMask =
-           1 << (int)UnicodeCategory.UppercaseLetter |
-           1 << (int)UnicodeCategory.LowercaseLetter |
-           1 << (int)UnicodeCategory.TitlecaseLetter |
-           1 << (int)UnicodeCategory.ModifierLetter |
-           1 << (int)UnicodeCategory.OtherLetter |
-           1 << (int)UnicodeCategory.NonSpacingMark |
-           1 << (int)UnicodeCategory.DecimalDigitNumber |
-           1 << (int)UnicodeCategory.ConnectorPunctuation;
-   
-       // Bitmap for whether each character 0 through 127 is in [\w]
-       ReadOnlySpan<byte> ascii = new byte[]
-       {
-           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-           0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-       };
-   
      // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+       ReadOnlySpan<byte> ascii = WordCharBitmap;
      int chDiv8 = ch >> 3;
      return (uint)chDiv8 < (uint)ascii.Length ?
          (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
      }
  }
  
+   /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+   private const int WordCategoriesMask =
+       1 << (int)UnicodeCategory.UppercaseLetter |
+       1 << (int)UnicodeCategory.LowercaseLetter |
+       1 << (int)UnicodeCategory.TitlecaseLetter |
+       1 << (int)UnicodeCategory.ModifierLetter |
+       1 << (int)UnicodeCategory.OtherLetter |
+       1 << (int)UnicodeCategory.NonSpacingMark |
+       1 << (int)UnicodeCategory.DecimalDigitNumber |
+       1 << (int)UnicodeCategory.ConnectorPunctuation;
+   
+   /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+   private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+       {
+           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+           0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+       };
+   
+   
  /// <summary>Supports searching for characters in or not in "0123456789ABCDEF".</summary>
  internal static readonly SearchValues<char> s_asciiHexDigitsUpper = SearchValues.Create("0123456789ABCDEF");
"&(?!#?\\w+;)" (1880 uses)
[GeneratedRegex("&(?!#?\\w+;)")]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsWordChar(char ch)
         {
-            // Mask of Unicode categories that combine to form [\w]
-            const int WordCategoriesMask =
-                1 << (int)UnicodeCategory.UppercaseLetter |
-                1 << (int)UnicodeCategory.LowercaseLetter |
-                1 << (int)UnicodeCategory.TitlecaseLetter |
-                1 << (int)UnicodeCategory.ModifierLetter |
-                1 << (int)UnicodeCategory.OtherLetter |
-                1 << (int)UnicodeCategory.NonSpacingMark |
-                1 << (int)UnicodeCategory.DecimalDigitNumber |
-                1 << (int)UnicodeCategory.ConnectorPunctuation;
-        
-            // Bitmap for whether each character 0 through 127 is in [\w]
-            ReadOnlySpan<byte> ascii = new byte[]
-            {
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-            };
-        
             // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+            ReadOnlySpan<byte> ascii = WordCharBitmap;
             int chDiv8 = ch >> 3;
             return (uint)chDiv8 < (uint)ascii.Length ?
                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                 (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
         }
+        
+        /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+        private const int WordCategoriesMask =
+            1 << (int)UnicodeCategory.UppercaseLetter |
+            1 << (int)UnicodeCategory.LowercaseLetter |
+            1 << (int)UnicodeCategory.TitlecaseLetter |
+            1 << (int)UnicodeCategory.ModifierLetter |
+            1 << (int)UnicodeCategory.OtherLetter |
+            1 << (int)UnicodeCategory.NonSpacingMark |
+            1 << (int)UnicodeCategory.DecimalDigitNumber |
+            1 << (int)UnicodeCategory.ConnectorPunctuation;
+        
+        /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+        private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+            {
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+            };
+        
     }
 }
"\\A\\s*(?<name>\\w+)(\\s*\\((?<arguments>.*) ..." (1751 uses)
[GeneratedRegex("\\A\\s*(?<name>\\w+)(\\s*\\((?<arguments>.*)\\))?\\s*\\Z", RegexOptions.Singleline)]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsWordChar(char ch)
         {
-            // Mask of Unicode categories that combine to form [\w]
-            const int WordCategoriesMask =
-                1 << (int)UnicodeCategory.UppercaseLetter |
-                1 << (int)UnicodeCategory.LowercaseLetter |
-                1 << (int)UnicodeCategory.TitlecaseLetter |
-                1 << (int)UnicodeCategory.ModifierLetter |
-                1 << (int)UnicodeCategory.OtherLetter |
-                1 << (int)UnicodeCategory.NonSpacingMark |
-                1 << (int)UnicodeCategory.DecimalDigitNumber |
-                1 << (int)UnicodeCategory.ConnectorPunctuation;
-        
-            // Bitmap for whether each character 0 through 127 is in [\w]
-            ReadOnlySpan<byte> ascii = new byte[]
-            {
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
-                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
-            };
-        
             // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
+            ReadOnlySpan<byte> ascii = WordCharBitmap;
             int chDiv8 = ch >> 3;
             return (uint)chDiv8 < (uint)ascii.Length ?
                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                 StackPush(ref stack, ref pos, arg0, arg1, arg2);
             }
         }
+        
+        /// <summary>Provides a mask of Unicode categories that combine to form [\w].</summary>
+        private const int WordCategoriesMask =
+            1 << (int)UnicodeCategory.UppercaseLetter |
+            1 << (int)UnicodeCategory.LowercaseLetter |
+            1 << (int)UnicodeCategory.TitlecaseLetter |
+            1 << (int)UnicodeCategory.ModifierLetter |
+            1 << (int)UnicodeCategory.OtherLetter |
+            1 << (int)UnicodeCategory.NonSpacingMark |
+            1 << (int)UnicodeCategory.DecimalDigitNumber |
+            1 << (int)UnicodeCategory.ConnectorPunctuation;
+        
+        /// <summary>Gets a bitmap for whether each character 0 through 127 is in [\w]</summary>
+        private static ReadOnlySpan<byte> WordCharBitmap => new byte[]
+            {
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
+                0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
+            };
+        
     }
 }

For more diff examples, see https://gist.github.com/MihuBot/3dc0c347ab5ededb4c479718266d90f0

JIT assembly changes
Total bytes of base: 54138884
Total bytes of diff: 53799262
Total bytes of delta: -339622 (-0.63 % of base)
Total relative delta: -96.31
    diff is an improvement.
    relative diff is an improvement.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
// Path of the locally cached results file; the archive is only downloaded when this file is absent.
const string JsonPath = "RegexResults-1303.json";
if (!File.Exists(JsonPath))
{
    // Own the HttpClient in a using declaration so its handler and connections are released
    // once the archive has been read (declarations are disposed in reverse order, so the
    // client outlives the stream obtained from it).
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://mihubot.xyz/r/E2rQ5ESA");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // First(...) throws InvalidOperationException if the archive has no "Results.json" entry.
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);

// IncludeFields is enabled so that public fields (not only properties) are populated during deserialization.
// Deserialize returns null for a JSON "null" payload; fail with a descriptive error instead of a later NullReferenceException.
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })
    ?? throw new InvalidDataException($"'{JsonPath}' did not contain an array of regex entries.");
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>Identifies a regex by its pattern text and options, along with an associated count.</summary>
/// <param name="Pattern">The regular expression pattern text.</param>
/// <param name="Options">The <see cref="RegexOptions"/> associated with the pattern.</param>
/// <param name="Count">NOTE(review): presumably the number of known uses of this pattern (the report lists "N uses" per pattern) — confirm against the data source.</param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>
/// One element of the deserialized results array: a known pattern plus the source/diff text recorded for it.
/// </summary>
/// <remarks>
/// NOTE(review): the meanings given below for the source and diff members are inferred from their names and
/// the surrounding report; confirm against the producer of Results.json.
/// </remarks>
sealed class RegexEntry
{
    /// <summary>The pattern, options, and count this entry describes.</summary>
    public required KnownPattern Regex { get; set; }

    /// <summary>Presumably the generated source for the pattern from the main branch — TODO confirm.</summary>
    public required string MainSource { get; set; }

    /// <summary>Presumably the generated source for the pattern from the PR build — TODO confirm.</summary>
    public required string PrSource { get; set; }

    /// <summary>Optional diff text; presumably the complete diff between the two sources. May be null.</summary>
    public string? FullDiff { get; set; }

    /// <summary>Optional diff text; presumably an abbreviated form of <see cref="FullDiff"/>. May be null.</summary>
    public string? ShortDiff { get; set; }

    /// <summary>Optional (name, values) pairs; presumably the char-based SearchValues fields in the generated source. May be null.</summary>
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }

    /// <summary>Optional (values, comparison) pairs; presumably the string-based SearchValues fields in the generated source. May be null.</summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions