Skip to content

[RegexDiff X64] [stephentoub] Auto-atomic for more loops followed by boundaries #1254

@MihuBot

Description

@MihuBot

Job completed in 18 minutes 58 seconds (remote runner delay: 1 minute 15 seconds).
dotnet/runtime#117892
Using arguments: regexdiff

16 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"\\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9] ..." (293 uses)
[GeneratedRegex("\\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b", RegexOptions.CultureInvariant)]
  ///                 ○ Match a character in the set [0-9].<br/>
  ///     ○ Match a sequence of expressions.<br/>
  ///         ○ Match a character in the set [01] greedily, optionally.<br/>
-   ///         ○ Match a character in the set [0-9] greedily at least 1 and at most 2 times.<br/>
+   ///         ○ Match a character in the set [0-9] atomically at least 1 and at most 2 times.<br/>
  /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
                  int alternation_starting_pos1 = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
                  int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
-                   int charloop_starting_pos2 = 0, charloop_ending_pos2 = 0;
                  int loop_iteration = 0;
                  int stackpos = 0;
                  int startingStackpos = 0;
                              CharLoopEnd1:
                          //}
                          
-                           // Match a character in the set [0-9] greedily at least 1 and at most 2 times.
-                           //{
-                               charloop_starting_pos2 = pos;
-                               
+                           // Match a character in the set [0-9] atomically at least 1 and at most 2 times.
+                           {
                              int iteration1 = 0;
                              while (iteration1 < 2 && (uint)iteration1 < (uint)slice.Length && char.IsAsciiDigit(slice[iteration1]))
                              {
                              
                              slice = slice.Slice(iteration1);
                              pos += iteration1;
-                               
-                               charloop_ending_pos2 = pos;
-                               charloop_starting_pos2++;
-                               goto CharLoopEnd2;
-                               
-                               CharLoopBacktrack2:
-                               
-                               if (Utilities.s_hasTimeout)
-                               {
-                                   base.CheckTimeout();
-                               }
-                               
-                               if (charloop_starting_pos2 >= charloop_ending_pos2)
-                               {
-                                   goto CharLoopBacktrack1;
-                               }
-                               pos = --charloop_ending_pos2;
-                               slice = inputSpan.Slice(pos);
-                               
-                               CharLoopEnd2:
-                           //}
+                           }
                          
                          alternation_branch = 1;
                          goto AlternationMatch1;
                          case 0:
                              goto AlternationBranch1;
                          case 1:
-                               goto CharLoopBacktrack2;
+                               goto CharLoopBacktrack1;
                      }
                      
                      AlternationMatch1:;
"\\b[\\w\\d\\.\\-]+\\@[\\w\\d\\.\\-]+\\.[a-z] ..." (267 uses)
[GeneratedRegex("\\b[\\w\\d\\.\\-]+\\@[\\w\\d\\.\\-]+\\.[a-z]{2,6}\\b")]
  /// ○ Match '@'.<br/>
  /// ○ Match a character in the set [-.\w\d] greedily at least once.<br/>
  /// ○ Match '.'.<br/>
-   /// ○ Match a character in the set [a-z] greedily at least 2 and at most 6 times.<br/>
+   /// ○ Match a character in the set [a-z] atomically at least 2 and at most 6 times.<br/>
  /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
                  int matchStart = pos;
                  char ch;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
-                   int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match if at a word boundary.
                      goto CharLoopBacktrack;
                  }
                  
-                   // Match a character in the set [a-z] greedily at least 2 and at most 6 times.
-                   //{
+                   // Match a character in the set [a-z] atomically at least 2 and at most 6 times.
+                   {
                      pos++;
                      slice = inputSpan.Slice(pos);
-                       charloop_starting_pos1 = pos;
-                       
                      int iteration2 = 0;
                      while (iteration2 < 6 && (uint)iteration2 < (uint)slice.Length && char.IsAsciiLetterLower(slice[iteration2]))
                      {
                      
                      slice = slice.Slice(iteration2);
                      pos += iteration2;
-                       
-                       charloop_ending_pos1 = pos;
-                       charloop_starting_pos1 += 2;
-                       goto CharLoopEnd1;
-                       
-                       CharLoopBacktrack1:
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       if (charloop_starting_pos1 >= charloop_ending_pos1)
-                       {
-                           goto CharLoopBacktrack;
-                       }
-                       pos = --charloop_ending_pos1;
-                       slice = inputSpan.Slice(pos);
-                       
-                       CharLoopEnd1:
-                   //}
+                   }
                  
                  // Match if at a word boundary.
                  if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       goto CharLoopBacktrack1;
+                       goto CharLoopBacktrack;
                  }
                  
                  // The input matched.
"\\A\\b[0-9a-f]+\\b\\Z" (178 uses)
[GeneratedRegex("\\A\\b[0-9a-f]+\\b\\Z")]
  /// <code>
  /// ○ Match if at the beginning of the string.<br/>
  /// ○ Match if at a word boundary.<br/>
-   /// ○ Match a character in the set [0-9a-f] greedily at least once.<br/>
+   /// ○ Match a character in the set [0-9a-f] atomically at least once.<br/>
  /// ○ Match if at a word boundary.<br/>
  /// ○ Match if at the end of the string or if before an ending newline.<br/>
  /// </code>
              {
                  int pos = base.runtextpos;
                  int matchStart = pos;
-                   int charloop_starting_pos = 0, charloop_ending_pos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match if at the beginning of the string.
                      return false; // The input didn't match.
                  }
                  
-                   // Match a character in the set [0-9a-f] greedily at least once.
-                   //{
-                       charloop_starting_pos = pos;
-                       
+                   // Match a character in the set [0-9a-f] atomically at least once.
+                   {
                      int iteration = slice.IndexOfAnyExcept(Utilities.s_asciiHexDigitsLower);
                      if (iteration < 0)
                      {
                      
                      slice = slice.Slice(iteration);
                      pos += iteration;
-                       
-                       charloop_ending_pos = pos;
-                       charloop_starting_pos++;
-                       goto CharLoopEnd;
-                       
-                       CharLoopBacktrack:
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       if (charloop_starting_pos >= charloop_ending_pos)
-                       {
-                           return false; // The input didn't match.
-                       }
-                       pos = --charloop_ending_pos;
-                       slice = inputSpan.Slice(pos);
-                       
-                       CharLoopEnd:
-                   //}
+                   }
                  
                  // Match if at a word boundary.
                  if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       goto CharLoopBacktrack;
+                       return false; // The input didn't match.
                  }
                  
                  // Match if at the end of the string or if before an ending newline.
                  if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
                  {
-                       goto CharLoopBacktrack;
+                       return false; // The input didn't match.
                  }
                  
                  // The input matched.
"\\A\\p{Lu}{2,}\\b" (117 uses)
[GeneratedRegex("\\A\\p{Lu}{2,}\\b")]
  /// Explanation:<br/>
  /// <code>
  /// ○ Match if at the beginning of the string.<br/>
-   /// ○ Match a character in the set [\p{Lu}] greedily at least twice.<br/>
+   /// ○ Match a character in the set [\p{Lu}] atomically at least twice.<br/>
  /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
              {
                  int pos = base.runtextpos;
                  int matchStart = pos;
-                   int charloop_starting_pos = 0, charloop_ending_pos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match if at the beginning of the string.
                      return false; // The input didn't match.
                  }
                  
-                   // Match a character in the set [\p{Lu}] greedily at least twice.
-                   //{
-                       charloop_starting_pos = pos;
-                       
+                   // Match a character in the set [\p{Lu}] atomically at least twice.
+                   {
                      int iteration = 0;
                      while ((uint)iteration < (uint)slice.Length && char.IsUpper(slice[iteration]))
                      {
                      
                      slice = slice.Slice(iteration);
                      pos += iteration;
-                       
-                       charloop_ending_pos = pos;
-                       charloop_starting_pos += 2;
-                       goto CharLoopEnd;
-                       
-                       CharLoopBacktrack:
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       if (charloop_starting_pos >= charloop_ending_pos)
-                       {
-                           return false; // The input didn't match.
-                       }
-                       pos = --charloop_ending_pos;
-                       slice = inputSpan.Slice(pos);
-                       
-                       CharLoopEnd:
-                   //}
+                   }
                  
                  // Match if at a word boundary.
                  if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       goto CharLoopBacktrack;
+                       return false; // The input didn't match.
                  }
                  
                  // The input matched.
"(\\d+)|(\\b([MDCLXVI]+)\\b)" (112 uses)
[GeneratedRegex("(\\d+)|(\\b([MDCLXVI]+)\\b)", RegexOptions.IgnoreCase)]
  ///     ○ 2nd capture group.<br/>
  ///         ○ Match if at a word boundary.<br/>
  ///         ○ 3rd capture group.<br/>
-   ///             ○ Match a character in the set [CDILMVXcdilmvx] greedily at least once.<br/>
+   ///             ○ Match a character in the set [CDILMVXcdilmvx] atomically at least once.<br/>
  ///         ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
                  int capture_starting_pos = 0;
                  int capture_starting_pos1 = 0;
                  int capture_starting_pos2 = 0;
-                   int charloop_capture_pos = 0;
-                   int charloop_starting_pos = 0, charloop_ending_pos = 0;
-                   int stackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
-                   // Atomic group.
+                   // Match with 2 alternative expressions, atomically.
                  {
-                       int atomic_stackpos = stackpos;
+                       int alternation_starting_pos = pos;
+                       int alternation_starting_capturepos = base.Crawlpos();
                      
-                       // Match with 2 alternative expressions, atomically.
-                       //{
-                           int alternation_starting_pos = pos;
-                           int alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
+                       // Branch 0
+                       {
+                           // 1st capture group.
                          {
-                               // 1st capture group.
+                               capture_starting_pos = pos;
+                               
+                               // Match a Unicode digit atomically at least once.
                              {
-                                   capture_starting_pos = pos;
-                                   
-                                   // Match a Unicode digit atomically at least once.
+                                   int iteration = 0;
+                                   while ((uint)iteration < (uint)slice.Length && char.IsDigit(slice[iteration]))
                                  {
-                                       int iteration = 0;
-                                       while ((uint)iteration < (uint)slice.Length && char.IsDigit(slice[iteration]))
-                                       {
-                                           iteration++;
-                                       }
-                                       
-                                       if (iteration == 0)
-                                       {
-                                           goto AlternationBranch;
-                                       }
-                                       
-                                       slice = slice.Slice(iteration);
-                                       pos += iteration;
+                                       iteration++;
                                  }
                                  
-                                   base.Capture(1, capture_starting_pos, pos);
+                                   if (iteration == 0)
+                                   {
+                                       goto AlternationBranch;
+                                   }
+                                   
+                                   slice = slice.Slice(iteration);
+                                   pos += iteration;
                              }
                              
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
+                               base.Capture(1, capture_starting_pos, pos);
                          }
                          
-                           // Branch 1
-                           {
-                               // 2nd capture group.
-                               //{
-                                   capture_starting_pos1 = pos;
-                                   
-                                   // Match if at a word boundary.
-                                   if (!Utilities.IsBoundary(inputSpan, pos))
-                                   {
-                                       UncaptureUntil(0);
-                                       return false; // The input didn't match.
-                                   }
-                                   
-                                   // 3rd capture group.
-                                   //{
-                                       capture_starting_pos2 = pos;
-                                       
-                                       // Match a character in the set [CDILMVXcdilmvx] greedily at least once.
-                                       //{
-                                           charloop_starting_pos = pos;
-                                           
-                                           int iteration1 = slice.IndexOfAnyExcept(Utilities.s_ascii_1832400118324001);
-                                           if (iteration1 < 0)
-                                           {
-                                               iteration1 = slice.Length;
-                                           }
-                                           
-                                           if (iteration1 == 0)
-                                           {
-                                               UncaptureUntil(0);
-                                               return false; // The input didn't match.
-                                           }
-                                           
-                                           slice = slice.Slice(iteration1);
-                                           pos += iteration1;
-                                           
-                                           charloop_ending_pos = pos;
-                                           charloop_starting_pos++;
-                                           goto CharLoopEnd;
-                                           
-                                           CharLoopBacktrack:
-                                           UncaptureUntil(charloop_capture_pos);
-                                           
-                                           if (Utilities.s_hasTimeout)
-                                           {
-                                               base.CheckTimeout();
-                                           }
-                                           
-                                           if (charloop_starting_pos >= charloop_ending_pos)
-                                           {
-                                               UncaptureUntil(0);
-                                               return false; // The input didn't match.
-                                           }
-                                           pos = --charloop_ending_pos;
-                                           slice = inputSpan.Slice(pos);
-                                           
-                                           CharLoopEnd:
-                                           charloop_capture_pos = base.Crawlpos();
-                                       //}
-                                       
-                                       base.Capture(3, capture_starting_pos2, pos);
-                                       
-                                       goto CaptureSkipBacktrack;
-                                       
-                                       CaptureBacktrack:
-                                       goto CharLoopBacktrack;
-                                       
-                                       CaptureSkipBacktrack:;
-                                   //}
-                                   
-                                   // Match if at a word boundary.
-                                   if (!Utilities.IsBoundary(inputSpan, pos))
-                                   {
-                                       goto CaptureBacktrack;
-                                   }
-                                   
-                                   base.Capture(2, capture_starting_pos1, pos);
-                               //}
-                               
-                           }
+                           goto AlternationMatch;
                          
-                           AlternationMatch:;
-                       //}
+                           AlternationBranch:
+                           pos = alternation_starting_pos;
+                           slice = inputSpan.Slice(pos);
+                           UncaptureUntil(alternation_starting_capturepos);
+                       }
                      
-                       stackpos = atomic_stackpos;
+                       // Branch 1
+                       {
+                           // 2nd capture group.
+                           {
+                               capture_starting_pos1 = pos;
+                               
+                               // Match if at a word boundary.
+                               if (!Utilities.IsBoundary(inputSpan, pos))
+                               {
+                                   UncaptureUntil(0);
+                                   return false; // The input didn't match.
+                               }
+                               
+                               // 3rd capture group.
+                               {
+                                   capture_starting_pos2 = pos;
+                                   
+                                   // Match a character in the set [CDILMVXcdilmvx] atomically at least once.
+                                   {
+                                       int iteration1 = slice.IndexOfAnyExcept(Utilities.s_ascii_1832400118324001);
+                                       if (iteration1 < 0)
+                                       {
+                                           iteration1 = slice.Length;
+                                       }
+                                       
+                                       if (iteration1 == 0)
+                                       {
+                                           UncaptureUntil(0);
+                                           return false; // The input didn't match.
+                                       }
+                                       
+                                       slice = slice.Slice(iteration1);
+                                       pos += iteration1;
+                                   }
+                                   
+                                   base.Capture(3, capture_starting_pos2, pos);
+                               }
+                               
+                               // Match if at a word boundary.
+                               if (!Utilities.IsBoundary(inputSpan, pos))
+                               {
+                                   UncaptureUntil(0);
+                                   return false; // The input didn't match.
+                               }
+                               
+                               base.Capture(2, capture_starting_pos1, pos);
+                           }
+                           
+                       }
+                       
+                       AlternationMatch:;
                  }
                  
                  // The input matched.
"(\\b(?:(?:2(?:[0-4][0-9]|5[0-5])|[0-1]?[0-9] ..." (85 uses)
[GeneratedRegex("(\\b(?:(?:2(?:[0-4][0-9]|5[0-5])|[0-1]?[0-9]?[0-9])\\.){3}(?:(?:2([0-4][0-9]|5[0-5])|[0-1]?[0-9]?[0-9]))\\b)")]
  ///                         ○ Match a character in the set [0-5].<br/>
  ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [01] greedily, optionally.<br/>
-   ///             ○ Match a character in the set [0-9] greedily at least 1 and at most 2 times.<br/>
+   ///             ○ Match a character in the set [0-9] atomically at least 1 and at most 2 times.<br/>
  ///     ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
                  int capture_starting_pos = 0;
                  int capture_starting_pos1 = 0;
                  int charloop_capture_pos = 0;
-                   int charloop_capture_pos1 = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
                  int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
-                   int charloop_starting_pos2 = 0, charloop_ending_pos2 = 0;
                  int loop_iteration = 0;
                  int stackpos = 0;
                  int startingStackpos = 0;
                                  charloop_capture_pos = base.Crawlpos();
                              //}
                              
-                               // Match a character in the set [0-9] greedily at least 1 and at most 2 times.
-                               //{
-                                   charloop_starting_pos2 = pos;
-                                   
+                               // Match a character in the set [0-9] atomically at least 1 and at most 2 times.
+                               {
                                  int iteration1 = 0;
                                  while (iteration1 < 2 && (uint)iteration1 < (uint)slice.Length && char.IsAsciiDigit(slice[iteration1]))
                                  {
                                  
                                  slice = slice.Slice(iteration1);
                                  pos += iteration1;
-                                   
-                                   charloop_ending_pos2 = pos;
-                                   charloop_starting_pos2++;
-                                   goto CharLoopEnd2;
-                                   
-                                   CharLoopBacktrack2:
-                                   UncaptureUntil(charloop_capture_pos1);
-                                   
-                                   if (Utilities.s_hasTimeout)
-                                   {
-                                       base.CheckTimeout();
-                                   }
-                                   
-                                   if (charloop_starting_pos2 >= charloop_ending_pos2)
-                                   {
-                                       goto CharLoopBacktrack1;
-                                   }
-                                   pos = --charloop_ending_pos2;
-                                   slice = inputSpan.Slice(pos);
-                                   
-                                   CharLoopEnd2:
-                                   charloop_capture_pos1 = base.Crawlpos();
-                               //}
+                               }
                              
                              alternation_branch = 1;
                              goto AlternationMatch1;
                              case 0:
                                  goto AlternationBranch1;
                              case 1:
-                                   goto CharLoopBacktrack2;
+                                   goto CharLoopBacktrack1;
                          }
                          
                          AlternationMatch1:;

For more diff examples, see https://gist.github.com/MihuBot/f53211ef2653356a070d40dbd2e82145

Total bytes of base: 54709209
Total bytes of diff: 54706120
Total bytes of delta: -3089 (-0.01 % of base)
Total relative delta: -1.74
    diff is an improvement.
    relative diff is an improvement.

For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
// Loads the regex diff results for this job.
// The results archive is downloaded and "Results.json" extracted only when the
// JSON file is not already present at JsonPath; later runs reuse the local copy.
const string JsonPath = "RegexResults-1254.json";
if (!File.Exists(JsonPath))
{
    // Own the HttpClient explicitly so its handler and connection are released
    // as soon as the one-off download completes (disposed last, after the
    // archive and the response stream that depend on it).
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://mihubot.xyz/r/E2Lh8F_A");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // First() throws InvalidOperationException if the archive has no "Results.json" entry.
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);

// IncludeFields is required because RegexEntry exposes value tuples, whose elements are fields.
// Deserialize returns null for a JSON 'null' document; fail with a clear message instead of
// a NullReferenceException on the first use of 'entries'.
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })
    ?? throw new InvalidDataException($"'{JsonPath}' did not deserialize to an array of regex entries.");
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>
/// Identifies one regex from the results file: a pattern string paired with its options.
/// </summary>
/// <param name="Pattern">The regular expression pattern string.</param>
/// <param name="Options">The <see cref="RegexOptions"/> associated with the pattern.</param>
/// <param name="Count">
/// NOTE(review): presumably the number of known uses of this pattern (compare the
/// "N uses" figures in the report) — confirm against the producer of the JSON.
/// </param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>
/// One element of the array deserialized from the results JSON file.
/// Member names form the JSON contract, so they must match the file's property names.
/// </summary>
sealed class RegexEntry
{
    /// <summary>The pattern/options pair this entry describes.</summary>
    public required KnownPattern Regex { get; set; }

    /// <summary>
    /// NOTE(review): presumably the source generated for this regex from the base (main) branch — confirm.
    /// </summary>
    public required string MainSource { get; set; }

    /// <summary>
    /// NOTE(review): presumably the source generated for this regex with the PR applied — confirm.
    /// </summary>
    public required string PrSource { get; set; }

    /// <summary>Optional; may be <see langword="null"/>. Presumably the complete diff between the two sources — confirm.</summary>
    public string? FullDiff { get; set; }

    /// <summary>Optional; may be <see langword="null"/>. Presumably an abbreviated form of the diff — confirm.</summary>
    public string? ShortDiff { get; set; }

    /// <summary>
    /// Optional list of (name, values) pairs. The tuple elements are fields, so the
    /// serializer must be configured with <c>IncludeFields = true</c> to populate them.
    /// </summary>
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }

    /// <summary>
    /// Optional list of (values, comparison type) pairs. The tuple elements are fields, so the
    /// serializer must be configured with <c>IncludeFields = true</c> to populate them.
    /// </summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions