Skip to content

[RegexDiff X64] [stephentoub] Support auto-atomicity on {lazy} loops #1262

@MihuBot

Description

@MihuBot

Job completed in 19 minutes 13 seconds (remote runner delay: 1 minute 21 seconds).
dotnet/runtime#117943
Using arguments: regexdiff

60 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"^-+ *BEGIN (?<keyName>\\w+( \\w+)*) PRIVATE ..." (1964 uses)
[GeneratedRegex("^-+ *BEGIN (?<keyName>\\w+( \\w+)*) PRIVATE KEY *-+\\r?\\n(Proc-Type: 4,ENCRYPTED\\r?\\nDEK-Info: (?<cipherName>[A-Z0-9-]+),(?<salt>[A-F0-9]+)\\r?\\n\\r?\\n)?(?<data>([a-zA-Z0-9/+=]{1,80}\\r?\\n)+)-+ *END \\k<keyName> PRIVATE KEY *-+", RegexOptions.Multiline)]
  ///         ○ Match '\r' atomically, optionally.<br/>
  ///         ○ Match '\n'.<br/>
  /// ○ "data" capture group.<br/>
-   ///     ○ Loop greedily at least once.<br/>
+   ///     ○ Loop greedily and atomically at least once.<br/>
  ///         ○ 3rd capture group.<br/>
  ///             ○ Match a character in the set [+/-9=A-Za-z] atomically at least 1 and at most 80 times.<br/>
  ///             ○ Match '\r' atomically, optionally.<br/>
                  int loop_iteration2 = 0;
                  int matchLength = 0;
                  int stackpos = 0;
+                   int startingStackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match if at the beginning of a line.
                  //}
                  
                  // "data" capture group.
-                   //{
+                   {
                      capture_starting_pos5 = pos;
                      
-                       // Loop greedily at least once.
-                       //{
+                       // Loop greedily and atomically at least once.
+                       {
+                           startingStackpos = stackpos;
                          loop_iteration2 = 0;
                          
                          LoopBody2:
                              goto LoopIterationNoMatch1;
                          }
                          
-                           LoopEnd2:;
-                       //}
+                           LoopEnd2:
+                           stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
+                       }
                      
                      base.Capture(7, capture_starting_pos5, pos);
-                       
-                       goto CaptureSkipBacktrack1;
-                       
-                       CaptureBacktrack1:
-                       goto LoopIterationNoMatch2;
-                       
-                       CaptureSkipBacktrack1:;
-                   //}
+                   }
                  
                  // Match '-' atomically at least once.
                  {
                      
                      if (iteration9 == 0)
                      {
-                           goto CaptureBacktrack1;
+                           goto LoopIterationNoMatch1;
                      }
                      
                      slice = slice.Slice(iteration9);
                  // Match the string "END ".
                  if (!slice.StartsWith("END "))
                  {
-                       goto CaptureBacktrack1;
+                       goto LoopIterationNoMatch1;
                  }
                  
                  // Match the same text as matched by the "keyName" capture group.
                      // If the "keyName" capture group hasn't matched, the backreference doesn't match.
                      if (!base.IsMatched(4))
                      {
-                           goto CaptureBacktrack1;
+                           goto LoopIterationNoMatch1;
                      }
                      
                      // Get the captured text.  If it doesn't match at the current position, the backreference doesn't match.
                      if (slice.Length < matchLength || 
                          !inputSpan.Slice(base.MatchIndex(4), matchLength).SequenceEqual(slice.Slice(0, matchLength)))
                      {
-                           goto CaptureBacktrack1;
+                           goto LoopIterationNoMatch1;
                      }
                      
                      pos += matchLength;
                  // Match the string " PRIVATE KEY".
                  if (!slice.StartsWith(" PRIVATE KEY"))
                  {
-                       goto CaptureBacktrack1;
+                       goto LoopIterationNoMatch1;
                  }
                  
                  // Match ' ' atomically any number of times.
                      
                      if (iteration12 == 0)
                      {
-                           goto CaptureBacktrack1;
+                           goto LoopIterationNoMatch1;
                      }
                      
                      slice = slice.Slice(iteration12);
"^([\\w\\.\\-]+)@([\\w\\-]+)((\\.(\\w){2,3})+)$" (550 uses)
[GeneratedRegex("^([\\w\\.\\-]+)@([\\w\\-]+)((\\.(\\w){2,3})+)$")]
  /// ○ 2nd capture group.<br/>
  ///     ○ Match a character in the set [\-\w] atomically at least once.<br/>
  /// ○ 3rd capture group.<br/>
-   ///     ○ Loop greedily at least once.<br/>
+   ///     ○ Loop greedily and atomically at least once.<br/>
  ///         ○ 4th capture group.<br/>
  ///             ○ Match '.'.<br/>
  ///             ○ Loop greedily at least 2 and at most 3 times.<br/>
                  int loop_iteration1 = 0;
                  int stackpos = 0;
                  int startingStackpos = 0;
+                   int startingStackpos1 = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match if at the beginning of the string.
                  }
                  
                  // 3rd capture group.
-                   //{
+                   {
                      capture_starting_pos2 = pos;
                      
-                       // Loop greedily at least once.
-                       //{
-                           loop_iteration = 0;
+                       // Atomic group.
+                       {
+                           int atomic_stackpos = stackpos;
                          
-                           LoopBody:
-                           Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), pos);
-                           
-                           loop_iteration++;
-                           
-                           // 4th capture group.
+                           // Loop greedily and atomically at least once.
                          //{
-                               int capture_starting_pos3 = pos;
+                               startingStackpos = stackpos;
+                               loop_iteration = 0;
                              
-                               // Match '.'.
-                               if (slice.IsEmpty || slice[0] != '.')
-                               {
-                                   goto LoopIterationNoMatch;
-                               }
+                               LoopBody:
+                               Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), pos);
                              
-                               // Loop greedily at least 2 and at most 3 times.
+                               loop_iteration++;
+                               
+                               // 4th capture group.
                              //{
-                                   pos++;
-                                   slice = inputSpan.Slice(pos);
-                                   startingStackpos = stackpos;
-                                   loop_iteration1 = 0;
+                                   int capture_starting_pos3 = pos;
                                  
-                                   LoopBody1:
-                                   Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), pos);
-                                   
-                                   loop_iteration1++;
-                                   
-                                   // 5th capture group.
+                                   // Match '.'.
+                                   if (slice.IsEmpty || slice[0] != '.')
                                  {
-                                       int capture_starting_pos4 = pos;
-                                       
-                                       // Match a word character.
-                                       if (slice.IsEmpty || !Utilities.IsWordChar(slice[0]))
-                                       {
-                                           goto LoopIterationNoMatch1;
-                                       }
-                                       
+                                       goto LoopIterationNoMatch;
+                                   }
+                                   
+                                   // Loop greedily at least 2 and at most 3 times.
+                                   //{
                                      pos++;
                                      slice = inputSpan.Slice(pos);
-                                       base.Capture(5, capture_starting_pos4, pos);
-                                   }
-                                   
-                                   
-                                   // The loop has an upper bound of 3. Continue iterating greedily if it hasn't yet been reached.
-                                   if (loop_iteration1 < 3)
-                                   {
-                                       goto LoopBody1;
-                                   }
-                                   goto LoopEnd1;
-                                   
-                                   // The loop iteration failed. Put state back to the way it was before the iteration.
-                                   LoopIterationNoMatch1:
-                                   if (--loop_iteration1 < 0)
-                                   {
-                                       // Unable to match the remainder of the expression after exhausting the loop.
-                                       goto LoopIterationNoMatch;
-                                   }
-                                   pos = base.runstack![--stackpos];
-                                   UncaptureUntil(base.runstack![--stackpos]);
-                                   slice = inputSpan.Slice(pos);
-                                   if (loop_iteration1 < 2)
-                                   {
-                                       // All possible iterations have matched, but it's below the required minimum of 2. Fail the loop.
-                                       if (loop_iteration1 != 0)
+                                       startingStackpos1 = stackpos;
+                                       loop_iteration1 = 0;
+                                       
+                                       LoopBody1:
+                                       Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), pos);
+                                       
+                                       loop_iteration1++;
+                                       
+                                       // 5th capture group.
                                      {
-                                           // Ensure any stale backtracking state is removed.
-                                           stackpos = startingStackpos;
+                                           int capture_starting_pos4 = pos;
+                                           
+                                           // Match a word character.
+                                           if (slice.IsEmpty || !Utilities.IsWordChar(slice[0]))
+                                           {
+                                               goto LoopIterationNoMatch1;
+                                           }
+                                           
+                                           pos++;
+                                           slice = inputSpan.Slice(pos);
+                                           base.Capture(5, capture_starting_pos4, pos);
                                      }
-                                       goto LoopIterationNoMatch;
-                                   }
+                                       
+                                       
+                                       // The loop has an upper bound of 3. Continue iterating greedily if it hasn't yet been reached.
+                                       if (loop_iteration1 < 3)
+                                       {
+                                           goto LoopBody1;
+                                       }
+                                       goto LoopEnd1;
+                                       
+                                       // The loop iteration failed. Put state back to the way it was before the iteration.
+                                       LoopIterationNoMatch1:
+                                       if (--loop_iteration1 < 0)
+                                       {
+                                           // Unable to match the remainder of the expression after exhausting the loop.
+                                           goto LoopIterationNoMatch;
+                                       }
+                                       pos = base.runstack![--stackpos];
+                                       UncaptureUntil(base.runstack![--stackpos]);
+                                       slice = inputSpan.Slice(pos);
+                                       if (loop_iteration1 < 2)
+                                       {
+                                           // All possible iterations have matched, but it's below the required minimum of 2. Fail the loop.
+                                           if (loop_iteration1 != 0)
+                                           {
+                                               // Ensure any stale backtracking state is removed.
+                                               stackpos = startingStackpos1;
+                                           }
+                                           goto LoopIterationNoMatch;
+                                       }
+                                       
+                                       LoopEnd1:
+                                       
+                                       Utilities.StackPush(ref base.runstack!, ref stackpos, startingStackpos1, loop_iteration1);
+                                       goto LoopSkipBacktrack;
+                                       
+                                       LoopBacktrack:
+                                       Utilities.StackPop(base.runstack!, ref stackpos, out loop_iteration1, out startingStackpos1);
+                                       if (Utilities.s_hasTimeout)
+                                       {
+                                           base.CheckTimeout();
+                                       }
+                                       
+                                       goto LoopIterationNoMatch1;
+                                       
+                                       LoopSkipBacktrack:;
+                                   //}
                                  
-                                   LoopEnd1:
+                                   base.Capture(4, capture_starting_pos3, pos);
                                  
-                                   Utilities.StackPush(ref base.runstack!, ref stackpos, startingStackpos, loop_iteration1);
-                                   goto LoopSkipBacktrack;
+                                   Utilities.StackPush(ref base.runstack!, ref stackpos, capture_starting_pos3);
+                                   goto CaptureSkipBacktrack;
                                  
-                                   LoopBacktrack:
-                                   Utilities.StackPop(base.runstack!, ref stackpos, out loop_iteration1, out startingStackpos);
-                                   if (Utilities.s_hasTimeout)
-                                   {
-                                       base.CheckTimeout();
-                                   }
+                                   CaptureBacktrack:
+                                   capture_starting_pos3 = base.runstack![--stackpos];
+                                   goto LoopBacktrack;
                                  
-                                   goto LoopIterationNoMatch1;
-                                   
-                                   LoopSkipBacktrack:;
+                                   CaptureSkipBacktrack:;
                              //}
                              
-                               base.Capture(4, capture_starting_pos3, pos);
                              
-                               Utilities.StackPush(ref base.runstack!, ref stackpos, capture_starting_pos3);
-                               goto CaptureSkipBacktrack;
+                               // The loop has no upper bound. Continue iterating greedily.
+                               goto LoopBody;
                              
-                               CaptureBacktrack:
-                               capture_starting_pos3 = base.runstack![--stackpos];
-                               goto LoopBacktrack;
+                               // The loop iteration failed. Put state back to the way it was before the iteration.
+                               LoopIterationNoMatch:
+                               if (--loop_iteration < 0)
+                               {
+                                   // Unable to match the remainder of the expression after exhausting the loop.
+                                   UncaptureUntil(0);
+                                   return false; // The input didn't match.
+                               }
+                               pos = base.runstack![--stackpos];
+                               UncaptureUntil(base.runstack![--stackpos]);
+                               slice = inputSpan.Slice(pos);
+                               if (loop_iteration == 0)
+                               {
+                                   // No iterations have been matched to backtrack into. Fail the loop.
+                                   UncaptureUntil(0);
+                                   return false; // The input didn't match.
+                               }
                              
-                               CaptureSkipBacktrack:;
+                               LoopEnd:
+                               stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
                          //}
                          
-                           
-                           // The loop has no upper bound. Continue iterating greedily.
-                           goto LoopBody;
-                           
-                           // The loop iteration failed. Put state back to the way it was before the iteration.
-                           LoopIterationNoMatch:
-                           if (--loop_iteration < 0)
-                           {
-                               // Unable to match the remainder of the expression after exhausting the loop.
-                               UncaptureUntil(0);
-                               return false; // The input didn't match.
-                           }
-                           pos = base.runstack![--stackpos];
-                           UncaptureUntil(base.runstack![--stackpos]);
-                           slice = inputSpan.Slice(pos);
-                           if (loop_iteration == 0)
-                           {
-                               // No iterations have been matched to backtrack into. Fail the loop.
-                               UncaptureUntil(0);
-                               return false; // The input didn't match.
-                           }
-                           
-                           goto LoopEnd;
-                           
-                           LoopBacktrack1:
-                           if (Utilities.s_hasTimeout)
-                           {
-                               base.CheckTimeout();
-                           }
-                           
-                           if (loop_iteration == 0)
-                           {
-                               // No iterations of the loop remain to backtrack into. Fail the loop.
-                               UncaptureUntil(0);
-                               return false; // The input didn't match.
-                           }
-                           goto CaptureBacktrack;
-                           LoopEnd:;
-                       //}
+                           stackpos = atomic_stackpos;
+                       }
                      
                      base.Capture(3, capture_starting_pos2, pos);
-                       
-                       goto CaptureSkipBacktrack1;
-                       
-                       CaptureBacktrack1:
-                       goto LoopBacktrack1;
-                       
-                       CaptureSkipBacktrack1:;
-                   //}
+                   }
                  
                  // Match if at the end of the string or if before an ending newline.
                  if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
                  {
-                       goto CaptureBacktrack1;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // The input matched.
"^-+ *BEGIN (?<keyName>\\w+( \\w+)*) PRIVATE ..." (524 uses)
[GeneratedRegex("^-+ *BEGIN (?<keyName>\\w+( \\w+)*) PRIVATE KEY *-+\\r?\\n((Proc-Type: 4,ENCRYPTED\\r?\\nDEK-Info: (?<cipherName>[A-Z0-9-]+),(?<salt>[A-F0-9]+)\\r?\\n\\r?\\n)|(Comment: \"?[^\\r\\n]*\"?\\r?\\n))?(?<data>([a-zA-Z0-9/+=]{1,80}\\r?\\n)+)-+ *END \\k<keyName> PRIVATE KEY *-+", RegexOptions.Multiline)]
  ///                 ○ Match '\r' atomically, optionally.<br/>
  ///                 ○ Match '\n'.<br/>
  /// ○ "data" capture group.<br/>
-   ///     ○ Loop greedily at least once.<br/>
+   ///     ○ Loop greedily and atomically at least once.<br/>
  ///         ○ 5th capture group.<br/>
  ///             ○ Match a character in the set [+/-9=A-Za-z] atomically at least 1 and at most 80 times.<br/>
  ///             ○ Match '\r' atomically, optionally.<br/>
                  int loop_iteration2 = 0;
                  int matchLength = 0;
                  int stackpos = 0;
+                   int startingStackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match if at the beginning of a line.
                  //}
                  
                  // "data" capture group.
-                   //{
+                   {
                      capture_starting_pos7 = pos;
                      
-                       // Loop greedily at least once.
-                       //{
+                       // Loop greedily and atomically at least once.
+                       {
+                           startingStackpos = stackpos;
                          loop_iteration2 = 0;
                          
                          LoopBody2:
                              goto LoopBacktrack;
                          }
                          
-                           LoopEnd2:;
-                       //}
+                           LoopEnd2:
+                           stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
+                       }
                      
                      base.Capture(9, capture_starting_pos7, pos);
-                       
-                       goto CaptureSkipBacktrack3;
-                       
-                       CaptureBacktrack3:
-                       goto LoopIterationNoMatch2;
-                       
-                       CaptureSkipBacktrack3:;
-                   //}
+                   }
                  
                  // Match '-' atomically at least once.
                  {
                      
                      if (iteration10 == 0)
                      {
-                           goto CaptureBacktrack3;
+                           goto LoopBacktrack;
                      }
                      
                      slice = slice.Slice(iteration10);
                  // Match the string "END ".
                  if (!slice.StartsWith("END "))
                  {
-                       goto CaptureBacktrack3;
+                       goto LoopBacktrack;
                  }
                  
                  // Match the same text as matched by the "keyName" capture group.
                      // If the "keyName" capture group hasn't matched, the backreference doesn't match.
                      if (!base.IsMatched(6))
                      {
-                           goto CaptureBacktrack3;
+                           goto LoopBacktrack;
                      }
                      
                      // Get the captured text.  If it doesn't match at the current position, the backreference doesn't match.
                      if (slice.Length < matchLength || 
                          !inputSpan.Slice(base.MatchIndex(6), matchLength).SequenceEqual(slice.Slice(0, matchLength)))
                      {
-                           goto CaptureBacktrack3;
+                           goto LoopBacktrack;
                      }
                      
                      pos += matchLength;
                  // Match the string " PRIVATE KEY".
                  if (!slice.StartsWith(" PRIVATE KEY"))
                  {
-                       goto CaptureBacktrack3;
+                       goto LoopBacktrack;
                  }
                  
                  // Match ' ' atomically any number of times.
                      
                      if (iteration13 == 0)
                      {
-                           goto CaptureBacktrack3;
+                           goto LoopBacktrack;
                      }
                      
                      slice = slice.Slice(iteration13);
"(<br>)+$" (119 uses)
[GeneratedRegex("(<br>)+$", RegexOptions.IgnoreCase | RegexOptions.Multiline)]
  /// <code>RegexOptions.IgnoreCase | RegexOptions.Multiline</code><br/>
  /// Explanation:<br/>
  /// <code>
-   /// ○ Loop greedily at least once.<br/>
+   /// ○ Loop greedily and atomically at least once.<br/>
  ///     ○ 1st capture group.<br/>
  ///         ○ Match '&lt;'.<br/>
  ///         ○ Match a character in the set [Bb].<br/>
                  int matchStart = pos;
                  int loop_iteration = 0;
                  int stackpos = 0;
+                   int startingStackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
-                   // Loop greedily at least once.
-                   //{
+                   // Loop greedily and atomically at least once.
+                   {
+                       startingStackpos = stackpos;
                      loop_iteration = 0;
                      
                      LoopBody:
                          return false; // The input didn't match.
                      }
                      
-                       LoopEnd:;
-                   //}
+                       LoopEnd:
+                       stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
+                   }
                  
                  // Match if at the end of a line.
                  if ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n')
                  {
-                       goto LoopIterationNoMatch;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // The input matched.
"\\s*(<\\s*A\\s*>)+\\s*(<\\s*/\\s*A\\s*>)+\\s*" (118 uses)
[GeneratedRegex("\\s*(<\\s*A\\s*>)+\\s*(<\\s*/\\s*A\\s*>)+\\s*", RegexOptions.IgnoreCase)]
  ///         ○ Match a whitespace character atomically any number of times.<br/>
  ///         ○ Match '&gt;'.<br/>
  /// ○ Match a whitespace character atomically any number of times.<br/>
-   /// ○ Loop greedily at least once.<br/>
+   /// ○ Loop greedily and atomically at least once.<br/>
  ///     ○ 2nd capture group.<br/>
  ///         ○ Match '&lt;'.<br/>
  ///         ○ Match a whitespace character atomically any number of times.<br/>
                  int loop_iteration = 0;
                  int loop_iteration1 = 0;
                  int stackpos = 0;
+                   int startingStackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Skip loop already matched in TryFindNextPossibleStartingPosition.
                      pos += iteration2;
                  }
                  
-                   // Loop greedily at least once.
-                   //{
+                   // Loop greedily and atomically at least once.
+                   {
+                       startingStackpos = stackpos;
                      loop_iteration1 = 0;
                      
                      LoopBody1:
                          goto LoopIterationNoMatch;
                      }
                      
-                       LoopEnd1:;
-                   //}
+                       LoopEnd1:
+                       stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
+                   }
                  
                  // Match a whitespace character atomically any number of times.
                  {

For more diff examples, see https://gist.github.com/MihuBot/df46ebc7056909eecf97b280d9aade04

Total bytes of base: 54706120
Total bytes of diff: 54705424
Total bytes of delta: -696 (-0.00 % of base)
Total relative delta: -0.27
    diff is an improvement.
    relative diff is an improvement.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
const string JsonPath = "RegexResults-1262.json";
if (!File.Exists(JsonPath))
{
    await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2Qwz_E");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



record KnownPattern(string Pattern, RegexOptions Options, int Count);

sealed class RegexEntry
{
    public required KnownPattern Regex { get; set; }
    public required string MainSource { get; set; }
    public required string PrSource { get; set; }
    public string? FullDiff { get; set; }
    public string? ShortDiff { get; set; }
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions