Skip to content

[RegexDiff X64] [stephentoub] Further reduce loops around zero-width assertions #1306

@MihuBot

Description

@MihuBot

Job completed in 17 minutes 49 seconds (remote runner delay: 41 seconds).
dotnet/runtime#118111
Using arguments: regexdiff

3 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"(?<=\\b)+demi\\s+douzaine" (106 uses)
[GeneratedRegex("(?<=\\b)+demi\\s+douzaine", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
     /// <code>RegexOptions.IgnoreCase | RegexOptions.Singleline</code><br/>
     /// Explanation:<br/>
     /// <code>
-    /// ○ Loop greedily at least once right-to-left.<br/>
-    ///     ○ Zero-width positive lookbehind.<br/>
-    ///         ○ Match if at a word boundary.<br/>
+    /// ○ Zero-width positive lookbehind.<br/>
+    ///     ○ Match if at a word boundary.<br/>
     /// ○ Match a character in the set [Dd].<br/>
     /// ○ Match a character in the set [Ee].<br/>
     /// ○ Match a character in the set [Mm].<br/>
                     // Any possible match is at least 13 characters.
                     if (pos <= inputSpan.Length - 13)
                     {
-                        // The pattern begins with a character in the set [Dd].
-                        // Find the next occurrence. If it can't be found, there's no match.
-                        int i = inputSpan.Slice(pos).IndexOfAny('D', 'd');
+                        // The pattern has the literal "demi" ordinal case-insensitive at the beginning of the pattern. Find the next occurrence.
+                        // If it can't be found, there's no match.
+                        int i = inputSpan.Slice(pos).IndexOfAny(Utilities.s_indexOfString_demi_OrdinalIgnoreCase);
                         if (i >= 0)
                         {
                             base.runtextpos = pos + i;
                 {
                     int pos = base.runtextpos;
                     int matchStart = pos;
-                    int loop_iteration = 0, loop_starting_pos = 0;
-                    int stackpos = 0;
                     ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                     
-                    // Loop greedily at least once right-to-left.
-                    //{
-                        loop_starting_pos = pos;
-                        loop_iteration = 0;
-                        
-                        LoopBody:
-                        Utilities.StackPush(ref base.runstack!, ref stackpos, loop_starting_pos, pos);
-                        
-                        loop_starting_pos = pos;
-                        loop_iteration++;
-                        
-                        // Zero-width positive lookbehind.
-                        {
-                            slice = inputSpan.Slice(pos);
-                            int positivelookbehind_starting_pos = pos;
-                            
-                            if (Utilities.s_hasTimeout)
-                            {
-                                base.CheckTimeout();
-                            }
-                            
-                            // Match if at a word boundary.
-                            if (!Utilities.IsBoundary(inputSpan, pos))
-                            {
-                                goto LoopIterationNoMatch;
-                            }
-                            
-                            pos = positivelookbehind_starting_pos;
-                            slice = inputSpan.Slice(pos);
-                        }
-                        
-                        
-                        // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
-                        // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
-                        if (pos != loop_starting_pos || loop_iteration == 0)
-                        {
-                            goto LoopBody;
-                        }
-                        goto LoopEnd;
-                        
-                        // The loop iteration failed. Put state back to the way it was before the iteration.
-                        LoopIterationNoMatch:
-                        if (--loop_iteration < 0)
-                        {
-                            // Unable to match the remainder of the expression after exhausting the loop.
-                            return false; // The input didn't match.
-                        }
-                        Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
+                    // Zero-width positive lookbehind.
+                    {
                         slice = inputSpan.Slice(pos);
-                        if (loop_iteration == 0)
+                        int positivelookbehind_starting_pos = pos;
+                        
+                        if (Utilities.s_hasTimeout)
+                        {
+                            base.CheckTimeout();
+                        }
+                        
+                        // Match if at a word boundary.
+                        if (!Utilities.IsBoundary(inputSpan, pos))
                         {
-                            // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
                             return false; // The input didn't match.
                         }
                         
-                        LoopEnd:;
-                    //}
+                        pos = positivelookbehind_starting_pos;
+                        slice = inputSpan.Slice(pos);
+                    }
                     
                     if ((uint)slice.Length < 4 ||
                         !slice.StartsWith("demi", StringComparison.OrdinalIgnoreCase)) // Match the string "demi" (ordinal case-insensitive)
                     {
-                        goto LoopIterationNoMatch;
+                        return false; // The input didn't match.
                     }
                     
                     // Match a whitespace character atomically at least once.
                         
                         if (iteration == 0)
                         {
-                            goto LoopIterationNoMatch;
+                            return false; // The input didn't match.
                         }
                         
                         slice = slice.Slice(iteration);
                     if ((uint)slice.Length < 8 ||
                         !slice.StartsWith("douzaine", StringComparison.OrdinalIgnoreCase)) // Match the string "douzaine" (ordinal case-insensitive)
                     {
-                        goto LoopIterationNoMatch;
+                        return false; // The input didn't match.
                     }
                     
                     // The input matched.
                 (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
         }
         
-        /// <summary>Pops 2 values from the backtracking stack.</summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
-        {
-            arg0 = stack[--pos];
-            arg1 = stack[--pos];
-        }
-        
-        /// <summary>Pushes 2 values onto the backtracking stack.</summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1)
-        {
-            // If there's space available for all 2 values, store them.
-            int[] s = stack;
-            int p = pos;
-            if ((uint)(p + 1) < (uint)s.Length)
-            {
-                s[p] = arg0;
-                s[p + 1] = arg1;
-                pos += 2;
-                return;
-            }
-        
-            // Otherwise, resize the stack to make room and try again.
-            WithResize(ref stack, ref pos, arg0, arg1);
-        
-            // <summary>Resize the backtracking stack array and push 2 values onto the stack.</summary>
-            [MethodImpl(MethodImplOptions.NoInlining)]
-            static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1)
-            {
-                Array.Resize(ref stack, (pos + 1) * 2);
-                StackPush(ref stack, ref pos, arg0, arg1);
-            }
-        }
+        /// <summary>Supports searching for the string "demi".</summary>
+        internal static readonly SearchValues<string> s_indexOfString_demi_OrdinalIgnoreCase = SearchValues.Create(["demi"], StringComparison.OrdinalIgnoreCase);
     }
 }
"^((?!-)+)([a-zA-Z_-]+$).*((?<!-)+)" (57 uses)
[GeneratedRegex("^((?!-)+)([a-zA-Z_-]+$).*((?<!-)+)", RegexOptions.Singleline)]
  /// <code>
  /// ○ Match if at the beginning of the string.<br/>
  /// ○ 1st capture group.<br/>
-   ///     ○ Loop greedily at least once.<br/>
-   ///         ○ Zero-width negative lookahead.<br/>
-   ///             ○ Match '-'.<br/>
+   ///     ○ Zero-width negative lookahead.<br/>
+   ///         ○ Match '-'.<br/>
  /// ○ 2nd capture group.<br/>
  ///     ○ Match a character in the set [\-A-Z_a-z] atomically at least once.<br/>
  ///     ○ Match if at the end of the string or if before an ending newline.<br/>
  /// ○ Match any character greedily any number of times.<br/>
  /// ○ 3rd capture group.<br/>
-   ///     ○ Loop greedily and atomically at least once right-to-left.<br/>
-   ///         ○ Zero-width negative lookbehind.<br/>
-   ///             ○ Match '-' right-to-left.<br/>
+   ///     ○ Zero-width negative lookbehind.<br/>
+   ///         ○ Match '-' right-to-left.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                  int capture_starting_pos2 = 0;
                  int charloop_capture_pos = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
-                   int loop_iteration = 0, loop_starting_pos = 0;
-                   int loop_iteration1 = 0, loop_starting_pos1 = 0;
-                   int stackpos = 0;
-                   int startingStackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match if at the beginning of the string.
                  }
                  
                  // 1st capture group.
-                   //{
+                   {
                      capture_starting_pos = pos;
                      
-                       // Loop greedily at least once.
-                       //{
-                           loop_starting_pos = pos;
-                           loop_iteration = 0;
-                           
-                           LoopBody:
-                           Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos, pos);
-                           
-                           loop_starting_pos = pos;
-                           loop_iteration++;
-                           
-                           // Zero-width negative lookahead.
-                           {
-                               slice = inputSpan.Slice(pos);
-                               int negativelookahead__starting_pos = pos;
-                               if (Utilities.s_hasTimeout)
-                               {
-                                   base.CheckTimeout();
-                               }
-                               
-                               // Match '-'.
-                               if (slice.IsEmpty || slice[0] != '-')
-                               {
-                                   goto NegativeLookaroundMatch;
-                               }
-                               
-                               goto LoopIterationNoMatch;
-                               
-                               NegativeLookaroundMatch:
-                               pos = negativelookahead__starting_pos;
-                               slice = inputSpan.Slice(pos);
-                           }
-                           
-                           
-                           // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
-                           // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
-                           if (pos != loop_starting_pos || loop_iteration == 0)
-                           {
-                               goto LoopBody;
-                           }
-                           goto LoopEnd;
-                           
-                           // The loop iteration failed. Put state back to the way it was before the iteration.
-                           LoopIterationNoMatch:
-                           if (--loop_iteration < 0)
-                           {
-                               // Unable to match the remainder of the expression after exhausting the loop.
-                               UncaptureUntil(0);
-                               return false; // The input didn't match.
-                           }
-                           Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
-                           UncaptureUntil(base.runstack![--stackpos]);
+                       // Zero-width negative lookahead.
+                       {
                          slice = inputSpan.Slice(pos);
-                           if (loop_iteration == 0)
+                           int negativelookahead__starting_pos = pos;
+                           if (Utilities.s_hasTimeout)
                          {
-                               // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
-                               UncaptureUntil(0);
-                               return false; // The input didn't match.
+                               base.CheckTimeout();
                          }
                          
-                           LoopEnd:;
-                       //}
+                           // Match '-'.
+                           if (slice.IsEmpty || slice[0] != '-')
+                           {
+                               goto NegativeLookaroundMatch;
+                           }
+                           
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
+                           
+                           NegativeLookaroundMatch:
+                           pos = negativelookahead__starting_pos;
+                           slice = inputSpan.Slice(pos);
+                       }
                      
                      base.Capture(1, capture_starting_pos, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto LoopIterationNoMatch;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // 2nd capture group.
                  {
                          
                          if (iteration == 0)
                          {
-                               goto CaptureBacktrack;
+                               UncaptureUntil(0);
+                               return false; // The input didn't match.
                          }
                          
                          slice = slice.Slice(iteration);
                      // Match if at the end of the string or if before an ending newline.
                      if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
                      {
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      base.Capture(2, capture_starting_pos1, pos);
                      
                      if (charloop_starting_pos >= charloop_ending_pos)
                      {
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      pos = --charloop_ending_pos;
                      slice = inputSpan.Slice(pos);
                  {
                      capture_starting_pos2 = pos;
                      
-                       // Loop greedily and atomically at least once right-to-left.
+                       // Zero-width negative lookbehind.
                      {
-                           startingStackpos = stackpos;
-                           loop_starting_pos1 = pos;
-                           loop_iteration1 = 0;
-                           
-                           LoopBody1:
-                           Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos1, pos);
-                           
-                           loop_starting_pos1 = pos;
-                           loop_iteration1++;
-                           
-                           // Zero-width negative lookbehind.
-                           {
-                               slice = inputSpan.Slice(pos);
-                               int negativelookbehind__starting_pos = pos;
-                               if (Utilities.s_hasTimeout)
-                               {
-                                   base.CheckTimeout();
-                               }
-                               
-                               // Match '-' right-to-left.
-                               if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '-')
-                               {
-                                   goto NegativeLookaroundMatch1;
-                               }
-                               pos--;
-                               
-                               goto LoopIterationNoMatch1;
-                               
-                               NegativeLookaroundMatch1:
-                               pos = negativelookbehind__starting_pos;
-                               slice = inputSpan.Slice(pos);
-                           }
-                           
-                           
-                           // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
-                           // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
-                           if (pos != loop_starting_pos1 || loop_iteration1 == 0)
-                           {
-                               goto LoopBody1;
-                           }
-                           goto LoopEnd1;
-                           
-                           // The loop iteration failed. Put state back to the way it was before the iteration.
-                           LoopIterationNoMatch1:
-                           if (--loop_iteration1 < 0)
-                           {
-                               // Unable to match the remainder of the expression after exhausting the loop.
-                               goto CharLoopBacktrack;
-                           }
-                           Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos1);
-                           UncaptureUntil(base.runstack![--stackpos]);
                          slice = inputSpan.Slice(pos);
-                           if (loop_iteration1 == 0)
+                           int negativelookbehind__starting_pos = pos;
+                           if (Utilities.s_hasTimeout)
                          {
-                               // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
-                               goto CharLoopBacktrack;
+                               base.CheckTimeout();
                          }
                          
-                           LoopEnd1:
-                           stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
+                           // Match '-' right-to-left.
+                           if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '-')
+                           {
+                               goto NegativeLookaroundMatch1;
+                           }
+                           pos--;
+                           
+                           goto CharLoopBacktrack;
+                           
+                           NegativeLookaroundMatch1:
+                           pos = negativelookbehind__starting_pos;
+                           slice = inputSpan.Slice(pos);
                      }
                      
                      base.Capture(3, capture_starting_pos2, pos);
      /// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
      internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
      
-       /// <summary>Pops 2 values from the backtracking stack.</summary>
-       [MethodImpl(MethodImplOptions.AggressiveInlining)]
-       internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
-       {
-           arg0 = stack[--pos];
-           arg1 = stack[--pos];
-       }
-       
-       /// <summary>Pushes 3 values onto the backtracking stack.</summary>
-       [MethodImpl(MethodImplOptions.AggressiveInlining)]
-       internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
-       {
-           // If there's space available for all 3 values, store them.
-           int[] s = stack;
-           int p = pos;
-           if ((uint)(p + 2) < (uint)s.Length)
-           {
-               s[p] = arg0;
-               s[p + 1] = arg1;
-               s[p + 2] = arg2;
-               pos += 3;
-               return;
-           }
-       
-           // Otherwise, resize the stack to make room and try again.
-           WithResize(ref stack, ref pos, arg0, arg1, arg2);
-       
-           // <summary>Resize the backtracking stack array and push 3 values onto the stack.</summary>
-           [MethodImpl(MethodImplOptions.NoInlining)]
-           static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
-           {
-               Array.Resize(ref stack, (pos + 2) * 2);
-               StackPush(ref stack, ref pos, arg0, arg1, arg2);
-           }
-       }
-       
      /// <summary>Supports searching for characters in or not in "-ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz".</summary>
      internal static readonly SearchValues<char> s_ascii_200000FEFFFF87FEFFFF07 = SearchValues.Create("-ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz");
  }

For more diff examples, see https://gist.github.com/MihuBot/a15ee2958fd2e1688755cccbc6104e54

JIT assembly changes
Total bytes of base: 54136129
Total bytes of diff: 54134019
Total bytes of delta: -2110 (-0.00 % of base)
Total relative delta: 0.21
    diff is an improvement.
    relative diff is a regression.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
const string JsonPath = "RegexResults-1306.json";
if (!File.Exists(JsonPath))
{
    await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2r7wB2A");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



record KnownPattern(string Pattern, RegexOptions Options, int Count);

sealed class RegexEntry
{
    public required KnownPattern Regex { get; set; }
    public required string MainSource { get; set; }
    public required string PrSource { get; set; }
    public string? FullDiff { get; set; }
    public string? ShortDiff { get; set; }
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions