-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Remove capture groups from negative lookarounds #118084
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Captures performed inside a negative lookaround do not persist outside of it. As such, as long as no backreference inside the lookaround reads those captures, we can eliminate the capturing.
@MihuBot regexdiff |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull Request Overview
This PR optimizes regular expression patterns by removing unnecessary capture groups from negative lookarounds. Since captures inside negative lookarounds are automatically undone after the lookaround completes, any capture groups that don't have backreferences within the lookaround can be safely eliminated, simplifying the pattern.
Key changes:
- Adds logic to detect and remove redundant capture groups in negative lookarounds when no backreferences exist
- Implements recursive capture removal with stack overflow protection
- Adds comprehensive test coverage for the new optimization
Reviewed Changes
Copilot reviewed 2 out of 2 changed files in this pull request and generated 2 comments.
File | Description |
---|---|
RegexNode.cs | Implements the core optimization logic to remove captures from negative lookarounds and adds backreference detection |
RegexReductionTests.cs | Adds test cases validating the capture group removal optimization for both negative and positive lookarounds |
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
Show resolved
Hide resolved
Tagging subscribers to this area: @dotnet/area-system-text-regularexpressions |
70 out of 18857 patterns have generated source code changes.
Examples of GeneratedRegex source diffs:
"&(?!(amp;)|(lt;)|(gt;)|(quot;))" (1847 uses)
[GeneratedRegex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase)]
/// ○ Match '&'.<br/>
/// ○ Zero-width negative lookahead.<br/>
/// ○ Match with 4 alternative expressions, atomically.<br/>
- /// ○ 1st capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Aa].<br/>
/// ○ Match a character in the set [Mm].<br/>
/// ○ Match a character in the set [Pp].<br/>
/// ○ Match ';'.<br/>
- /// ○ 2nd capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Ll].<br/>
/// ○ Match a character in the set [Tt].<br/>
/// ○ Match ';'.<br/>
- /// ○ 3rd capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Gg].<br/>
/// ○ Match a character in the set [Tt].<br/>
/// ○ Match ';'.<br/>
- /// ○ 4th capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Qq].<br/>
/// ○ Match a character in the set [Uu].<br/>
/// ○ Match a character in the set [Oo].<br/>
{
int pos = base.runtextpos;
int matchStart = pos;
- int capture_starting_pos = 0;
- int capture_starting_pos1 = 0;
- int capture_starting_pos2 = 0;
- int capture_starting_pos3 = 0;
- int negativelookahead__capture_pos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// Match '&'.
if (slice.IsEmpty || slice[0] != '&')
{
- UncaptureUntil(0);
return false; // The input didn't match.
}
base.CheckTimeout();
}
- negativelookahead__capture_pos = base.Crawlpos();
// Match with 4 alternative expressions, atomically.
{
if ((uint)slice.Length < 2)
switch (slice[1])
{
case 'A' or 'a':
- // 1st capture group.
+
+ if ((uint)slice.Length < 5 ||
+ !slice.Slice(2).StartsWith("mp;", StringComparison.OrdinalIgnoreCase)) // Match the string "mp;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos = pos;
-
- if ((uint)slice.Length < 4 ||
- !slice.StartsWith("amp;", StringComparison.OrdinalIgnoreCase)) // Match the string "amp;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 4;
- slice = inputSpan.Slice(pos);
- base.Capture(1, capture_starting_pos, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 5;
+ slice = inputSpan.Slice(pos);
break;
case 'L' or 'l':
- // 2nd capture group.
+
+ if ((uint)slice.Length < 4 ||
+ !slice.Slice(2).StartsWith("t;", StringComparison.OrdinalIgnoreCase)) // Match the string "t;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos1 = pos;
-
- if ((uint)slice.Length < 3 ||
- !slice.StartsWith("lt;", StringComparison.OrdinalIgnoreCase)) // Match the string "lt;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 3;
- slice = inputSpan.Slice(pos);
- base.Capture(2, capture_starting_pos1, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 4;
+ slice = inputSpan.Slice(pos);
break;
case 'G' or 'g':
- // 3rd capture group.
+
+ if ((uint)slice.Length < 4 ||
+ !slice.Slice(2).StartsWith("t;", StringComparison.OrdinalIgnoreCase)) // Match the string "t;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos2 = pos;
-
- if ((uint)slice.Length < 3 ||
- !slice.StartsWith("gt;", StringComparison.OrdinalIgnoreCase)) // Match the string "gt;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 3;
- slice = inputSpan.Slice(pos);
- base.Capture(3, capture_starting_pos2, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 4;
+ slice = inputSpan.Slice(pos);
break;
case 'Q' or 'q':
- // 4th capture group.
+
+ if ((uint)slice.Length < 6 ||
+ !slice.Slice(2).StartsWith("uot;", StringComparison.OrdinalIgnoreCase)) // Match the string "uot;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos3 = pos;
-
- if ((uint)slice.Length < 5 ||
- !slice.StartsWith("quot;", StringComparison.OrdinalIgnoreCase)) // Match the string "quot;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 5;
- slice = inputSpan.Slice(pos);
- base.Capture(4, capture_starting_pos3, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 6;
+ slice = inputSpan.Slice(pos);
break;
default:
}
}
- UncaptureUntil(0);
return false; // The input didn't match.
NegativeLookaroundMatch:
pos = negativelookahead__starting_pos;
slice = inputSpan.Slice(pos);
- UncaptureUntil(negativelookahead__capture_pos);
}
// The input matched.
base.runtextpos = pos;
base.Capture(0, matchStart, pos);
return true;
-
- // <summary>Undo captures until it reaches the specified capture position.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- void UncaptureUntil(int capturePosition)
- {
- while (base.Crawlpos() > capturePosition)
- {
- base.Uncapture();
- }
- }
}
}
}
"&(?!(amp;)|(lt;)|(gt;)|(quot;)|(nbsp;)|(reg;))" (783 uses)
[GeneratedRegex("&(?!(amp;)|(lt;)|(gt;)|(quot;)|(nbsp;)|(reg;))", RegexOptions.IgnoreCase)]
/// ○ Match '&'.<br/>
/// ○ Zero-width negative lookahead.<br/>
/// ○ Match with 6 alternative expressions, atomically.<br/>
- /// ○ 1st capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Aa].<br/>
/// ○ Match a character in the set [Mm].<br/>
/// ○ Match a character in the set [Pp].<br/>
/// ○ Match ';'.<br/>
- /// ○ 2nd capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Ll].<br/>
/// ○ Match a character in the set [Tt].<br/>
/// ○ Match ';'.<br/>
- /// ○ 3rd capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Gg].<br/>
/// ○ Match a character in the set [Tt].<br/>
/// ○ Match ';'.<br/>
- /// ○ 4th capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Qq].<br/>
/// ○ Match a character in the set [Uu].<br/>
/// ○ Match a character in the set [Oo].<br/>
/// ○ Match a character in the set [Tt].<br/>
/// ○ Match ';'.<br/>
- /// ○ 5th capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Nn].<br/>
/// ○ Match a character in the set [Bb].<br/>
/// ○ Match a character in the set [Ss].<br/>
/// ○ Match a character in the set [Pp].<br/>
/// ○ Match ';'.<br/>
- /// ○ 6th capture group.<br/>
+ /// ○ Match a sequence of expressions.<br/>
/// ○ Match a character in the set [Rr].<br/>
/// ○ Match a character in the set [Ee].<br/>
/// ○ Match a character in the set [Gg].<br/>
{
int pos = base.runtextpos;
int matchStart = pos;
- int capture_starting_pos = 0;
- int capture_starting_pos1 = 0;
- int capture_starting_pos2 = 0;
- int capture_starting_pos3 = 0;
- int capture_starting_pos4 = 0;
- int capture_starting_pos5 = 0;
- int negativelookahead__capture_pos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// Match '&'.
if (slice.IsEmpty || slice[0] != '&')
{
- UncaptureUntil(0);
return false; // The input didn't match.
}
base.CheckTimeout();
}
- negativelookahead__capture_pos = base.Crawlpos();
// Match with 6 alternative expressions, atomically.
{
if ((uint)slice.Length < 2)
switch (slice[1])
{
case 'A' or 'a':
- // 1st capture group.
+
+ if ((uint)slice.Length < 5 ||
+ !slice.Slice(2).StartsWith("mp;", StringComparison.OrdinalIgnoreCase)) // Match the string "mp;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos = pos;
-
- if ((uint)slice.Length < 4 ||
- !slice.StartsWith("amp;", StringComparison.OrdinalIgnoreCase)) // Match the string "amp;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 4;
- slice = inputSpan.Slice(pos);
- base.Capture(1, capture_starting_pos, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 5;
+ slice = inputSpan.Slice(pos);
break;
case 'L' or 'l':
- // 2nd capture group.
+
+ if ((uint)slice.Length < 4 ||
+ !slice.Slice(2).StartsWith("t;", StringComparison.OrdinalIgnoreCase)) // Match the string "t;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos1 = pos;
-
- if ((uint)slice.Length < 3 ||
- !slice.StartsWith("lt;", StringComparison.OrdinalIgnoreCase)) // Match the string "lt;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 3;
- slice = inputSpan.Slice(pos);
- base.Capture(2, capture_starting_pos1, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 4;
+ slice = inputSpan.Slice(pos);
break;
case 'G' or 'g':
- // 3rd capture group.
+
+ if ((uint)slice.Length < 4 ||
+ !slice.Slice(2).StartsWith("t;", StringComparison.OrdinalIgnoreCase)) // Match the string "t;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos2 = pos;
-
- if ((uint)slice.Length < 3 ||
- !slice.StartsWith("gt;", StringComparison.OrdinalIgnoreCase)) // Match the string "gt;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 3;
- slice = inputSpan.Slice(pos);
- base.Capture(3, capture_starting_pos2, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 4;
+ slice = inputSpan.Slice(pos);
break;
case 'Q' or 'q':
- // 4th capture group.
+
+ if ((uint)slice.Length < 6 ||
+ !slice.Slice(2).StartsWith("uot;", StringComparison.OrdinalIgnoreCase)) // Match the string "uot;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos3 = pos;
-
- if ((uint)slice.Length < 5 ||
- !slice.StartsWith("quot;", StringComparison.OrdinalIgnoreCase)) // Match the string "quot;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 5;
- slice = inputSpan.Slice(pos);
- base.Capture(4, capture_starting_pos3, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 6;
+ slice = inputSpan.Slice(pos);
break;
case 'N' or 'n':
- // 5th capture group.
+
+ if ((uint)slice.Length < 6 ||
+ !slice.Slice(2).StartsWith("bsp;", StringComparison.OrdinalIgnoreCase)) // Match the string "bsp;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos4 = pos;
-
- if ((uint)slice.Length < 5 ||
- !slice.StartsWith("nbsp;", StringComparison.OrdinalIgnoreCase)) // Match the string "nbsp;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 5;
- slice = inputSpan.Slice(pos);
- base.Capture(5, capture_starting_pos4, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 6;
+ slice = inputSpan.Slice(pos);
break;
case 'R' or 'r':
- // 6th capture group.
+
+ if ((uint)slice.Length < 5 ||
+ !slice.Slice(2).StartsWith("eg;", StringComparison.OrdinalIgnoreCase)) // Match the string "eg;" (ordinal case-insensitive)
{
- pos++;
- slice = inputSpan.Slice(pos);
- capture_starting_pos5 = pos;
-
- if ((uint)slice.Length < 4 ||
- !slice.StartsWith("reg;", StringComparison.OrdinalIgnoreCase)) // Match the string "reg;" (ordinal case-insensitive)
- {
- goto NegativeLookaroundMatch;
- }
-
- pos += 4;
- slice = inputSpan.Slice(pos);
- base.Capture(6, capture_starting_pos5, pos);
+ goto NegativeLookaroundMatch;
}
+ pos += 5;
+ slice = inputSpan.Slice(pos);
break;
default:
}
}
- UncaptureUntil(0);
return false; // The input didn't match.
NegativeLookaroundMatch:
pos = negativelookahead__starting_pos;
slice = inputSpan.Slice(pos);
- UncaptureUntil(negativelookahead__capture_pos);
}
// The input matched.
base.runtextpos = pos;
base.Capture(0, matchStart, pos);
return true;
-
- // <summary>Undo captures until it reaches the specified capture position.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- void UncaptureUntil(int capturePosition)
- {
- while (base.Crawlpos() > capturePosition)
- {
- base.Uncapture();
- }
- }
}
}
}
"(?!(^[A-Z]))([A-Z])" (70 uses)
[GeneratedRegex("(?!(^[A-Z]))([A-Z])")]
/// Explanation:<br/>
/// <code>
/// ○ Zero-width negative lookahead.<br/>
- /// ○ 1st capture group.<br/>
- /// ○ Match if at the beginning of the string.<br/>
- /// ○ Match a character in the set [A-Z].<br/>
+ /// ○ Match if at the beginning of the string.<br/>
+ /// ○ Match a character in the set [A-Z].<br/>
/// ○ 2nd capture group.<br/>
/// ○ Match a character in the set [A-Z].<br/>
/// </code>
int pos = base.runtextpos;
int matchStart = pos;
int capture_starting_pos = 0;
- int capture_starting_pos1 = 0;
- int negativelookahead__capture_pos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// Zero-width negative lookahead.
base.CheckTimeout();
}
- negativelookahead__capture_pos = base.Crawlpos();
- // 1st capture group.
+ // Match if at the beginning of the string.
+ if (pos != 0)
{
- capture_starting_pos = pos;
-
- // Match if at the beginning of the string.
- if (pos != 0)
- {
- goto NegativeLookaroundMatch;
- }
-
- // Match a character in the set [A-Z].
- if (slice.IsEmpty || !char.IsAsciiLetterUpper(slice[0]))
- {
- goto NegativeLookaroundMatch;
- }
-
- pos++;
- slice = inputSpan.Slice(pos);
- base.Capture(1, capture_starting_pos, pos);
+ goto NegativeLookaroundMatch;
+ }
+
+ // Match a character in the set [A-Z].
+ if (slice.IsEmpty || !char.IsAsciiLetterUpper(slice[0]))
+ {
+ goto NegativeLookaroundMatch;
}
UncaptureUntil(0);
NegativeLookaroundMatch:
pos = negativelookahead__starting_pos;
slice = inputSpan.Slice(pos);
- UncaptureUntil(negativelookahead__capture_pos);
}
// 2nd capture group.
{
- capture_starting_pos1 = pos;
+ capture_starting_pos = pos;
// Match a character in the set [A-Z].
if (slice.IsEmpty || !char.IsAsciiLetterUpper(slice[0]))
pos++;
slice = inputSpan.Slice(pos);
- base.Capture(2, capture_starting_pos1, pos);
+ base.Capture(2, capture_starting_pos, pos);
}
// The input matched.
For more diff examples, see https://gist.github.com/MihuBot/92ee92ddd485ed9c74d00540c589475f
For a list of JIT diff improvements, see Improvements.md.
Sample source code for further analysis:
const string JsonPath = "RegexResults-1285.json";
// Fetch the results archive once; later runs reuse the extracted JSON file on disk.
if (File.Exists(JsonPath) is false)
{
    await using var downloadStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2kAZabA");
    using var zip = new ZipArchive(downloadStream, ZipArchiveMode.Read);

    // First() throws if the archive holds no entry named "Results.json".
    var resultsEntry = zip.Entries.First(entry => entry.Name == "Results.json");
    resultsEntry.ExtractToFile(JsonPath);
}

// Deserialize all entries; IncludeFields makes the serializer populate public fields as well as properties.
var serializerOptions = new JsonSerializerOptions { IncludeFields = true };
using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, serializerOptions)!;
Console.WriteLine($"Working with {entries.Length} patterns");
// Identifies one regex: its pattern text, the RegexOptions it is paired with, and a count
// (presumably the number of usages found for that pattern — TODO confirm against the data source).
record KnownPattern(string Pattern, RegexOptions Options, int Count);
/// <summary>
/// Shape of one element of the array deserialized from the results JSON file.
/// </summary>
sealed class RegexEntry
{
/// <summary>The pattern, options and count this entry describes.</summary>
public required KnownPattern Regex { get; set; }
/// <summary>Source text associated with the main branch (presumably the generated regex code — confirm).</summary>
public required string MainSource { get; set; }
/// <summary>Source text associated with the pull request (presumably the generated regex code — confirm).</summary>
public required string PrSource { get; set; }
/// <summary>Optional full diff text; may be null.</summary>
public string? FullDiff { get; set; }
/// <summary>Optional shortened diff text; may be null.</summary>
public string? ShortDiff { get; set; }
/// <summary>Optional (Name, Values) pairs; the name suggests SearchValues&lt;char&gt; instances — confirm.</summary>
// NOTE(review): tuple elements are fields, which is presumably why deserialization enables IncludeFields.
public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
/// <summary>Optional (Values, ComparisonType) pairs; the name suggests SearchValues&lt;string&gt; instances — confirm.</summary>
public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Neat
Captures performed inside a negative lookaround do not persist outside of it. As such, as long as no backreference inside the lookaround reads those captures, we can eliminate the capturing.