Skip to content

Commit c74a167

Browse files
authored
Remove capture groups from negative lookarounds (#118084)
Any captures performed inside of a negative lookaround do not persist outside of the lookaround. As such, as long as there are no backreferences inside of the lookaround that would read those captures, we can eliminate the capturing.
1 parent 655836e commit c74a167

File tree

2 files changed

+71
-0
lines changed

2 files changed

+71
-0
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2044,6 +2044,43 @@ private RegexNode ReduceLookaround()
20442044
Debug.Assert(Kind is RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround);
20452045
Debug.Assert(ChildCount() == 1);
20462046

2047+
// Captures inside of negative lookarounds are undone after the lookaround. Thus, if there's nothing
2048+
// inside of the negative lookaround that needs that capture group (namely a backreference), we can
2049+
// remove the capture.
2050+
if (Kind is RegexNodeKind.NegativeLookaround && ContainsBackreference(Child(0)) is false)
2051+
{
2052+
if (RemoveCaptures(this, 0))
2053+
{
2054+
// If we removed captures, we may have changed the structure of the tree in a way that exposed more
2055+
// optimization possibility, so re-reduce the children.
2056+
ReplaceChild(0, Child(0));
2057+
}
2058+
2059+
// Strips Capture nodes from the subtree rooted at parent.Child(nodeIndex), splicing each capture's
// first child into the capture's place. Returns true if at least one capture was removed, in which
// case the caller may want to re-reduce the affected tree.
static bool RemoveCaptures(RegexNode parent, int nodeIndex)
{
    bool changesMade = false;
    RegexNode node = parent.Child(nodeIndex);

    // Peel directly nested captures, e.g. ((((b)))), with a loop rather than recursion so that a long
    // chain of captures cannot consume stack before the execution-stack check below is reached.
    // The child is re-read after each replacement rather than reusing node.Child(0), since
    // ReplaceChild may store a different node than the one passed in.
    while (node.Kind is RegexNodeKind.Capture)
    {
        parent.ReplaceChild(nodeIndex, node.Child(0));
        node = parent.Child(nodeIndex);
        changesMade = true;
    }

    // Descend into the remaining (non-capture) node's children only if there is enough stack left;
    // otherwise stop here. Leaving deeper captures in place is safe, it just forgoes the optimization.
    if (StackHelper.TryEnsureSufficientExecutionStack())
    {
        int childCount = node.ChildCount();
        for (int i = 0; i < childCount; i++)
        {
            changesMade |= RemoveCaptures(node, i);
        }
    }

    return changesMade;
}
2082+
}
2083+
20472084
// A lookaround is a zero-width atomic assertion.
20482085
// As it's atomic, nothing will backtrack into it, and we can
20492086
// eliminate any ending backtracking from it.
@@ -2066,6 +2103,32 @@ private RegexNode ReduceLookaround()
20662103
return this;
20672104
}
20682105

2106+
/// <summary>Gets whether the node contains a backreference anywhere in its tree.</summary>
/// <param name="node">The root of the subtree to inspect.</param>
/// <returns>
/// <see langword="true"/> if a backreference or backreference conditional was found;
/// <see langword="false"/> if the entire subtree was inspected and none was found;
/// <see langword="null"/> if some part of the subtree could not be inspected because there was
/// insufficient execution stack to recur further, and no backreference was found in the rest.
/// </returns>
private static bool? ContainsBackreference(RegexNode node)
{
    if (node.Kind is RegexNodeKind.Backreference or RegexNodeKind.BackreferenceConditional)
    {
        return true;
    }

    if (!StackHelper.TryEnsureSufficientExecutionStack())
    {
        // If we can't recur further, just stop optimizing.
        return null;
    }

    // Starts as a definitive "no" and is downgraded to "unknown" if any child could not be fully inspected.
    bool? result = false;

    int childCount = node.ChildCount();
    for (int i = 0; i < childCount; i++)
    {
        bool? childResult = ContainsBackreference(node.Child(i));

        if (childResult is true)
        {
            return true;
        }

        if (childResult is null)
        {
            // Don't let an uninspected child be reported as "no backreference": callers that test
            // "is false" rely on that answer being definitive. Keep scanning the remaining children,
            // though, as one of them may still yield a definitive true.
            result = null;
        }
    }

    return result;
}
2131+
20692132
/// <summary>Optimizations for backreference conditionals.</summary>
20702133
private RegexNode ReduceBackreferenceConditional()
20712134
{

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,10 @@ public class RegexReductionTests
269269
[InlineData("(?>(?>(?>(?>))))", "")]
270270
[InlineData("(?>(?>(?>(?>(?!)))))", "(?!)")]
271271
[InlineData("(?=(?>))", "")]
272+
// Lookaround reduction
273+
[InlineData("(?!(abc))", "(?!abc)")]
274+
[InlineData("(?!a(b*)c)", "(?!ab*c)")]
275+
[InlineData("(?!a((((b))))c)", "(?!abc)")]
272276
// Alternation reduction
273277
[InlineData("a|b", "[ab]")]
274278
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
@@ -541,6 +545,10 @@ public void PatternsReduceIdentically(string actual, string expected)
541545
[InlineData("(abc?)*?d", "(?>(ab(?>c?))*)d")]
542546
[InlineData("(aba)+d", "(?>(aba)+)d")]
543547
[InlineData("(abc*)*d", "(?>(ab(?>c*))*)d")]
548+
// Lookaround reduction
549+
[InlineData("(?=(abc))", "(?=abc)")]
550+
[InlineData("(?=a(b*)c)", "(?=ab*c)")]
551+
[InlineData("(?=a((((b))))c)", "(?=abc)")]
544552
// Loops inside alternation constructs
545553
[InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
546554
[InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]

0 commit comments

Comments
 (0)