Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,330 changes: 924 additions & 406 deletions src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -515,27 +515,52 @@ private void EliminateEndingBacktracking()
}
}

/// <summary>Whether this node is considered to be atomic based on its parent.</summary>
/// <summary>Whether this node may be considered to be atomic based on its parent.</summary>
/// <remarks>
/// This is used to determine whether additional atomic nodes may be valuable to
/// be introduced into the tree. It should not be used to determine for sure whether
/// a node will be backtracked into.
/// This may have false negatives, meaning the node may actually be atomic even if this returns false.
/// But any true result may be relied on to mean the node will actually be considered to be atomic.
/// </remarks>
public bool IsAtomicByParent()
{
RegexNode? next = Next;
if (next is null) return false;
if (next.Type == Atomic) return true;

// We only walk up one group as a balance between optimization and cost.
if ((next.Type != Concatenate && next.Type != Capture) ||
next.Child(next.ChildCount() - 1) != this)
// Walk up the parent hierarchy.
for (RegexNode? parent = Next; parent is not null; parent = parent.Next)
{
return false;
switch (parent.Type)
{
case Atomic:
case Prevent:
case Require:
// If the parent is atomic, so is the child. That's the whole purpose
// of the Atomic node, and lookarounds are also implicitly atomic.
return true;

case Alternate:
case Testref:
// Skip alternations. Each branch is considered independently,
// so any atomicity applied to the alternation also applies to
// each individual branch. This is true as well for conditional
// backreferences, where each of the yes/no branches are independent.
case Testgroup when parent.Child(0) != this:
// As with alternations, each yes/no branch of an expression conditional
// are independent from each other, but the conditional expression itself
// can be backtracked into from each of the branches, so we can't make
// it atomic just because the whole conditional is.
case Capture:
// Skip captures. They don't affect atomicity.
case Concatenate when parent.Child(parent.ChildCount() - 1) == this:
// If the parent is a concatenation and this is the last node,
// any atomicity applying to the concatenation applies to this
// node, too.
continue;

default:
// For any other parent type, give up on trying to prove atomicity.
return false;
}
}

next = next.Next;
return next != null && next.Type == Atomic;
// The parent was null, so nothing can backtrack in.
return true;
}

/// <summary>
Expand Down Expand Up @@ -2191,197 +2216,42 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
return false;
}

bool supported = false;

// We only support the default left-to-right, not right-to-left, which requires more complication in the generated code.
// (Right-to-left is only employed when explicitly asked for by the developer or by lookbehind assertions.)
// We also limit the recursion involved to prevent stack dives; this limitation can be removed by switching
// away from a recursive implementation (done for convenience) to an iterative one that's more complicated
// but within the same problems.
if ((Options & RegexOptions.RightToLeft) == 0)
if ((Options & RegexOptions.RightToLeft) != 0)
{
int childCount = ChildCount();
Debug.Assert((Options & HasCapturesFlag) == 0);

switch (Type)
{
// One/Notone/Set/Multi don't involve any repetition and are easily supported.
case One:
case Notone:
case Set:
case Multi:
// Boundaries are like set checks and don't involve repetition, either.
case Boundary:
case NonBoundary:
case ECMABoundary:
case NonECMABoundary:
// Anchors are also trivial.
case Beginning:
case Start:
case Bol:
case Eol:
case End:
case EndZ:
// {Set/One/Notone}loopatomic are optimized nodes that represent non-backtracking variable-length loops.
// These consume their {Set/One} inputs as long as they match, and don't give up anything they
// matched, which means we can support them without backtracking.
case Oneloopatomic:
case Notoneloopatomic:
case Setloopatomic:
// "Empty" is easy: nothing is emitted for it.
// "Nothing" is also easy: it doesn't match anything.
// "UpdateBumpalong" doesn't match anything, it's just an optional directive to the engine.
case Empty:
case Nothing:
case UpdateBumpalong:
// Backreferences are supported
case Ref:
supported = true;
break;

// Conditional backreference tests are also supported, so long as both their yes/no branches are supported.
case Testref:
supported =
Child(0).SupportsSimplifiedCodeGenerationImplementation() &&
(childCount == 1 || Child(1).SupportsSimplifiedCodeGenerationImplementation());
break;

// Single character greedy/lazy loops are supported if either they're actually a repeater
// or they're not contained in any construct other than simple nesting (e.g. concat, capture).
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type.");
supported = M == N || AncestorsAllowBacktracking(Next);
break;

// For greedy and lazy loops, they're supported if the node they wrap is supported
// and either the node is actually a repeater, is atomic, or is in the tree in a
// location where backtracking is allowed.
case Loop:
case Lazyloop:
supported =
(M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) &&
Child(0).SupportsSimplifiedCodeGenerationImplementation();
break;

// We can handle atomic as long as its child is supported.
// Lookahead assertions also only require that the child node be supported.
// The RightToLeft check earlier is important to differentiate lookbehind,
// which is not supported.
case Atomic:
case Require:
case Prevent:
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();
break;

// We can handle alternates as long as they're atomic (a root / global alternate is
// effectively atomic, as nothing will try to backtrack into it as it's the last thing).
// Its children must all also be supported.
case Alternate:
if (Next != null &&
(IsAtomicByParent() || // atomic alternate
(Next.Type == Capture && Next.Next is null))) // root alternate
{
goto case Concatenate;
}
break;

// Concatenation doesn't require backtracking as long as its children don't.
case Concatenate:
supported = true;
for (int i = 0; i < childCount; i++)
{
if (!Child(i).SupportsSimplifiedCodeGenerationImplementation())
{
supported = false;
break;
}
}
break;

case Capture:
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();
if (supported)
{
// Captures are currently only supported in certain places in the tree.
RegexNode? parent = Next;
while (parent != null)
{
switch (parent.Type)
{
case Alternate:
case Atomic:
case Capture:
case Concatenate:
case Require:
parent = parent.Next;
break;

default:
parent = null;
supported = false;
break;
}
}

// If we've found a supported capture, mark all of the nodes in its parent
// hierarchy as containing a capture.
if (supported)
{
parent = this;
while (parent != null && ((parent.Options & HasCapturesFlag) == 0))
{
parent.Options |= HasCapturesFlag;
parent = parent.Next;
}
}
}
break;

case Testgroup:
supported =
Child(0).SupportsSimplifiedCodeGenerationImplementation() &&
Child(1).SupportsSimplifiedCodeGenerationImplementation() &&
(childCount == 2 || Child(2).SupportsSimplifiedCodeGenerationImplementation());
break;

default:
Debug.Fail($"Unknown type: {Type}");
supported = false;
break;
}
// RightToLeft isn't supported. That applies to both the top-level options as well as when used
// to specify positive and negative lookbehinds.
return false;
}
#if DEBUG
if (!supported && (Options & RegexOptions.Debug) != 0)

// TODO: This should be moved somewhere else, to a pass somewhere where we explicitly
// annotate the tree, potentially as part of the final optimization pass. It doesn't
// belong in this check.
switch (Type)
{
Debug.WriteLine($"Unable to use non-backtracking code gen: node {Description()} isn't supported.");
case Capture:
// If we've found a supported capture, mark all of the nodes in its parent
// hierarchy as containing a capture.
RegexNode? parent = this;
while (parent != null && ((parent.Options & HasCapturesFlag) == 0))
{
parent.Options |= HasCapturesFlag;
parent = parent.Next;
}
break;
}
#endif
return supported;

static bool AncestorsAllowBacktracking(RegexNode? node)
int childCount = ChildCount();
for (int i = 0; i < childCount; i++)
{
while (node is not null)
// The node isn't supported if any of its children aren't supported.
if (!Child(i).SupportsSimplifiedCodeGenerationImplementation())
{
switch (node.Type)
{
case Concatenate:
case Capture:
case Atomic:
node = node.Next;
break;

default:
return false;
}
return false;
}

return true;
}

// Supported.
return true;
}

/// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
Expand All @@ -2393,15 +2263,19 @@ static bool AncestorsAllowBacktracking(RegexNode? node)
/// <summary>Gets whether the node is a Notone/Notoneloop/Notoneloopatomic/Notonelazy node.</summary>
public bool IsNotoneFamily => Type is Notone or Notoneloop or Notoneloopatomic or Notonelazy;

/// <summary>Gets whether this node may be a source of backtracking.</summary>
public bool InstigatesBacktracking =>
Type switch
/// <summary>Gets whether this node is contained inside of a loop.</summary>
public bool IsInLoop()
{
for (RegexNode? parent = Next; parent is not null; parent = parent.Next)
{
Oneloop or Notoneloop or Setloop or Onelazy or Notonelazy or Setlazy or Loop or Lazyloop when !IsAtomicByParent() && M != N => true,
Alternate => !IsAtomicByParent(),
Ref or Testref or Testgroup => true,
_ => false,
};
if (parent.Type is Loop or Lazyloop)
{
return true;
}
}

return false;
}

private string TypeName =>
Type switch
Expand Down
Loading