diff --git a/src/libraries/System.Text.RegularExpressions/gen/DiagnosticDescriptors.cs b/src/libraries/System.Text.RegularExpressions/gen/DiagnosticDescriptors.cs index aec397eb7d6437..2c1d2a0d4a881f 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/DiagnosticDescriptors.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/DiagnosticDescriptors.cs @@ -8,11 +8,13 @@ namespace System.Text.RegularExpressions.Generator { internal static class DiagnosticDescriptors { + private const string Category = "RegexGenerator"; + public static DiagnosticDescriptor InvalidRegexGeneratorAttribute { get; } = new DiagnosticDescriptor( id: "SYSLIB1040", title: new LocalizableResourceString(nameof(SR.InvalidRegexGeneratorAttributeTitle), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), messageFormat: new LocalizableResourceString(nameof(SR.InvalidRegexGeneratorAttributeMessage), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), - category: "RegexGenerator", + category: Category, DiagnosticSeverity.Error, isEnabledByDefault: true, customTags: WellKnownDiagnosticTags.NotConfigurable); @@ -21,7 +23,7 @@ internal static class DiagnosticDescriptors id: "SYSLIB1041", title: new LocalizableResourceString(nameof(SR.InvalidRegexGeneratorAttributeTitle), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), messageFormat: new LocalizableResourceString(nameof(SR.MultipleRegexGeneratorAttributesMessage), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), - category: "RegexGenerator", + category: Category, DiagnosticSeverity.Error, isEnabledByDefault: true, customTags: WellKnownDiagnosticTags.NotConfigurable); @@ -30,7 +32,7 @@ internal static class DiagnosticDescriptors id: "SYSLIB1042", title: new LocalizableResourceString(nameof(SR.InvalidRegexGeneratorAttributeTitle), SR.ResourceManager, 
typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), messageFormat: new LocalizableResourceString(nameof(SR.InvalidRegexArgumentsMessage), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), - category: "RegexGenerator", + category: Category, DiagnosticSeverity.Error, isEnabledByDefault: true, customTags: WellKnownDiagnosticTags.NotConfigurable); @@ -39,7 +41,7 @@ internal static class DiagnosticDescriptors id: "SYSLIB1043", title: new LocalizableResourceString(nameof(SR.InvalidRegexGeneratorAttributeTitle), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), messageFormat: new LocalizableResourceString(nameof(SR.RegexMethodMustHaveValidSignatureMessage), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), - category: "RegexGenerator", + category: Category, DiagnosticSeverity.Error, isEnabledByDefault: true, customTags: WellKnownDiagnosticTags.NotConfigurable); @@ -48,9 +50,17 @@ internal static class DiagnosticDescriptors id: "SYSLIB1044", title: new LocalizableResourceString(nameof(SR.InvalidRegexGeneratorAttributeTitle), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), messageFormat: new LocalizableResourceString(nameof(SR.InvalidLangVersionMessage), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), - category: "RegexGenerator", + category: Category, DiagnosticSeverity.Error, isEnabledByDefault: true, customTags: WellKnownDiagnosticTags.NotConfigurable); + + public static DiagnosticDescriptor LimitedSourceGeneration { get; } = new DiagnosticDescriptor( + id: "SYSLIB1045", + title: new LocalizableResourceString(nameof(SR.LimitedSourceGenerationTitle), SR.ResourceManager, typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), + messageFormat: new LocalizableResourceString(nameof(SR.LimitedSourceGenerationMessage), SR.ResourceManager, 
typeof(FxResources.System.Text.RegularExpressions.Generator.SR)), + category: Category, + DiagnosticSeverity.Info, + isEnabledByDefault: true); } } diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index ad5306008cf9c1..62898cbdd5c938 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -6,6 +6,7 @@ using System.CodeDom.Compiler; using System.Collections; using System.Collections.Generic; +using System.Collections.Immutable; using System.Diagnostics; using System.Globalization; using System.IO; @@ -14,6 +15,7 @@ using System.Security.Cryptography; using System.Text; using System.Threading; +using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.CSharp; // NOTE: The logic in this file is largely a copy of logic in RegexCompiler, emitting C# instead of MSIL. @@ -24,12 +26,6 @@ namespace System.Text.RegularExpressions.Generator { public partial class RegexGenerator { - /// - /// Value added to the written code to enable subsequent replacement with any variable declarations - /// dynamically discovered during code generation. - /// - private const string AdditionalDeclarationsPlaceholder = "<>PLACEHOLDER_FOR_ADDITIONAL_DECLARATIONS"; - /// Code for a [GeneratedCode] attribute to put on the top-level generated members. private static readonly string s_generatedCodeAttribute = $"[global::System.CodeDom.Compiler.GeneratedCodeAttribute(\"{typeof(RegexGenerator).Assembly.GetName().Name}\", \"{typeof(RegexGenerator).Assembly.GetName().Version}\")]"; /// Header comments and usings to include at the top of every generated file. @@ -45,7 +41,7 @@ public partial class RegexGenerator }; /// Generates the code for one regular expression class. 
- private static string EmitRegexType(RegexType regexClass) + private static (string, ImmutableArray) EmitRegexType(RegexType regexClass) { var sb = new StringBuilder(1024); var writer = new IndentedTextWriter(new StringWriter(sb)); @@ -86,7 +82,7 @@ private static string EmitRegexType(RegexType regexClass) generatedName += ComputeStringHash(generatedName).ToString("X"); // Generate the regex type - EmitRegexMethod(writer, regexClass.Method, generatedName); + ImmutableArray diagnostics = EmitRegexMethod(writer, regexClass.Method, generatedName); while (writer.Indent != 0) { @@ -95,10 +91,10 @@ private static string EmitRegexType(RegexType regexClass) } writer.Flush(); - return sb.ToString(); + return (sb.ToString(), diagnostics); // FNV-1a hash function. The actual algorithm used doesn't matter; just something simple - // to create a pseudo-random value based on input text. + // to create a deterministic, pseudo-random value that's based on input text. static uint ComputeStringHash(string s) { uint hashCode = 2166136261; @@ -111,12 +107,49 @@ static uint ComputeStringHash(string s) } /// Gets whether a given regular expression method is supported by the code generator. - private static bool SupportsCustomCodeGeneration(RegexMethod rm) => - // The generator doesn't currently know how to emit code for NonBacktracking. - (rm.Options & RegexOptions.NonBacktracking) == 0; + private static bool SupportsCodeGeneration(RegexMethod rm) + { + RegexNode root = rm.Code.Tree.Root; + + if (!root.SupportsCompilation()) + { + return false; + } + + if (ExceedsMaxDepthForSimpleCodeGeneration(root, allowedDepth: 40)) + { + // Deep RegexNode trees can result in emitting C# code that exceeds C# compiler + // limitations, leading to "CS8078: An expression is too long or complex to compile". + // Place an artificial limit on max tree depth in order to mitigate such issues. + // The allowed depth can be tweaked as needed; it's exceedingly rare to find + // expressions with such deep trees. 
+ return false; + } + + return true; + + static bool ExceedsMaxDepthForSimpleCodeGeneration(RegexNode node, int allowedDepth) + { + if (allowedDepth <= 0) + { + return true; + } + + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (ExceedsMaxDepthForSimpleCodeGeneration(node.Child(i), allowedDepth - 1)) + { + return true; + } + } + + return false; + } + } /// Generates the code for a regular expression method. - private static void EmitRegexMethod(IndentedTextWriter writer, RegexMethod rm, string id) + private static ImmutableArray EmitRegexMethod(IndentedTextWriter writer, RegexMethod rm, string id) { string patternExpression = Literal(rm.Pattern); string optionsExpression = $"(global::System.Text.RegularExpressions.RegexOptions)({(int)rm.Options})"; @@ -134,11 +167,11 @@ private static void EmitRegexMethod(IndentedTextWriter writer, RegexMethod rm, s writer.Write(" public static global::System.Text.RegularExpressions.Regex Instance { get; } = "); // If we can't support custom generation for this regex, spit out a Regex constructor call. 
- if (!SupportsCustomCodeGeneration(rm)) + if (!SupportsCodeGeneration(rm)) { writer.WriteLine($"new global::System.Text.RegularExpressions.Regex({patternExpression}, {optionsExpression}, {timeoutExpression});"); writer.WriteLine("}"); - return; + return ImmutableArray.Create(Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, rm.MethodSyntax.GetLocation())); } writer.WriteLine($"new {id}();"); @@ -213,6 +246,7 @@ private static void EmitRegexMethod(IndentedTextWriter writer, RegexMethod rm, s writer.WriteLine($" }}"); writer.WriteLine($" }}"); writer.WriteLine("}"); + return ImmutableArray.Empty; static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) { @@ -242,7 +276,6 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, { RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; - bool rtl = code.RightToLeft; bool hasTextInfo = false; // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later. @@ -254,11 +287,10 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, // Emit locals initialization writer.WriteLine("global::System.ReadOnlySpan runtextSpan = base.runtext;"); writer.WriteLine("int runtextpos = base.runtextpos;"); - if (rtl) - { - writer.WriteLine("int runtextbeg = base.runtextbeg;"); - } - writer.WriteLine($"int runtextend = base.runtextend;{AdditionalDeclarationsPlaceholder}"); // placeholder at the end of a line so the generated indents line up + writer.Write($"int runtextend = base.runtextend;"); + writer.Flush(); + int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length; + int additionalDeclarationsIndent = writer.Indent; writer.WriteLine(); // Generate length check. If the input isn't long enough to possibly match, fail quickly. 
@@ -266,19 +298,12 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, // especially since we want the "return false" code regardless. int minRequiredLength = rm.Code.Tree.MinRequiredLength; Debug.Assert(minRequiredLength >= 0); - string clause = !rtl ? - minRequiredLength switch - { - 0 => "if (runtextpos <= runtextend)", - 1 => "if (runtextpos < runtextend)", - _ => $"if (runtextpos < runtextend - {minRequiredLength - 1})" - } : - minRequiredLength switch - { - 0 => "if (runtextpos >= runtextbeg)", - 1 => "if (runtextpos > runtextbeg)", - _ => $"if (runtextpos - {minRequiredLength - 1} > runtextbeg)" - }; + string clause = minRequiredLength switch + { + 0 => "if (runtextpos <= runtextend)", + 1 => "if (runtextpos < runtextend)", + _ => $"if (runtextpos < runtextend - {minRequiredLength - 1})" + }; using (EmitBlock(writer, clause)) { // Emit any anchors. @@ -299,11 +324,6 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix); break; - case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: - Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - EmitIndexOf_RightToLeft(code.FindOptimizations.LeadingCaseSensitivePrefix); - break; - case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: @@ -312,12 +332,6 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, EmitFixedSet_LeftToRight(); break; - case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: - case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: - Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - EmitFixedSet_RightToLeft(); - break; - default: Debug.Fail($"Unexpected mode: 
{code.FindOptimizations.FindMode}"); goto case FindNextStartingPositionMode.NoSearch; @@ -332,11 +346,11 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, writer.WriteLine("// No match"); writer.WriteLine("ReturnFalse:"); - writer.WriteLine(!rm.Code.RightToLeft ? "base.runtextpos = runtextend;" : "base.runtextpos = runtextbeg;"); + writer.WriteLine("base.runtextpos = runtextend;"); writer.WriteLine("return false;"); // We're done. Patch up any additional declarations. - ReplaceAdditionalDeclarations(additionalDeclarations, writer); + ReplaceAdditionalDeclarations(writer, additionalDeclarations, additionalDeclarationsPosition, additionalDeclarationsIndent); return; // Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further @@ -346,85 +360,40 @@ bool EmitAnchors() // Generate anchor checks. if ((code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) { - // TODO: Interpreted and Compiled differ in various places as to whether they update positions, as do LTR vs RTL. Determine why. switch (code.FindOptimizations.LeadingAnchor) { case RegexPrefixAnalyzer.Beginning: writer.WriteLine("// Beginning \\A anchor"); - if (!rtl) - { - using (EmitBlock(writer, "if (runtextpos > runtextbeg)")) - { - writer.WriteLine("goto ReturnFalse;"); - } - } - else + using (EmitBlock(writer, "if (runtextpos > runtextbeg)")) { - // TODO: RegexOptions.Compiled doesn't ever return false here. Instead it updates the position. Why? 
- using (EmitBlock(writer, "if (runtextpos > runtextbeg)")) - { - writer.WriteLine("base.runtextpos = runtextbeg;"); - } + writer.WriteLine("goto ReturnFalse;"); } writer.WriteLine("return true;"); return true; case RegexPrefixAnalyzer.Start: writer.WriteLine("// Start \\G anchor"); - if (!rtl) - { - using (EmitBlock(writer, "if (runtextpos > runtextstart)")) - { - writer.WriteLine("goto ReturnFalse;"); - } - } - else + using (EmitBlock(writer, "if (runtextpos > runtextstart)")) { - // TODO: RegexOptions.Compiled doesn't ever return false here. Instead it updates the position. Why? - using (EmitBlock(writer, "if (runtextpos < runtextstart)")) - { - writer.WriteLine("goto ReturnFalse;"); - } + writer.WriteLine("goto ReturnFalse;"); } writer.WriteLine("return true;"); return true; case RegexPrefixAnalyzer.EndZ: - // TODO: Why are the LTR and RTL cases inconsistent here with RegexOptions.Compiled? writer.WriteLine("// End \\Z anchor"); - if (!rtl) - { - using (EmitBlock(writer, "if (runtextpos < runtextend - 1)")) - { - writer.WriteLine("base.runtextpos = runtextend - 1;"); - } - } - else + using (EmitBlock(writer, "if (runtextpos < runtextend - 1)")) { - // TODO: This differs subtly between interpreted and compiled. Why? 
- using (EmitBlock(writer, "if (runtextpos < runtextend - 1 || (runtextpos == runtextend - 1 && runtextSpan[runtextpos] != '\\n'))")) - { - writer.WriteLine("goto ReturnFalse;"); - } + writer.WriteLine("base.runtextpos = runtextend - 1;"); } writer.WriteLine("return true;"); return true; case RegexPrefixAnalyzer.End: writer.WriteLine("// End \\z anchor"); - if (!rtl) - { - using (EmitBlock(writer, "if (runtextpos < runtextend)")) - { - writer.WriteLine("base.runtextpos = runtextend;"); - } - } - else + using (EmitBlock(writer, "if (runtextpos < runtextend)")) { - using (EmitBlock(writer, "if (runtextpos < runtextend)")) - { - writer.WriteLine("goto ReturnFalse;"); - } + writer.WriteLine("base.runtextpos = runtextend;"); } writer.WriteLine("return true;"); return true; @@ -434,7 +403,6 @@ bool EmitAnchors() // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike // the other anchors, which all skip all subsequent processing if found, with BOL we just use it // to boost our position to the next line, and then continue normally with any searches. - Debug.Assert(!rtl, "RightToLeft isn't implemented and should have been filtered out previously"); writer.WriteLine("// Beginning-of-line anchor"); using (EmitBlock(writer, "if (runtextpos > runtextbeg && runtextSpan[runtextpos - 1] != '\\n')")) { @@ -464,46 +432,6 @@ void EmitIndexOf_LeftToRight(string prefix) writer.WriteLine("}"); } - // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern. 
- void EmitIndexOf_RightToLeft(string prefix) - { - writer.WriteLine($"int i = global::System.MemoryExtensions.LastIndexOf(runtextSpan.Slice(runtextbeg, runtextpos - runtextbeg), {Literal(prefix)});"); - writer.WriteLine("if (i >= 0)"); - writer.WriteLine("{"); - writer.WriteLine($" base.runtextpos = runtextbeg + i + {prefix.Length};"); - writer.WriteLine(" return true;"); - writer.WriteLine("}"); - } - - // Emits a right-to-left search for a set at a fixed position from the start of the pattern. - // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) - void EmitFixedSet_RightToLeft() - { - (char[]? Chars, string Set, int Distance, bool CaseInsensitive) set = code.FindOptimizations.FixedDistanceSets![0]; - Debug.Assert(set.Distance == 0); - - if (set.Chars is { Length: 1 } && !set.CaseInsensitive) - { - writer.WriteLine($"int i = global::System.MemoryExtensions.LastIndexOf(runtextSpan.Slice(runtextbeg, runtextpos - runtextbeg), {Literal(set.Chars[0])});"); - writer.WriteLine("if (i >= 0)"); - writer.WriteLine("{"); - writer.WriteLine(" base.runtextpos = runtextbeg + i + 1;"); - writer.WriteLine(" return true;"); - writer.WriteLine("}"); - } - else - { - using (EmitBlock(writer, "for (int i = runtextpos - 1; i >= runtextbeg; i--)")) - { - using (EmitBlock(writer, $"if ({MatchCharacterClass(hasTextInfo, options, "runtextSpan[i]", set.Set, set.CaseInsensitive, additionalDeclarations)})")) - { - writer.WriteLine("base.runtextpos = i + 1;"); - writer.WriteLine("return true;"); - } - } - } - } - // Emits a left-to-right search for a set at a fixed position from the start of the pattern, // and potentially other sets at other fixed positions in the pattern. 
void EmitFixedSet_LeftToRight() @@ -629,10 +557,8 @@ static void EmitTextInfo(IndentedTextWriter writer, ref bool hasTextInfo, RegexM bool needsCulture = rm.Code.FindOptimizations.FindMode switch { FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or - FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive or FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or - FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or - FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive => true, + FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive => true, _ when rm.Code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), @@ -651,64 +577,32 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or /// Emits the body of the Go override. private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id) { - Debug.Assert(rm.Code.Tree.Root.Type == RegexNode.Capture); - - if ((rm.Options & RegexOptions.NonBacktracking) != 0) - { - EmitNonBacktrackingGo(writer, rm, id); - return; - } - RegexNode root = rm.Code.Tree.Root; - if (!ExceedsMaxDepthForSimpleCodeGeneration(root) && - root.Child(0).SupportsSimplifiedCodeGenerationImplementation() && - (((RegexOptions)root.Options) & RegexOptions.RightToLeft) == 0) - { - EmitSimplifiedGo(writer, rm, id); - return; - } - - EmitCompleteGo(writer, rm, id); - - // Deep RegexNode trees used with the simplified code generator can result in - // emitting C# code that exceeds C# compiler limitations, leading to "CS8078: An - // expression is too long or complex to compile". Place an artificial limit on - // max tree depth in order to mitigate such issues. 
- static bool ExceedsMaxDepthForSimpleCodeGeneration(RegexNode node, int maxDepth = 30) - { - if (maxDepth <= 0) - { - return true; - } - - int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) - { - if (ExceedsMaxDepthForSimpleCodeGeneration(node.Child(i), maxDepth - 1)) - { - return true; - } - } - - return false; - } - } - - /// Emits the body of a Go method supporting RegexOptions.NonBacktracking. - private static void EmitNonBacktrackingGo(IndentedTextWriter writer, RegexMethod rm, string id) - { - // TODO: Implement this and remove SupportsCustomCodeGeneration. - } + // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled + // version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via + // RegexWriter; the interpreter would then sit in a loop processing those opcodes, and the RegexCompiler iterated through the + // opcodes generating code for each equivalent to what the interpreter would do albeit with some decisions made at compile-time + // rather than at run-time. This approach, however, led to complicated code that wasn't pay-for-play (e.g. a big backtracking + // jump table that all compilations went through even if there was no backtracking), that didn't factor in the shape of the + // tree (e.g. it's difficult to add optimizations based on interactions between nodes in the graph), and that didn't read well + // when decompiled from IL to C# or when directly emitted as C# as part of a source generator. + // + // This implementation is instead based on directly walking the RegexNode tree and outputting code for each node in the graph. + // A dedicated function for each kind of RegexNode emits the code necessary to handle that node's processing, including recursively + // calling the relevant function for any of its children nodes. 
Backtracking is handled not via a giant jump table, but instead + // by emitting direct jumps to each backtracking construct. This is achieved by having all match failures jump to a "done" + // label that can be changed by a previous emitter, e.g. before EmitLoop returns, it ensures that "doneLabel" is set to the + // label that code should jump back to when backtracking. That way, a subsequent EmitXx function doesn't need to know exactly + // where to jump: it simply always jumps to "doneLabel" on match failure, and "doneLabel" is always configured to point to + // the right location. In an expression without backtracking, or before any backtracking constructs have been encountered, + // "doneLabel" is simply the final return location from the Go method that will undo any captures and exit, signaling to + // the calling scan loop that nothing was matched. - /// Emits the body of a simplified Go implementation that's possible when there's minimal backtracking required by the expression. - private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, string id) - { // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated // code with other costs, like the (small) overhead of slicing to create the temp span to iterate. const int MaxUnrollSize = 16; RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; - bool rtl = code.RightToLeft; bool hasTimeout = false; // Helper to define names. 
Names start unadorned, but as soon as there's repetition, @@ -760,9 +654,13 @@ string ReserveName(string prefix) writer.WriteLine("string runtext = base.runtext!;"); writer.WriteLine("int runtextpos = base.runtextpos;"); writer.WriteLine("int runtextend = base.runtextend;"); - writer.WriteLine($"int original_runtextpos = runtextpos;{AdditionalDeclarationsPlaceholder}"); // placeholder at the end of a line so the generated indents line up - writer.WriteLine("int runstackpos = 0;"); + writer.WriteLine($"int original_runtextpos = runtextpos;"); hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); + writer.Write("int runstackpos = 0;"); + writer.Flush(); + int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length; + int additionalDeclarationsIndent = writer.Indent; + writer.WriteLine(); // TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo; // only if the whole expression or any subportion is ignoring case, and we're not using invariant bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm); @@ -804,7 +702,7 @@ string ReserveName(string prefix) } // We're done. Patch up any additional declarations. - ReplaceAdditionalDeclarations(additionalDeclarations, writer); + ReplaceAdditionalDeclarations(writer, additionalDeclarations, additionalDeclarationsPosition, additionalDeclarationsIndent); return; static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0; @@ -1018,9 +916,10 @@ void EmitAllBranches() // construct is responsible for unwinding back to its starting crawl position. If // it eventually ends up failing, that failure will result in jumping to the next branch // of the alternation, which will again dutifully unwind the remaining captures until - // what they were at the start of the alternation. + // what they were at the start of the alternation. Of course, if there are no captures + // anywhere in the regex, we don't have to do any of that. string? 
startingCrawlPos = null; - if ((node.Options & RegexNode.HasCapturesFlag) != 0 || !isAtomic) + if (expressionHasCaptures && ((node.Options & RegexNode.HasCapturesFlag) != 0 || !isAtomic)) { startingCrawlPos = ReserveName("alternation_starting_crawlpos"); additionalDeclarations.Add($"int {startingCrawlPos} = 0;"); @@ -1070,7 +969,10 @@ void EmitAllBranches() { EmitRunstackResizeIfNeeded(2); writer.WriteLine($"{RunstackPush()} = {i};"); - writer.WriteLine($"{RunstackPush()} = {startingCrawlPos};"); + if (startingCrawlPos is not null) + { + writer.WriteLine($"{RunstackPush()} = {startingCrawlPos};"); + } writer.WriteLine($"{RunstackPush()} = {startingRunTextPos};"); } labelMap[i] = doneLabel; @@ -1107,13 +1009,20 @@ void EmitAllBranches() // "doneLabel" to the label for this section. Thus, we only need to emit it if // something can backtrack to us, which can't happen if we're inside of an atomic // node. Thus, emit the backtracking section only if we're non-atomic. - if (!isAtomic) + if (isAtomic) + { + doneLabel = originalDoneLabel; + } + else { doneLabel = backtrackLabel; MarkLabel(backtrackLabel, emitSemicolon: false); writer.WriteLine($"{startingRunTextPos} = {RunstackPop()};"); - writer.WriteLine($"{startingCrawlPos} = {RunstackPop()};"); + if (startingCrawlPos is not null) + { + writer.WriteLine($"{startingCrawlPos} = {RunstackPop()};"); + } using (EmitBlock(writer, $"switch ({RunstackPop()})")) { for (int i = 0; i < labelMap.Length; i++) @@ -1177,6 +1086,8 @@ void EmitBackreference(RegexNode node) // Emits the code for an if(backreference)-then-else conditional. void EmitBackreferenceConditional(RegexNode node) { + bool isAtomic = node.IsAtomicByParent(); + // We're branching in a complicated fashion. Make sure textSpanPos is 0. 
TransferTextSpanPosToRunTextPos(); @@ -1247,43 +1158,53 @@ void EmitBackreferenceConditional(RegexNode node) } } - // If either the yes branch or the no branch contained backtracking, subsequent expressions - // might try to backtrack to here, so output a backtracking map based on resumeAt. - if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel) + if (isAtomic) { - // Skip the backtracking section. - writer.WriteLine($"goto {endRef};"); - writer.WriteLine(); + doneLabel = originalDoneLabel; + } + else + { + // If either the yes branch or the no branch contained backtracking, subsequent expressions + // might try to backtrack to here, so output a backtracking map based on resumeAt. + if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel) + { + // Skip the backtracking section. + writer.WriteLine($"goto {endRef};"); + writer.WriteLine(); - string backtrack = ReserveName("ConditionalBackreferenceBacktrack"); - doneLabel = backtrack; - MarkLabel(backtrack); + string backtrack = ReserveName("ConditionalBackreferenceBacktrack"); + doneLabel = backtrack; + MarkLabel(backtrack); - writer.WriteLine($"{resumeAt} = {RunstackPop()};"); + writer.WriteLine($"{resumeAt} = {RunstackPop()};"); - using (EmitBlock(writer, $"switch ({resumeAt})")) - { - if (postIfDoneLabel != originalDoneLabel) + using (EmitBlock(writer, $"switch ({resumeAt})")) { - writer.WriteLine($"case 0: goto {postIfDoneLabel};"); - } + if (postIfDoneLabel != originalDoneLabel) + { + writer.WriteLine($"case 0: goto {postIfDoneLabel};"); + } - if (postElseDoneLabel != originalDoneLabel) - { - writer.WriteLine($"case 1: goto {postElseDoneLabel};"); - } + if (postElseDoneLabel != originalDoneLabel) + { + writer.WriteLine($"case 1: goto {postElseDoneLabel};"); + } - writer.WriteLine($"default: goto {originalDoneLabel};"); + writer.WriteLine($"default: goto {originalDoneLabel};"); + } } } if (postIfDoneLabel != originalDoneLabel || hasNo) { MarkLabel(endRef); 
- if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel) + if (!isAtomic) { - EmitRunstackResizeIfNeeded(1); - writer.WriteLine($"{RunstackPush()} = {resumeAt};"); + if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel) + { + EmitRunstackResizeIfNeeded(1); + writer.WriteLine($"{RunstackPush()} = {resumeAt};"); + } } } } @@ -1291,6 +1212,8 @@ void EmitBackreferenceConditional(RegexNode node) // Emits the code for an if(expression)-then-else conditional. void EmitExpressionConditional(RegexNode node) { + bool isAtomic = node.IsAtomicByParent(); + // We're branching in a complicated fashion. Make sure textSpanPos is 0. TransferTextSpanPosToRunTextPos(); @@ -1335,7 +1258,10 @@ void EmitExpressionConditional(RegexNode node) string postConditionalDoneLabel = doneLabel; string resumeAt = ReserveName("conditionalexpression_resumeAt"); - additionalDeclarations.Add($"int {resumeAt} = 0;"); + if (!isAtomic) + { + additionalDeclarations.Add($"int {resumeAt} = 0;"); + } // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch. 
// Since the "yes" branch may have a different execution path than the "no" branch or the lack of @@ -1345,7 +1271,7 @@ void EmitExpressionConditional(RegexNode node) EmitNode(yesBranch); TransferTextSpanPosToRunTextPos(); // ensure all subsequent code sees the same textSpanPos value by setting it to 0 string postYesDoneLabel = doneLabel; - if (postYesDoneLabel != originalDoneLabel) + if (!isAtomic && postYesDoneLabel != originalDoneLabel) { writer.WriteLine($"{resumeAt} = 0;"); } @@ -1373,7 +1299,7 @@ void EmitExpressionConditional(RegexNode node) EmitNode(noBranch); TransferTextSpanPosToRunTextPos(); // ensure all subsequent code sees the same textSpanPos value by setting it to 0 postNoDoneLabel = doneLabel; - if (postNoDoneLabel != originalDoneLabel) + if (!isAtomic && postNoDoneLabel != originalDoneLabel) { writer.WriteLine($"{resumeAt} = 1;"); } @@ -1383,42 +1309,49 @@ void EmitExpressionConditional(RegexNode node) // There's only a yes branch. If it's going to cause us to output a backtracking // label but code may not end up taking the yes branch path, we need to emit a resumeAt // that will cause the backtracking to immediately pass through this node. - if (postYesDoneLabel != originalDoneLabel) + if (!isAtomic && postYesDoneLabel != originalDoneLabel) { writer.WriteLine($"{resumeAt} = 2;"); } } - if (postYesDoneLabel != postConditionalDoneLabel || postNoDoneLabel != postConditionalDoneLabel) + if (isAtomic) { - // Skip the backtracking section. - writer.WriteLine($"goto {end};"); - writer.WriteLine(); + doneLabel = originalDoneLabel; + } + else + { + if (postYesDoneLabel != postConditionalDoneLabel || postNoDoneLabel != postConditionalDoneLabel) + { + // Skip the backtracking section. 
+ writer.WriteLine($"goto {end};"); + writer.WriteLine(); - string backtrack = ReserveName("ConditionalExpressionBacktrack"); - doneLabel = backtrack; - MarkLabel(backtrack); + string backtrack = ReserveName("ConditionalExpressionBacktrack"); + doneLabel = backtrack; + MarkLabel(backtrack); - using (EmitBlock(writer, $"switch ({RunstackPop()})")) - { - if (postYesDoneLabel != postConditionalDoneLabel) + using (EmitBlock(writer, $"switch ({RunstackPop()})")) { - writer.WriteLine($"case 0: goto {postYesDoneLabel};"); - } + if (postYesDoneLabel != postConditionalDoneLabel) + { + writer.WriteLine($"case 0: goto {postYesDoneLabel};"); + } - if (postNoDoneLabel != postConditionalDoneLabel && postNoDoneLabel != originalDoneLabel) - { - writer.WriteLine($"case 1: goto {postNoDoneLabel};"); - } + if (postNoDoneLabel != postConditionalDoneLabel && postNoDoneLabel != originalDoneLabel) + { + writer.WriteLine($"case 1: goto {postNoDoneLabel};"); + } - writer.WriteLine($"default: goto {postConditionalDoneLabel};"); + writer.WriteLine($"default: goto {postConditionalDoneLabel};"); + } } - } - if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel) - { - EmitRunstackResizeIfNeeded(1); - writer.WriteLine($"{RunstackPush()} = {resumeAt};"); + if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel) + { + EmitRunstackResizeIfNeeded(1); + writer.WriteLine($"{RunstackPush()} = {resumeAt};"); + } } MarkLabel(end); @@ -1430,6 +1363,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) Debug.Assert(node.Type == RegexNode.Capture); int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); int uncapnum = RegexParser.MapCaptureNumber(node.N, rm.Code.Caps); + bool isAtomic = node.IsAtomicByParent(); TransferTextSpanPosToRunTextPos(); string startingRunTextPos = ReserveName("capture_starting_runtextpos"); @@ -1463,7 +1397,7 @@ void EmitCapture(RegexNode node, RegexNode? 
subsequent = null) writer.WriteLine($"base.TransferCapture({capnum}, {uncapnum}, {startingRunTextPos}, runtextpos);"); } - if (childBacktracks || node.IsInLoop()) + if (!isAtomic && (childBacktracks || node.IsInLoop())) { writer.WriteLine(); @@ -1490,6 +1424,10 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) doneLabel = backtrack; MarkLabel(end); } + else + { + doneLabel = originalDoneLabel; + } } // Emits code to unwind the capture stack until the crawl position specified in the provided local. @@ -1720,11 +1658,16 @@ void EmitUpdateBumpalong() writer.WriteLine("base.runtextpos = runtextpos;"); } + // Emits code for a concatenation void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired) { + // Emit the code for each child one after the other. int childCount = node.ChildCount(); for (int i = 0; i < childCount; i++) { + // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence + // and then skip the individual length checks for each. We also want to minimize the repetition of if blocks, + // and so we try to emit a series of clauses all part of the same if block rather than one if block per child. 
if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) { bool wroteClauses = true; @@ -1753,7 +1696,6 @@ void WriteSingleCharChild(RegexNode child) if (child.Type is RegexNode.One or RegexNode.Notone or RegexNode.Set) { WriteSingleCharChild(child); - writer.Write($" /* {DescribeNode(child)} */"); } else if (child.Type is RegexNode.Oneloop or RegexNode.Onelazy or RegexNode.Oneloopatomic or RegexNode.Setloop or RegexNode.Setlazy or RegexNode.Setloopatomic or @@ -1764,10 +1706,6 @@ RegexNode.Notoneloop or RegexNode.Notonelazy or RegexNode.Notoneloopatomic && for (int c = 0; c < child.M; c++) { WriteSingleCharChild(child); - if (c == 0) - { - writer.Write($" /* {DescribeNode(child)} */"); - } } } else @@ -1795,11 +1733,10 @@ RegexNode.Notoneloop or RegexNode.Notonelazy or RegexNode.Notoneloopatomic && } i--; + continue; } - else - { - EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired); - } + + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired); } } @@ -2291,10 +2228,11 @@ void EmitLazy(RegexNode node) int minIterations = node.M; int maxIterations = node.N; string originalDoneLabel = doneLabel; + bool isAtomic = node.IsAtomicByParent(); // If this is actually an atomic lazy loop, we need to output just the minimum number of iterations, // as nothing will backtrack into the lazy loop to get it progress further. 
- if (node.IsAtomicByParent()) + if (isAtomic) { switch (minIterations) { @@ -2433,43 +2371,46 @@ void EmitLazy(RegexNode node) MarkLabel(endLoop); - // Store the capture's state and skip the backtracking section - EmitRunstackResizeIfNeeded(3); - writer.WriteLine($"{RunstackPush()} = {startingRunTextPos};"); - writer.WriteLine($"{RunstackPush()} = {iterationCount};"); - writer.WriteLine($"{RunstackPush()} = {sawEmpty};"); - string skipBacktrack = ReserveName("SkipBacktrack"); - writer.WriteLine($"goto {skipBacktrack};"); - writer.WriteLine(); + if (!isAtomic) + { + // Store the capture's state and skip the backtracking section + EmitRunstackResizeIfNeeded(3); + writer.WriteLine($"{RunstackPush()} = {startingRunTextPos};"); + writer.WriteLine($"{RunstackPush()} = {iterationCount};"); + writer.WriteLine($"{RunstackPush()} = {sawEmpty};"); + string skipBacktrack = ReserveName("SkipBacktrack"); + writer.WriteLine($"goto {skipBacktrack};"); + writer.WriteLine(); - // Emit a backtracking section that restores the capture's state and then jumps to the previous done label - string backtrack = ReserveName($"LazyLoopBacktrack"); - MarkLabel(backtrack); + // Emit a backtracking section that restores the capture's state and then jumps to the previous done label + string backtrack = ReserveName($"LazyLoopBacktrack"); + MarkLabel(backtrack); - writer.WriteLine($"{sawEmpty} = {RunstackPop()};"); - writer.WriteLine($"{iterationCount} = {RunstackPop()};"); - writer.WriteLine($"{startingRunTextPos} = {RunstackPop()};"); + writer.WriteLine($"{sawEmpty} = {RunstackPop()};"); + writer.WriteLine($"{iterationCount} = {RunstackPop()};"); + writer.WriteLine($"{startingRunTextPos} = {RunstackPop()};"); - if (maxIterations == int.MaxValue) - { - using (EmitBlock(writer, $"if ({sawEmpty} == 0)")) + if (maxIterations == int.MaxValue) { - writer.WriteLine($"goto {body};"); + using (EmitBlock(writer, $"if ({sawEmpty} == 0)")) + { + writer.WriteLine($"goto {body};"); + } } - } - else - { - using 
(EmitBlock(writer, $"if ({iterationCount} < {maxIterations} && {sawEmpty} == 0)")) + else { - writer.WriteLine($"goto {body};"); + using (EmitBlock(writer, $"if ({iterationCount} < {maxIterations} && {sawEmpty} == 0)")) + { + writer.WriteLine($"goto {body};"); + } } - } - writer.WriteLine($"goto {doneLabel};"); - writer.WriteLine(); + writer.WriteLine($"goto {doneLabel};"); + writer.WriteLine(); - doneLabel = backtrack; - MarkLabel(skipBacktrack); + doneLabel = backtrack; + MarkLabel(skipBacktrack); + } } // Emits the code to handle a loop (repeater) with a fixed number of iterations. @@ -2705,6 +2646,7 @@ void EmitLoop(RegexNode node) Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}"); int minIterations = node.M; int maxIterations = node.N; + bool isAtomic = node.IsAtomicByParent(); // We might loop any number of times. In order to ensure this loop and subsequent code sees textSpanPos // the same regardless, we always need it to contain the same value, and the easiest such value is 0. 
@@ -2814,1235 +2756,72 @@ void EmitLoop(RegexNode node) } } - if (childBacktracks) + if (isAtomic) { - writer.WriteLine($"goto {endLoop};"); - writer.WriteLine(); - - string backtrack = ReserveName("LoopBacktrack"); - MarkLabel(backtrack); - using (EmitBlock(writer, $"if ({iterationCount} == 0)")) - { - writer.WriteLine($"goto {originalDoneLabel};"); - } - writer.WriteLine($"goto {doneLabel};"); - doneLabel = backtrack; + doneLabel = originalDoneLabel; + MarkLabel(endLoop); } - - MarkLabel(endLoop); - - - - if (node.IsInLoop()) - { - writer.WriteLine(); - - // Store the capture's state - EmitRunstackResizeIfNeeded(3); - writer.WriteLine($"{RunstackPush()} = {startingRunTextPos};"); - writer.WriteLine($"{RunstackPush()} = {iterationCount};"); - - // Skip past the backtracking section - string end = ReserveName("SkipBacktrack"); - writer.WriteLine($"goto {end};"); - writer.WriteLine(); - - // Emit a backtracking section that restores the capture's state and then jumps to the previous done label - string backtrack = ReserveName("LoopBacktrack"); - MarkLabel(backtrack); - writer.WriteLine($"{iterationCount} = {RunstackPop()};"); - writer.WriteLine($"{startingRunTextPos} = {RunstackPop()};"); - - writer.WriteLine($"goto {doneLabel};"); - writer.WriteLine(); - - doneLabel = backtrack; - MarkLabel(end); - } - } - - void EmitRunstackResizeIfNeeded(int count) - { - string subCount = count > 1 ? $" - {count - 1}" : ""; - using (EmitBlock(writer, $"if (runstackpos >= base.runstack!.Length{subCount})")) - { - writer.WriteLine("global::System.Array.Resize(ref base.runstack, base.runstack.Length * 2);"); - } - } - - string RunstackPush() => "base.runstack[runstackpos++]"; - string RunstackPop() => "base.runstack![--runstackpos]"; - } - - /// Emits the body of a complete Go implementation that fully supports backtracking. 
- private static void EmitCompleteGo(IndentedTextWriter writer, RegexMethod rm, string id) - { - const int Stackpop = 0; // pop one - const int Stackpop2 = 1; // pop two - const int Capback = 3; // uncapture - const int Capback2 = 4; // uncapture 2 - const int Branchmarkback2 = 5; // back2 part of branchmark - const int Lazybranchmarkback2 = 6; // back2 part of lazybranchmark - const int Branchcountback2 = 7; // back2 part of branchcount - const int Lazybranchcountback2 = 8; // back2 part of lazybranchcount - const int Forejumpback = 9; // back part of forejump - const int Uniquecount = 10; - const string Backtrack = "Backtrack"; // label for backtracking - - int[] codes = rm.Code.Codes; - RegexOptions options = rm.Options; - - int labelCounter = 0; - string DefineLabel(string prefix = "L") => $"{prefix}{labelCounter++}"; - void MarkLabel(string label) => writer.WriteLine($"{label}:"); - - var labels = new string?[codes.Length]; // a label for every operation in _codes - BacktrackNote[]? notes = null; // a list of the backtracking states to be generated - int noteCount = 0; // true count of _notes (allocation grows exponentially) - - int currentOpcode = 0; // the current opcode being processed - int currentCodePos = 0; // the current code being translated - int currentBacktrackNote = 0; // the current backtrack-note being translated - - // special code fragments - var uniqueNote = new int[Uniquecount]; // notes indices for code that should be emitted <= once - var forwardJumpsThroughSwitch = new int[codes.Length]; // indices for forward-jumps-through-switch (for allocations) - - // Generates the forward logic corresponding directly to the regex codes. - // In the absence of backtracking, this is all we would need. 
- writer.WriteLine("string runtext = base.runtext!;"); - writer.WriteLine("int runtextbeg = base.runtextbeg;"); - writer.WriteLine("int runtextend = base.runtextend;"); - writer.WriteLine("int runtextpos = base.runtextpos;"); - writer.WriteLine("int[] runtrack = base.runtrack!;"); - writer.WriteLine("int runtrackpos = base.runtrackpos;"); - writer.WriteLine("int[] runstack = base.runstack!;"); - writer.WriteLine("int runstackpos = base.runstackpos;"); - writer.WriteLine("int tmp1, tmp2, ch;"); - bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); - bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm); - writer.WriteLine(); - - uniqueNote.AsSpan().Fill(-1); - for (int codepos = 0; codepos < codes.Length; codepos += RegexCode.OpcodeSize(codes[codepos])) - { - forwardJumpsThroughSwitch[codepos] = -1; - labels[codepos] = DefineLabel(); - } - - currentBacktrackNote = -1; - for (int codepos = 0; codepos < codes.Length; codepos += RegexCode.OpcodeSize(codes[codepos])) - { - currentCodePos = codepos; - currentOpcode = codes[codepos]; - EmitOneCode(labels[codepos]); - writer.WriteLine(); - } - - // Generate the backtracking switch jump table that allows us to simulate a stack of addresses, - // and contains the calls that expand the tracking and the grouping stack when they get too full. - MarkLabel(Backtrack); - - // (Equivalent of EnsureStorage, but written to avoid unnecessary local spilling.) 
- writer.WriteLine("int limit = base.runtrackcount * 4;"); - using (EmitBlock(writer, "if (runstackpos < limit)")) - { - writer.WriteLine("base.runstackpos = runstackpos;"); - writer.WriteLine("base.DoubleStack(); // might change runstackpos and runstack"); - writer.WriteLine("runstackpos = base.runstackpos;"); - writer.WriteLine("runstack = base.runstack!;"); - } - using (EmitBlock(writer, "if (runtrackpos < limit)")) - { - writer.WriteLine("base.runtrackpos = runtrackpos;"); - writer.WriteLine("base.DoubleTrack(); // might change runtrackpos and runtrack"); - writer.WriteLine("runtrackpos = base.runtrackpos;"); - writer.WriteLine("runtrack = base.runtrack!;"); - } - writer.WriteLine(); - using (EmitBlock(writer, "switch (runtrack[runtrackpos++])")) - { - for (int i = 0; i < noteCount; i++) + else { - using (EmitBlock(writer, $"case {i}:")) + if (childBacktracks) { - Debug.Assert(notes is not null); - BacktrackNote n = notes[i]; - if (n.flags != 0) - { - currentCodePos = n.codepos; - currentBacktrackNote = i; - currentOpcode = codes[n.codepos] | n.flags; - EmitOneCode(null); // should always end in a goto - } - else - { - writer.WriteLine($"goto {n.label};"); - } - } - - writer.WriteLine(); - } - - using (EmitBlock(writer, "default:")) - { - writer.WriteLine("global::System.Diagnostics.Debug.Fail($\"Unexpected backtracking state {runtrack[runtrackpos - 1]}\");"); - writer.WriteLine("break;"); - } - } - - return; - - /// - /// The main translation function. It translates the logic for a single opcode at - /// the current position. The structure of this function exactly mirrors - /// the structure of the inner loop of RegexInterpreter.Go(). - /// - /// - /// Note that since we're generating code, we can collapse many cases that are - /// dealt with one-at-a-time in RegexIntepreter. We can also unroll loops that - /// iterate over constant strings or sets. - /// - void EmitOneCode(string? 
label) - { - writer.WriteLine($"// {SymbolDisplay.FormatLiteral(RegexCode.OpcodeDescription(currentCodePos, rm.Code.Codes, rm.Code.Strings), quote: false)}"); - - if (label is not null) - { - MarkLabel(label); - } - - // Before executing any Regex code in the unrolled loop, - // we try checking for the match timeout: - EmitTimeoutCheck(writer, hasTimeout); - - // Now generate the code for the Regex code saved in _regexopcode. - switch (currentOpcode) - { - case RegexCode.Stop: - writer.WriteLine("base.runtextpos = runtextpos;"); - writer.WriteLine("return;"); - break; - - case RegexCode.Nothing: - writer.WriteLine($"goto {Backtrack};"); - break; - - case RegexCode.UpdateBumpalong: - // UpdateBumpalong should only exist in the code stream at such a point where the root - // of the backtracking stack contains the runtextpos from the start of this Go call. Replace - // that tracking value with the current runtextpos value. - writer.WriteLine("runtrack[^1] = runtextpos;"); - break; - - case RegexCode.Goto: - Goto(Operand(0)); - break; - - case RegexCode.Testref: - using (EmitBlock(writer, $"if (!base.IsMatched({Operand(0)}))")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Lazybranch: - PushTrack("runtextpos"); - Track(); - break; - - case RegexCode.Lazybranch | RegexCode.Back: - writer.WriteLine($"runtextpos = {PopTrack()};"); - Goto(Operand(0)); - break; - - case RegexCode.Nullmark: - PushStack(-1); - TrackUnique(Stackpop); - break; - - case RegexCode.Setmark: - PushStack("runtextpos"); - TrackUnique(Stackpop); - break; - - case RegexCode.Nullmark | RegexCode.Back: - case RegexCode.Setmark | RegexCode.Back: - PopDiscardStack(); - writer.WriteLine($"goto {Backtrack};"); - break; - - case RegexCode.Getmark: - writer.WriteLine($"runtextpos = {PopStack()};"); - PushTrack("runtextpos"); - Track(); - break; - - case RegexCode.Getmark | RegexCode.Back: - PushStack(PopTrack()); - writer.WriteLine($"goto {Backtrack};"); - break; - - case 
RegexCode.Capturemark: - { - if (Operand(1) != -1) - { - using (EmitBlock(writer, $"if (!base.IsMatched({Operand(1)}))")) - { - writer.WriteLine($"goto {Backtrack};"); - } - } - - const string Stacked = "tmp1"; - writer.WriteLine($"{Stacked} = {PopStack()};"); - writer.WriteLine(Operand(1) != -1 ? - $"base.TransferCapture({Operand(0)}, {Operand(1)}, {Stacked}, runtextpos);" : - $"base.Capture({Operand(0)}, {Stacked}, runtextpos);"); - PushTrack(Stacked); - TrackUnique(Operand(0) != -1 && Operand(1) != -1 ? Capback2 : Capback); - } - break; - - case RegexCode.Capturemark | RegexCode.Back: - PushStack(PopTrack()); - writer.WriteLine("base.Uncapture();"); - if (Operand(0) != -1 && Operand(1) != -1) - { - writer.WriteLine("base.Uncapture();"); - } - writer.WriteLine($"goto {Backtrack};"); - break; - - case RegexCode.Branchmark: - { - const string Mark = "tmp1"; - writer.WriteLine($"{Mark} = {PopStack()}; // mark"); - PushTrack(Mark); - using (EmitBlock(writer, $"if (runtextpos != {Mark})")) - { - PushTrack("runtextpos"); - PushStack("runtextpos"); - Track(); - Goto(Operand(0)); - } - using (EmitBlock(writer, "else")) - { - TrackUnique2(Branchmarkback2); - } - } - break; - - case RegexCode.Branchmark | RegexCode.Back: - writer.WriteLine($"runtextpos = {PopTrack()};"); - PopDiscardStack(); - TrackUnique2(Branchmarkback2); // track spot 0 is already in place - Advance(); - break; - - case RegexCode.Branchmark | RegexCode.Back2: - PushStack(PopTrack()); - writer.WriteLine($"goto {Backtrack};"); - break; - - case RegexCode.Lazybranchmark: - { - const string Mark = "tmp1"; - writer.WriteLine($"{Mark} = {PopStack()}; // mark"); - PushTrack($"{Mark} != -1 ? 
{Mark} : runtextpos"); - using (EmitBlock(writer, $"if (runtextpos != {Mark})")) - { - PushTrack("runtextpos"); - Track(); - Advance(); - } - PushStack(Mark); - TrackUnique2(Lazybranchmarkback2); - } - break; - - case RegexCode.Lazybranchmark | RegexCode.Back: - writer.WriteLine($"runtextpos = {PopTrack()};"); - PushStack("runtextpos"); - TrackUnique2(Lazybranchmarkback2); - Goto(Operand(0)); - break; - - case RegexCode.Lazybranchmark | RegexCode.Back2: - writer.WriteLine($"{ReadyReplaceStack(0)} = {PopTrack()};"); - writer.WriteLine($"goto {Backtrack};"); - break; - - case RegexCode.Nullcount: - PushStack(-1); - PushStack(Operand(0)); - TrackUnique(Stackpop2); - break; - - case RegexCode.Setcount: - PushStack("runtextpos"); - PushStack(Operand(0)); - TrackUnique(Stackpop2); - break; - - case RegexCode.Nullcount | RegexCode.Back: - case RegexCode.Setcount | RegexCode.Back: - PopDiscardStack(2); - writer.WriteLine($"goto {Backtrack};"); - break; - - case RegexCode.Branchcount: - { - const string Count = "tmp1"; - const string Mark = "tmp2"; - writer.WriteLine($"{Count} = {PopStack()}; // count"); - writer.WriteLine($"{Mark} = {PopStack()}; // mark"); - PushTrack(Mark); - using (EmitBlock(writer, $"if ({Count} < ({Mark} == runtextpos ? 
0 : {Operand(1)}))")) - { - PushStack("runtextpos"); - PushStack($"{Count} + 1"); - Track(); - Goto(Operand(0)); - } - PushTrack(Count); - TrackUnique2(Branchcountback2); - } - break; - - case RegexCode.Branchcount | RegexCode.Back: - { - const string Count = "tmp1"; - writer.WriteLine($"{Count} = {PopStack()} - 1; // count"); - using (EmitBlock(writer, $"if ({Count} >= 0)")) - { - writer.WriteLine($"runtextpos = {PopStack()};"); - PushTrack(Count); - TrackUnique2(Branchcountback2); - Advance(); - } - writer.WriteLine($"{ReadyReplaceStack(0)} = {PopTrack()};"); - PushStack(Count); - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Branchcount | RegexCode.Back2: - { - const string Mark = "tmp1"; - writer.WriteLine($"{Mark} = {PopTrack()}; // mark"); - PushStack(PopTrack()); - PushStack(Mark); - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Lazybranchcount: - { - const string Count = "tmp1"; - writer.WriteLine($"{Count} = {PopStack()}; // count"); - PushTrack(PopStack()); // mark - using (EmitBlock(writer, $"if ({Count} < 0)")) - { - PushStack("runtextpos"); - PushStack($"{Count} + 1"); - TrackUnique2(Lazybranchcountback2); - Goto(Operand(0)); - } - PushTrack(Count); - PushTrack("runtextpos"); - Track(); - } - break; - - case RegexCode.Lazybranchcount | RegexCode.Back: - { - const string C = "tmp1"; - writer.WriteLine($"runtextpos = {PopTrack()};"); - writer.WriteLine($"{C} = {PopTrack()}; // c"); - using (EmitBlock(writer, $"if ({C} < {Operand(1)} && runtextpos != {TopTrack()})")) - { - PushStack("runtextpos"); - PushStack($"{C} + 1"); - TrackUnique2(Lazybranchcountback2); - Goto(Operand(0)); - } - PushStack(PopTrack()); - PushStack(C); - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Lazybranchcount | RegexCode.Back2: - writer.WriteLine($"{ReadyReplaceStack(1)} = {PopTrack()};"); - writer.WriteLine($"{ReadyReplaceStack(0)} = {TopStack()} - 1;"); - ReadyReplaceStack(0); - 
writer.WriteLine($"goto {Backtrack};"); - break; - - case RegexCode.Setjump: - PushStack("runtrack.Length - runtrackpos"); - PushStack("base.Crawlpos()"); - TrackUnique(Stackpop2); - break; - - case RegexCode.Setjump | RegexCode.Back: - PopDiscardStack(2); - writer.WriteLine($"goto {Backtrack};"); - break; - - case RegexCode.Backjump: - { - const string Stacked = "tmp1"; - writer.WriteLine($"{Stacked} = {PopStack()}; // stacked"); - writer.WriteLine($"runtrackpos = runtrack.Length - {PopStack()};"); - writer.WriteLine($"while (base.Crawlpos() != {Stacked}) base.Uncapture();"); - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Forejump: - { - const string Stacked = "tmp1"; - writer.WriteLine($"{Stacked} = {PopStack()}; // stacked"); - writer.WriteLine($"runtrackpos = runtrack.Length - {PopStack()};"); - PushTrack(Stacked); - TrackUnique(Forejumpback); - } - break; - - case RegexCode.Forejump | RegexCode.Back: - { - const string TrackedCrawlpos = "tmp1"; - writer.WriteLine($"{TrackedCrawlpos} = {PopTrack()}; // tracked crawlpos"); - writer.WriteLine($"while (base.Crawlpos() != {TrackedCrawlpos}) base.Uncapture();"); - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Bol: - using (EmitBlock(writer, $"if (runtextpos <= runtextbeg)")) - { - writer.WriteLine($"goto {labels[NextCodepos()]};"); - } - using (EmitBlock(writer, $"if ({Leftchar()} != '\\n')")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Eol: - using (EmitBlock(writer, $"if (runtextpos >= runtextend)")) - { - writer.WriteLine($"goto {labels[NextCodepos()]};"); - } - using (EmitBlock(writer, $"if ({Rightchar()} != '\\n')")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Boundary: - case RegexCode.NonBoundary: - using (EmitBlock(writer, $"if ({(Code() == RegexCode.Boundary ? "!" 
: "")}base.IsBoundary(runtextpos, runtextbeg, runtextend))")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.ECMABoundary: - case RegexCode.NonECMABoundary: - using (EmitBlock(writer, $"if ({(Code() == RegexCode.ECMABoundary ? "!" : "")}base.IsECMABoundary(runtextpos, runtextbeg, runtextend))")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Beginning: - using (EmitBlock(writer, $"if (runtextpos > runtextbeg)")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.Start: - using (EmitBlock(writer, $"if (runtextpos != runtextstart)")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.EndZ: - using (EmitBlock(writer, $"if (runtextpos < runtextend - 1)")) - { - writer.WriteLine($"goto {Backtrack};"); - } - using (EmitBlock(writer, $"if (runtextpos >= runtextend)")) - { - writer.WriteLine($"goto {labels[NextCodepos()]};"); - } - using (EmitBlock(writer, $"if ({Rightchar()} != '\\n')")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.End: - using (EmitBlock(writer, $"if (runtextpos < runtextend)")) - { - writer.WriteLine($"goto {Backtrack};"); - } - break; - - case RegexCode.One: - case RegexCode.Notone: - case RegexCode.Set: - case RegexCode.One | RegexCode.Rtl: - case RegexCode.Notone | RegexCode.Rtl: - case RegexCode.Set | RegexCode.Rtl: - case RegexCode.One | RegexCode.Ci: - case RegexCode.Notone | RegexCode.Ci: - case RegexCode.Set | RegexCode.Ci: - case RegexCode.One | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notone | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Set | RegexCode.Ci | RegexCode.Rtl: - { - string clause; - string expr; - if (!IsRightToLeft()) - { - clause = $"runtextpos >= runtextend || "; - expr = Rightcharnext(); - } - else - { - clause = $"runtextpos <= runtextbeg || "; - expr = Leftcharnext(); - } - - clause += Code() == RegexCode.Set ? 
- $"!{MatchCharacterClass(hasTextInfo, options, expr, rm.Code.Strings[Operand(0)], IsCaseInsensitive(), null)}" : - $"{ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive())} {(Code() == RegexCode.One ? "!=" : "==")} {Operand(0)}"; - - using (EmitBlock(writer, $"if ({clause})")) - { - writer.WriteLine($"goto {Backtrack};"); - } - } - break; - - case RegexCode.Multi: - case RegexCode.Multi | RegexCode.Ci: - { - string str = rm.Code.Strings[Operand(0)]; - Debug.Assert(str.Length != 0); - writer.WriteLine($"if (runtextend - runtextpos < {str.Length} ||"); - for (int i = 0; i < str.Length; i++) - { - writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos{(i == 0 ? "" : $" + {i}")}]", IsCaseInsensitive())} != {Literal(str[i])}"); - writer.WriteLine(i < str.Length - 1 ? " ||" : ")"); - } - using (EmitBlock(writer, null)) - { - writer.WriteLine($"goto {Backtrack};"); - } - EmitAdd(writer, "runtextpos", str.Length); - break; - } - - case RegexCode.Multi | RegexCode.Rtl: - case RegexCode.Multi | RegexCode.Ci | RegexCode.Rtl: - { - string str = rm.Code.Strings[Operand(0)]; - Debug.Assert(str.Length != 0); - writer.WriteLine($"if (runtextpos - runtextbeg < {str.Length} ||"); - for (int i = str.Length; i > 0;) - { - i--; - writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos - {str.Length - i}]", IsCaseInsensitive())} != {Literal(str[i])}"); - writer.WriteLine(i == 0 ? ")" : " ||"); - } - using (EmitBlock(writer, null)) - { - writer.WriteLine($"goto {Backtrack};"); - } - writer.WriteLine($"runtextpos -= {str.Length};"); - break; - } - - case RegexCode.Ref: - case RegexCode.Ref | RegexCode.Ci: - case RegexCode.Ref | RegexCode.Rtl: - case RegexCode.Ref | RegexCode.Ci | RegexCode.Rtl: - { - const string Length = "tmp1"; - const string Index = "tmp2"; - - using (EmitBlock(writer, $"if (!base.IsMatched({Operand(0)}))")) - { - writer.WriteLine($"goto {((options & RegexOptions.ECMAScript) != 0 ? 
AdvanceLabel() : Backtrack)};"); - } - - writer.WriteLine($"{Length} = base.MatchLength({Operand(0)}); // length"); - - using (EmitBlock(writer, !IsRightToLeft() ? $"if (runtextend - runtextpos < {Length})" : $"if (runtextpos - runtextbeg < {Length})")) - { - writer.WriteLine($"goto {Backtrack};"); - } - - if (!IsRightToLeft()) - { - writer.WriteLine($"{Index} = base.MatchIndex({Operand(0)}) + {Length}; // index"); - writer.WriteLine($"runtextpos += {Length};"); - } - else - { - writer.WriteLine($"{Index} = base.MatchIndex({Operand(0)}); // index"); - writer.WriteLine($"runtextpos -= {Length};"); - } - - using (EmitBlock(writer, "while (true)")) - { - using (EmitBlock(writer, $"if ({Length} <= 0)")) - { - writer.WriteLine($"goto {AdvanceLabel()};"); - } - - using (EmitBlock(writer, !IsRightToLeft() ? - $"if ({ToLowerIfNeeded(hasTextInfo, options, $"runtext[{Index} - {Length}]", IsCaseInsensitive())} != {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos - {Length}--]", IsCaseInsensitive())})" : - $"if ({ToLowerIfNeeded(hasTextInfo, options, $"runtext[{Index} + --{Length}]", IsCaseInsensitive())} != {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos + {Length}]", IsCaseInsensitive())})")) - { - writer.WriteLine($"break;"); - } - } - - writer.WriteLine($"goto {Backtrack};"); - break; - } - - case RegexCode.Onerep: - case RegexCode.Notonerep: - case RegexCode.Setrep: - case RegexCode.Onerep | RegexCode.Ci: - case RegexCode.Notonerep | RegexCode.Ci: - case RegexCode.Setrep | RegexCode.Ci: - { - int c = Operand(1); - if (c != 0) - { - using (EmitBlock(writer, $"if (runtextend - runtextpos < {c})")) - { - writer.WriteLine($"goto {Backtrack};"); - } - - using (EmitBlock(writer, $"for (int i = 0; i < {c}; i++)")) - { - string expr = "runtext[runtextpos + i]"; - if (Code() == RegexCode.Setrep) - { - EmitTimeoutCheck(writer, hasTimeout); - expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, rm.Code.Strings[Operand(0)], IsCaseInsensitive(), null)}"; - 
} - else - { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); - expr = $"{expr} {(Code() == RegexCode.Onerep ? "!=" : "==")} {Literal((char)Operand(0))}"; - } - - using (EmitBlock(writer, $"if ({expr})")) - { - writer.WriteLine($"goto {Backtrack};"); - } - } - EmitAdd(writer, "runtextpos", c); - } - } - break; - - case RegexCode.Onerep | RegexCode.Rtl: - case RegexCode.Notonerep | RegexCode.Rtl: - case RegexCode.Setrep | RegexCode.Rtl: - case RegexCode.Onerep | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notonerep | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Setrep | RegexCode.Ci | RegexCode.Rtl: - { - int c = Operand(1); - if (c != 0) - { - const string Length = "tmp1"; - - using (EmitBlock(writer, $"if (runtextpos - runtextbeg < {c})")) - { - writer.WriteLine($"goto {Backtrack};"); - } - writer.WriteLine($"runtextpos -= {c};"); - writer.WriteLine($"{Length} = {c}; // length"); - - string l1 = DefineLabel(); - MarkLabel(l1); - - string expr = $"runtext[runtextpos + --{Length}]"; - if (Code() == RegexCode.Setrep) - { - EmitTimeoutCheck(writer, hasTimeout); - using (EmitBlock(writer, $"if (!{MatchCharacterClass(hasTextInfo, options, expr, rm.Code.Strings[Operand(0)], IsCaseInsensitive(), null)})")) - { - writer.WriteLine($"goto {Backtrack};"); - } - } - else - { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); - string op = Code() == RegexCode.Onerep ? 
"!=" : "=="; - using (EmitBlock(writer, $"if ({expr} {op} {Literal((char)Operand(0))})")) - { - writer.WriteLine($"goto {Backtrack};"); - } - } - - using (EmitBlock(writer, $"if ({Length} > 0)")) - { - writer.WriteLine($"goto {l1};"); - } - } - break; - } - - case RegexCode.Oneloop: - case RegexCode.Notoneloop: - case RegexCode.Setloop: - case RegexCode.Oneloop | RegexCode.Rtl: - case RegexCode.Notoneloop | RegexCode.Rtl: - case RegexCode.Setloop | RegexCode.Rtl: - case RegexCode.Oneloop | RegexCode.Ci: - case RegexCode.Notoneloop | RegexCode.Ci: - case RegexCode.Setloop | RegexCode.Ci: - case RegexCode.Oneloop | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notoneloop | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Setloop | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Oneloopatomic: - case RegexCode.Notoneloopatomic: - case RegexCode.Setloopatomic: - case RegexCode.Oneloopatomic | RegexCode.Rtl: - case RegexCode.Notoneloopatomic | RegexCode.Rtl: - case RegexCode.Setloopatomic | RegexCode.Rtl: - case RegexCode.Oneloopatomic | RegexCode.Ci: - case RegexCode.Notoneloopatomic | RegexCode.Ci: - case RegexCode.Setloopatomic | RegexCode.Ci: - case RegexCode.Oneloopatomic | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notoneloopatomic | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Setloopatomic | RegexCode.Ci | RegexCode.Rtl: - { - int c = Operand(1); - if (c != 0) - { - const string Len = "tmp1"; - const string I = "tmp2"; - - if (c == int.MaxValue) - { - writer.WriteLine(!IsRightToLeft() ? - $"{Len} = runtextend - runtextpos; // length" : - $"{Len} = runtextpos - runtextbeg; // length"); - } - else - { - writer.WriteLine(!IsRightToLeft() ? - $"{Len} = global::System.Math.Min(runtextend - runtextpos, {c}); // length" : - $"{Len} = global::System.Math.Min(runtextpos - runtextbeg, {c}); // length"); - } - - string? set = Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic ? 
rm.Code.Strings[Operand(0)] : null; - Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today - int numSetChars; - - // If this is a notoneloop{atomic} and we're left-to-right and case-sensitive, - // we can use the vectorized IndexOf to search for the target character. - if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) && - !IsRightToLeft() && - !IsCaseInsensitive()) - { - writer.WriteLine($"{I} = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal((char)Operand(0))}); // i"); - using (EmitBlock(writer, $"if ({I} == -1)")) - { - writer.WriteLine($"runtextpos += {Len};"); - writer.WriteLine($"{I} = 0;"); - } - using (EmitBlock(writer, "else")) - { - writer.WriteLine($"runtextpos += {I};"); - writer.WriteLine($"{I} = {Len} - {I};"); - } - } - else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) && - !IsRightToLeft() && - !IsCaseInsensitive() && - (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) != 0 && - RegexCharClass.IsNegated(set!)) - { - // Similarly, if this is a setloop{atomic} and we're left-to-right and case-sensitive, - // and if the set contains only a few negated chars, we can use the vectorized IndexOfAny - // to search for those chars. 
- Debug.Assert(numSetChars > 1); - writer.WriteLine(numSetChars switch - { - 2 => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}); // i", - 3 => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])}); // i", - _ => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars.Slice(0, numSetChars).ToString())}); // i", - }); - using (EmitBlock(writer, $"if ({I} == -1)")) - { - writer.WriteLine($"runtextpos += {Len};"); - writer.WriteLine($"{I} = 0;"); - } - using (EmitBlock(writer, "else")) - { - writer.WriteLine($"runtextpos += {I};"); - writer.WriteLine($"{I} = {Len} - {I};"); - } - } - else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) && - !IsRightToLeft() && - set == RegexCharClass.AnyClass) - { - // If someone uses .* along with RegexOptions.Singleline, that becomes [anycharacter]*, which means it'll - // consume everything. As such, we can simply update our position to be the last allowed, without - // actually checking anything. - writer.WriteLine($"runtextpos += {Len};"); - writer.WriteLine($"{I} = 0;"); - } - else - { - // Otherwise, we emit the open-coded loop. - writer.WriteLine($"{I} = {Len} + 1;"); - using (EmitBlock(writer, $"while (--{I} > {0})")) - { - string expr = !IsRightToLeft() ? - Rightcharnext() : - Leftcharnext(); - - if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) - { - EmitTimeoutCheck(writer, hasTimeout); - expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, rm.Code.Strings[Operand(0)], IsCaseInsensitive(), null)}"; - } - else - { - string op = Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic ? 
"!=" : "=="; - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); - expr = $"{expr} {op} {Literal((char)Operand(0))}"; - } - - using (EmitBlock(writer, $"if ({expr})")) - { - writer.WriteLine(!IsRightToLeft() ? - "runtextpos--;" : - "runtextpos++;"); - writer.WriteLine("break;"); - } - } - } - - if (Code() != RegexCode.Oneloopatomic && Code() != RegexCode.Notoneloopatomic && Code() != RegexCode.Setloopatomic) - { - using (EmitBlock(writer, $"if ({I} >= {Len})")) - { - writer.WriteLine($"goto {AdvanceLabel()};"); - } - PushTrack($"{Len} - {I} - 1"); - PushTrack(!IsRightToLeft() ? - "runtextpos - 1" : - "runtextpos + 1"); - Track(); - } - } - break; - } - - case RegexCode.Oneloop | RegexCode.Back: - case RegexCode.Notoneloop | RegexCode.Back: - case RegexCode.Setloop | RegexCode.Back: - case RegexCode.Oneloop | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Notoneloop | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Setloop | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Oneloop | RegexCode.Ci | RegexCode.Back: - case RegexCode.Notoneloop | RegexCode.Ci | RegexCode.Back: - case RegexCode.Setloop | RegexCode.Ci | RegexCode.Back: - case RegexCode.Oneloop | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Notoneloop | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Setloop | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - { - const string Position = "tmp1"; - writer.WriteLine($"runtextpos = {PopTrack()};"); - writer.WriteLine($"{Position} = {PopTrack()}; // position"); - using (EmitBlock(writer, $"if ({Position} > 0)")) - { - PushTrack($"{Position} - 1"); - PushTrack(!IsRightToLeft() ? 
- "runtextpos - 1" : - "runtextpos + 1"); - Trackagain(); - } - Advance(); - } - break; - - case RegexCode.Onelazy: - case RegexCode.Notonelazy: - case RegexCode.Setlazy: - case RegexCode.Onelazy | RegexCode.Rtl: - case RegexCode.Notonelazy | RegexCode.Rtl: - case RegexCode.Setlazy | RegexCode.Rtl: - case RegexCode.Onelazy | RegexCode.Ci: - case RegexCode.Notonelazy | RegexCode.Ci: - case RegexCode.Setlazy | RegexCode.Ci: - case RegexCode.Onelazy | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notonelazy | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Setlazy | RegexCode.Ci | RegexCode.Rtl: - { - int count = Operand(1); - if (count != 0) - { - const string C = "tmp1"; - if (count == int.MaxValue) - { - writer.WriteLine(!IsRightToLeft() ? - $"{C} = runtextend - runtextpos; // count" : - $"{C} = runtextpos - runtextbeg; // count"); - } - else - { - writer.WriteLine(!IsRightToLeft() ? - $"{C} = global::System.Math.Min(runtextend - runtextpos, {count}); // count" : - $"{C} = global::System.Math.Min(runtextpos - runtextbeg, {count}); // count"); - } - - using (EmitBlock(writer, $"if ({C} <= 0)")) - { - writer.WriteLine($"goto {AdvanceLabel()};"); - } - - PushTrack($"{C} - 1"); - PushTrack("runtextpos"); - Track(); - } - break; - } + writer.WriteLine($"goto {endLoop};"); + writer.WriteLine(); - case RegexCode.Onelazy | RegexCode.Back: - case RegexCode.Notonelazy | RegexCode.Back: - case RegexCode.Setlazy | RegexCode.Back: - case RegexCode.Onelazy | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Notonelazy | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Setlazy | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Onelazy | RegexCode.Ci | RegexCode.Back: - case RegexCode.Notonelazy | RegexCode.Ci | RegexCode.Back: - case RegexCode.Setlazy | RegexCode.Ci | RegexCode.Back: - case RegexCode.Onelazy | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Notonelazy | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Setlazy | RegexCode.Ci | 
RegexCode.Rtl | RegexCode.Back: + string backtrack = ReserveName("LoopBacktrack"); + MarkLabel(backtrack); + using (EmitBlock(writer, $"if ({iterationCount} == 0)")) { - const string I = "tmp1"; - - writer.WriteLine($"runtextpos = {PopTrack()};"); - writer.WriteLine($"{I} = {PopTrack()}; // i"); - - string expr = !IsRightToLeft() ? - Rightcharnext() : - Leftcharnext(); - - if (Code() == RegexCode.Setlazy) - { - EmitTimeoutCheck(writer, hasTimeout); - expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, rm.Code.Strings[Operand(0)], IsCaseInsensitive(), null)}"; - } - else - { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); - expr = $"{expr} {(Code() == RegexCode.Onelazy ? "!=" : "==")} {Literal((char)Operand(0))}"; - } - - using (EmitBlock(writer, $"if ({expr})")) - { - writer.WriteLine($"goto {Backtrack};"); - } - - using (EmitBlock(writer, $"if ({I} > 0)")) - { - PushTrack($"{I} - 1"); - PushTrack("runtextpos"); - Trackagain(); - } - - Advance(); + writer.WriteLine($"goto {originalDoneLabel};"); } - break; - - default: - Debug.Fail($"Unimplemented state: {currentOpcode:X8}"); - break; - } - } - - - - /// - /// Branch to the label corresponding to the regex code at i - /// - /// - /// A trick: since track and stack space is gobbled up unboundedly - /// only as a result of branching backwards, this is where we check - /// for sufficient space and trigger reallocations. - /// - /// If the "goto" is backwards, we generate code that checks - /// available space against the amount of space that would be needed - /// in the worst case by code that will only go forward; if there's - /// not enough, we push the destination on the tracking stack, then - /// we jump to the place where we invoke the allocator. - /// - /// Since forward gotos pose no threat, they just turn into a Br. - /// - void Goto(int i) - { - // When going backwards, ensure enough space. 
- if (i < currentCodePos) - { - using (EmitBlock(writer, $"if (runtrackpos <= {rm.Code.TrackCount * 4} || runstackpos <= {rm.Code.TrackCount * 3})")) - { - writer.WriteLine($"{ReadyPushTrack()} = {AddGoto(i)};"); - writer.WriteLine($"goto {Backtrack};"); + writer.WriteLine($"goto {doneLabel};"); + doneLabel = backtrack; } - } - - writer.WriteLine($"goto {labels[i]};"); - } - - string ReadyPushTrack() => "runtrack[--runtrackpos]"; - - void Track() => PushTrack(AddTrack()); - - /// - /// Pushes the current switch index on the tracking stack so the backtracking - /// logic will be repeated again next time we backtrack here. - /// - void Trackagain() => PushTrack(currentBacktrackNote); - - void PushTrack(T expr) where T : notnull => writer.WriteLine($"{ReadyPushTrack()} = {(expr is IFormattable ? ((IFormattable)expr).ToString(null, CultureInfo.InvariantCulture) : expr.ToString())};"); - - /// Retrieves the top entry on the tracking stack without popping. - string TopTrack() => "runtrack[runtrackpos]"; - - int Operand(int i) => codes[currentCodePos + i + 1]; - - /// True if the current operation is marked for the leftward direction. - bool IsRightToLeft() => (currentOpcode & RegexCode.Rtl) != 0; - - /// True if the current operation is marked for case insensitive operation. - bool IsCaseInsensitive() => (currentOpcode & RegexCode.Ci) != 0; - - /// Returns the raw regex opcode (masking out Back and Rtl). - int Code() => currentOpcode & RegexCode.Mask; - /// Saves the value of a local variable on the grouping stack. - void PushStack(T expr) where T : notnull => writer.WriteLine($"{ReadyPushStack()} = {(expr is IFormattable ? ((IFormattable)expr).ToString(null, CultureInfo.InvariantCulture) : expr.ToString())};"); + MarkLabel(endLoop); - string ReadyPushStack() => "runstack[--runstackpos]"; - - /// Retrieves the top entry on the stack without popping. 
- string TopStack() => "runstack[runstackpos]"; - - void TrackUnique(int i) => PushTrack(AddUniqueTrack(i)); - - void TrackUnique2(int i) => PushTrack(AddUniqueTrack(i, RegexCode.Back2)); - - int AddUniqueTrack(int i, int flags = RegexCode.Back) - { - if (uniqueNote[i] == -1) - { - uniqueNote[i] = AddTrack(flags); - } - - return uniqueNote[i]; - } - - /// - /// Returns the position of the next operation in the regex code, taking - /// into account the different numbers of arguments taken by operations - /// - int NextCodepos() => currentCodePos + RegexCode.OpcodeSize(codes[currentCodePos]); - - /// The label for the next (forward) operation. - string AdvanceLabel() => labels[NextCodepos()]!; - - /// Goto the next (forward) operation. - void Advance() => writer.WriteLine($"goto {AdvanceLabel()};"); + if (node.IsInLoop()) + { + writer.WriteLine(); - /// Loads the char to the left of the current position. - string Leftchar() => "runtext[runtextpos - 1]"; + // Store the capture's state + EmitRunstackResizeIfNeeded(3); + writer.WriteLine($"{RunstackPush()} = {startingRunTextPos};"); + writer.WriteLine($"{RunstackPush()} = {iterationCount};"); - /// Loads the char to the left of the current position and advances (leftward). - string Leftcharnext() => "runtext[--runtextpos]"; + // Skip past the backtracking section + string end = ReserveName("SkipBacktrack"); + writer.WriteLine($"goto {end};"); + writer.WriteLine(); - /// Loads the char to the right of the current position. - string Rightchar() => "runtext[runtextpos]"; + // Emit a backtracking section that restores the capture's state and then jumps to the previous done label + string backtrack = ReserveName("LoopBacktrack"); + MarkLabel(backtrack); + writer.WriteLine($"{iterationCount} = {RunstackPop()};"); + writer.WriteLine($"{startingRunTextPos} = {RunstackPop()};"); - /// Loads the char to the right of the current position and advances the current position. 
- string Rightcharnext() => "runtext[runtextpos++]"; + writer.WriteLine($"goto {doneLabel};"); + writer.WriteLine(); - /// - /// Adds a backtrack note to the list of them, and returns the index of the new - /// note (which is also the index for the jump used by the switch table) - /// - int AddBacktrackNote(int flags, string l, int codepos) - { - if (notes == null || noteCount >= notes.Length) - { - var newnotes = new BacktrackNote[notes == null ? 16 : notes.Length * 2]; - if (notes != null) - { - Array.Copy(notes, newnotes, noteCount); + doneLabel = backtrack; + MarkLabel(end); } - notes = newnotes; } - - notes[noteCount] = new BacktrackNote(flags, l, codepos); - return noteCount++; } - /// - /// Adds a backtrack note for the current operation; creates a new label for - /// where the code will be, and returns the switch index. - /// - int AddTrack(int flags = RegexCode.Back) => AddBacktrackNote(flags, DefineLabel(), currentCodePos); - - int AddGoto(int destpos) + void EmitRunstackResizeIfNeeded(int count) { - if (forwardJumpsThroughSwitch[destpos] == -1) + string subCount = count > 1 ? $" - {count - 1}" : ""; + using (EmitBlock(writer, $"if (runstackpos >= base.runstack!.Length{subCount})")) { - forwardJumpsThroughSwitch[destpos] = AddBacktrackNote(0, labels[destpos]!, destpos); + writer.WriteLine("global::System.Array.Resize(ref base.runstack, base.runstack.Length * 2);"); } - - return forwardJumpsThroughSwitch[destpos]; } - /// Pops an element off the tracking stack. - string PopTrack() => "runtrack[runtrackpos++]"; - - /// Pops an element off the grouping stack (leave it on the operand stack). - string PopStack() => "runstack[runstackpos++]"; - - /// Pops i elements off the grouping stack and discards them. - void PopDiscardStack(int i = 1) => EmitAdd(writer, "runstackpos", i); - - /// Prologue to code that will replace the ith element on the grouping stack. - string ReadyReplaceStack(int i) => i == 0 ? 
"runstack[runstackpos]" : $"runstack[runstackpos + {i}]"; + string RunstackPush() => "base.runstack[runstackpos++]"; + string RunstackPop() => "base.runstack![--runstackpos]"; } - /// - /// Keeps track of an operation that needs to be referenced in the backtrack-jump - /// switch table, and that needs backtracking code to be emitted (if flags != 0) - /// - private record BacktrackNote(int flags, string label, int codepos); - private static bool EmitLoopTimeoutCounterIfNeeded(IndentedTextWriter writer, RegexMethod rm) { if (rm.MatchTimeout != Timeout.Infinite) @@ -4293,22 +3072,31 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options /// Replaces in with /// all of the variable declarations in . /// - private static void ReplaceAdditionalDeclarations(HashSet declarations, IndentedTextWriter writer) + /// The writer around a StringWriter to have additional declarations inserted into. + /// The additional declarations to insert. + /// The position into the writer at which to insert the additional declarations. + /// The indentation to use for the additional declarations. 
+ private static void ReplaceAdditionalDeclarations(IndentedTextWriter writer, HashSet declarations, int position, int indent) { - StringBuilder sb = ((StringWriter)writer.InnerWriter).GetStringBuilder(); - string replacement = ""; - if (declarations.Count != 0) { - var tmp = new StringBuilder().AppendLine(); - foreach (string decl in declarations) + var arr = new string[declarations.Count]; + declarations.CopyTo(arr); + Array.Sort(arr); + + StringBuilder tmp = new StringBuilder().AppendLine(); + foreach (string decl in arr) { - tmp.Append(' ', writer.Indent * 4).AppendLine(decl); + for (int i = 0; i < indent; i++) + { + tmp.Append(IndentedTextWriter.DefaultTabString); + } + + tmp.AppendLine(decl); } - replacement = tmp.ToString(); - } - sb.Replace(AdditionalDeclarationsPlaceholder, replacement); + ((StringWriter)writer.InnerWriter).GetStringBuilder().Insert(position, tmp.ToString()); + } } private static string Literal(char c) => SymbolDisplay.FormatLiteral(c, quote: true); diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs index cb3bed4d27fa29..c8e88bd2c2b38c 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs @@ -190,6 +190,7 @@ private static bool IsSyntaxTargetForGeneration(SyntaxNode node) => SymbolDisplayFormat.FullyQualifiedFormat.WithGlobalNamespaceStyle(SymbolDisplayGlobalNamespaceStyle.Omitted)); var regexMethod = new RegexMethod( + methodSyntax, regexMethodSymbol.Name, methodSyntax.Modifiers.ToString(), pattern, @@ -231,7 +232,7 @@ static bool IsAllowedKind(SyntaxKind kind) => } /// A regex method. 
- internal sealed record RegexMethod(string MethodName, string Modifiers, string Pattern, RegexOptions Options, int MatchTimeout, RegexCode Code); + internal sealed record RegexMethod(MethodDeclarationSyntax MethodSyntax, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int MatchTimeout, RegexCode Code); /// A type holding a regex method. internal sealed record RegexType(RegexMethod? Method, string Keyword, string Namespace, string Name, string Constraints) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs index 56bcb17935b6f9..a459c8312c2639 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs @@ -70,8 +70,12 @@ public void Initialize(IncrementalGeneratorInitializationContext context) context.ReportDiagnostic(d); break; - case string s: - code.Add(s); + case ValueTuple> t: + code.Add(t.Item1); + foreach (Diagnostic d in t.Item2) + { + context.ReportDiagnostic(d); + } break; } } diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/Strings.resx b/src/libraries/System.Text.RegularExpressions/gen/Resources/Strings.resx index 4f6ea8594572b2..2ce09c60fb6e59 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/Strings.resx +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/Strings.resx @@ -137,6 +137,12 @@ C# LangVersion of 10 or greater is required + + RegexGenerator limitation reached. + + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + Regular expression parser error '{0}' at offset {1}. 
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.cs.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.cs.xlf index 311813a6cf146f..1b24236f5aedd2 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.cs.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.cs.xlf @@ -152,6 +152,16 @@ Délka nemůže být menší než 0 nebo přesáhnout délku vstupu. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} Neplatný vzor {0} u posunu {1}. {2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.de.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.de.xlf index 532c4b4bee22f4..af011807dac541 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.de.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.de.xlf @@ -152,6 +152,16 @@ Die Länge darf nicht kleiner als 0 sein oder die Eingabelänge überschreiten. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. 
+ The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} Ungültiges Muster "{0}" bei Offset {1}. {2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.es.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.es.xlf index 14bedaae5801c3..03cd5902030cc3 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.es.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.es.xlf @@ -152,6 +152,16 @@ La longitud no puede ser inferior a 0 ni superar la longitud de entrada. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} Patrón '{0}' no válido en el desplazamiento {1}. 
{2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.fr.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.fr.xlf index e1c7019d5872b4..2059255bf3bfb9 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.fr.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.fr.xlf @@ -152,6 +152,16 @@ La longueur ne peut pas être inférieure à 0 ou supérieure à la longueur d'entrée. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} Modèle « {0} » non valide au niveau du décalage {1}. {2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.it.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.it.xlf index 52becd96f3fa7a..7e5f8ef424400a 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.it.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.it.xlf @@ -152,6 +152,16 @@ Lenght non può essere minore di zero o superare la lunghezza di input. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. 
+ The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} Criterio '{0}' non valido alla posizione di offset {1}. {2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ja.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ja.xlf index b27fec1ee2afdd..7070423342c461 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ja.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ja.xlf @@ -152,6 +152,16 @@ 長さを 0 未満に設定したり、入力の長さを超えることはできません。 + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} オフセット {1} に無効なパターン '{0}' があります。{2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ko.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ko.xlf index 8ec1a365f3f9f0..ebd89b0c6b28bb 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ko.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ko.xlf @@ -152,6 +152,16 @@ 길이는 0보다 작거나 입력 길이를 초과할 수 없습니다. 
+ + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} 오프셋 {1}에서 잘못된 패턴 '{0}'. {2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pl.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pl.xlf index 3879856ae00738..d014e1ebb9e907 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pl.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pl.xlf @@ -152,6 +152,16 @@ Długość nie może być mniejsza od 0 ani przekraczać długości danych wejściowych. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} Nieprawidłowy wzorzec „{0}” przy przesunięciu {1}. 
{2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pt-BR.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pt-BR.xlf index be090a6d3611bd..97a7fd1efa5c62 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pt-BR.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pt-BR.xlf @@ -152,6 +152,16 @@ Comprimento não pode ser menor que 0 ou exceder o comprimento de entrada. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} Padrão inválido '{0}' no deslocamento {1}. {2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ru.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ru.xlf index 08b344ec0d923b..9af130a2871941 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ru.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ru.xlf @@ -152,6 +152,16 @@ Длина не может быть меньше 0 или превышать длину ввода. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. 
+ The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} Недопустимый шаблон "{0}" со смещением {1}. {2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.tr.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.tr.xlf index 70d5c0b730b72e..f5e6edd4646057 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.tr.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.tr.xlf @@ -152,6 +152,16 @@ Uzunluk sıfırdan küçük olamaz ve giriş uzunluğunu aşamaz. + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} {1} ofsetinde geçersiz “{0}” deseni. 
{2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hans.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hans.xlf index 047c5b18c02937..fe30a513654142 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hans.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hans.xlf @@ -152,6 +152,16 @@ 长度不能小于 0 或超过输入长度。 + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} 偏移 {0} 处的模式“{1}”无效。{2} diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hant.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hant.xlf index 0d2b1fff76ba1c..5bbb62ca0363f6 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hant.xlf +++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hant.xlf @@ -152,6 +152,16 @@ 長度不能小於零或超過輸入長度。 + + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. The implementation will interpret the regular expression at run-time. + The RegexGenerator couldn't generate a complete source implementation for the specified regular expression, due to an unsupported option or too complex a regular expression. 
The implementation will interpret the regular expression at run-time. + + + + RegexGenerator limitation reached. + RegexGenerator limitation reached. + + Invalid pattern '{0}' at offset {1}. {2} 位移 {1} 的模式 '{0}' 無效。{2} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index ee276b33deb756..284c8647f31f43 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -82,8 +82,13 @@ internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, Cult else if (RuntimeFeature.IsDynamicCodeCompiled && UseOptionC()) { // If the compile option is set and compilation is supported, then compile the code. + // If the compiler can't compile this regex, it'll return null, and we'll fall back + // to the interpreter. factory = Compile(pattern, _code, options, matchTimeout != InfiniteMatchTimeout); - _code = null; + if (factory is not null) + { + _code = null; + } } } @@ -215,7 +220,7 @@ protected IDictionary? CapNames /// instantiating a non-compiled regex. /// [MethodImpl(MethodImplOptions.NoInlining)] - private static RegexRunnerFactory Compile(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) => + private static RegexRunnerFactory? 
Compile(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) => RegexCompiler.Compile(pattern, code, options, hasTimeout); [Obsolete(Obsoletions.RegexCompileToAssemblyMessage, DiagnosticId = Obsoletions.RegexCompileToAssemblyDiagId, UrlFormat = Obsoletions.SharedUrlFormat)] diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index a971e3f66a6345..a3e38808752109 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3,7 +3,6 @@ using System.Collections.Generic; using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Reflection; using System.Reflection.Emit; @@ -13,8 +12,7 @@ namespace System.Text.RegularExpressions { /// - /// RegexCompiler translates a block of RegexCode to MSIL, and creates a - /// subclass of the RegexRunner type. + /// RegexCompiler translates a block of RegexCode to MSIL, and creates a subclass of the RegexRunner type. 
/// internal abstract class RegexCompiler { @@ -23,14 +21,8 @@ internal abstract class RegexCompiler private static readonly FieldInfo s_runtextstartField = RegexRunnerField("runtextstart"); private static readonly FieldInfo s_runtextposField = RegexRunnerField("runtextpos"); private static readonly FieldInfo s_runtextField = RegexRunnerField("runtext"); - private static readonly FieldInfo s_runtrackposField = RegexRunnerField("runtrackpos"); - private static readonly FieldInfo s_runtrackField = RegexRunnerField("runtrack"); - private static readonly FieldInfo s_runstackposField = RegexRunnerField("runstackpos"); private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack"); - protected static readonly FieldInfo s_runtrackcountField = RegexRunnerField("runtrackcount"); - private static readonly MethodInfo s_doubleStackMethod = RegexRunnerMethod("DoubleStack"); - private static readonly MethodInfo s_doubleTrackMethod = RegexRunnerMethod("DoubleTrack"); private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture"); private static readonly MethodInfo s_transferCaptureMethod = RegexRunnerMethod("TransferCapture"); private static readonly MethodInfo s_uncaptureMethod = RegexRunnerMethod("Uncapture"); @@ -42,9 +34,6 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_crawlposMethod = RegexRunnerMethod("Crawlpos"); private static readonly MethodInfo s_charInClassMethod = RegexRunnerMethod("CharInClass"); private static readonly MethodInfo s_checkTimeoutMethod = RegexRunnerMethod("CheckTimeout"); -#if DEBUG - private static readonly MethodInfo s_dumpStateM = RegexRunnerMethod("DumpState"); -#endif private static readonly MethodInfo s_charIsDigitMethod = typeof(char).GetMethod("IsDigit", new Type[] { typeof(char) })!; private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!; @@ -52,9 +41,6 @@ internal abstract class RegexCompiler private 
static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!; private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; private static readonly MethodInfo s_cultureInfoGetTextInfoMethod = typeof(CultureInfo).GetMethod("get_TextInfo")!; -#if DEBUG - private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!; -#endif private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan).GetMethod("get_Item", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan).GetMethod("get_Length")!; private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -63,8 +49,6 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) 
})!.MakeGenericMethod(typeof(char)); - private static readonly MethodInfo s_spanLastIndexOfChar = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); - private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -75,55 +59,26 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_textInfoToLowerMethod = typeof(TextInfo).GetMethod("ToLower", new Type[] { typeof(char) })!; private static readonly MethodInfo s_arrayResize = typeof(Array).GetMethod("Resize")!.MakeGenericMethod(typeof(int)); + /// The ILGenerator currently in use. protected ILGenerator? _ilg; - - // tokens representing local variables - private LocalBuilder? _runtextbegLocal; - private LocalBuilder? _runtextendLocal; - private LocalBuilder? _runtextposLocal; - private LocalBuilder? _runtextLocal; - private LocalBuilder? _runtextSpanLocal; - private LocalBuilder? _runtrackposLocal; - private LocalBuilder? _runtrackLocal; - private LocalBuilder? _runstackposLocal; - private LocalBuilder? 
_runstackLocal; - private LocalBuilder? _textInfoLocal; // cached to avoid extraneous TLS hits from CurrentCulture and virtual calls to TextInfo - private LocalBuilder? _loopTimeoutCounterLocal; // timeout counter for setrep and setloop - - protected RegexOptions _options; // options - protected RegexCode? _code; // the RegexCode object - protected int[]? _codes; // the RegexCodes being translated - protected string[]? _strings; // the stringtable associated with the RegexCodes - protected bool _hasTimeout; // whether the regex has a non-infinite timeout - - private Label[]? _labels; // a label for every operation in _codes - private BacktrackNote[]? _notes; // a list of the backtracking states to be generated - private int _notecount; // true count of _notes (allocation grows exponentially) - protected int _trackcount; // count of backtracking states (used to reduce allocations) - private Label _backtrack; // label for backtracking - private Stack? _int32LocalsPool; // pool of Int32 local variables - private Stack? _readOnlySpanCharLocalsPool; // pool of ReadOnlySpan local variables - - private int _regexopcode; // the current opcode being processed - private int _codepos; // the current code being translated - private int _backpos; // the current backtrack-note being translated - - // special code fragments - private int[]? _uniquenote; // _notes indices for code that should be emitted <= once - private int[]? 
_goto; // indices for forward-jumps-through-switch (for allocations) - - // indices for unique code fragments - private const int Stackpop = 0; // pop one - private const int Stackpop2 = 1; // pop two - private const int Capback = 3; // uncapture - private const int Capback2 = 4; // uncapture 2 - private const int Branchmarkback2 = 5; // back2 part of branchmark - private const int Lazybranchmarkback2 = 6; // back2 part of lazybranchmark - private const int Branchcountback2 = 7; // back2 part of branchcount - private const int Lazybranchcountback2 = 8; // back2 part of lazybranchcount - private const int Forejumpback = 9; // back part of forejump - private const int Uniquecount = 10; - private const int LoopTimeoutCheckCount = 2048; // A conservative value to guarantee the correct timeout handling. + /// The options for the expression. + protected RegexOptions _options; + /// The code written for the expression. + protected RegexCode? _code; + /// Whether this expression has a non-infinite timeout. + protected bool _hasTimeout; + + /// Pool of Int32 LocalBuilders. + private Stack? _int32LocalsPool; + /// Pool of ReadOnlySpan of char locals. + private Stack? _readOnlySpanCharLocalsPool; + + /// Local representing a cached TextInfo for the culture to use for all case-insensitive operations. + private LocalBuilder? _textInfo; + /// Local representing a timeout counter for loops (set loops and node loops). + private LocalBuilder? _loopTimeoutCounter; + /// A frequency with which the timeout should be validated. + private const int LoopTimeoutCheckCount = 2048; private static FieldInfo RegexRunnerField(string fieldname) => typeof(RegexRunner).GetField(fieldname, BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance | BindingFlags.Static)!; @@ -133,114 +88,15 @@ internal abstract class RegexCompiler /// Entry point to dynamically compile a regular expression. The expression is compiled to /// an in-memory assembly. 
/// - internal static RegexRunnerFactory Compile(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) => + internal static RegexRunnerFactory? Compile(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) => new RegexLWCGCompiler().FactoryInstanceFromCode(pattern, code, options, hasTimeout); - /// - /// Keeps track of an operation that needs to be referenced in the backtrack-jump - /// switch table, and that needs backtracking code to be emitted (if flags != 0) - /// - private sealed class BacktrackNote - { - internal int _codepos; - internal int _flags; - internal Label _label; - - public BacktrackNote(int flags, Label label, int codepos) - { - _codepos = codepos; - _flags = flags; - _label = label; - } - } - - /// - /// Adds a backtrack note to the list of them, and returns the index of the new - /// note (which is also the index for the jump used by the switch table) - /// - private int AddBacktrackNote(int flags, Label l, int codepos) - { - if (_notes == null || _notecount >= _notes.Length) - { - var newnotes = new BacktrackNote[_notes == null ? 16 : _notes.Length * 2]; - if (_notes != null) - { - Array.Copy(_notes, newnotes, _notecount); - } - _notes = newnotes; - } - - _notes[_notecount] = new BacktrackNote(flags, l, codepos); - - return _notecount++; - } - - /// - /// Adds a backtrack note for the current operation; creates a new label for - /// where the code will be, and returns the switch index. - /// - private int AddTrack() => AddTrack(RegexCode.Back); - - /// - /// Adds a backtrack note for the current operation; creates a new label for - /// where the code will be, and returns the switch index. 
- /// - private int AddTrack(int flags) => AddBacktrackNote(flags, DefineLabel(), _codepos); - - /// - /// Adds a switchtable entry for the specified position (for the forward - /// logic; does not cause backtracking logic to be generated) - /// - private int AddGoto(int destpos) - { - if (_goto![destpos] == -1) - { - _goto[destpos] = AddBacktrackNote(0, _labels![destpos], destpos); - } - - return _goto[destpos]; - } - - /// - /// Adds a note for backtracking code that only needs to be generated once; - /// if it's already marked to be generated, returns the switch index - /// for the unique piece of code. - /// - private int AddUniqueTrack(int i) => AddUniqueTrack(i, RegexCode.Back); - - /// - /// Adds a note for backtracking code that only needs to be generated once; - /// if it's already marked to be generated, returns the switch index - /// for the unique piece of code. - /// - private int AddUniqueTrack(int i, int flags) - { - if (_uniquenote![i] == -1) - { - _uniquenote[i] = AddTrack(flags); - } - - return _uniquenote[i]; - } - /// A macro for _ilg.DefineLabel private Label DefineLabel() => _ilg!.DefineLabel(); /// A macro for _ilg.MarkLabel private void MarkLabel(Label l) => _ilg!.MarkLabel(l); - /// Returns the ith operand of the current operation. - private int Operand(int i) => _codes![_codepos + i + 1]; - - /// True if the current operation is marked for the leftward direction. - private bool IsRightToLeft() => (_regexopcode & RegexCode.Rtl) != 0; - - /// True if the current operation is marked for case insensitive operation. - private bool IsCaseInsensitive() => (_regexopcode & RegexCode.Ci) != 0; - - /// Returns the raw regex opcode (masking out Back and Rtl). - private int Code() => _regexopcode & RegexCode.Mask; - /// A macro for _ilg.Emit(Opcodes.Ldstr, str) protected void Ldstr(string str) => _ilg!.Emit(OpCodes.Ldstr, str); @@ -253,9 +109,6 @@ private int AddUniqueTrack(int i, int flags) /// A macro for _ilg.Emit(OpCodes.Ret). 
protected void Ret() => _ilg!.Emit(OpCodes.Ret); - /// A macro for _ilg.Emit(OpCodes.Newobj, constructor). - protected void Newobj(ConstructorInfo constructor) => _ilg!.Emit(OpCodes.Newobj, constructor); - /// A macro for _ilg.Emit(OpCodes.Dup). protected void Dup() => _ilg!.Emit(OpCodes.Dup); @@ -277,18 +130,9 @@ private int AddUniqueTrack(int i, int flags) /// A macro for _ilg.Emit(OpCodes.Add). private void Add() => _ilg!.Emit(OpCodes.Add); - /// A macro for _ilg.Emit(OpCodes.Add); a true flag can turn it into a Sub. - private void Add(bool negate) => _ilg!.Emit(negate ? OpCodes.Sub : OpCodes.Add); - /// A macro for _ilg.Emit(OpCodes.Sub). private void Sub() => _ilg!.Emit(OpCodes.Sub); - /// A macro for _ilg.Emit(OpCodes.Sub) or _ilg.Emit(OpCodes.Add). - private void Sub(bool negate) => _ilg!.Emit(negate ? OpCodes.Add : OpCodes.Sub); - - /// A macro for _ilg.Emit(OpCodes.Neg). - private void Neg() => _ilg!.Emit(OpCodes.Neg); - /// A macro for _ilg.Emit(OpCodes.Mul). private void Mul() => _ilg!.Emit(OpCodes.Mul); @@ -335,7 +179,7 @@ private int AddUniqueTrack(int i, int flags) protected void Ldthisfld(FieldInfo ft) { Ldthis(); - Ldfld(ft); + _ilg!.Emit(OpCodes.Ldfld, ft); } /// A macro for Ldthis(); Ldfld(); Stloc(); @@ -345,17 +189,6 @@ private void Mvfldloc(FieldInfo ft, LocalBuilder lt) Stloc(lt); } - /// A macro for Ldthis(); Ldloc(); Stfld(); - private void Mvlocfld(LocalBuilder lt, FieldInfo ft) - { - Ldthis(); - Ldloc(lt); - Stfld(ft); - } - - /// A macro for _ilg.Emit(OpCodes.Ldfld). - private void Ldfld(FieldInfo ft) => _ilg!.Emit(OpCodes.Ldfld, ft); - /// A macro for _ilg.Emit(OpCodes.Stfld). protected void Stfld(FieldInfo ft) => _ilg!.Emit(OpCodes.Stfld, ft); @@ -389,18 +222,12 @@ private void Mvlocfld(LocalBuilder lt, FieldInfo ft) /// A macro for _ilg.Emit(OpCodes.Bge_Un) (long form). private void BgeUnFar(Label l) => _ilg!.Emit(OpCodes.Bge_Un, l); - /// A macro for _ilg.Emit(OpCodes.Bgt) (long form). 
- private void BgtFar(Label l) => _ilg!.Emit(OpCodes.Bgt, l); - /// A macro for _ilg.Emit(OpCodes.Bne) (long form). private void BneFar(Label l) => _ilg!.Emit(OpCodes.Bne_Un, l); /// A macro for _ilg.Emit(OpCodes.Beq) (long form). private void BeqFar(Label l) => _ilg!.Emit(OpCodes.Beq, l); - /// A macro for _ilg.Emit(OpCodes.Brfalse_S) (short jump). - private void Brfalse(Label l) => _ilg!.Emit(OpCodes.Brfalse_S, l); - /// A macro for _ilg.Emit(OpCodes.Brtrue_S) (short jump). private void Brtrue(Label l) => _ilg!.Emit(OpCodes.Brtrue_S, l); @@ -422,9 +249,6 @@ private void Mvlocfld(LocalBuilder lt, FieldInfo ft) /// A macro for _ilg.Emit(OpCodes.Bgt_S) (short jump). private void Bgt(Label l) => _ilg!.Emit(OpCodes.Bgt_S, l); - /// A macro for _ilg.Emit(OpCodes.Bgt_Un_S) (short jump). - private void BgtUn(Label l) => _ilg!.Emit(OpCodes.Bgt_Un_S, l); - /// A macro for _ilg.Emit(OpCodes.Bne_S) (short jump). private void Bne(Label l) => _ilg!.Emit(OpCodes.Bne_Un_S, l); @@ -448,9 +272,6 @@ private void Mvlocfld(LocalBuilder lt, FieldInfo ft) /// Declares a local CultureInfo. private LocalBuilder? DeclareTextInfo() => _ilg!.DeclareLocal(typeof(TextInfo)); - /// Declares a local int[]. - private LocalBuilder DeclareInt32Array() => _ilg!.DeclareLocal(typeof(int[])); - /// Declares a local string. private LocalBuilder DeclareString() => _ilg!.DeclareLocal(typeof(string)); @@ -498,271 +319,17 @@ public void Dispose() } } - /// Loads the char to the right of the current position. - private void Rightchar() - { - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Call(s_stringGetCharsMethod); - } - - /// Loads the char to the right of the current position and advances the current position. - private void Rightcharnext() - { - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Call(s_stringGetCharsMethod); - Ldloc(_runtextposLocal!); - Ldc(1); - Add(); - Stloc(_runtextposLocal!); - } - - /// Loads the char to the left of the current position. 
- private void Leftchar() - { - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Ldc(1); - Sub(); - Call(s_stringGetCharsMethod); - } - - /// Loads the char to the left of the current position and advances (leftward). - private void Leftcharnext() - { - Ldloc(_runtextposLocal!); - Ldc(1); - Sub(); - Stloc(_runtextposLocal!); - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Call(s_stringGetCharsMethod); - } - - /// Creates a backtrack note and pushes the switch index it on the tracking stack. - private void Track() - { - ReadyPushTrack(); - Ldc(AddTrack()); - DoPush(); - } - - /// - /// Pushes the current switch index on the tracking stack so the backtracking - /// logic will be repeated again next time we backtrack here. - /// - private void Trackagain() - { - ReadyPushTrack(); - Ldc(_backpos); - DoPush(); - } - - /// Saves the value of a local variable on the tracking stack. - private void PushTrack(LocalBuilder lt) - { - ReadyPushTrack(); - Ldloc(lt); - DoPush(); - } - - /// - /// Creates a backtrack note for a piece of code that should only be generated once, - /// and emits code that pushes the switch index on the backtracking stack. - /// - private void TrackUnique(int i) - { - ReadyPushTrack(); - Ldc(AddUniqueTrack(i)); - DoPush(); - } - - /// - /// Creates a second-backtrack note for a piece of code that should only be - /// generated once, and emits code that pushes the switch index on the - /// backtracking stack. - /// - private void TrackUnique2(int i) - { - ReadyPushTrack(); - Ldc(AddUniqueTrack(i, RegexCode.Back2)); - DoPush(); - } - - /// Prologue to code that will push an element on the tracking stack. - private void ReadyPushTrack() - { - Ldloc(_runtrackposLocal!); - Ldc(1); - Sub(); - Stloc(_runtrackposLocal!); - Ldloc(_runtrackLocal!); - Ldloc(_runtrackposLocal!); - } - - /// Pops an element off the tracking stack (leave it on the operand stack). 
- private void PopTrack() - { - Ldloc(_runtrackLocal!); - Ldloc(_runtrackposLocal!); - LdelemI4(); - using RentedLocalBuilder tmp = RentInt32Local(); - Stloc(tmp); - Ldloc(_runtrackposLocal!); - Ldc(1); - Add(); - Stloc(_runtrackposLocal!); - Ldloc(tmp); - } - - /// Retrieves the top entry on the tracking stack without popping. - private void TopTrack() - { - Ldloc(_runtrackLocal!); - Ldloc(_runtrackposLocal!); - LdelemI4(); - } - - /// Saves the value of a local variable on the grouping stack. - private void PushStack(LocalBuilder lt) - { - ReadyPushStack(); - Ldloc(lt); - DoPush(); - } - - /// Prologue to code that will replace the ith element on the grouping stack. - internal void ReadyReplaceStack(int i) - { - Ldloc(_runstackLocal!); - Ldloc(_runstackposLocal!); - if (i != 0) - { - Ldc(i); - Add(); - } - } - - /// Prologue to code that will push an element on the grouping stack. - private void ReadyPushStack() - { - Ldloc(_runstackposLocal!); - Ldc(1); - Sub(); - Stloc(_runstackposLocal!); - Ldloc(_runstackLocal!); - Ldloc(_runstackposLocal!); - } - - /// Retrieves the top entry on the stack without popping. - private void TopStack() - { - Ldloc(_runstackLocal!); - Ldloc(_runstackposLocal!); - LdelemI4(); - } - - /// Pops an element off the grouping stack (leave it on the operand stack). - private void PopStack() - { - using RentedLocalBuilder elementLocal = RentInt32Local(); - Ldloc(_runstackLocal!); - Ldloc(_runstackposLocal!); - LdelemI4(); - Stloc(elementLocal); - Ldloc(_runstackposLocal!); - Ldc(1); - Add(); - Stloc(_runstackposLocal!); - Ldloc(elementLocal); - } - - /// Pops 1 element off the grouping stack and discards it. - private void PopDiscardStack() => PopDiscardStack(1); - - /// Pops i elements off the grouping stack and discards them. - private void PopDiscardStack(int i) - { - Ldloc(_runstackposLocal!); - Ldc(i); - Add(); - Stloc(_runstackposLocal!); - } - - /// Epilogue to code that will replace an element on a stack (use Ld* in between). 
- private void DoReplace() => StelemI4(); - - /// Epilogue to code that will push an element on a stack (use Ld* in between). - private void DoPush() => StelemI4(); - - /// Jump to the backtracking switch. - private void Back() => BrFar(_backtrack); - - /// - /// Branch to the MSIL corresponding to the regex code at i - /// - /// - /// A trick: since track and stack space is gobbled up unboundedly - /// only as a result of branching backwards, this is where we check - /// for sufficient space and trigger reallocations. - /// - /// If the "goto" is backwards, we generate code that checks - /// available space against the amount of space that would be needed - /// in the worst case by code that will only go forward; if there's - /// not enough, we push the destination on the tracking stack, then - /// we jump to the place where we invoke the allocator. - /// - /// Since forward gotos pose no threat, they just turn into a Br. - /// - private void Goto(int i) - { - if (i < _codepos) - { - Label l1 = DefineLabel(); - - // When going backwards, ensure enough space. - Ldloc(_runtrackposLocal!); - Ldc(_trackcount * 4); - Ble(l1); - Ldloc(_runstackposLocal!); - Ldc(_trackcount * 3); - BgtFar(_labels![i]); - MarkLabel(l1); - ReadyPushTrack(); - Ldc(AddGoto(i)); - DoPush(); - BrFar(_backtrack); - } - else - { - BrFar(_labels![i]); - } - } - - /// - /// Returns the position of the next operation in the regex code, taking - /// into account the different numbers of arguments taken by operations - /// - private int NextCodepos() => _codepos + RegexCode.OpcodeSize(_codes![_codepos]); - - /// The label for the next (forward) operation. - private Label AdvanceLabel() => _labels![NextCodepos()]; - - /// Goto the next (forward) operation. - private void Advance() => BrFar(AdvanceLabel()); - /// Sets the culture local to CultureInfo.CurrentCulture. 
private void InitLocalCultureInfo() { - Debug.Assert(_textInfoLocal != null); + Debug.Assert(_textInfo != null); Call(s_cultureInfoGetCurrentCultureMethod); Callvirt(s_cultureInfoGetTextInfoMethod); - Stloc(_textInfoLocal); + Stloc(_textInfo); } - /// Whether ToLower operations should be performed with the invariant culture as opposed to the one in . - private bool UseToLowerInvariant => _textInfoLocal == null || (_options & RegexOptions.CultureInvariant) != 0; + /// Whether ToLower operations should be performed with the invariant culture as opposed to the one in . + private bool UseToLowerInvariant => _textInfo == null || (_options & RegexOptions.CultureInvariant) != 0; /// Invokes either char.ToLowerInvariant(c) or _textInfo.ToLower(c). private void CallToLower() @@ -775,169 +342,31 @@ private void CallToLower() { using RentedLocalBuilder currentCharLocal = RentInt32Local(); Stloc(currentCharLocal); - Ldloc(_textInfoLocal!); + Ldloc(_textInfo!); Ldloc(currentCharLocal); Callvirt(s_textInfoToLowerMethod); } } - /// - /// Generates the first section of the MSIL. This section contains all - /// the forward logic, and corresponds directly to the regex codes. - /// In the absence of backtracking, this is all we would need. 
- /// - private void GenerateForwardSection() - { - _uniquenote = new int[Uniquecount]; - _labels = new Label[_codes!.Length]; - _goto = new int[_codes.Length]; - - // initialize - - Array.Fill(_uniquenote, -1); - for (int codepos = 0; codepos < _codes.Length; codepos += RegexCode.OpcodeSize(_codes[codepos])) - { - _goto[codepos] = -1; - _labels[codepos] = DefineLabel(); - } - - // emit variable initializers - - Mvfldloc(s_runtextField, _runtextLocal!); - Mvfldloc(s_runtextbegField, _runtextbegLocal!); - Mvfldloc(s_runtextendField, _runtextendLocal!); - Mvfldloc(s_runtextposField, _runtextposLocal!); - Mvfldloc(s_runtrackField, _runtrackLocal!); - Mvfldloc(s_runtrackposField, _runtrackposLocal!); - Mvfldloc(s_runstackField, _runstackLocal!); - Mvfldloc(s_runstackposField, _runstackposLocal!); - - _backpos = -1; - - for (int codepos = 0; codepos < _codes.Length; codepos += RegexCode.OpcodeSize(_codes[codepos])) - { - MarkLabel(_labels[codepos]); - _codepos = codepos; - _regexopcode = _codes[codepos]; - GenerateOneCode(); - } - } - - /// - /// Generates the middle section of the MSIL. This section contains the - /// big switch jump that allows us to simulate a stack of addresses, - /// and it also contains the calls that expand the tracking and the - /// grouping stack when they get too full. - /// - private void GenerateMiddleSection() - { - using RentedLocalBuilder limitLocal = RentInt32Local(); - Label afterDoubleStack = DefineLabel(); - Label afterDoubleTrack = DefineLabel(); - - // Backtrack: - MarkLabel(_backtrack); - - // (Equivalent of EnsureStorage, but written to avoid unnecessary local spilling.) 
- - // int limitLocal = runtrackcount * 4; - Ldthisfld(s_runtrackcountField); - Ldc(4); - Mul(); - Stloc(limitLocal); - - // if (runstackpos < limit) - // { - // this.runstackpos = runstackpos; - // DoubleStack(); // might change runstackpos and runstack - // runstackpos = this.runstackpos; - // runstack = this.runstack; - // } - Ldloc(_runstackposLocal!); - Ldloc(limitLocal); - Bge(afterDoubleStack); - Mvlocfld(_runstackposLocal!, s_runstackposField); - Ldthis(); - Call(s_doubleStackMethod); - Mvfldloc(s_runstackposField, _runstackposLocal!); - Mvfldloc(s_runstackField, _runstackLocal!); - MarkLabel(afterDoubleStack); - - // if (runtrackpos < limit) - // { - // this.runtrackpos = runtrackpos; - // DoubleTrack(); // might change runtrackpos and runtrack - // runtrackpos = this.runtrackpos; - // runtrack = this.runtrack; - // } - Ldloc(_runtrackposLocal!); - Ldloc(limitLocal); - Bge(afterDoubleTrack); - Mvlocfld(_runtrackposLocal!, s_runtrackposField); - Ldthis(); - Call(s_doubleTrackMethod); - Mvfldloc(s_runtrackposField, _runtrackposLocal!); - Mvfldloc(s_runtrackField, _runtrackLocal!); - MarkLabel(afterDoubleTrack); - - // runtrack[runtrackpos++] - PopTrack(); - - // Backtracking jump table - var table = new Label[_notecount]; - for (int i = 0; i < _notecount; i++) - { - table[i] = _notes![i]._label; - } - Switch(table); - } - - /// - /// Generates the last section of the MSIL. This section contains all of - /// the backtracking logic. - /// - private void GenerateBacktrackSection() - { - for (int i = 0; i < _notecount; i++) - { - BacktrackNote n = _notes![i]; - if (n._flags != 0) - { - MarkLabel(n._label); - _codepos = n._codepos; - _backpos = i; - _regexopcode = _codes![n._codepos] | n._flags; - GenerateOneCode(); - } - } - } - - /// - /// Generates FindFirstChar. - /// - protected void GenerateFindFirstChar() + /// Generates the implementation for FindFirstChar. 
+ protected void EmitFindFirstChar() { Debug.Assert(_code != null); _int32LocalsPool?.Clear(); _readOnlySpanCharLocalsPool?.Clear(); - _runtextposLocal = DeclareInt32(); - _runtextendLocal = DeclareInt32(); - if (_code.RightToLeft) - { - _runtextbegLocal = DeclareInt32(); - } - _runtextSpanLocal = DeclareReadOnlySpanChar(); - _textInfoLocal = null; + LocalBuilder runtextSpan = DeclareReadOnlySpanChar(); + LocalBuilder runtextpos = DeclareInt32(); + LocalBuilder runtextend = DeclareInt32(); + + _textInfo = null; if ((_options & RegexOptions.CultureInvariant) == 0) { bool needsCulture = _code.FindOptimizations.FindMode switch { FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or - FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive or FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or - FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or - FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive => true, + FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive => true, _ when _code.FindOptimizations.FixedDistanceSets is List<(char[]? 
Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), @@ -946,7 +375,7 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or if (needsCulture) { - _textInfoLocal = DeclareTextInfo(); + _textInfo = DeclareTextInfo(); InitLocalCultureInfo(); } } @@ -955,15 +384,11 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or // int runtextpos = this.runtextpos; // int runtextend = this.runtextend; // ReadOnlySpan runtextSpan = this.runtext.AsSpan(); - Mvfldloc(s_runtextposField, _runtextposLocal); - Mvfldloc(s_runtextendField, _runtextendLocal); + Mvfldloc(s_runtextposField, runtextpos); + Mvfldloc(s_runtextendField, runtextend); Ldthisfld(s_runtextField); Call(s_stringAsSpanMethod); - Stloc(_runtextSpanLocal); - if (_code.RightToLeft) - { - Mvfldloc(s_runtextbegField, _runtextbegLocal!); - } + Stloc(runtextSpan); // Generate length check. If the input isn't long enough to possibly match, fail quickly. // It's rare for min required length to be 0, so we don't bother special-casing the check, @@ -972,46 +397,25 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or Debug.Assert(minRequiredLength >= 0); Label returnFalse = DefineLabel(); Label finishedLengthCheck = DefineLabel(); - if (!_code.RightToLeft) - { - // if (runtextpos > runtextend - _code.Tree.MinRequiredLength) - // { - // this.runtextpos = runtextend; - // return false; - // } - Ldloc(_runtextposLocal); - Ldloc(_runtextendLocal); - if (minRequiredLength > 0) - { - Ldc(minRequiredLength); - Sub(); - } - Ble(finishedLengthCheck); - MarkLabel(returnFalse); - Ldthis(); - Ldloc(_runtextendLocal); - } - else + // if (runtextpos > runtextend - _code.Tree.MinRequiredLength) + // { + // this.runtextpos = runtextend; + // return false; + // } + Ldloc(runtextpos); + Ldloc(runtextend); + if (minRequiredLength > 0) { - // if (runtextpos - _code.Tree.MinRequiredLength < runtextbeg) - // { - // this.runtextpos = 
runtextbeg; - // return false; - // } - Ldloc(_runtextposLocal); - if (minRequiredLength > 0) - { - Ldc(minRequiredLength); - Sub(); - } - Ldloc(_runtextbegLocal!); - Bge(finishedLengthCheck); - - MarkLabel(returnFalse); - Ldthis(); - Ldloc(_runtextbegLocal!); + Ldc(minRequiredLength); + Sub(); } + Ble(finishedLengthCheck); + + MarkLabel(returnFalse); + Ldthis(); + Ldloc(runtextend); + Stfld(s_runtextposField); Ldc(0); Ret(); @@ -1024,17 +428,11 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or } // Either anchors weren't specified, or they don't completely root all matches to a specific location. - switch (_code.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix)); - GenerateIndexOf_LeftToRight(_code.FindOptimizations.LeadingCaseSensitivePrefix); - break; - - case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: - Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix)); - GenerateIndexOf_RightToLeft(_code.FindOptimizations.LeadingCaseSensitivePrefix); + EmitIndexOf_LeftToRight(_code.FindOptimizations.LeadingCaseSensitivePrefix); break; case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: @@ -1042,13 +440,7 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - GenerateFixedSet_LeftToRight(); - break; - - case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: - case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: - Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - GenerateFixedSet_RightToLeft(); + EmitFixedSet_LeftToRight(); break; 
default: @@ -1074,21 +466,10 @@ bool GenerateAnchors() case RegexPrefixAnalyzer.Beginning: { Label l1 = DefineLabel(); - Ldloc(_runtextposLocal); - if (!_code.RightToLeft) - { - Ldthisfld(s_runtextbegField); - Ble(l1); - Br(returnFalse); - } - else - { - Ldloc(_runtextbegLocal!); - Ble(l1); - Ldthis(); - Ldloc(_runtextbegLocal!); - Stfld(s_runtextposField); - } + Ldloc(runtextpos); + Ldthisfld(s_runtextbegField); + Ble(l1); + Br(returnFalse); MarkLabel(l1); } Ldc(1); @@ -1098,16 +479,9 @@ bool GenerateAnchors() case RegexPrefixAnalyzer.Start: { Label l1 = DefineLabel(); - Ldloc(_runtextposLocal); + Ldloc(runtextpos); Ldthisfld(s_runtextstartField); - if (!_code.RightToLeft) - { - Ble(l1); - } - else - { - Bge(l1); - } + Ble(l1); Br(returnFalse); MarkLabel(l1); } @@ -1118,41 +492,17 @@ bool GenerateAnchors() case RegexPrefixAnalyzer.EndZ: { Label l1 = DefineLabel(); - if (!_code.RightToLeft) - { - Ldloc(_runtextposLocal); - Ldloc(_runtextendLocal); - Ldc(1); - Sub(); - Bge(l1); - Ldthis(); - Ldloc(_runtextendLocal); - Ldc(1); - Sub(); - Stfld(s_runtextposField); - MarkLabel(l1); - } - else - { - Label l2 = DefineLabel(); - Ldloc(_runtextposLocal); - Ldloc(_runtextendLocal); - Ldc(1); - Sub(); - Blt(l1); - Ldloc(_runtextposLocal); - Ldloc(_runtextendLocal); - Beq(l2); - Ldloca(_runtextSpanLocal); - Ldloc(_runtextposLocal); - Call(s_spanGetItemMethod); - LdindU2(); - Ldc('\n'); - Beq(l2); - MarkLabel(l1); - BrFar(returnFalse); - MarkLabel(l2); - } + Ldloc(runtextpos); + Ldloc(runtextend); + Ldc(1); + Sub(); + Bge(l1); + Ldthis(); + Ldloc(runtextend); + Ldc(1); + Sub(); + Stfld(s_runtextposField); + MarkLabel(l1); } Ldc(1); Ret(); @@ -1161,20 +511,12 @@ bool GenerateAnchors() case RegexPrefixAnalyzer.End: { Label l1 = DefineLabel(); - Ldloc(_runtextposLocal); - Ldloc(_runtextendLocal); - if (!_code.RightToLeft) - { - Bge(l1); - Ldthis(); - Ldloc(_runtextendLocal); - Stfld(s_runtextposField); - } - else - { - Bge(l1); - Br(returnFalse); - } + Ldloc(runtextpos); + 
Ldloc(runtextend); + Bge(l1); + Ldthis(); + Ldloc(runtextend); + Stfld(s_runtextposField); MarkLabel(l1); } Ldc(1); @@ -1188,17 +530,16 @@ bool GenerateAnchors() // the other anchors, which all skip all subsequent processing if found, with BOL we just use it // to boost our position to the next line, and then continue normally with any prefix or char class searches. - Debug.Assert(!_code.RightToLeft, "RightToLeft isn't implemented and should have been filtered out previously"); Label atBeginningOfLine = DefineLabel(); // if (runtextpos > runtextbeg... - Ldloc(_runtextposLocal!); + Ldloc(runtextpos!); Ldthisfld(s_runtextbegField); Ble(atBeginningOfLine); // ... && runtextSpan[runtextpos - 1] != '\n') { ... } - Ldloca(_runtextSpanLocal); - Ldloc(_runtextposLocal); + Ldloca(runtextSpan); + Ldloc(runtextpos); Ldc(1); Sub(); Call(s_spanGetItemMethod); @@ -1207,8 +548,8 @@ bool GenerateAnchors() Beq(atBeginningOfLine); // int tmp = runtextSpan.Slice(runtextpos).IndexOf('\n'); - Ldloca(_runtextSpanLocal); - Ldloc(_runtextposLocal); + Ldloca(runtextSpan); + Ldloc(runtextpos); Call(s_spanSliceIntMethod); Ldc('\n'); Call(s_spanIndexOfChar); @@ -1225,20 +566,20 @@ bool GenerateAnchors() Ldc(-1); Beq(returnFalse); Ldloc(newlinePos); - Ldloc(_runtextposLocal); + Ldloc(runtextpos); Add(); Ldc(1); Add(); - Ldloc(_runtextendLocal); + Ldloc(runtextend); Bgt(returnFalse); // runtextpos = newlinePos + runtextpos + 1; Ldloc(newlinePos); - Ldloc(_runtextposLocal); + Ldloc(runtextpos); Add(); Ldc(1); Add(); - Stloc(_runtextposLocal); + Stloc(runtextpos); } MarkLabel(atBeginningOfLine); @@ -1250,15 +591,15 @@ bool GenerateAnchors() return false; } - void GenerateIndexOf_LeftToRight(string prefix) + void EmitIndexOf_LeftToRight(string prefix) { using RentedLocalBuilder i = RentInt32Local(); // int i = runtextSpan.Slice(runtextpos, runtextend - runtextpos).IndexOf(prefix); - Ldloca(_runtextSpanLocal); - Ldloc(_runtextposLocal); - Ldloc(_runtextendLocal); - Ldloc(_runtextposLocal); + 
Ldloca(runtextSpan); + Ldloc(runtextpos); + Ldloc(runtextend); + Ldloc(runtextpos); Sub(); Call(s_spanSliceIntIntMethod); Ldstr(prefix); @@ -1274,135 +615,15 @@ void GenerateIndexOf_LeftToRight(string prefix) // base.runtextpos = runtextpos + i; // return true; Ldthis(); - Ldloc(_runtextposLocal); - Ldloc(i); - Add(); - Stfld(s_runtextposField); - Ldc(1); - Ret(); - } - - void GenerateIndexOf_RightToLeft(string prefix) - { - using RentedLocalBuilder i = RentInt32Local(); - - // int i = runtextSpan.Slice(runtextbeg, runtextpos - runtextbeg).LastIndexOf(prefix); - Ldloca(_runtextSpanLocal); - Ldloc(_runtextbegLocal!); - Ldloc(_runtextposLocal); - Ldloc(_runtextbegLocal!); - Sub(); - Call(s_spanSliceIntIntMethod); - Ldstr(prefix); - Call(s_stringAsSpanMethod); - Call(s_spanLastIndexOfSpan); - Stloc(i); - - // if (i < 0) goto ReturnFalse; - Ldloc(i); - Ldc(0); - BltFar(returnFalse); - - // base.runtextpos = runtextbeg + i + LeadingCaseSensitivePrefix.Length; - // return true; - Ldthis(); - Ldloc(_runtextbegLocal!); + Ldloc(runtextpos); Ldloc(i); Add(); - Ldc(prefix.Length); - Add(); Stfld(s_runtextposField); Ldc(1); Ret(); } - void GenerateFixedSet_RightToLeft() - { - (char[]? 
Chars, string Set, int Distance, bool CaseInsensitive) set = _code.FindOptimizations.FixedDistanceSets![0]; - Debug.Assert(set.Distance == 0); - - using RentedLocalBuilder i = RentInt32Local(); - - if (set.Chars is { Length: 1 } && !set.CaseInsensitive) - { - // int i = runtextSpan.Slice(runtextbeg, runtextpos - runtextbeg).LastIndexOf(set.Chars[0]); - Ldloca(_runtextSpanLocal); - Ldloc(_runtextbegLocal!); - Ldloc(_runtextposLocal); - Ldloc(_runtextbegLocal!); - Sub(); - Call(s_spanSliceIntIntMethod); - Ldc(set.Chars[0]); - Call(s_spanLastIndexOfChar); - Stloc(i); - - // if (i < 0) goto ReturnFalse; - Ldloc(i); - Ldc(0); - BltFar(returnFalse); - - // base.runtextpos = runtextbeg + i + 1; - // return true; - Ldthis(); - Ldloc(_runtextbegLocal!); - Ldloc(i); - Add(); - Ldc(1); - Add(); - Stfld(s_runtextposField); - Ldc(1); - Ret(); - } - else - { - Label condition = DefineLabel(); - Label increment = DefineLabel(); - Label body = DefineLabel(); - - // for (int i = runtextpos - 1; ... - Ldloc(_runtextposLocal); - Ldc(1); - Sub(); - Stloc(i); - BrFar(condition); - - // if (MatchCharClass(runtextSpan[i], set)) - MarkLabel(body); - Ldloca(_runtextSpanLocal); - Ldloc(i); - Call(s_spanGetItemMethod); - LdindU2(); - EmitMatchCharacterClass(set.Set, set.CaseInsensitive); - Brfalse(increment); - - // base.runtextpos = i + 1; - // return true; - Ldthis(); - Ldloc(i); - Ldc(1); - Add(); - Stfld(s_runtextposField); - Ldc(1); - Ret(); - - // for (...; ...; i--) - MarkLabel(increment); - Ldloc(i); - Ldc(1); - Sub(); - Stloc(i); - - // for (...; i >= runtextbeg; ...) - MarkLabel(condition); - Ldloc(i); - Ldloc(_runtextbegLocal!); - BgeFar(body); - - BrFar(returnFalse); - } - } - - void GenerateFixedSet_LeftToRight() + void EmitFixedSet_LeftToRight() { List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _code.FindOptimizations.FixedDistanceSets; (char[]? 
Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; @@ -1413,10 +634,10 @@ void GenerateFixedSet_LeftToRight() using RentedLocalBuilder textSpanLocal = RentReadOnlySpanCharLocal(); // ReadOnlySpan span = runtextSpan.Slice(runtextpos, runtextend - runtextpos); - Ldloca(_runtextSpanLocal); - Ldloc(_runtextposLocal); - Ldloc(_runtextendLocal); - Ldloc(_runtextposLocal); + Ldloca(runtextSpan); + Ldloc(runtextpos); + Ldloc(runtextend); + Ldloc(runtextpos); Sub(); Call(s_spanSliceIntIntMethod); Stloc(textSpanLocal); @@ -1565,7 +786,7 @@ void GenerateFixedSet_LeftToRight() // this.runtextpos = runtextpos + i; // return true; Ldthis(); - Ldloc(_runtextposLocal); + Ldloc(runtextpos); Ldloc(iLocal); Add(); Stfld(s_runtextposField); @@ -1601,36 +822,45 @@ void GenerateFixedSet_LeftToRight() } } - private bool TryGenerateSimplifiedGo(RegexNode node) + /// Generates the implementation for Go. + protected void EmitGo() { + // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled + // version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via + // RegexWriter; the interpreter would then sit in a loop processing those opcodes, and the RegexCompiler iterated through the + // opcodes generating code for each equivalent to what the interpreter would do albeit with some decisions made at compile-time + // rather than at run-time. This approach, however, led to complicated code that wasn't pay-for-play (e.g. a big backtracking + // jump table that all compilations went through even if there was no backtracking), that didn't factor in the shape of the + // tree (e.g. it's difficult to add optimizations based on interactions between nodes in the graph), and that didn't read well + // when decompiled from IL to C# or when directly emitted as C# as part of a source generator.
+ // + // This implementation is instead based on directly walking the RegexNode tree and outputting code for each node in the graph. + // A dedicated function for each kind of RegexNode emits the code necessary to handle that node's processing, including recursively + // calling the relevant function for any of its children nodes. Backtracking is handled not via a giant jump table, but instead + // by emitting direct jumps to each backtracking construct. This is achieved by having all match failures jump to a "done" + // label that can be changed by a previous emitter, e.g. before EmitLoop returns, it ensures that "doneLabel" is set to the + // label that code should jump back to when backtracking. That way, a subsequent EmitXx function doesn't need to know exactly + // where to jump: it simply always jumps to "doneLabel" on match failure, and "doneLabel" is always configured to point to + // the right location. In an expression without backtracking, or before any backtracking constructs have been encountered, + // "doneLabel" is simply the final return location from the Go method that will undo any captures and exit, signaling to + // the calling scan loop that nothing was matched. + + Debug.Assert(_code != null); + _int32LocalsPool?.Clear(); + _readOnlySpanCharLocalsPool?.Clear(); + + // Get the root Capture node of the tree. + RegexNode node = _code.Tree.Root; Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node"); Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); - // RightToLeft is rare and not worth adding a lot of custom code to handle in this path. - if ((node.Options & RegexOptions.RightToLeft) != 0) - { - return false; - } - // Skip the Capture node. We handle the implicit root capture specially. node = node.Child(0); - if (!node.SupportsSimplifiedCodeGenerationImplementation()) - { - return false; - } - - // We've determined that the RegexNode can be handled with this optimized path.
Generate the code. -#if DEBUG - if ((_options & RegexOptions.Debug) != 0) - { - Debug.WriteLine("Using optimized non-backtracking code gen."); - } -#endif // In some limited cases, FindFirstChar will only return true if it successfully matched the whole thing. // This is the case, in particular, for strings. We can special case these to do essentially nothing // in Go other than emit the capture. - if (!IsCaseInsensitive(node)) // FindFirstChar may not be 100% accurate on casing in all cultures + if (!IsCaseInsensitive(node)) // FindFirstChar may yield false positives on these in some cultures when case-insensitive { switch (node.Type) { @@ -1654,11 +884,11 @@ private bool TryGenerateSimplifiedGo(RegexNode node) Add(); Stfld(s_runtextposField); Ret(); - return true; + return; } } - // Declare some locals. + // Initialize the main locals used throughout the implementation. LocalBuilder runtextLocal = DeclareString(); LocalBuilder originalruntextposLocal = DeclareInt32(); LocalBuilder runtextposLocal = DeclareInt32(); @@ -1669,7 +899,7 @@ private bool TryGenerateSimplifiedGo(RegexNode node) Label originalDoneLabel = doneLabel; if (_hasTimeout) { - _loopTimeoutCounterLocal = DeclareInt32(); + _loopTimeoutCounter = DeclareInt32(); } // CultureInfo culture = CultureInfo.CurrentCulture; // only if the whole expression or any subportion is ignoring case, and we're not using invariant @@ -1758,8 +988,8 @@ private bool TryGenerateSimplifiedGo(RegexNode node) // return; Ret(); - // Generated code successfully with non-backtracking implementation. - return true; + // Generated code successfully. + return; static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0; @@ -1859,6 +1089,7 @@ void EmitAlternation(RegexNode node) Label matchLabel = DefineLabel(); // Save off runtextpos. We'll need to reset this each time a branch fails. 
+ // startingRunTextPos = runtextpos; LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); @@ -1882,10 +1113,12 @@ void EmitAlternation(RegexNode node) // construct is responsible for unwinding back to its starting crawl position. If // it eventually ends up failing, that failure will result in jumping to the next branch // of the alternation, which will again dutifully unwind the remaining captures until - // what they were at the start of the alternation. + // what they were at the start of the alternation. Of course, if there are no captures + // anywhere in the regex, we don't have to do any of that. LocalBuilder? startingCrawlpos = null; - if ((node.Options & RegexNode.HasCapturesFlag) != 0 || !isAtomic) + if (expressionHasCaptures && ((node.Options & RegexNode.HasCapturesFlag) != 0 || !isAtomic)) { + // startingCrawlpos = base.Crawlpos(); startingCrawlpos = DeclareInt32(); Ldthis(); Call(s_crawlposMethod); @@ -1931,9 +1164,16 @@ void EmitAlternation(RegexNode node) // still points to the nextBranch, which similarly is where we'll want to jump to. if (!isAtomic) { + // if (runstackpos + 3 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); + // base.runstack[runstackpos++] = i; + // base.runstack[runstackpos++] = startingCrawlpos; + // base.runstack[runstackpos++] = startingRunTextPos; EmitRunstackResizeIfNeeded(3); EmitRunstackPush(() => Ldc(i)); - EmitRunstackPush(() => Ldloc(startingCrawlpos!)); + if (startingCrawlpos is not null) + { + EmitRunstackPush(() => Ldloc(startingCrawlpos)); + } EmitRunstackPush(() => Ldloc(startingRunTextPos)); } labelMap[i] = doneLabel; @@ -1942,6 +1182,9 @@ void EmitAlternation(RegexNode node) // Before jumping to the end, we need to zero out textSpanPos, so that no // matter what the value is after the branch, whatever follows the alternate // will see the same textSpanPos. 
+ // runtextpos += textSpanPos; + // textSpanPos = 0; + // goto matchLabel; TransferTextSpanPosToRunTextPos(); BrFar(matchLabel); @@ -1951,6 +1194,10 @@ void EmitAlternation(RegexNode node) // needs to be reset, uncapturing it. if (!isLastBranch) { + // NextBranch: + // runtextpos = startingRunTextPos; + // textSpan = runtext.AsSpan(runtextpos, runtextend - runtextpos); + // while (base.Crawlpos() > startingCrawlpos) base.Uncapture(); MarkLabel(nextBranch); Ldloc(startingRunTextPos); Stloc(runtextposLocal); @@ -1970,7 +1217,11 @@ void EmitAlternation(RegexNode node) // "doneLabel" to the label for this section. Thus, we only need to emit it if // something can backtrack to us, which can't happen if we're inside of an atomic // node. Thus, emit the backtracking section only if we're non-atomic. - if (!isAtomic) + if (isAtomic) + { + doneLabel = originalDoneLabel; + } + else { doneLabel = backtrackLabel; MarkLabel(backtrackLabel); @@ -1980,8 +1231,11 @@ void EmitAlternation(RegexNode node) // switch (base.runstack[--runstackpos]) { ... } // branch number EmitRunstackPop(); Stloc(startingRunTextPos); - EmitRunstackPop(); - Stloc(startingCrawlpos!); + if (startingCrawlpos is not null) + { + EmitRunstackPop(); + Stloc(startingCrawlpos); + } EmitRunstackPop(); Switch(labelMap); } @@ -2083,6 +1337,8 @@ void EmitBackreference(RegexNode node) // Emits the code for an if(backreference)-then-else conditional. void EmitBackreferenceConditional(RegexNode node) { + bool isAtomic = node.IsAtomicByParent(); + // We're branching in a complicated fashion. Make sure textSpanPos is 0. 
TransferTextSpanPosToRunTextPos(); @@ -2113,11 +1369,13 @@ void EmitBackreferenceConditional(RegexNode node) Label postIfDoneLabel = doneLabel; if (postIfDoneLabel != originalDoneLabel) { + // resumeAt = 0; Ldc(0); Stloc(resumeAt); } if (postIfDoneLabel != originalDoneLabel || hasNo) { + // goto endRef; BrFar(endRef); } @@ -2133,6 +1391,7 @@ void EmitBackreferenceConditional(RegexNode node) postElseDoneLabel = doneLabel; if (postElseDoneLabel != originalDoneLabel) { + // resumeAt = 1; Ldc(1); Stloc(resumeAt); } @@ -2144,51 +1403,62 @@ void EmitBackreferenceConditional(RegexNode node) // that will cause the backtracking to immediately pass through this node. if (postIfDoneLabel != originalDoneLabel) { + // resumeAt = 2; Ldc(2); Stloc(resumeAt); } } - // If either the yes branch or the no branch contained backtracking, subsequent expressions - // might try to backtrack to here, so output a backtracking map based on resumeAt. - if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel) + if (isAtomic) + { + doneLabel = originalDoneLabel; + } + else { - // Skip the backtracking section - Br(endRef); + // If either the yes branch or the no branch contained backtracking, subsequent expressions + // might try to backtrack to here, so output a backtracking map based on resumeAt. 
+ if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel) + { + // Skip the backtracking section + // goto endRef; + Br(endRef); - Label backtrack = DefineLabel(); - doneLabel = backtrack; - MarkLabel(backtrack); + Label backtrack = DefineLabel(); + doneLabel = backtrack; + MarkLabel(backtrack); - // resumeAt = base.runstack[--runstackpos]; - EmitRunstackPop(); - Stloc(resumeAt); + // resumeAt = base.runstack[--runstackpos]; + EmitRunstackPop(); + Stloc(resumeAt); - if (postIfDoneLabel != originalDoneLabel) - { - // if (resumeAt == 0) goto postIfDoneLabel; - Ldloc(resumeAt); - Ldc(0); - BeqFar(postIfDoneLabel); - } + if (postIfDoneLabel != originalDoneLabel) + { + // if (resumeAt == 0) goto postIfDoneLabel; + Ldloc(resumeAt); + Ldc(0); + BeqFar(postIfDoneLabel); + } - if (postElseDoneLabel != originalDoneLabel) - { - // if (resumeAt == 1) goto postElseDoneLabel; - Ldloc(resumeAt); - Ldc(1); - BeqFar(postElseDoneLabel); - } + if (postElseDoneLabel != originalDoneLabel) + { + // if (resumeAt == 1) goto postElseDoneLabel; + Ldloc(resumeAt); + Ldc(1); + BeqFar(postElseDoneLabel); + } - // goto originalDoneLabel; - BrFar(originalDoneLabel); + // goto originalDoneLabel; + BrFar(originalDoneLabel); + } } if (postIfDoneLabel != originalDoneLabel || hasNo) { MarkLabel(endRef); - if (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel) + if (!isAtomic && (postIfDoneLabel != originalDoneLabel || postElseDoneLabel != originalDoneLabel)) { + // if (runstackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); + // base.runstack[runstackpos++] = resumeAt; EmitRunstackResizeIfNeeded(1); EmitRunstackPush(() => Ldloc(resumeAt)); } @@ -2198,6 +1468,8 @@ void EmitBackreferenceConditional(RegexNode node) // Emits the code for an if(expression)-then-else conditional. 
void EmitExpressionConditional(RegexNode node) { + bool isAtomic = node.IsAtomicByParent(); + // We're branching in a complicated fashion. Make sure textSpanPos is 0. TransferTextSpanPosToRunTextPos(); @@ -2243,7 +1515,7 @@ void EmitExpressionConditional(RegexNode node) } Label postConditionalDoneLabel = doneLabel; - LocalBuilder resumeAt = DeclareInt32(); + LocalBuilder? resumeAt = !isAtomic ? DeclareInt32() : null; // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch. // Since the "yes" branch may have a different execution path than the "no" branch or the lack of @@ -2253,13 +1525,15 @@ void EmitExpressionConditional(RegexNode node) EmitNode(yesBranch); TransferTextSpanPosToRunTextPos(); // ensure all subsequent code sees the same textSpanPos value by setting it to 0 Label postYesDoneLabel = doneLabel; - if (postYesDoneLabel != originalDoneLabel) + if (resumeAt is not null && postYesDoneLabel != originalDoneLabel) { + // resumeAt = 0; Ldc(0); Stloc(resumeAt); } if (postYesDoneLabel != originalDoneLabel || noBranch is not null) { + // goto end; BrFar(end); } @@ -2272,6 +1546,7 @@ void EmitExpressionConditional(RegexNode node) MarkLabel(no); if (startingCrawlPos is not null) { + // while (base.Crawlpos() > startingCrawlPos) base.Uncapture(); EmitUncaptureUntil(startingCrawlPos); } @@ -2282,6 +1557,7 @@ void EmitExpressionConditional(RegexNode node) postNoDoneLabel = doneLabel; if (postNoDoneLabel != originalDoneLabel) { + // goto end; BrFar(end); } } @@ -2290,43 +1566,57 @@ void EmitExpressionConditional(RegexNode node) // There's only a yes branch. If it's going to cause us to output a backtracking // label but code may not end up taking the yes branch path, we need to emit a resumeAt // that will cause the backtracking to immediately pass through this node. 
- if (postYesDoneLabel != originalDoneLabel) + if (resumeAt is not null && postYesDoneLabel != originalDoneLabel) { + // resumeAt = 2; Ldc(2); Stloc(resumeAt); } } - if (postYesDoneLabel != postConditionalDoneLabel || postNoDoneLabel != postConditionalDoneLabel) + if (isAtomic) { - // Skip the backtracking section. - BrFar(end); + doneLabel = originalDoneLabel; + } + else + { + Debug.Assert(resumeAt is not null); + if (postYesDoneLabel != postConditionalDoneLabel || postNoDoneLabel != postConditionalDoneLabel) + { + // Skip the backtracking section. + BrFar(end); - Label backtrack = DefineLabel(); - doneLabel = backtrack; - MarkLabel(backtrack); + Label backtrack = DefineLabel(); + doneLabel = backtrack; + MarkLabel(backtrack); - if (postYesDoneLabel != postConditionalDoneLabel) - { - Ldloc(resumeAt); - Ldc(0); - BeqFar(postYesDoneLabel); + if (postYesDoneLabel != postConditionalDoneLabel) + { + // if (resumeAt == 0) goto postYesDoneLabel; + Ldloc(resumeAt); + Ldc(0); + BeqFar(postYesDoneLabel); + } + + if (postNoDoneLabel != postConditionalDoneLabel && postNoDoneLabel != originalDoneLabel) + { + // if (resumeAt == 1) goto postNoDoneLabel; + Ldloc(resumeAt); + Ldc(1); + BeqFar(postNoDoneLabel); + } + + // goto postConditionalDoneLabel; + BrFar(postConditionalDoneLabel); } - if (postNoDoneLabel != postConditionalDoneLabel && postNoDoneLabel != originalDoneLabel) + if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel) { - Ldloc(resumeAt); - Ldc(1); - BeqFar(postNoDoneLabel); + // if (runstackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); + // base.runstack[runstackpos++] = resumeAt; + EmitRunstackResizeIfNeeded(1); + EmitRunstackPush(() => Ldloc(resumeAt)); } - - BrFar(postConditionalDoneLabel); - } - - if (postYesDoneLabel != originalDoneLabel || postNoDoneLabel != originalDoneLabel) - { - EmitRunstackResizeIfNeeded(1); - EmitRunstackPush(() => Ldloc(resumeAt)); } MarkLabel(end); @@ -2338,6 
+1628,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) Debug.Assert(node.Type == RegexNode.Capture); int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); int uncapnum = RegexParser.MapCaptureNumber(node.N, _code.Caps); + bool isAtomic = node.IsAtomicByParent(); // runtextpos += textSpanPos; // textSpan = textSpan.Slice(textSpanPos); @@ -2388,8 +1679,10 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) Call(s_transferCaptureMethod); } - if (childBacktracks || node.IsInLoop()) + if (!isAtomic && (childBacktracks || node.IsInLoop())) { + // if (runstackpos + 1 >= base.runstack.Length) Array.Resize(ref base.runstack, base.runstack.Length * 2); + // base.runstack[runstackpos++] = startingRunTextPos; EmitRunstackResizeIfNeeded(1); EmitRunstackPush(() => Ldloc(startingRunTextPos)); @@ -2417,6 +1710,10 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) doneLabel = backtrack; MarkLabel(end); } + else + { + doneLabel = originalDoneLabel; + } } // Emits code to unwind the capture stack until the crawl position specified in the provided local. @@ -2447,6 +1744,7 @@ void EmitPositiveLookaheadAssertion(RegexNode node) Label originalDoneLabel = doneLabel; // Save off runtextpos. We'll need to reset this upon successful completion of the lookahead. + // startingRunTextPos = runtextpos; LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); @@ -2457,6 +1755,8 @@ void EmitPositiveLookaheadAssertion(RegexNode node) // After the child completes successfully, reset the text positions. // Do not reset captures, which persist beyond the lookahead. + // runtextpos = startingRunTextPos; + // textSpan = runtext.AsSpan(runtextpos, runtextend - runtextpos); Ldloc(startingRunTextPos); Stloc(runtextposLocal); LoadTextSpanLocal(); @@ -2472,6 +1772,7 @@ void EmitNegativeLookaheadAssertion(RegexNode node) Label originalDoneLabel = doneLabel; // Save off runtextpos. 
We'll need to reset this upon successful completion of the lookahead. + // startingRunTextPos = runtextpos; LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); @@ -2485,6 +1786,7 @@ void EmitNegativeLookaheadAssertion(RegexNode node) // If the generated code ends up here, it matched the lookahead, which actually // means failure for a _negative_ lookahead, so we need to jump to the original done. + // goto originalDoneLabel; BrFar(originalDoneLabel); // Failures (success for a negative lookahead) jump here. @@ -2495,6 +1797,7 @@ void EmitNegativeLookaheadAssertion(RegexNode node) } // After the child completes in failure (success for negative lookahead), reset the text positions. + // runtextpos = startingRunTextPos; Ldloc(startingRunTextPos); Stloc(runtextposLocal); LoadTextSpanLocal(); @@ -2646,9 +1949,12 @@ void EmitUpdateBumpalong() // Emits code for a concatenation void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired) { + // Emit the code for each child one after the other. int childCount = node.ChildCount(); for (int i = 0; i < childCount; i++) { + // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence + // and then skip the individual length checks for each. if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) { EmitSpanLengthCheck(requiredLength); @@ -3110,12 +2416,14 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true) // Track the current runtextpos. Each time we backtrack, we'll reset to the stored position, which // is also incremented each time we match another character in the loop. + // int startingRunTextPos = runtextpos; LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); // Skip the backtracking section for the initial subsequent matching. 
We've already matched the // minimum number of iterations, which means we can successfully match with zero additional iterations. + // goto endLoopLabel; Label endLoopLabel = DefineLabel(); BrFar(endLoopLabel); @@ -3127,6 +2435,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true) // are before this node, in which case this is wasted effort, but still functionally correct. if (crawlPos is not null) { + // while (base.Crawlpos() > crawlPos) base.Uncapture(); EmitUncaptureUntil(crawlPos); } @@ -3234,10 +2543,11 @@ void EmitLazy(RegexNode node) int minIterations = node.M; int maxIterations = node.N; Label originalDoneLabel = doneLabel; + bool isAtomic = node.IsAtomicByParent(); // If this is actually an atomic lazy loop, we need to output just the minimum number of iterations, // as nothing will backtrack into the lazy loop to get it progress further. - if (node.IsAtomicByParent()) + if (isAtomic) { switch (minIterations) { @@ -3416,51 +2726,54 @@ void EmitLazy(RegexNode node) MarkLabel(endLoop); - // Store the capture's state and skip the backtracking section - EmitRunstackResizeIfNeeded(3); - EmitRunstackPush(() => Ldloc(startingRunTextPos)); - EmitRunstackPush(() => Ldloc(iterationCount)); - EmitRunstackPush(() => Ldloc(sawEmpty)); - Label skipBacktrack = DefineLabel(); - BrFar(skipBacktrack); + if (!isAtomic) + { + // Store the capture's state and skip the backtracking section + EmitRunstackResizeIfNeeded(3); + EmitRunstackPush(() => Ldloc(startingRunTextPos)); + EmitRunstackPush(() => Ldloc(iterationCount)); + EmitRunstackPush(() => Ldloc(sawEmpty)); + Label skipBacktrack = DefineLabel(); + BrFar(skipBacktrack); - // Emit a backtracking section that restores the capture's state and then jumps to the previous done label - Label backtrack = DefineLabel(); - MarkLabel(backtrack); + // Emit a backtracking section that restores the capture's state and then jumps to the previous done label + Label backtrack = DefineLabel(); + 
MarkLabel(backtrack); - // sawEmpty = base.runstack[--runstackpos]; - // iterationCount = base.runstack[--runstackpos]; - // startingRunTextPos = base.runstack[--runstackpos]; - EmitRunstackPop(); - Stloc(sawEmpty); - EmitRunstackPop(); - Stloc(iterationCount); - EmitRunstackPop(); - Stloc(startingRunTextPos); + // sawEmpty = base.runstack[--runstackpos]; + // iterationCount = base.runstack[--runstackpos]; + // startingRunTextPos = base.runstack[--runstackpos]; + EmitRunstackPop(); + Stloc(sawEmpty); + EmitRunstackPop(); + Stloc(iterationCount); + EmitRunstackPop(); + Stloc(startingRunTextPos); - if (maxIterations == int.MaxValue) - { - // if (sawEmpty != 0) goto doneLabel; - Ldloc(sawEmpty); - Ldc(0); - BneFar(doneLabel); - } - else - { - // if (iterationCount >= maxIterations || sawEmpty != 0) goto doneLabel; - Ldloc(iterationCount); - Ldc(maxIterations); - BgeFar(doneLabel); - Ldloc(sawEmpty); - Ldc(0); - BneFar(doneLabel); - } + if (maxIterations == int.MaxValue) + { + // if (sawEmpty != 0) goto doneLabel; + Ldloc(sawEmpty); + Ldc(0); + BneFar(doneLabel); + } + else + { + // if (iterationCount >= maxIterations || sawEmpty != 0) goto doneLabel; + Ldloc(iterationCount); + Ldc(maxIterations); + BgeFar(doneLabel); + Ldloc(sawEmpty); + Ldc(0); + BneFar(doneLabel); + } - // goto body; - BrFar(body); + // goto body; + BrFar(body); - doneLabel = backtrack; - MarkLabel(skipBacktrack); + doneLabel = backtrack; + MarkLabel(skipBacktrack); + } } // Emits the code to handle a loop (repeater) with a fixed number of iterations. @@ -3849,6 +3162,7 @@ void EmitLoop(RegexNode node) Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}"); int minIterations = node.M; int maxIterations = node.N; + bool isAtomic = node.IsAtomicByParent(); // We might loop any number of times. In order to ensure this loop and subsequent code sees textSpanPos // the same regardless, we always need it to contain the same value, and the easiest such value is 0. 
@@ -3997,1901 +3311,171 @@ void EmitLoop(RegexNode node) // int poppedCrawlPos = base.runstack[--runstackpos]; // while (base.Crawlpos() > poppedCrawlPos) base.Uncapture(); using RentedLocalBuilder poppedCrawlPos = RentInt32Local(); - EmitRunstackPop(); - Stloc(poppedCrawlPos); - EmitUncaptureUntil(poppedCrawlPos); - } - LoadTextSpanLocal(); - - if (minIterations > 0) - { - // if (iterationCount == 0) goto originalDoneLabel; - Ldloc(iterationCount); - Ldc(0); - BeqFar(originalDoneLabel); - - // if (iterationCount < minIterations) goto doneLabel/originalDoneLabel; - Ldloc(iterationCount); - Ldc(minIterations); - BltFar(childBacktracks ? doneLabel : originalDoneLabel); - } - - if (childBacktracks) - { - // goto endLoop; - BrFar(endLoop); - - // Backtrack: - Label backtrack = DefineLabel(); - MarkLabel(backtrack); - - // if (iterationCount == 0) goto originalDoneLabel; - Ldloc(iterationCount); - Ldc(0); - BeqFar(originalDoneLabel); - - // goto doneLabel; - BrFar(doneLabel); - - doneLabel = backtrack; - } - - MarkLabel(endLoop); - - if (node.IsInLoop()) - { - // Store the capture's state - EmitRunstackResizeIfNeeded(3); - EmitRunstackPush(() => Ldloc(startingRunTextPos)); - EmitRunstackPush(() => Ldloc(iterationCount)); - - // Skip past the backtracking section - // goto end; - Label end = DefineLabel(); - BrFar(end); - - // Emit a backtracking section that restores the capture's state and then jumps to the previous done label - Label backtrack = DefineLabel(); - MarkLabel(backtrack); - - // iterationCount = base.runstack[--runstack]; - // startingRunTextPos = base.runstack[--runstack]; - EmitRunstackPop(); - Stloc(iterationCount); - EmitRunstackPop(); - Stloc(startingRunTextPos); - - // goto doneLabel; - BrFar(doneLabel); - - doneLabel = backtrack; - MarkLabel(end); - } - } - - void EmitRunstackResizeIfNeeded(int count) - { - Debug.Assert(count >= 1); - - // if (runstackpos >= base.runstack!.Length - (count - 1)) - // { - // Array.Resize(ref base.runstack, 
base.runstack.Length * 2); - // } - - Label skipResize = DefineLabel(); - - Ldloc(runstackpos); - Ldthisfld(s_runstackField); - Ldlen(); - if (count > 1) - { - Ldc(count - 1); - Sub(); - } - Blt(skipResize); - - Ldthis(); - _ilg!.Emit(OpCodes.Ldflda, s_runstackField); - Ldthisfld(s_runstackField); - Ldlen(); - Ldc(2); - Mul(); - Call(s_arrayResize); - - MarkLabel(skipResize); - } - - void EmitRunstackPush(Action load) - { - // base.runstack[runstackpos] = load(); - Ldthisfld(s_runstackField); - Ldloc(runstackpos); - load(); - StelemI4(); - - // runstackpos++; - Ldloc(runstackpos); - Ldc(1); - Add(); - Stloc(runstackpos); - } - - void EmitRunstackPop() - { - // ... = base.runstack[--runstackpos]; - Ldthisfld(s_runstackField); - Ldloc(runstackpos); - Ldc(1); - Sub(); - Stloc(runstackpos); - Ldloc(runstackpos); - LdelemI4(); - } - } - - /// Generates the code for "RegexRunner.Go". - protected void GenerateGo() - { - Debug.Assert(_code != null); - _int32LocalsPool?.Clear(); - _readOnlySpanCharLocalsPool?.Clear(); - - // Generate simpler code when we're dealing with simpler regexes. - if (TryGenerateSimplifiedGo(_code.Tree.Root)) - { - return; - } - - // We're dealing with a regex more complicated that the fast-path non-backtracking - // implementation can handle. Do the full-fledged thing. - - // declare some locals - - _runtextposLocal = DeclareInt32(); - _runtextLocal = DeclareString(); - _runtrackposLocal = DeclareInt32(); - _runtrackLocal = DeclareInt32Array(); - _runstackposLocal = DeclareInt32(); - _runstackLocal = DeclareInt32Array(); - if (_hasTimeout) - { - _loopTimeoutCounterLocal = DeclareInt32(); - } - _runtextbegLocal = DeclareInt32(); - _runtextendLocal = DeclareInt32(); - - InitializeCultureForGoIfNecessary(); - - // clear some tables - - _labels = null; - _notes = null; - _notecount = 0; - - // globally used labels - - _backtrack = DefineLabel(); - - // emit the code! 
- - GenerateForwardSection(); - GenerateMiddleSection(); - GenerateBacktrackSection(); - } - - private void InitializeCultureForGoIfNecessary() - { - _textInfoLocal = null; - if ((_options & RegexOptions.CultureInvariant) == 0) - { - bool needsCulture = (_options & RegexOptions.IgnoreCase) != 0; - if (!needsCulture) - { - for (int codepos = 0; codepos < _codes!.Length; codepos += RegexCode.OpcodeSize(_codes[codepos])) - { - if ((_codes[codepos] & RegexCode.Ci) == RegexCode.Ci) - { - needsCulture = true; - break; - } - } - } - - if (needsCulture) - { - // cache CultureInfo in local variable which saves excessive thread local storage accesses - _textInfoLocal = DeclareTextInfo(); - InitLocalCultureInfo(); - } - } - } - - /// - /// The main translation function. It translates the logic for a single opcode at - /// the current position. The structure of this function exactly mirrors - /// the structure of the inner loop of RegexInterpreter.Go(). - /// - /// - /// The C# code from RegexInterpreter.Go() that corresponds to each case is - /// included as a comment. - /// - /// Note that since we're generating code, we can collapse many cases that are - /// dealt with one-at-a-time in RegexIntepreter. We can also unroll loops that - /// iterate over constant strings or sets. - /// - private void GenerateOneCode() - { -#if DEBUG - if ((_options & RegexOptions.Debug) != 0) - DumpBacktracking(); -#endif - - // Before executing any RegEx code in the unrolled loop, - // we try checking for the match timeout: - - if (_hasTimeout) - { - Ldthis(); - Call(s_checkTimeoutMethod); - } - - // Now generate the IL for the RegEx code saved in _regexopcode. 
- // We unroll the loop done by the RegexCompiler creating as very long method - // that is longer if the pattern is longer: - - switch (_regexopcode) - { - case RegexCode.Stop: - //: return; - Mvlocfld(_runtextposLocal!, s_runtextposField); // update _textpos - Ret(); - break; - - case RegexCode.Nothing: - //: break Backward; - Back(); - break; - - case RegexCode.UpdateBumpalong: - // UpdateBumpalong should only exist in the code stream at such a point where the root - // of the backtracking stack contains the runtextpos from the start of this Go call. Replace - // that tracking value with the current runtextpos value. - //: base.runtrack[base.runtrack.Length - 1] = runtextpos; - Ldloc(_runtrackLocal!); - Dup(); - Ldlen(); - Ldc(1); - Sub(); - Ldloc(_runtextposLocal!); - StelemI4(); - break; - - case RegexCode.Goto: - //: Goto(Operand(0)); - Goto(Operand(0)); - break; - - case RegexCode.Testref: - //: if (!_match.IsMatched(Operand(0))) - //: break Backward; - Ldthis(); - Ldc(Operand(0)); - Call(s_isMatchedMethod); - BrfalseFar(_backtrack); - break; - - case RegexCode.Lazybranch: - //: Track(Textpos()); - PushTrack(_runtextposLocal!); - Track(); - break; - - case RegexCode.Lazybranch | RegexCode.Back: - //: Trackframe(1); - //: Textto(Tracked(0)); - //: Goto(Operand(0)); - PopTrack(); - Stloc(_runtextposLocal!); - Goto(Operand(0)); - break; - - case RegexCode.Nullmark: - //: Stack(-1); - //: Track(); - ReadyPushStack(); - Ldc(-1); - DoPush(); - TrackUnique(Stackpop); - break; - - case RegexCode.Setmark: - //: Stack(Textpos()); - //: Track(); - PushStack(_runtextposLocal!); - TrackUnique(Stackpop); - break; - - case RegexCode.Nullmark | RegexCode.Back: - case RegexCode.Setmark | RegexCode.Back: - //: Stackframe(1); - //: break Backward; - PopDiscardStack(); - Back(); - break; - - case RegexCode.Getmark: - //: Stackframe(1); - //: Track(Stacked(0)); - //: Textto(Stacked(0)); - ReadyPushTrack(); - PopStack(); - Stloc(_runtextposLocal!); - Ldloc(_runtextposLocal!); - 
DoPush(); - - Track(); - break; - - case RegexCode.Getmark | RegexCode.Back: - //: Trackframe(1); - //: Stack(Tracked(0)); - //: break Backward; - ReadyPushStack(); - PopTrack(); - DoPush(); - Back(); - break; - - case RegexCode.Capturemark: - //: if (!IsMatched(Operand(1))) - //: break Backward; - //: Stackframe(1); - //: if (Operand(1) != -1) - //: TransferCapture(Operand(0), Operand(1), Stacked(0), Textpos()); - //: else - //: Capture(Operand(0), Stacked(0), Textpos()); - //: Track(Stacked(0)); - - //: Stackframe(1); - //: Capture(Operand(0), Stacked(0), Textpos()); - //: Track(Stacked(0)); - - if (Operand(1) != -1) - { - Ldthis(); - Ldc(Operand(1)); - Call(s_isMatchedMethod); - BrfalseFar(_backtrack); - } - - using (RentedLocalBuilder stackedLocal = RentInt32Local()) - { - PopStack(); - Stloc(stackedLocal); - - if (Operand(1) != -1) - { - Ldthis(); - Ldc(Operand(0)); - Ldc(Operand(1)); - Ldloc(stackedLocal); - Ldloc(_runtextposLocal!); - Call(s_transferCaptureMethod); - } - else - { - Ldthis(); - Ldc(Operand(0)); - Ldloc(stackedLocal); - Ldloc(_runtextposLocal!); - Call(s_captureMethod); - } - - PushTrack(stackedLocal); - } - - TrackUnique(Operand(0) != -1 && Operand(1) != -1 ? 
Capback2 : Capback); - break; - - - case RegexCode.Capturemark | RegexCode.Back: - //: Trackframe(1); - //: Stack(Tracked(0)); - //: Uncapture(); - //: if (Operand(0) != -1 && Operand(1) != -1) - //: Uncapture(); - //: break Backward; - ReadyPushStack(); - PopTrack(); - DoPush(); - Ldthis(); - Call(s_uncaptureMethod); - if (Operand(0) != -1 && Operand(1) != -1) - { - Ldthis(); - Call(s_uncaptureMethod); - } - Back(); - break; - - case RegexCode.Branchmark: - //: Stackframe(1); - //: - //: if (Textpos() != Stacked(0)) - //: { // Nonempty match -> loop now - //: Track(Stacked(0), Textpos()); // Save old mark, textpos - //: Stack(Textpos()); // Make new mark - //: Goto(Operand(0)); // Loop - //: } - //: else - //: { // Empty match -> straight now - //: Track2(Stacked(0)); // Save old mark - //: Advance(1); // Straight - //: } - //: continue Forward; - { - Label l1 = DefineLabel(); - - PopStack(); - using (RentedLocalBuilder mark = RentInt32Local()) - { - Stloc(mark); // Stacked(0) -> temp - PushTrack(mark); - Ldloc(mark); - } - Ldloc(_runtextposLocal!); - Beq(l1); // mark == textpos -> branch - - // (matched != 0) - - PushTrack(_runtextposLocal!); - PushStack(_runtextposLocal!); - Track(); - Goto(Operand(0)); // Goto(Operand(0)) - - // else - - MarkLabel(l1); - TrackUnique2(Branchmarkback2); - break; - } - - case RegexCode.Branchmark | RegexCode.Back: - //: Trackframe(2); - //: Stackframe(1); - //: Textto(Tracked(1)); // Recall position - //: Track2(Tracked(0)); // Save old mark - //: Advance(1); - PopTrack(); - Stloc(_runtextposLocal!); - PopStack(); - Pop(); - // track spot 0 is already in place - TrackUnique2(Branchmarkback2); - Advance(); - break; - - case RegexCode.Branchmark | RegexCode.Back2: - //: Trackframe(1); - //: Stack(Tracked(0)); // Recall old mark - //: break Backward; // Backtrack - ReadyPushStack(); - PopTrack(); - DoPush(); - Back(); - break; - - case RegexCode.Lazybranchmark: - //: StackPop(); - //: int oldMarkPos = StackPeek(); - //: - //: if 
(Textpos() != oldMarkPos) { // Nonempty match -> next loop - //: { // Nonempty match -> next loop - //: if (oldMarkPos != -1) - //: Track(Stacked(0), Textpos()); // Save old mark, textpos - //: else - //: TrackPush(Textpos(), Textpos()); - //: } - //: else - //: { // Empty match -> no loop - //: Track2(Stacked(0)); // Save old mark - //: } - //: Advance(1); - //: continue Forward; - { - using (RentedLocalBuilder mark = RentInt32Local()) - { - PopStack(); - Stloc(mark); // Stacked(0) -> temp - - // if (oldMarkPos != -1) - Label l2 = DefineLabel(); - Label l3 = DefineLabel(); - Ldloc(mark); - Ldc(-1); - Beq(l2); // mark == -1 -> branch - PushTrack(mark); - Br(l3); - // else - MarkLabel(l2); - PushTrack(_runtextposLocal!); - MarkLabel(l3); - - // if (Textpos() != mark) - Label l1 = DefineLabel(); - Ldloc(_runtextposLocal!); - Ldloc(mark); - Beq(l1); // mark == textpos -> branch - PushTrack(_runtextposLocal!); - Track(); - Br(AdvanceLabel()); // Advance (near) - // else - MarkLabel(l1); - ReadyPushStack(); // push the current textPos on the stack. - // May be ignored by 'back2' or used by a true empty match. 
- Ldloc(mark); - } - - DoPush(); - TrackUnique2(Lazybranchmarkback2); - - break; - } - - case RegexCode.Lazybranchmark | RegexCode.Back: - //: Trackframe(2); - //: Track2(Tracked(0)); // Save old mark - //: Stack(Textpos()); // Make new mark - //: Textto(Tracked(1)); // Recall position - //: Goto(Operand(0)); // Loop - - PopTrack(); - Stloc(_runtextposLocal!); - PushStack(_runtextposLocal!); - TrackUnique2(Lazybranchmarkback2); - Goto(Operand(0)); - break; - - case RegexCode.Lazybranchmark | RegexCode.Back2: - //: Stackframe(1); - //: Trackframe(1); - //: Stack(Tracked(0)); // Recall old mark - //: break Backward; - ReadyReplaceStack(0); - PopTrack(); - DoReplace(); - Back(); - break; - - case RegexCode.Nullcount: - //: Stack(-1, Operand(0)); - //: Track(); - ReadyPushStack(); - Ldc(-1); - DoPush(); - ReadyPushStack(); - Ldc(Operand(0)); - DoPush(); - TrackUnique(Stackpop2); - break; - - case RegexCode.Setcount: - //: Stack(Textpos(), Operand(0)); - //: Track(); - PushStack(_runtextposLocal!); - ReadyPushStack(); - Ldc(Operand(0)); - DoPush(); - TrackUnique(Stackpop2); - break; - - case RegexCode.Nullcount | RegexCode.Back: - case RegexCode.Setcount | RegexCode.Back: - //: Stackframe(2); - //: break Backward; - PopDiscardStack(2); - Back(); - break; - - case RegexCode.Branchcount: - //: Stackframe(2); - //: int mark = Stacked(0); - //: int count = Stacked(1); - //: - //: if (count >= Operand(1) || Textpos() == mark && count >= 0) - //: { // Max loops or empty match -> straight now - //: Track2(mark, count); // Save old mark, count - //: Advance(2); // Straight - //: } - //: else - //: { // Nonempty match -> count+loop now - //: Track(mark); // remember mark - //: Stack(Textpos(), count + 1); // Make new mark, incr count - //: Goto(Operand(0)); // Loop - //: } - //: continue Forward; - { - using (RentedLocalBuilder count = RentInt32Local()) - { - PopStack(); - Stloc(count); // count -> temp - PopStack(); - using (RentedLocalBuilder mark = RentInt32Local()) - { - 
Stloc(mark); // mark -> temp2 - PushTrack(mark); - Ldloc(mark); - } - - Label l1 = DefineLabel(); - Label l2 = DefineLabel(); - Ldloc(_runtextposLocal!); - Bne(l1); // mark != textpos -> l1 - Ldloc(count); - Ldc(0); - Bge(l2); // count >= 0 && mark == textpos -> l2 - - MarkLabel(l1); - Ldloc(count); - Ldc(Operand(1)); - Bge(l2); // count >= Operand(1) -> l2 - - // else - PushStack(_runtextposLocal!); - ReadyPushStack(); - Ldloc(count); // mark already on track - Ldc(1); - Add(); - DoPush(); - Track(); - Goto(Operand(0)); - - // if (count >= Operand(1) || Textpos() == mark) - MarkLabel(l2); - PushTrack(count); // mark already on track - } - TrackUnique2(Branchcountback2); - break; - } - - case RegexCode.Branchcount | RegexCode.Back: - //: Trackframe(1); - //: Stackframe(2); - //: if (Stacked(1) > 0) // Positive -> can go straight - //: { - //: Textto(Stacked(0)); // Zap to mark - //: Track2(Tracked(0), Stacked(1) - 1); // Save old mark, old count - //: Advance(2); // Straight - //: continue Forward; - //: } - //: Stack(Tracked(0), Stacked(1) - 1); // recall old mark, old count - //: break Backward; - { - using (RentedLocalBuilder count = RentInt32Local()) - { - Label l1 = DefineLabel(); - PopStack(); - Ldc(1); - Sub(); - Stloc(count); - Ldloc(count); - Ldc(0); - Blt(l1); - - // if (count >= 0) - PopStack(); - Stloc(_runtextposLocal!); - PushTrack(count); // Tracked(0) is already on the track - TrackUnique2(Branchcountback2); - Advance(); - - // else - MarkLabel(l1); - ReadyReplaceStack(0); - PopTrack(); - DoReplace(); - PushStack(count); - } - Back(); - break; - } - - case RegexCode.Branchcount | RegexCode.Back2: - //: Trackframe(2); - //: Stack(Tracked(0), Tracked(1)); // Recall old mark, old count - //: break Backward; // Backtrack - - PopTrack(); - using (RentedLocalBuilder tmp = RentInt32Local()) - { - Stloc(tmp); - ReadyPushStack(); - PopTrack(); - DoPush(); - PushStack(tmp); - } - Back(); - break; - - case RegexCode.Lazybranchcount: - //: Stackframe(2); - //: 
int mark = Stacked(0); - //: int count = Stacked(1); - //: - //: if (count < 0) - //: { // Negative count -> loop now - //: Track2(mark); // Save old mark - //: Stack(Textpos(), count + 1); // Make new mark, incr count - //: Goto(Operand(0)); // Loop - //: } - //: else - //: { // Nonneg count or empty match -> straight now - //: Track(mark, count, Textpos()); // Save mark, count, position - //: } - { - PopStack(); - using (RentedLocalBuilder count = RentInt32Local()) - { - Stloc(count); // count -> temp - PopStack(); - using (RentedLocalBuilder mark = RentInt32Local()) - { - Stloc(mark); // mark -> temp2 - - Label l1 = DefineLabel(); - Ldloc(count); - Ldc(0); - Bge(l1); // count >= 0 -> l1 - - // if (count < 0) - PushTrack(mark); - PushStack(_runtextposLocal!); - ReadyPushStack(); - Ldloc(count); - Ldc(1); - Add(); - DoPush(); - TrackUnique2(Lazybranchcountback2); - Goto(Operand(0)); - - // else - MarkLabel(l1); - PushTrack(mark); - } - PushTrack(count); - } - PushTrack(_runtextposLocal!); - Track(); - break; - } - - case RegexCode.Lazybranchcount | RegexCode.Back: - //: Trackframe(3); - //: int mark = Tracked(0); - //: int textpos = Tracked(2); - //: if (Tracked(1) < Operand(1) && textpos != mark) - //: { // Under limit and not empty match -> loop - //: Textto(Tracked(2)); // Recall position - //: Stack(Textpos(), Tracked(1) + 1); // Make new mark, incr count - //: Track2(Tracked(0)); // Save old mark - //: Goto(Operand(0)); // Loop - //: continue Forward; - //: } - //: else - //: { - //: Stack(Tracked(0), Tracked(1)); // Recall old mark, count - //: break Backward; // backtrack - //: } - { - using (RentedLocalBuilder cLocal = RentInt32Local()) - { - Label l1 = DefineLabel(); - - PopTrack(); - Stloc(_runtextposLocal!); - PopTrack(); - Stloc(cLocal); - Ldloc(cLocal); - Ldc(Operand(1)); - Bge(l1); // Tracked(1) >= Operand(1) -> l1 - - Ldloc(_runtextposLocal!); - TopTrack(); - Beq(l1); // textpos == mark -> l1 - - PushStack(_runtextposLocal!); - ReadyPushStack(); - 
Ldloc(cLocal); - Ldc(1); - Add(); - DoPush(); - TrackUnique2(Lazybranchcountback2); - Goto(Operand(0)); - - MarkLabel(l1); - ReadyPushStack(); - PopTrack(); - DoPush(); - PushStack(cLocal); - } - Back(); - break; - } - - case RegexCode.Lazybranchcount | RegexCode.Back2: - // < - ReadyReplaceStack(1); - PopTrack(); - DoReplace(); - ReadyReplaceStack(0); - TopStack(); - Ldc(1); - Sub(); - DoReplace(); - Back(); - break; - - case RegexCode.Setjump: - //: Stack(Trackpos(), Crawlpos()); - //: Track(); - ReadyPushStack(); - Ldthisfld(s_runtrackField); - Ldlen(); - Ldloc(_runtrackposLocal!); - Sub(); - DoPush(); - ReadyPushStack(); - Ldthis(); - Call(s_crawlposMethod); - DoPush(); - TrackUnique(Stackpop2); - break; - - case RegexCode.Setjump | RegexCode.Back: - //: Stackframe(2); - PopDiscardStack(2); - Back(); - break; - - case RegexCode.Backjump: - //: Stackframe(2); - //: Trackto(Stacked(0)); - //: while (Crawlpos() != Stacked(1)) - //: Uncapture(); - //: break Backward; - { - Label l1 = DefineLabel(); - Label l2 = DefineLabel(); - - using (RentedLocalBuilder stackedLocal = RentInt32Local()) - { - PopStack(); - Stloc(stackedLocal); - Ldthisfld(s_runtrackField); - Ldlen(); - PopStack(); - Sub(); - Stloc(_runtrackposLocal!); - - MarkLabel(l1); - Ldthis(); - Call(s_crawlposMethod); - Ldloc(stackedLocal); - Beq(l2); - Ldthis(); - Call(s_uncaptureMethod); - Br(l1); - } - - MarkLabel(l2); - Back(); - break; - } - - case RegexCode.Forejump: - //: Stackframe(2); - //: Trackto(Stacked(0)); - //: Track(Stacked(1)); - PopStack(); - using (RentedLocalBuilder tmp = RentInt32Local()) - { - Stloc(tmp); - Ldthisfld(s_runtrackField); - Ldlen(); - PopStack(); - Sub(); - Stloc(_runtrackposLocal!); - PushTrack(tmp); - } - TrackUnique(Forejumpback); - break; - - case RegexCode.Forejump | RegexCode.Back: - //: Trackframe(1); - //: while (Crawlpos() != Tracked(0)) - //: Uncapture(); - //: break Backward; - { - Label l1 = DefineLabel(); - Label l2 = DefineLabel(); - - using 
(RentedLocalBuilder trackedLocal = RentInt32Local()) - { - PopTrack(); - Stloc(trackedLocal); - - MarkLabel(l1); - Ldthis(); - Call(s_crawlposMethod); - Ldloc(trackedLocal); - Beq(l2); - Ldthis(); - Call(s_uncaptureMethod); - Br(l1); - } - - MarkLabel(l2); - Back(); - break; - } - - case RegexCode.Bol: - //: if (Leftchars() > 0 && CharAt(Textpos() - 1) != '\n') - //: break Backward; - { - Label l1 = _labels![NextCodepos()]; - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - Ble(l1); - Leftchar(); - Ldc('\n'); - BneFar(_backtrack); - break; - } - - case RegexCode.Eol: - //: if (Rightchars() > 0 && CharAt(Textpos()) != '\n') - //: break Backward; - { - Label l1 = _labels![NextCodepos()]; - Ldloc(_runtextposLocal!); - Ldloc(_runtextendLocal!); - Bge(l1); - Rightchar(); - Ldc('\n'); - BneFar(_backtrack); - break; - } - - case RegexCode.Boundary: - case RegexCode.NonBoundary: - //: if (!IsBoundary(Textpos(), _textbeg, _textend)) - //: break Backward; - Ldthis(); - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - Ldloc(_runtextendLocal!); - Call(s_isBoundaryMethod); - if (Code() == RegexCode.Boundary) - { - BrfalseFar(_backtrack); - } - else - { - BrtrueFar(_backtrack); - } - break; - - case RegexCode.ECMABoundary: - case RegexCode.NonECMABoundary: - //: if (!IsECMABoundary(Textpos(), _textbeg, _textend)) - //: break Backward; - Ldthis(); - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - Ldloc(_runtextendLocal!); - Call(s_isECMABoundaryMethod); - if (Code() == RegexCode.ECMABoundary) - { - BrfalseFar(_backtrack); - } - else - { - BrtrueFar(_backtrack); - } - break; - - case RegexCode.Beginning: - //: if (Leftchars() > 0) - //: break Backward; - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - BgtFar(_backtrack); - break; - - case RegexCode.Start: - //: if (Textpos() != Textstart()) - //: break Backward; - Ldloc(_runtextposLocal!); - Ldthisfld(s_runtextstartField); - BneFar(_backtrack); - break; - - case RegexCode.EndZ: - //: if (Rightchars() 
> 1 || Rightchars() == 1 && CharAt(Textpos()) != '\n') - //: break Backward; - Ldloc(_runtextposLocal!); - Ldloc(_runtextendLocal!); - Ldc(1); - Sub(); - BltFar(_backtrack); - Ldloc(_runtextposLocal!); - Ldloc(_runtextendLocal!); - Bge(_labels![NextCodepos()]); - Rightchar(); - Ldc('\n'); - BneFar(_backtrack); - break; - - case RegexCode.End: - //: if (Rightchars() > 0) - //: break Backward; - Ldloc(_runtextposLocal!); - Ldloc(_runtextendLocal!); - BltFar(_backtrack); - break; - - case RegexCode.One: - case RegexCode.Notone: - case RegexCode.Set: - case RegexCode.One | RegexCode.Rtl: - case RegexCode.Notone | RegexCode.Rtl: - case RegexCode.Set | RegexCode.Rtl: - case RegexCode.One | RegexCode.Ci: - case RegexCode.Notone | RegexCode.Ci: - case RegexCode.Set | RegexCode.Ci: - case RegexCode.One | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notone | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Set | RegexCode.Ci | RegexCode.Rtl: - - //: if (Rightchars() < 1 || Rightcharnext() != (char)Operand(0)) - //: break Backward; - - Ldloc(_runtextposLocal!); - - if (!IsRightToLeft()) - { - Ldloc(_runtextendLocal!); - BgeFar(_backtrack); - Rightcharnext(); - } - else - { - Ldloc(_runtextbegLocal!); - BleFar(_backtrack); - Leftcharnext(); - } - - if (Code() == RegexCode.Set) - { - EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive()); - BrfalseFar(_backtrack); - } - else - { - if (IsCaseInsensitive()) - { - CallToLower(); - } - - Ldc(Operand(0)); - if (Code() == RegexCode.One) - { - BneFar(_backtrack); - } - else - { - BeqFar(_backtrack); - } - } - break; - - case RegexCode.Multi: - case RegexCode.Multi | RegexCode.Ci: - //: String Str = _strings[Operand(0)]; - //: int i, c; - //: if (Rightchars() < (c = Str.Length)) - //: break Backward; - //: for (i = 0; c > 0; i++, c--) - //: if (Str[i] != Rightcharnext()) - //: break Backward; - { - string str = _strings![Operand(0)]; - - Ldc(str.Length); - Ldloc(_runtextendLocal!); - Ldloc(_runtextposLocal!); - Sub(); - 
BgtFar(_backtrack); - - // unroll the string - for (int i = 0; i < str.Length; i++) - { - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - if (i != 0) - { - Ldc(i); - Add(); - } - Call(s_stringGetCharsMethod); - if (IsCaseInsensitive()) - { - CallToLower(); - } - - Ldc(str[i]); - BneFar(_backtrack); - } - - Ldloc(_runtextposLocal!); - Ldc(str.Length); - Add(); - Stloc(_runtextposLocal!); - break; - } - - case RegexCode.Multi | RegexCode.Rtl: - case RegexCode.Multi | RegexCode.Ci | RegexCode.Rtl: - //: String Str = _strings[Operand(0)]; - //: int c; - //: if (Leftchars() < (c = Str.Length)) - //: break Backward; - //: while (c > 0) - //: if (Str[--c] != Leftcharnext()) - //: break Backward; - { - string str = _strings![Operand(0)]; - - Ldc(str.Length); - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - Sub(); - BgtFar(_backtrack); - - // unroll the string - for (int i = str.Length; i > 0;) - { - i--; - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Ldc(str.Length - i); - Sub(); - Call(s_stringGetCharsMethod); - if (IsCaseInsensitive()) - { - CallToLower(); - } - Ldc(str[i]); - BneFar(_backtrack); - } - - Ldloc(_runtextposLocal!); - Ldc(str.Length); - Sub(); - Stloc(_runtextposLocal!); - - break; - } - - case RegexCode.Ref: - case RegexCode.Ref | RegexCode.Rtl: - case RegexCode.Ref | RegexCode.Ci: - case RegexCode.Ref | RegexCode.Ci | RegexCode.Rtl: - //: int capnum = Operand(0); - //: int j, c; - //: if (!_match.IsMatched(capnum)) { - //: if (!RegexOptions.ECMAScript) - //: break Backward; - //: } else { - //: if (Rightchars() < (c = _match.MatchLength(capnum))) - //: break Backward; - //: for (j = _match.MatchIndex(capnum); c > 0; j++, c--) - //: if (CharAt(j) != Rightcharnext()) - //: break Backward; - //: } - { - using RentedLocalBuilder lenLocal = RentInt32Local(); - using RentedLocalBuilder indexLocal = RentInt32Local(); - Label l1 = DefineLabel(); - - Ldthis(); - Ldc(Operand(0)); - Call(s_isMatchedMethod); - if ((_options & 
RegexOptions.ECMAScript) != 0) - { - Brfalse(AdvanceLabel()); - } - else - { - BrfalseFar(_backtrack); // !IsMatched() -> back - } - - Ldthis(); - Ldc(Operand(0)); - Call(s_matchLengthMethod); - Stloc(lenLocal); - Ldloc(lenLocal); - if (!IsRightToLeft()) - { - Ldloc(_runtextendLocal!); - Ldloc(_runtextposLocal!); - } - else - { - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - } - Sub(); - BgtFar(_backtrack); // Matchlength() > Rightchars() -> back - - Ldthis(); - Ldc(Operand(0)); - Call(s_matchIndexMethod); - if (!IsRightToLeft()) - { - Ldloc(lenLocal); - Add(IsRightToLeft()); - } - Stloc(indexLocal); // index += len - - Ldloc(_runtextposLocal!); - Ldloc(lenLocal); - Add(IsRightToLeft()); - Stloc(_runtextposLocal!); // texpos += len - - MarkLabel(l1); - Ldloc(lenLocal); - Ldc(0); - Ble(AdvanceLabel()); - Ldloc(_runtextLocal!); - Ldloc(indexLocal); - Ldloc(lenLocal); - if (IsRightToLeft()) - { - Ldc(1); - Sub(); - Stloc(lenLocal); - Ldloc(lenLocal); - } - Sub(IsRightToLeft()); - Call(s_stringGetCharsMethod); - if (IsCaseInsensitive()) - { - CallToLower(); - } - - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Ldloc(lenLocal); - if (!IsRightToLeft()) - { - Ldloc(lenLocal); - Ldc(1); - Sub(); - Stloc(lenLocal); - } - Sub(IsRightToLeft()); - Call(s_stringGetCharsMethod); - if (IsCaseInsensitive()) - { - CallToLower(); - } - - Beq(l1); - Back(); - break; - } - - case RegexCode.Onerep: - case RegexCode.Notonerep: - case RegexCode.Setrep: - case RegexCode.Onerep | RegexCode.Rtl: - case RegexCode.Notonerep | RegexCode.Rtl: - case RegexCode.Setrep | RegexCode.Rtl: - case RegexCode.Onerep | RegexCode.Ci: - case RegexCode.Notonerep | RegexCode.Ci: - case RegexCode.Setrep | RegexCode.Ci: - case RegexCode.Onerep | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notonerep | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Setrep | RegexCode.Ci | RegexCode.Rtl: - //: int c = Operand(1); - //: if (Rightchars() < c) - //: break Backward; - //: char ch = 
(char)Operand(0); - //: while (c-- > 0) - //: if (Rightcharnext() != ch) - //: break Backward; - { - int c = Operand(1); - if (c == 0) - break; - - Ldc(c); - if (!IsRightToLeft()) - { - Ldloc(_runtextendLocal!); - Ldloc(_runtextposLocal!); - } - else - { - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - } - Sub(); - BgtFar(_backtrack); // Matchlength() > Rightchars() -> back - - Ldloc(_runtextposLocal!); - Ldc(c); - Add(IsRightToLeft()); - Stloc(_runtextposLocal!); // texpos += len - - using RentedLocalBuilder lenLocal = RentInt32Local(); - Label l1 = DefineLabel(); - Ldc(c); - Stloc(lenLocal); - - MarkLabel(l1); - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Ldloc(lenLocal); - if (IsRightToLeft()) - { - Ldc(1); - Sub(); - Stloc(lenLocal); - Ldloc(lenLocal); - Add(); - } - else - { - Ldloc(lenLocal); - Ldc(1); - Sub(); - Stloc(lenLocal); - Sub(); - } - Call(s_stringGetCharsMethod); - - if (Code() == RegexCode.Setrep) - { - EmitTimeoutCheck(); - EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive()); - BrfalseFar(_backtrack); - } - else - { - if (IsCaseInsensitive()) - { - CallToLower(); - } - - Ldc(Operand(0)); - if (Code() == RegexCode.Onerep) - { - BneFar(_backtrack); - } - else - { - BeqFar(_backtrack); - } - } - Ldloc(lenLocal); - Ldc(0); - if (Code() == RegexCode.Setrep) - { - BgtFar(l1); - } - else - { - Bgt(l1); - } - break; - } - - case RegexCode.Oneloop: - case RegexCode.Notoneloop: - case RegexCode.Setloop: - case RegexCode.Oneloop | RegexCode.Rtl: - case RegexCode.Notoneloop | RegexCode.Rtl: - case RegexCode.Setloop | RegexCode.Rtl: - case RegexCode.Oneloop | RegexCode.Ci: - case RegexCode.Notoneloop | RegexCode.Ci: - case RegexCode.Setloop | RegexCode.Ci: - case RegexCode.Oneloop | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notoneloop | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Setloop | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Oneloopatomic: - case RegexCode.Notoneloopatomic: - case RegexCode.Setloopatomic: - 
case RegexCode.Oneloopatomic | RegexCode.Rtl: - case RegexCode.Notoneloopatomic | RegexCode.Rtl: - case RegexCode.Setloopatomic | RegexCode.Rtl: - case RegexCode.Oneloopatomic | RegexCode.Ci: - case RegexCode.Notoneloopatomic | RegexCode.Ci: - case RegexCode.Setloopatomic | RegexCode.Ci: - case RegexCode.Oneloopatomic | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notoneloopatomic | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Setloopatomic | RegexCode.Ci | RegexCode.Rtl: - //: int len = Operand(1); - //: if (len > Rightchars()) - //: len = Rightchars(); - //: char ch = (char)Operand(0); - //: int i; - //: for (i = len; i > 0; i--) - //: { - //: if (Rightcharnext() != ch) - //: { - //: Leftnext(); - //: break; - //: } - //: } - //: if (len > i) - //: Track(len - i - 1, Textpos() - 1); - { - int c = Operand(1); - if (c == 0) - { - break; - } - - using RentedLocalBuilder lenLocal = RentInt32Local(); - using RentedLocalBuilder iLocal = RentInt32Local(); + EmitRunstackPop(); + Stloc(poppedCrawlPos); + EmitUncaptureUntil(poppedCrawlPos); + } + LoadTextSpanLocal(); - if (!IsRightToLeft()) - { - Ldloc(_runtextendLocal!); - Ldloc(_runtextposLocal!); - } - else - { - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - } - Sub(); - Stloc(lenLocal); - if (c != int.MaxValue) - { - Label l4 = DefineLabel(); - Ldloc(lenLocal); - Ldc(c); - Blt(l4); - Ldc(c); - Stloc(lenLocal); - MarkLabel(l4); - } + if (minIterations > 0) + { + // if (iterationCount == 0) goto originalDoneLabel; + Ldloc(iterationCount); + Ldc(0); + BeqFar(originalDoneLabel); - Label loopEnd = DefineLabel(); - string? set = Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic ? _strings![Operand(0)] : null; - Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today - int numSetChars; + // if (iterationCount < minIterations) goto doneLabel/originalDoneLabel; + Ldloc(iterationCount); + Ldc(minIterations); + BltFar(childBacktracks ? 
doneLabel : originalDoneLabel); + } - // If this is a notoneloop{atomic} and we're left-to-right and case-sensitive, - // we can use the vectorized IndexOf to search for the target character. - if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) && - !IsRightToLeft() && - (!IsCaseInsensitive())) - { - // i = runtext.AsSpan(runtextpos, len).IndexOf(ch); - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Ldloc(lenLocal); - Call(s_stringAsSpanIntIntMethod); - Ldc(Operand(0)); - Call(s_spanIndexOfChar); - Stloc(iLocal); + if (isAtomic) + { + doneLabel = originalDoneLabel; + MarkLabel(endLoop); + } + else + { + if (childBacktracks) + { + // goto endLoop; + BrFar(endLoop); - Label charFound = DefineLabel(); + // Backtrack: + Label backtrack = DefineLabel(); + MarkLabel(backtrack); - // if (i != -1) goto charFound; - Ldloc(iLocal); - Ldc(-1); - Bne(charFound); - - // runtextpos += len; - // i = 0; - // goto loopEnd; - Ldloc(_runtextposLocal!); - Ldloc(lenLocal); - Add(); - Stloc(_runtextposLocal!); - Ldc(0); - Stloc(iLocal); - BrFar(loopEnd); - - // charFound: - // runtextpos += i; - // i = len - i; - // goto loopEnd; - MarkLabel(charFound); - Ldloc(_runtextposLocal!); - Ldloc(iLocal); - Add(); - Stloc(_runtextposLocal!); - Ldloc(lenLocal); - Ldloc(iLocal); - Sub(); - Stloc(iLocal); - BrFar(loopEnd); - } - else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) && - !IsRightToLeft() && - !IsCaseInsensitive() && - (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) != 0 && - RegexCharClass.IsNegated(set!)) - { - // Similarly, if this is a setloop{atomic} and we're left-to-right and case-sensitive, - // and if the set contains only a few negated chars, we can use the vectorized IndexOfAny - // to search for those chars. 
- Debug.Assert(numSetChars > 1); - - // i = runtext.AsSpan(runtextpos, len).IndexOfAny(ch1, ch2{, ch3}); - Ldloc(_runtextLocal!); - Ldloc(_runtextposLocal!); - Ldloc(lenLocal); - Call(s_stringAsSpanIntIntMethod); - switch (numSetChars) - { - case 2: - Ldc(setChars[0]); - Ldc(setChars[1]); - Call(s_spanIndexOfAnyCharChar); - break; - - case 3: - Ldc(setChars[0]); - Ldc(setChars[1]); - Ldc(setChars[2]); - Call(s_spanIndexOfAnyCharCharChar); - break; - - default: - Ldstr(setChars.Slice(0, numSetChars).ToString()); - Call(s_stringAsSpanMethod); - Call(s_spanIndexOfSpan); - break; - } - Stloc(iLocal); + // if (iterationCount == 0) goto originalDoneLabel; + Ldloc(iterationCount); + Ldc(0); + BeqFar(originalDoneLabel); - Label charFound = DefineLabel(); + // goto doneLabel; + BrFar(doneLabel); - // if (i != -1) goto charFound; - Ldloc(iLocal); - Ldc(-1); - Bne(charFound); - - // runtextpos += len; - // i = 0; - // goto loopEnd; - Ldloc(_runtextposLocal!); - Ldloc(lenLocal); - Add(); - Stloc(_runtextposLocal!); - Ldc(0); - Stloc(iLocal); - BrFar(loopEnd); - - // charFound: - // runtextpos += i; - // i = len - i; - // goto loopEnd; - MarkLabel(charFound); - Ldloc(_runtextposLocal!); - Ldloc(iLocal); - Add(); - Stloc(_runtextposLocal!); - Ldloc(lenLocal); - Ldloc(iLocal); - Sub(); - Stloc(iLocal); - BrFar(loopEnd); - } - else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) && - !IsRightToLeft() && - set == RegexCharClass.AnyClass) - { - // If someone uses .* along with RegexOptions.Singleline, that becomes [anycharacter]*, which means it'll - // consume everything. As such, we can simply update our position to be the last allowed, without - // actually checking anything. - - // runtextpos += len; - // i = 0; - // goto loopEnd; - Ldloc(_runtextposLocal!); - Ldloc(lenLocal); - Add(); - Stloc(_runtextposLocal!); - Ldc(0); - Stloc(iLocal); - BrFar(loopEnd); - } - else - { - // Otherwise, we emit the open-coded loop. 
+ doneLabel = backtrack; + } - Ldloc(lenLocal); - Ldc(1); - Add(); - Stloc(iLocal); + MarkLabel(endLoop); - Label loopCondition = DefineLabel(); - MarkLabel(loopCondition); - Ldloc(iLocal); - Ldc(1); - Sub(); - Stloc(iLocal); - Ldloc(iLocal); - Ldc(0); - if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) - { - BleFar(loopEnd); - } - else - { - Ble(loopEnd); - } + if (node.IsInLoop()) + { + // Store the capture's state + EmitRunstackResizeIfNeeded(3); + EmitRunstackPush(() => Ldloc(startingRunTextPos)); + EmitRunstackPush(() => Ldloc(iterationCount)); - if (IsRightToLeft()) - { - Leftcharnext(); - } - else - { - Rightcharnext(); - } + // Skip past the backtracking section + // goto end; + Label end = DefineLabel(); + BrFar(end); - if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) - { - EmitTimeoutCheck(); - EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive()); - BrtrueFar(loopCondition); - } - else - { - if (IsCaseInsensitive()) - { - CallToLower(); - } + // Emit a backtracking section that restores the capture's state and then jumps to the previous done label + Label backtrack = DefineLabel(); + MarkLabel(backtrack); - Ldc(Operand(0)); - if (Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic) - { - Beq(loopCondition); - } - else - { - Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic); - Bne(loopCondition); - } - } + // iterationCount = base.runstack[--runstack]; + // startingRunTextPos = base.runstack[--runstack]; + EmitRunstackPop(); + Stloc(iterationCount); + EmitRunstackPop(); + Stloc(startingRunTextPos); - Ldloc(_runtextposLocal!); - Ldc(1); - Sub(IsRightToLeft()); - Stloc(_runtextposLocal!); - } + // goto doneLabel; + BrFar(doneLabel); - // loopEnd: - MarkLabel(loopEnd); - if (Code() != RegexCode.Oneloopatomic && Code() != RegexCode.Notoneloopatomic && Code() != RegexCode.Setloopatomic) - { - // if (len <= i) goto advance; - Ldloc(lenLocal); - Ldloc(iLocal); 
- Ble(AdvanceLabel()); + doneLabel = backtrack; + MarkLabel(end); + } + } + } - // TrackPush(len - i - 1, runtextpos - Bump()) - ReadyPushTrack(); - Ldloc(lenLocal); - Ldloc(iLocal); - Sub(); - Ldc(1); - Sub(); - DoPush(); + void EmitRunstackResizeIfNeeded(int count) + { + Debug.Assert(count >= 1); - ReadyPushTrack(); - Ldloc(_runtextposLocal!); - Ldc(1); - Sub(IsRightToLeft()); - DoPush(); + // if (runstackpos >= base.runstack!.Length - (count - 1)) + // { + // Array.Resize(ref base.runstack, base.runstack.Length * 2); + // } - Track(); - } - break; - } + Label skipResize = DefineLabel(); - case RegexCode.Oneloop | RegexCode.Back: - case RegexCode.Notoneloop | RegexCode.Back: - case RegexCode.Setloop | RegexCode.Back: - case RegexCode.Oneloop | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Notoneloop | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Setloop | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Oneloop | RegexCode.Ci | RegexCode.Back: - case RegexCode.Notoneloop | RegexCode.Ci | RegexCode.Back: - case RegexCode.Setloop | RegexCode.Ci | RegexCode.Back: - case RegexCode.Oneloop | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Notoneloop | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Setloop | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - //: Trackframe(2); - //: int i = Tracked(0); - //: int pos = Tracked(1); - //: Textto(pos); - //: if (i > 0) - //: Track(i - 1, pos - 1); - //: Advance(2); - PopTrack(); - Stloc(_runtextposLocal!); - PopTrack(); - using (RentedLocalBuilder posLocal = RentInt32Local()) - { - Stloc(posLocal); - Ldloc(posLocal); - Ldc(0); - BleFar(AdvanceLabel()); - ReadyPushTrack(); - Ldloc(posLocal); - } - Ldc(1); + Ldloc(runstackpos); + Ldthisfld(s_runstackField); + Ldlen(); + if (count > 1) + { + Ldc(count - 1); Sub(); - DoPush(); - ReadyPushTrack(); - Ldloc(_runtextposLocal!); - Ldc(1); - Sub(IsRightToLeft()); - DoPush(); - Trackagain(); - Advance(); - break; + } + Blt(skipResize); - case 
RegexCode.Onelazy: - case RegexCode.Notonelazy: - case RegexCode.Setlazy: - case RegexCode.Onelazy | RegexCode.Rtl: - case RegexCode.Notonelazy | RegexCode.Rtl: - case RegexCode.Setlazy | RegexCode.Rtl: - case RegexCode.Onelazy | RegexCode.Ci: - case RegexCode.Notonelazy | RegexCode.Ci: - case RegexCode.Setlazy | RegexCode.Ci: - case RegexCode.Onelazy | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Notonelazy | RegexCode.Ci | RegexCode.Rtl: - case RegexCode.Setlazy | RegexCode.Ci | RegexCode.Rtl: - //: int c = Operand(1); - //: if (c > Rightchars()) - //: c = Rightchars(); - //: if (c > 0) - //: Track(c - 1, Textpos()); - { - int c = Operand(1); - if (c == 0) - { - break; - } + Ldthis(); + _ilg!.Emit(OpCodes.Ldflda, s_runstackField); + Ldthisfld(s_runstackField); + Ldlen(); + Ldc(2); + Mul(); + Call(s_arrayResize); - if (!IsRightToLeft()) - { - Ldloc(_runtextendLocal!); - Ldloc(_runtextposLocal!); - } - else - { - Ldloc(_runtextposLocal!); - Ldloc(_runtextbegLocal!); - } - Sub(); - using (RentedLocalBuilder cLocal = RentInt32Local()) - { - Stloc(cLocal); - if (c != int.MaxValue) - { - Label l4 = DefineLabel(); - Ldloc(cLocal); - Ldc(c); - Blt(l4); - Ldc(c); - Stloc(cLocal); - MarkLabel(l4); - } - Ldloc(cLocal); - Ldc(0); - Ble(AdvanceLabel()); - ReadyPushTrack(); - Ldloc(cLocal); - } - Ldc(1); - Sub(); - DoPush(); - PushTrack(_runtextposLocal!); - Track(); - break; - } + MarkLabel(skipResize); + } - case RegexCode.Onelazy | RegexCode.Back: - case RegexCode.Notonelazy | RegexCode.Back: - case RegexCode.Setlazy | RegexCode.Back: - case RegexCode.Onelazy | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Notonelazy | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Setlazy | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Onelazy | RegexCode.Ci | RegexCode.Back: - case RegexCode.Notonelazy | RegexCode.Ci | RegexCode.Back: - case RegexCode.Setlazy | RegexCode.Ci | RegexCode.Back: - case RegexCode.Onelazy | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - case 
RegexCode.Notonelazy | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - case RegexCode.Setlazy | RegexCode.Ci | RegexCode.Rtl | RegexCode.Back: - //: Trackframe(2); - //: int pos = Tracked(1); - //: Textto(pos); - //: if (Rightcharnext() != (char)Operand(0)) - //: break Backward; - //: int i = Tracked(0); - //: if (i > 0) - //: Track(i - 1, pos + 1); - - PopTrack(); - Stloc(_runtextposLocal!); - PopTrack(); - using (RentedLocalBuilder iLocal = RentInt32Local()) - { - Stloc(iLocal); + void EmitRunstackPush(Action load) + { + // base.runstack[runstackpos] = load(); + Ldthisfld(s_runstackField); + Ldloc(runstackpos); + load(); + StelemI4(); - if (!IsRightToLeft()) - { - Rightcharnext(); - } - else - { - Leftcharnext(); - } + // runstackpos++; + Ldloc(runstackpos); + Ldc(1); + Add(); + Stloc(runstackpos); + } - if (Code() == RegexCode.Setlazy) - { - EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive()); - BrfalseFar(_backtrack); - } - else - { - if (IsCaseInsensitive()) - { - CallToLower(); - } + void EmitRunstackPop() + { + // ... 
= base.runstack[--runstackpos]; + Ldthisfld(s_runstackField); + Ldloc(runstackpos); + Ldc(1); + Sub(); + Stloc(runstackpos); + Ldloc(runstackpos); + LdelemI4(); + } + } - Ldc(Operand(0)); - if (Code() == RegexCode.Onelazy) - { - BneFar(_backtrack); - } - else - { - BeqFar(_backtrack); - } + private void InitializeCultureForGoIfNecessary() + { + _textInfo = null; + if ((_options & RegexOptions.CultureInvariant) == 0) + { + bool needsCulture = (_options & RegexOptions.IgnoreCase) != 0; + if (!needsCulture) + { + int[] codes = _code!.Codes; + for (int codepos = 0; codepos < codes.Length; codepos += RegexCode.OpcodeSize(codes[codepos])) + { + if ((codes[codepos] & RegexCode.Ci) == RegexCode.Ci) + { + needsCulture = true; + break; } - - Ldloc(iLocal); - Ldc(0); - BleFar(AdvanceLabel()); - ReadyPushTrack(); - Ldloc(iLocal); } - Ldc(1); - Sub(); - DoPush(); - PushTrack(_runtextposLocal!); - Trackagain(); - Advance(); - break; + } - default: - Debug.Fail($"Unimplemented state: {_regexopcode:X8}"); - break; + if (needsCulture) + { + // cache CultureInfo in local variable which saves excessive thread local storage accesses + _textInfo = DeclareTextInfo(); + InitLocalCultureInfo(); + } } } @@ -6227,17 +3811,17 @@ private void EmitTimeoutCheck() return; } - Debug.Assert(_loopTimeoutCounterLocal != null); + Debug.Assert(_loopTimeoutCounter != null); // Increment counter for each loop iteration. - Ldloc(_loopTimeoutCounterLocal); + Ldloc(_loopTimeoutCounter); Ldc(1); Add(); - Stloc(_loopTimeoutCounterLocal); + Stloc(_loopTimeoutCounter); // Emit code to check the timeout every 2048th iteration. Label label = DefineLabel(); - Ldloc(_loopTimeoutCounterLocal); + Ldloc(_loopTimeoutCounter); Ldc(LoopTimeoutCheckCount); RemUn(); Brtrue(label); @@ -6245,42 +3829,5 @@ private void EmitTimeoutCheck() Call(s_checkTimeoutMethod); MarkLabel(label); } - -#if DEBUG - /// Emit code to print out the current state of the runner. 
- [ExcludeFromCodeCoverage(Justification = "Debug only")] - private void DumpBacktracking() - { - Mvlocfld(_runtextposLocal!, s_runtextposField); - Mvlocfld(_runtrackposLocal!, s_runtrackposField); - Mvlocfld(_runstackposLocal!, s_runstackposField); - Ldthis(); - Call(s_dumpStateM); - - var sb = new StringBuilder(); - if (_backpos > 0) - { - sb.Append($"{_backpos:D6} "); - } - else - { - sb.Append(" "); - } - sb.Append(_code!.OpcodeDescription(_codepos)); - - if ((_regexopcode & RegexCode.Back) != 0) - { - sb.Append(" Back"); - } - - if ((_regexopcode & RegexCode.Back2) != 0) - { - sb.Append(" Back2"); - } - - Ldstr(sb.ToString()); - Call(s_debugWriteLine!); - } -#endif } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 53b78c5d324796..34b7f1b1130592 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -29,17 +29,15 @@ internal sealed class RegexLWCGCompiler : RegexCompiler /// Id number to use for the next compiled regex. private static int s_regexCount; - public RegexLWCGCompiler() - { - } - /// The top-level driver. Initializes everything then calls the Generate* methods. - public RegexRunnerFactory FactoryInstanceFromCode(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) + public RegexRunnerFactory? 
FactoryInstanceFromCode(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) { + if (!code.Tree.Root.SupportsCompilation()) + { + return null; + } + _code = code; - _codes = code.Codes; - _strings = code.Strings; - _trackcount = code.TrackCount; _options = options; _hasTimeout = hasTimeout; @@ -54,13 +52,13 @@ public RegexRunnerFactory FactoryInstanceFromCode(string pattern, RegexCode code description = string.Concat("_", pattern.Length > DescriptionLimit ? pattern.AsSpan(0, DescriptionLimit) : pattern); } - DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner)); - GenerateGo(); - DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner)); - GenerateFindFirstChar(); + EmitFindFirstChar(); + + DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner)); + EmitGo(); - return new CompiledRegexRunnerFactory(goMethod, findFirstCharMethod, _trackcount); + return new CompiledRegexRunnerFactory(goMethod, findFirstCharMethod, code.TrackCount); } /// Begins the definition of a new method (no args) with a specified return value. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index d3adddb5e1c30f..582cb1130be7ab 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -523,7 +523,8 @@ private void EliminateEndingBacktracking() public bool IsAtomicByParent() { // Walk up the parent hierarchy. - for (RegexNode? parent = Next; parent is not null; parent = parent.Next) + RegexNode child = this; + for (RegexNode? 
parent = child.Next; parent is not null; child = parent, parent = child.Next) { switch (parent.Type) { @@ -540,14 +541,14 @@ public bool IsAtomicByParent() // so any atomicity applied to the alternation also applies to // each individual branch. This is true as well for conditional // backreferences, where each of the yes/no branches are independent. - case Testgroup when parent.Child(0) != this: + case Testgroup when parent.Child(0) != child: // As with alternations, each yes/no branch of an expression conditional // are independent from each other, but the conditional expression itself // can be backtracked into from each of the branches, so we can't make // it atomic just because the whole conditional is. case Capture: // Skip captures. They don't affect atomicity. - case Concatenate when parent.Child(parent.ChildCount() - 1) == this: + case Concatenate when parent.Child(parent.ChildCount() - 1) == child: // If the parent is a concatenation and this is the last node, // any atomicity applying to the concatenation applies to this // node, too. @@ -2207,49 +2208,44 @@ public int ChildCount() return 1; } - // Determines whether the node supports an optimized code gen strategy based on walking the node tree. - internal bool SupportsSimplifiedCodeGenerationImplementation() + // Determines whether the node supports a compilation / code generation strategy based on walking the node tree. + internal bool SupportsCompilation() { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - // If we can't recur further, simplified code generation isn't supported as the tree is too deep. + // If we can't recur further, code generation isn't supported as the tree is too deep. return false; } - if ((Options & RegexOptions.RightToLeft) != 0) + if ((Options & (RegexOptions.RightToLeft | RegexOptions.NonBacktracking)) != 0) { - // RightToLeft isn't supported. That applies to both the top-level options as well as when used - // to specify positive and negative lookbehinds. 
+ // NonBacktracking isn't supported, nor RightToLeft. The latter applies to both the top-level + // options as well as when used to specify positive and negative lookbehinds. return false; } - // TODO: This should be moved somewhere else, to a pass somewhere where we explicitly - // annotate the tree, potentially as part of the final optimization pass. It doesn't - // belong in this check. - switch (Type) - { - case Capture: - // If we've found a supported capture, mark all of the nodes in its parent - // hierarchy as containing a capture. - RegexNode? parent = this; - while (parent != null && ((parent.Options & HasCapturesFlag) == 0)) - { - parent.Options |= HasCapturesFlag; - parent = parent.Next; - } - break; - } - int childCount = ChildCount(); for (int i = 0; i < childCount; i++) { // The node isn't supported if any of its children aren't supported. - if (!Child(i).SupportsSimplifiedCodeGenerationImplementation()) + if (!Child(i).SupportsCompilation()) { return false; } } + // TODO: This should be moved somewhere else, to a pass somewhere where we explicitly + // annotate the tree, potentially as part of the final optimization pass. It doesn't + // belong in this check. + if (Type == Capture) + { + // If we've found a supported capture, mark all of the nodes in its parent hierarchy as containing a capture. + for (RegexNode? parent = this; parent != null && (parent.Options & HasCapturesFlag) == 0; parent = parent.Next) + { + parent.Options |= HasCapturesFlag; + } + } + // Supported. 
return true; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs index 460a7a4d5156d8..4c009b46bf73dd 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs @@ -5,6 +5,7 @@ using System.Threading.Tasks; using Xunit; using System.Linq; +using System.Runtime.CompilerServices; namespace System.Text.RegularExpressions.Tests { @@ -326,6 +327,43 @@ public static IEnumerable Matches_TestData() }; } } + +#if !NETFRAMEWORK // these tests currently fail on .NET Framework, and we need to check IsDynamicCodeCompiled but that doesn't exist on .NET Framework + if (engine != RegexEngine.Interpreter && // these tests currently fail with RegexInterpreter + RuntimeFeature.IsDynamicCodeCompiled) // if dynamic code isn't compiled, RegexOptions.Compiled falls back to the interpreter, for which these tests currently fail + { + // Fails on interpreter and .NET Framework: [ActiveIssue("https://github.com/dotnet/runtime/issues/62094")] + yield return new object[] + { + engine, "@(a*)+?", "@", RegexOptions.None, new[] + { + new CaptureData("@", 0, 1) + } + }; + + // Fails on interpreter and .NET Framework: [ActiveIssue("https://github.com/dotnet/runtime/issues/62094")] + yield return new object[] + { + engine, @"(?:){93}", "x", RegexOptions.None, new[] + { + new CaptureData("", 0, 0), + new CaptureData("", 1, 0) + } + }; + + if (!RegexHelpers.IsNonBacktracking(engine)) // atomic subexpressions aren't supported + { + // Fails on interpreter and .NET Framework: [ActiveIssue("https://github.com/dotnet/runtime/issues/62094")] + yield return new object[] + { + engine, @"()(?>\1+?).\b", "xxxx", RegexOptions.None, new[] + { + new CaptureData("x", 3, 1), + } + }; + } + } +#endif } } @@ -336,9 +374,6 @@ public async Task Matches(RegexEngine 
engine, string pattern, string input, Rege Regex regexAdvanced = await RegexHelpers.GetRegexAsync(engine, pattern, options); VerifyMatches(regexAdvanced.Matches(input), expected); VerifyMatches(regexAdvanced.Match(input), expected); - - VerifyMatches(Regex.Matches(input, pattern, options), expected); - VerifyMatches(Regex.Match(input, pattern, options), expected); } private static void VerifyMatches(Match match, CaptureData[] expected) @@ -361,18 +396,18 @@ private static void VerifyMatches(MatchCollection matches, CaptureData[] expecte private static void VerifyMatch(Match match, CaptureData expected) { Assert.True(match.Success); - RegexAssert.Equal(expected.Value, match); Assert.Equal(expected.Index, match.Index); Assert.Equal(expected.Length, match.Length); + RegexAssert.Equal(expected.Value, match); - RegexAssert.Equal(expected.Value, match.Groups[0]); Assert.Equal(expected.Index, match.Groups[0].Index); Assert.Equal(expected.Length, match.Groups[0].Length); + RegexAssert.Equal(expected.Value, match.Groups[0]); Assert.Equal(1, match.Captures.Count); - RegexAssert.Equal(expected.Value, match.Captures[0]); Assert.Equal(expected.Index, match.Captures[0].Index); Assert.Equal(expected.Length, match.Captures[0].Length); + RegexAssert.Equal(expected.Value, match.Captures[0]); } [Fact] diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs index 5320ae273f838d..294cbfe3fde06a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs @@ -1,7 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
-using System.Collections.Generic; +using System.Collections.Immutable; using System.Diagnostics; using System.Globalization; using System.IO; @@ -123,11 +123,12 @@ internal static async Task SourceGenRegexAsync( // Run the generator GeneratorDriverRunResult generatorResults = s_generatorDriver.RunGenerators(comp!, cancellationToken).GetRunResult(); - if (generatorResults.Diagnostics.Length != 0) + ImmutableArray generatorDiagnostics = generatorResults.Diagnostics.RemoveAll(d => d.Severity <= DiagnosticSeverity.Info); + if (generatorDiagnostics.Length != 0) { throw new ArgumentException( string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))) + Environment.NewLine + - string.Join(Environment.NewLine, generatorResults.Diagnostics)); + string.Join(Environment.NewLine, generatorDiagnostics)); } // Compile the assembly to a stream @@ -138,7 +139,7 @@ internal static async Task SourceGenRegexAsync( { throw new ArgumentException( string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))) + Environment.NewLine + - string.Join(Environment.NewLine, results.Diagnostics.Concat(generatorResults.Diagnostics))); + string.Join(Environment.NewLine, results.Diagnostics.Concat(generatorDiagnostics))); } dll.Position = 0; diff --git a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Generators.Tests/RegexGeneratorParserTests.cs b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Generators.Tests/RegexGeneratorParserTests.cs index 2212211696af9a..1e8523d2f73f4a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Generators.Tests/RegexGeneratorParserTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Generators.Tests/RegexGeneratorParserTests.cs @@ -163,6 +163,66 @@ partial class C Assert.Equal("SYSLIB1044", Assert.Single(diagnostics).Id); } 
+ [Fact] + public async Task Diagnostic_RightToLeft_LimitedSupport() + { + IReadOnlyList diagnostics = await RunGenerator(@" + using System.Text.RegularExpressions; + partial class C + { + [RegexGenerator(""ab"", RegexOptions.RightToLeft)] + private static partial Regex RightToLeftNotSupported(); + } + "); + + Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id); + } + + [Fact] + public async Task Diagnostic_NonBacktracking_LimitedSupport() + { + IReadOnlyList diagnostics = await RunGenerator(@" + using System.Text.RegularExpressions; + partial class C + { + [RegexGenerator(""ab"", RegexOptions.NonBacktracking)] + private static partial Regex RightToLeftNotSupported(); + } + "); + + Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id); + } + + [Fact] + public async Task Diagnostic_PositiveLookbehind_LimitedSupport() + { + IReadOnlyList diagnostics = await RunGenerator(@" + using System.Text.RegularExpressions; + partial class C + { + [RegexGenerator(""(?<=\b20)\d{2}\b"")] + private static partial Regex PositiveLookbehindNotSupported(); + } + "); + + Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id); + } + + [Fact] + public async Task Diagnostic_NegativeLookbehind_LimitedSupport() + { + IReadOnlyList diagnostics = await RunGenerator(@" + using System.Text.RegularExpressions; + partial class C + { + [RegexGenerator(""(?