From f8d2f2a03f60e49ae480817b1676b0c7260e27a5 Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Thu, 28 Apr 2022 10:28:36 -0400
Subject: [PATCH] Fix RegexCompiler regression on 32-bit for some set matching

We added an optimization to regex where for sets containing values all within 64 characters of each other (e.g. all hex digits), we use a ulong to represent a bitmap and can implement the check in an entirely branchless manner.  This results in a measurable win on 64-bit, e.g. upwards of 20% for some patterns.  Unfortunately, it also results in a measurable regression on 32-bit.  This PR fixes that for RegexCompiler by special-casing the optimization to only apply when IntPtr.Size == 8.  For the source generator, we don't have the same luxury of knowing that the code is emitted and used on the same bitness, so since it would result in very complicated code to emit multiple implementations and since we generally prefer optimizing for 64-bit, I've left it in for the source generator.
---
 .../gen/RegexGenerator.Emitter.cs                             | 4 ++++
 .../src/System/Text/RegularExpressions/RegexCompiler.cs       | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index a84b26bcd9c2e7..046afcad11d90f 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -4331,6 +4331,10 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s
             // Next, handle sets where the high - low + 1 range is <= 64.  In that case, we can emit
             // a branchless lookup in a ulong that does not rely on loading any objects (e.g. the string-based
             // lookup we use later).  This nicely handles common sets like [0-9A-Fa-f], [0-9a-f], [A-Za-z], etc.
+            // Note that unlike RegexCompiler, the source generator doesn't know whether the code is going to be
+            // run in a 32-bit or 64-bit process: in a 64-bit process, this is an optimization, but in a 32-bit process,
+            // it's a deoptimization.  In general we optimize for 64-bit perf, so this code remains; it complicates
+            // the code too much to try to include both this and a fallback for the check.
             if (analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges - analysis.LowerBoundInclusiveIfOnlyRanges) <= 64)
             {
                 additionalDeclarations.Add("ulong charMinusLow;");
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index e61fd151c23b50..9747b3bf50a138 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -5203,7 +5203,9 @@ private void EmitMatchCharacterClass(string charClass)
             // Next, handle sets where the high - low + 1 range is <= 64.  In that case, we can emit
             // a branchless lookup in a ulong that does not rely on loading any objects (e.g. the string-based
             // lookup we use later).  This nicely handles common sets like [0-9A-Fa-f], [0-9a-f], [A-Za-z], etc.
-            if (analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges - analysis.LowerBoundInclusiveIfOnlyRanges) <= 64)
+            // We skip this on 32-bit, as otherwise using 64-bit numbers in this manner is a deoptimization
+            // when compared to the subsequent fallbacks.
+            if (IntPtr.Size == 8 && analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges - analysis.LowerBoundInclusiveIfOnlyRanges) <= 64)
             {
                 // Create the 64-bit value with 1s at indices corresponding to every character in the set,
                 // where the bit is computed to be the char value minus the lower bound starting from