Skip to content

Commit bc363cd

Browse files
committed
Use SearchValues for non-ASCII Regex fallback
When we encounter a set for which we can't use one of the IndexOfXx variants, we use IndexOfAny with SearchValues. If the whole set is ASCII, this is easy. But if the set contains any non-ASCII characters, we currently emit a helper method that first searches for any of the set's ASCII characters or any non-ASCII character, and then, if it lands on a non-ASCII character, falls back to a scalar scan that checks every remaining character against the set. Now that SearchValues has a vectorized implementation of a probabilistic map, we can instead fall back to that when the set is small enough to be fully enumerated into a SearchValues instance. We still do this as a two-step process, because searching for the ASCII subset is measurably faster than the probabilistic map search. However, if the target set contains no ASCII characters at all, we can skip the helper entirely and just call IndexOfAny with the SearchValues for the non-ASCII targets.
1 parent 74d69fd commit bc363cd

File tree

2 files changed

+219
-121
lines changed

2 files changed

+219
-121
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 106 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -401,51 +401,63 @@ private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictio
401401
/// <summary>Adds a SearchValues instance declaration to the required helpers collection.</summary>
402402
private static string EmitSearchValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
403403
{
404-
Debug.Assert(RegexCharClass.IsAscii(asciiChars));
404+
Array.Sort(asciiChars);
405405

406-
// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
407-
byte[] bitmap = new byte[16];
408-
foreach (char c in asciiChars)
406+
string fieldName;
407+
if (RegexCharClass.IsAscii(asciiChars))
409408
{
410-
bitmap[c >> 3] |= (byte)(1 << (c & 7));
409+
// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
410+
byte[] bitmap = new byte[16];
411+
foreach (char c in asciiChars)
412+
{
413+
bitmap[c >> 3] |= (byte)(1 << (c & 7));
414+
}
415+
416+
string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
417+
418+
fieldName = hexBitmap switch
419+
{
420+
"FFFFFFFF000000000000000000000080" => "s_asciiControl",
421+
"000000000000FF030000000000000000" => "s_asciiDigits",
422+
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
423+
"000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
424+
"000000000000FF037E0000007E000000" => "s_asciiHexDigits",
425+
"000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
426+
"000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
427+
"00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
428+
"00000000010000000000000000000000" => "s_asciiSeparators",
429+
"00000000100800700000004001000050" => "s_asciiSymbols",
430+
"003E0000010000000000000000000000" => "s_asciiWhiteSpace",
431+
"000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",
432+
433+
"00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
434+
"FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
435+
"FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
436+
"FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
437+
"FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
438+
"FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
439+
"FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
440+
"FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
441+
"FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
442+
"FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
443+
"FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",
444+
445+
_ => $"s_ascii_{hexBitmap.TrimStart('0')}"
446+
};
411447
}
412-
413-
string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
414-
415-
string fieldName = hexBitmap switch
448+
else
416449
{
417-
"FFFFFFFF000000000000000000000080" => "s_asciiControl",
418-
"000000000000FF030000000000000000" => "s_asciiDigits",
419-
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
420-
"000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
421-
"000000000000FF037E0000007E000000" => "s_asciiHexDigits",
422-
"000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
423-
"000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
424-
"00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
425-
"00000000010000000000000000000000" => "s_asciiSeparators",
426-
"00000000100800700000004001000050" => "s_asciiSymbols",
427-
"003E0000010000000000000000000000" => "s_asciiWhiteSpace",
428-
"000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",
429-
430-
"00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
431-
"FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
432-
"FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
433-
"FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
434-
"FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
435-
"FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
436-
"FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
437-
"FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
438-
"FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
439-
"FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
440-
"FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",
441-
442-
_ => $"s_ascii_{hexBitmap.TrimStart('0')}"
443-
};
450+
Array.Sort(asciiChars);
451+
using (SHA256 sha = SHA256.Create())
452+
{
453+
#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
454+
fieldName = $"s_nonAscii_{BitConverter.ToString(sha.ComputeHash(Encoding.UTF8.GetBytes(asciiChars))).Replace("-", "")}";
455+
#pragma warning restore CA1850
456+
}
457+
}
444458

445459
if (!requiredHelpers.ContainsKey(fieldName))
446460
{
447-
Array.Sort(asciiChars);
448-
449461
string setLiteral = Literal(new string(asciiChars));
450462

451463
requiredHelpers.Add(fieldName, new string[]
@@ -458,22 +470,41 @@ private static string EmitSearchValues(char[] asciiChars, Dictionary<string, str
458470
return $"{HelpersTypeName}.{fieldName}";
459471
}
460472

461-
private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
473+
private static string EmitIndexOfAnyCustomHelperCall(string set, Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
462474
{
463475
// In order to optimize the search for ASCII characters, we use SearchValues to vectorize a search
464476
// for those characters plus anything non-ASCII (if we find something non-ASCII, we'll fall back to
465477
// a sequential walk). In order to do that search, we actually build up a set for all of the ASCII
466478
// characters _not_ contained in the set, and then do a search for the inverse of that, which will be
467479
// all of the target ASCII characters and all of non-ASCII.
468-
var asciiChars = new List<char>();
480+
var excludedAsciiChars = new List<char>();
469481
for (int i = 0; i < 128; i++)
470482
{
471483
if (!RegexCharClass.CharInClass((char)i, set))
472484
{
473-
asciiChars.Add((char)i);
485+
excludedAsciiChars.Add((char)i);
474486
}
475487
}
476488

489+
// We should only be here if the set might contain a non-ASCII character. As such, we need a fallback
490+
// for when the IndexOfAny search (for the ASCII characters or any non-ASCII character) stops on a non-ASCII character.
491+
// Worst case, that fallback can be a linear scan, but if we can easily determine the full set of
492+
// characters included in the set, and if it's reasonably small enough, we can just hand them all
493+
// to SearchValues and let it optimize the search as best as possible. We still want the ASCII
494+
// fast path if there are any ASCII characters, though, as we assume if there are any ASCII chars
495+
// in the set that they will be more likely to occur, and SearchValues is very good at optimizing ASCII.
496+
const int SearchValuesFallbackLimit = 128; // somewhat arbitrary limit guided by SearchValues' probabilistic map implementation
497+
Span<char> allCharsInSet = stackalloc char[SearchValuesFallbackLimit];
498+
allCharsInSet = allCharsInSet.Slice(0, RegexCharClass.GetSetChars(set, allCharsInSet));
499+
bool allCharsInSetNegated = RegexCharClass.IsNegated(set);
500+
501+
// In the case where there aren't any ASCII chars, if we do have the full set, we can avoid
502+
// emitting a custom helper and just use IndexOfAny.
503+
if (excludedAsciiChars.Count == 128 && !allCharsInSet.IsEmpty)
504+
{
505+
return $"IndexOfAny{(allCharsInSetNegated ? "Except" : "")}({EmitSearchValues(allCharsInSet.ToArray(), requiredHelpers)})";
506+
}
507+
477508
// If this is a known set, use a predetermined simple name for the helper.
478509
string? helperName = set switch
479510
{
@@ -529,40 +560,55 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
529560

530561
if (!requiredHelpers.ContainsKey(helperName))
531562
{
532-
var additionalDeclarations = new HashSet<string>();
533-
string matchExpr = MatchCharacterClass("span[i]", set, negate: false, additionalDeclarations, requiredHelpers);
534-
535563
var lines = new List<string>();
536564
lines.Add($"/// <summary>Finds the next index of any character that matches {EscapeXmlComment(DescribeSet(set))}.</summary>");
537565
lines.Add($"[MethodImpl(MethodImplOptions.AggressiveInlining)]");
538566
lines.Add($"internal static int {helperName}(this ReadOnlySpan<char> span)");
539567
lines.Add($"{{");
540568
int uncheckedStart = lines.Count;
541-
lines.Add(asciiChars.Count == 128 ?
542-
$" int i = span.IndexOfAnyExceptInRange('\0', '\u007f');" :
543-
$" int i = span.IndexOfAnyExcept({EmitSearchValues(asciiChars.ToArray(), requiredHelpers)});");
569+
lines.Add($" // Search for the first character that's either ASCII and in the target set or non-ASCII (whether or not it's in the target set.");
570+
lines.Add($" int i = span.IndexOfAnyExcept({EmitSearchValues(excludedAsciiChars.ToArray(), requiredHelpers)});");
544571
lines.Add($" if ((uint)i < (uint)span.Length)");
545572
lines.Add($" {{");
573+
lines.Add($" // If the character at the found position is ASCII, it's in the target set.");
546574
lines.Add($" if (char.IsAscii(span[i]))");
547575
lines.Add($" {{");
548576
lines.Add($" return i;");
549577
lines.Add($" }}");
550578
lines.Add($"");
551-
if (additionalDeclarations.Count > 0)
579+
if (!allCharsInSet.IsEmpty)
552580
{
553-
lines.AddRange(additionalDeclarations.Select(s => $" {s}"));
581+
lines.Add($" // Search for the first character that's in the target set.");
582+
lines.Add($" int j = span.Slice(i).IndexOfAny{(allCharsInSetNegated ? "Except" : "")}({EmitSearchValues(allCharsInSet.ToArray(), requiredHelpers)});");
583+
lines.Add($" if (j >= 0)");
584+
lines.Add($" {{");
585+
lines.Add($" return i + j;");
586+
lines.Add($" }}");
587+
}
588+
else
589+
{
590+
var additionalDeclarations = new HashSet<string>();
591+
string matchExpr = MatchCharacterClass("span[i]", set, negate: false, additionalDeclarations, requiredHelpers);
592+
593+
lines.Add($" // The current character is non-ASCII. Walk through the remainder of the characters looking");
594+
lines.Add($" // for the first one that's in the target set.");
595+
if (additionalDeclarations.Count > 0)
596+
{
597+
lines.AddRange(additionalDeclarations.Select(s => $" {s}"));
598+
}
599+
lines.Add($" do");
600+
lines.Add($" {{");
601+
lines.Add($" if ({matchExpr})");
602+
lines.Add($" {{");
603+
lines.Add($" return i;");
604+
lines.Add($" }}");
605+
lines.Add($" i++;");
606+
lines.Add($" }}");
607+
lines.Add($" while ((uint)i < (uint)span.Length);");
554608
}
555-
lines.Add($" do");
556-
lines.Add($" {{");
557-
lines.Add($" if ({matchExpr})");
558-
lines.Add($" {{");
559-
lines.Add($" return i;");
560-
lines.Add($" }}");
561-
lines.Add($" i++;");
562-
lines.Add($" }}");
563-
lines.Add($" while ((uint)i < (uint)span.Length);");
564609
lines.Add($" }}");
565610
lines.Add($"");
611+
lines.Add($" // No match found.");
566612
lines.Add($" return -1;");
567613
lines.Add($"}}");
568614

@@ -580,7 +626,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
580626
requiredHelpers.Add(helperName, lines.ToArray());
581627
}
582628

583-
return helperName;
629+
return $"{helperName}()";
584630
}
585631

586632
/// <summary>Emits the body of the Scan method override.</summary>
@@ -1104,7 +1150,7 @@ void EmitFixedSet_LeftToRight()
11041150
{
11051151
// We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
11061152
// will perform the search as efficiently as possible.
1107-
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
1153+
indexOf = $"{span}.{EmitIndexOfAnyCustomHelperCall(primarySet.Set, requiredHelpers, checkOverflow)}";
11081154
}
11091155

11101156
if (needLoop)

0 commit comments

Comments
 (0)