Skip to content

Commit 48179aa

Browse files
committed
Fix edge case where an ASCII set following a Notonelazy loop doesn't overlap with the loop's target character
1 parent 866b27c commit 48179aa

File tree

3 files changed

+16
-5
lines changed

3 files changed

+16
-5
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,6 @@ private static string EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan<char> chars, Di
379379
private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
380380
{
381381
Debug.Assert(RegexCharClass.IsAscii(asciiChars));
382-
Debug.Assert(asciiChars.AsSpan().SequenceEqual(asciiChars.OrderBy(c => c).ToArray()));
383382

384383
// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
385384
byte[] bitmap = new byte[16];
@@ -404,6 +403,8 @@ private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary<string,
404403

405404
if (!requiredHelpers.ContainsKey(helperName))
406405
{
406+
Array.Sort(asciiChars);
407+
407408
requiredHelpers.Add(helperName, new string[]
408409
{
409410
$"internal static readonly IndexOfAnyValues<char> {fieldName} = IndexOfAnyValues.Create({Literal(new string(asciiChars))});",
@@ -3168,8 +3169,8 @@ literal.SetChars is not null ||
31683169
}
31693170
else if (literal.AsciiChars is not null) // set of only ASCII characters
31703171
{
3171-
overlap = literal.AsciiChars.Contains(node.Ch);
31723172
char[] asciiChars = literal.AsciiChars;
3173+
overlap = asciiChars.Contains(node.Ch);
31733174
if (!overlap)
31743175
{
31753176
Debug.Assert(node.Ch < 128);

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3399,7 +3399,7 @@ node.Kind is RegexNodeKind.Notonelazy &&
33993399
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
34003400
(literal.String is not null ||
34013401
literal.SetChars is not null ||
3402-
literal.AsciiChars is not null ||
3402+
(literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
34033403
literal.Range.LowInclusive == literal.Range.HighInclusive ||
34043404
(literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
34053405
{
@@ -3474,8 +3474,15 @@ literal.AsciiChars is not null ||
34743474
}
34753475
else if (literal.AsciiChars is not null) // set of only ASCII characters
34763476
{
3477-
overlap = literal.AsciiChars.AsSpan().Contains(node.Ch);
3478-
LoadIndexOfAnyValues(literal.AsciiChars);
3477+
char[] asciiChars = literal.AsciiChars;
3478+
overlap = asciiChars.AsSpan().Contains(node.Ch);
3479+
if (!overlap)
3480+
{
3481+
Debug.Assert(node.Ch < 128);
3482+
Array.Resize(ref asciiChars, asciiChars.Length + 1);
3483+
asciiChars[asciiChars.Length - 1] = node.Ch;
3484+
}
3485+
LoadIndexOfAnyValues(asciiChars);
34793486
Call(s_spanIndexOfAnyIndexOfAnyValues);
34803487
}
34813488
else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,9 @@ public static IEnumerable<object[]> Match_MemberData()
382382
yield return (@"a[^c]*?[bcdef]", "xyza12345e6789", lineOption, 0, 14, true, "a12345e");
383383
yield return (@"a[^b]*?[bcdef]", "xyza12345f6789", lineOption, 0, 14, true, "a12345f");
384384
yield return (@"a[^c]*?[bcdef]", "xyza12345g6789", lineOption, 0, 14, false, "");
385+
386+
yield return ("a[^b]*?[cdefgz]", "xyza123bc4", lineOption, 0, 10, false, "");
387+
yield return ("a[^b]*?[bdefgz]", "xyza123bc4", lineOption, 0, 10, true, "a123b");
385388
}
386389

387390
// Nested loops

0 commit comments

Comments
 (0)