@@ -363,6 +363,57 @@ private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> require
363363 }
364364 }
365365
/// <summary>
/// Produces the argument expression for an emitted <c>IndexOfAny</c> call over <paramref name="chars"/>:
/// a reference to a cached IndexOfAnyValues field when every char is ASCII, otherwise a string literal.
/// </summary>
/// <param name="chars">The set of characters being searched for. Order is not significant.</param>
/// <param name="requiredHelpers">The helpers collection that receives the field declaration when one is emitted.</param>
/// <returns>The C# source text to place inside the parentheses of the emitted <c>IndexOfAny(...)</c> call.</returns>
private static string EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan<char> chars, Dictionary<string, string[]> requiredHelpers)
{
    // IndexOfAnyValues<char> is faster than a regular IndexOfAny("abcd") for small sets iff they are ASCII.
    // Only emit IndexOfAnyValues instances when we know they'll be faster to avoid increasing the startup cost too much.
    //
    // Callers route sets of 1-3 chars to the dedicated IndexOf/IndexOfAny overloads, so 4 is the minimum here.
    // The upper bound is 6 rather than 5 because one caller prepends an extra character (node.Ch) to a set
    // that may itself already contain 5 characters.
    Debug.Assert(chars.Length is >= 4 and <= 6);

    if (!RegexCharClass.IsAscii(chars))
    {
        return Literal(chars.ToString());
    }

    // ToArray yields a private copy, so sorting it cannot affect the caller. Callers may supply the chars in
    // arbitrary order (e.g. with an extra char prepended); sorting keeps the emitted declaration canonical.
    char[] asciiChars = chars.ToArray();
    Array.Sort(asciiChars);

    return EmitIndexOfAnyValues(asciiChars, requiredHelpers);
}
377+
/// <summary>Adds an IndexOfAnyValues instance declaration to the required helpers collection.</summary>
/// <param name="asciiChars">
/// The characters in the set. All must be ASCII (&lt; 128). They may be supplied in any order; the array is not modified.
/// </param>
/// <param name="requiredHelpers">
/// The helpers collection. A declaration is added under a key derived from the set's contents unless one is already present.
/// </param>
/// <returns>The qualified name of the helper field, suitable for use as an argument in emitted code.</returns>
private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
{
    // Every char indexes into the 16-byte bitmap below via (c >> 3), so non-ASCII input would be out of range.
    Debug.Assert(RegexCharClass.IsAscii(asciiChars));

    // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
    // The bitmap depends only on which chars are present, so the key is independent of input order and duplicates.
    byte[] bitmap = new byte[16];
    foreach (char c in asciiChars)
    {
        bitmap[c >> 3] |= (byte)(1 << (c & 7));
    }

    string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);

    // Give well-known sets readable names; everything else is named after its bitmap.
    string fieldName = hexBitmap switch
    {
        "0000000000000000FEFFFF07FEFFFF07" => "AsciiLetter",
        "000000000000FF03FEFFFF07FEFFFF07" => "AsciiLetterOrDigit",
        "000000000000FF037E0000007E000000" => "AsciiHexDigit",
        "000000000000FF03000000007E000000" => "AsciiHexDigitLower",
        "000000000000FF037E00000000000000" => "AsciiHexDigitUpper",
        _ => $"Ascii_{hexBitmap.TrimStart('0')}"
    };

    string helperName = $"IndexOfAnyValues_{fieldName}";

    if (!requiredHelpers.ContainsKey(helperName))
    {
        // Callers do not always pass a sorted array (one of them appends an extra char to the end of an existing set),
        // so sort here rather than requiring it. Sorting a copy keeps the emitted literal deterministic for a given
        // set without mutating an array the caller may still be holding on to.
        char[] sortedChars = (char[])asciiChars.Clone();
        Array.Sort(sortedChars);

        requiredHelpers.Add(helperName, new string[]
        {
            $"internal static readonly IndexOfAnyValues<char> {fieldName} =",
            $"    IndexOfAnyValues.Create({Literal(new string(sortedChars))});",
        });
    }

    return $"{HelpersTypeName}.{fieldName}";
}
416+
366417 /// <summary>Emits the body of the Scan method override.</summary>
367418 private static ( bool NeedsTryFind , bool NeedsTryMatch ) EmitScan ( IndentedTextWriter writer , RegexMethod rm )
368419 {
@@ -810,7 +861,7 @@ void EmitFixedSet_LeftToRight()
810861 int setIndex = 0 ;
811862 bool canUseIndexOf =
812863 primarySet . Set != RegexCharClass . NotNewLineClass &&
813- ( primarySet . Chars is not null || primarySet . Range is not null ) ;
864+ ( primarySet . Chars is not null || primarySet . Range is not null || primarySet . AsciiSet is not null ) ;
814865 bool needLoop = ! canUseIndexOf || setsToUse > 1 ;
815866
816867 FinishEmitBlock loopBlock = default ;
@@ -841,7 +892,12 @@ void EmitFixedSet_LeftToRight()
841892 1 => $ "{ span } .IndexOf({ Literal ( primarySet . Chars [ 0 ] ) } )",
842893 2 => $ "{ span } .IndexOfAny({ Literal ( primarySet . Chars [ 0 ] ) } , { Literal ( primarySet . Chars [ 1 ] ) } )",
843894 3 => $ "{ span } .IndexOfAny({ Literal ( primarySet . Chars [ 0 ] ) } , { Literal ( primarySet . Chars [ 1 ] ) } , { Literal ( primarySet . Chars [ 2 ] ) } )",
844- _ => $ "{ span } .IndexOfAny({ Literal ( new string ( primarySet . Chars ) ) } )",
895+ _ => $ "{ span } .IndexOfAny({ EmitIndexOfAnyValuesOrLiteral ( primarySet . Chars , requiredHelpers ) } )",
896+ } :
897+ primarySet . AsciiSet is not null ? primarySet . AsciiSet . Value . Negated switch
898+ {
899+ false => $ "{ span } .IndexOfAny({ EmitIndexOfAnyValues ( primarySet . AsciiSet . Value . Chars , requiredHelpers ) } )",
900+ true => $ "{ span } .IndexOfAnyExcept({ EmitIndexOfAnyValues ( primarySet . AsciiSet . Value . Chars , requiredHelpers ) } )",
845901 } :
846902 ( primarySet . Range . Value . LowInclusive == primarySet . Range . Value . HighInclusive , primarySet . Range . Value . Negated ) switch
847903 {
@@ -1010,7 +1066,7 @@ void EmitLiteralAfterAtomicLoop()
10101066 {
10111067 2 => $ "IndexOfAny({ Literal ( literalChars [ 0 ] ) } , { Literal ( literalChars [ 1 ] ) } );",
10121068 3 => $ "IndexOfAny({ Literal ( literalChars [ 0 ] ) } , { Literal ( literalChars [ 1 ] ) } , { Literal ( literalChars [ 2 ] ) } );",
1013- _ => $ "IndexOfAny({ Literal ( new string ( literalChars ) ) } );",
1069+ _ => $ "IndexOfAny({ EmitIndexOfAnyValuesOrLiteral ( literalChars , requiredHelpers ) } );",
10141070 } ) ;
10151071
10161072 FinishEmitBlock indexOfFoundBlock = default ;
@@ -2920,7 +2976,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
29202976 if ( ! rtl &&
29212977 node . N > 1 && // no point in using IndexOf for small loops, in particular optionals
29222978 subsequent ? . FindStartingLiteralNode ( ) is RegexNode literalNode &&
2923- TryEmitIndexOf ( literalNode , useLast : true , negate : false , out int literalLength , out string indexOfExpr ) )
2979+ TryEmitIndexOf ( requiredHelpers , literalNode , useLast : true , negate : false , out int literalLength , out string indexOfExpr ) )
29242980 {
29252981 writer . WriteLine ( $ "if ({ startingPos } >= { endingPos } ||") ;
29262982
@@ -3079,6 +3135,7 @@ node.Kind is RegexNodeKind.Notonelazy &&
30793135 ! literal . Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
30803136 ( literal . String is not null ||
30813137 literal . SetChars is not null ||
3138+ ( literal . AsciiChars is not null && node . Ch < 128 ) || // for ASCII sets, only allow when the target can be efficiently included in the set
30823139 literal . Range . LowInclusive == literal . Range . HighInclusive ||
30833140 ( literal . Range . LowInclusive <= node . Ch && node . Ch <= literal . Range . HighInclusive ) ) ) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
30843141 {
@@ -3104,12 +3161,24 @@ literal.SetChars is not null ||
31043161 {
31053162 ( true , 2 ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( literal . SetChars [ 0 ] ) } , { Literal ( literal . SetChars [ 1 ] ) } );",
31063163 ( true , 3 ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( literal . SetChars [ 0 ] ) } , { Literal ( literal . SetChars [ 1 ] ) } , { Literal ( literal . SetChars [ 2 ] ) } );",
3107- ( true , _ ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( literal . SetChars ) } );",
3164+ ( true , _ ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ EmitIndexOfAnyValuesOrLiteral ( literal . SetChars . AsSpan ( ) , requiredHelpers ) } );",
31083165
31093166 ( false , 2 ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( node . Ch ) } , { Literal ( literal . SetChars [ 0 ] ) } , { Literal ( literal . SetChars [ 1 ] ) } );",
3110- ( false , _ ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( $ "{ node . Ch } { literal . SetChars } ") } );",
3167+ ( false , _ ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ EmitIndexOfAnyValuesOrLiteral ( $ "{ node . Ch } { literal . SetChars } ". AsSpan ( ) , requiredHelpers ) } );",
31113168 } ) ;
31123169 }
3170+ else if ( literal . AsciiChars is not null ) // set of only ASCII characters
3171+ {
3172+ overlap = literal . AsciiChars . Contains ( node . Ch ) ;
3173+ char [ ] asciiChars = literal . AsciiChars ;
3174+ if ( ! overlap )
3175+ {
3176+ Debug . Assert ( node . Ch < 128 ) ;
3177+ Array . Resize ( ref asciiChars , asciiChars . Length + 1 ) ;
3178+ asciiChars [ asciiChars . Length - 1 ] = node . Ch ;
3179+ }
3180+ writer . WriteLine ( $ "{ startingPos } = { sliceSpan } .IndexOfAny({ EmitIndexOfAnyValues ( asciiChars , requiredHelpers ) } );") ;
3181+ }
31133182 else if ( literal . Range . LowInclusive == literal . Range . HighInclusive ) // single char from a RegexNode.One
31143183 {
31153184 overlap = literal . Range . LowInclusive == node . Ch ;
@@ -3144,7 +3213,7 @@ literal.SetChars is not null ||
31443213 node . Kind is RegexNodeKind . Setlazy &&
31453214 node . Str == RegexCharClass . AnyClass &&
31463215 subsequent ? . FindStartingLiteralNode ( ) is RegexNode literal2 &&
3147- TryEmitIndexOf ( literal2 , useLast : false , negate : false , out _ , out string ? indexOfExpr ) )
3216+ TryEmitIndexOf ( requiredHelpers , literal2 , useLast : false , negate : false , out _ , out string ? indexOfExpr ) )
31483217 {
31493218 // e.g. ".*?string" with RegexOptions.Singleline
31503219 // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal
@@ -3592,7 +3661,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true)
35923661 // For the loop, we're validating that each char matches the target node.
35933662 // For IndexOf, we're looking for the first thing that _doesn't_ match the target node,
35943663 // and thus similarly validating that everything does.
3595- if ( TryEmitIndexOf ( node , useLast : false , negate : true , out _ , out string ? indexOfExpr ) )
3664+ if ( TryEmitIndexOf ( requiredHelpers , node , useLast : false , negate : true , out _ , out string ? indexOfExpr ) )
35963665 {
35973666 using ( EmitBlock ( writer , $ "if ({ sliceSpan } .Slice({ sliceStaticPos } , { iterations } ).{ indexOfExpr } >= 0)") )
35983667 {
@@ -3685,7 +3754,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
36853754 TransferSliceStaticPosToPos ( ) ;
36863755 writer . WriteLine ( $ "int { iterationLocal } = inputSpan.Length - pos;") ;
36873756 }
3688- else if ( maxIterations == int . MaxValue && TryEmitIndexOf ( node , useLast : false , negate : true , out _ , out string indexOfExpr ) )
3757+ else if ( maxIterations == int . MaxValue && TryEmitIndexOf ( requiredHelpers , node , useLast : false , negate : true , out _ , out string indexOfExpr ) )
36893758 {
36903759 // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is
36913760 // purely for simplicity; it could be removed in the future with additional code to handle that case.
@@ -4316,6 +4385,7 @@ private static void EmitTimeoutCheckIfNeeded(IndentedTextWriter writer, RegexMet
43164385 /// <param name="indexOfExpr">The resulting expression if it returns true; otherwise, null.</param>
43174386 /// <returns>true if an expression could be produced; otherwise, false.</returns>
43184387 private static bool TryEmitIndexOf (
4388+ Dictionary < string , string [ ] > requiredHelpers ,
43194389 RegexNode node ,
43204390 bool useLast , bool negate ,
43214391 out int literalLength , [ NotNullWhen ( true ) ] out string ? indexOfExpr )
@@ -4362,7 +4432,7 @@ private static bool TryEmitIndexOf(
43624432 1 => $ "{ last } { indexOfName } ({ Literal ( setChars [ 0 ] ) } )",
43634433 2 => $ "{ last } { indexOfAnyName } ({ Literal ( setChars [ 0 ] ) } , { Literal ( setChars [ 1 ] ) } )",
43644434 3 => $ "{ last } { indexOfAnyName } ({ Literal ( setChars [ 0 ] ) } , { Literal ( setChars [ 1 ] ) } , { Literal ( setChars [ 2 ] ) } )",
4365- _ => $ "{ last } { indexOfAnyName } ({ Literal ( setChars . ToString ( ) ) } )",
4435+ _ => $ "{ last } { indexOfAnyName } ({ EmitIndexOfAnyValuesOrLiteral ( setChars , requiredHelpers ) } )",
43664436 } ;
43674437
43684438 literalLength = 1 ;
@@ -4380,6 +4450,18 @@ private static bool TryEmitIndexOf(
43804450 literalLength = 1 ;
43814451 return true ;
43824452 }
4453+
4454+ if ( RegexCharClass . TryGetAsciiSetChars ( node . Str , out char [ ] ? asciiChars ) )
4455+ {
4456+ string indexOfAnyName = ! negated ?
4457+ "IndexOfAny" :
4458+ "IndexOfAnyExcept" ;
4459+
4460+ indexOfExpr = $ "{ last } { indexOfAnyName } ({ EmitIndexOfAnyValues ( asciiChars , requiredHelpers ) } )";
4461+
4462+ literalLength = 1 ;
4463+ return true ;
4464+ }
43834465 }
43844466
43854467 indexOfExpr = null ;
0 commit comments