@@ -401,51 +401,63 @@ private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictio
401401 /// <summary>Adds a SearchValues instance declaration to the required helpers collection.</summary>
402402 private static string EmitSearchValues ( char [ ] asciiChars , Dictionary < string , string [ ] > requiredHelpers )
403403 {
404- Debug . Assert ( RegexCharClass . IsAscii ( asciiChars ) ) ;
404+ Array . Sort ( asciiChars ) ;
405405
406- // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
407- byte [ ] bitmap = new byte [ 16 ] ;
408- foreach ( char c in asciiChars )
406+ string fieldName ;
407+ if ( RegexCharClass . IsAscii ( asciiChars ) )
409408 {
410- bitmap [ c >> 3 ] |= ( byte ) ( 1 << ( c & 7 ) ) ;
409+ // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
410+ byte [ ] bitmap = new byte [ 16 ] ;
411+ foreach ( char c in asciiChars )
412+ {
413+ bitmap [ c >> 3 ] |= ( byte ) ( 1 << ( c & 7 ) ) ;
414+ }
415+
416+ string hexBitmap = BitConverter . ToString ( bitmap ) . Replace ( "-" , string . Empty ) ;
417+
418+ fieldName = hexBitmap switch
419+ {
420+ "FFFFFFFF000000000000000000000080" => "s_asciiControl" ,
421+ "000000000000FF030000000000000000" => "s_asciiDigits" ,
422+ "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters" ,
423+ "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits" ,
424+ "000000000000FF037E0000007E000000" => "s_asciiHexDigits" ,
425+ "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower" ,
426+ "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper" ,
427+ "00000000EEF7008C010000B800000028" => "s_asciiPunctuation" ,
428+ "00000000010000000000000000000000" => "s_asciiSeparators" ,
429+ "00000000100800700000004001000050" => "s_asciiSymbols" ,
430+ "003E0000010000000000000000000000" => "s_asciiWhiteSpace" ,
431+ "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars" ,
432+
433+ "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl" ,
434+ "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits" ,
435+ "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters" ,
436+ "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits" ,
437+ "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower" ,
438+ "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation" ,
439+ "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators" ,
440+ "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols" ,
441+ "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper" ,
442+ "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace" ,
443+ "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars" ,
444+
445+ _ => $ "s_ascii_{ hexBitmap . TrimStart ( '0' ) } "
446+ } ;
411447 }
412-
413- string hexBitmap = BitConverter . ToString ( bitmap ) . Replace ( "-" , string . Empty ) ;
414-
415- string fieldName = hexBitmap switch
448+ else
416449 {
417- "FFFFFFFF000000000000000000000080" => "s_asciiControl" ,
418- "000000000000FF030000000000000000" => "s_asciiDigits" ,
419- "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters" ,
420- "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits" ,
421- "000000000000FF037E0000007E000000" => "s_asciiHexDigits" ,
422- "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower" ,
423- "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper" ,
424- "00000000EEF7008C010000B800000028" => "s_asciiPunctuation" ,
425- "00000000010000000000000000000000" => "s_asciiSeparators" ,
426- "00000000100800700000004001000050" => "s_asciiSymbols" ,
427- "003E0000010000000000000000000000" => "s_asciiWhiteSpace" ,
428- "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars" ,
429-
430- "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl" ,
431- "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits" ,
432- "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters" ,
433- "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits" ,
434- "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower" ,
435- "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation" ,
436- "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators" ,
437- "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols" ,
438- "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper" ,
439- "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace" ,
440- "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars" ,
441-
442- _ => $ "s_ascii_{ hexBitmap . TrimStart ( '0' ) } "
443- } ;
450+ Array . Sort ( asciiChars ) ;
451+ using ( SHA256 sha = SHA256 . Create ( ) )
452+ {
453+ #pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
454+ fieldName = $ "s_nonAscii_{ BitConverter . ToString ( sha . ComputeHash ( Encoding . UTF8 . GetBytes ( asciiChars ) ) ) . Replace ( "-" , "" ) } ";
455+ #pragma warning restore CA1850
456+ }
457+ }
444458
445459 if ( ! requiredHelpers . ContainsKey ( fieldName ) )
446460 {
447- Array . Sort ( asciiChars ) ;
448-
449461 string setLiteral = Literal ( new string ( asciiChars ) ) ;
450462
451463 requiredHelpers . Add ( fieldName , new string [ ]
@@ -458,22 +470,41 @@ private static string EmitSearchValues(char[] asciiChars, Dictionary<string, str
458470 return $ "{ HelpersTypeName } .{ fieldName } ";
459471 }
460472
461- private static string EmitIndexOfAnyCustomHelper ( string set , Dictionary < string , string [ ] > requiredHelpers , bool checkOverflow )
473+ private static string EmitIndexOfAnyCustomHelperCall ( string set , Dictionary < string , string [ ] > requiredHelpers , bool checkOverflow )
462474 {
463475 // In order to optimize the search for ASCII characters, we use SearchValues to vectorize a search
464476 // for those characters plus anything non-ASCII (if we find something non-ASCII, we'll fall back to
465477 // a sequential walk). In order to do that search, we actually build up a set for all of the ASCII
466478 // characters _not_ contained in the set, and then do a search for the inverse of that, which will be
467479 // all of the target ASCII characters and all of non-ASCII.
468- var asciiChars = new List < char > ( ) ;
480+ var excludedAsciiChars = new List < char > ( ) ;
469481 for ( int i = 0 ; i < 128 ; i ++ )
470482 {
471483 if ( ! RegexCharClass . CharInClass ( ( char ) i , set ) )
472484 {
473- asciiChars . Add ( ( char ) i ) ;
485+ excludedAsciiChars . Add ( ( char ) i ) ;
474486 }
475487 }
476488
489+ // We should only be here if the set might contain a non-ASCII character. As such, we need a fallback
490+ // for when the IndexOfAny search for the ASCII characters or any non-ASCII character hits a non-ASCII character.
491+ // Worst case, that fallback can be a linear scan, but if we can easily determine the full set of
492+ // characters included in the set, and if it's reasonably small enough, we can just hand them all
493+ // to SearchValues and let it optimize the search as best as possible. We still want the ASCII
494+ // fast path if there are any ASCII characters, though, as we assume if there are any ASCII chars
495+ // in the set that they will be more likely to occur, and SearchValues is very good at optimizing ASCII.
496+ const int SearchValuesFallbackLimit = 128 ; // somewhat arbitrary limit guided by SearchValues' probabilistic map implementation
497+ Span < char > allCharsInSet = stackalloc char [ SearchValuesFallbackLimit ] ;
498+ allCharsInSet = allCharsInSet . Slice ( 0 , RegexCharClass . GetSetChars ( set , allCharsInSet ) ) ;
499+ bool allCharsInSetNegated = RegexCharClass . IsNegated ( set ) ;
500+
501+ // In the case where there aren't any ASCII chars, if we do have the full set, we can avoid
502+ // emitting a custom helper and just use IndexOfAny.
503+ if ( excludedAsciiChars . Count == 128 && ! allCharsInSet . IsEmpty )
504+ {
505+ return $ "IndexOfAny{ ( allCharsInSetNegated ? "Except" : "" ) } ({ EmitSearchValues ( allCharsInSet . ToArray ( ) , requiredHelpers ) } )";
506+ }
507+
477508 // If this is a known set, use a predetermined simple name for the helper.
478509 string ? helperName = set switch
479510 {
@@ -529,40 +560,55 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
529560
530561 if ( ! requiredHelpers . ContainsKey ( helperName ) )
531562 {
532- var additionalDeclarations = new HashSet < string > ( ) ;
533- string matchExpr = MatchCharacterClass ( "span[i]" , set , negate : false , additionalDeclarations , requiredHelpers ) ;
534-
535563 var lines = new List < string > ( ) ;
536564 lines . Add ( $ "/// <summary>Finds the next index of any character that matches { EscapeXmlComment ( DescribeSet ( set ) ) } .</summary>") ;
537565 lines . Add ( $ "[MethodImpl(MethodImplOptions.AggressiveInlining)]") ;
538566 lines . Add ( $ "internal static int { helperName } (this ReadOnlySpan<char> span)") ;
539567 lines . Add ( $ "{{") ;
540568 int uncheckedStart = lines . Count ;
541- lines . Add ( asciiChars . Count == 128 ?
542- $ " int i = span.IndexOfAnyExceptInRange('\0 ', '\u007f ');" :
543- $ " int i = span.IndexOfAnyExcept({ EmitSearchValues ( asciiChars . ToArray ( ) , requiredHelpers ) } );") ;
569+ lines . Add ( $ " // Search for the first character that's either ASCII and in the target set or non-ASCII (whether or not it's in the target set.") ;
570+ lines . Add ( $ " int i = span.IndexOfAnyExcept({ EmitSearchValues ( excludedAsciiChars . ToArray ( ) , requiredHelpers ) } );") ;
544571 lines . Add ( $ " if ((uint)i < (uint)span.Length)") ;
545572 lines . Add ( $ " {{") ;
573+ lines . Add ( $ " // If the character at the found position is ASCII, it's in the target set.") ;
546574 lines . Add ( $ " if (char.IsAscii(span[i]))") ;
547575 lines . Add ( $ " {{") ;
548576 lines . Add ( $ " return i;") ;
549577 lines . Add ( $ " }}") ;
550578 lines . Add ( $ "" ) ;
551- if ( additionalDeclarations . Count > 0 )
579+ if ( ! allCharsInSet . IsEmpty )
552580 {
553- lines . AddRange ( additionalDeclarations . Select ( s => $ " { s } ") ) ;
581+ lines . Add ( $ " // Search for the first character that's in the target set.") ;
582+ lines . Add ( $ " int j = span.Slice(i).IndexOfAny{ ( allCharsInSetNegated ? "Except" : "" ) } ({ EmitSearchValues ( allCharsInSet . ToArray ( ) , requiredHelpers ) } );") ;
583+ lines . Add ( $ " if (j >= 0)") ;
584+ lines . Add ( $ " {{") ;
585+ lines . Add ( $ " return i + j;") ;
586+ lines . Add ( $ " }}") ;
587+ }
588+ else
589+ {
590+ var additionalDeclarations = new HashSet < string > ( ) ;
591+ string matchExpr = MatchCharacterClass ( "span[i]" , set , negate : false , additionalDeclarations , requiredHelpers ) ;
592+
593+ lines . Add ( $ " // The current character is non-ASCII. Walk through the remainder of the characters looking") ;
594+ lines . Add ( $ " // for the first one that's in the target set.") ;
595+ if ( additionalDeclarations . Count > 0 )
596+ {
597+ lines . AddRange ( additionalDeclarations . Select ( s => $ " { s } ") ) ;
598+ }
599+ lines . Add ( $ " do") ;
600+ lines . Add ( $ " {{") ;
601+ lines . Add ( $ " if ({ matchExpr } )") ;
602+ lines . Add ( $ " {{") ;
603+ lines . Add ( $ " return i;") ;
604+ lines . Add ( $ " }}") ;
605+ lines . Add ( $ " i++;") ;
606+ lines . Add ( $ " }}") ;
607+ lines . Add ( $ " while ((uint)i < (uint)span.Length);") ;
554608 }
555- lines . Add ( $ " do") ;
556- lines . Add ( $ " {{") ;
557- lines . Add ( $ " if ({ matchExpr } )") ;
558- lines . Add ( $ " {{") ;
559- lines . Add ( $ " return i;") ;
560- lines . Add ( $ " }}") ;
561- lines . Add ( $ " i++;") ;
562- lines . Add ( $ " }}") ;
563- lines . Add ( $ " while ((uint)i < (uint)span.Length);") ;
564609 lines . Add ( $ " }}") ;
565610 lines . Add ( $ "" ) ;
611+ lines . Add ( $ " // No match found.") ;
566612 lines . Add ( $ " return -1;") ;
567613 lines . Add ( $ "}}") ;
568614
@@ -580,7 +626,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
580626 requiredHelpers . Add ( helperName , lines . ToArray ( ) ) ;
581627 }
582628
583- return helperName ;
629+ return $ " { helperName } ()" ;
584630 }
585631
586632 /// <summary>Emits the body of the Scan method override.</summary>
@@ -1104,7 +1150,7 @@ void EmitFixedSet_LeftToRight()
11041150 {
11051151 // We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
11061152 // will perform the search as efficiently as possible.
1107- indexOf = $ "{ span } .{ EmitIndexOfAnyCustomHelper ( primarySet . Set , requiredHelpers , checkOverflow ) } () ";
1153+ indexOf = $ "{ span } .{ EmitIndexOfAnyCustomHelperCall ( primarySet . Set , requiredHelpers , checkOverflow ) } ";
11081154 }
11091155
11101156 if ( needLoop )
0 commit comments