diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/Hashing.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/Hashing.cs index 4db4d2f03d9e01..8e9a7db1dd3198 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/Hashing.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/Hashing.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Buffers; +using System.Diagnostics; using System.Numerics; using System.Runtime.InteropServices; @@ -76,6 +77,8 @@ public static unsafe int GetHashCodeOrdinal(ReadOnlySpan s) // useful if the string only contains ASCII characters public static unsafe int GetHashCodeOrdinalIgnoreCaseAscii(ReadOnlySpan s) { + Debug.Assert(KeyAnalyzer.IsAllAscii(s)); + // We "normalize to lowercase" every char by ORing with 0x20. This casts // a very wide net because it will change, e.g., '^' to '~'. But that should // be ok because we expect this to be very rare in practice. @@ -138,6 +141,9 @@ public static unsafe int GetHashCodeOrdinalIgnoreCaseAscii(ReadOnlySpan s) public static unsafe int GetHashCodeOrdinalIgnoreCase(ReadOnlySpan s) { +#if NET + return string.GetHashCode(s, StringComparison.OrdinalIgnoreCase); +#else int length = s.Length; char[]? rentedArray = null; @@ -145,7 +151,7 @@ public static unsafe int GetHashCodeOrdinalIgnoreCase(ReadOnlySpan s) stackalloc char[256] : (rentedArray = ArrayPool.Shared.Rent(length)); - length = s.ToUpperInvariant(scratch); // NOTE: this really should be the (non-existent) ToUpperOrdinal + length = s.ToUpperInvariant(scratch); // NOTE: this both allocates and really should be the (non-existent) ToUpperOrdinal int hash = GetHashCodeOrdinal(scratch.Slice(0, length)); if (rentedArray is not null) @@ -154,6 +160,7 @@ public static unsafe int GetHashCodeOrdinalIgnoreCase(ReadOnlySpan s) } return hash; +#endif } } } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs index 9f6094edb97763..208cf88c6ea62d 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs @@ -32,31 +32,40 @@ public static AnalysisResults Analyze( ReadOnlySpan uniqueStrings, bool ignoreCase, int minLength, int maxLength) { Debug.Assert(!uniqueStrings.IsEmpty); + bool allUniqueStringsAreConfirmedAscii = ignoreCase && AreAllAscii(uniqueStrings); // Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer. AnalysisResults results; - if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results)) + if (minLength == 0 || !TryUseSubstring(uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, out results)) { - results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan()); + results = CreateAnalysisResults(uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan()); } return results; } /// Try to find the minimal unique substring index/length to use for comparisons. - private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results) + private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool allUniqueStringsAreConfirmedAscii, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results) { const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings - int uniqueStringsLength = uniqueStrings.Length; // Sufficient uniqueness factor of 95% is good enough. // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad. - int acceptableNonUniqueCount = uniqueStringsLength / 20; + int acceptableNonUniqueCount = uniqueStrings.Length / 20; + + // If we're case-sensitive, we can just use a case-sensitive substring comparer, which is cheap. + // For case-insensitive / ignoreCase, we don't know which portion of the input strings we're going to care about. + // However, if all of the strings are entirely ASCII, we know that no portion of any will be non-ASCII and thus can + // use a comparer that only knows how to do ASCII-based ordinal casing, which is cheaper than full Unicode casing, + // in particular for GetHashCode. + SubstringComparer comparer = + !ignoreCase ? new JustifiedSubstringComparer() : + allUniqueStringsAreConfirmedAscii ? new JustifiedCaseInsensitiveAsciiSubstringComparer() : + new JustifiedCaseInsensitiveSubstringComparer(); - SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer(); HashSet set = new HashSet( #if NET6_0_OR_GREATER - uniqueStringsLength, + uniqueStrings.Length, #endif comparer); @@ -77,7 +86,7 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount)) { results = CreateAnalysisResults( - uniqueStrings, ignoreCase, minLength, maxLength, index, count, + uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, index, count, static (string s, int index, int count) => s.AsSpan(index, count)); return true; } @@ -101,7 +110,7 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount)) { results = CreateAnalysisResults( - uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count, + uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, comparer.Index, count, static (string s, int index, int count) => s.AsSpan(s.Length + index, count)); return true; } @@ -115,7 +124,7 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign } private static AnalysisResults CreateAnalysisResults( - ReadOnlySpan uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getHashString) + ReadOnlySpan uniqueStrings, bool allUniqueStringsAreConfirmedAscii, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getHashString) { // Start off by assuming all strings are ASCII bool allAsciiIfIgnoreCase = true; @@ -134,10 +143,9 @@ private static AnalysisResults CreateAnalysisResults( foreach (string uniqueString in uniqueStrings) { // Get a span representing the slice of the uniqueString which will be hashed. - ReadOnlySpan hashString = getHashString(uniqueString, index, count); - // If the slice isn't ASCII, bail out to return the results. - if (!IsAllAscii(hashString)) + if (!allUniqueStringsAreConfirmedAscii && + !IsAllAscii(getHashString(uniqueString, index, count))) { allAsciiIfIgnoreCase = false; canSwitchIgnoreCaseHashToCaseSensitive = false; @@ -153,13 +161,14 @@ private static AnalysisResults CreateAnalysisResults( // and as we have just checked that IsAllAscii(hashString) is true // then we know IsAllAscii(uniqueString) must be true, // so we can skip the check. - if (count > 0 && !IsAllAscii(uniqueString.AsSpan())) - { - canSwitchIgnoreCaseHashToCaseSensitive = false; - } - else if (ContainsAnyAsciiLetters(uniqueString.AsSpan())) + if ((count > 0 && !allUniqueStringsAreConfirmedAscii && !IsAllAscii(uniqueString.AsSpan())) || + ContainsAnyAsciiLetters(uniqueString.AsSpan())) { canSwitchIgnoreCaseHashToCaseSensitive = false; + if (allUniqueStringsAreConfirmedAscii) + { + break; + } } } } @@ -177,6 +186,19 @@ private static AnalysisResults CreateAnalysisResults( private delegate ReadOnlySpan GetSpan(string s, int index, int count); + private static bool AreAllAscii(ReadOnlySpan strings) + { + foreach (string s in strings) + { + if (!IsAllAscii(s.AsSpan())) + { + return false; + } + } + + return true; + } + internal static unsafe bool IsAllAscii(ReadOnlySpan s) { #if NET8_0_OR_GREATER @@ -296,5 +318,11 @@ private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringCompar public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase); public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); } + + private sealed class JustifiedCaseInsensitiveAsciiSubstringComparer : SubstringComparer + { + public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase); + public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCaseAscii(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); + } } } diff --git a/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenDictionaryTests.cs b/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenDictionaryTests.cs index 13e695ef0a791a..86a6f3539278f8 100644 --- a/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenDictionaryTests.cs +++ b/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenDictionaryTests.cs @@ -361,6 +361,35 @@ protected override string CreateTKey(int seed) } protected override string CreateTValue(int seed) => CreateTKey(seed); + + [Theory] + [InlineData(0)] + [InlineData(25)] + [InlineData(50)] + [InlineData(75)] + [InlineData(100)] + public void ContainsKey_WithNonAscii(int percentageWithNonAscii) + { + Random rand = new(42); + + Dictionary expected = new(GetKeyIEqualityComparer()); + for (int i = 0; i < 100; i++) + { + int stringLength = rand.Next(4, 30); + byte[] bytes = new byte[stringLength]; + rand.NextBytes(bytes); + string value = Convert.ToBase64String(bytes); + if (rand.Next(100) < percentageWithNonAscii) + { + value = value.Replace(value[rand.Next(value.Length)], '\u05D0'); + } + expected.Add(value, value); + } + + FrozenDictionary actual = expected.ToFrozenDictionary(GetKeyIEqualityComparer()); + + Assert.All(expected, kvp => actual.ContainsKey(kvp.Key)); + } } public class FrozenDictionary_Generic_Tests_string_string_Default : FrozenDictionary_Generic_Tests_string_string diff --git a/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenSetTests.cs b/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenSetTests.cs index 8465f66c4c3747..2558f9bedcd895 100644 --- a/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenSetTests.cs +++ b/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenSetTests.cs @@ -6,7 +6,6 @@ using System.Globalization; using System.Linq; using Xunit; -using System.Numerics; using System.Collections.Immutable; using System.Diagnostics.CodeAnalysis; @@ -338,6 +337,35 @@ protected override string CreateT(int seed) rand.NextBytes(bytes1); return Convert.ToBase64String(bytes1); } + + [Theory] + [InlineData(0)] + [InlineData(25)] + [InlineData(50)] + [InlineData(75)] + [InlineData(100)] + public void Contains_WithNonAscii(int percentageWithNonAscii) + { + Random rand = new(42); + + HashSet expected = new(GetIEqualityComparer()); + for (int i = 0; i < 100; i++) + { + int stringLength = rand.Next(4, 30); + byte[] bytes = new byte[stringLength]; + rand.NextBytes(bytes); + string value = Convert.ToBase64String(bytes); + if (rand.Next(100) < percentageWithNonAscii) + { + value = value.Replace(value[rand.Next(value.Length)], '\u05D0'); + } + expected.Add(value); + } + + FrozenSet actual = expected.ToFrozenSet(GetIEqualityComparer()); + + Assert.All(expected, s => actual.Contains(s)); + } } public class FrozenSet_Generic_Tests_string_Default : FrozenSet_Generic_Tests_string