diff --git a/src/libraries/System.Collections.Immutable/src/System.Collections.Immutable.csproj b/src/libraries/System.Collections.Immutable/src/System.Collections.Immutable.csproj index 2f270f3d451b89..4c3a418894711b 100644 --- a/src/libraries/System.Collections.Immutable/src/System.Collections.Immutable.csproj +++ b/src/libraries/System.Collections.Immutable/src/System.Collections.Immutable.csproj @@ -10,6 +10,8 @@ The System.Collections.Immutable library is built-in as part of the shared frame + + diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs index da050f12a83cb5..79aa510156aa3f 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs @@ -2,11 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Buffers; +using System.Collections.Frozen.String.SubstringEquality; using System.Collections.Generic; using System.Diagnostics; -#if !NET8_0_OR_GREATER using System.Runtime.CompilerServices; -#endif namespace System.Collections.Frozen { @@ -33,84 +32,83 @@ public static AnalysisResults Analyze( { Debug.Assert(!uniqueStrings.IsEmpty); - // Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer. 
- AnalysisResults results; - if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results)) + if (minLength > 0) { - results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan()); - } + const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit...it's not worth the increase in algorithmic complexity to analyze longer substrings + int uniqueStringsLength = uniqueStrings.Length; - return results; - } - - /// Try to find the minimal unique substring index/length to use for comparisons. - private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results) - { - const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings + // Sufficient uniqueness factor of 95% is good enough. + // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad. + int acceptableNonUniqueCount = uniqueStringsLength / 20; - SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer(); - HashSet set = new HashSet( -#if NET6_0_OR_GREATER - uniqueStrings.Length, -#endif - comparer); + ISubstringEqualityComparer leftComparer = ignoreCase ? new LeftSubstringCaseInsensitiveComparer() : new LeftSubstringOrdinalComparer(); + HashSet leftSet = MakeHashSet(uniqueStringsLength, leftComparer); - // For each substring length... - int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit); - for (int count = 1; count <= maxSubstringLength; count++) - { - comparer.IsLeft = true; - comparer.Count = count; + // we lazily spin up the right comparators when/if needed + ISubstringEqualityComparer? rightComparer = null; + HashSet? rightSet = null; - // For each index, get a uniqueness factor for the left-justified substrings. 
- // If any is above our threshold, we're done. - for (int index = 0; index <= minLength - count; index++) + // For each substring length...preferring the shortest length that provides + // enough uniqueness + int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit); + for (int count = 1; count <= maxSubstringLength; count++) { - comparer.Index = index; + leftComparer.Count = count; - if (HasSufficientUniquenessFactor(set, uniqueStrings)) + // For each index, get a uniqueness factor for the left-justified substrings. + // If any is above our threshold, we're done. + for (int index = 0; index <= minLength - count; index++) { - results = CreateAnalysisResults( - uniqueStrings, ignoreCase, minLength, maxLength, index, count, - static (string s, int index, int count) => s.AsSpan(index, count)); - return true; - } - } + leftComparer.Index = index; - // There were no left-justified substrings of this length available. - // If all of the strings are of the same length, then just checking left-justification is sufficient. - // But if any strings are of different lengths, then we'll get different alignments for left- vs - // right-justified substrings, and so we also check right-justification. - if (minLength != maxLength) - { - // toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it) - comparer.IsLeft = false; + if (HasSufficientUniquenessFactor(leftSet, uniqueStrings, acceptableNonUniqueCount)) + { + return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, leftComparer); + } + } - // For each index, get a uniqueness factor for the right-justified substrings. - // If any is above our threshold, we're done. - for (int index = 0; index <= minLength - count; index++) + // There were no left-justified substrings of this length available. + // If all of the strings are of the same length, then just checking left-justification is sufficient. 
+ // But if any strings are of different lengths, then we'll get different alignments for left- vs + // right-justified substrings, and so we also check right-justification. + if (minLength != maxLength) { - // Get a uniqueness factor for the right-justified substrings. - // If it's above our threshold, we're done. - comparer.Index = -index - count; - if (HasSufficientUniquenessFactor(set, uniqueStrings)) + rightComparer ??= ignoreCase ? new RightSubstringCaseInsensitiveComparer() : new RightSubstringOrdinalComparer(); + rightSet ??= MakeHashSet(uniqueStringsLength, rightComparer); + + // when Index is negative, we're offsetting from the right, ensure we're at least + // far enough from the right that we have count characters available + rightComparer!.Count = count; + rightComparer!.Index = -count; + + // For each index, get a uniqueness factor for the right-justified substrings. + // If any is above our threshold, we're done. + for (int offset = 0; offset <= minLength - count; offset++, rightComparer!.Index--) { - results = CreateAnalysisResults( - uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count, - static (string s, int index, int count) => s.AsSpan(s.Length + index, count)); - return true; + if (HasSufficientUniquenessFactor(rightSet!, uniqueStrings, acceptableNonUniqueCount)) + { + return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, rightComparer); + } } } } } - // Could not find a substring index/length that was good enough. - results = default; - return false; + // Could not find a substring index/length that was good enough, use the entire string. 
+ return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, s_FullComparer); + } + + private static HashSet MakeHashSet(int length, IEqualityComparer comparer) + { + return new HashSet( +#if NET6_0_OR_GREATER + length, +#endif + comparer); } private static AnalysisResults CreateAnalysisResults( - ReadOnlySpan uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getSubstringSpan) + ReadOnlySpan uniqueStrings, bool ignoreCase, int minLength, int maxLength, ISubstringEqualityComparer comparer) { // Start off by assuming all strings are ASCII bool allAsciiIfIgnoreCase = true; @@ -129,7 +127,7 @@ private static AnalysisResults CreateAnalysisResults( foreach (string s in uniqueStrings) { // Get the span for the substring. - ReadOnlySpan substring = getSubstringSpan(s, index, count); + ReadOnlySpan substring = comparer.Slice(s); // If the substring isn't ASCII, bail out to return the results. if (!IsAllAscii(substring)) @@ -155,11 +153,9 @@ private static AnalysisResults CreateAnalysisResults( } // Return the analysis results. 
- return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength); + return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, comparer.Index, comparer.Count, minLength, maxLength); } - private delegate ReadOnlySpan GetSpan(string s, int index, int count); - internal static unsafe bool IsAllAscii(ReadOnlySpan s) { #if NET8_0_OR_GREATER @@ -202,7 +198,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan s) #if NET8_0_OR_GREATER private static readonly SearchValues s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); #endif - private static bool ContainsAnyLetters(ReadOnlySpan s) + internal static bool ContainsAnyLetters(ReadOnlySpan s) { Debug.Assert(IsAllAscii(s)); @@ -221,18 +217,13 @@ private static bool ContainsAnyLetters(ReadOnlySpan s) #endif } - private static bool HasSufficientUniquenessFactor(HashSet set, ReadOnlySpan uniqueStrings) + internal static bool HasSufficientUniquenessFactor(HashSet set, ReadOnlySpan uniqueStrings, int acceptableNonUniqueCount) { - set.Clear(); - - // Sufficient uniqueness factor of 95% is good enough. - // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad. - int acceptableNonUniqueCount = uniqueStrings.Length / 20; - foreach (string s in uniqueStrings) { if (!set.Add(s) && --acceptableNonUniqueCount < 0) { + set.Clear(); return false; } } @@ -263,25 +254,6 @@ public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex public bool RightJustifiedSubstring => HashIndex < 0; } - private abstract class SubstringComparer : IEqualityComparer - { - public int Index; - public int Count; - public bool IsLeft; - public abstract bool Equals(string? x, string? y); - public abstract int GetHashCode(string s); - } - - private sealed class JustifiedSubstringComparer : SubstringComparer - { - public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? 
Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count)); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); - } - - private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer - { - public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); - } + private static readonly FullStringEqualityComparer s_FullComparer = new FullStringEqualityComparer(); } } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/SubstringEquality/SubstringEqualityComparerBase.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/SubstringEquality/SubstringEqualityComparerBase.cs new file mode 100644 index 00000000000000..92047b96707df6 --- /dev/null +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/SubstringEquality/SubstringEqualityComparerBase.cs @@ -0,0 +1,76 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Runtime.CompilerServices; + +namespace System.Collections.Frozen.String.SubstringEquality +{ + internal interface ISubstringEqualityComparer : IEqualityComparer + { + /// + /// The index at which to begin this slice + /// + /// Offset from the left side (if zero or positive) or right side (if negative) + public int Index { get; set; } + + /// + /// The number of characters in the slice. + /// + public int Count { get; set; } + + /// + /// Creates a new readonly span over the portion of the target string. 
+ /// + /// The target string. + /// is null. + /// + /// Thrown when the specified Index or Count is not in range. + /// + public abstract ReadOnlySpan Slice(string s); + } + + internal abstract class SubstringEqualityComparerBase : ISubstringEqualityComparer + where TThisWrapper : struct, SubstringEqualityComparerBase.IGenericSpecializedWrapper + { + /// A wrapper around this that enables access to important members without making virtual calls. + private readonly TThisWrapper _this; + + protected SubstringEqualityComparerBase() + { + _this = default; + _this.Store(this); + } + + /// + public int Index { get; set; } + /// + public int Count { get; set; } + + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ReadOnlySpan Slice(string s) => _this.Slice(s); + + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool Equals(string? x, string? y) => _this.Equals(x, y); + + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetHashCode(string s) => _this.GetHashCode(s); + + /// Used to enable generic specialization with reference types. + /// + /// To avoid Slice, Equals, and GetHashCode each incurring virtual dispatch to the derived type, the derived + /// type hands down a struct wrapper through which all calls are performed. This base + /// class uses that generic struct wrapper to specialize and de-virtualize. + /// + internal interface IGenericSpecializedWrapper + { + void Store(ISubstringEqualityComparer @this); + public ReadOnlySpan Slice(string s); + public bool Equals(string? x, string? 
y); + public int GetHashCode(string s); + } + } +} diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/SubstringEquality/SubstringEqualityComparers.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/SubstringEquality/SubstringEqualityComparers.cs new file mode 100644 index 00000000000000..302482d37343ce --- /dev/null +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/SubstringEquality/SubstringEqualityComparers.cs @@ -0,0 +1,102 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Runtime.CompilerServices; + +namespace System.Collections.Frozen.String.SubstringEquality +{ + /// + internal sealed class LeftSubstringOrdinalComparer : SubstringEqualityComparerBase + { + internal struct GSW : IGenericSpecializedWrapper + { + private LeftSubstringOrdinalComparer _this; + public void Store(ISubstringEqualityComparer @this) => _this = (LeftSubstringOrdinalComparer)@this; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ReadOnlySpan Slice(string s) => s.AsSpan(_this.Index, _this.Count); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool Equals(string? x, string? 
y) => Slice(x!).SequenceEqual(Slice(y!)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(Slice(s)); + } + } + + /// + internal sealed class RightSubstringOrdinalComparer : SubstringEqualityComparerBase + { + internal struct GSW : IGenericSpecializedWrapper + { + private RightSubstringOrdinalComparer _this; + public void Store(ISubstringEqualityComparer @this) => _this = (RightSubstringOrdinalComparer)@this; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ReadOnlySpan Slice(string s) => s.AsSpan(s.Length + _this.Index, _this.Count); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool Equals(string? x, string? y) => Slice(x!).SequenceEqual(Slice(y!)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(Slice(s)); + } + } + + /// + internal sealed class LeftSubstringCaseInsensitiveComparer : SubstringEqualityComparerBase + { + internal struct GSW : IGenericSpecializedWrapper + { + private LeftSubstringCaseInsensitiveComparer _this; + public void Store(ISubstringEqualityComparer @this) => _this = (LeftSubstringCaseInsensitiveComparer)@this; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ReadOnlySpan Slice(string s) => s.AsSpan(_this.Index, _this.Count); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool Equals(string? x, string? 
y) => Slice(x!).Equals(Slice(y!), StringComparison.OrdinalIgnoreCase); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(Slice(s)); + } + } + + internal sealed class RightSubstringCaseInsensitiveComparer : SubstringEqualityComparerBase + { + internal struct GSW : IGenericSpecializedWrapper + { + private RightSubstringCaseInsensitiveComparer _this; + public void Store(ISubstringEqualityComparer @this) => _this = (RightSubstringCaseInsensitiveComparer)@this; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ReadOnlySpan Slice(string s) => s.AsSpan(s.Length + _this.Index, _this.Count); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool Equals(string? x, string? y) => Slice(x!).Equals(Slice(y!), StringComparison.OrdinalIgnoreCase); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(Slice(s)); + } + } + + internal sealed class FullStringEqualityComparer : SubstringEqualityComparerBase + { + internal struct GSW : IGenericSpecializedWrapper + { + public void Store(ISubstringEqualityComparer @this) + { + // this one doesn't do slicing, so no wrapper or state + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ReadOnlySpan Slice(string s) => s.AsSpan(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool Equals(string? x, string? 
y) => x!.Equals(y!); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetHashCode(string s) => s.GetHashCode(); + } + } +} diff --git a/src/libraries/System.Collections.Immutable/tests/Frozen/KeyAnalyzerTests.cs b/src/libraries/System.Collections.Immutable/tests/Frozen/KeyAnalyzerTests.cs index 6ddec56b6005e8..21e612c90ca953 100644 --- a/src/libraries/System.Collections.Immutable/tests/Frozen/KeyAnalyzerTests.cs +++ b/src/libraries/System.Collections.Immutable/tests/Frozen/KeyAnalyzerTests.cs @@ -1,7 +1,8 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System; +using System.Collections.Generic; +using System.Linq; using Xunit; namespace System.Collections.Frozen.Tests @@ -212,5 +213,70 @@ public static void IsAllAscii() Assert.True(KeyAnalyzer.IsAllAscii("abcdefghij".AsSpan())); Assert.False(KeyAnalyzer.IsAllAscii("abcdéfghij".AsSpan())); } + + [Fact] + public static void ContainsAnyLetters() + { + Assert.True(KeyAnalyzer.ContainsAnyLetters("abc".AsSpan())); + Assert.True(KeyAnalyzer.ContainsAnyLetters("ABC".AsSpan())); + Assert.False(KeyAnalyzer.ContainsAnyLetters("123".AsSpan())); + // note, must only pass ASCII to ContainsAnyLetters, anything else is a Debug.Assert + // and it would not have been called in the actual implementation + } + + [Fact] + public static void HasSufficientUniquenessFactor() + { + HashSet set = new HashSet(StringComparer.Ordinal); + + set.Clear(); + Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "c" }, 0)); + Assert.Equal(3, set.Count); + + set.Clear(); + Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "a" }, 1)); + Assert.Equal(2, set.Count); // set should only have the non-collided ones + + set.Clear(); + Assert.False(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "ab", "aa" }, 0)); + Assert.Equal(0, set.Count); // if we failed it should empty the 
set + } + + [Fact] + public static void HasSufficientUniquenessFactorInsensitive() + { + HashSet set = new HashSet(StringComparer.OrdinalIgnoreCase); + + set.Clear(); + Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "B", "c" }, 0)); + Assert.Equal(3, set.Count); + + set.Clear(); + Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "AA" }, 1)); + Assert.Equal(1, set.Count); // set should only have the non-collided ones + + set.Clear(); + Assert.False(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "AA" }, 0)); + Assert.Equal(0, set.Count); // if we failed it should empty the set + } + + // reuse the typical data declared in the FrozenFromKnownValuesTests + public static IEnumerable TypicalData() => FrozenFromKnownValuesTests.StringStringData(); + + [Theory] + [MemberData(nameof(TypicalData))] + public static void HasSufficientUniquenessKnownData(Dictionary source) + { + string[] keys = source.Keys.ToArray(); + HashSet set = new HashSet(source.Comparer); + + int allowedCollisions = keys.Length / 20; + bool passable = KeyAnalyzer.HasSufficientUniquenessFactor(set, keys.AsSpan(), allowedCollisions); + + if (passable) + Assert.InRange(set.Count, keys.Length - allowedCollisions, keys.Length); + else + Assert.Equal(0, set.Count); + } } }