diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs index 1d265dc70450a8..13dcd4784fecab 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs @@ -16,8 +16,10 @@ public partial class CompareInfo private TextInfo thisTextInfo => _thisTextInfo ??= new CultureInfo(m_name).TextInfo; - private static bool LocalizedHashCodeSupportsCompareOptions(CompareOptions options) => - options == CompareOptions.IgnoreCase || options == CompareOptions.None; + private bool LocalizedHashCodeSupportsCompareOptions(CompareOptions options) => + (m_name.Split('-')[0] == "ja") ? + options == CompareOptions.IgnoreCase || options == CompareOptions.IgnoreKanaType : + options == CompareOptions.IgnoreCase || options == CompareOptions.None; private static void AssertHybridOnWasm(CompareOptions options) { Debug.Assert(!GlobalizationMode.Invariant); @@ -129,40 +131,38 @@ private unsafe int JsIndexOfCore(ReadOnlySpan source, ReadOnlySpan t return idx; } - // there are chars that are ignored by ICU hashing algorithm but not ignored by invariant hashing - // Control: 1105 (out of 1105) - // Format: 697 (out of 731) - // OtherPunctuation: 6919 (out of 7004) - // SpaceSeparator: 289 (out of 289) - // OpenPunctuation: 1275 (out of 1343) - // ClosePunctuation: 1241 (out of 1309) - // DashPunctuation: 408 (out of 425) - // ConnectorPunctuation: 170 (out of 170) - // InitialQuotePunctuation: 204 (out of 204) - // FinalQuotePunctuation: 170 (out of 170) - // LineSeparator: 17 (out of 17) - // ParagraphSeparator: 17 (out of 17) - // OtherLetter: 34 (out of 784142) - // SpacingCombiningMark: 68 (out of 4420) - // ModifierLetter: 51 (out of 4012) - // EnclosingMark: 85 (out of 221) - // NonSpacingMark: 3281 (out of 18105) - // we can skip them all (~1027k chars) by checking for the remaining UnicodeCategories (~291k chars) - // skipping more characters than ICU would lead to hashes with smaller distribution and more collisions in hash tables + // there are chars that ignored by JS Equal function. ICU has its own set of ignored unicodes as well (5219 chars) + // Control: 65 (out of 1105) + // SpaceSeparator: 17 (out of 289) + // OtherPunctuation: 628 (out of 7004) + // OpenPunctuation: 79 (out of 1343) + // ClosePunctuation: 77 (out of 1309) + // DashPunctuation: 26 (out of 425) + // ConnectorPunctuation: 10 (out of 170) + // InitialQuotePunctuation: 12 (out of 204) + // Format: 170 (out of 731) + // FinalQuotePunctuation: 10 (out of 170) + // NonSpacingMark: 708 (out of 18105) + // EnclosingMark: 5 (out of 221)) // 0488, 0489, A670, A671, A672 + // ModifierLetter: 3 (out of 4012) + // OtherLetter: 2 (out of 784142) + // SpacingCombiningMark: 12 (out of 4420) + // LineSeparator: 1 (out of 17) + // ParagraphSeparator: 1 (out of 17) + // total: 1826 + // we can skip them by checking for the remaining UnicodeCategories + // skipping more characters than JS's localeCompare leads to hashes with smaller distribution and more collisions in hash tables // but it makes the behavior correct and consistent with locale-aware equals, which is acceptable tradeoff private static bool ShouldNotBeSkipped(UnicodeCategory category) => category == UnicodeCategory.LowercaseLetter || category == UnicodeCategory.UppercaseLetter || category == UnicodeCategory.TitlecaseLetter || + category == UnicodeCategory.CurrencySymbol || + category == UnicodeCategory.DecimalDigitNumber || category == UnicodeCategory.LetterNumber || - category == UnicodeCategory.OtherNumber || - category == UnicodeCategory.Surrogate || - category == UnicodeCategory.PrivateUse || category == UnicodeCategory.MathSymbol || - category == UnicodeCategory.CurrencySymbol || - category == UnicodeCategory.ModifierSymbol || - category == UnicodeCategory.OtherSymbol || - category == UnicodeCategory.OtherNotAssigned; + category == UnicodeCategory.ModifierSymbol; + private ReadOnlySpan SanitizeForInvariantHash(ReadOnlySpan source, CompareOptions options) { diff --git a/src/libraries/System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs b/src/libraries/System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs index 1ad0639711d96b..696d8d47622e60 100644 --- a/src/libraries/System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs +++ b/src/libraries/System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs @@ -74,42 +74,47 @@ private void TryAddToCustomDictionary(StringComparer comparer, string str1, stri } } - public static IEnumerable CheckHashingOfSkippedChars_TestData() + public static IEnumerable CharsIgnoredByEqualFunction() { - // one char from each ignored category that is skipped on ICU - yield return new object[] { '\u0008', s_invariantCompare }; // Control: BACKSPACE - yield return new object[] { '\u200B', s_invariantCompare }; // Format: ZERO WIDTH SPACE - yield return new object[] { '\u180A', s_invariantCompare }; // OtherPunctuation: MONGOLIAN NIRUGU - yield return new object[] { '\uFE73', s_invariantCompare }; // OtherLetter: THAI CHARACTER PAIYANNOI - yield return new object[] { '\u0F3E', s_invariantCompare }; // SpacingCombiningMark: "TIBETAN MARK GTER YIG MGO UM RTAGS GNYIS - yield return new object[] { '\u0640', s_invariantCompare }; // ModifierLetter: ARABIC TATWEEL - yield return new object[] { '\u0488', s_invariantCompare }; // EnclosingMark: COMBINING CYRILLIC HUNDRED THOUSANDS SIGN - yield return new object[] { '\u034F', s_invariantCompare }; // NonSpacingMark: DIAERESIS - CompareInfo thaiCmpInfo = new CultureInfo("th-TH").CompareInfo; - yield return new object[] { '\u0020', thaiCmpInfo }; // SpaceSeparator: SPACE - yield return new object[] { '\u0028', thaiCmpInfo }; // OpenPunctuation: LEFT PARENTHESIS - yield return new object[] { '\u007D', thaiCmpInfo }; // ClosePunctuation: RIGHT PARENTHESIS - yield return new object[] { '\u2013', thaiCmpInfo }; // DashPunctuation: EN DASH - yield return new object[] { '\u005F', thaiCmpInfo }; // ConnectorPunctuation: LOW LINE - yield return new object[] { '\u2018', thaiCmpInfo }; // InitialQuotePunctuation: LEFT SINGLE QUOTATION MARK - yield return new object[] { '\u2019', thaiCmpInfo }; // FinalQuotePunctuation: RIGHT SINGLE QUOTATION MARK - yield return new object[] { '\u2028', thaiCmpInfo }; // LineSeparator: LINE SEPARATOR - yield return new object[] { '\u2029', thaiCmpInfo }; // ParagraphSeparator: PARAGRAPH SEPARATOR + string str1 = "ab"; + // browser supports 240 cultures (e.g. "en-US"), out of which 54 neutral cultures (e.g. "en") + CultureInfo[] cultures = CultureInfo.GetCultures(CultureTypes.NeutralCultures); + List ignoredCodepoints = new(); + foreach(var culture in cultures) + { + // japanese with None is not supported, JS always ignores Kana when localeCompare is used + CompareOptions options = culture.Name == "ja" ? + CompareOptions.IgnoreKanaType : + CompareOptions.None; + CompareInfo cmpInfo = culture.CompareInfo; + var hashCode1 = cmpInfo.GetHashCode(str1, options); + for(int codePoint = 0; codePoint < 0x10FFFF; codePoint++) + { + char character = (char)codePoint; + string str2 = $"a{character}b"; + // in HybridGlobalization CompareInfo.Compare uses JS's localeCompare() + if (cmpInfo.Compare(str1, str2, options) == 0) + { + // do not test same codepoint with different cultures + if (!ignoredCodepoints.Contains(codePoint)) + { + ignoredCodepoints.Add(codePoint); + yield return new object[] { hashCode1, str2, cmpInfo, options }; + } + } + } + } } - [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] - [MemberData(nameof(CheckHashingOfSkippedChars_TestData))] - public void CheckHashingOfSkippedChars(char character, CompareInfo cmpInfo) + // In non-hybrid we have hashing and Equal function from the same source - ICU4C, so this test is not necessary + // Hybrid has Equal function from JS and hashing from managed invariant algorithm, they might start diverging at some point + [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsHybridGlobalizationOnBrowser))] + [MemberData(nameof(CharsIgnoredByEqualFunction))] + public void CheckHashingOfSkippedChars(int hashCode1, string str2, CompareInfo cmpInfo, CompareOptions options) { - string str1 = $"a{character}b"; - string str2 = "ab"; - CompareOptions options = CompareOptions.None; - var hashCode1 = cmpInfo.GetHashCode(str1, options); var hashCode2 = cmpInfo.GetHashCode(str2, options); bool areHashCodesEqual = hashCode1 == hashCode2; Assert.True(areHashCodesEqual); - StringComparer stringComparer = new CustomComparer(cmpInfo, options); - TryAddToCustomDictionary(stringComparer, str1, str2, areHashCodesEqual); } public static IEnumerable GetHashCodeTestData => new[] diff --git a/src/mono/sample/wasm/browser-bench/String.cs b/src/mono/sample/wasm/browser-bench/String.cs index c00448632fc16d..f50a3929c35a13 100644 --- a/src/mono/sample/wasm/browser-bench/String.cs +++ b/src/mono/sample/wasm/browser-bench/String.cs @@ -31,6 +31,8 @@ public StringTask() new StringEndsWithMeasurement(), new StringIndexOfMeasurement(), new StringLastIndexOfMeasurement(), + new StringHashCodeNoneMeasurement(), + new StringHashCodeIgnoreCaseMeasurement(), }; } @@ -288,5 +290,33 @@ public override Task BeforeBatch() public override string Name => "String LastIndexOf"; public override void RunStep() => compareInfo.LastIndexOf(str, needleSameAsStrStart, CompareOptions.None); } + + public class StringHashCodeNoneMeasurement : StringMeasurement + { + protected CompareInfo compareInfo; + + public override Task BeforeBatch() + { + compareInfo = new CultureInfo("th-TH").CompareInfo; + InitializeString(); + return Task.CompletedTask; + } + public override string Name => "String HashCode None"; + public override void RunStep() => compareInfo.GetHashCode(str, CompareOptions.None); + } + + public class StringHashCodeIgnoreCaseMeasurement : StringMeasurement + { + protected CompareInfo compareInfo; + + public override Task BeforeBatch() + { + compareInfo = new CultureInfo("th-TH").CompareInfo; + InitializeString(); + return Task.CompletedTask; + } + public override string Name => "String HashCode IgnoreCase"; + public override void RunStep() => compareInfo.GetHashCode(str, CompareOptions.IgnoreCase); + } } }