dotnet · ilonatommy · Jan 22, 2024 · Jan 22, 2024 · Jan 23, 2024 · Jan 24, 2024
diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs
@@ -16,8 +16,10 @@ public partial class CompareInfo
 
         private TextInfo thisTextInfo => _thisTextInfo ??= new CultureInfo(m_name).TextInfo;
 
-        private static bool LocalizedHashCodeSupportsCompareOptions(CompareOptions options) =>
-            options == CompareOptions.IgnoreCase || options == CompareOptions.None;
+        private bool LocalizedHashCodeSupportsCompareOptions(CompareOptions options) =>
+            (m_name.Split('-')[0] == "ja") ?
+                options == CompareOptions.IgnoreCase || options == CompareOptions.IgnoreKanaType :
+                options == CompareOptions.IgnoreCase || options == CompareOptions.None;
         private static void AssertHybridOnWasm(CompareOptions options)
         {
             Debug.Assert(!GlobalizationMode.Invariant);
@@ -129,40 +131,38 @@ private unsafe int JsIndexOfCore(ReadOnlySpan<char> source, ReadOnlySpan<char> t
             return idx;
         }
 
-        // there are chars that are ignored by ICU hashing algorithm but not ignored by invariant hashing
-        // Control: 1105 (out of 1105)
-        // Format: 697 (out of 731)
-        // OtherPunctuation: 6919 (out of 7004)
-        // SpaceSeparator: 289 (out of 289)
-        // OpenPunctuation: 1275 (out of 1343)
-        // ClosePunctuation: 1241 (out of 1309)
-        // DashPunctuation: 408 (out of 425)
-        // ConnectorPunctuation: 170 (out of 170)
-        // InitialQuotePunctuation: 204 (out of 204)
-        // FinalQuotePunctuation: 170 (out of 170)
-        // LineSeparator: 17 (out of 17)
-        // ParagraphSeparator: 17 (out of 17)
-        // OtherLetter: 34 (out of 784142)
-        // SpacingCombiningMark: 68 (out of 4420)
-        // ModifierLetter: 51 (out of 4012)
-        // EnclosingMark: 85 (out of 221)
-        // NonSpacingMark: 3281 (out of 18105)
-        // we can skip them all (~1027k chars) by checking for the remaining UnicodeCategories (~291k chars)
-        // skipping more characters than ICU would lead to hashes with smaller distribution and more collisions in hash tables
+        // there are chars that ignored by JS Equal function. ICU has its own set of ignored unicodes as well (5219 chars)
+        // Control: 65 (out of 1105)
+        // SpaceSeparator: 17 (out of 289)
+        // OtherPunctuation: 628 (out of 7004)
+        // OpenPunctuation: 79 (out of 1343)
+        // ClosePunctuation: 77 (out of 1309)
+        // DashPunctuation: 26 (out of 425)
+        // ConnectorPunctuation: 10 (out of 170)
+        // InitialQuotePunctuation: 12 (out of 204)
+        // Format: 170 (out of 731)
+        // FinalQuotePunctuation: 10 (out of 170)
+        // NonSpacingMark: 708 (out of 18105)
+        // EnclosingMark: 5 (out of 221)) // 0488, 0489, A670, A671, A672
+        // ModifierLetter: 3 (out of 4012)
+        // OtherLetter: 2 (out of 784142)
+        // SpacingCombiningMark: 12 (out of 4420)
+        // LineSeparator: 1 (out of 17)
+        // ParagraphSeparator: 1 (out of 17)
+        // total: 1826
+        // we can skip them by checking for the remaining UnicodeCategories
+        // skipping more characters than JS's localeCompare leads to hashes with smaller distribution and more collisions in hash tables
         // but it makes the behavior correct and consistent with locale-aware equals, which is acceptable tradeoff
         private static bool ShouldNotBeSkipped(UnicodeCategory category) =>
             category == UnicodeCategory.LowercaseLetter ||
             category == UnicodeCategory.UppercaseLetter ||
             category == UnicodeCategory.TitlecaseLetter ||
+            category == UnicodeCategory.CurrencySymbol ||
+            category == UnicodeCategory.DecimalDigitNumber ||
             category == UnicodeCategory.LetterNumber ||
-            category == UnicodeCategory.OtherNumber ||
-            category == UnicodeCategory.Surrogate ||
-            category == UnicodeCategory.PrivateUse ||
             category == UnicodeCategory.MathSymbol ||
-            category == UnicodeCategory.CurrencySymbol ||
-            category == UnicodeCategory.ModifierSymbol ||
-            category == UnicodeCategory.OtherSymbol ||
-            category == UnicodeCategory.OtherNotAssigned;
+            category == UnicodeCategory.ModifierSymbol;
+
 
         private ReadOnlySpan<char> SanitizeForInvariantHash(ReadOnlySpan<char> source, CompareOptions options)
         {

diff --git a/.../System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs b/.../System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs
@@ -74,42 +74,47 @@ private void TryAddToCustomDictionary(StringComparer comparer, string str1, stri
             }
         }
 
-        public static IEnumerable<object[]> CheckHashingOfSkippedChars_TestData()
+        public static IEnumerable<object[]> CharsIgnoredByEqualFunction()
         {
-            // one char from each ignored category that is skipped on ICU
-            yield return new object[] { '\u0008', s_invariantCompare }; // Control: BACKSPACE
-            yield return new object[] { '\u200B', s_invariantCompare }; // Format: ZERO WIDTH SPACE
-            yield return new object[] { '\u180A', s_invariantCompare }; // OtherPunctuation: MONGOLIAN NIRUGU
-            yield return new object[] { '\uFE73', s_invariantCompare }; // OtherLetter: THAI CHARACTER PAIYANNOI
-            yield return new object[] { '\u0F3E', s_invariantCompare }; // SpacingCombiningMark: "TIBETAN MARK GTER YIG MGO UM RTAGS GNYIS
-            yield return new object[] { '\u0640', s_invariantCompare }; // ModifierLetter: ARABIC TATWEEL
-            yield return new object[] { '\u0488', s_invariantCompare }; // EnclosingMark: COMBINING CYRILLIC HUNDRED THOUSANDS SIGN
-            yield return new object[] { '\u034F', s_invariantCompare }; // NonSpacingMark: DIAERESIS
-            CompareInfo thaiCmpInfo = new CultureInfo("th-TH").CompareInfo;
-            yield return new object[] { '\u0020', thaiCmpInfo }; // SpaceSeparator: SPACE
-            yield return new object[] { '\u0028', thaiCmpInfo }; // OpenPunctuation: LEFT PARENTHESIS
-            yield return new object[] { '\u007D', thaiCmpInfo }; // ClosePunctuation: RIGHT PARENTHESIS
-            yield return new object[] { '\u2013', thaiCmpInfo }; // DashPunctuation: EN DASH
-            yield return new object[] { '\u005F', thaiCmpInfo }; // ConnectorPunctuation: LOW LINE
-            yield return new object[] { '\u2018', thaiCmpInfo }; // InitialQuotePunctuation: LEFT SINGLE QUOTATION MARK
-            yield return new object[] { '\u2019', thaiCmpInfo }; // FinalQuotePunctuation: RIGHT SINGLE QUOTATION MARK
-            yield return new object[] { '\u2028', thaiCmpInfo }; // LineSeparator: LINE SEPARATOR
-            yield return new object[] { '\u2029', thaiCmpInfo }; // ParagraphSeparator: PARAGRAPH SEPARATOR
+            string str1 = "ab";
+            // browser supports 240 cultures (e.g. "en-US"), out of which 54 neutral cultures (e.g. "en")
+            CultureInfo[] cultures = CultureInfo.GetCultures(CultureTypes.NeutralCultures);
+            List<int> ignoredCodepoints = new();
+            foreach(var culture in cultures)
+            {
+                // japanese with None is not supported, JS always ignores Kana when localeCompare is used
+                CompareOptions options = culture.Name == "ja" ?
+                    CompareOptions.IgnoreKanaType :
+                    CompareOptions.None;
+                CompareInfo cmpInfo = culture.CompareInfo;
+                var hashCode1 = cmpInfo.GetHashCode(str1, options);
+                for(int codePoint = 0; codePoint < 0x10FFFF; codePoint++)
+                {
+                    char character = (char)codePoint;
+					string str2 = $"a{character}b";
+                    // in HybridGlobalization CompareInfo.Compare uses JS's localeCompare()
+                    if (cmpInfo.Compare(str1, str2, options) == 0)
+                    {
+                        // do not test same codepoint with different cultures
+                        if (!ignoredCodepoints.Contains(codePoint))
+                        {
+                            ignoredCodepoints.Add(codePoint);
+                            yield return new object[] { hashCode1, str2, cmpInfo, options };
+                        }
+                    }
+                }
+            }
         }
 
-        [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))]
-        [MemberData(nameof(CheckHashingOfSkippedChars_TestData))]
-        public void CheckHashingOfSkippedChars(char character, CompareInfo cmpInfo)
+        // In non-hybrid we have hashing and Equal function from the same source - ICU4C, so this test is not necessary
+        // Hybrid has Equal function from JS and hashing from managed invariant algorithm, they might start diverging at some point
+        [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsHybridGlobalizationOnBrowser))]
+        [MemberData(nameof(CharsIgnoredByEqualFunction))]
+        public void CheckHashingOfSkippedChars(int hashCode1, string str2, CompareInfo cmpInfo, CompareOptions options)
         {
-            string str1 = $"a{character}b";
-            string str2 = "ab";
-            CompareOptions options = CompareOptions.None;
-            var hashCode1 = cmpInfo.GetHashCode(str1, options);
             var hashCode2 = cmpInfo.GetHashCode(str2, options);
             bool areHashCodesEqual = hashCode1 == hashCode2;
             Assert.True(areHashCodesEqual);
-            StringComparer stringComparer = new CustomComparer(cmpInfo, options);
-            TryAddToCustomDictionary(stringComparer, str1, str2, areHashCodesEqual);
         }
 
         public static IEnumerable<object[]> GetHashCodeTestData => new[]

@@ -31,6 +31,8 @@ public StringTask()
                 new StringEndsWithMeasurement(),
                 new StringIndexOfMeasurement(),
                 new StringLastIndexOfMeasurement(),
+                new StringHashCodeNoneMeasurement(),
+                new StringHashCodeIgnoreCaseMeasurement(),
             };
         }
 
@@ -288,5 +290,33 @@ public override Task BeforeBatch()
             public override string Name => "String LastIndexOf";
             public override void RunStep() => compareInfo.LastIndexOf(str, needleSameAsStrStart, CompareOptions.None);
         }
+
+        public class StringHashCodeNoneMeasurement : StringMeasurement
+        {
+            protected CompareInfo compareInfo;
+
+            public override Task BeforeBatch()
+            {
+                compareInfo = new CultureInfo("th-TH").CompareInfo;
+                InitializeString();
+                return Task.CompletedTask;
+            }
+            public override string Name => "String HashCode None";
+            public override void RunStep() => compareInfo.GetHashCode(str, CompareOptions.None);
+        }
+
+        public class StringHashCodeIgnoreCaseMeasurement : StringMeasurement
+        {
+            protected CompareInfo compareInfo;
+
+            public override Task BeforeBatch()
+            {
+                compareInfo = new CultureInfo("th-TH").CompareInfo;
+                InitializeString();
+                return Task.CompletedTask;
+            }
+            public override string Name => "String HashCode IgnoreCase";
+            public override void RunStep() => compareInfo.GetHashCode(str, CompareOptions.IgnoreCase);
+        }
     }
 }