Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[browser] HybridGlobalization correct HashCode ranges of skipped unicodes #97351

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ public partial class CompareInfo

private TextInfo thisTextInfo => _thisTextInfo ??= new CultureInfo(m_name).TextInfo;

private static bool LocalizedHashCodeSupportsCompareOptions(CompareOptions options) =>
options == CompareOptions.IgnoreCase || options == CompareOptions.None;
private bool LocalizedHashCodeSupportsCompareOptions(CompareOptions options) =>
(m_name.Split('-')[0] == "ja") ?
options == CompareOptions.IgnoreCase || options == CompareOptions.IgnoreKanaType :
options == CompareOptions.IgnoreCase || options == CompareOptions.None;
private static void AssertHybridOnWasm(CompareOptions options)
{
Debug.Assert(!GlobalizationMode.Invariant);
Expand Down Expand Up @@ -129,40 +131,38 @@ private unsafe int JsIndexOfCore(ReadOnlySpan<char> source, ReadOnlySpan<char> t
return idx;
}

// there are chars that are ignored by ICU hashing algorithm but not ignored by invariant hashing
// Control: 1105 (out of 1105)
// Format: 697 (out of 731)
// OtherPunctuation: 6919 (out of 7004)
// SpaceSeparator: 289 (out of 289)
// OpenPunctuation: 1275 (out of 1343)
// ClosePunctuation: 1241 (out of 1309)
// DashPunctuation: 408 (out of 425)
// ConnectorPunctuation: 170 (out of 170)
// InitialQuotePunctuation: 204 (out of 204)
// FinalQuotePunctuation: 170 (out of 170)
// LineSeparator: 17 (out of 17)
// ParagraphSeparator: 17 (out of 17)
// OtherLetter: 34 (out of 784142)
// SpacingCombiningMark: 68 (out of 4420)
// ModifierLetter: 51 (out of 4012)
// EnclosingMark: 85 (out of 221)
// NonSpacingMark: 3281 (out of 18105)
// we can skip them all (~1027k chars) by checking for the remaining UnicodeCategories (~291k chars)
// skipping more characters than ICU would lead to hashes with smaller distribution and more collisions in hash tables
// there are chars that ignored by JS Equal function. ICU has its own set of ignored unicodes as well (5219 chars)
// Control: 65 (out of 1105)
// SpaceSeparator: 17 (out of 289)
// OtherPunctuation: 628 (out of 7004)
// OpenPunctuation: 79 (out of 1343)
// ClosePunctuation: 77 (out of 1309)
// DashPunctuation: 26 (out of 425)
// ConnectorPunctuation: 10 (out of 170)
// InitialQuotePunctuation: 12 (out of 204)
// Format: 170 (out of 731)
// FinalQuotePunctuation: 10 (out of 170)
// NonSpacingMark: 708 (out of 18105)
// EnclosingMark: 5 (out of 221)) // 0488, 0489, A670, A671, A672
// ModifierLetter: 3 (out of 4012)
// OtherLetter: 2 (out of 784142)
// SpacingCombiningMark: 12 (out of 4420)
// LineSeparator: 1 (out of 17)
// ParagraphSeparator: 1 (out of 17)
// total: 1826
// we can skip them by checking for the remaining UnicodeCategories
// skipping more characters than JS's localeCompare leads to hashes with smaller distribution and more collisions in hash tables
// but it makes the behavior correct and consistent with locale-aware equals, which is acceptable tradeoff
private static bool ShouldNotBeSkipped(UnicodeCategory category) =>
category == UnicodeCategory.LowercaseLetter ||
category == UnicodeCategory.UppercaseLetter ||
category == UnicodeCategory.TitlecaseLetter ||
category == UnicodeCategory.CurrencySymbol ||
category == UnicodeCategory.DecimalDigitNumber ||
category == UnicodeCategory.LetterNumber ||
category == UnicodeCategory.OtherNumber ||
category == UnicodeCategory.Surrogate ||
category == UnicodeCategory.PrivateUse ||
category == UnicodeCategory.MathSymbol ||
category == UnicodeCategory.CurrencySymbol ||
category == UnicodeCategory.ModifierSymbol ||
category == UnicodeCategory.OtherSymbol ||
category == UnicodeCategory.OtherNotAssigned;
category == UnicodeCategory.ModifierSymbol;


private ReadOnlySpan<char> SanitizeForInvariantHash(ReadOnlySpan<char> source, CompareOptions options)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,42 +74,47 @@ private void TryAddToCustomDictionary(StringComparer comparer, string str1, stri
}
}

public static IEnumerable<object[]> CheckHashingOfSkippedChars_TestData()
public static IEnumerable<object[]> CharsIgnoredByEqualFunction()
{
// one char from each ignored category that is skipped on ICU
yield return new object[] { '\u0008', s_invariantCompare }; // Control: BACKSPACE
yield return new object[] { '\u200B', s_invariantCompare }; // Format: ZERO WIDTH SPACE
yield return new object[] { '\u180A', s_invariantCompare }; // OtherPunctuation: MONGOLIAN NIRUGU
yield return new object[] { '\uFE73', s_invariantCompare }; // OtherLetter: THAI CHARACTER PAIYANNOI
yield return new object[] { '\u0F3E', s_invariantCompare }; // SpacingCombiningMark: "TIBETAN MARK GTER YIG MGO UM RTAGS GNYIS
yield return new object[] { '\u0640', s_invariantCompare }; // ModifierLetter: ARABIC TATWEEL
yield return new object[] { '\u0488', s_invariantCompare }; // EnclosingMark: COMBINING CYRILLIC HUNDRED THOUSANDS SIGN
yield return new object[] { '\u034F', s_invariantCompare }; // NonSpacingMark: DIAERESIS
CompareInfo thaiCmpInfo = new CultureInfo("th-TH").CompareInfo;
yield return new object[] { '\u0020', thaiCmpInfo }; // SpaceSeparator: SPACE
yield return new object[] { '\u0028', thaiCmpInfo }; // OpenPunctuation: LEFT PARENTHESIS
yield return new object[] { '\u007D', thaiCmpInfo }; // ClosePunctuation: RIGHT PARENTHESIS
yield return new object[] { '\u2013', thaiCmpInfo }; // DashPunctuation: EN DASH
yield return new object[] { '\u005F', thaiCmpInfo }; // ConnectorPunctuation: LOW LINE
yield return new object[] { '\u2018', thaiCmpInfo }; // InitialQuotePunctuation: LEFT SINGLE QUOTATION MARK
yield return new object[] { '\u2019', thaiCmpInfo }; // FinalQuotePunctuation: RIGHT SINGLE QUOTATION MARK
yield return new object[] { '\u2028', thaiCmpInfo }; // LineSeparator: LINE SEPARATOR
yield return new object[] { '\u2029', thaiCmpInfo }; // ParagraphSeparator: PARAGRAPH SEPARATOR
string str1 = "ab";
// browser supports 240 cultures (e.g. "en-US"), out of which 54 neutral cultures (e.g. "en")
CultureInfo[] cultures = CultureInfo.GetCultures(CultureTypes.NeutralCultures);
List<int> ignoredCodepoints = new();
foreach(var culture in cultures)
{
// japanese with None is not supported, JS always ignores Kana when localeCompare is used
CompareOptions options = culture.Name == "ja" ?
CompareOptions.IgnoreKanaType :
CompareOptions.None;
CompareInfo cmpInfo = culture.CompareInfo;
var hashCode1 = cmpInfo.GetHashCode(str1, options);
for(int codePoint = 0; codePoint < 0x10FFFF; codePoint++)
{
char character = (char)codePoint;
string str2 = $"a{character}b";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just silly idea: is it possible that codepoints are skipped only when are before or after another specific code point ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would expect only surrogates to work this way. This cast might not work for surrogates, though (they are 2 chars, not one). I need to check it, thanks

// in HybridGlobalization CompareInfo.Compare uses JS's localeCompare()
if (cmpInfo.Compare(str1, str2, options) == 0)
{
// do not test same codepoint with different cultures
if (!ignoredCodepoints.Contains(codePoint))
{
ignoredCodepoints.Add(codePoint);
yield return new object[] { hashCode1, str2, cmpInfo, options };
}
}
}
}
}

[ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))]
[MemberData(nameof(CheckHashingOfSkippedChars_TestData))]
public void CheckHashingOfSkippedChars(char character, CompareInfo cmpInfo)
// In non-hybrid we have hashing and Equal function from the same source - ICU4C, so this test is not necessary
// Hybrid has Equal function from JS and hashing from managed invariant algorithm, they might start diverging at some point
[ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsHybridGlobalizationOnBrowser))]
[MemberData(nameof(CharsIgnoredByEqualFunction))]
public void CheckHashingOfSkippedChars(int hashCode1, string str2, CompareInfo cmpInfo, CompareOptions options)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be good to also have another test like

foreach locale
    foreach codepoint
        var s1=$"A{codepoint}B"
        var s2=$"AB"
        var h1 = locale.getHash(s1)
        var h2 = locale.getHash(s2)
        if(locale.equals(s1, s2))
             assert(h1 == h2)
        else
             // We know that the hash collisions are OK, when they are rare. So this should fail in very small % of cases.
             assert(h1 != h2)

After we learned how bad the hash collisions are, we could comment out assert(h1 != h2) or add few known collisions as exception to the rules.

{
string str1 = $"a{character}b";
string str2 = "ab";
CompareOptions options = CompareOptions.None;
var hashCode1 = cmpInfo.GetHashCode(str1, options);
var hashCode2 = cmpInfo.GetHashCode(str2, options);
bool areHashCodesEqual = hashCode1 == hashCode2;
Assert.True(areHashCodesEqual);
StringComparer stringComparer = new CustomComparer(cmpInfo, options);
TryAddToCustomDictionary(stringComparer, str1, str2, areHashCodesEqual);
}

public static IEnumerable<object[]> GetHashCodeTestData => new[]
Expand Down
30 changes: 30 additions & 0 deletions src/mono/sample/wasm/browser-bench/String.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public StringTask()
new StringEndsWithMeasurement(),
new StringIndexOfMeasurement(),
new StringLastIndexOfMeasurement(),
new StringHashCodeNoneMeasurement(),
new StringHashCodeIgnoreCaseMeasurement(),
};
}

Expand Down Expand Up @@ -288,5 +290,33 @@ public override Task BeforeBatch()
public override string Name => "String LastIndexOf";
public override void RunStep() => compareInfo.LastIndexOf(str, needleSameAsStrStart, CompareOptions.None);
}

public class StringHashCodeNoneMeasurement : StringMeasurement
{
protected CompareInfo compareInfo;

public override Task BeforeBatch()
{
compareInfo = new CultureInfo("th-TH").CompareInfo;
InitializeString();
return Task.CompletedTask;
}
public override string Name => "String HashCode None";
public override void RunStep() => compareInfo.GetHashCode(str, CompareOptions.None);
}

public class StringHashCodeIgnoreCaseMeasurement : StringMeasurement
{
protected CompareInfo compareInfo;

public override Task BeforeBatch()
{
compareInfo = new CultureInfo("th-TH").CompareInfo;
InitializeString();
return Task.CompletedTask;
}
public override string Name => "String HashCode IgnoreCase";
public override void RunStep() => compareInfo.GetHashCode(str, CompareOptions.IgnoreCase);
}
}
}
Loading