Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[browser] HybridGlobalization correct HashCode ranges of skipped unicodes #97351

Closed
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -130,39 +130,74 @@ private unsafe int JsIndexOfCore(ReadOnlySpan<char> source, ReadOnlySpan<char> t
}

// there are chars that are ignored by ICU hashing algorithm but not ignored by invariant hashing
// Control: 1105 (out of 1105)
// Format: 697 (out of 731)
// OtherPunctuation: 6919 (out of 7004)
// SpaceSeparator: 289 (out of 289)
// OpenPunctuation: 1275 (out of 1343)
// ClosePunctuation: 1241 (out of 1309)
// DashPunctuation: 408 (out of 425)
// ConnectorPunctuation: 170 (out of 170)
// InitialQuotePunctuation: 204 (out of 204)
// FinalQuotePunctuation: 170 (out of 170)
// LineSeparator: 17 (out of 17)
// ParagraphSeparator: 17 (out of 17)
// OtherLetter: 34 (out of 784142)
// SpacingCombiningMark: 68 (out of 4420)
// ModifierLetter: 51 (out of 4012)
// EnclosingMark: 85 (out of 221)
// NonSpacingMark: 3281 (out of 18105)
// we can skip them all (~1027k chars) by checking for the remaining UnicodeCategories (~291k chars)
// Control: 59 (out of 1105)
// Format: 43 (out of 731)
// NonSpacingMark: 195 (out of 18105)
// EnclosingMark: 5 (out of 221) // 0488, 0489, A670, A671, A672
// ModifierLetter: 2 (out of 4012) // 0640, 07FA
// SpacingCombiningMark: 4 (out of 4420) // 0F3E, 0F3F, 1CE1, 1CF7
// OtherPunctuation: 4 (out of 7004) // 180A, 1CD3, 10F86 (\uD803\uDF86), 10F87 (\uD803\uDF87)
// OtherLetter: 683 (out of 784142)
// OtherNotAssigned: 3581 (out of 24718)
// UppercaseLetter: 4 (out of 19159) // 10591 (\uD801\uDD91), 10592 (\uD801\uDD92), 10594 (\uD801\uDD94), 10595 (\uD801\uDD95)
// LowercaseLetter: 24 (out of 24565) // 10597 - 105AF
// OtherNumber: 1 (out of 5100) // 10FC6 (\uD803\uDFC6)
// PrivateUse: 614 (out of 108800)
// total: 5219 chars
// skipping more characters than ICU would lead to hashes with smaller distribution and more collisions in hash tables
// but it makes the behavior correct and consistent with locale-aware equals, which is an acceptable tradeoff
// Tells whether a character of the given Unicode category must be kept.
// Returns true for the letter-case, number, surrogate, private-use, symbol and
// unassigned categories listed below, and false for every other category.
private static bool ShouldNotBeSkipped(UnicodeCategory category)
{
    switch (category)
    {
        // cased letters
        case UnicodeCategory.LowercaseLetter:
        case UnicodeCategory.UppercaseLetter:
        case UnicodeCategory.TitlecaseLetter:
        // numbers
        case UnicodeCategory.LetterNumber:
        case UnicodeCategory.OtherNumber:
        // surrogates, private use and unassigned code points
        case UnicodeCategory.Surrogate:
        case UnicodeCategory.PrivateUse:
        case UnicodeCategory.OtherNotAssigned:
        // symbols
        case UnicodeCategory.MathSymbol:
        case UnicodeCategory.CurrencySymbol:
        case UnicodeCategory.ModifierSymbol:
        case UnicodeCategory.OtherSymbol:
            return true;
        default:
            return false;
    }
}
// Tells whether a single UTF-16 code unit of the given Unicode category is one of the
// characters that get skipped.
//
// category: the Unicode category the caller determined for 'value'.
// value:    the UTF-16 code unit to classify.
// returns:  true when the character is to be skipped, false otherwise.
//
// A single char is always in the range 0x0000-0xFFFF, so the supplementary-plane entries
// handled by IsSkippedCodePoint never match through this overload. The value is widened
// to int directly rather than round-tripped through char.ConvertToUtf32(value.ToString(), 0),
// which allocated a string per character and threw ArgumentException for a lone surrogate.
private static bool ShouldBeSkipped(UnicodeCategory category, char value) =>
    IsSkippedCodePoint(category, value);

// Classifies a full Unicode code point (BMP or supplementary) of the given category.
// Categories listed first are skipped as a whole; the remaining categories are skipped
// only for the specific code points enumerated in each case.
private static bool IsSkippedCodePoint(UnicodeCategory category, int codePoint)
{
    switch (category)
    {
        case UnicodeCategory.Control:
        case UnicodeCategory.Format:
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.OtherLetter:
        case UnicodeCategory.OtherNotAssigned:
        case UnicodeCategory.PrivateUse:
        {
            return true;
        }
        case UnicodeCategory.LowercaseLetter:
        {
            // supplementary-plane letters; only reachable when a full code point is supplied
            return 0x10597 <= codePoint && codePoint <= 0x105AF;
        }
        case UnicodeCategory.UppercaseLetter:
        {
            // supplementary-plane letters; only reachable when a full code point is supplied
            return 0x10591 <= codePoint && codePoint <= 0x10595;
        }
        case UnicodeCategory.OtherNumber:
        {
            return codePoint == 0x10FC6;
        }
        case UnicodeCategory.OtherPunctuation:
        {
            return codePoint == 0x180A || codePoint == 0x1CD3 ||
                   codePoint == 0x10F86 || codePoint == 0x10F87;
        }
        case UnicodeCategory.EnclosingMark:
        {
            return codePoint == 0x0488 || codePoint == 0x0489 ||
                   codePoint == 0xA670 || codePoint == 0xA671 || codePoint == 0xA672;
        }
        case UnicodeCategory.ModifierLetter:
        {
            return codePoint == 0x0640 || codePoint == 0x07FA;
        }
        case UnicodeCategory.SpacingCombiningMark:
        {
            return codePoint == 0x0F3E || codePoint == 0x0F3F ||
                   codePoint == 0x1CE1 || codePoint == 0x1CF7;
        }
        default:
            return false;
    }
}

private ReadOnlySpan<char> SanitizeForInvariantHash(ReadOnlySpan<char> source, CompareOptions options)
{
Expand All @@ -171,10 +206,11 @@ private ReadOnlySpan<char> SanitizeForInvariantHash(ReadOnlySpan<char> source, C
foreach (char c in source)
{
UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(c);
if (ShouldNotBeSkipped(category))
if (ShouldBeSkipped(category, c))
{
result[resultIndex++] = c;
continue;
}
result[resultIndex++] = c;
}
if ((options & CompareOptions.IgnoreCase) != 0)
{
Expand Down
30 changes: 30 additions & 0 deletions src/mono/sample/wasm/browser-bench/String.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public StringTask()
new StringEndsWithMeasurement(),
new StringIndexOfMeasurement(),
new StringLastIndexOfMeasurement(),
new StringHashCodeNoneMeasurement(),
new StringHashCodeIgnoreCaseMeasurement(),
};
}

Expand Down Expand Up @@ -288,5 +290,33 @@ public override Task BeforeBatch()
public override string Name => "String LastIndexOf";
public override void RunStep() => compareInfo.LastIndexOf(str, needleSameAsStrStart, CompareOptions.None);
}

/// <summary>
/// Measurement that calls <c>CompareInfo.GetHashCode</c> with
/// <see cref="CompareOptions.None"/> using the "th-TH" culture's compare info.
/// </summary>
public class StringHashCodeNoneMeasurement : StringMeasurement
{
    // Compare info of the "th-TH" culture; assigned in BeforeBatch.
    protected CompareInfo compareInfo;

    public override string Name
    {
        get { return "String HashCode None"; }
    }

    /// <summary>
    /// Creates the "th-TH" compare info and then calls InitializeString
    /// (presumably inherited from StringMeasurement and responsible for <c>str</c> - confirm).
    /// </summary>
    public override Task BeforeBatch()
    {
        CultureInfo thaiCulture = new CultureInfo("th-TH");
        compareInfo = thaiCulture.CompareInfo;
        InitializeString();
        return Task.CompletedTask;
    }

    /// <summary>Computes the hash code of <c>str</c>; the result is discarded.</summary>
    public override void RunStep()
    {
        compareInfo.GetHashCode(str, CompareOptions.None);
    }
}

/// <summary>
/// Measurement that calls <c>CompareInfo.GetHashCode</c> with
/// <see cref="CompareOptions.IgnoreCase"/> using the "th-TH" culture's compare info.
/// </summary>
public class StringHashCodeIgnoreCaseMeasurement : StringMeasurement
{
    // Compare info of the "th-TH" culture; assigned in BeforeBatch.
    protected CompareInfo compareInfo;

    public override string Name
    {
        get { return "String HashCode IgnoreCase"; }
    }

    /// <summary>
    /// Creates the "th-TH" compare info and then calls InitializeString
    /// (presumably inherited from StringMeasurement and responsible for <c>str</c> - confirm).
    /// </summary>
    public override Task BeforeBatch()
    {
        CultureInfo thaiCulture = new CultureInfo("th-TH");
        compareInfo = thaiCulture.CompareInfo;
        InitializeString();
        return Task.CompletedTask;
    }

    /// <summary>Computes the case-insensitive hash code of <c>str</c>; the result is discarded.</summary>
    public override void RunStep()
    {
        compareInfo.GetHashCode(str, CompareOptions.IgnoreCase);
    }
}
}
}
Loading