-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Faster optimized frozen dictionary creation (1/n) #87510
Changes from all commits
095762a
98a7a02
0b631d9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,32 +46,29 @@ public static AnalysisResults Analyze( | |
/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary> | ||
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results) | ||
{ | ||
const double SufficientUniquenessFactor = 0.95; // 95% is good enough | ||
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings | ||
|
||
SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer(); | ||
HashSet<string> leftSet = new HashSet<string>( | ||
SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer(); | ||
HashSet<string> set = new HashSet<string>( | ||
#if NET6_0_OR_GREATER | ||
uniqueStrings.Length, | ||
#endif | ||
leftComparer); | ||
|
||
HashSet<string>? rightSet = null; | ||
SubstringComparer? rightComparer = null; | ||
comparer); | ||
|
||
// For each substring length... | ||
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit); | ||
for (int count = 1; count <= maxSubstringLength; count++) | ||
{ | ||
leftComparer.Count = count; | ||
comparer.IsLeft = true; | ||
comparer.Count = count; | ||
|
||
// For each index, get a uniqueness factor for the left-justified substrings. | ||
// If any is above our threshold, we're done. | ||
for (int index = 0; index <= minLength - count; index++) | ||
{ | ||
leftComparer.Index = index; | ||
double factor = GetUniquenessFactor(leftSet, uniqueStrings); | ||
if (factor >= SufficientUniquenessFactor) | ||
comparer.Index = index; | ||
|
||
if (HasSufficientUniquenessFactor(set, uniqueStrings)) | ||
{ | ||
results = CreateAnalysisResults( | ||
uniqueStrings, ignoreCase, minLength, maxLength, index, count, | ||
|
@@ -86,31 +83,20 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign | |
// right-justified substrings, and so we also check right-justification. | ||
if (minLength != maxLength) | ||
{ | ||
// Lazily-initialize the right-comparer/set state, as it's often not needed. | ||
if (rightComparer is null) | ||
{ | ||
rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer(); | ||
rightSet = new HashSet<string>( | ||
#if NET6_0_OR_GREATER | ||
uniqueStrings.Length, | ||
#endif | ||
rightComparer); | ||
} | ||
rightComparer.Count = count; | ||
Debug.Assert(rightSet is not null); | ||
// toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it) | ||
comparer.IsLeft = false; | ||
|
||
// For each index, get a uniqueness factor for the right-justified substrings. | ||
// If any is above our threshold, we're done. | ||
for (int index = 0; index <= minLength - count; index++) | ||
{ | ||
// Get a uniqueness factor for the right-justified substrings. | ||
// If it's above our threshold, we're done. | ||
rightComparer.Index = -index - count; | ||
double factor = GetUniquenessFactor(rightSet, uniqueStrings); | ||
if (factor >= SufficientUniquenessFactor) | ||
comparer.Index = -index - count; | ||
if (HasSufficientUniquenessFactor(set, uniqueStrings)) | ||
{ | ||
results = CreateAnalysisResults( | ||
uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count, | ||
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count, | ||
static (string s, int index, int count) => s.AsSpan(s.Length + index, count)); | ||
return true; | ||
} | ||
|
@@ -235,15 +221,23 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s) | |
#endif | ||
} | ||
|
||
private static double GetUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings) | ||
private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings) | ||
{ | ||
set.Clear(); | ||
|
||
// SufficientUniquenessFactor of 95% is good enough. | ||
// Instead of ensuring that 95% of data is good, we stop as soon as we know that more than 5% is bad. | ||
int acceptableNonUniqueCount = uniqueStrings.Length / 20; | ||
|
||
foreach (string s in uniqueStrings) | ||
{ | ||
set.Add(s); | ||
if (!set.Add(s) && --acceptableNonUniqueCount < 0) | ||
{ | ||
return false; | ||
} | ||
} | ||
|
||
return set.Count / (double)uniqueStrings.Length; | ||
return true; | ||
} | ||
|
||
internal readonly struct AnalysisResults | ||
|
@@ -273,32 +267,21 @@ private abstract class SubstringComparer : IEqualityComparer<string> | |
{ | ||
public int Index; | ||
public int Count; | ||
public bool IsLeft; | ||
public abstract bool Equals(string? x, string? y); | ||
public abstract int GetHashCode(string s); | ||
} | ||
|
||
private sealed class LeftJustifiedSubstringComparer : SubstringComparer | ||
{ | ||
public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).SequenceEqual(y.AsSpan(Index, Count)); | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(Index, Count)); | ||
} | ||
|
||
private sealed class LeftJustifiedCaseInsensitiveSubstringComparer : SubstringComparer | ||
{ | ||
public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).Equals(y.AsSpan(Index, Count), StringComparison.OrdinalIgnoreCase); | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(Index, Count)); | ||
} | ||
|
||
private sealed class RightJustifiedSubstringComparer : SubstringComparer | ||
private sealed class JustifiedSubstringComparer : SubstringComparer | ||
{ | ||
public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).SequenceEqual(y.AsSpan(y!.Length + Index, Count)); | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(s.Length + Index, Count)); | ||
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count)); | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); | ||
} | ||
|
||
private sealed class RightJustifiedCaseInsensitiveSubstringComparer : SubstringComparer | ||
private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm happy to hear it's helpful, and the fewer the types the better, but I'm a little surprised this makes a positive impact on throughput, since it's adding more work on every comparison. What's the logic for why it makes things faster? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is that true just in the example you're profiling or generally? The HashSet will happen once regardless of how many retries are needed. |
||
{ | ||
public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).Equals(y.AsSpan(y!.Length + Index, Count), StringComparison.OrdinalIgnoreCase); | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(s.Length + Index, Count)); | ||
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase); | ||
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This constant was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was too lazy/tired to turn the const name into three separate words. If you don't mind, I am going to do that in the next PR.