Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster optimized frozen dictionary creation (1/n) #87510

Merged
merged 3 commits into from
Jun 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -232,24 +232,26 @@ private static FrozenDictionary<TKey, TValue> ChooseImplementationOptimizedForRe
Dictionary<string, TValue> stringEntries = (Dictionary<string, TValue>)(object)source;
IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;

// this array is needed for every strategy
string[] entries = (string[])(object)source.Keys.ToArray();

// Calculate the minimum and maximum lengths of the strings in the dictionary. Several of the analyses need this.
int minLength = int.MaxValue, maxLength = 0;
foreach (KeyValuePair<string, TValue> kvp in stringEntries)
foreach (string key in entries)
{
if (kvp.Key.Length < minLength) minLength = kvp.Key.Length;
if (kvp.Key.Length > maxLength) maxLength = kvp.Key.Length;
if (key.Length < minLength) minLength = key.Length;
if (key.Length > maxLength) maxLength = key.Length;
}
Debug.Assert(minLength >= 0 && maxLength >= minLength);

// Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length.
FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength);
FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength, entries);
if (frozenDictionary is not null)
{
return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
}

// Analyze the keys for unique substrings and create an implementation that minimizes the cost of hashing keys.
string[] entries = (string[])(object)source.Keys.ToArray();
KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength);
if (analysis.SubstringHashing)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,32 +46,29 @@ public static AnalysisResults Analyze(
/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
{
const double SufficientUniquenessFactor = 0.95; // 95% is good enough
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings

SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer();
HashSet<string> leftSet = new HashSet<string>(
SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStrings.Length,
#endif
leftComparer);

HashSet<string>? rightSet = null;
SubstringComparer? rightComparer = null;
comparer);

// For each substring length...
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
leftComparer.Count = count;
comparer.IsLeft = true;
comparer.Count = count;

// For each index, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
leftComparer.Index = index;
double factor = GetUniquenessFactor(leftSet, uniqueStrings);
if (factor >= SufficientUniquenessFactor)
comparer.Index = index;

if (HasSufficientUniquenessFactor(set, uniqueStrings))
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, index, count,
Expand All @@ -86,31 +83,20 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
// right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
// Lazily-initialize the right-comparer/set state, as it's often not needed.
if (rightComparer is null)
{
rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer();
rightSet = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStrings.Length,
#endif
rightComparer);
}
rightComparer.Count = count;
Debug.Assert(rightSet is not null);
// toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it)
comparer.IsLeft = false;

// For each index, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
// Get a uniqueness factor for the right-justified substrings.
// If it's above our threshold, we're done.
rightComparer.Index = -index - count;
double factor = GetUniquenessFactor(rightSet, uniqueStrings);
if (factor >= SufficientUniquenessFactor)
comparer.Index = -index - count;
if (HasSufficientUniquenessFactor(set, uniqueStrings))
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count,
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
return true;
}
Expand Down Expand Up @@ -235,15 +221,23 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
#endif
}

private static double GetUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
{
set.Clear();

// SufficientUniquenessFactor of 95% is good enough.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This constant was deleted.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was too lazy/tired to turn the const name into three separate words. If you don't mind I am going to do that in next PR.

// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStrings.Length / 20;

foreach (string s in uniqueStrings)
{
set.Add(s);
if (!set.Add(s) && --acceptableNonUniqueCount < 0)
{
return false;
}
}

return set.Count / (double)uniqueStrings.Length;
return true;
}

internal readonly struct AnalysisResults
Expand Down Expand Up @@ -273,32 +267,21 @@ private abstract class SubstringComparer : IEqualityComparer<string>
{
public int Index;
public int Count;
public bool IsLeft;
public abstract bool Equals(string? x, string? y);
public abstract int GetHashCode(string s);
}

private sealed class LeftJustifiedSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).SequenceEqual(y.AsSpan(Index, Count));
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(Index, Count));
}

private sealed class LeftJustifiedCaseInsensitiveSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).Equals(y.AsSpan(Index, Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(Index, Count));
}

private sealed class RightJustifiedSubstringComparer : SubstringComparer
private sealed class JustifiedSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).SequenceEqual(y.AsSpan(y!.Length + Index, Count));
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(s.Length + Index, Count));
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count));
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
}

private sealed class RightJustifiedCaseInsensitiveSubstringComparer : SubstringComparer
private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm happy to hear it's helpful, and the fewer the types the better, but I'm a little surprised this makes a positive impact on throughput, since it's adding more work on every comparison. What's the logic for why it makes things faster?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's adding more work on every comparison

That is true, but it's less work compared to creating a new HashSet<string>(keys.Length)

image

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that true just in the example you're profiling or generally? The HashSet will happen once regardless of how many retries are needed.

{
public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).Equals(y.AsSpan(y!.Length + Index, Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(s.Length + Index, Count));
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ private LengthBucketsFrozenDictionary(
}

internal static LengthBucketsFrozenDictionary<TValue>? CreateLengthBucketsFrozenDictionaryIfAppropriate(
Dictionary<string, TValue> source, IEqualityComparer<string> comparer, int minLength, int maxLength)
Dictionary<string, TValue> source, IEqualityComparer<string> comparer, int minLength, int maxLength, string[] keys)
{
Debug.Assert(source.Count != 0);
Debug.Assert(comparer == EqualityComparer<string>.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase);
Expand Down Expand Up @@ -81,7 +81,6 @@ private LengthBucketsFrozenDictionary(
return null;
}

var keys = new string[source.Count];
var values = new TValue[keys.Length];
var lengthBuckets = new KeyValuePair<string, int>[spread][];

Expand Down