From 095762a7b41445e7762d1b31012347976c73884d Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Tue, 13 Jun 2023 19:26:22 +0200 Subject: [PATCH 1/3] every strategy needs an array of keys, we can create it up-front and iterate over it rather than the dictionary to get min and max lengths (1-2% gain) --- .../System/Collections/Frozen/FrozenDictionary.cs | 12 +++++++----- .../Frozen/String/LengthBucketsFrozenDictionary.cs | 3 +-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs index 5369038f7e546..8582703ffdcb7 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs @@ -232,24 +232,26 @@ private static FrozenDictionary ChooseImplementationOptimizedForRe Dictionary stringEntries = (Dictionary)(object)source; IEqualityComparer stringComparer = (IEqualityComparer)(object)comparer; + // this array is needed for every strategy + string[] entries = (string[])(object)source.Keys.ToArray(); + // Calculate the minimum and maximum lengths of the strings in the dictionary. Several of the analyses need this. int minLength = int.MaxValue, maxLength = 0; - foreach (KeyValuePair kvp in stringEntries) + foreach (string key in entries) { - if (kvp.Key.Length < minLength) minLength = kvp.Key.Length; - if (kvp.Key.Length > maxLength) maxLength = kvp.Key.Length; + if (key.Length < minLength) minLength = key.Length; + if (key.Length > maxLength) maxLength = key.Length; } Debug.Assert(minLength >= 0 && maxLength >= minLength); // Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length. - FrozenDictionary? frozenDictionary = LengthBucketsFrozenDictionary.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength); + FrozenDictionary? frozenDictionary = LengthBucketsFrozenDictionary.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength, entries); if (frozenDictionary is not null) { return (FrozenDictionary)(object)frozenDictionary; } // Analyze the keys for unique substrings and create an implementation that minimizes the cost of hashing keys. - string[] entries = (string[])(object)source.Keys.ToArray(); KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength); if (analysis.SubstringHashing) { diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs index c48b7aeac1c73..4a83ac05485b0 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs @@ -36,7 +36,7 @@ private LengthBucketsFrozenDictionary( } internal static LengthBucketsFrozenDictionary? CreateLengthBucketsFrozenDictionaryIfAppropriate( - Dictionary source, IEqualityComparer comparer, int minLength, int maxLength) + Dictionary source, IEqualityComparer comparer, int minLength, int maxLength, string[] keys) { Debug.Assert(source.Count != 0); Debug.Assert(comparer == EqualityComparer.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase); @@ -81,7 +81,6 @@ private LengthBucketsFrozenDictionary( return null; } - var keys = new string[source.Count]; var values = new TValue[keys.Length]; var lengthBuckets = new KeyValuePair[spread][]; From 98a7a029b7f5c7c29199bd460f04036ffd24add8 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Tue, 13 Jun 2023 19:30:32 +0200 Subject: [PATCH 2/3] Instead of ensuring that at least 95% of data is good, we stop when we know that at least 5% is bad (13-14% gain) --- .../Collections/Frozen/String/KeyAnalyzer.cs | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs index ee80e6a08549a..e59ad6f78ea8e 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs @@ -46,7 +46,6 @@ public static AnalysisResults Analyze( /// Try to find the minimal unique substring index/length to use for comparisons. private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results) { - const double SufficientUniquenessFactor = 0.95; // 95% is good enough const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer(); @@ -70,8 +69,8 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign for (int index = 0; index <= minLength - count; index++) { leftComparer.Index = index; - double factor = GetUniquenessFactor(leftSet, uniqueStrings); - if (factor >= SufficientUniquenessFactor) + + if (HasSufficientUniquenessFactor(leftSet, uniqueStrings)) { results = CreateAnalysisResults( uniqueStrings, ignoreCase, minLength, maxLength, index, count, @@ -106,8 +105,7 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign // Get a uniqueness factor for the right-justified substrings. // If it's above our threshold, we're done. rightComparer.Index = -index - count; - double factor = GetUniquenessFactor(rightSet, uniqueStrings); - if (factor >= SufficientUniquenessFactor) + if (HasSufficientUniquenessFactor(rightSet, uniqueStrings)) { results = CreateAnalysisResults( uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count, @@ -235,15 +233,23 @@ private static bool ContainsAnyLetters(ReadOnlySpan s) #endif } - private static double GetUniquenessFactor(HashSet set, ReadOnlySpan uniqueStrings) + private static bool HasSufficientUniquenessFactor(HashSet set, ReadOnlySpan uniqueStrings) { set.Clear(); + + // SufficientUniquenessFactor of 95% is good enough. + // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad. + int acceptableNonUniqueCount = uniqueStrings.Length / 20; + foreach (string s in uniqueStrings) { - set.Add(s); + if (!set.Add(s) && --acceptableNonUniqueCount < 0) + { + return false; + } } - return set.Count / (double)uniqueStrings.Length; + return true; } internal readonly struct AnalysisResults From 0b631d990e3ddaa3d24b6e541f26753e00474804 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Tue, 13 Jun 2023 20:37:29 +0200 Subject: [PATCH 3/3] toggle the direction and re-use the comparer and hashset (3% time gain, 12% allocations reduction) --- .../Collections/Frozen/String/KeyAnalyzer.cs | 61 ++++++------------- 1 file changed, 19 insertions(+), 42 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs index e59ad6f78ea8e..9da8b3c61e573 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs @@ -48,29 +48,27 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign { const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings - SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer(); - HashSet leftSet = new HashSet( + SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer(); + HashSet set = new HashSet( #if NET6_0_OR_GREATER uniqueStrings.Length, #endif - leftComparer); - - HashSet? rightSet = null; - SubstringComparer? rightComparer = null; + comparer); // For each substring length... int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit); for (int count = 1; count <= maxSubstringLength; count++) { - leftComparer.Count = count; + comparer.IsLeft = true; + comparer.Count = count; // For each index, get a uniqueness factor for the left-justified substrings. // If any is above our threshold, we're done. for (int index = 0; index <= minLength - count; index++) { - leftComparer.Index = index; + comparer.Index = index; - if (HasSufficientUniquenessFactor(leftSet, uniqueStrings)) + if (HasSufficientUniquenessFactor(set, uniqueStrings)) { results = CreateAnalysisResults( uniqueStrings, ignoreCase, minLength, maxLength, index, count, @@ -85,18 +83,8 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign // right-justified substrings, and so we also check right-justification. if (minLength != maxLength) { - // Lazily-initialize the right-comparer/set state, as it's often not needed. - if (rightComparer is null) - { - rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer(); - rightSet = new HashSet( -#if NET6_0_OR_GREATER - uniqueStrings.Length, -#endif - rightComparer); - } - rightComparer.Count = count; - Debug.Assert(rightSet is not null); + // toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it) + comparer.IsLeft = false; // For each index, get a uniqueness factor for the right-justified substrings. // If any is above our threshold, we're done. @@ -104,11 +92,11 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign { // Get a uniqueness factor for the right-justified substrings. // If it's above our threshold, we're done. - rightComparer.Index = -index - count; - if (HasSufficientUniquenessFactor(rightSet, uniqueStrings)) + comparer.Index = -index - count; + if (HasSufficientUniquenessFactor(set, uniqueStrings)) { results = CreateAnalysisResults( - uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count, + uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count, static (string s, int index, int count) => s.AsSpan(s.Length + index, count)); return true; } @@ -279,32 +267,21 @@ private abstract class SubstringComparer : IEqualityComparer { public int Index; public int Count; + public bool IsLeft; public abstract bool Equals(string? x, string? y); public abstract int GetHashCode(string s); } - private sealed class LeftJustifiedSubstringComparer : SubstringComparer - { - public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).SequenceEqual(y.AsSpan(Index, Count)); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(Index, Count)); - } - - private sealed class LeftJustifiedCaseInsensitiveSubstringComparer : SubstringComparer - { - public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).Equals(y.AsSpan(Index, Count), StringComparison.OrdinalIgnoreCase); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(Index, Count)); - } - - private sealed class RightJustifiedSubstringComparer : SubstringComparer + private sealed class JustifiedSubstringComparer : SubstringComparer { - public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).SequenceEqual(y.AsSpan(y!.Length + Index, Count)); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(s.Length + Index, Count)); + public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count)); + public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); } - private sealed class RightJustifiedCaseInsensitiveSubstringComparer : SubstringComparer + private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer { - public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).Equals(y.AsSpan(y!.Length + Index, Count), StringComparison.OrdinalIgnoreCase); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(s.Length + Index, Count)); + public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase); + public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); } } }