From 4ad08877f2689c8423fcc964662d22de6d859ad9 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 16 Jun 2023 10:37:18 +0200 Subject: [PATCH 1/6] avoid the need of having `Action storeDestIndexFromSrcIndex` by writing the destination indexes to the provided buffer with hashcodes and moving the responsibility to the caller (1-4% gain) --- .../Collections/Frozen/FrozenHashTable.cs | 17 ++++++++--------- .../Frozen/Int32/Int32FrozenDictionary.cs | 11 ++++++++--- .../Collections/Frozen/Int32/Int32FrozenSet.cs | 4 +--- .../System/Collections/Frozen/ItemsFrozenSet.cs | 12 ++++++++---- .../Frozen/KeysAndValuesFrozenDictionary.cs | 17 +++++++++-------- .../String/OrdinalStringFrozenDictionary.cs | 16 +++++++++------- .../Frozen/String/OrdinalStringFrozenSet.cs | 11 ++++++++--- 7 files changed, 51 insertions(+), 37 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs index 15ba7478de231..85f2562e74f8b 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs @@ -36,18 +36,15 @@ private FrozenHashTable(int[] hashCodes, Bucket[] buckets, ulong fastModMultipli } /// Initializes a frozen hash table. - /// Pre-calculated hash codes. - /// A delegate that assigns the index to a specific entry. It's passed the destination and source indices. + /// Pre-calculated hash codes. When the method finishes, it assigns each value to destination index. /// true to spend additional effort tuning for subsequent read speed on the table; false to prioritize construction time. /// - /// This method will iterate through the incoming entries and will invoke the hasher on each once. /// It will then determine the optimal number of hash buckets to allocate and will populate the - /// bucket table. In the process of doing so, it calls out to the to indicate - /// the resulting index for that entry. - /// then uses this index to reference individual entries by indexing into . + /// bucket table. The caller is responsible to consume the values written to and update the destination (if desired). + /// then uses this index to reference individual entries by indexing into . /// /// A frozen hash table. - public static FrozenHashTable Create(ReadOnlySpan hashCodes, Action storeDestIndexFromSrcIndex, bool optimizeForReading = true) + public static FrozenHashTable Create(Span hashCodes, bool optimizeForReading = true) { // Determine how many buckets to use. This might be fewer than the number of entries // if any entries have identical hashcodes (not just different hashcodes that might @@ -100,8 +97,10 @@ public static FrozenHashTable Create(ReadOnlySpan hashCodes, Action= 0) { - hashtableHashcodes[count] = hashCodes[index]; - storeDestIndexFromSrcIndex(count, index); + ref int hashCode = ref hashCodes[index]; + hashtableHashcodes[count] = hashCode; + // we have used the hash code for the last time, now we re-use the buffer to store destination index + hashCode = count; count++; bucketCount++; diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs index e98715887addd..ef6cf0f3be7da 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs @@ -35,9 +35,14 @@ internal Int32FrozenDictionary(Dictionary source) : base(EqualityCo hashCodes[i] = entries[i].Key; } - _hashTable = FrozenHashTable.Create( - hashCodes, - (destIndex, srcIndex) => _values[destIndex] = entries[srcIndex].Value); + _hashTable = FrozenHashTable.Create(hashCodes); + + for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++) + { + int destIndex = hashCodes[srcIndex]; + + _values[destIndex] = entries[srcIndex].Value; + } ArrayPool.Shared.Return(arrayPoolHashCodes); } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs index 4317b319abfcc..d565f0f4e3db1 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs @@ -25,9 +25,7 @@ internal Int32FrozenSet(HashSet source) : base(EqualityComparer.Defaul int[] entries = ArrayPool.Shared.Rent(count); source.CopyTo(entries); - _hashTable = FrozenHashTable.Create( - new ReadOnlySpan(entries, 0, count), - static delegate { }); + _hashTable = FrozenHashTable.Create(new Span(entries, 0, count)); ArrayPool.Shared.Return(entries); } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/ItemsFrozenSet.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/ItemsFrozenSet.cs index db94a1a8e38f6..6037a138d01c7 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/ItemsFrozenSet.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/ItemsFrozenSet.cs @@ -30,10 +30,14 @@ protected ItemsFrozenSet(HashSet source, bool optimizeForReading = true) : ba hashCodes[i] = entries[i] is T t ? Comparer.GetHashCode(t) : 0; } - _hashTable = FrozenHashTable.Create( - hashCodes, - (destIndex, srcIndex) => _items[destIndex] = entries[srcIndex], - optimizeForReading); + _hashTable = FrozenHashTable.Create(hashCodes, optimizeForReading); + + for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++) + { + int destIndex = hashCodes[srcIndex]; + + _items[destIndex] = entries[srcIndex]; + } ArrayPool.Shared.Return(arrayPoolHashCodes); } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/KeysAndValuesFrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/KeysAndValuesFrozenDictionary.cs index ab0cba77c2f10..ce2f3ec79aa7c 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/KeysAndValuesFrozenDictionary.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/KeysAndValuesFrozenDictionary.cs @@ -32,14 +32,15 @@ protected KeysAndValuesFrozenDictionary(Dictionary source, bool op hashCodes[i] = Comparer.GetHashCode(entries[i].Key); } - _hashTable = FrozenHashTable.Create( - hashCodes, - (destIndex, srcIndex) => - { - _keys[destIndex] = entries[srcIndex].Key; - _values[destIndex] = entries[srcIndex].Value; - }, - optimizeForReading); + _hashTable = FrozenHashTable.Create(hashCodes, optimizeForReading); + + for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++) + { + int destIndex = hashCodes[srcIndex]; + + _keys[destIndex] = entries[srcIndex].Key; + _values[destIndex] = entries[srcIndex].Value; + } ArrayPool.Shared.Return(arrayPoolHashCodes); } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenDictionary.cs index 2cb519ead4521..e510954f7c333 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenDictionary.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenDictionary.cs @@ -47,13 +47,15 @@ internal OrdinalStringFrozenDictionary( hashCodes[i] = GetHashCode(keys[i]); } - _hashTable = FrozenHashTable.Create( - hashCodes, - (destIndex, srcIndex) => - { - _keys[destIndex] = keys[srcIndex]; - _values[destIndex] = values[srcIndex]; - }); + _hashTable = FrozenHashTable.Create(hashCodes); + + for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++) + { + int destIndex = hashCodes[srcIndex]; + + _keys[destIndex] = keys[srcIndex]; + _values[destIndex] = values[srcIndex]; + } ArrayPool.Shared.Return(arrayPoolHashCodes); } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenSet.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenSet.cs index 1ee4b277b2113..62ce56ee3472e 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenSet.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenSet.cs @@ -38,9 +38,14 @@ internal OrdinalStringFrozenSet( hashCodes[i] = GetHashCode(entries[i]); } - _hashTable = FrozenHashTable.Create( - hashCodes, - (destIndex, srcIndex) => _items[destIndex] = entries[srcIndex]); + _hashTable = FrozenHashTable.Create(hashCodes); + + for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++) + { + int destIndex = hashCodes[srcIndex]; + + _items[destIndex] = entries[srcIndex]; + } ArrayPool.Shared.Return(arrayPoolHashCodes); } From 4014ff8fe2c28ce99ff8b5a8e4a401ebedb39dae Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 16 Jun 2023 13:33:56 +0200 Subject: [PATCH 2/6] CalcNumBuckets searches for the best number of buckets, which currently is the first prime number that would give less than 5% collision rate for unique hash codes. When bestNumCollisions was set to codes.Count (the number of unique hash codes), it meant "start the search assuming that current best collision rate is 100%". The first iteration would then check all values, as any result would be better than 100% collision rate. It would set the new best collision rate, which then would be used by next iterations. Setting bestNumCollisions to `codes.Count / 20 + 1` (just one more collision than 5%) at the beginning means: find me the first bucket that meets the criteria. If none is found, the last prime number is returned, which matches the previous behavior. +23% improvement --- .../src/System/Collections/Frozen/FrozenHashTable.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs index 85f2562e74f8b..5dfcdd0a1da15 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs @@ -147,7 +147,6 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR { Debug.Assert(hashCodes.Length != 0); - const double AcceptableCollisionRate = 0.05; // What is a satisfactory rate of hash collisions? const int LargeInputSizeThreshold = 1000; // What is the limit for an input to be considered "small"? const int MaxSmallBucketTableMultiplier = 16; // How large a bucket table should be allowed for small inputs? const int MaxLargeBucketTableMultiplier = 3; // How large a bucket table should be allowed for large inputs? @@ -208,7 +207,8 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR int[] seenBuckets = ArrayPool.Shared.Rent((maxNumBuckets / BitsPerInt32) + 1); int bestNumBuckets = maxNumBuckets; - int bestNumCollisions = codes.Count; + // just one more collision than the acceptable collision rate (5%) + int bestNumCollisions = (codes.Count / 20) + 1; // Iterate through each available prime between the min and max discovered. For each, compute // the collision ratio. @@ -246,7 +246,7 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR { bestNumBuckets = numBuckets; - if (numCollisions / (double)codes.Count <= AcceptableCollisionRate) + if (numCollisions <= (codes.Count / 20)) { break; } From 1876a75fe57279b5aa9c7347368f4bfbfaa5d7ad Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 16 Jun 2023 14:30:07 +0200 Subject: [PATCH 3/6] For cases where the key is an integer and we know the input us already unique (because it comes from a dictionary or a hash set) there is no need to create another hash set Also, in cases where simply all hash codes are unique, we can iterate over a span rather than a hash set +9% gain for scenarios where the key was an integer (time), 10-20% allocations drop up to +5% gain where string keys turned out to have unique hash codes --- .../Collections/Frozen/FrozenHashTable.cs | 80 +++++++++++++------ .../Frozen/Int32/Int32FrozenDictionary.cs | 2 +- .../Frozen/Int32/Int32FrozenSet.cs | 2 +- 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs index 5dfcdd0a1da15..8c377b1765063 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs @@ -38,18 +38,19 @@ private FrozenHashTable(int[] hashCodes, Bucket[] buckets, ulong fastModMultipli /// Initializes a frozen hash table. /// Pre-calculated hash codes. When the method finishes, it assigns each value to destination index. /// true to spend additional effort tuning for subsequent read speed on the table; false to prioritize construction time. + /// True when the input hash codes are already unique (provided from a dictionary of integers for example). /// /// It will then determine the optimal number of hash buckets to allocate and will populate the /// bucket table. The caller is responsible to consume the values written to and update the destination (if desired). /// then uses this index to reference individual entries by indexing into . /// /// A frozen hash table. - public static FrozenHashTable Create(Span hashCodes, bool optimizeForReading = true) + public static FrozenHashTable Create(Span hashCodes, bool optimizeForReading = true, bool hashCodesAreUnique = false) { // Determine how many buckets to use. This might be fewer than the number of entries // if any entries have identical hashcodes (not just different hashcodes that might // map to the same bucket). - int numBuckets = CalcNumBuckets(hashCodes, optimizeForReading); + int numBuckets = CalcNumBuckets(hashCodes, optimizeForReading, hashCodesAreUnique); ulong fastModMultiplier = HashHelpers.GetFastModMultiplier((uint)numBuckets); // Create two spans: @@ -143,9 +144,10 @@ public void FindMatchingEntries(int hashCode, out int startIndex, out int endInd /// sizes, starting at the exact number of hash codes and incrementing the bucket count by 1 per trial, /// this is a trade-off between speed of determining a good number of buckets and maximal density. /// - private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForReading) + private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForReading, bool hashCodesAreUnique) { Debug.Assert(hashCodes.Length != 0); + Debug.Assert(!hashCodesAreUnique || new HashSet(hashCodes.ToArray()).Count == hashCodes.Length); const int LargeInputSizeThreshold = 1000; // What is the limit for an input to be considered "small"? const int MaxSmallBucketTableMultiplier = 16; // How large a bucket table should be allowed for small inputs? @@ -157,38 +159,42 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR } // Filter out duplicate codes, since no increase in buckets will avoid collisions from duplicate input hash codes. - var codes = + HashSet? codes = hashCodesAreUnique ? null : #if NETCOREAPP2_0_OR_GREATER new HashSet(hashCodes.Length); #else new HashSet(); #endif - foreach (int hashCode in hashCodes) + if (codes is not null) { - codes.Add(hashCode); + foreach (int hashCode in hashCodes) + { + codes.Add(hashCode); + } } - Debug.Assert(codes.Count != 0); + int uniqueCodesCount = hashCodesAreUnique ? hashCodes.Length : codes!.Count; + Debug.Assert(uniqueCodesCount != 0); // In our precomputed primes table, find the index of the smallest prime that's at least as large as our number of // hash codes. If there are more codes than in our precomputed primes table, which accommodates millions of values, // give up and just use the next prime. ReadOnlySpan primes = HashHelpers.Primes; int minPrimeIndexInclusive = 0; - while ((uint)minPrimeIndexInclusive < (uint)primes.Length && codes.Count > primes[minPrimeIndexInclusive]) + while ((uint)minPrimeIndexInclusive < (uint)primes.Length && uniqueCodesCount > primes[minPrimeIndexInclusive]) { minPrimeIndexInclusive++; } if (minPrimeIndexInclusive >= primes.Length) { - return HashHelpers.GetPrime(codes.Count); + return HashHelpers.GetPrime(uniqueCodesCount); } // Determine the largest number of buckets we're willing to use, based on a multiple of the number of inputs. // For smaller inputs, we allow for a larger multiplier. int maxNumBuckets = - codes.Count * - (codes.Count >= LargeInputSizeThreshold ? MaxLargeBucketTableMultiplier : MaxSmallBucketTableMultiplier); + uniqueCodesCount * + (uniqueCodesCount >= LargeInputSizeThreshold ? MaxLargeBucketTableMultiplier : MaxSmallBucketTableMultiplier); // Find the index of the smallest prime that accommodates our max buckets. int maxPrimeIndexExclusive = minPrimeIndexInclusive; @@ -208,7 +214,7 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR int bestNumBuckets = maxNumBuckets; // just one more collision than the acceptable collision rate (5%) - int bestNumCollisions = (codes.Count / 20) + 1; + int bestNumCollisions = (uniqueCodesCount / 20) + 1; // Iterate through each available prime between the min and max discovered. For each, compute // the collision ratio. @@ -221,22 +227,50 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR // Determine the bucket for each hash code and mark it as seen. If it was already seen, // track it as a collision. int numCollisions = 0; - foreach (int code in codes) + + if (codes is not null && uniqueCodesCount != hashCodes.Length) { - uint bucketNum = (uint)code % (uint)numBuckets; - if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0) + foreach (int code in codes) { - numCollisions++; - if (numCollisions >= bestNumCollisions) + uint bucketNum = (uint)code % (uint)numBuckets; + if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0) { - // If we've already hit the previously known best number of collisions, - // there's no point in continuing as worst case we'd just use that. - break; + numCollisions++; + if (numCollisions >= bestNumCollisions) + { + // If we've already hit the previously known best number of collisions, + // there's no point in continuing as worst case we'd just use that. + break; + } + } + else + { + seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum; } } - else + } + else + { + // We can get here in two cases: + // 1. hashCodesAreUnique was true (the input comes from a unique collection of integers), so we have skipped the hash set creation. + // 2. all hash codes turned out to be unique. In such scenario, it's faster to iterate over a span. + foreach (int code in hashCodes) { - seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum; + uint bucketNum = (uint)code % (uint)numBuckets; + if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0) + { + numCollisions++; + if (numCollisions >= bestNumCollisions) + { + // If we've already hit the previously known best number of collisions, + // there's no point in continuing as worst case we'd just use that. + break; + } + } + else + { + seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum; + } } } @@ -246,7 +280,7 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR { bestNumBuckets = numBuckets; - if (numCollisions <= (codes.Count / 20)) + if (numCollisions <= (uniqueCodesCount / 20)) { break; } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs index ef6cf0f3be7da..f3719928f8fa8 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs @@ -35,7 +35,7 @@ internal Int32FrozenDictionary(Dictionary source) : base(EqualityCo hashCodes[i] = entries[i].Key; } - _hashTable = FrozenHashTable.Create(hashCodes); + _hashTable = FrozenHashTable.Create(hashCodes, hashCodesAreUnique: true); for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++) { diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs index d565f0f4e3db1..22e7f093be4ad 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs @@ -25,7 +25,7 @@ internal Int32FrozenSet(HashSet source) : base(EqualityComparer.Defaul int[] entries = ArrayPool.Shared.Rent(count); source.CopyTo(entries); - _hashTable = FrozenHashTable.Create(new Span(entries, 0, count)); + _hashTable = FrozenHashTable.Create(new Span(entries, 0, count), hashCodesAreUnique: true); ArrayPool.Shared.Return(entries); } From a1fb8be3decee6f740342f0de05eefe096b245ec Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Mon, 19 Jun 2023 09:15:28 +0200 Subject: [PATCH 4/6] Revert "CalcNumBuckets searches for the best number of buckets, which currently is the first prime number that would give less than 5% collision rate for unique hash codes." as it's not finished yet This reverts commit 4014ff8fe2c28ce99ff8b5a8e4a401ebedb39dae. # Conflicts: # src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs --- .../src/System/Collections/Frozen/FrozenHashTable.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs index 8c377b1765063..5c557a2dc531c 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs @@ -149,6 +149,7 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR Debug.Assert(hashCodes.Length != 0); Debug.Assert(!hashCodesAreUnique || new HashSet(hashCodes.ToArray()).Count == hashCodes.Length); + const double AcceptableCollisionRate = 0.05; // What is a satisfactory rate of hash collisions? const int LargeInputSizeThreshold = 1000; // What is the limit for an input to be considered "small"? const int MaxSmallBucketTableMultiplier = 16; // How large a bucket table should be allowed for small inputs? const int MaxLargeBucketTableMultiplier = 3; // How large a bucket table should be allowed for large inputs? @@ -213,8 +214,7 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR int[] seenBuckets = ArrayPool.Shared.Rent((maxNumBuckets / BitsPerInt32) + 1); int bestNumBuckets = maxNumBuckets; - // just one more collision than the acceptable collision rate (5%) - int bestNumCollisions = (uniqueCodesCount / 20) + 1; + int bestNumCollisions = uniqueCodesCount; // Iterate through each available prime between the min and max discovered. For each, compute // the collision ratio. @@ -280,7 +280,7 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR { bestNumBuckets = numBuckets; - if (numCollisions <= (uniqueCodesCount / 20)) + if (numCollisions / (double)uniqueCodesCount <= AcceptableCollisionRate) { break; } From cfe0f2cd4510c81e0d56c3a1f54e63eba6be198a Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Thu, 22 Jun 2023 14:16:57 +0200 Subject: [PATCH 5/6] apply suggestion from Stephen --- .../System/Collections/Frozen/FrozenHashTable.cs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs index 5c557a2dc531c..5826fde788a2a 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs @@ -160,20 +160,22 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR } // Filter out duplicate codes, since no increase in buckets will avoid collisions from duplicate input hash codes. - HashSet? codes = hashCodesAreUnique ? null : + HashSet? codes = null; + int uniqueCodesCount = hashCodes.Length; + if (!hashCodesAreUnique) + { + codes = #if NETCOREAPP2_0_OR_GREATER - new HashSet(hashCodes.Length); + new HashSet(hashCodes.Length); #else - new HashSet(); + new HashSet(); #endif - if (codes is not null) - { foreach (int hashCode in hashCodes) { codes.Add(hashCode); } + uniqueCodesCount = codes.Count; } - int uniqueCodesCount = hashCodesAreUnique ? hashCodes.Length : codes!.Count; Debug.Assert(uniqueCodesCount != 0); // In our precomputed primes table, find the index of the smallest prime that's at least as large as our number of From e62cb33db7a56f98f9023d58f155db0d23cf9b17 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Thu, 22 Jun 2023 14:23:21 +0200 Subject: [PATCH 6/6] Apply suggestions from code review Co-authored-by: Stephen Toub --- .../src/System/Collections/Frozen/FrozenHashTable.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs index 5826fde788a2a..363ab51d035f7 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs @@ -253,9 +253,7 @@ private static int CalcNumBuckets(ReadOnlySpan hashCodes, bool optimizeForR } else { - // We can get here in two cases: - // 1. hashCodesAreUnique was true (the input comes from a unique collection of integers), so we have skipped the hash set creation. - // 2. all hash codes turned out to be unique. In such scenario, it's faster to iterate over a span. + // All of the hash codes in hashCodes are unique. In such scenario, it's faster to iterate over a span. foreach (int code in hashCodes) { uint bucketNum = (uint)code % (uint)numBuckets;