Skip to content

Commit 71ec6c2

Browse files
committed
use a better algorithm to combine unordered hashes
1 parent 2825fc2 commit 71ec6c2

File tree

3 files changed

+105
-18
lines changed

3 files changed

+105
-18
lines changed

compiler/vmops.nim

+6-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ from md5 import getMD5
1818
from sighashes import symBodyDigest
1919
from times import cpuTime
2020

21-
from hashes import hash
21+
from hashes import hash, hashUInt64, hashUInt32
2222

2323
template mathop(op) {.dirty.} =
2424
registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
@@ -236,6 +236,11 @@ proc registerAdditionalOps*(c: PCtx) =
236236
registerCallback c, "stdlib.hashes.hashVmImplByte", hashVmImplByte
237237
registerCallback c, "stdlib.hashes.hashVmImplChar", hashVmImplByte
238238

239+
registerCallback c, "stdlib.hashes.hashUInt64", proc (a: VmArgs) {.nimcall.} =
240+
a.setResult hashUInt64(cast[uint64](getInt(a, 0)))
241+
registerCallback c, "stdlib.hashes.hashUInt32", proc (a: VmArgs) {.nimcall.} =
242+
a.setResult hashUInt32(cast[uint32](getInt(a, 0)))
243+
239244
if optBenchmarkVM in c.config.globalOptions:
240245
wrap0(cpuTime, timesop)
241246
else:

lib/pure/collections/sets.nim

+1-17
Original file line numberDiff line numberDiff line change
@@ -571,24 +571,8 @@ proc map*[A, B](data: HashSet[A], op: proc (x: A): B {.closure.}): HashSet[B] =
571571
result = initHashSet[B]()
572572
for item in items(data): result.incl(op(item))
573573

574-
from std/algorithm import sort
575-
576574
proc hash*[A](s: HashSet[A]): Hash =
577-
## Hashing of HashSet.
578-
# This handles tombstones (iterating over all `s.data` would be wrong).
579-
# Iterating over items(s) requires a commutative hash combiner like `xor`
580-
# to avoid depending on order (which could differ for 2 HashSet's with different
581-
# `data.len` but same elements, after insertions and deletions), eg:
582-
# for h in s:
583-
# result = result xor hash(h)
584-
# But `xor` has bad mixing properties (eg, would give 0 if HashSet contains
585-
# a, b such that hash(a) == hash(b) and a != b). `sort` should have low
586-
# overhead compared to the surrounding code.
587-
var s2: seq[Hash]
588-
for h in s: s2.add hash(h)
589-
s2.sort
590-
for h in s2: result = result !& hash(h)
591-
result = !$result
575+
hashUnordered(s)
592576

593577
proc `$`*[A](s: HashSet[A]): string =
594578
## Converts the set `s` to a string, mostly for logging and printing purposes.

lib/pure/hashes.nim

+98
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,58 @@ type
4949
## always have a size of a power of two and can use the ``and``
5050
## operator instead of ``mod`` for truncation of the hash value.
5151

52+
type UHash = uint
53+
54+
proc hashUInt64*(x: uint64): Hash {.inline.} =
55+
## for internal use; user code should prefer `hash` overloads
56+
when nimvm: # in vmops
57+
doAssert false
58+
else:
59+
# would be orders of magnitude worse, see thashes_perf toHighOrderBits
60+
# hashData(cast[pointer](unsafeAddr x), type(x).sizeof)
61+
62+
# would be a bit worse, see thashes_perf toInt64
63+
# type ByteArr = array[int64.sizeof, uint8]
64+
# result = murmurHash(cast[ptr ByteArr](unsafeAddr x)[])
65+
66+
# inspired by https://gist.github.com/badboy/6267743#64-bit-mix-functions
67+
var x = x
68+
x = (not x) + (x shl 21) # x = (x shl 21) - x - 1;
69+
x = x xor (x shr 24)
70+
x = (x + (x shl 3)) + (x shl 8) # x * 265
71+
x = x xor (x shr 14)
72+
x = (x + (x shl 2)) + (x shl 4) # x * 21
73+
x = x xor (x shr 28)
74+
x = x + (x shl 31)
75+
result = cast[Hash](x)
76+
77+
proc hashUInt32*(x: uint32): Hash {.inline.} =
78+
## for internal use; user code should prefer `hash` overloads
79+
# calling `hashUInt64(x)` would perform 1.7X slower, see thashes_perf toInt32
80+
when nimvm: # in vmops
81+
doAssert false
82+
else:
83+
# inspired by https://gist.github.com/badboy/6267743
84+
var x = x xor ((x shr 20) xor (x shr 12))
85+
result = cast[Hash](x xor (x shr 7) xor (x shr 4))
86+
87+
when defined(js):
88+
proc hash*(x: string): Hash {.noSideEffect.}
89+
90+
proc nonlinearHash*(x: Hash): Hash =
91+
when defined(js):
92+
when nimvm:
93+
# this could also be `hashUInt32(cast[uint32](x))` on a 32 bit machine,
94+
# but we can't query for int.sizeof since that's hardcoded for nim js
95+
hashUInt64(cast[uint64](x))
96+
else: hash($x.float) # workaround
97+
else:
98+
when sizeof(Hash) == sizeof(uint64): hashUInt64(cast[uint64](x))
99+
else: hashUInt32(cast[uint32](x))
100+
52101
proc `!&`*(h: Hash, val: int): Hash {.inline.} =
53102
## Mixes a hash value `h` with `val` to produce a new hash value.
103+
## Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function
54104
##
55105
## This is only needed if you need to implement a hash proc for a new datatype.
56106
let h = cast[uint](h)
@@ -72,6 +122,8 @@ proc `!$`*(h: Hash): Hash {.inline.} =
72122

73123
proc hashData*(data: pointer, size: int): Hash =
74124
## Hashes an array of bytes of size `size`.
125+
# should probably reuse `proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash`
126+
# which uses better murmurhash algorithm.
75127
var h: Hash = 0
76128
when defined(js):
77129
var p: cstring
@@ -407,6 +459,52 @@ proc hash*[A](x: set[A]): Hash =
407459
result = result !& hash(it)
408460
result = !$result
409461

462+
template hashUnordered*(iter: untyped): Hash =
463+
## Hashing of unordered elements.
464+
runnableExamples:
465+
doAssert hashUnordered(@[10, 20]) == hashUnordered(@[20, 10])
466+
doAssert hashUnordered(@[10, 20]) != hashUnordered(@[11, 19])
467+
doAssert hashUnordered(@[10, 10]) != hashUnordered(@[11, 11])
468+
static: doAssert hashUnordered(@[10, 10]) != hashUnordered(@[11, 11])
469+
var x: seq[int]
470+
discard hashUnordered(x) # 0 elements works
471+
discard hashUnordered(items(x)) # iterator works
472+
# Example use case: for HashSet's, the result must be order-independent because 2
473+
# HashSet's with different `data.len` but same elements (say after insertions
474+
# and deletions) must hash to the same result.
475+
# To combine individual hashes `hi = hash(si)`, we must either sort the `hi`
476+
# (best hash properties to avoid collisions but requires allocations + sorting)
477+
# or combine them with a commutative and associative operator;
478+
# we also want to avoid trivial cases of bad collisions, ruling out obvious
479+
# combiners, eg:
480+
# `xor`: `xor(hi, hj)` is 0 if hi == hj
481+
# `+`: trivial collisions eg @[10,20] vs @[10+1, 20-1]
482+
# `*`: trivial collisions eg @[10,20] vs @[10 div 2, 20*2], and 0 if any input
483+
# is 0.
484+
# So we combine with `+` but via `nonlinearHash(hash(ai))` to mitigate such
485+
# collisions. As a final refinement, we also add non-linear mixing with
486+
# `len(iter)`.
487+
#
488+
# Note: see also https://crypto.stackexchange.com/questions/54544/how-to-to-calculate-the-hash-of-an-unordered-set
489+
# A more robust but more complex / expensive approach for this problem is
490+
# studied here: http://people.csail.mit.edu/devadas/pubs/mhashes.pdf
491+
when false:
492+
# sort based approach; requires `from std/algorithm import sort`
493+
var s2: seq[Hash]
494+
for h in s: s2.add hash(h)
495+
s2.sort
496+
for h in s2: result = result !& hash(h)
497+
result = !$result
498+
499+
mixin hash
500+
var ret: UHash # prevent checked arithmetics
501+
var count = 0
502+
for ai in iter:
503+
ret += cast[UHash](nonlinearHash(hash(ai)))
504+
count.inc
505+
var result = cast[Hash](ret) !& count # extra non-linear mixing with num elements
506+
result = !$ result
507+
result
410508

411509
when isMainModule:
412510
block empty:

0 commit comments

Comments
 (0)