49
49
# # always have a size of a power of two and can use the ``and``
50
50
# # operator instead of ``mod`` for truncation of the hash value.
51
51
52
# Unsigned integer alias used when accumulating hash values, so that sums
# are not subject to checked (overflow-raising) signed arithmetic.
type UHash = uint
53
+
54
proc hashUInt64*(x: uint64): Hash {.inline.} =
  ## Scrambles `x` with a fixed sequence of shift, add and xor steps and
  ## reinterprets the resulting bits as a `Hash`.
  ##
  ## For internal use; user code should prefer the `hash` overloads.
  when nimvm: # in vmops
    doAssert false
  else:
    # Alternatives that were considered:
    # * `hashData(cast[pointer](unsafeAddr x), type(x).sizeof)`
    #   would be orders of magnitude worse, see thashes_perf toHighOrderBits
    # * `murmurHash(cast[ptr ByteArr](unsafeAddr x)[])` with
    #   `type ByteArr = array[int64.sizeof, uint8]`
    #   would be a bit worse, see thashes_perf toInt64

    # inspired from https://gist.github.com/badboy/6267743#64-bit-mix-functions
    var mixed = x # unsigned, so the additions below wrap around
    mixed = (not mixed) + (mixed shl 21) # (mixed shl 21) - mixed - 1
    mixed = mixed xor (mixed shr 24)
    mixed = (mixed + (mixed shl 3)) + (mixed shl 8) # mixed * 265
    mixed = mixed xor (mixed shr 14)
    mixed = (mixed + (mixed shl 2)) + (mixed shl 4) # mixed * 21
    mixed = mixed xor (mixed shr 28)
    mixed = mixed + (mixed shl 31)
    result = cast[Hash](mixed)
76
+
77
proc hashUInt32*(x: uint32): Hash {.inline.} =
  ## Scrambles `x` with two rounds of xor-with-right-shift and reinterprets
  ## the resulting bits as a `Hash`.
  ##
  ## For internal use; user code should prefer the `hash` overloads.
  # calling `hashUInt64(x)` would perform 1.7X slower, see thashes_perf toInt32
  when nimvm: # in vmops
    doAssert false
  else:
    # inspired from https://gist.github.com/badboy/6267743
    let firstRound = x xor ((x shr 20) xor (x shr 12))
    let secondRound = firstRound xor (firstRound shr 7) xor (firstRound shr 4)
    result = cast[Hash](secondRound)
86
+
87
when defined(js):
  # Forward declaration so that the js runtime branch of `nonlinearHash`
  # below can call the `string` overload of `hash`.
  proc hash*(x: string): Hash {.noSideEffect.}

proc nonlinearHash*(x: Hash): Hash =
  ## Re-mixes an existing hash value `x` through an integer mixer:
  ## `hashUInt64` when `Hash` is as wide as `uint64`, else `hashUInt32`.
  ## The js backend is special-cased, see the comments in the body.
  when defined(js):
    when nimvm:
      # A width-specific choice (as in the non-js branch below) is not
      # possible here: we can't query for int.sizeof since that's hardcoded
      # for nim js, so the 64 bit mixer is always used.
      hashUInt64(cast[uint64](x))
    else:
      hash($x.float) # workaround
  else:
    when sizeof(Hash) == sizeof(uint64):
      hashUInt64(cast[uint64](x))
    else:
      hashUInt32(cast[uint32](x))
100
+
52
101
proc `!&` * (h: Hash , val: int ): Hash {.inline .} =
53
102
# # Mixes a hash value `h` with `val` to produce a new hash value.
103
+ # # Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function
54
104
# #
55
105
# # This is only needed if you need to implement a hash proc for a new datatype.
56
106
let h = cast [uint ](h)
@@ -72,6 +122,8 @@ proc `!$`*(h: Hash): Hash {.inline.} =
72
122
73
123
proc hashData * (data: pointer , size: int ): Hash =
74
124
# # Hashes an array of bytes of size `size`.
125
+ # should probably reuse `proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash`
126
+ # which uses better murmurhash algorithm.
75
127
var h: Hash = 0
76
128
when defined (js):
77
129
var p: cstring
@@ -407,6 +459,52 @@ proc hash*[A](x: set[A]): Hash =
407
459
result = result !& hash (it)
408
460
result = !$ result
409
461
462
template hashUnordered*(iter: untyped): Hash =
  ## Hashing of unordered elements: the result does not depend on the order
  ## in which the elements of `iter` are visited.
  runnableExamples:
    doAssert hashUnordered(@[10, 20]) == hashUnordered(@[20, 10])
    doAssert hashUnordered(@[10, 20]) != hashUnordered(@[11, 19])
    doAssert hashUnordered(@[10, 10]) != hashUnordered(@[11, 11])
    static: doAssert hashUnordered(@[10, 10]) != hashUnordered(@[11, 11])
    var x: seq[int]
    discard hashUnordered(x) # 0 elements works
    discard hashUnordered(items(x)) # iterator works
  # Example use case: for HashSet's, the result must be order-independent
  # because 2 HashSet's with different `data.len` but same elements (say after
  # insertions and deletions) must hash to the same result.
  # To combine individual hashes `hi = hash(si)`, we must either sort the `hi`
  # (best hash properties to avoid collisions but requires allocations +
  # sorting) or combine them with a commutative and associative operator;
  # we also want to avoid trivial cases of bad collisions, ruling out obvious
  # combiners, eg:
  # `xor`: `xor(hi, hj)` is 0 if hi == hj
  # `+`: trivial collisions eg @[10,20] vs @[10+1, 20-1]
  # `*`: trivial collisions eg @[10,20] vs @[10 div 2, 20*2], and 0 if any
  #      input is 0.
  # So we combine with `+` but via `nonlinearHash(hash(ai))` to mitigate such
  # collisions. As a final refinement, we also add non-linear mixing with
  # the number of elements in `iter`.
  #
  # Note: see also https://crypto.stackexchange.com/questions/54544/how-to-to-calculate-the-hash-of-an-unordered-set
  # A more robust but more complex / expensive approach for this problem is
  # studied here: http://people.csail.mit.edu/devadas/pubs/mhashes.pdf
  when false:
    # Sort based approach, kept as a disabled sketch for reference only;
    # requires `from std/algorithm import sort`.
    var sortedHashes: seq[Hash]
    for ai in iter: sortedHashes.add hash(ai)
    sortedHashes.sort
    var sortedResult: Hash
    for h in sortedHashes: sortedResult = sortedResult !& hash(h)
    sortedResult = !$sortedResult

  mixin hash
  var ret: UHash # prevent checked arithmetics
  var count = 0
  for ai in iter:
    ret += cast[UHash](nonlinearHash(hash(ai)))
    count.inc
  # extra non-linear mixing with num elements
  var combined = cast[Hash](ret) !& count
  combined = !$combined
  combined
410
508
411
509
when isMainModule :
412
510
block empty:
0 commit comments