-
Notifications
You must be signed in to change notification settings - Fork 12
/
minhash_test.go
69 lines (57 loc) · 1.43 KB
/
minhash_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
package lshensemble
import (
"fmt"
"math"
"testing"
)
func TestMinhash(t *testing.T) {
m := NewMinhash(1, 256)
m.Push([]byte("Test some input"))
}
func data(size int) [][]byte {
d := make([][]byte, size)
for i := range d {
d[i] = []byte(fmt.Sprintf("salt%d %d", i, size))
}
return d
}
func hashing(mh *Minhash, start, end int, data [][]byte) {
for i := start; i < end; i++ {
mh.Push(data[i])
}
}
func benchmark(minhashSize, dataSize int, t *testing.B) {
if dataSize < 10 {
fmt.Printf("\n")
return
}
// Data is a set of unique values
d := data(dataSize)
// a and b are two subsets of data with some overlaps
a_start, a_end := 0, int(float64(dataSize)*0.65)
b_start, b_end := int(float64(dataSize)*0.35), dataSize
m1 := NewMinhash(1, minhashSize)
m2 := NewMinhash(1, minhashSize)
t.ResetTimer()
hashing(m1, a_start, a_end, d)
hashing(m2, b_start, b_end, d)
est := m1.mw.Similarity(m2.mw)
act := float64(a_end-b_start) / float64(b_end-a_start)
err := math.Abs(act - est)
fmt.Printf("Data size: %8d, ", dataSize)
fmt.Printf("Real resemblance: %.8f, ", act)
fmt.Printf("Estimated resemblance: %.8f, ", est)
fmt.Printf("Absolute Error: %.8f\n", err)
}
func BenchmarkMinWise64(b *testing.B) {
benchmark(64, b.N, b)
}
func BenchmarkMinWise128(b *testing.B) {
benchmark(128, b.N, b)
}
func BenchmarkMinWise256(b *testing.B) {
benchmark(256, b.N, b)
}
func BenchmarkMinWise512(b *testing.B) {
benchmark(512, b.N, b)
}