-
Notifications
You must be signed in to change notification settings - Fork 12
/
bootstrap.go
148 lines (138 loc) · 5.38 KB
/
bootstrap.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package lshensemble
import "errors"
var (
errDomainSizeOrder = errors.New("Domain records must be sorted in ascending order of size")
)
func bootstrapOptimalPartitions(domains <-chan *DomainRecord, numPart int) ([]Partition, int) {
sizes, counts := computeSizeDistribution(domains)
partitions := optimalPartitions(sizes, counts, numPart)
return partitions, len(sizes)
}
func bootstrapOptimal(index *LshEnsemble, sortedDomains <-chan *DomainRecord) error {
var currPart int
var currSize int
for rec := range sortedDomains {
if currSize > rec.Size {
return errDomainSizeOrder
}
currSize = rec.Size
if currSize > index.Partitions[currPart].Upper {
currPart++
}
if currPart >= len(index.Partitions) ||
!(index.Partitions[currPart].Lower <= currSize &&
currSize <= index.Partitions[currPart].Upper) {
return errors.New("Domain records does not match the existing partitions")
}
index.Add(rec.Key, rec.Signature, currPart)
}
index.Index()
return nil
}
// BootstrapLshEnsembleOptimal builds an index from domains using optimal
// partitioning.
// The returned index consists of MinHash LSH implemented using LshForest.
// numPart is the number of partitions to create.
// numHash is the number of hash functions in MinHash.
// maxK is the maximum value for the MinHash parameter K - the number of hash
// functions per "band".
// sortedDomainFactory is factory function that returns a DomainRecord channel
// emitting domains in sorted order by their sizes.
func BootstrapLshEnsembleOptimal(numPart, numHash, maxK int,
sortedDomainFactory func() <-chan *DomainRecord) (*LshEnsemble, error) {
partitions, count := bootstrapOptimalPartitions(sortedDomainFactory(), numPart)
index := NewLshEnsemble(partitions, numHash, maxK, count)
err := bootstrapOptimal(index, sortedDomainFactory())
if err != nil {
return nil, err
}
return index, nil
}
// BootstrapLshEnsemblePlusOptimal builds an index from domains using optimal
// partitioning.
// The returned index consists of MinHash LSH implemented using LshForestArray.
// numPart is the number of partitions to create.
// numHash is the number of hash functions in MinHash.
// maxK is the maximum value for the MinHash parameter K - the number of hash
// functions per "band".
// sortedDomainFactory is factory function that returns a DomainRecord channel
// emitting domains in sorted order by their sizes.
func BootstrapLshEnsemblePlusOptimal(numPart, numHash, maxK int,
sortedDomainFactory func() <-chan *DomainRecord) (*LshEnsemble, error) {
partitions, count := bootstrapOptimalPartitions(sortedDomainFactory(), numPart)
index := NewLshEnsemblePlus(partitions, numHash, maxK, count)
err := bootstrapOptimal(index, sortedDomainFactory())
if err != nil {
return nil, err
}
return index, nil
}
func bootstrapEquiDepth(index *LshEnsemble, totalNumDomains int, sortedDomains <-chan *DomainRecord) error {
numPart := len(index.Partitions)
depth := totalNumDomains / numPart
var currDepth, currPart int
var currSize int
for rec := range sortedDomains {
if currSize > rec.Size {
return errDomainSizeOrder
}
currSize = rec.Size
index.Add(rec.Key, rec.Signature, currPart)
currDepth++
index.Partitions[currPart].Upper = rec.Size
if currDepth >= depth && currPart < numPart-1 {
currPart++
index.Partitions[currPart].Lower = rec.Size
currDepth = 0
}
}
index.Index()
return nil
}
// BootstrapLshEnsembleEquiDepth builds an index from a channel of domains
// using equi-depth partitions -- partitions have approximately the same
// number of domains.
// The returned index consists of MinHash LSH implemented using LshForest.
// numPart is the number of partitions to create.
// numHash is the number of hash functions in MinHash.
// maxK is the maximum value for the MinHash parameter K - the number of hash functions per "band".
// sortedDomains is a DomainRecord channel emitting domains in sorted order by their sizes.
func BootstrapLshEnsembleEquiDepth(numPart, numHash, maxK, totalNumDomains int,
sortedDomains <-chan *DomainRecord) (*LshEnsemble, error) {
index := NewLshEnsemble(make([]Partition, numPart), numHash, maxK,
totalNumDomains)
err := bootstrapEquiDepth(index, totalNumDomains, sortedDomains)
if err != nil {
return nil, err
}
return index, nil
}
// BootstrapLshEnsemblePlusEquiDepth builds an index from a channel of domains
// using equi-depth partitions -- partitions have approximately the same
// number of domains.
// The returned index consists of MinHash LSH implemented using LshForestArray.
// numPart is the number of partitions to create.
// numHash is the number of hash functions in MinHash.
// maxK is the maximum value for the MinHash parameter K - the number of hash functions per "band".
// sortedDomains is a DomainRecord channel emitting domains in sorted order by their sizes.
func BootstrapLshEnsemblePlusEquiDepth(numPart, numHash, maxK,
totalNumDomains int, sortedDomains <-chan *DomainRecord) (*LshEnsemble, error) {
index := NewLshEnsemblePlus(make([]Partition, numPart), numHash, maxK,
totalNumDomains)
err := bootstrapEquiDepth(index, totalNumDomains, sortedDomains)
if err != nil {
return nil, err
}
return index, nil
}
// Recs2Chan is a utility function that converts a DomainRecord slice in memory to a DomainRecord channel.
func Recs2Chan(recs []*DomainRecord) <-chan *DomainRecord {
c := make(chan *DomainRecord, 1000)
go func() {
for _, r := range recs {
c <- r
}
close(c)
}()
return c
}