forked from mfonda/simhash
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsimhash.go
254 lines (219 loc) · 6.54 KB
/
simhash.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// Copyright 2013 Matthew Fonda. All rights reserved.
// Use of this source code is governed by the MIT
// license that can be found in the LICENSE file.
// simhash package implements Charikar's simhash algorithm to generate a 64-bit
// fingerprint of a given document.
//
// simhash fingerprints have the property that similar documents will have a similar
// fingerprint. Therefore, the hamming distance between two fingerprints will be small
// if the documents are similar
package simhash
import (
"bytes"
"hash/fnv"
"regexp"
"github.com/go-dedup/text"
)
type Simhash interface {
NewSimhash() *Simhash
Vectorize(features []Feature) Vector
VectorizeBytes(features [][]byte) Vector
Fingerprint(v Vector) uint64
BuildSimhash(doc string, doc2words text.Doc2Words) uint64
GetSimhash(fs FeatureSet) uint64
SimhashBytes(b [][]byte) uint64
NewWordFeatureSet(b []byte) *WordFeatureSet
Shingle(w int, b [][]byte) [][]byte
}
type SimhashBase struct {
}
type Vector [64]int
// Feature consists of a 64-bit hash and a weight
type Feature interface {
// Sum returns the 64-bit sum of this feature
Sum() uint64
// Weight returns the weight of this feature
Weight() int
}
// FeatureSet represents a set of features in a given document
type FeatureSet interface {
GetFeatures() []Feature
}
var Doc2words = text.GetWordsFactory(text.Decorators(
text.SplitCamelCase,
text.ToLower,
text.RemovePunctuation,
text.Compact,
text.Trim,
))
////////////////////////////////////////////////////////////////////////////
// Function definitions
// NewSimhash makes a new Simhash
func NewSimhash() *SimhashBase {
return &SimhashBase{}
}
// Vectorize generates 64 dimension vectors given a set of features.
// Vectors are initialized to zero. The i-th element of the vector is then
// incremented by weight of the i-th feature if the i-th bit of the feature
// is set, and decremented by the weight of the i-th feature otherwise.
func (st *SimhashBase) Vectorize(features []Feature) Vector {
var v Vector
for _, feature := range features {
sum := feature.Sum()
weight := feature.Weight()
for i := uint8(0); i < 64; i++ {
bit := ((sum >> i) & 1)
if bit == 1 {
v[i] += weight
} else {
v[i] -= weight
}
}
}
return v
}
// VectorizeBytes generates 64 dimension vectors given a set of [][]byte,
// where each []byte is a feature with even weight.
//
// Vectors are initialized to zero. The i-th element of the vector is then
// incremented by weight of the i-th feature if the i-th bit of the feature
// is set, and decremented by the weight of the i-th feature otherwise.
func (st *SimhashBase) VectorizeBytes(features [][]byte) Vector {
var v Vector
h := fnv.New64()
for _, feature := range features {
h.Reset()
h.Write(feature)
sum := h.Sum64()
for i := uint8(0); i < 64; i++ {
bit := ((sum >> i) & 1)
if bit == 1 {
v[i]++
} else {
v[i]--
}
}
}
return v
}
// Fingerprint returns a 64-bit fingerprint of the given vector.
// The fingerprint f of a given 64-dimension vector v is defined as follows:
// f[i] = 1 if v[i] >= 0
// f[i] = 0 if v[i] < 0
func (st *SimhashBase) Fingerprint(v Vector) uint64 {
var f uint64
for i := uint8(0); i < 64; i++ {
if v[i] >= 0 {
f |= (1 << i)
}
}
return f
}
type feature struct {
sum uint64
weight int
}
// Sum returns the 64-bit hash of this feature
func (f feature) Sum() uint64 {
return f.sum
}
// Weight returns the weight of this feature
func (f feature) Weight() int {
return f.weight
}
// Returns a new feature representing the given byte slice, using a weight of 1
func NewFeature(f []byte) feature {
h := fnv.New64()
h.Write(f)
return feature{h.Sum64(), 1}
}
// Returns a new feature representing the given byte slice with the given weight
func NewFeatureWithWeight(f []byte, weight int) feature {
fw := NewFeature(f)
fw.weight = weight
return fw
}
// Compare calculates the Hamming distance between two 64-bit integers
//
// Currently, this is calculated using the Kernighan method [1]. Other methods
// exist which may be more efficient and are worth exploring at some point
//
// [1] http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
func Compare(a uint64, b uint64) uint8 {
v := a ^ b
var c uint8
for c = 0; v != 0; c++ {
v &= v - 1
}
return c
}
// BuildSimhash returns a 64-bit simhash of the given string
func (st *SimhashBase) BuildSimhash(doc string, doc2words text.Doc2Words) uint64 {
return st.Fingerprint(st.Vectorize(BuildFeatures(doc, doc2words)))
}
// BuildFeatures returns a []Feature representing each word in the byte slice
func BuildFeatures(doc string, doc2words text.Doc2Words) []Feature {
words := doc2words(doc)
features := make([]Feature, len(words))
for i, w := range words {
features[i] = NewFeature([]byte(w))
}
//fmt.Printf("%#v\n", features)
return features
}
// GetSimhash returns a 64-bit simhash of the given feature set
func (st *SimhashBase) GetSimhash(fs FeatureSet) uint64 {
return st.Fingerprint(st.Vectorize(fs.GetFeatures()))
}
// Returns a 64-bit simhash of the given bytes
func (st *SimhashBase) SimhashBytes(b [][]byte) uint64 {
return st.Fingerprint(st.VectorizeBytes(b))
}
// WordFeatureSet is a feature set in which each word is a feature,
// all equal weight.
type WordFeatureSet struct {
B []byte
}
func (st *SimhashBase) NewWordFeatureSet(b []byte) *WordFeatureSet {
fs := &WordFeatureSet{b}
fs.Normalize()
return fs
}
func (w *WordFeatureSet) Normalize() {
w.B = bytes.ToLower(w.B)
}
var boundaries = regexp.MustCompile(`[\w']+(?:\://[\w\./]+){0,1}`)
// Returns a []Feature representing each word in the byte slice
func (w *WordFeatureSet) GetFeatures() []Feature {
return DoGetFeatures(w.B, boundaries)
}
// Splits the given []byte using the given regexp, then returns a slice
// containing a Feature constructed from each piece matched by the regexp
func DoGetFeatures(b []byte, r *regexp.Regexp) []Feature {
words := r.FindAll(b, -1)
features := make([]Feature, len(words))
for i, w := range words {
features[i] = NewFeature(w)
}
return features
}
// Shingle returns the w-shingling of the given set of bytes. For example, if the given
// input was {"this", "is", "a", "test"}, this returns {"this is", "is a", "a test"}
func (st *SimhashBase) Shingle(w int, b [][]byte) [][]byte {
if w < 1 {
// TODO: use error here instead of panic?
panic("simhash.Shingle(): k must be a positive integer")
}
if w == 1 {
return b
}
if w > len(b) {
w = len(b)
}
count := len(b) - w + 1
shingles := make([][]byte, count)
for i := 0; i < count; i++ {
shingles[i] = bytes.Join(b[i:i+w], []byte(" "))
}
return shingles
}