Skip to content

Commit 73fd3fc

Browse files
committed
ingested boomfilter in repo
1 parent 5b5872b commit 73fd3fc

File tree

7 files changed

+350
-27
lines changed

7 files changed

+350
-27
lines changed

algo/cm-sketch.go

+320
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
/*
2+
* Copyright 2016-2025 Hypermode Inc. and Contributors
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package algo
18+
19+
import (
20+
"bytes"
21+
"encoding/binary"
22+
"errors"
23+
"fmt"
24+
"hash"
25+
"hash/fnv"
26+
"io"
27+
"math"
28+
)
29+
30+
// This code is copied from https://www.github.com/tylertreat/BoomFilters/refs/heads/master/countmin.go
31+
// CountMinSketch implements a Count-Min Sketch as described by Cormode and
32+
// Muthukrishnan in An Improved Data Stream Summary: The Count-Min Sketch and
33+
// its Applications:
34+
//
35+
// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
36+
//
37+
// A Count-Min Sketch (CMS) is a probabilistic data structure which
38+
// approximates the frequency of events in a data stream. Unlike a hash map, a
39+
// CMS uses sub-linear space at the expense of a configurable error factor.
40+
// Similar to Counting Bloom filters, items are hashed to a series of buckets,
41+
// which increment a counter. The frequency of an item is estimated by taking
42+
// the minimum of each of the item's respective counter values.
43+
//
44+
// Count-Min Sketches are useful for counting the frequency of events in
45+
// massive data sets or unbounded streams online. In these situations, storing
46+
// the entire data set or allocating counters for every event in memory is
47+
// impractical. It may be possible for offline processing, but real-time
48+
// processing requires fast, space-efficient solutions like the CMS. For
49+
// approximating set cardinality, refer to the HyperLogLog.
50+
type CountMinSketch struct {
51+
matrix [][]uint64 // count matrix
52+
width uint // matrix width
53+
depth uint // matrix depth
54+
count uint64 // number of items added
55+
epsilon float64 // relative-accuracy factor
56+
delta float64 // relative-accuracy probability
57+
hash hash.Hash64 // hash function (kernel for all depth functions)
58+
}
59+
60+
// hashKernel returns the upper and lower base hash values from which the k
61+
// hashes are derived.
62+
func hashKernel(data []byte, hash hash.Hash64) (uint32, uint32) {
63+
hash.Write(data)
64+
sum := hash.Sum64()
65+
hash.Reset()
66+
upper := uint32(sum & 0xffffffff)
67+
lower := uint32((sum >> 32) & 0xffffffff)
68+
return upper, lower
69+
}
70+
71+
// NewCountMinSketch creates a new Count-Min Sketch whose relative accuracy is
72+
// within a factor of epsilon with probability delta. Both of these parameters
73+
// affect the space and time complexity.
74+
func NewCountMinSketch(epsilon, delta float64) *CountMinSketch {
75+
var (
76+
width = uint(math.Ceil(math.E / epsilon))
77+
depth = uint(math.Ceil(math.Log(1 / delta)))
78+
matrix = make([][]uint64, depth)
79+
)
80+
81+
for i := uint(0); i < depth; i++ {
82+
matrix[i] = make([]uint64, width)
83+
}
84+
85+
return &CountMinSketch{
86+
matrix: matrix,
87+
width: width,
88+
depth: depth,
89+
epsilon: epsilon,
90+
delta: delta,
91+
hash: fnv.New64(),
92+
}
93+
}
94+
95+
// Epsilon returns the relative-accuracy factor, epsilon.
96+
func (c *CountMinSketch) Epsilon() float64 {
97+
return c.epsilon
98+
}
99+
100+
// Delta returns the relative-accuracy probability, delta.
101+
func (c *CountMinSketch) Delta() float64 {
102+
return c.delta
103+
}
104+
105+
// TotalCount returns the number of items added to the sketch.
106+
func (c *CountMinSketch) TotalCount() uint64 {
107+
return c.count
108+
}
109+
110+
// AddInt will add the data to the set n times. Returns the CountMinSketch to allow for
111+
// chaining.
112+
func (c *CountMinSketch) AddInt(data []byte, n uint64) *CountMinSketch {
113+
lower, upper := hashKernel(data, c.hash)
114+
existingValue := uint64(math.MaxUint64)
115+
116+
// Increment count in each row.
117+
for i := uint(0); i < c.depth; i++ {
118+
index := (uint(lower) + uint(upper)*i) % c.width
119+
val := c.matrix[i][index]
120+
if val < existingValue {
121+
existingValue = val
122+
}
123+
if c.matrix[i][index] < n {
124+
c.matrix[i][index] = n
125+
}
126+
}
127+
128+
diff := n - existingValue
129+
if diff < 0 {
130+
diff = 0
131+
}
132+
c.count += diff
133+
return c
134+
}
135+
136+
// Add will add the data to the set. Returns the CountMinSketch to allow for
137+
// chaining.
138+
func (c *CountMinSketch) Add(data []byte) *CountMinSketch {
139+
lower, upper := hashKernel(data, c.hash)
140+
141+
// Increment count in each row.
142+
for i := uint(0); i < c.depth; i++ {
143+
c.matrix[i][(uint(lower)+uint(upper)*i)%c.width]++
144+
}
145+
146+
c.count++
147+
return c
148+
}
149+
150+
// Count returns the approximate count for the specified item, correct within
151+
// epsilon * total count with a probability of delta.
152+
func (c *CountMinSketch) Count(data []byte) uint64 {
153+
var (
154+
lower, upper = hashKernel(data, c.hash)
155+
count = uint64(math.MaxUint64)
156+
)
157+
158+
for i := uint(0); i < c.depth; i++ {
159+
count = uint64(math.Min(float64(count),
160+
float64(c.matrix[i][(uint(lower)+uint(upper)*i)%c.width])))
161+
}
162+
163+
return count
164+
}
165+
166+
// Merge combines this CountMinSketch with another. Returns an error if the
167+
// matrix width and depth are not equal.
168+
func (c *CountMinSketch) Merge(other *CountMinSketch) error {
169+
if c.depth != other.depth {
170+
return errors.New("matrix depth must match")
171+
}
172+
173+
if c.width != other.width {
174+
return errors.New("matrix width must match")
175+
}
176+
177+
for i := uint(0); i < c.depth; i++ {
178+
for j := uint(0); j < c.width; j++ {
179+
c.matrix[i][j] += other.matrix[i][j]
180+
}
181+
}
182+
183+
c.count += other.count
184+
return nil
185+
}
186+
187+
// Reset restores the CountMinSketch to its original state. It returns itself
188+
// to allow for chaining.
189+
func (c *CountMinSketch) Reset() *CountMinSketch {
190+
for i := 0; i < len(c.matrix); i++ {
191+
for j := 0; j < len(c.matrix[i]); j++ {
192+
c.matrix[i][j] = 0
193+
}
194+
}
195+
196+
c.count = 0
197+
return c
198+
}
199+
200+
// SetHash sets the hashing function used.
201+
func (c *CountMinSketch) SetHash(h hash.Hash64) {
202+
c.hash = h
203+
}
204+
205+
// WriteDataTo writes a binary representation of the CMS data to
206+
// an io stream. It returns the number of bytes written and error
207+
func (c *CountMinSketch) WriteDataTo(stream io.Writer) (int, error) {
208+
buf := new(bytes.Buffer)
209+
// serialize epsilon and delta as cms configuration check
210+
err := binary.Write(buf, binary.LittleEndian, c.epsilon)
211+
if err != nil {
212+
return 0, err
213+
}
214+
err = binary.Write(buf, binary.LittleEndian, c.delta)
215+
if err != nil {
216+
return 0, err
217+
}
218+
err = binary.Write(buf, binary.LittleEndian, c.count)
219+
if err != nil {
220+
return 0, err
221+
}
222+
// encode matrix
223+
for i := range c.matrix {
224+
err = binary.Write(buf, binary.LittleEndian, c.matrix[i])
225+
if err != nil {
226+
return 0, err
227+
}
228+
}
229+
230+
return stream.Write(buf.Bytes())
231+
}
232+
233+
// ReadDataFrom reads a binary representation of the CMS data written
234+
// by WriteDataTo() from io stream. It returns the number of bytes read
235+
// and error
236+
// If serialized CMS configuration is different it returns error with expected params
237+
func (c *CountMinSketch) ReadDataFrom(stream io.Reader) (int, error) {
238+
var (
239+
count uint64
240+
epsilon, delta float64
241+
)
242+
243+
err := binary.Read(stream, binary.LittleEndian, &epsilon)
244+
if err != nil {
245+
return 0, err
246+
}
247+
err = binary.Read(stream, binary.LittleEndian, &delta)
248+
if err != nil {
249+
return 0, err
250+
}
251+
252+
// check if serialized and target cms configurations are same
253+
if c.epsilon != epsilon || c.delta != delta {
254+
return 0, fmt.Errorf("expected cms values for epsilon %f and delta %f", epsilon, delta)
255+
}
256+
257+
err = binary.Read(stream, binary.LittleEndian, &count)
258+
if err != nil {
259+
return 0, err
260+
}
261+
262+
for i := uint(0); i < c.depth; i++ {
263+
err = binary.Read(stream, binary.LittleEndian, c.matrix[i])
264+
}
265+
// count size of matrix and count
266+
size := int(c.depth*c.width)*binary.Size(uint64(0)) + binary.Size(count) + 2*binary.Size(float64(0))
267+
268+
c.count = count
269+
270+
return size, err
271+
}
272+
273+
// TestAndRemove attemps to remove n counts of data from the CMS. If
274+
// n is greater than the data count, TestAndRemove is a no-op and
275+
// returns false. Else, return true and decrement count by n.
276+
func (c *CountMinSketch) TestAndRemove(data []byte, n uint64) bool {
277+
h, count := c.traverseDepth(data)
278+
279+
if n > count {
280+
return false
281+
}
282+
283+
for i := uint(0); i < c.depth; i++ {
284+
*h[i] -= n
285+
}
286+
287+
return true
288+
}
289+
290+
// TestAndRemoveAll counts data frequency, performs TestAndRemove(data, count),
291+
// and returns true if count is positive. If count is 0, TestAndRemoveAll is a
292+
// no-op and returns false.
293+
func (c *CountMinSketch) TestAndRemoveAll(data []byte) bool {
294+
h, count := c.traverseDepth(data)
295+
296+
if count == 0 {
297+
return false
298+
}
299+
300+
for i := uint(0); i < c.depth; i++ {
301+
*h[i] -= count
302+
}
303+
304+
return true
305+
}
306+
307+
func (c *CountMinSketch) traverseDepth(data []byte) ([]*uint64, uint64) {
308+
var (
309+
lower, upper = hashKernel(data, c.hash)
310+
count = uint64(math.MaxUint64)
311+
h = make([]*uint64, c.depth)
312+
)
313+
314+
for i := uint(0); i < c.depth; i++ {
315+
h[i] = &c.matrix[i][(uint(lower)+uint(upper)*i)%c.width]
316+
count = uint64(math.Min(float64(count), float64(*h[i])))
317+
}
318+
319+
return h, count
320+
}

go.mod

-2
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ require (
3434
github.com/google/uuid v1.6.0
3535
github.com/gorilla/websocket v1.5.3
3636
github.com/graph-gophers/graphql-go v1.5.0
37-
github.com/harshil-goel/BoomFilters v0.0.0-20241110225754-c092867bb322
3837
github.com/hashicorp/vault/api v1.15.0
3938
github.com/minio/minio-go/v6 v6.0.57
4039
github.com/mitchellh/panicwrap v1.0.0
@@ -84,7 +83,6 @@ require (
8483
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
8584
github.com/cespare/xxhash/v2 v2.3.0 // indirect
8685
github.com/chewxy/math32 v1.11.0 // indirect
87-
github.com/d4l3k/messagediff v1.2.1 // indirect
8886
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
8987
github.com/distribution/reference v0.5.0 // indirect
9088
github.com/docker/go-units v0.5.0 // indirect

go.sum

+2-6
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@ github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL
111111
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
112112
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
113113
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
114-
github.com/chewxy/math32 v1.10.1 h1:LFpeY0SLJXeaiej/eIp2L40VYfscTvKh/FSEZ68uMkU=
115-
github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
114+
github.com/chewxy/math32 v1.11.0 h1:8sek2JWqeaKkVnHa7bPVqCEOUPbARo4SGxs6toKyAOo=
115+
github.com/chewxy/math32 v1.11.0/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
116116
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
117117
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
118118
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
@@ -130,8 +130,6 @@ github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfc
130130
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
131131
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
132132
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
133-
github.com/d4l3k/messagediff v1.2.1 h1:ZcAIMYsUg0EAp9X+tt8/enBE/Q8Yd5kzPynLyKptt9U=
134-
github.com/d4l3k/messagediff v1.2.1/go.mod h1:Oozbb1TVXFac9FtSIxHBMnBCq2qeH/2KkEQxENCrlLo=
135133
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
136134
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
137135
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
@@ -328,8 +326,6 @@ github.com/grpc-ecosystem/grpc-gateway v1.9.0 h1:bM6ZAFZmc/wPFaRDi0d5L7hGEZEx/2u
328326
github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=
329327
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys=
330328
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I=
331-
github.com/harshil-goel/BoomFilters v0.0.0-20241110225754-c092867bb322 h1:pk5qge6xZUS4Vh3pZpInFVW/fCSQSPNAQgdOF7Qo1kg=
332-
github.com/harshil-goel/BoomFilters v0.0.0-20241110225754-c092867bb322/go.mod h1:ssEcAHqGeDHiNeR3QVo4iZPFSTzY4UympUKUf3Ia1Rg=
333329
github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q=
334330
github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8=
335331
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=

posting/lists.go

-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ func Init(ps *badger.DB, cacheSize int64, deleteOnUpdates bool) {
4747
go x.MonitorMemoryMetrics(closer)
4848

4949
memoryLayer = initMemoryLayer(cacheSize, deleteOnUpdates)
50-
GlobalStatsHolder = NewStatsHolder()
5150
}
5251

5352
func UpdateMaxCost(maxCost int64) {

0 commit comments

Comments
 (0)